diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,38621 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.11582422095049567, + "eval_steps": 500, + "global_step": 5500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 2.1058949263726486e-05, + "grad_norm": 0.388671875, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 1 + }, + { + "epoch": 4.211789852745297e-05, + "grad_norm": 0.396484375, + "learning_rate": 0.0002, + "loss": 1.732, + "step": 2 + }, + { + "epoch": 6.317684779117946e-05, + "grad_norm": 0.40625, + "learning_rate": 0.0002, + "loss": 1.7501, + "step": 3 + }, + { + "epoch": 8.423579705490595e-05, + "grad_norm": 0.400390625, + "learning_rate": 0.0002, + "loss": 1.7667, + "step": 4 + }, + { + "epoch": 0.00010529474631863244, + "grad_norm": 0.376953125, + "learning_rate": 0.0002, + "loss": 1.7663, + "step": 5 + }, + { + "epoch": 0.0001263536955823589, + "grad_norm": 0.404296875, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 6 + }, + { + "epoch": 0.00014741264484608541, + "grad_norm": 0.390625, + "learning_rate": 0.0002, + "loss": 1.7606, + "step": 7 + }, + { + "epoch": 0.0001684715941098119, + "grad_norm": 0.357421875, + "learning_rate": 0.0002, + "loss": 1.7556, + "step": 8 + }, + { + "epoch": 0.00018953054337353837, + "grad_norm": 0.375, + "learning_rate": 0.0002, + "loss": 1.7616, + "step": 9 + }, + { + "epoch": 0.00021058949263726487, + "grad_norm": 0.39453125, + "learning_rate": 0.0002, + "loss": 1.7136, + "step": 10 + }, + { + "epoch": 0.00023164844190099135, + "grad_norm": 0.384765625, + "learning_rate": 0.0002, + "loss": 1.741, + "step": 11 + }, + { + "epoch": 0.0002527073911647178, + "grad_norm": 0.359375, + "learning_rate": 0.0002, + "loss": 1.7446, + "step": 12 + }, + { + "epoch": 0.0002737663404284443, + "grad_norm": 0.3828125, + "learning_rate": 0.0002, + "loss": 1.7817, + "step": 13 + }, + { + "epoch": 0.00029482528969217083, + "grad_norm": 0.388671875, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 14 + }, + { + "epoch": 0.0003158842389558973, + "grad_norm": 0.37109375, + "learning_rate": 0.0002, + "loss": 1.7788, + "step": 15 + }, + { + "epoch": 0.0003369431882196238, + "grad_norm": 0.38671875, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 16 + }, + { + "epoch": 0.0003580021374833503, + "grad_norm": 0.375, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 17 + }, + { + "epoch": 0.00037906108674707673, + "grad_norm": 0.3671875, + "learning_rate": 0.0002, + "loss": 1.7614, + "step": 18 + }, + { + "epoch": 0.00040012003601080324, + "grad_norm": 0.349609375, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 19 + }, + { + "epoch": 0.00042117898527452974, + "grad_norm": 0.361328125, + "learning_rate": 0.0002, + "loss": 1.756, + "step": 20 + }, + { + "epoch": 0.0004422379345382562, + "grad_norm": 0.357421875, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 21 + }, + { + "epoch": 0.0004632968838019827, + "grad_norm": 0.35546875, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 22 + }, + { + "epoch": 0.0004843558330657092, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 1.7283, + "step": 23 + }, + { + "epoch": 0.0005054147823294356, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 1.7468, + "step": 24 + }, + { + "epoch": 0.0005264737315931622, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 25 + }, + { + "epoch": 0.0005475326808568887, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 1.7557, + "step": 26 + }, + { + "epoch": 0.0005685916301206151, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 1.726, + "step": 27 + }, + { + "epoch": 0.0005896505793843417, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 28 + }, + { + "epoch": 0.0006107095286480681, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 1.7738, + "step": 29 + }, + { + "epoch": 0.0006317684779117946, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 30 + }, + { + "epoch": 0.0006528274271755211, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 1.7413, + "step": 31 + }, + { + "epoch": 0.0006738863764392476, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 1.7713, + "step": 32 + }, + { + "epoch": 0.000694945325702974, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 1.7332, + "step": 33 + }, + { + "epoch": 0.0007160042749667006, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 1.7555, + "step": 34 + }, + { + "epoch": 0.000737063224230427, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 35 + }, + { + "epoch": 0.0007581221734941535, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 1.7695, + "step": 36 + }, + { + "epoch": 0.00077918112275788, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 37 + }, + { + "epoch": 0.0008002400720216065, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 1.7493, + "step": 38 + }, + { + "epoch": 0.0008212990212853329, + "grad_norm": 0.83203125, + "learning_rate": 0.0002, + "loss": 1.7452, + "step": 39 + }, + { + "epoch": 0.0008423579705490595, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 1.6903, + "step": 40 + }, + { + "epoch": 0.0008634169198127859, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 1.754, + "step": 41 + }, + { + "epoch": 0.0008844758690765124, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 42 + }, + { + "epoch": 0.0009055348183402389, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 1.7231, + "step": 43 + }, + { + "epoch": 0.0009265937676039654, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 1.7465, + "step": 44 + }, + { + "epoch": 0.0009476527168676918, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 1.7356, + "step": 45 + }, + { + "epoch": 0.0009687116661314184, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 1.7407, + "step": 46 + }, + { + "epoch": 0.0009897706153951448, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 1.7647, + "step": 47 + }, + { + "epoch": 0.0010108295646588713, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 48 + }, + { + "epoch": 0.0010318885139225977, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 1.76, + "step": 49 + }, + { + "epoch": 0.0010529474631863244, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 50 + }, + { + "epoch": 0.0010740064124500509, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 1.7617, + "step": 51 + }, + { + "epoch": 0.0010950653617137773, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 1.772, + "step": 52 + }, + { + "epoch": 0.0011161243109775038, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 53 + }, + { + "epoch": 0.0011371832602412302, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 54 + }, + { + "epoch": 0.0011582422095049567, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 1.7188, + "step": 55 + }, + { + "epoch": 0.0011793011587686833, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 56 + }, + { + "epoch": 0.0012003601080324098, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 1.7238, + "step": 57 + }, + { + "epoch": 0.0012214190572961362, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 58 + }, + { + "epoch": 0.0012424780065598627, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 1.7301, + "step": 59 + }, + { + "epoch": 0.0012635369558235891, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 1.7522, + "step": 60 + }, + { + "epoch": 0.0012845959050873156, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 1.7609, + "step": 61 + }, + { + "epoch": 0.0013056548543510422, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 1.7026, + "step": 62 + }, + { + "epoch": 0.0013267138036147687, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 1.697, + "step": 63 + }, + { + "epoch": 0.0013477727528784951, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 64 + }, + { + "epoch": 0.0013688317021422216, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 1.6928, + "step": 65 + }, + { + "epoch": 0.001389890651405948, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 1.7447, + "step": 66 + }, + { + "epoch": 0.0014109496006696747, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 67 + }, + { + "epoch": 0.0014320085499334011, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 1.7203, + "step": 68 + }, + { + "epoch": 0.0014530674991971276, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 1.7666, + "step": 69 + }, + { + "epoch": 0.001474126448460854, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 1.7205, + "step": 70 + }, + { + "epoch": 0.0014951853977245805, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 1.726, + "step": 71 + }, + { + "epoch": 0.001516244346988307, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 1.7006, + "step": 72 + }, + { + "epoch": 0.0015373032962520336, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 73 + }, + { + "epoch": 0.00155836224551576, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 1.7516, + "step": 74 + }, + { + "epoch": 0.0015794211947794865, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 1.7275, + "step": 75 + }, + { + "epoch": 0.001600480144043213, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 1.7405, + "step": 76 + }, + { + "epoch": 0.0016215390933069394, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 77 + }, + { + "epoch": 0.0016425980425706658, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 1.7659, + "step": 78 + }, + { + "epoch": 0.0016636569918343925, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 1.7078, + "step": 79 + }, + { + "epoch": 0.001684715941098119, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 1.7204, + "step": 80 + }, + { + "epoch": 0.0017057748903618454, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 1.7338, + "step": 81 + }, + { + "epoch": 0.0017268338396255719, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 1.7066, + "step": 82 + }, + { + "epoch": 0.0017478927888892983, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 1.722, + "step": 83 + }, + { + "epoch": 0.0017689517381530248, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 1.7529, + "step": 84 + }, + { + "epoch": 0.0017900106874167514, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 85 + }, + { + "epoch": 0.0018110696366804779, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 1.7222, + "step": 86 + }, + { + "epoch": 0.0018321285859442043, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 87 + }, + { + "epoch": 0.0018531875352079308, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 1.7129, + "step": 88 + }, + { + "epoch": 0.0018742464844716572, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 1.7576, + "step": 89 + }, + { + "epoch": 0.0018953054337353837, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 1.7223, + "step": 90 + }, + { + "epoch": 0.0019163643829991103, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 91 + }, + { + "epoch": 0.0019374233322628368, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 1.7119, + "step": 92 + }, + { + "epoch": 0.001958482281526563, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 1.7084, + "step": 93 + }, + { + "epoch": 0.0019795412307902897, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 1.709, + "step": 94 + }, + { + "epoch": 0.0020006001800540163, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 95 + }, + { + "epoch": 0.0020216591293177426, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 1.7108, + "step": 96 + }, + { + "epoch": 0.0020427180785814692, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 1.7172, + "step": 97 + }, + { + "epoch": 0.0020637770278451955, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 1.694, + "step": 98 + }, + { + "epoch": 0.002084835977108922, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 1.7164, + "step": 99 + }, + { + "epoch": 0.002105894926372649, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 100 + }, + { + "epoch": 0.002126953875636375, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 101 + }, + { + "epoch": 0.0021480128249001017, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 1.7554, + "step": 102 + }, + { + "epoch": 0.002169071774163828, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 1.7301, + "step": 103 + }, + { + "epoch": 0.0021901307234275546, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 104 + }, + { + "epoch": 0.0022111896726912813, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 1.7334, + "step": 105 + }, + { + "epoch": 0.0022322486219550075, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 1.7066, + "step": 106 + }, + { + "epoch": 0.002253307571218734, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 1.7335, + "step": 107 + }, + { + "epoch": 0.0022743665204824604, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 1.7545, + "step": 108 + }, + { + "epoch": 0.002295425469746187, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 1.7595, + "step": 109 + }, + { + "epoch": 0.0023164844190099133, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 1.7129, + "step": 110 + }, + { + "epoch": 0.00233754336827364, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 1.7182, + "step": 111 + }, + { + "epoch": 0.0023586023175373666, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 112 + }, + { + "epoch": 0.002379661266801093, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.7472, + "step": 113 + }, + { + "epoch": 0.0024007202160648195, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 1.7069, + "step": 114 + }, + { + "epoch": 0.0024217791653285458, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 115 + }, + { + "epoch": 0.0024428381145922724, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 116 + }, + { + "epoch": 0.002463897063855999, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.7262, + "step": 117 + }, + { + "epoch": 0.0024849560131197253, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 1.7034, + "step": 118 + }, + { + "epoch": 0.002506014962383452, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 1.722, + "step": 119 + }, + { + "epoch": 0.0025270739116471782, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.7172, + "step": 120 + }, + { + "epoch": 0.002548132860910905, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.7305, + "step": 121 + }, + { + "epoch": 0.002569191810174631, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.7188, + "step": 122 + }, + { + "epoch": 0.002590250759438358, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 123 + }, + { + "epoch": 0.0026113097087020845, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 124 + }, + { + "epoch": 0.0026323686579658107, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.7362, + "step": 125 + }, + { + "epoch": 0.0026534276072295374, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.7352, + "step": 126 + }, + { + "epoch": 0.0026744865564932636, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.7389, + "step": 127 + }, + { + "epoch": 0.0026955455057569903, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.7134, + "step": 128 + }, + { + "epoch": 0.002716604455020717, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6862, + "step": 129 + }, + { + "epoch": 0.002737663404284443, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.7311, + "step": 130 + }, + { + "epoch": 0.00275872235354817, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.7399, + "step": 131 + }, + { + "epoch": 0.002779781302811896, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.7003, + "step": 132 + }, + { + "epoch": 0.0028008402520756227, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.7573, + "step": 133 + }, + { + "epoch": 0.0028218992013393494, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6674, + "step": 134 + }, + { + "epoch": 0.0028429581506030756, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.7218, + "step": 135 + }, + { + "epoch": 0.0028640170998668023, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.7181, + "step": 136 + }, + { + "epoch": 0.0028850760491305285, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.7341, + "step": 137 + }, + { + "epoch": 0.002906134998394255, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.7303, + "step": 138 + }, + { + "epoch": 0.0029271939476579814, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.6965, + "step": 139 + }, + { + "epoch": 0.002948252896921708, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.7085, + "step": 140 + }, + { + "epoch": 0.0029693118461854347, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.7004, + "step": 141 + }, + { + "epoch": 0.002990370795449161, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.7088, + "step": 142 + }, + { + "epoch": 0.0030114297447128876, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 1.7003, + "step": 143 + }, + { + "epoch": 0.003032488693976614, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.7173, + "step": 144 + }, + { + "epoch": 0.0030535476432403405, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.7199, + "step": 145 + }, + { + "epoch": 0.003074606592504067, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.7208, + "step": 146 + }, + { + "epoch": 0.0030956655417677934, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.7088, + "step": 147 + }, + { + "epoch": 0.00311672449103152, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 148 + }, + { + "epoch": 0.0031377834402952463, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.698, + "step": 149 + }, + { + "epoch": 0.003158842389558973, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.7043, + "step": 150 + }, + { + "epoch": 0.0031799013388226992, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.7165, + "step": 151 + }, + { + "epoch": 0.003200960288086426, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.7049, + "step": 152 + }, + { + "epoch": 0.0032220192373501526, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.7164, + "step": 153 + }, + { + "epoch": 0.003243078186613879, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 154 + }, + { + "epoch": 0.0032641371358776055, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.7018, + "step": 155 + }, + { + "epoch": 0.0032851960851413317, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.714, + "step": 156 + }, + { + "epoch": 0.0033062550344050584, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 157 + }, + { + "epoch": 0.003327313983668785, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6922, + "step": 158 + }, + { + "epoch": 0.0033483729329325113, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.7051, + "step": 159 + }, + { + "epoch": 0.003369431882196238, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6804, + "step": 160 + }, + { + "epoch": 0.003390490831459964, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.699, + "step": 161 + }, + { + "epoch": 0.003411549780723691, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.7237, + "step": 162 + }, + { + "epoch": 0.0034326087299874175, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6972, + "step": 163 + }, + { + "epoch": 0.0034536676792511437, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.7049, + "step": 164 + }, + { + "epoch": 0.0034747266285148704, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.731, + "step": 165 + }, + { + "epoch": 0.0034957855777785966, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6945, + "step": 166 + }, + { + "epoch": 0.0035168445270423233, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.7267, + "step": 167 + }, + { + "epoch": 0.0035379034763060495, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 168 + }, + { + "epoch": 0.003558962425569776, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.7107, + "step": 169 + }, + { + "epoch": 0.003580021374833503, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.7223, + "step": 170 + }, + { + "epoch": 0.003601080324097229, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.7131, + "step": 171 + }, + { + "epoch": 0.0036221392733609557, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6982, + "step": 172 + }, + { + "epoch": 0.003643198222624682, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.7305, + "step": 173 + }, + { + "epoch": 0.0036642571718884086, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 174 + }, + { + "epoch": 0.0036853161211521353, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6927, + "step": 175 + }, + { + "epoch": 0.0037063750704158615, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.6945, + "step": 176 + }, + { + "epoch": 0.003727434019679588, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 177 + }, + { + "epoch": 0.0037484929689433144, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.7237, + "step": 178 + }, + { + "epoch": 0.003769551918207041, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.7273, + "step": 179 + }, + { + "epoch": 0.0037906108674707673, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6934, + "step": 180 + }, + { + "epoch": 0.003811669816734494, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6898, + "step": 181 + }, + { + "epoch": 0.0038327287659982207, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6878, + "step": 182 + }, + { + "epoch": 0.003853787715261947, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6875, + "step": 183 + }, + { + "epoch": 0.0038748466645256736, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6636, + "step": 184 + }, + { + "epoch": 0.0038959056137894, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.7399, + "step": 185 + }, + { + "epoch": 0.003916964563053126, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.7184, + "step": 186 + }, + { + "epoch": 0.003938023512316853, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6886, + "step": 187 + }, + { + "epoch": 0.003959082461580579, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 188 + }, + { + "epoch": 0.003980141410844306, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.7147, + "step": 189 + }, + { + "epoch": 0.004001200360108033, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6534, + "step": 190 + }, + { + "epoch": 0.0040222593093717585, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.7233, + "step": 191 + }, + { + "epoch": 0.004043318258635485, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.7084, + "step": 192 + }, + { + "epoch": 0.004064377207899212, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.7158, + "step": 193 + }, + { + "epoch": 0.0040854361571629385, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.7134, + "step": 194 + }, + { + "epoch": 0.004106495106426665, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 195 + }, + { + "epoch": 0.004127554055690391, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6769, + "step": 196 + }, + { + "epoch": 0.004148613004954118, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.7065, + "step": 197 + }, + { + "epoch": 0.004169671954217844, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.7115, + "step": 198 + }, + { + "epoch": 0.004190730903481571, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6873, + "step": 199 + }, + { + "epoch": 0.004211789852745298, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.7159, + "step": 200 + }, + { + "epoch": 0.004232848802009023, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.714, + "step": 201 + }, + { + "epoch": 0.00425390775127275, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.7203, + "step": 202 + }, + { + "epoch": 0.004274966700536477, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6976, + "step": 203 + }, + { + "epoch": 0.004296025649800203, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6981, + "step": 204 + }, + { + "epoch": 0.00431708459906393, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.7151, + "step": 205 + }, + { + "epoch": 0.004338143548327656, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.7174, + "step": 206 + }, + { + "epoch": 0.0043592024975913825, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.7091, + "step": 207 + }, + { + "epoch": 0.004380261446855109, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6981, + "step": 208 + }, + { + "epoch": 0.004401320396118836, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6876, + "step": 209 + }, + { + "epoch": 0.0044223793453825625, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.7105, + "step": 210 + }, + { + "epoch": 0.004443438294646288, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.7091, + "step": 211 + }, + { + "epoch": 0.004464497243910015, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.7066, + "step": 212 + }, + { + "epoch": 0.004485556193173742, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 213 + }, + { + "epoch": 0.004506615142437468, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.761, + "step": 214 + }, + { + "epoch": 0.004527674091701194, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.7352, + "step": 215 + }, + { + "epoch": 0.004548733040964921, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6842, + "step": 216 + }, + { + "epoch": 0.0045697919902286475, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6548, + "step": 217 + }, + { + "epoch": 0.004590850939492374, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.7116, + "step": 218 + }, + { + "epoch": 0.004611909888756101, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 219 + }, + { + "epoch": 0.004632968838019827, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.697, + "step": 220 + }, + { + "epoch": 0.004654027787283553, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.7024, + "step": 221 + }, + { + "epoch": 0.00467508673654728, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 222 + }, + { + "epoch": 0.004696145685811007, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 1.7081, + "step": 223 + }, + { + "epoch": 0.004717204635074733, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6975, + "step": 224 + }, + { + "epoch": 0.004738263584338459, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6649, + "step": 225 + }, + { + "epoch": 0.004759322533602186, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 226 + }, + { + "epoch": 0.004780381482865912, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 227 + }, + { + "epoch": 0.004801440432129639, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 228 + }, + { + "epoch": 0.004822499381393366, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6466, + "step": 229 + }, + { + "epoch": 0.0048435583306570915, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 230 + }, + { + "epoch": 0.004864617279920818, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6956, + "step": 231 + }, + { + "epoch": 0.004885676229184545, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6505, + "step": 232 + }, + { + "epoch": 0.0049067351784482715, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.667, + "step": 233 + }, + { + "epoch": 0.004927794127711998, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6902, + "step": 234 + }, + { + "epoch": 0.004948853076975724, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.7194, + "step": 235 + }, + { + "epoch": 0.004969912026239451, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 236 + }, + { + "epoch": 0.004990970975503177, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.7176, + "step": 237 + }, + { + "epoch": 0.005012029924766904, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6858, + "step": 238 + }, + { + "epoch": 0.005033088874030631, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6671, + "step": 239 + }, + { + "epoch": 0.0050541478232943565, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6725, + "step": 240 + }, + { + "epoch": 0.005075206772558083, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.703, + "step": 241 + }, + { + "epoch": 0.00509626572182181, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 242 + }, + { + "epoch": 0.0051173246710855365, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.693, + "step": 243 + }, + { + "epoch": 0.005138383620349262, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 244 + }, + { + "epoch": 0.005159442569612989, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6968, + "step": 245 + }, + { + "epoch": 0.005180501518876716, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 246 + }, + { + "epoch": 0.005201560468140442, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.7204, + "step": 247 + }, + { + "epoch": 0.005222619417404169, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6709, + "step": 248 + }, + { + "epoch": 0.005243678366667895, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6899, + "step": 249 + }, + { + "epoch": 0.005264737315931621, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 250 + }, + { + "epoch": 0.005285796265195348, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6926, + "step": 251 + }, + { + "epoch": 0.005306855214459075, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6856, + "step": 252 + }, + { + "epoch": 0.005327914163722801, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.7411, + "step": 253 + }, + { + "epoch": 0.005348973112986527, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6778, + "step": 254 + }, + { + "epoch": 0.005370032062250254, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6848, + "step": 255 + }, + { + "epoch": 0.0053910910115139805, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 256 + }, + { + "epoch": 0.005412149960777707, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6927, + "step": 257 + }, + { + "epoch": 0.005433208910041434, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 258 + }, + { + "epoch": 0.00545426785930516, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.684, + "step": 259 + }, + { + "epoch": 0.005475326808568886, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6969, + "step": 260 + }, + { + "epoch": 0.005496385757832613, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 261 + }, + { + "epoch": 0.00551744470709634, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.7236, + "step": 262 + }, + { + "epoch": 0.005538503656360066, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6874, + "step": 263 + }, + { + "epoch": 0.005559562605623792, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6924, + "step": 264 + }, + { + "epoch": 0.005580621554887519, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 265 + }, + { + "epoch": 0.005601680504151245, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.7146, + "step": 266 + }, + { + "epoch": 0.005622739453414972, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.7107, + "step": 267 + }, + { + "epoch": 0.005643798402678699, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 268 + }, + { + "epoch": 0.0056648573519424246, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.7131, + "step": 269 + }, + { + "epoch": 0.005685916301206151, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6551, + "step": 270 + }, + { + "epoch": 0.005706975250469878, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6973, + "step": 271 + }, + { + "epoch": 0.0057280341997336046, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 272 + }, + { + "epoch": 0.00574909314899733, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6686, + "step": 273 + }, + { + "epoch": 0.005770152098261057, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 274 + }, + { + "epoch": 0.005791211047524784, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 275 + }, + { + "epoch": 0.00581226999678851, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6833, + "step": 276 + }, + { + "epoch": 0.005833328946052237, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.7105, + "step": 277 + }, + { + "epoch": 0.005854387895315963, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6638, + "step": 278 + }, + { + "epoch": 0.0058754468445796895, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6967, + "step": 279 + }, + { + "epoch": 0.005896505793843416, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 280 + }, + { + "epoch": 0.005917564743107143, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 281 + }, + { + "epoch": 0.0059386236923708695, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6871, + "step": 282 + }, + { + "epoch": 0.005959682641634595, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.7185, + "step": 283 + }, + { + "epoch": 0.005980741590898322, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.6819, + "step": 284 + }, + { + "epoch": 0.006001800540162049, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6993, + "step": 285 + }, + { + "epoch": 0.006022859489425775, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 286 + }, + { + "epoch": 0.006043918438689502, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.7015, + "step": 287 + }, + { + "epoch": 0.006064977387953228, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 288 + }, + { + "epoch": 0.006086036337216954, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.7087, + "step": 289 + }, + { + "epoch": 0.006107095286480681, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6947, + "step": 290 + }, + { + "epoch": 0.006128154235744408, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6634, + "step": 291 + }, + { + "epoch": 0.006149213185008134, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6947, + "step": 292 + }, + { + "epoch": 0.00617027213427186, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6972, + "step": 293 + }, + { + "epoch": 0.006191331083535587, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.693, + "step": 294 + }, + { + "epoch": 0.0062123900327993135, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 295 + }, + { + "epoch": 0.00623344898206304, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 296 + }, + { + "epoch": 0.006254507931326767, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 297 + }, + { + "epoch": 0.006275566880590493, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 298 + }, + { + "epoch": 0.006296625829854219, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.7012, + "step": 299 + }, + { + "epoch": 0.006317684779117946, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 300 + }, + { + "epoch": 0.006338743728381673, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6798, + "step": 301 + }, + { + "epoch": 0.0063598026776453985, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6943, + "step": 302 + }, + { + "epoch": 0.006380861626909125, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6669, + "step": 303 + }, + { + "epoch": 0.006401920576172852, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.6784, + "step": 304 + }, + { + "epoch": 0.0064229795254365785, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 305 + }, + { + "epoch": 0.006444038474700305, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6717, + "step": 306 + }, + { + "epoch": 0.006465097423964031, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 307 + }, + { + "epoch": 0.006486156373227758, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.7237, + "step": 308 + }, + { + "epoch": 0.006507215322491484, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.7176, + "step": 309 + }, + { + "epoch": 0.006528274271755211, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6662, + "step": 310 + }, + { + "epoch": 0.006549333221018938, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6955, + "step": 311 + }, + { + "epoch": 0.006570392170282663, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6601, + "step": 312 + }, + { + "epoch": 0.00659145111954639, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6981, + "step": 313 + }, + { + "epoch": 0.006612510068810117, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.7239, + "step": 314 + }, + { + "epoch": 0.006633569018073843, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.6826, + "step": 315 + }, + { + "epoch": 0.00665462796733757, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6939, + "step": 316 + }, + { + "epoch": 0.006675686916601296, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.7061, + "step": 317 + }, + { + "epoch": 0.0066967458658650225, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 318 + }, + { + "epoch": 0.006717804815128749, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6612, + "step": 319 + }, + { + "epoch": 0.006738863764392476, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.7063, + "step": 320 + }, + { + "epoch": 0.0067599227136562025, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6798, + "step": 321 + }, + { + "epoch": 0.006780981662919928, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 322 + }, + { + "epoch": 0.006802040612183655, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.7182, + "step": 323 + }, + { + "epoch": 0.006823099561447382, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6498, + "step": 324 + }, + { + "epoch": 0.006844158510711108, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6928, + "step": 325 + }, + { + "epoch": 0.006865217459974835, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.7006, + "step": 326 + }, + { + "epoch": 0.006886276409238561, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6825, + "step": 327 + }, + { + "epoch": 0.0069073353585022874, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.686, + "step": 328 + }, + { + "epoch": 0.006928394307766014, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.6604, + "step": 329 + }, + { + "epoch": 0.006949453257029741, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 330 + }, + { + "epoch": 0.006970512206293467, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6883, + "step": 331 + }, + { + "epoch": 0.006991571155557193, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.6402, + "step": 332 + }, + { + "epoch": 0.00701263010482092, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.701, + "step": 333 + }, + { + "epoch": 0.007033689054084647, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6617, + "step": 334 + }, + { + "epoch": 0.007054748003348373, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.7031, + "step": 335 + }, + { + "epoch": 0.007075806952612099, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 336 + }, + { + "epoch": 0.007096865901875826, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6682, + "step": 337 + }, + { + "epoch": 0.007117924851139552, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6762, + "step": 338 + }, + { + "epoch": 0.007138983800403279, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.7078, + "step": 339 + }, + { + "epoch": 0.007160042749667006, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6818, + "step": 340 + }, + { + "epoch": 0.0071811016989307315, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.635, + "step": 341 + }, + { + "epoch": 0.007202160648194458, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 1.6842, + "step": 342 + }, + { + "epoch": 0.007223219597458185, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6659, + "step": 343 + }, + { + "epoch": 0.0072442785467219115, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 344 + }, + { + "epoch": 0.007265337495985638, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.6705, + "step": 345 + }, + { + "epoch": 0.007286396445249364, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.649, + "step": 346 + }, + { + "epoch": 0.007307455394513091, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6753, + "step": 347 + }, + { + "epoch": 0.007328514343776817, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 1.6722, + "step": 348 + }, + { + "epoch": 0.007349573293040544, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.7328, + "step": 349 + }, + { + "epoch": 0.007370632242304271, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6997, + "step": 350 + }, + { + "epoch": 0.007391691191567996, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 351 + }, + { + "epoch": 0.007412750140831723, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6563, + "step": 352 + }, + { + "epoch": 0.00743380909009545, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 353 + }, + { + "epoch": 0.007454868039359176, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6871, + "step": 354 + }, + { + "epoch": 0.007475926988622903, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6619, + "step": 355 + }, + { + "epoch": 0.007496985937886629, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6938, + "step": 356 + }, + { + "epoch": 0.0075180448871503555, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6559, + "step": 357 + }, + { + "epoch": 0.007539103836414082, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.7006, + "step": 358 + }, + { + "epoch": 0.007560162785677809, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6549, + "step": 359 + }, + { + "epoch": 0.007581221734941535, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6891, + "step": 360 + }, + { + "epoch": 0.007602280684205261, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.687, + "step": 361 + }, + { + "epoch": 0.007623339633468988, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6972, + "step": 362 + }, + { + "epoch": 0.007644398582732715, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.6662, + "step": 363 + }, + { + "epoch": 0.007665457531996441, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6789, + "step": 364 + }, + { + "epoch": 0.007686516481260167, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6769, + "step": 365 + }, + { + "epoch": 0.007707575430523894, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.7075, + "step": 366 + }, + { + "epoch": 0.0077286343797876205, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6437, + "step": 367 + }, + { + "epoch": 0.007749693329051347, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6559, + "step": 368 + }, + { + "epoch": 0.007770752278315074, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.702, + "step": 369 + }, + { + "epoch": 0.0077918112275788, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 370 + }, + { + "epoch": 0.007812870176842526, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6892, + "step": 371 + }, + { + "epoch": 0.007833929126106252, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 372 + }, + { + "epoch": 0.00785498807536998, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6981, + "step": 373 + }, + { + "epoch": 0.007876047024633705, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.7112, + "step": 374 + }, + { + "epoch": 0.007897105973897433, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6832, + "step": 375 + }, + { + "epoch": 0.007918164923161159, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6634, + "step": 376 + }, + { + "epoch": 0.007939223872424885, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6878, + "step": 377 + }, + { + "epoch": 0.007960282821688612, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6416, + "step": 378 + }, + { + "epoch": 0.007981341770952338, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6833, + "step": 379 + }, + { + "epoch": 0.008002400720216065, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6796, + "step": 380 + }, + { + "epoch": 0.008023459669479791, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 381 + }, + { + "epoch": 0.008044518618743517, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.7005, + "step": 382 + }, + { + "epoch": 0.008065577568007245, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6801, + "step": 383 + }, + { + "epoch": 0.00808663651727097, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.694, + "step": 384 + }, + { + "epoch": 0.008107695466534698, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 385 + }, + { + "epoch": 0.008128754415798424, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6733, + "step": 386 + }, + { + "epoch": 0.00814981336506215, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 387 + }, + { + "epoch": 0.008170872314325877, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6701, + "step": 388 + }, + { + "epoch": 0.008191931263589603, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.6823, + "step": 389 + }, + { + "epoch": 0.00821299021285333, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 1.6684, + "step": 390 + }, + { + "epoch": 0.008234049162117056, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6669, + "step": 391 + }, + { + "epoch": 0.008255108111380782, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 392 + }, + { + "epoch": 0.00827616706064451, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 393 + }, + { + "epoch": 0.008297226009908235, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6645, + "step": 394 + }, + { + "epoch": 0.008318284959171963, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6648, + "step": 395 + }, + { + "epoch": 0.008339343908435689, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 1.689, + "step": 396 + }, + { + "epoch": 0.008360402857699414, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6276, + "step": 397 + }, + { + "epoch": 0.008381461806963142, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6383, + "step": 398 + }, + { + "epoch": 0.008402520756226868, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 399 + }, + { + "epoch": 0.008423579705490595, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6381, + "step": 400 + }, + { + "epoch": 0.008444638654754321, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6355, + "step": 401 + }, + { + "epoch": 0.008465697604018047, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6611, + "step": 402 + }, + { + "epoch": 0.008486756553281774, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6604, + "step": 403 + }, + { + "epoch": 0.0085078155025455, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 404 + }, + { + "epoch": 0.008528874451809228, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6809, + "step": 405 + }, + { + "epoch": 0.008549933401072954, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6693, + "step": 406 + }, + { + "epoch": 0.00857099235033668, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6621, + "step": 407 + }, + { + "epoch": 0.008592051299600407, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.679, + "step": 408 + }, + { + "epoch": 0.008613110248864133, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6417, + "step": 409 + }, + { + "epoch": 0.00863416919812786, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6702, + "step": 410 + }, + { + "epoch": 0.008655228147391586, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 411 + }, + { + "epoch": 0.008676287096655312, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.668, + "step": 412 + }, + { + "epoch": 0.00869734604591904, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 1.6659, + "step": 413 + }, + { + "epoch": 0.008718404995182765, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 1.6605, + "step": 414 + }, + { + "epoch": 0.008739463944446493, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 415 + }, + { + "epoch": 0.008760522893710218, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6728, + "step": 416 + }, + { + "epoch": 0.008781581842973944, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6723, + "step": 417 + }, + { + "epoch": 0.008802640792237672, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.686, + "step": 418 + }, + { + "epoch": 0.008823699741501398, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6508, + "step": 419 + }, + { + "epoch": 0.008844758690765125, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6437, + "step": 420 + }, + { + "epoch": 0.008865817640028851, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6623, + "step": 421 + }, + { + "epoch": 0.008886876589292577, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6476, + "step": 422 + }, + { + "epoch": 0.008907935538556304, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6803, + "step": 423 + }, + { + "epoch": 0.00892899448782003, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 424 + }, + { + "epoch": 0.008950053437083758, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 425 + }, + { + "epoch": 0.008971112386347483, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6945, + "step": 426 + }, + { + "epoch": 0.00899217133561121, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6757, + "step": 427 + }, + { + "epoch": 0.009013230284874937, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 428 + }, + { + "epoch": 0.009034289234138662, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6196, + "step": 429 + }, + { + "epoch": 0.009055348183402388, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6461, + "step": 430 + }, + { + "epoch": 0.009076407132666116, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 431 + }, + { + "epoch": 0.009097466081929842, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.664, + "step": 432 + }, + { + "epoch": 0.00911852503119357, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6939, + "step": 433 + }, + { + "epoch": 0.009139583980457295, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6849, + "step": 434 + }, + { + "epoch": 0.00916064292972102, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6325, + "step": 435 + }, + { + "epoch": 0.009181701878984748, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6732, + "step": 436 + }, + { + "epoch": 0.009202760828248474, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6798, + "step": 437 + }, + { + "epoch": 0.009223819777512202, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6865, + "step": 438 + }, + { + "epoch": 0.009244878726775927, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6791, + "step": 439 + }, + { + "epoch": 0.009265937676039653, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.661, + "step": 440 + }, + { + "epoch": 0.00928699662530338, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6631, + "step": 441 + }, + { + "epoch": 0.009308055574567107, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.668, + "step": 442 + }, + { + "epoch": 0.009329114523830834, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6636, + "step": 443 + }, + { + "epoch": 0.00935017347309456, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6786, + "step": 444 + }, + { + "epoch": 0.009371232422358286, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 445 + }, + { + "epoch": 0.009392291371622013, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6817, + "step": 446 + }, + { + "epoch": 0.009413350320885739, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6796, + "step": 447 + }, + { + "epoch": 0.009434409270149467, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6598, + "step": 448 + }, + { + "epoch": 0.009455468219413192, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6756, + "step": 449 + }, + { + "epoch": 0.009476527168676918, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6549, + "step": 450 + }, + { + "epoch": 0.009497586117940646, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6718, + "step": 451 + }, + { + "epoch": 0.009518645067204371, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6319, + "step": 452 + }, + { + "epoch": 0.009539704016468099, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6852, + "step": 453 + }, + { + "epoch": 0.009560762965731825, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6703, + "step": 454 + }, + { + "epoch": 0.00958182191499555, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6476, + "step": 455 + }, + { + "epoch": 0.009602880864259278, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6494, + "step": 456 + }, + { + "epoch": 0.009623939813523004, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.704, + "step": 457 + }, + { + "epoch": 0.009644998762786731, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6249, + "step": 458 + }, + { + "epoch": 0.009666057712050457, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.647, + "step": 459 + }, + { + "epoch": 0.009687116661314183, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.679, + "step": 460 + }, + { + "epoch": 0.00970817561057791, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6862, + "step": 461 + }, + { + "epoch": 0.009729234559841636, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6769, + "step": 462 + }, + { + "epoch": 0.009750293509105364, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6558, + "step": 463 + }, + { + "epoch": 0.00977135245836909, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 464 + }, + { + "epoch": 0.009792411407632816, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.656, + "step": 465 + }, + { + "epoch": 0.009813470356896543, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6627, + "step": 466 + }, + { + "epoch": 0.009834529306160269, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6666, + "step": 467 + }, + { + "epoch": 0.009855588255423996, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 468 + }, + { + "epoch": 0.009876647204687722, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6361, + "step": 469 + }, + { + "epoch": 0.009897706153951448, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.644, + "step": 470 + }, + { + "epoch": 0.009918765103215176, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.647, + "step": 471 + }, + { + "epoch": 0.009939824052478901, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6547, + "step": 472 + }, + { + "epoch": 0.009960883001742629, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6954, + "step": 473 + }, + { + "epoch": 0.009981941951006355, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 474 + }, + { + "epoch": 0.01000300090027008, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6825, + "step": 475 + }, + { + "epoch": 0.010024059849533808, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 476 + }, + { + "epoch": 0.010045118798797534, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6731, + "step": 477 + }, + { + "epoch": 0.010066177748061261, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6636, + "step": 478 + }, + { + "epoch": 0.010087236697324987, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6697, + "step": 479 + }, + { + "epoch": 0.010108295646588713, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6577, + "step": 480 + }, + { + "epoch": 0.01012935459585244, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 1.6549, + "step": 481 + }, + { + "epoch": 0.010150413545116166, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6758, + "step": 482 + }, + { + "epoch": 0.010171472494379894, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6576, + "step": 483 + }, + { + "epoch": 0.01019253144364362, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.654, + "step": 484 + }, + { + "epoch": 0.010213590392907345, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6547, + "step": 485 + }, + { + "epoch": 0.010234649342171073, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6847, + "step": 486 + }, + { + "epoch": 0.010255708291434799, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6548, + "step": 487 + }, + { + "epoch": 0.010276767240698524, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.638, + "step": 488 + }, + { + "epoch": 0.010297826189962252, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.7198, + "step": 489 + }, + { + "epoch": 0.010318885139225978, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6823, + "step": 490 + }, + { + "epoch": 0.010339944088489705, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6795, + "step": 491 + }, + { + "epoch": 0.010361003037753431, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6807, + "step": 492 + }, + { + "epoch": 0.010382061987017157, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6428, + "step": 493 + }, + { + "epoch": 0.010403120936280884, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6855, + "step": 494 + }, + { + "epoch": 0.01042417988554461, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 495 + }, + { + "epoch": 0.010445238834808338, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.69, + "step": 496 + }, + { + "epoch": 0.010466297784072064, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6481, + "step": 497 + }, + { + "epoch": 0.01048735673333579, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6509, + "step": 498 + }, + { + "epoch": 0.010508415682599517, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 499 + }, + { + "epoch": 0.010529474631863243, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6663, + "step": 500 + }, + { + "epoch": 0.010529474631863243, + "eval_loss": 1.6845688819885254, + "eval_runtime": 901.4075, + "eval_samples_per_second": 68.559, + "eval_steps_per_second": 2.143, + "step": 500 + }, + { + "epoch": 0.01055053358112697, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6882, + "step": 501 + }, + { + "epoch": 0.010571592530390696, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 502 + }, + { + "epoch": 0.010592651479654422, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6245, + "step": 503 + }, + { + "epoch": 0.01061371042891815, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6856, + "step": 504 + }, + { + "epoch": 0.010634769378181875, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6788, + "step": 505 + }, + { + "epoch": 0.010655828327445603, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6188, + "step": 506 + }, + { + "epoch": 0.010676887276709329, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6678, + "step": 507 + }, + { + "epoch": 0.010697946225973054, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 508 + }, + { + "epoch": 0.010719005175236782, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6639, + "step": 509 + }, + { + "epoch": 0.010740064124500508, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6506, + "step": 510 + }, + { + "epoch": 0.010761123073764235, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6731, + "step": 511 + }, + { + "epoch": 0.010782182023027961, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6258, + "step": 512 + }, + { + "epoch": 0.010803240972291687, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6701, + "step": 513 + }, + { + "epoch": 0.010824299921555414, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6559, + "step": 514 + }, + { + "epoch": 0.01084535887081914, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6539, + "step": 515 + }, + { + "epoch": 0.010866417820082868, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.7107, + "step": 516 + }, + { + "epoch": 0.010887476769346593, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6344, + "step": 517 + }, + { + "epoch": 0.01090853571861032, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6496, + "step": 518 + }, + { + "epoch": 0.010929594667874047, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6438, + "step": 519 + }, + { + "epoch": 0.010950653617137773, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.7041, + "step": 520 + }, + { + "epoch": 0.0109717125664015, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6439, + "step": 521 + }, + { + "epoch": 0.010992771515665226, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 522 + }, + { + "epoch": 0.011013830464928952, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6861, + "step": 523 + }, + { + "epoch": 0.01103488941419268, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 524 + }, + { + "epoch": 0.011055948363456405, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.6635, + "step": 525 + }, + { + "epoch": 0.011077007312720133, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6327, + "step": 526 + }, + { + "epoch": 0.011098066261983858, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6757, + "step": 527 + }, + { + "epoch": 0.011119125211247584, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6672, + "step": 528 + }, + { + "epoch": 0.011140184160511312, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6608, + "step": 529 + }, + { + "epoch": 0.011161243109775038, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6848, + "step": 530 + }, + { + "epoch": 0.011182302059038765, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6913, + "step": 531 + }, + { + "epoch": 0.01120336100830249, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6567, + "step": 532 + }, + { + "epoch": 0.011224419957566217, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6535, + "step": 533 + }, + { + "epoch": 0.011245478906829944, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.6928, + "step": 534 + }, + { + "epoch": 0.01126653785609367, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6937, + "step": 535 + }, + { + "epoch": 0.011287596805357398, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6642, + "step": 536 + }, + { + "epoch": 0.011308655754621123, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6671, + "step": 537 + }, + { + "epoch": 0.011329714703884849, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6381, + "step": 538 + }, + { + "epoch": 0.011350773653148577, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6785, + "step": 539 + }, + { + "epoch": 0.011371832602412302, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6847, + "step": 540 + }, + { + "epoch": 0.01139289155167603, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6246, + "step": 541 + }, + { + "epoch": 0.011413950500939756, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6677, + "step": 542 + }, + { + "epoch": 0.011435009450203482, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 543 + }, + { + "epoch": 0.011456068399467209, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6619, + "step": 544 + }, + { + "epoch": 0.011477127348730935, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6387, + "step": 545 + }, + { + "epoch": 0.01149818629799466, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6443, + "step": 546 + }, + { + "epoch": 0.011519245247258388, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6393, + "step": 547 + }, + { + "epoch": 0.011540304196522114, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6595, + "step": 548 + }, + { + "epoch": 0.011561363145785842, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6662, + "step": 549 + }, + { + "epoch": 0.011582422095049567, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6484, + "step": 550 + }, + { + "epoch": 0.011603481044313293, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.6609, + "step": 551 + }, + { + "epoch": 0.01162453999357702, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 552 + }, + { + "epoch": 0.011645598942840747, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6618, + "step": 553 + }, + { + "epoch": 0.011666657892104474, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6587, + "step": 554 + }, + { + "epoch": 0.0116877168413682, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6424, + "step": 555 + }, + { + "epoch": 0.011708775790631926, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6329, + "step": 556 + }, + { + "epoch": 0.011729834739895653, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 557 + }, + { + "epoch": 0.011750893689159379, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 558 + }, + { + "epoch": 0.011771952638423107, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6447, + "step": 559 + }, + { + "epoch": 0.011793011587686832, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6314, + "step": 560 + }, + { + "epoch": 0.011814070536950558, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 561 + }, + { + "epoch": 0.011835129486214286, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6567, + "step": 562 + }, + { + "epoch": 0.011856188435478011, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6627, + "step": 563 + }, + { + "epoch": 0.011877247384741739, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6662, + "step": 564 + }, + { + "epoch": 0.011898306334005465, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 565 + }, + { + "epoch": 0.01191936528326919, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6504, + "step": 566 + }, + { + "epoch": 0.011940424232532918, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6583, + "step": 567 + }, + { + "epoch": 0.011961483181796644, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6457, + "step": 568 + }, + { + "epoch": 0.011982542131060371, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.6701, + "step": 569 + }, + { + "epoch": 0.012003601080324097, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.6665, + "step": 570 + }, + { + "epoch": 0.012024660029587823, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 571 + }, + { + "epoch": 0.01204571897885155, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6524, + "step": 572 + }, + { + "epoch": 0.012066777928115276, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6298, + "step": 573 + }, + { + "epoch": 0.012087836877379004, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6595, + "step": 574 + }, + { + "epoch": 0.01210889582664273, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6377, + "step": 575 + }, + { + "epoch": 0.012129954775906455, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6637, + "step": 576 + }, + { + "epoch": 0.012151013725170183, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.7005, + "step": 577 + }, + { + "epoch": 0.012172072674433909, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6646, + "step": 578 + }, + { + "epoch": 0.012193131623697636, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6449, + "step": 579 + }, + { + "epoch": 0.012214190572961362, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 580 + }, + { + "epoch": 0.012235249522225088, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6523, + "step": 581 + }, + { + "epoch": 0.012256308471488815, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6457, + "step": 582 + }, + { + "epoch": 0.012277367420752541, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6428, + "step": 583 + }, + { + "epoch": 0.012298426370016269, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6509, + "step": 584 + }, + { + "epoch": 0.012319485319279995, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6971, + "step": 585 + }, + { + "epoch": 0.01234054426854372, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.635, + "step": 586 + }, + { + "epoch": 0.012361603217807448, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6193, + "step": 587 + }, + { + "epoch": 0.012382662167071174, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6599, + "step": 588 + }, + { + "epoch": 0.012403721116334901, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6789, + "step": 589 + }, + { + "epoch": 0.012424780065598627, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 590 + }, + { + "epoch": 0.012445839014862353, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6231, + "step": 591 + }, + { + "epoch": 0.01246689796412608, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6422, + "step": 592 + }, + { + "epoch": 0.012487956913389806, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.654, + "step": 593 + }, + { + "epoch": 0.012509015862653534, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 594 + }, + { + "epoch": 0.01253007481191726, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6217, + "step": 595 + }, + { + "epoch": 0.012551133761180985, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 596 + }, + { + "epoch": 0.012572192710444713, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6763, + "step": 597 + }, + { + "epoch": 0.012593251659708439, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6488, + "step": 598 + }, + { + "epoch": 0.012614310608972164, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6702, + "step": 599 + }, + { + "epoch": 0.012635369558235892, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6282, + "step": 600 + }, + { + "epoch": 0.012656428507499618, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 601 + }, + { + "epoch": 0.012677487456763345, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6627, + "step": 602 + }, + { + "epoch": 0.012698546406027071, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6607, + "step": 603 + }, + { + "epoch": 0.012719605355290797, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6873, + "step": 604 + }, + { + "epoch": 0.012740664304554524, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 605 + }, + { + "epoch": 0.01276172325381825, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6564, + "step": 606 + }, + { + "epoch": 0.012782782203081978, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6118, + "step": 607 + }, + { + "epoch": 0.012803841152345704, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6689, + "step": 608 + }, + { + "epoch": 0.01282490010160943, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6296, + "step": 609 + }, + { + "epoch": 0.012845959050873157, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6626, + "step": 610 + }, + { + "epoch": 0.012867018000136883, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6505, + "step": 611 + }, + { + "epoch": 0.01288807694940061, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 612 + }, + { + "epoch": 0.012909135898664336, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6498, + "step": 613 + }, + { + "epoch": 0.012930194847928062, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6399, + "step": 614 + }, + { + "epoch": 0.01295125379719179, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6354, + "step": 615 + }, + { + "epoch": 0.012972312746455515, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6614, + "step": 616 + }, + { + "epoch": 0.012993371695719243, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 617 + }, + { + "epoch": 0.013014430644982969, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5986, + "step": 618 + }, + { + "epoch": 0.013035489594246694, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 619 + }, + { + "epoch": 0.013056548543510422, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6803, + "step": 620 + }, + { + "epoch": 0.013077607492774148, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.6713, + "step": 621 + }, + { + "epoch": 0.013098666442037875, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6621, + "step": 622 + }, + { + "epoch": 0.013119725391301601, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6544, + "step": 623 + }, + { + "epoch": 0.013140784340565327, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6452, + "step": 624 + }, + { + "epoch": 0.013161843289829054, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6553, + "step": 625 + }, + { + "epoch": 0.01318290223909278, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6345, + "step": 626 + }, + { + "epoch": 0.013203961188356508, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 627 + }, + { + "epoch": 0.013225020137620233, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 628 + }, + { + "epoch": 0.01324607908688396, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 629 + }, + { + "epoch": 0.013267138036147687, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6771, + "step": 630 + }, + { + "epoch": 0.013288196985411413, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6476, + "step": 631 + }, + { + "epoch": 0.01330925593467514, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6193, + "step": 632 + }, + { + "epoch": 0.013330314883938866, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.6583, + "step": 633 + }, + { + "epoch": 0.013351373833202592, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 634 + }, + { + "epoch": 0.01337243278246632, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6457, + "step": 635 + }, + { + "epoch": 0.013393491731730045, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6568, + "step": 636 + }, + { + "epoch": 0.013414550680993773, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6341, + "step": 637 + }, + { + "epoch": 0.013435609630257498, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 638 + }, + { + "epoch": 0.013456668579521224, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6557, + "step": 639 + }, + { + "epoch": 0.013477727528784952, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6666, + "step": 640 + }, + { + "epoch": 0.013498786478048677, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6195, + "step": 641 + }, + { + "epoch": 0.013519845427312405, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.6456, + "step": 642 + }, + { + "epoch": 0.01354090437657613, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6543, + "step": 643 + }, + { + "epoch": 0.013561963325839857, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6581, + "step": 644 + }, + { + "epoch": 0.013583022275103584, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6332, + "step": 645 + }, + { + "epoch": 0.01360408122436731, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6455, + "step": 646 + }, + { + "epoch": 0.013625140173631037, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6543, + "step": 647 + }, + { + "epoch": 0.013646199122894763, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.644, + "step": 648 + }, + { + "epoch": 0.013667258072158489, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6882, + "step": 649 + }, + { + "epoch": 0.013688317021422217, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6404, + "step": 650 + }, + { + "epoch": 0.013709375970685942, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6521, + "step": 651 + }, + { + "epoch": 0.01373043491994967, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6522, + "step": 652 + }, + { + "epoch": 0.013751493869213396, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 653 + }, + { + "epoch": 0.013772552818477122, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6688, + "step": 654 + }, + { + "epoch": 0.013793611767740849, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.655, + "step": 655 + }, + { + "epoch": 0.013814670717004575, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6473, + "step": 656 + }, + { + "epoch": 0.0138357296662683, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6272, + "step": 657 + }, + { + "epoch": 0.013856788615532028, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6573, + "step": 658 + }, + { + "epoch": 0.013877847564795754, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 659 + }, + { + "epoch": 0.013898906514059482, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 660 + }, + { + "epoch": 0.013919965463323207, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 661 + }, + { + "epoch": 0.013941024412586933, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6506, + "step": 662 + }, + { + "epoch": 0.01396208336185066, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6826, + "step": 663 + }, + { + "epoch": 0.013983142311114386, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 664 + }, + { + "epoch": 0.014004201260378114, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6452, + "step": 665 + }, + { + "epoch": 0.01402526020964184, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6496, + "step": 666 + }, + { + "epoch": 0.014046319158905566, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6473, + "step": 667 + }, + { + "epoch": 0.014067378108169293, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6585, + "step": 668 + }, + { + "epoch": 0.014088437057433019, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6528, + "step": 669 + }, + { + "epoch": 0.014109496006696746, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 670 + }, + { + "epoch": 0.014130554955960472, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6507, + "step": 671 + }, + { + "epoch": 0.014151613905224198, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6641, + "step": 672 + }, + { + "epoch": 0.014172672854487926, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.67, + "step": 673 + }, + { + "epoch": 0.014193731803751651, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6261, + "step": 674 + }, + { + "epoch": 0.014214790753015379, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6615, + "step": 675 + }, + { + "epoch": 0.014235849702279105, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 676 + }, + { + "epoch": 0.01425690865154283, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 677 + }, + { + "epoch": 0.014277967600806558, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6347, + "step": 678 + }, + { + "epoch": 0.014299026550070284, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6616, + "step": 679 + }, + { + "epoch": 0.014320085499334011, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6433, + "step": 680 + }, + { + "epoch": 0.014341144448597737, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6371, + "step": 681 + }, + { + "epoch": 0.014362203397861463, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 682 + }, + { + "epoch": 0.01438326234712519, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6687, + "step": 683 + }, + { + "epoch": 0.014404321296388916, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6445, + "step": 684 + }, + { + "epoch": 0.014425380245652644, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6346, + "step": 685 + }, + { + "epoch": 0.01444643919491637, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6891, + "step": 686 + }, + { + "epoch": 0.014467498144180095, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6539, + "step": 687 + }, + { + "epoch": 0.014488557093443823, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6643, + "step": 688 + }, + { + "epoch": 0.014509616042707549, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6504, + "step": 689 + }, + { + "epoch": 0.014530674991971276, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6665, + "step": 690 + }, + { + "epoch": 0.014551733941235002, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6852, + "step": 691 + }, + { + "epoch": 0.014572792890498728, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6548, + "step": 692 + }, + { + "epoch": 0.014593851839762455, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6446, + "step": 693 + }, + { + "epoch": 0.014614910789026181, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6912, + "step": 694 + }, + { + "epoch": 0.014635969738289909, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6635, + "step": 695 + }, + { + "epoch": 0.014657028687553635, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6431, + "step": 696 + }, + { + "epoch": 0.01467808763681736, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6485, + "step": 697 + }, + { + "epoch": 0.014699146586081088, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 698 + }, + { + "epoch": 0.014720205535344814, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5934, + "step": 699 + }, + { + "epoch": 0.014741264484608541, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6785, + "step": 700 + }, + { + "epoch": 0.014762323433872267, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6674, + "step": 701 + }, + { + "epoch": 0.014783382383135993, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 702 + }, + { + "epoch": 0.01480444133239972, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6188, + "step": 703 + }, + { + "epoch": 0.014825500281663446, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6374, + "step": 704 + }, + { + "epoch": 0.014846559230927174, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.648, + "step": 705 + }, + { + "epoch": 0.0148676181801909, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6413, + "step": 706 + }, + { + "epoch": 0.014888677129454625, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6218, + "step": 707 + }, + { + "epoch": 0.014909736078718353, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6394, + "step": 708 + }, + { + "epoch": 0.014930795027982079, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6801, + "step": 709 + }, + { + "epoch": 0.014951853977245806, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6215, + "step": 710 + }, + { + "epoch": 0.014972912926509532, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6423, + "step": 711 + }, + { + "epoch": 0.014993971875773258, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6765, + "step": 712 + }, + { + "epoch": 0.015015030825036985, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 713 + }, + { + "epoch": 0.015036089774300711, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.66, + "step": 714 + }, + { + "epoch": 0.015057148723564437, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6558, + "step": 715 + }, + { + "epoch": 0.015078207672828164, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6495, + "step": 716 + }, + { + "epoch": 0.01509926662209189, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6677, + "step": 717 + }, + { + "epoch": 0.015120325571355618, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6524, + "step": 718 + }, + { + "epoch": 0.015141384520619344, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6446, + "step": 719 + }, + { + "epoch": 0.01516244346988307, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6619, + "step": 720 + }, + { + "epoch": 0.015183502419146797, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.626, + "step": 721 + }, + { + "epoch": 0.015204561368410523, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6303, + "step": 722 + }, + { + "epoch": 0.01522562031767425, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 723 + }, + { + "epoch": 0.015246679266937976, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 724 + }, + { + "epoch": 0.015267738216201702, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6346, + "step": 725 + }, + { + "epoch": 0.01528879716546543, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6301, + "step": 726 + }, + { + "epoch": 0.015309856114729155, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6371, + "step": 727 + }, + { + "epoch": 0.015330915063992883, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 728 + }, + { + "epoch": 0.015351974013256608, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6331, + "step": 729 + }, + { + "epoch": 0.015373032962520334, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6505, + "step": 730 + }, + { + "epoch": 0.015394091911784062, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6174, + "step": 731 + }, + { + "epoch": 0.015415150861047788, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6263, + "step": 732 + }, + { + "epoch": 0.015436209810311515, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6613, + "step": 733 + }, + { + "epoch": 0.015457268759575241, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6725, + "step": 734 + }, + { + "epoch": 0.015478327708838967, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 735 + }, + { + "epoch": 0.015499386658102694, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6206, + "step": 736 + }, + { + "epoch": 0.01552044560736642, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6131, + "step": 737 + }, + { + "epoch": 0.015541504556630148, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6379, + "step": 738 + }, + { + "epoch": 0.015562563505893873, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 1.6567, + "step": 739 + }, + { + "epoch": 0.0155836224551576, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6372, + "step": 740 + }, + { + "epoch": 0.015604681404421327, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6601, + "step": 741 + }, + { + "epoch": 0.015625740353685053, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.633, + "step": 742 + }, + { + "epoch": 0.01564679930294878, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6288, + "step": 743 + }, + { + "epoch": 0.015667858252212504, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6437, + "step": 744 + }, + { + "epoch": 0.01568891720147623, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6546, + "step": 745 + }, + { + "epoch": 0.01570997615073996, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6488, + "step": 746 + }, + { + "epoch": 0.015731035100003687, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6262, + "step": 747 + }, + { + "epoch": 0.01575209404926741, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6411, + "step": 748 + }, + { + "epoch": 0.01577315299853114, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6126, + "step": 749 + }, + { + "epoch": 0.015794211947794866, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.6531, + "step": 750 + }, + { + "epoch": 0.01581527089705859, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6314, + "step": 751 + }, + { + "epoch": 0.015836329846322317, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6277, + "step": 752 + }, + { + "epoch": 0.015857388795586045, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6466, + "step": 753 + }, + { + "epoch": 0.01587844774484977, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 754 + }, + { + "epoch": 0.015899506694113497, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 755 + }, + { + "epoch": 0.015920565643377224, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6437, + "step": 756 + }, + { + "epoch": 0.01594162459264095, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6394, + "step": 757 + }, + { + "epoch": 0.015962683541904676, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 758 + }, + { + "epoch": 0.015983742491168403, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6286, + "step": 759 + }, + { + "epoch": 0.01600480144043213, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6467, + "step": 760 + }, + { + "epoch": 0.016025860389695855, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6179, + "step": 761 + }, + { + "epoch": 0.016046919338959582, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6751, + "step": 762 + }, + { + "epoch": 0.01606797828822331, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6604, + "step": 763 + }, + { + "epoch": 0.016089037237487034, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6213, + "step": 764 + }, + { + "epoch": 0.01611009618675076, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.654, + "step": 765 + }, + { + "epoch": 0.01613115513601449, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6117, + "step": 766 + }, + { + "epoch": 0.016152214085278217, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6573, + "step": 767 + }, + { + "epoch": 0.01617327303454194, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6697, + "step": 768 + }, + { + "epoch": 0.016194331983805668, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6277, + "step": 769 + }, + { + "epoch": 0.016215390933069396, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6315, + "step": 770 + }, + { + "epoch": 0.01623644988233312, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6593, + "step": 771 + }, + { + "epoch": 0.016257508831596847, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.648, + "step": 772 + }, + { + "epoch": 0.016278567780860575, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6388, + "step": 773 + }, + { + "epoch": 0.0162996267301243, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6772, + "step": 774 + }, + { + "epoch": 0.016320685679388026, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6438, + "step": 775 + }, + { + "epoch": 0.016341744628651754, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6444, + "step": 776 + }, + { + "epoch": 0.01636280357791548, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.6093, + "step": 777 + }, + { + "epoch": 0.016383862527179206, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6548, + "step": 778 + }, + { + "epoch": 0.016404921476442933, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6248, + "step": 779 + }, + { + "epoch": 0.01642598042570666, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.6233, + "step": 780 + }, + { + "epoch": 0.016447039374970385, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 781 + }, + { + "epoch": 0.016468098324234112, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6413, + "step": 782 + }, + { + "epoch": 0.01648915727349784, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 783 + }, + { + "epoch": 0.016510216222761564, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.6424, + "step": 784 + }, + { + "epoch": 0.01653127517202529, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 785 + }, + { + "epoch": 0.01655233412128902, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6373, + "step": 786 + }, + { + "epoch": 0.016573393070552746, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6551, + "step": 787 + }, + { + "epoch": 0.01659445201981647, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6339, + "step": 788 + }, + { + "epoch": 0.016615510969080198, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6303, + "step": 789 + }, + { + "epoch": 0.016636569918343926, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6764, + "step": 790 + }, + { + "epoch": 0.01665762886760765, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6175, + "step": 791 + }, + { + "epoch": 0.016678687816871377, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.64, + "step": 792 + }, + { + "epoch": 0.016699746766135105, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.635, + "step": 793 + }, + { + "epoch": 0.01672080571539883, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6122, + "step": 794 + }, + { + "epoch": 0.016741864664662556, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6562, + "step": 795 + }, + { + "epoch": 0.016762923613926284, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6398, + "step": 796 + }, + { + "epoch": 0.01678398256319001, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6267, + "step": 797 + }, + { + "epoch": 0.016805041512453735, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6705, + "step": 798 + }, + { + "epoch": 0.016826100461717463, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.6333, + "step": 799 + }, + { + "epoch": 0.01684715941098119, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6618, + "step": 800 + }, + { + "epoch": 0.016868218360244915, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6206, + "step": 801 + }, + { + "epoch": 0.016889277309508642, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 802 + }, + { + "epoch": 0.01691033625877237, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6424, + "step": 803 + }, + { + "epoch": 0.016931395208036094, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6494, + "step": 804 + }, + { + "epoch": 0.01695245415729982, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.6269, + "step": 805 + }, + { + "epoch": 0.01697351310656355, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.609, + "step": 806 + }, + { + "epoch": 0.016994572055827273, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 807 + }, + { + "epoch": 0.017015631005091, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.668, + "step": 808 + }, + { + "epoch": 0.017036689954354728, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 809 + }, + { + "epoch": 0.017057748903618455, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6446, + "step": 810 + }, + { + "epoch": 0.01707880785288218, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 811 + }, + { + "epoch": 0.017099866802145907, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.6333, + "step": 812 + }, + { + "epoch": 0.017120925751409635, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6601, + "step": 813 + }, + { + "epoch": 0.01714198470067336, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6445, + "step": 814 + }, + { + "epoch": 0.017163043649937086, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6298, + "step": 815 + }, + { + "epoch": 0.017184102599200814, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6612, + "step": 816 + }, + { + "epoch": 0.017205161548464538, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6217, + "step": 817 + }, + { + "epoch": 0.017226220497728265, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6361, + "step": 818 + }, + { + "epoch": 0.017247279446991993, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 819 + }, + { + "epoch": 0.01726833839625572, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6218, + "step": 820 + }, + { + "epoch": 0.017289397345519444, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 821 + }, + { + "epoch": 0.017310456294783172, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.625, + "step": 822 + }, + { + "epoch": 0.0173315152440469, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6813, + "step": 823 + }, + { + "epoch": 0.017352574193310624, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6438, + "step": 824 + }, + { + "epoch": 0.01737363314257435, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 825 + }, + { + "epoch": 0.01739469209183808, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6339, + "step": 826 + }, + { + "epoch": 0.017415751041101803, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 827 + }, + { + "epoch": 0.01743680999036553, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 828 + }, + { + "epoch": 0.017457868939629258, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6028, + "step": 829 + }, + { + "epoch": 0.017478927888892985, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 830 + }, + { + "epoch": 0.01749998683815671, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6194, + "step": 831 + }, + { + "epoch": 0.017521045787420437, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6902, + "step": 832 + }, + { + "epoch": 0.017542104736684164, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6452, + "step": 833 + }, + { + "epoch": 0.01756316368594789, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 834 + }, + { + "epoch": 0.017584222635211616, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6649, + "step": 835 + }, + { + "epoch": 0.017605281584475344, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 836 + }, + { + "epoch": 0.017626340533739068, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 837 + }, + { + "epoch": 0.017647399483002795, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6355, + "step": 838 + }, + { + "epoch": 0.017668458432266523, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6378, + "step": 839 + }, + { + "epoch": 0.01768951738153025, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6293, + "step": 840 + }, + { + "epoch": 0.017710576330793974, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6538, + "step": 841 + }, + { + "epoch": 0.017731635280057702, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6452, + "step": 842 + }, + { + "epoch": 0.01775269422932143, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6397, + "step": 843 + }, + { + "epoch": 0.017773753178585153, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5787, + "step": 844 + }, + { + "epoch": 0.01779481212784888, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6356, + "step": 845 + }, + { + "epoch": 0.01781587107711261, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6498, + "step": 846 + }, + { + "epoch": 0.017836930026376333, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6298, + "step": 847 + }, + { + "epoch": 0.01785798897564006, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.623, + "step": 848 + }, + { + "epoch": 0.017879047924903788, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.649, + "step": 849 + }, + { + "epoch": 0.017900106874167515, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 850 + }, + { + "epoch": 0.01792116582343124, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6536, + "step": 851 + }, + { + "epoch": 0.017942224772694967, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6239, + "step": 852 + }, + { + "epoch": 0.017963283721958694, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 853 + }, + { + "epoch": 0.01798434267122242, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 854 + }, + { + "epoch": 0.018005401620486146, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 855 + }, + { + "epoch": 0.018026460569749873, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6455, + "step": 856 + }, + { + "epoch": 0.018047519519013597, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6619, + "step": 857 + }, + { + "epoch": 0.018068578468277325, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6174, + "step": 858 + }, + { + "epoch": 0.018089637417541053, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6331, + "step": 859 + }, + { + "epoch": 0.018110696366804777, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6318, + "step": 860 + }, + { + "epoch": 0.018131755316068504, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6585, + "step": 861 + }, + { + "epoch": 0.01815281426533223, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6266, + "step": 862 + }, + { + "epoch": 0.01817387321459596, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 863 + }, + { + "epoch": 0.018194932163859683, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6406, + "step": 864 + }, + { + "epoch": 0.01821599111312341, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6525, + "step": 865 + }, + { + "epoch": 0.01823705006238714, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6476, + "step": 866 + }, + { + "epoch": 0.018258109011650862, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6244, + "step": 867 + }, + { + "epoch": 0.01827916796091459, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5978, + "step": 868 + }, + { + "epoch": 0.018300226910178317, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6055, + "step": 869 + }, + { + "epoch": 0.01832128585944204, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6666, + "step": 870 + }, + { + "epoch": 0.01834234480870577, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5938, + "step": 871 + }, + { + "epoch": 0.018363403757969497, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6593, + "step": 872 + }, + { + "epoch": 0.018384462707233224, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.648, + "step": 873 + }, + { + "epoch": 0.018405521656496948, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6886, + "step": 874 + }, + { + "epoch": 0.018426580605760676, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6267, + "step": 875 + }, + { + "epoch": 0.018447639555024403, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6445, + "step": 876 + }, + { + "epoch": 0.018468698504288127, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6372, + "step": 877 + }, + { + "epoch": 0.018489757453551855, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6797, + "step": 878 + }, + { + "epoch": 0.018510816402815582, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6379, + "step": 879 + }, + { + "epoch": 0.018531875352079306, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6374, + "step": 880 + }, + { + "epoch": 0.018552934301343034, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6227, + "step": 881 + }, + { + "epoch": 0.01857399325060676, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6266, + "step": 882 + }, + { + "epoch": 0.01859505219987049, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.63, + "step": 883 + }, + { + "epoch": 0.018616111149134213, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6311, + "step": 884 + }, + { + "epoch": 0.01863717009839794, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 885 + }, + { + "epoch": 0.018658229047661668, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6624, + "step": 886 + }, + { + "epoch": 0.018679287996925392, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 887 + }, + { + "epoch": 0.01870034694618912, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6102, + "step": 888 + }, + { + "epoch": 0.018721405895452847, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6295, + "step": 889 + }, + { + "epoch": 0.01874246484471657, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 890 + }, + { + "epoch": 0.0187635237939803, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.628, + "step": 891 + }, + { + "epoch": 0.018784582743244026, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6459, + "step": 892 + }, + { + "epoch": 0.018805641692507754, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 893 + }, + { + "epoch": 0.018826700641771478, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 894 + }, + { + "epoch": 0.018847759591035206, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6387, + "step": 895 + }, + { + "epoch": 0.018868818540298933, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 896 + }, + { + "epoch": 0.018889877489562657, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 897 + }, + { + "epoch": 0.018910936438826385, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 898 + }, + { + "epoch": 0.018931995388090112, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6388, + "step": 899 + }, + { + "epoch": 0.018953054337353836, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.6062, + "step": 900 + }, + { + "epoch": 0.018974113286617564, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6045, + "step": 901 + }, + { + "epoch": 0.01899517223588129, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5999, + "step": 902 + }, + { + "epoch": 0.01901623118514502, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6442, + "step": 903 + }, + { + "epoch": 0.019037290134408743, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.648, + "step": 904 + }, + { + "epoch": 0.01905834908367247, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 905 + }, + { + "epoch": 0.019079408032936198, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 906 + }, + { + "epoch": 0.019100466982199922, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 907 + }, + { + "epoch": 0.01912152593146365, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6196, + "step": 908 + }, + { + "epoch": 0.019142584880727377, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.629, + "step": 909 + }, + { + "epoch": 0.0191636438299911, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.665, + "step": 910 + }, + { + "epoch": 0.01918470277925483, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6315, + "step": 911 + }, + { + "epoch": 0.019205761728518556, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 912 + }, + { + "epoch": 0.01922682067778228, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6372, + "step": 913 + }, + { + "epoch": 0.019247879627046008, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6359, + "step": 914 + }, + { + "epoch": 0.019268938576309735, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6585, + "step": 915 + }, + { + "epoch": 0.019289997525573463, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 916 + }, + { + "epoch": 0.019311056474837187, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6265, + "step": 917 + }, + { + "epoch": 0.019332115424100915, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6249, + "step": 918 + }, + { + "epoch": 0.019353174373364642, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6165, + "step": 919 + }, + { + "epoch": 0.019374233322628366, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.611, + "step": 920 + }, + { + "epoch": 0.019395292271892094, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6338, + "step": 921 + }, + { + "epoch": 0.01941635122115582, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6137, + "step": 922 + }, + { + "epoch": 0.019437410170419545, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6039, + "step": 923 + }, + { + "epoch": 0.019458469119683273, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 924 + }, + { + "epoch": 0.019479528068947, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6254, + "step": 925 + }, + { + "epoch": 0.019500587018210728, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6149, + "step": 926 + }, + { + "epoch": 0.019521645967474452, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6296, + "step": 927 + }, + { + "epoch": 0.01954270491673818, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 1.6434, + "step": 928 + }, + { + "epoch": 0.019563763866001907, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 1.6116, + "step": 929 + }, + { + "epoch": 0.01958482281526563, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6404, + "step": 930 + }, + { + "epoch": 0.01960588176452936, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 931 + }, + { + "epoch": 0.019626940713793086, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 932 + }, + { + "epoch": 0.01964799966305681, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 1.6564, + "step": 933 + }, + { + "epoch": 0.019669058612320538, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 934 + }, + { + "epoch": 0.019690117561584265, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.618, + "step": 935 + }, + { + "epoch": 0.019711176510847993, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6503, + "step": 936 + }, + { + "epoch": 0.019732235460111717, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6146, + "step": 937 + }, + { + "epoch": 0.019753294409375444, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6315, + "step": 938 + }, + { + "epoch": 0.019774353358639172, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6167, + "step": 939 + }, + { + "epoch": 0.019795412307902896, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5994, + "step": 940 + }, + { + "epoch": 0.019816471257166623, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6217, + "step": 941 + }, + { + "epoch": 0.01983753020643035, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6347, + "step": 942 + }, + { + "epoch": 0.019858589155694075, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6399, + "step": 943 + }, + { + "epoch": 0.019879648104957803, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6524, + "step": 944 + }, + { + "epoch": 0.01990070705422153, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6498, + "step": 945 + }, + { + "epoch": 0.019921766003485258, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6563, + "step": 946 + }, + { + "epoch": 0.019942824952748982, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6275, + "step": 947 + }, + { + "epoch": 0.01996388390201271, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6546, + "step": 948 + }, + { + "epoch": 0.019984942851276437, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6079, + "step": 949 + }, + { + "epoch": 0.02000600180054016, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 950 + }, + { + "epoch": 0.02002706074980389, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 951 + }, + { + "epoch": 0.020048119699067616, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 952 + }, + { + "epoch": 0.02006917864833134, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6926, + "step": 953 + }, + { + "epoch": 0.020090237597595068, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.644, + "step": 954 + }, + { + "epoch": 0.020111296546858795, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6402, + "step": 955 + }, + { + "epoch": 0.020132355496122523, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 956 + }, + { + "epoch": 0.020153414445386247, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6506, + "step": 957 + }, + { + "epoch": 0.020174473394649974, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6483, + "step": 958 + }, + { + "epoch": 0.020195532343913702, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6261, + "step": 959 + }, + { + "epoch": 0.020216591293177426, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6237, + "step": 960 + }, + { + "epoch": 0.020237650242441153, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6382, + "step": 961 + }, + { + "epoch": 0.02025870919170488, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6411, + "step": 962 + }, + { + "epoch": 0.020279768140968605, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6045, + "step": 963 + }, + { + "epoch": 0.020300827090232332, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 964 + }, + { + "epoch": 0.02032188603949606, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 965 + }, + { + "epoch": 0.020342944988759788, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6363, + "step": 966 + }, + { + "epoch": 0.02036400393802351, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6232, + "step": 967 + }, + { + "epoch": 0.02038506288728724, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6178, + "step": 968 + }, + { + "epoch": 0.020406121836550967, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6237, + "step": 969 + }, + { + "epoch": 0.02042718078581469, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6666, + "step": 970 + }, + { + "epoch": 0.02044823973507842, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.614, + "step": 971 + }, + { + "epoch": 0.020469298684342146, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 972 + }, + { + "epoch": 0.02049035763360587, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6401, + "step": 973 + }, + { + "epoch": 0.020511416582869597, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6113, + "step": 974 + }, + { + "epoch": 0.020532475532133325, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6171, + "step": 975 + }, + { + "epoch": 0.02055353448139705, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6284, + "step": 976 + }, + { + "epoch": 0.020574593430660777, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 977 + }, + { + "epoch": 0.020595652379924504, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 978 + }, + { + "epoch": 0.02061671132918823, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6277, + "step": 979 + }, + { + "epoch": 0.020637770278451956, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 980 + }, + { + "epoch": 0.020658829227715683, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5944, + "step": 981 + }, + { + "epoch": 0.02067988817697941, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6136, + "step": 982 + }, + { + "epoch": 0.020700947126243135, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 983 + }, + { + "epoch": 0.020722006075506862, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6533, + "step": 984 + }, + { + "epoch": 0.02074306502477059, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6294, + "step": 985 + }, + { + "epoch": 0.020764123974034314, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6164, + "step": 986 + }, + { + "epoch": 0.02078518292329804, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6284, + "step": 987 + }, + { + "epoch": 0.02080624187256177, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6287, + "step": 988 + }, + { + "epoch": 0.020827300821825497, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6626, + "step": 989 + }, + { + "epoch": 0.02084835977108922, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6201, + "step": 990 + }, + { + "epoch": 0.020869418720352948, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6195, + "step": 991 + }, + { + "epoch": 0.020890477669616676, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 992 + }, + { + "epoch": 0.0209115366188804, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6077, + "step": 993 + }, + { + "epoch": 0.020932595568144127, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6278, + "step": 994 + }, + { + "epoch": 0.020953654517407855, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6191, + "step": 995 + }, + { + "epoch": 0.02097471346667158, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 996 + }, + { + "epoch": 0.020995772415935306, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6431, + "step": 997 + }, + { + "epoch": 0.021016831365199034, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6593, + "step": 998 + }, + { + "epoch": 0.02103789031446276, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5955, + "step": 999 + }, + { + "epoch": 0.021058949263726486, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6325, + "step": 1000 + }, + { + "epoch": 0.021058949263726486, + "eval_loss": 1.6914067268371582, + "eval_runtime": 898.0675, + "eval_samples_per_second": 68.814, + "eval_steps_per_second": 2.151, + "step": 1000 + }, + { + "epoch": 0.021080008212990213, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 1001 + }, + { + "epoch": 0.02110106716225394, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6411, + "step": 1002 + }, + { + "epoch": 0.021122126111517665, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6576, + "step": 1003 + }, + { + "epoch": 0.021143185060781392, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6168, + "step": 1004 + }, + { + "epoch": 0.02116424401004512, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6233, + "step": 1005 + }, + { + "epoch": 0.021185302959308844, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 1006 + }, + { + "epoch": 0.02120636190857257, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 1007 + }, + { + "epoch": 0.0212274208578363, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6217, + "step": 1008 + }, + { + "epoch": 0.021248479807100026, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6352, + "step": 1009 + }, + { + "epoch": 0.02126953875636375, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6402, + "step": 1010 + }, + { + "epoch": 0.021290597705627478, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.6176, + "step": 1011 + }, + { + "epoch": 0.021311656654891206, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.622, + "step": 1012 + }, + { + "epoch": 0.02133271560415493, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6245, + "step": 1013 + }, + { + "epoch": 0.021353774553418657, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6714, + "step": 1014 + }, + { + "epoch": 0.021374833502682385, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6911, + "step": 1015 + }, + { + "epoch": 0.02139589245194611, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6083, + "step": 1016 + }, + { + "epoch": 0.021416951401209836, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6309, + "step": 1017 + }, + { + "epoch": 0.021438010350473564, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6159, + "step": 1018 + }, + { + "epoch": 0.02145906929973729, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 1019 + }, + { + "epoch": 0.021480128249001015, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 1020 + }, + { + "epoch": 0.021501187198264743, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6053, + "step": 1021 + }, + { + "epoch": 0.02152224614752847, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6189, + "step": 1022 + }, + { + "epoch": 0.021543305096792194, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.6064, + "step": 1023 + }, + { + "epoch": 0.021564364046055922, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6111, + "step": 1024 + }, + { + "epoch": 0.02158542299531965, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6346, + "step": 1025 + }, + { + "epoch": 0.021606481944583374, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 1026 + }, + { + "epoch": 0.0216275408938471, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6298, + "step": 1027 + }, + { + "epoch": 0.02164859984311083, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 1028 + }, + { + "epoch": 0.021669658792374553, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 1029 + }, + { + "epoch": 0.02169071774163828, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6476, + "step": 1030 + }, + { + "epoch": 0.021711776690902008, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5851, + "step": 1031 + }, + { + "epoch": 0.021732835640165735, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6288, + "step": 1032 + }, + { + "epoch": 0.02175389458942946, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 1033 + }, + { + "epoch": 0.021774953538693187, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5862, + "step": 1034 + }, + { + "epoch": 0.021796012487956914, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6431, + "step": 1035 + }, + { + "epoch": 0.02181707143722064, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1036 + }, + { + "epoch": 0.021838130386484366, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6364, + "step": 1037 + }, + { + "epoch": 0.021859189335748094, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 1038 + }, + { + "epoch": 0.021880248285011818, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6356, + "step": 1039 + }, + { + "epoch": 0.021901307234275545, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5813, + "step": 1040 + }, + { + "epoch": 0.021922366183539273, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6238, + "step": 1041 + }, + { + "epoch": 0.021943425132803, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6377, + "step": 1042 + }, + { + "epoch": 0.021964484082066724, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 1043 + }, + { + "epoch": 0.021985543031330452, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6066, + "step": 1044 + }, + { + "epoch": 0.02200660198059418, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6219, + "step": 1045 + }, + { + "epoch": 0.022027660929857903, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6233, + "step": 1046 + }, + { + "epoch": 0.02204871987912163, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6028, + "step": 1047 + }, + { + "epoch": 0.02206977882838536, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.62, + "step": 1048 + }, + { + "epoch": 0.022090837777649083, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6118, + "step": 1049 + }, + { + "epoch": 0.02211189672691281, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6278, + "step": 1050 + }, + { + "epoch": 0.022132955676176538, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.637, + "step": 1051 + }, + { + "epoch": 0.022154014625440265, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6312, + "step": 1052 + }, + { + "epoch": 0.02217507357470399, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 1053 + }, + { + "epoch": 0.022196132523967717, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6094, + "step": 1054 + }, + { + "epoch": 0.022217191473231444, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 1055 + }, + { + "epoch": 0.02223825042249517, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6495, + "step": 1056 + }, + { + "epoch": 0.022259309371758896, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 1057 + }, + { + "epoch": 0.022280368321022623, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6204, + "step": 1058 + }, + { + "epoch": 0.022301427270286348, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5997, + "step": 1059 + }, + { + "epoch": 0.022322486219550075, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6116, + "step": 1060 + }, + { + "epoch": 0.022343545168813803, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6374, + "step": 1061 + }, + { + "epoch": 0.02236460411807753, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 1062 + }, + { + "epoch": 0.022385663067341254, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6663, + "step": 1063 + }, + { + "epoch": 0.02240672201660498, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6162, + "step": 1064 + }, + { + "epoch": 0.02242778096586871, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 1065 + }, + { + "epoch": 0.022448839915132433, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6203, + "step": 1066 + }, + { + "epoch": 0.02246989886439616, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 1067 + }, + { + "epoch": 0.02249095781365989, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.619, + "step": 1068 + }, + { + "epoch": 0.022512016762923612, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6197, + "step": 1069 + }, + { + "epoch": 0.02253307571218734, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1070 + }, + { + "epoch": 0.022554134661451068, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 1071 + }, + { + "epoch": 0.022575193610714795, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6073, + "step": 1072 + }, + { + "epoch": 0.02259625255997852, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6481, + "step": 1073 + }, + { + "epoch": 0.022617311509242247, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6485, + "step": 1074 + }, + { + "epoch": 0.022638370458505974, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 1075 + }, + { + "epoch": 0.022659429407769698, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6267, + "step": 1076 + }, + { + "epoch": 0.022680488357033426, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6284, + "step": 1077 + }, + { + "epoch": 0.022701547306297153, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6177, + "step": 1078 + }, + { + "epoch": 0.022722606255560877, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6306, + "step": 1079 + }, + { + "epoch": 0.022743665204824605, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5567, + "step": 1080 + }, + { + "epoch": 0.022764724154088332, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 1081 + }, + { + "epoch": 0.02278578310335206, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6195, + "step": 1082 + }, + { + "epoch": 0.022806842052615784, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6194, + "step": 1083 + }, + { + "epoch": 0.02282790100187951, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6262, + "step": 1084 + }, + { + "epoch": 0.02284895995114324, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6283, + "step": 1085 + }, + { + "epoch": 0.022870018900406963, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6024, + "step": 1086 + }, + { + "epoch": 0.02289107784967069, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6494, + "step": 1087 + }, + { + "epoch": 0.022912136798934418, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.619, + "step": 1088 + }, + { + "epoch": 0.022933195748198142, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.625, + "step": 1089 + }, + { + "epoch": 0.02295425469746187, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5949, + "step": 1090 + }, + { + "epoch": 0.022975313646725597, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 1091 + }, + { + "epoch": 0.02299637259598932, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.6138, + "step": 1092 + }, + { + "epoch": 0.02301743154525305, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 1093 + }, + { + "epoch": 0.023038490494516776, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 1094 + }, + { + "epoch": 0.023059549443780504, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 1095 + }, + { + "epoch": 0.023080608393044228, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6176, + "step": 1096 + }, + { + "epoch": 0.023101667342307956, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6344, + "step": 1097 + }, + { + "epoch": 0.023122726291571683, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1098 + }, + { + "epoch": 0.023143785240835407, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5975, + "step": 1099 + }, + { + "epoch": 0.023164844190099135, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5918, + "step": 1100 + }, + { + "epoch": 0.023185903139362862, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6169, + "step": 1101 + }, + { + "epoch": 0.023206962088626586, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6261, + "step": 1102 + }, + { + "epoch": 0.023228021037890314, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.623, + "step": 1103 + }, + { + "epoch": 0.02324907998715404, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6211, + "step": 1104 + }, + { + "epoch": 0.02327013893641777, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 1105 + }, + { + "epoch": 0.023291197885681493, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.5897, + "step": 1106 + }, + { + "epoch": 0.02331225683494522, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6222, + "step": 1107 + }, + { + "epoch": 0.023333315784208948, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6309, + "step": 1108 + }, + { + "epoch": 0.023354374733472672, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 1109 + }, + { + "epoch": 0.0233754336827364, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 1110 + }, + { + "epoch": 0.023396492632000127, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6505, + "step": 1111 + }, + { + "epoch": 0.02341755158126385, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6487, + "step": 1112 + }, + { + "epoch": 0.02343861053052758, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 1.6249, + "step": 1113 + }, + { + "epoch": 0.023459669479791306, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6512, + "step": 1114 + }, + { + "epoch": 0.023480728429055034, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 1115 + }, + { + "epoch": 0.023501787378318758, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6267, + "step": 1116 + }, + { + "epoch": 0.023522846327582485, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 1117 + }, + { + "epoch": 0.023543905276846213, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 1118 + }, + { + "epoch": 0.023564964226109937, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 1119 + }, + { + "epoch": 0.023586023175373665, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 1120 + }, + { + "epoch": 0.023607082124637392, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 1121 + }, + { + "epoch": 0.023628141073901116, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 1122 + }, + { + "epoch": 0.023649200023164844, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6217, + "step": 1123 + }, + { + "epoch": 0.02367025897242857, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5911, + "step": 1124 + }, + { + "epoch": 0.0236913179216923, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 1125 + }, + { + "epoch": 0.023712376870956023, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 1126 + }, + { + "epoch": 0.02373343582021975, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6247, + "step": 1127 + }, + { + "epoch": 0.023754494769483478, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5967, + "step": 1128 + }, + { + "epoch": 0.023775553718747202, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1129 + }, + { + "epoch": 0.02379661266801093, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6135, + "step": 1130 + }, + { + "epoch": 0.023817671617274657, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 1131 + }, + { + "epoch": 0.02383873056653838, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 1132 + }, + { + "epoch": 0.02385978951580211, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 1133 + }, + { + "epoch": 0.023880848465065836, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6437, + "step": 1134 + }, + { + "epoch": 0.023901907414329564, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5916, + "step": 1135 + }, + { + "epoch": 0.023922966363593288, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6119, + "step": 1136 + }, + { + "epoch": 0.023944025312857015, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6003, + "step": 1137 + }, + { + "epoch": 0.023965084262120743, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6303, + "step": 1138 + }, + { + "epoch": 0.023986143211384467, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6294, + "step": 1139 + }, + { + "epoch": 0.024007202160648194, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6217, + "step": 1140 + }, + { + "epoch": 0.024028261109911922, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6383, + "step": 1141 + }, + { + "epoch": 0.024049320059175646, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 1142 + }, + { + "epoch": 0.024070379008439374, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6204, + "step": 1143 + }, + { + "epoch": 0.0240914379577031, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6054, + "step": 1144 + }, + { + "epoch": 0.024112496906966825, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6421, + "step": 1145 + }, + { + "epoch": 0.024133555856230553, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 1146 + }, + { + "epoch": 0.02415461480549428, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6592, + "step": 1147 + }, + { + "epoch": 0.024175673754758008, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6276, + "step": 1148 + }, + { + "epoch": 0.024196732704021732, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 1149 + }, + { + "epoch": 0.02421779165328546, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6242, + "step": 1150 + }, + { + "epoch": 0.024238850602549187, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.638, + "step": 1151 + }, + { + "epoch": 0.02425990955181291, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.599, + "step": 1152 + }, + { + "epoch": 0.02428096850107664, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.605, + "step": 1153 + }, + { + "epoch": 0.024302027450340366, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5756, + "step": 1154 + }, + { + "epoch": 0.02432308639960409, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.633, + "step": 1155 + }, + { + "epoch": 0.024344145348867818, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 1156 + }, + { + "epoch": 0.024365204298131545, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6181, + "step": 1157 + }, + { + "epoch": 0.024386263247395273, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 1158 + }, + { + "epoch": 0.024407322196658997, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5906, + "step": 1159 + }, + { + "epoch": 0.024428381145922724, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6298, + "step": 1160 + }, + { + "epoch": 0.024449440095186452, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5965, + "step": 1161 + }, + { + "epoch": 0.024470499044450176, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 1162 + }, + { + "epoch": 0.024491557993713903, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 1163 + }, + { + "epoch": 0.02451261694297763, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6158, + "step": 1164 + }, + { + "epoch": 0.024533675892241355, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6029, + "step": 1165 + }, + { + "epoch": 0.024554734841505083, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6462, + "step": 1166 + }, + { + "epoch": 0.02457579379076881, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6016, + "step": 1167 + }, + { + "epoch": 0.024596852740032538, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6033, + "step": 1168 + }, + { + "epoch": 0.02461791168929626, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6193, + "step": 1169 + }, + { + "epoch": 0.02463897063855999, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 1170 + }, + { + "epoch": 0.024660029587823717, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6041, + "step": 1171 + }, + { + "epoch": 0.02468108853708744, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6419, + "step": 1172 + }, + { + "epoch": 0.02470214748635117, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5985, + "step": 1173 + }, + { + "epoch": 0.024723206435614896, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6452, + "step": 1174 + }, + { + "epoch": 0.02474426538487862, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6152, + "step": 1175 + }, + { + "epoch": 0.024765324334142347, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5864, + "step": 1176 + }, + { + "epoch": 0.024786383283406075, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.6626, + "step": 1177 + }, + { + "epoch": 0.024807442232669803, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 1178 + }, + { + "epoch": 0.024828501181933527, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.636, + "step": 1179 + }, + { + "epoch": 0.024849560131197254, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 1180 + }, + { + "epoch": 0.02487061908046098, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 1.5912, + "step": 1181 + }, + { + "epoch": 0.024891678029724706, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6055, + "step": 1182 + }, + { + "epoch": 0.024912736978988433, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6034, + "step": 1183 + }, + { + "epoch": 0.02493379592825216, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6055, + "step": 1184 + }, + { + "epoch": 0.024954854877515885, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 1185 + }, + { + "epoch": 0.024975913826779612, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.6475, + "step": 1186 + }, + { + "epoch": 0.02499697277604334, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.6437, + "step": 1187 + }, + { + "epoch": 0.025018031725307067, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 1188 + }, + { + "epoch": 0.02503909067457079, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 1189 + }, + { + "epoch": 0.02506014962383452, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6502, + "step": 1190 + }, + { + "epoch": 0.025081208573098247, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.597, + "step": 1191 + }, + { + "epoch": 0.02510226752236197, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 1192 + }, + { + "epoch": 0.025123326471625698, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 1193 + }, + { + "epoch": 0.025144385420889426, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6186, + "step": 1194 + }, + { + "epoch": 0.02516544437015315, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.568, + "step": 1195 + }, + { + "epoch": 0.025186503319416877, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6266, + "step": 1196 + }, + { + "epoch": 0.025207562268680605, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.635, + "step": 1197 + }, + { + "epoch": 0.02522862121794433, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6003, + "step": 1198 + }, + { + "epoch": 0.025249680167208056, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6364, + "step": 1199 + }, + { + "epoch": 0.025270739116471784, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5939, + "step": 1200 + }, + { + "epoch": 0.02529179806573551, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6229, + "step": 1201 + }, + { + "epoch": 0.025312857014999236, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.583, + "step": 1202 + }, + { + "epoch": 0.025333915964262963, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 1203 + }, + { + "epoch": 0.02535497491352669, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.581, + "step": 1204 + }, + { + "epoch": 0.025376033862790415, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.594, + "step": 1205 + }, + { + "epoch": 0.025397092812054142, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6383, + "step": 1206 + }, + { + "epoch": 0.02541815176131787, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5929, + "step": 1207 + }, + { + "epoch": 0.025439210710581594, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.636, + "step": 1208 + }, + { + "epoch": 0.02546026965984532, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6064, + "step": 1209 + }, + { + "epoch": 0.02548132860910905, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.619, + "step": 1210 + }, + { + "epoch": 0.025502387558372776, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6399, + "step": 1211 + }, + { + "epoch": 0.0255234465076365, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6246, + "step": 1212 + }, + { + "epoch": 0.025544505456900228, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 1213 + }, + { + "epoch": 0.025565564406163956, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6077, + "step": 1214 + }, + { + "epoch": 0.02558662335542768, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6345, + "step": 1215 + }, + { + "epoch": 0.025607682304691407, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6388, + "step": 1216 + }, + { + "epoch": 0.025628741253955135, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5735, + "step": 1217 + }, + { + "epoch": 0.02564980020321886, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6032, + "step": 1218 + }, + { + "epoch": 0.025670859152482586, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 1219 + }, + { + "epoch": 0.025691918101746314, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6351, + "step": 1220 + }, + { + "epoch": 0.02571297705101004, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 1221 + }, + { + "epoch": 0.025734036000273765, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6674, + "step": 1222 + }, + { + "epoch": 0.025755094949537493, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6306, + "step": 1223 + }, + { + "epoch": 0.02577615389880122, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5934, + "step": 1224 + }, + { + "epoch": 0.025797212848064945, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 1225 + }, + { + "epoch": 0.025818271797328672, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5858, + "step": 1226 + }, + { + "epoch": 0.0258393307465924, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.597, + "step": 1227 + }, + { + "epoch": 0.025860389695856124, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5878, + "step": 1228 + }, + { + "epoch": 0.02588144864511985, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6001, + "step": 1229 + }, + { + "epoch": 0.02590250759438358, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6302, + "step": 1230 + }, + { + "epoch": 0.025923566543647306, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6049, + "step": 1231 + }, + { + "epoch": 0.02594462549291103, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.6234, + "step": 1232 + }, + { + "epoch": 0.025965684442174758, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 1233 + }, + { + "epoch": 0.025986743391438485, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6165, + "step": 1234 + }, + { + "epoch": 0.02600780234070221, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 1235 + }, + { + "epoch": 0.026028861289965937, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 1236 + }, + { + "epoch": 0.026049920239229665, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 1237 + }, + { + "epoch": 0.02607097918849339, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 1238 + }, + { + "epoch": 0.026092038137757116, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6186, + "step": 1239 + }, + { + "epoch": 0.026113097087020844, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.5955, + "step": 1240 + }, + { + "epoch": 0.02613415603628457, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6197, + "step": 1241 + }, + { + "epoch": 0.026155214985548295, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 1242 + }, + { + "epoch": 0.026176273934812023, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6218, + "step": 1243 + }, + { + "epoch": 0.02619733288407575, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5983, + "step": 1244 + }, + { + "epoch": 0.026218391833339474, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5949, + "step": 1245 + }, + { + "epoch": 0.026239450782603202, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.592, + "step": 1246 + }, + { + "epoch": 0.02626050973186693, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 1247 + }, + { + "epoch": 0.026281568681130654, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5977, + "step": 1248 + }, + { + "epoch": 0.02630262763039438, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6097, + "step": 1249 + }, + { + "epoch": 0.02632368657965811, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5939, + "step": 1250 + }, + { + "epoch": 0.026344745528921836, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6219, + "step": 1251 + }, + { + "epoch": 0.02636580447818556, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6273, + "step": 1252 + }, + { + "epoch": 0.026386863427449288, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6068, + "step": 1253 + }, + { + "epoch": 0.026407922376713015, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 1254 + }, + { + "epoch": 0.02642898132597674, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.635, + "step": 1255 + }, + { + "epoch": 0.026450040275240467, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.62, + "step": 1256 + }, + { + "epoch": 0.026471099224504194, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 1257 + }, + { + "epoch": 0.02649215817376792, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6102, + "step": 1258 + }, + { + "epoch": 0.026513217123031646, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5821, + "step": 1259 + }, + { + "epoch": 0.026534276072295374, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.605, + "step": 1260 + }, + { + "epoch": 0.026555335021559098, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6108, + "step": 1261 + }, + { + "epoch": 0.026576393970822825, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6175, + "step": 1262 + }, + { + "epoch": 0.026597452920086553, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 1263 + }, + { + "epoch": 0.02661851186935028, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 1264 + }, + { + "epoch": 0.026639570818614004, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 1265 + }, + { + "epoch": 0.026660629767877732, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6079, + "step": 1266 + }, + { + "epoch": 0.02668168871714146, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 1267 + }, + { + "epoch": 0.026702747666405183, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.622, + "step": 1268 + }, + { + "epoch": 0.02672380661566891, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6231, + "step": 1269 + }, + { + "epoch": 0.02674486556493264, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.6189, + "step": 1270 + }, + { + "epoch": 0.026765924514196363, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6342, + "step": 1271 + }, + { + "epoch": 0.02678698346346009, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6329, + "step": 1272 + }, + { + "epoch": 0.026808042412723818, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.6079, + "step": 1273 + }, + { + "epoch": 0.026829101361987545, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6079, + "step": 1274 + }, + { + "epoch": 0.02685016031125127, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6479, + "step": 1275 + }, + { + "epoch": 0.026871219260514997, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 1276 + }, + { + "epoch": 0.026892278209778724, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6283, + "step": 1277 + }, + { + "epoch": 0.02691333715904245, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 1278 + }, + { + "epoch": 0.026934396108306176, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6379, + "step": 1279 + }, + { + "epoch": 0.026955455057569903, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6033, + "step": 1280 + }, + { + "epoch": 0.026976514006833627, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5896, + "step": 1281 + }, + { + "epoch": 0.026997572956097355, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5876, + "step": 1282 + }, + { + "epoch": 0.027018631905361083, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.6024, + "step": 1283 + }, + { + "epoch": 0.02703969085462481, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6562, + "step": 1284 + }, + { + "epoch": 0.027060749803888534, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 1285 + }, + { + "epoch": 0.02708180875315226, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6031, + "step": 1286 + }, + { + "epoch": 0.02710286770241599, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5868, + "step": 1287 + }, + { + "epoch": 0.027123926651679713, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1288 + }, + { + "epoch": 0.02714498560094344, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5953, + "step": 1289 + }, + { + "epoch": 0.02716604455020717, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 1290 + }, + { + "epoch": 0.027187103499470892, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6007, + "step": 1291 + }, + { + "epoch": 0.02720816244873462, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 1292 + }, + { + "epoch": 0.027229221397998347, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6411, + "step": 1293 + }, + { + "epoch": 0.027250280347262075, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 1294 + }, + { + "epoch": 0.0272713392965258, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.6284, + "step": 1295 + }, + { + "epoch": 0.027292398245789527, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 1296 + }, + { + "epoch": 0.027313457195053254, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6197, + "step": 1297 + }, + { + "epoch": 0.027334516144316978, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6282, + "step": 1298 + }, + { + "epoch": 0.027355575093580706, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5901, + "step": 1299 + }, + { + "epoch": 0.027376634042844433, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6165, + "step": 1300 + }, + { + "epoch": 0.027397692992108157, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 1.6153, + "step": 1301 + }, + { + "epoch": 0.027418751941371885, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 1302 + }, + { + "epoch": 0.027439810890635612, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 1303 + }, + { + "epoch": 0.02746086983989934, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6059, + "step": 1304 + }, + { + "epoch": 0.027481928789163064, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 1305 + }, + { + "epoch": 0.02750298773842679, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.623, + "step": 1306 + }, + { + "epoch": 0.02752404668769052, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 1307 + }, + { + "epoch": 0.027545105636954243, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 1.6319, + "step": 1308 + }, + { + "epoch": 0.02756616458621797, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6179, + "step": 1309 + }, + { + "epoch": 0.027587223535481698, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.618, + "step": 1310 + }, + { + "epoch": 0.027608282484745422, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6211, + "step": 1311 + }, + { + "epoch": 0.02762934143400915, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6086, + "step": 1312 + }, + { + "epoch": 0.027650400383272877, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6581, + "step": 1313 + }, + { + "epoch": 0.0276714593325366, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.578, + "step": 1314 + }, + { + "epoch": 0.02769251828180033, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6041, + "step": 1315 + }, + { + "epoch": 0.027713577231064056, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5705, + "step": 1316 + }, + { + "epoch": 0.027734636180327784, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6264, + "step": 1317 + }, + { + "epoch": 0.027755695129591508, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6015, + "step": 1318 + }, + { + "epoch": 0.027776754078855236, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 1319 + }, + { + "epoch": 0.027797813028118963, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6443, + "step": 1320 + }, + { + "epoch": 0.027818871977382687, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.6177, + "step": 1321 + }, + { + "epoch": 0.027839930926646415, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6113, + "step": 1322 + }, + { + "epoch": 0.027860989875910142, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5953, + "step": 1323 + }, + { + "epoch": 0.027882048825173866, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.613, + "step": 1324 + }, + { + "epoch": 0.027903107774437594, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6342, + "step": 1325 + }, + { + "epoch": 0.02792416672370132, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6219, + "step": 1326 + }, + { + "epoch": 0.02794522567296505, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6043, + "step": 1327 + }, + { + "epoch": 0.027966284622228773, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 1328 + }, + { + "epoch": 0.0279873435714925, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 1329 + }, + { + "epoch": 0.028008402520756228, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 1330 + }, + { + "epoch": 0.028029461470019952, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6226, + "step": 1331 + }, + { + "epoch": 0.02805052041928368, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6226, + "step": 1332 + }, + { + "epoch": 0.028071579368547407, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6256, + "step": 1333 + }, + { + "epoch": 0.02809263831781113, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 1.6391, + "step": 1334 + }, + { + "epoch": 0.02811369726707486, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6069, + "step": 1335 + }, + { + "epoch": 0.028134756216338586, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 1336 + }, + { + "epoch": 0.028155815165602314, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5944, + "step": 1337 + }, + { + "epoch": 0.028176874114866038, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 1338 + }, + { + "epoch": 0.028197933064129765, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5985, + "step": 1339 + }, + { + "epoch": 0.028218992013393493, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6352, + "step": 1340 + }, + { + "epoch": 0.028240050962657217, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 1341 + }, + { + "epoch": 0.028261109911920945, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.6032, + "step": 1342 + }, + { + "epoch": 0.028282168861184672, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.6072, + "step": 1343 + }, + { + "epoch": 0.028303227810448396, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 1344 + }, + { + "epoch": 0.028324286759712124, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6416, + "step": 1345 + }, + { + "epoch": 0.02834534570897585, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.623, + "step": 1346 + }, + { + "epoch": 0.02836640465823958, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 1347 + }, + { + "epoch": 0.028387463607503303, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5952, + "step": 1348 + }, + { + "epoch": 0.02840852255676703, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6117, + "step": 1349 + }, + { + "epoch": 0.028429581506030758, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6349, + "step": 1350 + }, + { + "epoch": 0.028450640455294482, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 1351 + }, + { + "epoch": 0.02847169940455821, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 1352 + }, + { + "epoch": 0.028492758353821937, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6404, + "step": 1353 + }, + { + "epoch": 0.02851381730308566, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 1354 + }, + { + "epoch": 0.02853487625234939, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5958, + "step": 1355 + }, + { + "epoch": 0.028555935201613116, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6203, + "step": 1356 + }, + { + "epoch": 0.028576994150876844, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 1357 + }, + { + "epoch": 0.028598053100140568, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6162, + "step": 1358 + }, + { + "epoch": 0.028619112049404295, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6322, + "step": 1359 + }, + { + "epoch": 0.028640170998668023, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.626, + "step": 1360 + }, + { + "epoch": 0.028661229947931747, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.594, + "step": 1361 + }, + { + "epoch": 0.028682288897195474, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.565, + "step": 1362 + }, + { + "epoch": 0.028703347846459202, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6193, + "step": 1363 + }, + { + "epoch": 0.028724406795722926, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.622, + "step": 1364 + }, + { + "epoch": 0.028745465744986654, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 1365 + }, + { + "epoch": 0.02876652469425038, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6126, + "step": 1366 + }, + { + "epoch": 0.02878758364351411, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6159, + "step": 1367 + }, + { + "epoch": 0.028808642592777833, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6521, + "step": 1368 + }, + { + "epoch": 0.02882970154204156, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6255, + "step": 1369 + }, + { + "epoch": 0.028850760491305288, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5906, + "step": 1370 + }, + { + "epoch": 0.028871819440569012, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6229, + "step": 1371 + }, + { + "epoch": 0.02889287838983274, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 1372 + }, + { + "epoch": 0.028913937339096467, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.5872, + "step": 1373 + }, + { + "epoch": 0.02893499628836019, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 1374 + }, + { + "epoch": 0.02895605523762392, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6029, + "step": 1375 + }, + { + "epoch": 0.028977114186887646, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6159, + "step": 1376 + }, + { + "epoch": 0.02899817313615137, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6296, + "step": 1377 + }, + { + "epoch": 0.029019232085415098, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6194, + "step": 1378 + }, + { + "epoch": 0.029040291034678825, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6062, + "step": 1379 + }, + { + "epoch": 0.029061349983942553, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6184, + "step": 1380 + }, + { + "epoch": 0.029082408933206277, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.6045, + "step": 1381 + }, + { + "epoch": 0.029103467882470004, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6155, + "step": 1382 + }, + { + "epoch": 0.029124526831733732, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.615, + "step": 1383 + }, + { + "epoch": 0.029145585780997456, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.618, + "step": 1384 + }, + { + "epoch": 0.029166644730261183, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5576, + "step": 1385 + }, + { + "epoch": 0.02918770367952491, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6689, + "step": 1386 + }, + { + "epoch": 0.029208762628788635, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 1387 + }, + { + "epoch": 0.029229821578052363, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6104, + "step": 1388 + }, + { + "epoch": 0.02925088052731609, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 1389 + }, + { + "epoch": 0.029271939476579818, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5843, + "step": 1390 + }, + { + "epoch": 0.02929299842584354, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 1391 + }, + { + "epoch": 0.02931405737510727, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.616, + "step": 1392 + }, + { + "epoch": 0.029335116324370997, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6148, + "step": 1393 + }, + { + "epoch": 0.02935617527363472, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6599, + "step": 1394 + }, + { + "epoch": 0.02937723422289845, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.6011, + "step": 1395 + }, + { + "epoch": 0.029398293172162176, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 1396 + }, + { + "epoch": 0.0294193521214259, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 1397 + }, + { + "epoch": 0.029440411070689627, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6078, + "step": 1398 + }, + { + "epoch": 0.029461470019953355, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6393, + "step": 1399 + }, + { + "epoch": 0.029482528969217082, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6385, + "step": 1400 + }, + { + "epoch": 0.029503587918480807, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 1401 + }, + { + "epoch": 0.029524646867744534, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.636, + "step": 1402 + }, + { + "epoch": 0.02954570581700826, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5728, + "step": 1403 + }, + { + "epoch": 0.029566764766271986, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6191, + "step": 1404 + }, + { + "epoch": 0.029587823715535713, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6167, + "step": 1405 + }, + { + "epoch": 0.02960888266479944, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.619, + "step": 1406 + }, + { + "epoch": 0.029629941614063165, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6011, + "step": 1407 + }, + { + "epoch": 0.029651000563326892, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 1408 + }, + { + "epoch": 0.02967205951259062, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6077, + "step": 1409 + }, + { + "epoch": 0.029693118461854347, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6238, + "step": 1410 + }, + { + "epoch": 0.02971417741111807, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6273, + "step": 1411 + }, + { + "epoch": 0.0297352363603818, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6019, + "step": 1412 + }, + { + "epoch": 0.029756295309645527, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5945, + "step": 1413 + }, + { + "epoch": 0.02977735425890925, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.637, + "step": 1414 + }, + { + "epoch": 0.029798413208172978, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5983, + "step": 1415 + }, + { + "epoch": 0.029819472157436706, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 1416 + }, + { + "epoch": 0.02984053110670043, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 1417 + }, + { + "epoch": 0.029861590055964157, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.614, + "step": 1418 + }, + { + "epoch": 0.029882649005227885, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6096, + "step": 1419 + }, + { + "epoch": 0.029903707954491612, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5942, + "step": 1420 + }, + { + "epoch": 0.029924766903755336, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6093, + "step": 1421 + }, + { + "epoch": 0.029945825853019064, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.6512, + "step": 1422 + }, + { + "epoch": 0.02996688480228279, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6429, + "step": 1423 + }, + { + "epoch": 0.029987943751546516, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5726, + "step": 1424 + }, + { + "epoch": 0.030009002700810243, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5582, + "step": 1425 + }, + { + "epoch": 0.03003006165007397, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 1426 + }, + { + "epoch": 0.030051120599337695, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6457, + "step": 1427 + }, + { + "epoch": 0.030072179548601422, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6293, + "step": 1428 + }, + { + "epoch": 0.03009323849786515, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 1429 + }, + { + "epoch": 0.030114297447128874, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 1430 + }, + { + "epoch": 0.0301353563963926, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 1431 + }, + { + "epoch": 0.03015641534565633, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 1432 + }, + { + "epoch": 0.030177474294920056, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5892, + "step": 1433 + }, + { + "epoch": 0.03019853324418378, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6294, + "step": 1434 + }, + { + "epoch": 0.030219592193447508, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6103, + "step": 1435 + }, + { + "epoch": 0.030240651142711236, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6137, + "step": 1436 + }, + { + "epoch": 0.03026171009197496, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6093, + "step": 1437 + }, + { + "epoch": 0.030282769041238687, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 1438 + }, + { + "epoch": 0.030303827990502415, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6133, + "step": 1439 + }, + { + "epoch": 0.03032488693976614, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5965, + "step": 1440 + }, + { + "epoch": 0.030345945889029866, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.651, + "step": 1441 + }, + { + "epoch": 0.030367004838293594, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 1442 + }, + { + "epoch": 0.03038806378755732, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 1443 + }, + { + "epoch": 0.030409122736821045, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 1444 + }, + { + "epoch": 0.030430181686084773, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5967, + "step": 1445 + }, + { + "epoch": 0.0304512406353485, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6281, + "step": 1446 + }, + { + "epoch": 0.030472299584612225, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 1447 + }, + { + "epoch": 0.030493358533875952, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 1448 + }, + { + "epoch": 0.03051441748313968, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6102, + "step": 1449 + }, + { + "epoch": 0.030535476432403404, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6239, + "step": 1450 + }, + { + "epoch": 0.03055653538166713, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6007, + "step": 1451 + }, + { + "epoch": 0.03057759433093086, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6162, + "step": 1452 + }, + { + "epoch": 0.030598653280194586, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 1453 + }, + { + "epoch": 0.03061971222945831, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6057, + "step": 1454 + }, + { + "epoch": 0.030640771178722038, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6431, + "step": 1455 + }, + { + "epoch": 0.030661830127985765, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6073, + "step": 1456 + }, + { + "epoch": 0.03068288907724949, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6251, + "step": 1457 + }, + { + "epoch": 0.030703948026513217, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6312, + "step": 1458 + }, + { + "epoch": 0.030725006975776945, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6331, + "step": 1459 + }, + { + "epoch": 0.03074606592504067, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6186, + "step": 1460 + }, + { + "epoch": 0.030767124874304396, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5831, + "step": 1461 + }, + { + "epoch": 0.030788183823568124, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 1462 + }, + { + "epoch": 0.03080924277283185, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 1463 + }, + { + "epoch": 0.030830301722095575, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 1.58, + "step": 1464 + }, + { + "epoch": 0.030851360671359303, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 1465 + }, + { + "epoch": 0.03087241962062303, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.5986, + "step": 1466 + }, + { + "epoch": 0.030893478569886754, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6344, + "step": 1467 + }, + { + "epoch": 0.030914537519150482, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 1468 + }, + { + "epoch": 0.03093559646841421, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6097, + "step": 1469 + }, + { + "epoch": 0.030956655417677933, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5944, + "step": 1470 + }, + { + "epoch": 0.03097771436694166, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6254, + "step": 1471 + }, + { + "epoch": 0.03099877331620539, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5912, + "step": 1472 + }, + { + "epoch": 0.031019832265469116, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6188, + "step": 1473 + }, + { + "epoch": 0.03104089121473284, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.633, + "step": 1474 + }, + { + "epoch": 0.031061950163996568, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5669, + "step": 1475 + }, + { + "epoch": 0.031083009113260295, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6276, + "step": 1476 + }, + { + "epoch": 0.03110406806252402, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6197, + "step": 1477 + }, + { + "epoch": 0.031125127011787747, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5737, + "step": 1478 + }, + { + "epoch": 0.031146185961051474, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6145, + "step": 1479 + }, + { + "epoch": 0.0311672449103152, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 1480 + }, + { + "epoch": 0.031188303859578926, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 1481 + }, + { + "epoch": 0.031209362808842653, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.631, + "step": 1482 + }, + { + "epoch": 0.031230421758106378, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5815, + "step": 1483 + }, + { + "epoch": 0.031251480707370105, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5871, + "step": 1484 + }, + { + "epoch": 0.03127253965663383, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6116, + "step": 1485 + }, + { + "epoch": 0.03129359860589756, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 1486 + }, + { + "epoch": 0.03131465755516129, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6043, + "step": 1487 + }, + { + "epoch": 0.03133571650442501, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6482, + "step": 1488 + }, + { + "epoch": 0.031356775453688736, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 1489 + }, + { + "epoch": 0.03137783440295246, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.5916, + "step": 1490 + }, + { + "epoch": 0.03139889335221619, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.605, + "step": 1491 + }, + { + "epoch": 0.03141995230147992, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6277, + "step": 1492 + }, + { + "epoch": 0.031441011250743646, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.629, + "step": 1493 + }, + { + "epoch": 0.031462070200007373, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6005, + "step": 1494 + }, + { + "epoch": 0.031483129149271094, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 1495 + }, + { + "epoch": 0.03150418809853482, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.59, + "step": 1496 + }, + { + "epoch": 0.03152524704779855, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6138, + "step": 1497 + }, + { + "epoch": 0.03154630599706228, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6167, + "step": 1498 + }, + { + "epoch": 0.031567364946326004, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6102, + "step": 1499 + }, + { + "epoch": 0.03158842389558973, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 1500 + }, + { + "epoch": 0.03158842389558973, + "eval_loss": 1.7313117980957031, + "eval_runtime": 898.7214, + "eval_samples_per_second": 68.764, + "eval_steps_per_second": 2.15, + "step": 1500 + }, + { + "epoch": 0.03160948284485346, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5911, + "step": 1501 + }, + { + "epoch": 0.03163054179411718, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 1502 + }, + { + "epoch": 0.03165160074338091, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6045, + "step": 1503 + }, + { + "epoch": 0.031672659692644635, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 1504 + }, + { + "epoch": 0.03169371864190836, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5942, + "step": 1505 + }, + { + "epoch": 0.03171477759117209, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6136, + "step": 1506 + }, + { + "epoch": 0.03173583654043582, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 1507 + }, + { + "epoch": 0.03175689548969954, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5827, + "step": 1508 + }, + { + "epoch": 0.031777954438963266, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6446, + "step": 1509 + }, + { + "epoch": 0.03179901338822699, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 1510 + }, + { + "epoch": 0.03182007233749072, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 1511 + }, + { + "epoch": 0.03184113128675445, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6117, + "step": 1512 + }, + { + "epoch": 0.031862190236018176, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6005, + "step": 1513 + }, + { + "epoch": 0.0318832491852819, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.626, + "step": 1514 + }, + { + "epoch": 0.031904308134545624, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 1515 + }, + { + "epoch": 0.03192536708380935, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5971, + "step": 1516 + }, + { + "epoch": 0.03194642603307308, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6177, + "step": 1517 + }, + { + "epoch": 0.031967484982336807, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.615, + "step": 1518 + }, + { + "epoch": 0.031988543931600534, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.5933, + "step": 1519 + }, + { + "epoch": 0.03200960288086426, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5807, + "step": 1520 + }, + { + "epoch": 0.03203066183012799, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6462, + "step": 1521 + }, + { + "epoch": 0.03205172077939171, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6035, + "step": 1522 + }, + { + "epoch": 0.03207277972865544, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5976, + "step": 1523 + }, + { + "epoch": 0.032093838677919165, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5999, + "step": 1524 + }, + { + "epoch": 0.03211489762718289, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6359, + "step": 1525 + }, + { + "epoch": 0.03213595657644662, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6127, + "step": 1526 + }, + { + "epoch": 0.03215701552571035, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 1527 + }, + { + "epoch": 0.03217807447497407, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 1528 + }, + { + "epoch": 0.032199133424237796, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5891, + "step": 1529 + }, + { + "epoch": 0.03222019237350152, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 1530 + }, + { + "epoch": 0.03224125132276525, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6025, + "step": 1531 + }, + { + "epoch": 0.03226231027202898, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 1532 + }, + { + "epoch": 0.032283369221292706, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6418, + "step": 1533 + }, + { + "epoch": 0.03230442817055643, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 1534 + }, + { + "epoch": 0.032325487119820154, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6064, + "step": 1535 + }, + { + "epoch": 0.03234654606908388, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6064, + "step": 1536 + }, + { + "epoch": 0.03236760501834761, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6277, + "step": 1537 + }, + { + "epoch": 0.032388663967611336, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5879, + "step": 1538 + }, + { + "epoch": 0.032409722916875064, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6506, + "step": 1539 + }, + { + "epoch": 0.03243078186613879, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 1540 + }, + { + "epoch": 0.03245184081540251, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 1541 + }, + { + "epoch": 0.03247289976466624, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6056, + "step": 1542 + }, + { + "epoch": 0.03249395871392997, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 1543 + }, + { + "epoch": 0.032515017663193695, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 1544 + }, + { + "epoch": 0.03253607661245742, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 1545 + }, + { + "epoch": 0.03255713556172115, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.628, + "step": 1546 + }, + { + "epoch": 0.03257819451098488, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.6171, + "step": 1547 + }, + { + "epoch": 0.0325992534602486, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5599, + "step": 1548 + }, + { + "epoch": 0.032620312409512325, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.574, + "step": 1549 + }, + { + "epoch": 0.03264137135877605, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5811, + "step": 1550 + }, + { + "epoch": 0.03266243030803978, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5824, + "step": 1551 + }, + { + "epoch": 0.03268348925730351, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6265, + "step": 1552 + }, + { + "epoch": 0.032704548206567235, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6438, + "step": 1553 + }, + { + "epoch": 0.03272560715583096, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6078, + "step": 1554 + }, + { + "epoch": 0.032746666105094684, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 1555 + }, + { + "epoch": 0.03276772505435841, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.5757, + "step": 1556 + }, + { + "epoch": 0.03278878400362214, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6165, + "step": 1557 + }, + { + "epoch": 0.032809842952885866, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6161, + "step": 1558 + }, + { + "epoch": 0.032830901902149594, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6304, + "step": 1559 + }, + { + "epoch": 0.03285196085141332, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6509, + "step": 1560 + }, + { + "epoch": 0.03287301980067704, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 1561 + }, + { + "epoch": 0.03289407874994077, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6204, + "step": 1562 + }, + { + "epoch": 0.0329151376992045, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5967, + "step": 1563 + }, + { + "epoch": 0.032936196648468224, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 1564 + }, + { + "epoch": 0.03295725559773195, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6286, + "step": 1565 + }, + { + "epoch": 0.03297831454699568, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 1566 + }, + { + "epoch": 0.03299937349625941, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6567, + "step": 1567 + }, + { + "epoch": 0.03302043244552313, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6112, + "step": 1568 + }, + { + "epoch": 0.033041491394786855, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5737, + "step": 1569 + }, + { + "epoch": 0.03306255034405058, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 1570 + }, + { + "epoch": 0.03308360929331431, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.6129, + "step": 1571 + }, + { + "epoch": 0.03310466824257804, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5904, + "step": 1572 + }, + { + "epoch": 0.033125727191841765, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6233, + "step": 1573 + }, + { + "epoch": 0.03314678614110549, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 1574 + }, + { + "epoch": 0.03316784509036921, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 1575 + }, + { + "epoch": 0.03318890403963294, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 1576 + }, + { + "epoch": 0.03320996298889667, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6259, + "step": 1577 + }, + { + "epoch": 0.033231021938160396, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 1578 + }, + { + "epoch": 0.033252080887424124, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6033, + "step": 1579 + }, + { + "epoch": 0.03327313983668785, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5902, + "step": 1580 + }, + { + "epoch": 0.03329419878595157, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6249, + "step": 1581 + }, + { + "epoch": 0.0333152577352153, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6291, + "step": 1582 + }, + { + "epoch": 0.03333631668447903, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 1583 + }, + { + "epoch": 0.033357375633742754, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5868, + "step": 1584 + }, + { + "epoch": 0.03337843458300648, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5746, + "step": 1585 + }, + { + "epoch": 0.03339949353227021, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6275, + "step": 1586 + }, + { + "epoch": 0.03342055248153394, + "grad_norm": 0.12890625, + "learning_rate": 0.0002, + "loss": 1.5987, + "step": 1587 + }, + { + "epoch": 0.03344161143079766, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6521, + "step": 1588 + }, + { + "epoch": 0.033462670380061385, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 1589 + }, + { + "epoch": 0.03348372932932511, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6361, + "step": 1590 + }, + { + "epoch": 0.03350478827858884, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6203, + "step": 1591 + }, + { + "epoch": 0.03352584722785257, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5911, + "step": 1592 + }, + { + "epoch": 0.033546906177116295, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 1.629, + "step": 1593 + }, + { + "epoch": 0.03356796512638002, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6434, + "step": 1594 + }, + { + "epoch": 0.03358902407564374, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6314, + "step": 1595 + }, + { + "epoch": 0.03361008302490747, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6035, + "step": 1596 + }, + { + "epoch": 0.0336311419741712, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6554, + "step": 1597 + }, + { + "epoch": 0.033652200923434926, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 1598 + }, + { + "epoch": 0.03367325987269865, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 1599 + }, + { + "epoch": 0.03369431882196238, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 1600 + }, + { + "epoch": 0.0337153777712261, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 1601 + }, + { + "epoch": 0.03373643672048983, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 1602 + }, + { + "epoch": 0.03375749566975356, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 1603 + }, + { + "epoch": 0.033778554619017284, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6192, + "step": 1604 + }, + { + "epoch": 0.03379961356828101, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6178, + "step": 1605 + }, + { + "epoch": 0.03382067251754474, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6154, + "step": 1606 + }, + { + "epoch": 0.03384173146680847, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6021, + "step": 1607 + }, + { + "epoch": 0.03386279041607219, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 1608 + }, + { + "epoch": 0.033883849365335915, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 1609 + }, + { + "epoch": 0.03390490831459964, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 1610 + }, + { + "epoch": 0.03392596726386337, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6262, + "step": 1611 + }, + { + "epoch": 0.0339470262131271, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6007, + "step": 1612 + }, + { + "epoch": 0.033968085162390825, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5836, + "step": 1613 + }, + { + "epoch": 0.033989144111654546, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 1614 + }, + { + "epoch": 0.03401020306091827, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.614, + "step": 1615 + }, + { + "epoch": 0.034031262010182, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6194, + "step": 1616 + }, + { + "epoch": 0.03405232095944573, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 1617 + }, + { + "epoch": 0.034073379908709456, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6081, + "step": 1618 + }, + { + "epoch": 0.03409443885797318, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 1619 + }, + { + "epoch": 0.03411549780723691, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 1620 + }, + { + "epoch": 0.03413655675650063, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6266, + "step": 1621 + }, + { + "epoch": 0.03415761570576436, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 1622 + }, + { + "epoch": 0.034178674655028086, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 1623 + }, + { + "epoch": 0.034199733604291814, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6127, + "step": 1624 + }, + { + "epoch": 0.03422079255355554, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6169, + "step": 1625 + }, + { + "epoch": 0.03424185150281927, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6155, + "step": 1626 + }, + { + "epoch": 0.034262910452083, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5958, + "step": 1627 + }, + { + "epoch": 0.03428396940134672, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6116, + "step": 1628 + }, + { + "epoch": 0.034305028350610445, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.628, + "step": 1629 + }, + { + "epoch": 0.03432608729987417, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 1630 + }, + { + "epoch": 0.0343471462491379, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5662, + "step": 1631 + }, + { + "epoch": 0.03436820519840163, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5872, + "step": 1632 + }, + { + "epoch": 0.034389264147665355, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6273, + "step": 1633 + }, + { + "epoch": 0.034410323096929075, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5463, + "step": 1634 + }, + { + "epoch": 0.0344313820461928, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.605, + "step": 1635 + }, + { + "epoch": 0.03445244099545653, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6065, + "step": 1636 + }, + { + "epoch": 0.03447349994472026, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6039, + "step": 1637 + }, + { + "epoch": 0.034494558893983986, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 1638 + }, + { + "epoch": 0.03451561784324771, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6061, + "step": 1639 + }, + { + "epoch": 0.03453667679251144, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6303, + "step": 1640 + }, + { + "epoch": 0.03455773574177516, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6254, + "step": 1641 + }, + { + "epoch": 0.03457879469103889, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 1642 + }, + { + "epoch": 0.034599853640302616, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6324, + "step": 1643 + }, + { + "epoch": 0.034620912589566344, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6107, + "step": 1644 + }, + { + "epoch": 0.03464197153883007, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6329, + "step": 1645 + }, + { + "epoch": 0.0346630304880938, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5787, + "step": 1646 + }, + { + "epoch": 0.034684089437357526, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6358, + "step": 1647 + }, + { + "epoch": 0.03470514838662125, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5805, + "step": 1648 + }, + { + "epoch": 0.034726207335884975, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 1649 + }, + { + "epoch": 0.0347472662851487, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 1.6247, + "step": 1650 + }, + { + "epoch": 0.03476832523441243, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5739, + "step": 1651 + }, + { + "epoch": 0.03478938418367616, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.611, + "step": 1652 + }, + { + "epoch": 0.034810443132939885, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5943, + "step": 1653 + }, + { + "epoch": 0.034831502082203605, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6355, + "step": 1654 + }, + { + "epoch": 0.03485256103146733, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6046, + "step": 1655 + }, + { + "epoch": 0.03487361998073106, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 1656 + }, + { + "epoch": 0.03489467892999479, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 1657 + }, + { + "epoch": 0.034915737879258515, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6051, + "step": 1658 + }, + { + "epoch": 0.03493679682852224, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5824, + "step": 1659 + }, + { + "epoch": 0.03495785577778597, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6045, + "step": 1660 + }, + { + "epoch": 0.03497891472704969, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 1661 + }, + { + "epoch": 0.03499997367631342, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6329, + "step": 1662 + }, + { + "epoch": 0.035021032625577146, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6073, + "step": 1663 + }, + { + "epoch": 0.035042091574840874, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.664, + "step": 1664 + }, + { + "epoch": 0.0350631505241046, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6037, + "step": 1665 + }, + { + "epoch": 0.03508420947336833, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5835, + "step": 1666 + }, + { + "epoch": 0.03510526842263205, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 1667 + }, + { + "epoch": 0.03512632737189578, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6209, + "step": 1668 + }, + { + "epoch": 0.035147386321159504, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5672, + "step": 1669 + }, + { + "epoch": 0.03516844527042323, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6377, + "step": 1670 + }, + { + "epoch": 0.03518950421968696, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 1671 + }, + { + "epoch": 0.03521056316895069, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6655, + "step": 1672 + }, + { + "epoch": 0.035231622118214415, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 1673 + }, + { + "epoch": 0.035252681067478135, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6174, + "step": 1674 + }, + { + "epoch": 0.03527374001674186, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 1675 + }, + { + "epoch": 0.03529479896600559, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 1676 + }, + { + "epoch": 0.03531585791526932, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 1677 + }, + { + "epoch": 0.035336916864533045, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.592, + "step": 1678 + }, + { + "epoch": 0.03535797581379677, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5799, + "step": 1679 + }, + { + "epoch": 0.0353790347630605, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 1680 + }, + { + "epoch": 0.03540009371232422, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5978, + "step": 1681 + }, + { + "epoch": 0.03542115266158795, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.575, + "step": 1682 + }, + { + "epoch": 0.035442211610851676, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 1683 + }, + { + "epoch": 0.035463270560115404, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6042, + "step": 1684 + }, + { + "epoch": 0.03548432950937913, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6263, + "step": 1685 + }, + { + "epoch": 0.03550538845864286, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6079, + "step": 1686 + }, + { + "epoch": 0.03552644740790658, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5731, + "step": 1687 + }, + { + "epoch": 0.03554750635717031, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.5906, + "step": 1688 + }, + { + "epoch": 0.035568565306434034, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5599, + "step": 1689 + }, + { + "epoch": 0.03558962425569776, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6101, + "step": 1690 + }, + { + "epoch": 0.03561068320496149, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5947, + "step": 1691 + }, + { + "epoch": 0.03563174215422522, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 1.5805, + "step": 1692 + }, + { + "epoch": 0.035652801103488944, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6101, + "step": 1693 + }, + { + "epoch": 0.035673860052752665, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 1694 + }, + { + "epoch": 0.03569491900201639, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 1695 + }, + { + "epoch": 0.03571597795128012, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5894, + "step": 1696 + }, + { + "epoch": 0.03573703690054385, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5782, + "step": 1697 + }, + { + "epoch": 0.035758095849807575, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.6318, + "step": 1698 + }, + { + "epoch": 0.0357791547990713, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5965, + "step": 1699 + }, + { + "epoch": 0.03580021374833503, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5629, + "step": 1700 + }, + { + "epoch": 0.03582127269759875, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5906, + "step": 1701 + }, + { + "epoch": 0.03584233164686248, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 1702 + }, + { + "epoch": 0.035863390596126206, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 1703 + }, + { + "epoch": 0.03588444954538993, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 1704 + }, + { + "epoch": 0.03590550849465366, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 1705 + }, + { + "epoch": 0.03592656744391739, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6165, + "step": 1706 + }, + { + "epoch": 0.03594762639318111, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6494, + "step": 1707 + }, + { + "epoch": 0.03596868534244484, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5864, + "step": 1708 + }, + { + "epoch": 0.035989744291708564, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 1709 + }, + { + "epoch": 0.03601080324097229, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5995, + "step": 1710 + }, + { + "epoch": 0.03603186219023602, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 1711 + }, + { + "epoch": 0.03605292113949975, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6147, + "step": 1712 + }, + { + "epoch": 0.036073980088763474, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6155, + "step": 1713 + }, + { + "epoch": 0.036095039038027195, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6134, + "step": 1714 + }, + { + "epoch": 0.03611609798729092, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5811, + "step": 1715 + }, + { + "epoch": 0.03613715693655465, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 1716 + }, + { + "epoch": 0.03615821588581838, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6356, + "step": 1717 + }, + { + "epoch": 0.036179274835082105, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 1718 + }, + { + "epoch": 0.03620033378434583, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6024, + "step": 1719 + }, + { + "epoch": 0.03622139273360955, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6354, + "step": 1720 + }, + { + "epoch": 0.03624245168287328, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 1721 + }, + { + "epoch": 0.03626351063213701, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5913, + "step": 1722 + }, + { + "epoch": 0.036284569581400736, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5904, + "step": 1723 + }, + { + "epoch": 0.03630562853066446, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6041, + "step": 1724 + }, + { + "epoch": 0.03632668747992819, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 1725 + }, + { + "epoch": 0.03634774642919192, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 1726 + }, + { + "epoch": 0.03636880537845564, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6136, + "step": 1727 + }, + { + "epoch": 0.036389864327719366, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 1728 + }, + { + "epoch": 0.036410923276983094, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5759, + "step": 1729 + }, + { + "epoch": 0.03643198222624682, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.592, + "step": 1730 + }, + { + "epoch": 0.03645304117551055, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5542, + "step": 1731 + }, + { + "epoch": 0.03647410012477428, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6252, + "step": 1732 + }, + { + "epoch": 0.036495159074038004, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 1733 + }, + { + "epoch": 0.036516218023301725, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 1734 + }, + { + "epoch": 0.03653727697256545, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 1735 + }, + { + "epoch": 0.03655833592182918, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 1736 + }, + { + "epoch": 0.03657939487109291, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.613, + "step": 1737 + }, + { + "epoch": 0.036600453820356635, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5816, + "step": 1738 + }, + { + "epoch": 0.03662151276962036, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 1739 + }, + { + "epoch": 0.03664257171888408, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6547, + "step": 1740 + }, + { + "epoch": 0.03666363066814781, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6218, + "step": 1741 + }, + { + "epoch": 0.03668468961741154, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6065, + "step": 1742 + }, + { + "epoch": 0.036705748566675266, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6045, + "step": 1743 + }, + { + "epoch": 0.03672680751593899, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.5764, + "step": 1744 + }, + { + "epoch": 0.03674786646520272, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6176, + "step": 1745 + }, + { + "epoch": 0.03676892541446645, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6423, + "step": 1746 + }, + { + "epoch": 0.03678998436373017, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6186, + "step": 1747 + }, + { + "epoch": 0.036811043312993896, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6053, + "step": 1748 + }, + { + "epoch": 0.036832102262257624, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6318, + "step": 1749 + }, + { + "epoch": 0.03685316121152135, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6391, + "step": 1750 + }, + { + "epoch": 0.03687422016078508, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 1751 + }, + { + "epoch": 0.036895279110048806, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6101, + "step": 1752 + }, + { + "epoch": 0.036916338059312534, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6168, + "step": 1753 + }, + { + "epoch": 0.036937397008576255, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 1754 + }, + { + "epoch": 0.03695845595783998, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5838, + "step": 1755 + }, + { + "epoch": 0.03697951490710371, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5844, + "step": 1756 + }, + { + "epoch": 0.03700057385636744, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 1757 + }, + { + "epoch": 0.037021632805631165, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6227, + "step": 1758 + }, + { + "epoch": 0.03704269175489489, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6242, + "step": 1759 + }, + { + "epoch": 0.03706375070415861, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.5955, + "step": 1760 + }, + { + "epoch": 0.03708480965342234, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6174, + "step": 1761 + }, + { + "epoch": 0.03710586860268607, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.6206, + "step": 1762 + }, + { + "epoch": 0.037126927551949795, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 1763 + }, + { + "epoch": 0.03714798650121352, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 1764 + }, + { + "epoch": 0.03716904545047725, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5848, + "step": 1765 + }, + { + "epoch": 0.03719010439974098, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6277, + "step": 1766 + }, + { + "epoch": 0.0372111633490047, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6189, + "step": 1767 + }, + { + "epoch": 0.037232222298268426, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5865, + "step": 1768 + }, + { + "epoch": 0.037253281247532154, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.6111, + "step": 1769 + }, + { + "epoch": 0.03727434019679588, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 1770 + }, + { + "epoch": 0.03729539914605961, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6287, + "step": 1771 + }, + { + "epoch": 0.037316458095323336, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5966, + "step": 1772 + }, + { + "epoch": 0.03733751704458706, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6174, + "step": 1773 + }, + { + "epoch": 0.037358575993850784, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 1774 + }, + { + "epoch": 0.03737963494311451, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6294, + "step": 1775 + }, + { + "epoch": 0.03740069389237824, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5896, + "step": 1776 + }, + { + "epoch": 0.03742175284164197, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6171, + "step": 1777 + }, + { + "epoch": 0.037442811790905695, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5954, + "step": 1778 + }, + { + "epoch": 0.03746387074016942, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6256, + "step": 1779 + }, + { + "epoch": 0.03748492968943314, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5872, + "step": 1780 + }, + { + "epoch": 0.03750598863869687, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5856, + "step": 1781 + }, + { + "epoch": 0.0375270475879606, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5942, + "step": 1782 + }, + { + "epoch": 0.037548106537224325, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 1783 + }, + { + "epoch": 0.03756916548648805, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.5898, + "step": 1784 + }, + { + "epoch": 0.03759022443575178, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6136, + "step": 1785 + }, + { + "epoch": 0.03761128338501551, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 1786 + }, + { + "epoch": 0.03763234233427923, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6042, + "step": 1787 + }, + { + "epoch": 0.037653401283542956, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5865, + "step": 1788 + }, + { + "epoch": 0.037674460232806684, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 1789 + }, + { + "epoch": 0.03769551918207041, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5975, + "step": 1790 + }, + { + "epoch": 0.03771657813133414, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6065, + "step": 1791 + }, + { + "epoch": 0.037737637080597866, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 1792 + }, + { + "epoch": 0.03775869602986159, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5896, + "step": 1793 + }, + { + "epoch": 0.037779754979125314, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 1794 + }, + { + "epoch": 0.03780081392838904, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5722, + "step": 1795 + }, + { + "epoch": 0.03782187287765277, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 1796 + }, + { + "epoch": 0.0378429318269165, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.595, + "step": 1797 + }, + { + "epoch": 0.037863990776180224, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 1798 + }, + { + "epoch": 0.03788504972544395, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6041, + "step": 1799 + }, + { + "epoch": 0.03790610867470767, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6147, + "step": 1800 + }, + { + "epoch": 0.0379271676239714, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5663, + "step": 1801 + }, + { + "epoch": 0.03794822657323513, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 1802 + }, + { + "epoch": 0.037969285522498855, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5665, + "step": 1803 + }, + { + "epoch": 0.03799034447176258, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6276, + "step": 1804 + }, + { + "epoch": 0.03801140342102631, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6161, + "step": 1805 + }, + { + "epoch": 0.03803246237029004, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6328, + "step": 1806 + }, + { + "epoch": 0.03805352131955376, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6246, + "step": 1807 + }, + { + "epoch": 0.038074580268817486, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 1808 + }, + { + "epoch": 0.03809563921808121, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 1809 + }, + { + "epoch": 0.03811669816734494, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6203, + "step": 1810 + }, + { + "epoch": 0.03813775711660867, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6118, + "step": 1811 + }, + { + "epoch": 0.038158816065872396, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 1812 + }, + { + "epoch": 0.03817987501513612, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 1813 + }, + { + "epoch": 0.038200933964399844, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6181, + "step": 1814 + }, + { + "epoch": 0.03822199291366357, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 1815 + }, + { + "epoch": 0.0382430518629273, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 1816 + }, + { + "epoch": 0.03826411081219103, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 1817 + }, + { + "epoch": 0.038285169761454754, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6372, + "step": 1818 + }, + { + "epoch": 0.03830622871071848, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6259, + "step": 1819 + }, + { + "epoch": 0.0383272876599822, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 1820 + }, + { + "epoch": 0.03834834660924593, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6278, + "step": 1821 + }, + { + "epoch": 0.03836940555850966, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5952, + "step": 1822 + }, + { + "epoch": 0.038390464507773385, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5975, + "step": 1823 + }, + { + "epoch": 0.03841152345703711, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5658, + "step": 1824 + }, + { + "epoch": 0.03843258240630084, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6069, + "step": 1825 + }, + { + "epoch": 0.03845364135556456, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6112, + "step": 1826 + }, + { + "epoch": 0.03847470030482829, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6003, + "step": 1827 + }, + { + "epoch": 0.038495759254092016, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 1828 + }, + { + "epoch": 0.03851681820335574, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5912, + "step": 1829 + }, + { + "epoch": 0.03853787715261947, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 1830 + }, + { + "epoch": 0.0385589361018832, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 1831 + }, + { + "epoch": 0.038579995051146926, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6217, + "step": 1832 + }, + { + "epoch": 0.038601054000410646, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.574, + "step": 1833 + }, + { + "epoch": 0.038622112949674374, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 1834 + }, + { + "epoch": 0.0386431718989381, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5787, + "step": 1835 + }, + { + "epoch": 0.03866423084820183, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 1836 + }, + { + "epoch": 0.03868528979746556, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5771, + "step": 1837 + }, + { + "epoch": 0.038706348746729284, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5815, + "step": 1838 + }, + { + "epoch": 0.03872740769599301, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5942, + "step": 1839 + }, + { + "epoch": 0.03874846664525673, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 1840 + }, + { + "epoch": 0.03876952559452046, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 1841 + }, + { + "epoch": 0.03879058454378419, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 1842 + }, + { + "epoch": 0.038811643493047915, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.582, + "step": 1843 + }, + { + "epoch": 0.03883270244231164, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5934, + "step": 1844 + }, + { + "epoch": 0.03885376139157537, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 1845 + }, + { + "epoch": 0.03887482034083909, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 1846 + }, + { + "epoch": 0.03889587929010282, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5832, + "step": 1847 + }, + { + "epoch": 0.038916938239366546, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 1848 + }, + { + "epoch": 0.03893799718863027, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5599, + "step": 1849 + }, + { + "epoch": 0.038959056137894, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5752, + "step": 1850 + }, + { + "epoch": 0.03898011508715773, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 1851 + }, + { + "epoch": 0.039001174036421456, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6291, + "step": 1852 + }, + { + "epoch": 0.039022232985685176, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 1853 + }, + { + "epoch": 0.039043291934948904, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 1854 + }, + { + "epoch": 0.03906435088421263, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6028, + "step": 1855 + }, + { + "epoch": 0.03908540983347636, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6309, + "step": 1856 + }, + { + "epoch": 0.039106468782740086, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5804, + "step": 1857 + }, + { + "epoch": 0.039127527732003814, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6422, + "step": 1858 + }, + { + "epoch": 0.03914858668126754, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 1859 + }, + { + "epoch": 0.03916964563053126, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5876, + "step": 1860 + }, + { + "epoch": 0.03919070457979499, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 1861 + }, + { + "epoch": 0.03921176352905872, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5936, + "step": 1862 + }, + { + "epoch": 0.039232822478322445, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 1863 + }, + { + "epoch": 0.03925388142758617, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 1.5882, + "step": 1864 + }, + { + "epoch": 0.0392749403768499, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 1865 + }, + { + "epoch": 0.03929599932611362, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 1866 + }, + { + "epoch": 0.03931705827537735, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 1867 + }, + { + "epoch": 0.039338117224641075, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5818, + "step": 1868 + }, + { + "epoch": 0.0393591761739048, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 1869 + }, + { + "epoch": 0.03938023512316853, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.58, + "step": 1870 + }, + { + "epoch": 0.03940129407243226, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 1871 + }, + { + "epoch": 0.039422353021695986, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 1872 + }, + { + "epoch": 0.039443411970959706, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5995, + "step": 1873 + }, + { + "epoch": 0.039464470920223434, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6201, + "step": 1874 + }, + { + "epoch": 0.03948552986948716, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6056, + "step": 1875 + }, + { + "epoch": 0.03950658881875089, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 1876 + }, + { + "epoch": 0.039527647768014616, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6122, + "step": 1877 + }, + { + "epoch": 0.039548706717278344, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5896, + "step": 1878 + }, + { + "epoch": 0.03956976566654207, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6212, + "step": 1879 + }, + { + "epoch": 0.03959082461580579, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 1880 + }, + { + "epoch": 0.03961188356506952, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 1881 + }, + { + "epoch": 0.03963294251433325, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6168, + "step": 1882 + }, + { + "epoch": 0.039654001463596975, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 1883 + }, + { + "epoch": 0.0396750604128607, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 1884 + }, + { + "epoch": 0.03969611936212443, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5667, + "step": 1885 + }, + { + "epoch": 0.03971717831138815, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6094, + "step": 1886 + }, + { + "epoch": 0.03973823726065188, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6117, + "step": 1887 + }, + { + "epoch": 0.039759296209915605, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5944, + "step": 1888 + }, + { + "epoch": 0.03978035515917933, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6253, + "step": 1889 + }, + { + "epoch": 0.03980141410844306, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 1.6042, + "step": 1890 + }, + { + "epoch": 0.03982247305770679, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 1891 + }, + { + "epoch": 0.039843532006970515, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 1892 + }, + { + "epoch": 0.039864590956234236, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5838, + "step": 1893 + }, + { + "epoch": 0.039885649905497964, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6039, + "step": 1894 + }, + { + "epoch": 0.03990670885476169, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6171, + "step": 1895 + }, + { + "epoch": 0.03992776780402542, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 1896 + }, + { + "epoch": 0.039948826753289146, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6371, + "step": 1897 + }, + { + "epoch": 0.039969885702552874, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 1898 + }, + { + "epoch": 0.039990944651816594, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5916, + "step": 1899 + }, + { + "epoch": 0.04001200360108032, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5966, + "step": 1900 + }, + { + "epoch": 0.04003306255034405, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 1901 + }, + { + "epoch": 0.04005412149960778, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6112, + "step": 1902 + }, + { + "epoch": 0.040075180448871504, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5854, + "step": 1903 + }, + { + "epoch": 0.04009623939813523, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5728, + "step": 1904 + }, + { + "epoch": 0.04011729834739896, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6113, + "step": 1905 + }, + { + "epoch": 0.04013835729666268, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.6155, + "step": 1906 + }, + { + "epoch": 0.04015941624592641, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 1907 + }, + { + "epoch": 0.040180475195190135, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.5607, + "step": 1908 + }, + { + "epoch": 0.04020153414445386, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5974, + "step": 1909 + }, + { + "epoch": 0.04022259309371759, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5876, + "step": 1910 + }, + { + "epoch": 0.04024365204298132, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 1911 + }, + { + "epoch": 0.040264710992245045, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6007, + "step": 1912 + }, + { + "epoch": 0.040285769941508766, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.593, + "step": 1913 + }, + { + "epoch": 0.04030682889077249, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6258, + "step": 1914 + }, + { + "epoch": 0.04032788784003622, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5898, + "step": 1915 + }, + { + "epoch": 0.04034894678929995, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 1916 + }, + { + "epoch": 0.040370005738563676, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6196, + "step": 1917 + }, + { + "epoch": 0.040391064687827404, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6182, + "step": 1918 + }, + { + "epoch": 0.040412123637091124, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6237, + "step": 1919 + }, + { + "epoch": 0.04043318258635485, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6046, + "step": 1920 + }, + { + "epoch": 0.04045424153561858, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.565, + "step": 1921 + }, + { + "epoch": 0.04047530048488231, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5871, + "step": 1922 + }, + { + "epoch": 0.040496359434146034, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 1923 + }, + { + "epoch": 0.04051741838340976, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5711, + "step": 1924 + }, + { + "epoch": 0.04053847733267349, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6167, + "step": 1925 + }, + { + "epoch": 0.04055953628193721, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.5939, + "step": 1926 + }, + { + "epoch": 0.04058059523120094, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5911, + "step": 1927 + }, + { + "epoch": 0.040601654180464665, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6079, + "step": 1928 + }, + { + "epoch": 0.04062271312972839, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5844, + "step": 1929 + }, + { + "epoch": 0.04064377207899212, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6206, + "step": 1930 + }, + { + "epoch": 0.04066483102825585, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5936, + "step": 1931 + }, + { + "epoch": 0.040685889977519575, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5745, + "step": 1932 + }, + { + "epoch": 0.040706948926783296, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5739, + "step": 1933 + }, + { + "epoch": 0.04072800787604702, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5967, + "step": 1934 + }, + { + "epoch": 0.04074906682531075, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5997, + "step": 1935 + }, + { + "epoch": 0.04077012577457448, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.576, + "step": 1936 + }, + { + "epoch": 0.040791184723838206, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 1937 + }, + { + "epoch": 0.04081224367310193, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 1938 + }, + { + "epoch": 0.040833302622365654, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5872, + "step": 1939 + }, + { + "epoch": 0.04085436157162938, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5733, + "step": 1940 + }, + { + "epoch": 0.04087542052089311, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6116, + "step": 1941 + }, + { + "epoch": 0.04089647947015684, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 1942 + }, + { + "epoch": 0.040917538419420564, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 1943 + }, + { + "epoch": 0.04093859736868429, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6016, + "step": 1944 + }, + { + "epoch": 0.04095965631794802, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 1945 + }, + { + "epoch": 0.04098071526721174, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5799, + "step": 1946 + }, + { + "epoch": 0.04100177421647547, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5829, + "step": 1947 + }, + { + "epoch": 0.041022833165739195, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6216, + "step": 1948 + }, + { + "epoch": 0.04104389211500292, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6079, + "step": 1949 + }, + { + "epoch": 0.04106495106426665, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5933, + "step": 1950 + }, + { + "epoch": 0.04108601001353038, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5821, + "step": 1951 + }, + { + "epoch": 0.0411070689627941, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 1952 + }, + { + "epoch": 0.041128127912057826, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5867, + "step": 1953 + }, + { + "epoch": 0.04114918686132155, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 1954 + }, + { + "epoch": 0.04117024581058528, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6158, + "step": 1955 + }, + { + "epoch": 0.04119130475984901, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5998, + "step": 1956 + }, + { + "epoch": 0.041212363709112736, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 1957 + }, + { + "epoch": 0.04123342265837646, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6213, + "step": 1958 + }, + { + "epoch": 0.041254481607640184, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5695, + "step": 1959 + }, + { + "epoch": 0.04127554055690391, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 1960 + }, + { + "epoch": 0.04129659950616764, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 1961 + }, + { + "epoch": 0.041317658455431366, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 1962 + }, + { + "epoch": 0.041338717404695094, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6034, + "step": 1963 + }, + { + "epoch": 0.04135977635395882, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5724, + "step": 1964 + }, + { + "epoch": 0.04138083530322255, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 1965 + }, + { + "epoch": 0.04140189425248627, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5908, + "step": 1966 + }, + { + "epoch": 0.04142295320175, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6162, + "step": 1967 + }, + { + "epoch": 0.041444012151013725, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6222, + "step": 1968 + }, + { + "epoch": 0.04146507110027745, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6013, + "step": 1969 + }, + { + "epoch": 0.04148613004954118, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5867, + "step": 1970 + }, + { + "epoch": 0.04150718899880491, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6077, + "step": 1971 + }, + { + "epoch": 0.04152824794806863, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 1972 + }, + { + "epoch": 0.041549306897332355, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6137, + "step": 1973 + }, + { + "epoch": 0.04157036584659608, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6051, + "step": 1974 + }, + { + "epoch": 0.04159142479585981, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 1975 + }, + { + "epoch": 0.04161248374512354, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6153, + "step": 1976 + }, + { + "epoch": 0.041633542694387266, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5798, + "step": 1977 + }, + { + "epoch": 0.04165460164365099, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 1978 + }, + { + "epoch": 0.041675660592914714, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5493, + "step": 1979 + }, + { + "epoch": 0.04169671954217844, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5896, + "step": 1980 + }, + { + "epoch": 0.04171777849144217, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5841, + "step": 1981 + }, + { + "epoch": 0.041738837440705896, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5943, + "step": 1982 + }, + { + "epoch": 0.041759896389969624, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 1983 + }, + { + "epoch": 0.04178095533923335, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 1984 + }, + { + "epoch": 0.04180201428849708, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6086, + "step": 1985 + }, + { + "epoch": 0.0418230732377608, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 1986 + }, + { + "epoch": 0.04184413218702453, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5919, + "step": 1987 + }, + { + "epoch": 0.041865191136288255, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.601, + "step": 1988 + }, + { + "epoch": 0.04188625008555198, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 1989 + }, + { + "epoch": 0.04190730903481571, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5902, + "step": 1990 + }, + { + "epoch": 0.04192836798407944, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.6053, + "step": 1991 + }, + { + "epoch": 0.04194942693334316, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5728, + "step": 1992 + }, + { + "epoch": 0.041970485882606885, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5954, + "step": 1993 + }, + { + "epoch": 0.04199154483187061, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5832, + "step": 1994 + }, + { + "epoch": 0.04201260378113434, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5696, + "step": 1995 + }, + { + "epoch": 0.04203366273039807, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6157, + "step": 1996 + }, + { + "epoch": 0.042054721679661795, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5868, + "step": 1997 + }, + { + "epoch": 0.04207578062892552, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 1998 + }, + { + "epoch": 0.042096839578189243, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 1999 + }, + { + "epoch": 0.04211789852745297, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5743, + "step": 2000 + }, + { + "epoch": 0.04211789852745297, + "eval_loss": 1.7890796661376953, + "eval_runtime": 898.0231, + "eval_samples_per_second": 68.818, + "eval_steps_per_second": 2.151, + "step": 2000 + }, + { + "epoch": 0.0421389574767167, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5621, + "step": 2001 + }, + { + "epoch": 0.042160016425980426, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 2002 + }, + { + "epoch": 0.042181075375244154, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6679, + "step": 2003 + }, + { + "epoch": 0.04220213432450788, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5733, + "step": 2004 + }, + { + "epoch": 0.0422231932737716, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 2005 + }, + { + "epoch": 0.04224425222303533, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5922, + "step": 2006 + }, + { + "epoch": 0.04226531117229906, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6025, + "step": 2007 + }, + { + "epoch": 0.042286370121562784, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5753, + "step": 2008 + }, + { + "epoch": 0.04230742907082651, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.574, + "step": 2009 + }, + { + "epoch": 0.04232848802009024, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 2010 + }, + { + "epoch": 0.04234954696935397, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5711, + "step": 2011 + }, + { + "epoch": 0.04237060591861769, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5817, + "step": 2012 + }, + { + "epoch": 0.042391664867881415, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.5955, + "step": 2013 + }, + { + "epoch": 0.04241272381714514, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.59, + "step": 2014 + }, + { + "epoch": 0.04243378276640887, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6189, + "step": 2015 + }, + { + "epoch": 0.0424548417156726, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5851, + "step": 2016 + }, + { + "epoch": 0.042475900664936325, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 2017 + }, + { + "epoch": 0.04249695961420005, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6306, + "step": 2018 + }, + { + "epoch": 0.04251801856346377, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5685, + "step": 2019 + }, + { + "epoch": 0.0425390775127275, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6, + "step": 2020 + }, + { + "epoch": 0.04256013646199123, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 2021 + }, + { + "epoch": 0.042581195411254956, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6069, + "step": 2022 + }, + { + "epoch": 0.042602254360518683, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5855, + "step": 2023 + }, + { + "epoch": 0.04262331330978241, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6244, + "step": 2024 + }, + { + "epoch": 0.04264437225904613, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.611, + "step": 2025 + }, + { + "epoch": 0.04266543120830986, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5772, + "step": 2026 + }, + { + "epoch": 0.04268649015757359, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5933, + "step": 2027 + }, + { + "epoch": 0.042707549106837314, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5734, + "step": 2028 + }, + { + "epoch": 0.04272860805610104, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5894, + "step": 2029 + }, + { + "epoch": 0.04274966700536477, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5752, + "step": 2030 + }, + { + "epoch": 0.0427707259546285, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6598, + "step": 2031 + }, + { + "epoch": 0.04279178490389222, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.5786, + "step": 2032 + }, + { + "epoch": 0.042812843853155945, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5795, + "step": 2033 + }, + { + "epoch": 0.04283390280241967, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6147, + "step": 2034 + }, + { + "epoch": 0.0428549617516834, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6133, + "step": 2035 + }, + { + "epoch": 0.04287602070094713, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5375, + "step": 2036 + }, + { + "epoch": 0.042897079650210855, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 2037 + }, + { + "epoch": 0.04291813859947458, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 1.5774, + "step": 2038 + }, + { + "epoch": 0.0429391975487383, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.626, + "step": 2039 + }, + { + "epoch": 0.04296025649800203, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.595, + "step": 2040 + }, + { + "epoch": 0.04298131544726576, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5977, + "step": 2041 + }, + { + "epoch": 0.043002374396529486, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 2042 + }, + { + "epoch": 0.04302343334579321, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6905, + "step": 2043 + }, + { + "epoch": 0.04304449229505694, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5851, + "step": 2044 + }, + { + "epoch": 0.04306555124432066, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6042, + "step": 2045 + }, + { + "epoch": 0.04308661019358439, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5979, + "step": 2046 + }, + { + "epoch": 0.043107669142848117, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 2047 + }, + { + "epoch": 0.043128728092111844, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6312, + "step": 2048 + }, + { + "epoch": 0.04314978704137557, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.561, + "step": 2049 + }, + { + "epoch": 0.0431708459906393, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5861, + "step": 2050 + }, + { + "epoch": 0.04319190493990303, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5792, + "step": 2051 + }, + { + "epoch": 0.04321296388916675, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5846, + "step": 2052 + }, + { + "epoch": 0.043234022838430475, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5737, + "step": 2053 + }, + { + "epoch": 0.0432550817876942, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6068, + "step": 2054 + }, + { + "epoch": 0.04327614073695793, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5881, + "step": 2055 + }, + { + "epoch": 0.04329719968622166, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6127, + "step": 2056 + }, + { + "epoch": 0.043318258635485385, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5723, + "step": 2057 + }, + { + "epoch": 0.043339317584749106, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6329, + "step": 2058 + }, + { + "epoch": 0.04336037653401283, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 2059 + }, + { + "epoch": 0.04338143548327656, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5703, + "step": 2060 + }, + { + "epoch": 0.04340249443254029, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.552, + "step": 2061 + }, + { + "epoch": 0.043423553381804016, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6157, + "step": 2062 + }, + { + "epoch": 0.04344461233106774, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6175, + "step": 2063 + }, + { + "epoch": 0.04346567128033147, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 1.5873, + "step": 2064 + }, + { + "epoch": 0.04348673022959519, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 2065 + }, + { + "epoch": 0.04350778917885892, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.618, + "step": 2066 + }, + { + "epoch": 0.043528848128122646, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5708, + "step": 2067 + }, + { + "epoch": 0.043549907077386374, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.59, + "step": 2068 + }, + { + "epoch": 0.0435709660266501, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 2069 + }, + { + "epoch": 0.04359202497591383, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5976, + "step": 2070 + }, + { + "epoch": 0.043613083925177556, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5735, + "step": 2071 + }, + { + "epoch": 0.04363414287444128, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 2072 + }, + { + "epoch": 0.043655201823705005, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6065, + "step": 2073 + }, + { + "epoch": 0.04367626077296873, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6232, + "step": 2074 + }, + { + "epoch": 0.04369731972223246, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5797, + "step": 2075 + }, + { + "epoch": 0.04371837867149619, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 2076 + }, + { + "epoch": 0.043739437620759915, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5933, + "step": 2077 + }, + { + "epoch": 0.043760496570023635, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5795, + "step": 2078 + }, + { + "epoch": 0.04378155551928736, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6197, + "step": 2079 + }, + { + "epoch": 0.04380261446855109, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6049, + "step": 2080 + }, + { + "epoch": 0.04382367341781482, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6032, + "step": 2081 + }, + { + "epoch": 0.043844732367078545, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 2082 + }, + { + "epoch": 0.04386579131634227, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6031, + "step": 2083 + }, + { + "epoch": 0.043886850265606, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 2084 + }, + { + "epoch": 0.04390790921486972, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 2085 + }, + { + "epoch": 0.04392896816413345, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5925, + "step": 2086 + }, + { + "epoch": 0.043950027113397176, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5633, + "step": 2087 + }, + { + "epoch": 0.043971086062660904, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 2088 + }, + { + "epoch": 0.04399214501192463, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5983, + "step": 2089 + }, + { + "epoch": 0.04401320396118836, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6131, + "step": 2090 + }, + { + "epoch": 0.044034262910452086, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5873, + "step": 2091 + }, + { + "epoch": 0.04405532185971581, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6104, + "step": 2092 + }, + { + "epoch": 0.044076380808979534, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6049, + "step": 2093 + }, + { + "epoch": 0.04409743975824326, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 2094 + }, + { + "epoch": 0.04411849870750699, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 2095 + }, + { + "epoch": 0.04413955765677072, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5595, + "step": 2096 + }, + { + "epoch": 0.044160616606034445, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5983, + "step": 2097 + }, + { + "epoch": 0.044181675555298165, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5902, + "step": 2098 + }, + { + "epoch": 0.04420273450456189, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5605, + "step": 2099 + }, + { + "epoch": 0.04422379345382562, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6205, + "step": 2100 + }, + { + "epoch": 0.04424485240308935, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5896, + "step": 2101 + }, + { + "epoch": 0.044265911352353075, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5778, + "step": 2102 + }, + { + "epoch": 0.0442869703016168, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5794, + "step": 2103 + }, + { + "epoch": 0.04430802925088053, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5911, + "step": 2104 + }, + { + "epoch": 0.04432908820014425, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5852, + "step": 2105 + }, + { + "epoch": 0.04435014714940798, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5999, + "step": 2106 + }, + { + "epoch": 0.044371206098671706, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.615, + "step": 2107 + }, + { + "epoch": 0.044392265047935434, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.6074, + "step": 2108 + }, + { + "epoch": 0.04441332399719916, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 2109 + }, + { + "epoch": 0.04443438294646289, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.575, + "step": 2110 + }, + { + "epoch": 0.04445544189572661, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6551, + "step": 2111 + }, + { + "epoch": 0.04447650084499034, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6081, + "step": 2112 + }, + { + "epoch": 0.044497559794254064, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6188, + "step": 2113 + }, + { + "epoch": 0.04451861874351779, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 2114 + }, + { + "epoch": 0.04453967769278152, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5805, + "step": 2115 + }, + { + "epoch": 0.04456073664204525, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6216, + "step": 2116 + }, + { + "epoch": 0.044581795591308974, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 2117 + }, + { + "epoch": 0.044602854540572695, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.593, + "step": 2118 + }, + { + "epoch": 0.04462391348983642, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 2119 + }, + { + "epoch": 0.04464497243910015, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 2120 + }, + { + "epoch": 0.04466603138836388, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6215, + "step": 2121 + }, + { + "epoch": 0.044687090337627605, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6516, + "step": 2122 + }, + { + "epoch": 0.04470814928689133, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6005, + "step": 2123 + }, + { + "epoch": 0.04472920823615506, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6134, + "step": 2124 + }, + { + "epoch": 0.04475026718541878, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 2125 + }, + { + "epoch": 0.04477132613468251, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6272, + "step": 2126 + }, + { + "epoch": 0.044792385083946236, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5782, + "step": 2127 + }, + { + "epoch": 0.04481344403320996, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 2128 + }, + { + "epoch": 0.04483450298247369, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5827, + "step": 2129 + }, + { + "epoch": 0.04485556193173742, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 2130 + }, + { + "epoch": 0.04487662088100114, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 2131 + }, + { + "epoch": 0.04489767983026487, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6042, + "step": 2132 + }, + { + "epoch": 0.044918738779528594, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5673, + "step": 2133 + }, + { + "epoch": 0.04493979772879232, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5994, + "step": 2134 + }, + { + "epoch": 0.04496085667805605, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5844, + "step": 2135 + }, + { + "epoch": 0.04498191562731978, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 2136 + }, + { + "epoch": 0.045002974576583504, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 2137 + }, + { + "epoch": 0.045024033525847225, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.603, + "step": 2138 + }, + { + "epoch": 0.04504509247511095, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5966, + "step": 2139 + }, + { + "epoch": 0.04506615142437468, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 2140 + }, + { + "epoch": 0.04508721037363841, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5794, + "step": 2141 + }, + { + "epoch": 0.045108269322902135, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5868, + "step": 2142 + }, + { + "epoch": 0.04512932827216586, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6446, + "step": 2143 + }, + { + "epoch": 0.04515038722142959, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6136, + "step": 2144 + }, + { + "epoch": 0.04517144617069331, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.6259, + "step": 2145 + }, + { + "epoch": 0.04519250511995704, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6116, + "step": 2146 + }, + { + "epoch": 0.045213564069220766, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5756, + "step": 2147 + }, + { + "epoch": 0.04523462301848449, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6134, + "step": 2148 + }, + { + "epoch": 0.04525568196774822, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5883, + "step": 2149 + }, + { + "epoch": 0.04527674091701195, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5584, + "step": 2150 + }, + { + "epoch": 0.04529779986627567, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5829, + "step": 2151 + }, + { + "epoch": 0.045318858815539396, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5994, + "step": 2152 + }, + { + "epoch": 0.045339917764803124, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 2153 + }, + { + "epoch": 0.04536097671406685, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 2154 + }, + { + "epoch": 0.04538203566333058, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6016, + "step": 2155 + }, + { + "epoch": 0.04540309461259431, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5804, + "step": 2156 + }, + { + "epoch": 0.045424153561858034, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5816, + "step": 2157 + }, + { + "epoch": 0.045445212511121755, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6024, + "step": 2158 + }, + { + "epoch": 0.04546627146038548, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5725, + "step": 2159 + }, + { + "epoch": 0.04548733040964921, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5904, + "step": 2160 + }, + { + "epoch": 0.04550838935891294, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5804, + "step": 2161 + }, + { + "epoch": 0.045529448308176665, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5867, + "step": 2162 + }, + { + "epoch": 0.04555050725744039, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 1.6093, + "step": 2163 + }, + { + "epoch": 0.04557156620670412, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5837, + "step": 2164 + }, + { + "epoch": 0.04559262515596784, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 2165 + }, + { + "epoch": 0.04561368410523157, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6057, + "step": 2166 + }, + { + "epoch": 0.045634743054495296, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.559, + "step": 2167 + }, + { + "epoch": 0.04565580200375902, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 2168 + }, + { + "epoch": 0.04567686095302275, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6379, + "step": 2169 + }, + { + "epoch": 0.04569791990228648, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6101, + "step": 2170 + }, + { + "epoch": 0.0457189788515502, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5986, + "step": 2171 + }, + { + "epoch": 0.045740037800813926, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5735, + "step": 2172 + }, + { + "epoch": 0.045761096750077654, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5861, + "step": 2173 + }, + { + "epoch": 0.04578215569934138, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5571, + "step": 2174 + }, + { + "epoch": 0.04580321464860511, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 2175 + }, + { + "epoch": 0.045824273597868836, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5953, + "step": 2176 + }, + { + "epoch": 0.045845332547132564, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6039, + "step": 2177 + }, + { + "epoch": 0.045866391496396285, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6041, + "step": 2178 + }, + { + "epoch": 0.04588745044566001, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6119, + "step": 2179 + }, + { + "epoch": 0.04590850939492374, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5567, + "step": 2180 + }, + { + "epoch": 0.04592956834418747, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5945, + "step": 2181 + }, + { + "epoch": 0.045950627293451195, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5728, + "step": 2182 + }, + { + "epoch": 0.04597168624271492, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6189, + "step": 2183 + }, + { + "epoch": 0.04599274519197864, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6273, + "step": 2184 + }, + { + "epoch": 0.04601380414124237, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 2185 + }, + { + "epoch": 0.0460348630905061, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5727, + "step": 2186 + }, + { + "epoch": 0.046055922039769825, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5695, + "step": 2187 + }, + { + "epoch": 0.04607698098903355, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5949, + "step": 2188 + }, + { + "epoch": 0.04609803993829728, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5592, + "step": 2189 + }, + { + "epoch": 0.04611909888756101, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 1.5942, + "step": 2190 + }, + { + "epoch": 0.04614015783682473, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6061, + "step": 2191 + }, + { + "epoch": 0.046161216786088456, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 1.5963, + "step": 2192 + }, + { + "epoch": 0.046182275735352184, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 2193 + }, + { + "epoch": 0.04620333468461591, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6178, + "step": 2194 + }, + { + "epoch": 0.04622439363387964, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5679, + "step": 2195 + }, + { + "epoch": 0.046245452583143366, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 2196 + }, + { + "epoch": 0.046266511532407094, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 2197 + }, + { + "epoch": 0.046287570481670814, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5911, + "step": 2198 + }, + { + "epoch": 0.04630862943093454, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 2199 + }, + { + "epoch": 0.04632968838019827, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.6175, + "step": 2200 + }, + { + "epoch": 0.046350747329462, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6204, + "step": 2201 + }, + { + "epoch": 0.046371806278725725, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6096, + "step": 2202 + }, + { + "epoch": 0.04639286522798945, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6021, + "step": 2203 + }, + { + "epoch": 0.04641392417725317, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 1.6191, + "step": 2204 + }, + { + "epoch": 0.0464349831265169, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5732, + "step": 2205 + }, + { + "epoch": 0.04645604207578063, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6043, + "step": 2206 + }, + { + "epoch": 0.046477101025044355, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6177, + "step": 2207 + }, + { + "epoch": 0.04649815997430808, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6073, + "step": 2208 + }, + { + "epoch": 0.04651921892357181, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5933, + "step": 2209 + }, + { + "epoch": 0.04654027787283554, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 1.5746, + "step": 2210 + }, + { + "epoch": 0.04656133682209926, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5902, + "step": 2211 + }, + { + "epoch": 0.046582395771362986, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6061, + "step": 2212 + }, + { + "epoch": 0.046603454720626714, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 2213 + }, + { + "epoch": 0.04662451366989044, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6374, + "step": 2214 + }, + { + "epoch": 0.04664557261915417, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5846, + "step": 2215 + }, + { + "epoch": 0.046666631568417896, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5722, + "step": 2216 + }, + { + "epoch": 0.046687690517681624, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.6209, + "step": 2217 + }, + { + "epoch": 0.046708749466945344, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5989, + "step": 2218 + }, + { + "epoch": 0.04672980841620907, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5628, + "step": 2219 + }, + { + "epoch": 0.0467508673654728, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.628, + "step": 2220 + }, + { + "epoch": 0.04677192631473653, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 2221 + }, + { + "epoch": 0.046792985264000254, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5852, + "step": 2222 + }, + { + "epoch": 0.04681404421326398, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5906, + "step": 2223 + }, + { + "epoch": 0.0468351031625277, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5768, + "step": 2224 + }, + { + "epoch": 0.04685616211179143, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5892, + "step": 2225 + }, + { + "epoch": 0.04687722106105516, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5862, + "step": 2226 + }, + { + "epoch": 0.046898280010318885, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.6256, + "step": 2227 + }, + { + "epoch": 0.04691933895958261, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5428, + "step": 2228 + }, + { + "epoch": 0.04694039790884634, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 2229 + }, + { + "epoch": 0.04696145685811007, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 2230 + }, + { + "epoch": 0.04698251580737379, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 2231 + }, + { + "epoch": 0.047003574756637516, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5975, + "step": 2232 + }, + { + "epoch": 0.04702463370590124, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 2233 + }, + { + "epoch": 0.04704569265516497, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5845, + "step": 2234 + }, + { + "epoch": 0.0470667516044287, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5938, + "step": 2235 + }, + { + "epoch": 0.047087810553692426, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 2236 + }, + { + "epoch": 0.04710886950295615, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5549, + "step": 2237 + }, + { + "epoch": 0.047129928452219874, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 2238 + }, + { + "epoch": 0.0471509874014836, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 2239 + }, + { + "epoch": 0.04717204635074733, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5802, + "step": 2240 + }, + { + "epoch": 0.04719310530001106, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5661, + "step": 2241 + }, + { + "epoch": 0.047214164249274784, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5734, + "step": 2242 + }, + { + "epoch": 0.04723522319853851, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5963, + "step": 2243 + }, + { + "epoch": 0.04725628214780223, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 2244 + }, + { + "epoch": 0.04727734109706596, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5899, + "step": 2245 + }, + { + "epoch": 0.04729840004632969, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5898, + "step": 2246 + }, + { + "epoch": 0.047319458995593415, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6051, + "step": 2247 + }, + { + "epoch": 0.04734051794485714, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 2248 + }, + { + "epoch": 0.04736157689412087, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.604, + "step": 2249 + }, + { + "epoch": 0.0473826358433846, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5657, + "step": 2250 + }, + { + "epoch": 0.04740369479264832, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.643, + "step": 2251 + }, + { + "epoch": 0.047424753741912046, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5994, + "step": 2252 + }, + { + "epoch": 0.04744581269117577, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5882, + "step": 2253 + }, + { + "epoch": 0.0474668716404395, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.605, + "step": 2254 + }, + { + "epoch": 0.04748793058970323, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.593, + "step": 2255 + }, + { + "epoch": 0.047508989538966956, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6179, + "step": 2256 + }, + { + "epoch": 0.047530048488230676, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6007, + "step": 2257 + }, + { + "epoch": 0.047551107437494404, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 2258 + }, + { + "epoch": 0.04757216638675813, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6399, + "step": 2259 + }, + { + "epoch": 0.04759322533602186, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5978, + "step": 2260 + }, + { + "epoch": 0.04761428428528559, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 2261 + }, + { + "epoch": 0.047635343234549314, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 2262 + }, + { + "epoch": 0.04765640218381304, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.5893, + "step": 2263 + }, + { + "epoch": 0.04767746113307676, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.597, + "step": 2264 + }, + { + "epoch": 0.04769852008234049, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 1.6333, + "step": 2265 + }, + { + "epoch": 0.04771957903160422, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 1.6663, + "step": 2266 + }, + { + "epoch": 0.047740637980867945, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.616, + "step": 2267 + }, + { + "epoch": 0.04776169693013167, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 2268 + }, + { + "epoch": 0.0477827558793954, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 2269 + }, + { + "epoch": 0.04780381482865913, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5683, + "step": 2270 + }, + { + "epoch": 0.04782487377792285, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5998, + "step": 2271 + }, + { + "epoch": 0.047845932727186576, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 2272 + }, + { + "epoch": 0.0478669916764503, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 2273 + }, + { + "epoch": 0.04788805062571403, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6072, + "step": 2274 + }, + { + "epoch": 0.04790910957497776, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6102, + "step": 2275 + }, + { + "epoch": 0.047930168524241486, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6282, + "step": 2276 + }, + { + "epoch": 0.047951227473505206, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 2277 + }, + { + "epoch": 0.047972286422768934, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 2278 + }, + { + "epoch": 0.04799334537203266, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6021, + "step": 2279 + }, + { + "epoch": 0.04801440432129639, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.603, + "step": 2280 + }, + { + "epoch": 0.048035463270560116, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5933, + "step": 2281 + }, + { + "epoch": 0.048056522219823844, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 2282 + }, + { + "epoch": 0.04807758116908757, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6145, + "step": 2283 + }, + { + "epoch": 0.04809864011835129, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.5623, + "step": 2284 + }, + { + "epoch": 0.04811969906761502, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5876, + "step": 2285 + }, + { + "epoch": 0.04814075801687875, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6138, + "step": 2286 + }, + { + "epoch": 0.048161816966142475, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 2287 + }, + { + "epoch": 0.0481828759154062, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 2288 + }, + { + "epoch": 0.04820393486466993, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 2289 + }, + { + "epoch": 0.04822499381393365, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5955, + "step": 2290 + }, + { + "epoch": 0.04824605276319738, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 2291 + }, + { + "epoch": 0.048267111712461105, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6034, + "step": 2292 + }, + { + "epoch": 0.04828817066172483, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6028, + "step": 2293 + }, + { + "epoch": 0.04830922961098856, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5989, + "step": 2294 + }, + { + "epoch": 0.04833028856025229, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 1.5621, + "step": 2295 + }, + { + "epoch": 0.048351347509516016, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 2296 + }, + { + "epoch": 0.048372406458779736, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5719, + "step": 2297 + }, + { + "epoch": 0.048393465408043464, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6334, + "step": 2298 + }, + { + "epoch": 0.04841452435730719, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5648, + "step": 2299 + }, + { + "epoch": 0.04843558330657092, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5861, + "step": 2300 + }, + { + "epoch": 0.048456642255834646, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5743, + "step": 2301 + }, + { + "epoch": 0.048477701205098374, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 2302 + }, + { + "epoch": 0.0484987601543621, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 2303 + }, + { + "epoch": 0.04851981910362582, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6033, + "step": 2304 + }, + { + "epoch": 0.04854087805288955, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5778, + "step": 2305 + }, + { + "epoch": 0.04856193700215328, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 2306 + }, + { + "epoch": 0.048582995951417005, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6034, + "step": 2307 + }, + { + "epoch": 0.04860405490068073, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 2308 + }, + { + "epoch": 0.04862511384994446, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5816, + "step": 2309 + }, + { + "epoch": 0.04864617279920818, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 2310 + }, + { + "epoch": 0.04866723174847191, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5786, + "step": 2311 + }, + { + "epoch": 0.048688290697735635, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5972, + "step": 2312 + }, + { + "epoch": 0.04870934964699936, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.611, + "step": 2313 + }, + { + "epoch": 0.04873040859626309, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5743, + "step": 2314 + }, + { + "epoch": 0.04875146754552682, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 2315 + }, + { + "epoch": 0.048772526494790545, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5898, + "step": 2316 + }, + { + "epoch": 0.048793585444054266, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5734, + "step": 2317 + }, + { + "epoch": 0.048814644393317994, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 2318 + }, + { + "epoch": 0.04883570334258172, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5997, + "step": 2319 + }, + { + "epoch": 0.04885676229184545, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6098, + "step": 2320 + }, + { + "epoch": 0.048877821241109176, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 2321 + }, + { + "epoch": 0.048898880190372904, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5676, + "step": 2322 + }, + { + "epoch": 0.04891993913963663, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5777, + "step": 2323 + }, + { + "epoch": 0.04894099808890035, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.598, + "step": 2324 + }, + { + "epoch": 0.04896205703816408, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 2325 + }, + { + "epoch": 0.04898311598742781, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6068, + "step": 2326 + }, + { + "epoch": 0.049004174936691534, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5947, + "step": 2327 + }, + { + "epoch": 0.04902523388595526, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 1.5867, + "step": 2328 + }, + { + "epoch": 0.04904629283521899, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5654, + "step": 2329 + }, + { + "epoch": 0.04906735178448271, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5914, + "step": 2330 + }, + { + "epoch": 0.04908841073374644, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.586, + "step": 2331 + }, + { + "epoch": 0.049109469683010165, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5908, + "step": 2332 + }, + { + "epoch": 0.04913052863227389, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5924, + "step": 2333 + }, + { + "epoch": 0.04915158758153762, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6278, + "step": 2334 + }, + { + "epoch": 0.04917264653080135, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5881, + "step": 2335 + }, + { + "epoch": 0.049193705480065075, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5798, + "step": 2336 + }, + { + "epoch": 0.049214764429328796, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5777, + "step": 2337 + }, + { + "epoch": 0.04923582337859252, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 2338 + }, + { + "epoch": 0.04925688232785625, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.5824, + "step": 2339 + }, + { + "epoch": 0.04927794127711998, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 2340 + }, + { + "epoch": 0.049299000226383706, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 2341 + }, + { + "epoch": 0.049320059175647434, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6241, + "step": 2342 + }, + { + "epoch": 0.049341118124911154, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 2343 + }, + { + "epoch": 0.04936217707417488, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 2344 + }, + { + "epoch": 0.04938323602343861, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5719, + "step": 2345 + }, + { + "epoch": 0.04940429497270234, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 2346 + }, + { + "epoch": 0.049425353921966064, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6035, + "step": 2347 + }, + { + "epoch": 0.04944641287122979, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.611, + "step": 2348 + }, + { + "epoch": 0.04946747182049352, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 2349 + }, + { + "epoch": 0.04948853076975724, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6077, + "step": 2350 + }, + { + "epoch": 0.04950958971902097, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.583, + "step": 2351 + }, + { + "epoch": 0.049530648668284695, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5872, + "step": 2352 + }, + { + "epoch": 0.04955170761754842, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 2353 + }, + { + "epoch": 0.04957276656681215, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5635, + "step": 2354 + }, + { + "epoch": 0.04959382551607588, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6209, + "step": 2355 + }, + { + "epoch": 0.049614884465339605, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 2356 + }, + { + "epoch": 0.049635943414603326, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5779, + "step": 2357 + }, + { + "epoch": 0.04965700236386705, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5767, + "step": 2358 + }, + { + "epoch": 0.04967806131313078, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 2359 + }, + { + "epoch": 0.04969912026239451, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6, + "step": 2360 + }, + { + "epoch": 0.049720179211658236, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5986, + "step": 2361 + }, + { + "epoch": 0.04974123816092196, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5897, + "step": 2362 + }, + { + "epoch": 0.049762297110185684, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5719, + "step": 2363 + }, + { + "epoch": 0.04978335605944941, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5908, + "step": 2364 + }, + { + "epoch": 0.04980441500871314, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 2365 + }, + { + "epoch": 0.04982547395797687, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5835, + "step": 2366 + }, + { + "epoch": 0.049846532907240594, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6007, + "step": 2367 + }, + { + "epoch": 0.04986759185650432, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 2368 + }, + { + "epoch": 0.04988865080576805, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6026, + "step": 2369 + }, + { + "epoch": 0.04990970975503177, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 1.6107, + "step": 2370 + }, + { + "epoch": 0.0499307687042955, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5673, + "step": 2371 + }, + { + "epoch": 0.049951827653559225, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6112, + "step": 2372 + }, + { + "epoch": 0.04997288660282295, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 2373 + }, + { + "epoch": 0.04999394555208668, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6032, + "step": 2374 + }, + { + "epoch": 0.05001500450135041, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.59, + "step": 2375 + }, + { + "epoch": 0.050036063450614135, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 2376 + }, + { + "epoch": 0.050057122399877856, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5624, + "step": 2377 + }, + { + "epoch": 0.05007818134914158, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5492, + "step": 2378 + }, + { + "epoch": 0.05009924029840531, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.6157, + "step": 2379 + }, + { + "epoch": 0.05012029924766904, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 2380 + }, + { + "epoch": 0.050141358196932766, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5712, + "step": 2381 + }, + { + "epoch": 0.05016241714619649, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5943, + "step": 2382 + }, + { + "epoch": 0.050183476095460214, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6012, + "step": 2383 + }, + { + "epoch": 0.05020453504472394, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5972, + "step": 2384 + }, + { + "epoch": 0.05022559399398767, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6081, + "step": 2385 + }, + { + "epoch": 0.050246652943251396, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6204, + "step": 2386 + }, + { + "epoch": 0.050267711892515124, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5849, + "step": 2387 + }, + { + "epoch": 0.05028877084177885, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5819, + "step": 2388 + }, + { + "epoch": 0.05030982979104258, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 2389 + }, + { + "epoch": 0.0503308887403063, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.605, + "step": 2390 + }, + { + "epoch": 0.05035194768957003, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.6118, + "step": 2391 + }, + { + "epoch": 0.050373006638833755, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5658, + "step": 2392 + }, + { + "epoch": 0.05039406558809748, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.609, + "step": 2393 + }, + { + "epoch": 0.05041512453736121, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6149, + "step": 2394 + }, + { + "epoch": 0.05043618348662494, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 2395 + }, + { + "epoch": 0.05045724243588866, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 1.5839, + "step": 2396 + }, + { + "epoch": 0.050478301385152385, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 2397 + }, + { + "epoch": 0.05049936033441611, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6133, + "step": 2398 + }, + { + "epoch": 0.05052041928367984, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 2399 + }, + { + "epoch": 0.05054147823294357, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5836, + "step": 2400 + }, + { + "epoch": 0.050562537182207296, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5626, + "step": 2401 + }, + { + "epoch": 0.05058359613147102, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6125, + "step": 2402 + }, + { + "epoch": 0.050604655080734744, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 2403 + }, + { + "epoch": 0.05062571402999847, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 2404 + }, + { + "epoch": 0.0506467729792622, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5939, + "step": 2405 + }, + { + "epoch": 0.050667831928525926, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6173, + "step": 2406 + }, + { + "epoch": 0.050688890877789654, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5952, + "step": 2407 + }, + { + "epoch": 0.05070994982705338, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6027, + "step": 2408 + }, + { + "epoch": 0.05073100877631711, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5826, + "step": 2409 + }, + { + "epoch": 0.05075206772558083, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6061, + "step": 2410 + }, + { + "epoch": 0.05077312667484456, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5703, + "step": 2411 + }, + { + "epoch": 0.050794185624108285, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 2412 + }, + { + "epoch": 0.05081524457337201, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5704, + "step": 2413 + }, + { + "epoch": 0.05083630352263574, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5692, + "step": 2414 + }, + { + "epoch": 0.05085736247189947, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5654, + "step": 2415 + }, + { + "epoch": 0.05087842142116319, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 2416 + }, + { + "epoch": 0.050899480370426915, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5802, + "step": 2417 + }, + { + "epoch": 0.05092053931969064, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5807, + "step": 2418 + }, + { + "epoch": 0.05094159826895437, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5972, + "step": 2419 + }, + { + "epoch": 0.0509626572182181, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.5843, + "step": 2420 + }, + { + "epoch": 0.050983716167481825, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.622, + "step": 2421 + }, + { + "epoch": 0.05100477511674555, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5887, + "step": 2422 + }, + { + "epoch": 0.051025834066009274, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5709, + "step": 2423 + }, + { + "epoch": 0.051046893015273, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6231, + "step": 2424 + }, + { + "epoch": 0.05106795196453673, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 2425 + }, + { + "epoch": 0.051089010913800456, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 2426 + }, + { + "epoch": 0.051110069863064184, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 2427 + }, + { + "epoch": 0.05113112881232791, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5758, + "step": 2428 + }, + { + "epoch": 0.05115218776159164, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6167, + "step": 2429 + }, + { + "epoch": 0.05117324671085536, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5977, + "step": 2430 + }, + { + "epoch": 0.05119430566011909, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 2431 + }, + { + "epoch": 0.051215364609382814, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5768, + "step": 2432 + }, + { + "epoch": 0.05123642355864654, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5878, + "step": 2433 + }, + { + "epoch": 0.05125748250791027, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5849, + "step": 2434 + }, + { + "epoch": 0.051278541457174, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5851, + "step": 2435 + }, + { + "epoch": 0.05129960040643772, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5922, + "step": 2436 + }, + { + "epoch": 0.051320659355701445, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 2437 + }, + { + "epoch": 0.05134171830496517, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.594, + "step": 2438 + }, + { + "epoch": 0.0513627772542289, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 2439 + }, + { + "epoch": 0.05138383620349263, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5868, + "step": 2440 + }, + { + "epoch": 0.051404895152756355, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6003, + "step": 2441 + }, + { + "epoch": 0.05142595410202008, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6046, + "step": 2442 + }, + { + "epoch": 0.0514470130512838, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 1.5878, + "step": 2443 + }, + { + "epoch": 0.05146807200054753, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 2444 + }, + { + "epoch": 0.05148913094981126, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5925, + "step": 2445 + }, + { + "epoch": 0.051510189899074986, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 2446 + }, + { + "epoch": 0.051531248848338713, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5636, + "step": 2447 + }, + { + "epoch": 0.05155230779760244, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6255, + "step": 2448 + }, + { + "epoch": 0.05157336674686617, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5846, + "step": 2449 + }, + { + "epoch": 0.05159442569612989, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6109, + "step": 2450 + }, + { + "epoch": 0.05161548464539362, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 1.5711, + "step": 2451 + }, + { + "epoch": 0.051636543594657344, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5943, + "step": 2452 + }, + { + "epoch": 0.05165760254392107, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 2453 + }, + { + "epoch": 0.0516786614931848, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 2454 + }, + { + "epoch": 0.05169972044244853, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6096, + "step": 2455 + }, + { + "epoch": 0.05172077939171225, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5665, + "step": 2456 + }, + { + "epoch": 0.051741838340975975, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5819, + "step": 2457 + }, + { + "epoch": 0.0517628972902397, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6206, + "step": 2458 + }, + { + "epoch": 0.05178395623950343, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6381, + "step": 2459 + }, + { + "epoch": 0.05180501518876716, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 2460 + }, + { + "epoch": 0.051826074138030885, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5836, + "step": 2461 + }, + { + "epoch": 0.05184713308729461, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.598, + "step": 2462 + }, + { + "epoch": 0.05186819203655833, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 2463 + }, + { + "epoch": 0.05188925098582206, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 2464 + }, + { + "epoch": 0.05191030993508579, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5799, + "step": 2465 + }, + { + "epoch": 0.051931368884349516, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5963, + "step": 2466 + }, + { + "epoch": 0.05195242783361324, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5858, + "step": 2467 + }, + { + "epoch": 0.05197348678287697, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6237, + "step": 2468 + }, + { + "epoch": 0.05199454573214069, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5768, + "step": 2469 + }, + { + "epoch": 0.05201560468140442, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5735, + "step": 2470 + }, + { + "epoch": 0.05203666363066815, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5988, + "step": 2471 + }, + { + "epoch": 0.052057722579931874, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5947, + "step": 2472 + }, + { + "epoch": 0.0520787815291956, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5686, + "step": 2473 + }, + { + "epoch": 0.05209984047845933, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5656, + "step": 2474 + }, + { + "epoch": 0.05212089942772306, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 2475 + }, + { + "epoch": 0.05214195837698678, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6371, + "step": 2476 + }, + { + "epoch": 0.052163017326250505, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5803, + "step": 2477 + }, + { + "epoch": 0.05218407627551423, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 2478 + }, + { + "epoch": 0.05220513522477796, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5873, + "step": 2479 + }, + { + "epoch": 0.05222619417404169, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 2480 + }, + { + "epoch": 0.052247253123305415, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5947, + "step": 2481 + }, + { + "epoch": 0.05226831207256914, + "grad_norm": 0.48828125, + "learning_rate": 0.0002, + "loss": 1.6069, + "step": 2482 + }, + { + "epoch": 0.05228937102183286, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5696, + "step": 2483 + }, + { + "epoch": 0.05231042997109659, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5729, + "step": 2484 + }, + { + "epoch": 0.05233148892036032, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5934, + "step": 2485 + }, + { + "epoch": 0.052352547869624046, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 2486 + }, + { + "epoch": 0.05237360681888777, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5966, + "step": 2487 + }, + { + "epoch": 0.0523946657681515, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6011, + "step": 2488 + }, + { + "epoch": 0.05241572471741522, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 2489 + }, + { + "epoch": 0.05243678366667895, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5847, + "step": 2490 + }, + { + "epoch": 0.052457842615942676, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.6032, + "step": 2491 + }, + { + "epoch": 0.052478901565206404, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.602, + "step": 2492 + }, + { + "epoch": 0.05249996051447013, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6155, + "step": 2493 + }, + { + "epoch": 0.05252101946373386, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5852, + "step": 2494 + }, + { + "epoch": 0.052542078412997587, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5672, + "step": 2495 + }, + { + "epoch": 0.05256313736226131, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 2496 + }, + { + "epoch": 0.052584196311525035, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5786, + "step": 2497 + }, + { + "epoch": 0.05260525526078876, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5817, + "step": 2498 + }, + { + "epoch": 0.05262631421005249, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5944, + "step": 2499 + }, + { + "epoch": 0.05264737315931622, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5953, + "step": 2500 + }, + { + "epoch": 0.05264737315931622, + "eval_loss": 1.8555322885513306, + "eval_runtime": 897.3744, + "eval_samples_per_second": 68.868, + "eval_steps_per_second": 2.153, + "step": 2500 + }, + { + "epoch": 0.052668432108579945, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5904, + "step": 2501 + }, + { + "epoch": 0.05268949105784367, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 2502 + }, + { + "epoch": 0.05271055000710739, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5756, + "step": 2503 + }, + { + "epoch": 0.05273160895637112, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 2504 + }, + { + "epoch": 0.05275266790563485, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5798, + "step": 2505 + }, + { + "epoch": 0.052773726854898576, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6013, + "step": 2506 + }, + { + "epoch": 0.0527947858041623, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5795, + "step": 2507 + }, + { + "epoch": 0.05281584475342603, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 2508 + }, + { + "epoch": 0.05283690370268975, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5896, + "step": 2509 + }, + { + "epoch": 0.05285796265195348, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.597, + "step": 2510 + }, + { + "epoch": 0.052879021601217206, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.605, + "step": 2511 + }, + { + "epoch": 0.052900080550480934, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5908, + "step": 2512 + }, + { + "epoch": 0.05292113949974466, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6022, + "step": 2513 + }, + { + "epoch": 0.05294219844900839, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5697, + "step": 2514 + }, + { + "epoch": 0.052963257398272116, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6134, + "step": 2515 + }, + { + "epoch": 0.05298431634753584, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5768, + "step": 2516 + }, + { + "epoch": 0.053005375296799564, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 2517 + }, + { + "epoch": 0.05302643424606329, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.585, + "step": 2518 + }, + { + "epoch": 0.05304749319532702, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.583, + "step": 2519 + }, + { + "epoch": 0.05306855214459075, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 2520 + }, + { + "epoch": 0.053089611093854475, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 2521 + }, + { + "epoch": 0.053110670043118195, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.592, + "step": 2522 + }, + { + "epoch": 0.05313172899238192, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 2523 + }, + { + "epoch": 0.05315278794164565, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5942, + "step": 2524 + }, + { + "epoch": 0.05317384689090938, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 2525 + }, + { + "epoch": 0.053194905840173105, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 2526 + }, + { + "epoch": 0.05321596478943683, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5898, + "step": 2527 + }, + { + "epoch": 0.05323702373870056, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 2528 + }, + { + "epoch": 0.05325808268796428, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 2529 + }, + { + "epoch": 0.05327914163722801, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.602, + "step": 2530 + }, + { + "epoch": 0.053300200586491736, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5657, + "step": 2531 + }, + { + "epoch": 0.053321259535755464, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 2532 + }, + { + "epoch": 0.05334231848501919, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5756, + "step": 2533 + }, + { + "epoch": 0.05336337743428292, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6181, + "step": 2534 + }, + { + "epoch": 0.053384436383546646, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.593, + "step": 2535 + }, + { + "epoch": 0.05340549533281037, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 1.5748, + "step": 2536 + }, + { + "epoch": 0.053426554282074094, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 2537 + }, + { + "epoch": 0.05344761323133782, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6145, + "step": 2538 + }, + { + "epoch": 0.05346867218060155, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 2539 + }, + { + "epoch": 0.05348973112986528, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 2540 + }, + { + "epoch": 0.053510790079129004, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 2541 + }, + { + "epoch": 0.053531849028392725, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5955, + "step": 2542 + }, + { + "epoch": 0.05355290797765645, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.595, + "step": 2543 + }, + { + "epoch": 0.05357396692692018, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6078, + "step": 2544 + }, + { + "epoch": 0.05359502587618391, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 2545 + }, + { + "epoch": 0.053616084825447635, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6112, + "step": 2546 + }, + { + "epoch": 0.05363714377471136, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5546, + "step": 2547 + }, + { + "epoch": 0.05365820272397509, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 2548 + }, + { + "epoch": 0.05367926167323881, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5854, + "step": 2549 + }, + { + "epoch": 0.05370032062250254, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 2550 + }, + { + "epoch": 0.053721379571766266, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5784, + "step": 2551 + }, + { + "epoch": 0.053742438521029993, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6059, + "step": 2552 + }, + { + "epoch": 0.05376349747029372, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 1.5758, + "step": 2553 + }, + { + "epoch": 0.05378455641955745, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6042, + "step": 2554 + }, + { + "epoch": 0.053805615368821176, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5824, + "step": 2555 + }, + { + "epoch": 0.0538266743180849, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5912, + "step": 2556 + }, + { + "epoch": 0.053847733267348624, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 2557 + }, + { + "epoch": 0.05386879221661235, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6186, + "step": 2558 + }, + { + "epoch": 0.05388985116587608, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5744, + "step": 2559 + }, + { + "epoch": 0.05391091011513981, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6282, + "step": 2560 + }, + { + "epoch": 0.053931969064403534, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5777, + "step": 2561 + }, + { + "epoch": 0.053953028013667255, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5955, + "step": 2562 + }, + { + "epoch": 0.05397408696293098, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6096, + "step": 2563 + }, + { + "epoch": 0.05399514591219471, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5787, + "step": 2564 + }, + { + "epoch": 0.05401620486145844, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5965, + "step": 2565 + }, + { + "epoch": 0.054037263810722165, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6102, + "step": 2566 + }, + { + "epoch": 0.05405832275998589, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 2567 + }, + { + "epoch": 0.05407938170924962, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 2568 + }, + { + "epoch": 0.05410044065851334, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 2569 + }, + { + "epoch": 0.05412149960777707, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 2570 + }, + { + "epoch": 0.054142558557040796, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 2571 + }, + { + "epoch": 0.05416361750630452, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.585, + "step": 2572 + }, + { + "epoch": 0.05418467645556825, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5844, + "step": 2573 + }, + { + "epoch": 0.05420573540483198, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5671, + "step": 2574 + }, + { + "epoch": 0.0542267943540957, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.597, + "step": 2575 + }, + { + "epoch": 0.054247853303359427, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5861, + "step": 2576 + }, + { + "epoch": 0.054268912252623154, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6102, + "step": 2577 + }, + { + "epoch": 0.05428997120188688, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5528, + "step": 2578 + }, + { + "epoch": 0.05431103015115061, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6068, + "step": 2579 + }, + { + "epoch": 0.05433208910041434, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6016, + "step": 2580 + }, + { + "epoch": 0.054353148049678064, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 2581 + }, + { + "epoch": 0.054374206998941785, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 2582 + }, + { + "epoch": 0.05439526594820551, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6144, + "step": 2583 + }, + { + "epoch": 0.05441632489746924, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5942, + "step": 2584 + }, + { + "epoch": 0.05443738384673297, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 1.6068, + "step": 2585 + }, + { + "epoch": 0.054458442795996695, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6001, + "step": 2586 + }, + { + "epoch": 0.05447950174526042, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5979, + "step": 2587 + }, + { + "epoch": 0.05450056069452415, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5743, + "step": 2588 + }, + { + "epoch": 0.05452161964378787, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.5878, + "step": 2589 + }, + { + "epoch": 0.0545426785930516, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6118, + "step": 2590 + }, + { + "epoch": 0.054563737542315326, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5729, + "step": 2591 + }, + { + "epoch": 0.05458479649157905, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6074, + "step": 2592 + }, + { + "epoch": 0.05460585544084278, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5974, + "step": 2593 + }, + { + "epoch": 0.05462691439010651, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5633, + "step": 2594 + }, + { + "epoch": 0.05464797333937023, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6041, + "step": 2595 + }, + { + "epoch": 0.054669032288633956, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5818, + "step": 2596 + }, + { + "epoch": 0.054690091237897684, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5868, + "step": 2597 + }, + { + "epoch": 0.05471115018716141, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.5836, + "step": 2598 + }, + { + "epoch": 0.05473220913642514, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5778, + "step": 2599 + }, + { + "epoch": 0.054753268085688866, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5863, + "step": 2600 + }, + { + "epoch": 0.054774327034952594, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5708, + "step": 2601 + }, + { + "epoch": 0.054795385984216315, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5663, + "step": 2602 + }, + { + "epoch": 0.05481644493348004, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5629, + "step": 2603 + }, + { + "epoch": 0.05483750388274377, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5663, + "step": 2604 + }, + { + "epoch": 0.0548585628320075, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6129, + "step": 2605 + }, + { + "epoch": 0.054879621781271225, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5985, + "step": 2606 + }, + { + "epoch": 0.05490068073053495, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5558, + "step": 2607 + }, + { + "epoch": 0.05492173967979868, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5482, + "step": 2608 + }, + { + "epoch": 0.0549427986290624, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5628, + "step": 2609 + }, + { + "epoch": 0.05496385757832613, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 2610 + }, + { + "epoch": 0.054984916527589855, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5704, + "step": 2611 + }, + { + "epoch": 0.05500597547685358, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6135, + "step": 2612 + }, + { + "epoch": 0.05502703442611731, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5626, + "step": 2613 + }, + { + "epoch": 0.05504809337538104, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5811, + "step": 2614 + }, + { + "epoch": 0.05506915232464476, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5719, + "step": 2615 + }, + { + "epoch": 0.055090211273908486, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.554, + "step": 2616 + }, + { + "epoch": 0.055111270223172214, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6253, + "step": 2617 + }, + { + "epoch": 0.05513232917243594, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.615, + "step": 2618 + }, + { + "epoch": 0.05515338812169967, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 2619 + }, + { + "epoch": 0.055174447070963396, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6247, + "step": 2620 + }, + { + "epoch": 0.055195506020227124, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5748, + "step": 2621 + }, + { + "epoch": 0.055216564969490844, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 2622 + }, + { + "epoch": 0.05523762391875457, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6238, + "step": 2623 + }, + { + "epoch": 0.0552586828680183, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5523, + "step": 2624 + }, + { + "epoch": 0.05527974181728203, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6159, + "step": 2625 + }, + { + "epoch": 0.055300800766545755, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5625, + "step": 2626 + }, + { + "epoch": 0.05532185971580948, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5852, + "step": 2627 + }, + { + "epoch": 0.0553429186650732, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5943, + "step": 2628 + }, + { + "epoch": 0.05536397761433693, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5604, + "step": 2629 + }, + { + "epoch": 0.05538503656360066, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6234, + "step": 2630 + }, + { + "epoch": 0.055406095512864385, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5855, + "step": 2631 + }, + { + "epoch": 0.05542715446212811, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.6307, + "step": 2632 + }, + { + "epoch": 0.05544821341139184, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5798, + "step": 2633 + }, + { + "epoch": 0.05546927236065557, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 2634 + }, + { + "epoch": 0.05549033130991929, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5597, + "step": 2635 + }, + { + "epoch": 0.055511390259183016, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6119, + "step": 2636 + }, + { + "epoch": 0.055532449208446744, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5777, + "step": 2637 + }, + { + "epoch": 0.05555350815771047, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5953, + "step": 2638 + }, + { + "epoch": 0.0555745671069742, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 2639 + }, + { + "epoch": 0.055595626056237926, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 2640 + }, + { + "epoch": 0.055616685005501654, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.6218, + "step": 2641 + }, + { + "epoch": 0.055637743954765374, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5843, + "step": 2642 + }, + { + "epoch": 0.0556588029040291, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6108, + "step": 2643 + }, + { + "epoch": 0.05567986185329283, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6051, + "step": 2644 + }, + { + "epoch": 0.05570092080255656, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5759, + "step": 2645 + }, + { + "epoch": 0.055721979751820284, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5936, + "step": 2646 + }, + { + "epoch": 0.05574303870108401, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 2647 + }, + { + "epoch": 0.05576409765034773, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5902, + "step": 2648 + }, + { + "epoch": 0.05578515659961146, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 2649 + }, + { + "epoch": 0.05580621554887519, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 2650 + }, + { + "epoch": 0.055827274498138915, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5868, + "step": 2651 + }, + { + "epoch": 0.05584833344740264, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.6069, + "step": 2652 + }, + { + "epoch": 0.05586939239666637, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5873, + "step": 2653 + }, + { + "epoch": 0.0558904513459301, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 2654 + }, + { + "epoch": 0.05591151029519382, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5855, + "step": 2655 + }, + { + "epoch": 0.055932569244457546, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6026, + "step": 2656 + }, + { + "epoch": 0.05595362819372127, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5953, + "step": 2657 + }, + { + "epoch": 0.055974687142985, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6051, + "step": 2658 + }, + { + "epoch": 0.05599574609224873, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6029, + "step": 2659 + }, + { + "epoch": 0.056016805041512456, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5797, + "step": 2660 + }, + { + "epoch": 0.056037863990776184, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5865, + "step": 2661 + }, + { + "epoch": 0.056058922940039904, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5665, + "step": 2662 + }, + { + "epoch": 0.05607998188930363, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5664, + "step": 2663 + }, + { + "epoch": 0.05610104083856736, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 2664 + }, + { + "epoch": 0.05612209978783109, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6215, + "step": 2665 + }, + { + "epoch": 0.056143158737094814, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.581, + "step": 2666 + }, + { + "epoch": 0.05616421768635854, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6117, + "step": 2667 + }, + { + "epoch": 0.05618527663562226, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6226, + "step": 2668 + }, + { + "epoch": 0.05620633558488599, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.58, + "step": 2669 + }, + { + "epoch": 0.05622739453414972, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5784, + "step": 2670 + }, + { + "epoch": 0.056248453483413445, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 2671 + }, + { + "epoch": 0.05626951243267717, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 2672 + }, + { + "epoch": 0.0562905713819409, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 2673 + }, + { + "epoch": 0.05631163033120463, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5676, + "step": 2674 + }, + { + "epoch": 0.05633268928046835, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5918, + "step": 2675 + }, + { + "epoch": 0.056353748229732076, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5929, + "step": 2676 + }, + { + "epoch": 0.0563748071789958, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.583, + "step": 2677 + }, + { + "epoch": 0.05639586612825953, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6212, + "step": 2678 + }, + { + "epoch": 0.05641692507752326, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.574, + "step": 2679 + }, + { + "epoch": 0.056437984026786986, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6188, + "step": 2680 + }, + { + "epoch": 0.056459042976050706, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6065, + "step": 2681 + }, + { + "epoch": 0.056480101925314434, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.605, + "step": 2682 + }, + { + "epoch": 0.05650116087457816, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5809, + "step": 2683 + }, + { + "epoch": 0.05652221982384189, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 2684 + }, + { + "epoch": 0.05654327877310562, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5698, + "step": 2685 + }, + { + "epoch": 0.056564337722369344, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 2686 + }, + { + "epoch": 0.05658539667163307, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.593, + "step": 2687 + }, + { + "epoch": 0.05660645562089679, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.577, + "step": 2688 + }, + { + "epoch": 0.05662751457016052, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 2689 + }, + { + "epoch": 0.05664857351942425, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5872, + "step": 2690 + }, + { + "epoch": 0.056669632468687975, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 2691 + }, + { + "epoch": 0.0566906914179517, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6102, + "step": 2692 + }, + { + "epoch": 0.05671175036721543, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5731, + "step": 2693 + }, + { + "epoch": 0.05673280931647916, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 2694 + }, + { + "epoch": 0.05675386826574288, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5873, + "step": 2695 + }, + { + "epoch": 0.056774927215006606, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6103, + "step": 2696 + }, + { + "epoch": 0.05679598616427033, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.578, + "step": 2697 + }, + { + "epoch": 0.05681704511353406, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6029, + "step": 2698 + }, + { + "epoch": 0.05683810406279779, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 2699 + }, + { + "epoch": 0.056859163012061516, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 2700 + }, + { + "epoch": 0.056880221961325236, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5855, + "step": 2701 + }, + { + "epoch": 0.056901280910588964, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5619, + "step": 2702 + }, + { + "epoch": 0.05692233985985269, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 2703 + }, + { + "epoch": 0.05694339880911642, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.562, + "step": 2704 + }, + { + "epoch": 0.056964457758380146, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5656, + "step": 2705 + }, + { + "epoch": 0.056985516707643874, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5647, + "step": 2706 + }, + { + "epoch": 0.0570065756569076, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5954, + "step": 2707 + }, + { + "epoch": 0.05702763460617132, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5997, + "step": 2708 + }, + { + "epoch": 0.05704869355543505, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5598, + "step": 2709 + }, + { + "epoch": 0.05706975250469878, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5839, + "step": 2710 + }, + { + "epoch": 0.057090811453962505, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 2711 + }, + { + "epoch": 0.05711187040322623, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5614, + "step": 2712 + }, + { + "epoch": 0.05713292935248996, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 2713 + }, + { + "epoch": 0.05715398830175369, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5641, + "step": 2714 + }, + { + "epoch": 0.05717504725101741, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6035, + "step": 2715 + }, + { + "epoch": 0.057196106200281135, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6472, + "step": 2716 + }, + { + "epoch": 0.05721716514954486, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6103, + "step": 2717 + }, + { + "epoch": 0.05723822409880859, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5716, + "step": 2718 + }, + { + "epoch": 0.05725928304807232, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 2719 + }, + { + "epoch": 0.057280341997336046, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 2720 + }, + { + "epoch": 0.057301400946599766, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6077, + "step": 2721 + }, + { + "epoch": 0.057322459895863494, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6077, + "step": 2722 + }, + { + "epoch": 0.05734351884512722, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 2723 + }, + { + "epoch": 0.05736457779439095, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5734, + "step": 2724 + }, + { + "epoch": 0.057385636743654676, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6142, + "step": 2725 + }, + { + "epoch": 0.057406695692918404, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 2726 + }, + { + "epoch": 0.05742775464218213, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6103, + "step": 2727 + }, + { + "epoch": 0.05744881359144585, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6247, + "step": 2728 + }, + { + "epoch": 0.05746987254070958, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5799, + "step": 2729 + }, + { + "epoch": 0.05749093148997331, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 2730 + }, + { + "epoch": 0.057511990439237035, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 2731 + }, + { + "epoch": 0.05753304938850076, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5822, + "step": 2732 + }, + { + "epoch": 0.05755410833776449, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5873, + "step": 2733 + }, + { + "epoch": 0.05757516728702822, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6042, + "step": 2734 + }, + { + "epoch": 0.05759622623629194, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 2735 + }, + { + "epoch": 0.057617285185555665, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5787, + "step": 2736 + }, + { + "epoch": 0.05763834413481939, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.567, + "step": 2737 + }, + { + "epoch": 0.05765940308408312, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 2738 + }, + { + "epoch": 0.05768046203334685, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5763, + "step": 2739 + }, + { + "epoch": 0.057701520982610575, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6204, + "step": 2740 + }, + { + "epoch": 0.057722579931874296, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5699, + "step": 2741 + }, + { + "epoch": 0.057743638881138024, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6197, + "step": 2742 + }, + { + "epoch": 0.05776469783040175, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 2743 + }, + { + "epoch": 0.05778575677966548, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6286, + "step": 2744 + }, + { + "epoch": 0.057806815728929206, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6015, + "step": 2745 + }, + { + "epoch": 0.057827874678192934, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5798, + "step": 2746 + }, + { + "epoch": 0.05784893362745666, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5844, + "step": 2747 + }, + { + "epoch": 0.05786999257672038, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 2748 + }, + { + "epoch": 0.05789105152598411, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 2749 + }, + { + "epoch": 0.05791211047524784, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 2750 + }, + { + "epoch": 0.057933169424511564, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.613, + "step": 2751 + }, + { + "epoch": 0.05795422837377529, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5997, + "step": 2752 + }, + { + "epoch": 0.05797528732303902, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.599, + "step": 2753 + }, + { + "epoch": 0.05799634627230274, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5759, + "step": 2754 + }, + { + "epoch": 0.05801740522156647, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5712, + "step": 2755 + }, + { + "epoch": 0.058038464170830195, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 2756 + }, + { + "epoch": 0.05805952312009392, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 1.5824, + "step": 2757 + }, + { + "epoch": 0.05808058206935765, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5989, + "step": 2758 + }, + { + "epoch": 0.05810164101862138, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6239, + "step": 2759 + }, + { + "epoch": 0.058122699967885105, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6158, + "step": 2760 + }, + { + "epoch": 0.058143758917148826, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.5622, + "step": 2761 + }, + { + "epoch": 0.05816481786641255, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 2762 + }, + { + "epoch": 0.05818587681567628, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 2763 + }, + { + "epoch": 0.05820693576494001, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6027, + "step": 2764 + }, + { + "epoch": 0.058227994714203736, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 2765 + }, + { + "epoch": 0.058249053663467464, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5925, + "step": 2766 + }, + { + "epoch": 0.05827011261273119, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 2767 + }, + { + "epoch": 0.05829117156199491, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6219, + "step": 2768 + }, + { + "epoch": 0.05831223051125864, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5672, + "step": 2769 + }, + { + "epoch": 0.05833328946052237, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6296, + "step": 2770 + }, + { + "epoch": 0.058354348409786094, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5672, + "step": 2771 + }, + { + "epoch": 0.05837540735904982, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 2772 + }, + { + "epoch": 0.05839646630831355, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5972, + "step": 2773 + }, + { + "epoch": 0.05841752525757727, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 1.5838, + "step": 2774 + }, + { + "epoch": 0.058438584206841, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 2775 + }, + { + "epoch": 0.058459643156104725, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5864, + "step": 2776 + }, + { + "epoch": 0.05848070210536845, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5737, + "step": 2777 + }, + { + "epoch": 0.05850176105463218, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 2778 + }, + { + "epoch": 0.05852282000389591, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.6129, + "step": 2779 + }, + { + "epoch": 0.058543878953159635, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5703, + "step": 2780 + }, + { + "epoch": 0.058564937902423356, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6097, + "step": 2781 + }, + { + "epoch": 0.05858599685168708, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5902, + "step": 2782 + }, + { + "epoch": 0.05860705580095081, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6116, + "step": 2783 + }, + { + "epoch": 0.05862811475021454, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6121, + "step": 2784 + }, + { + "epoch": 0.058649173699478266, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 2785 + }, + { + "epoch": 0.05867023264874199, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5852, + "step": 2786 + }, + { + "epoch": 0.05869129159800572, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.602, + "step": 2787 + }, + { + "epoch": 0.05871235054726944, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.6231, + "step": 2788 + }, + { + "epoch": 0.05873340949653317, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5872, + "step": 2789 + }, + { + "epoch": 0.0587544684457969, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.604, + "step": 2790 + }, + { + "epoch": 0.058775527395060624, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5861, + "step": 2791 + }, + { + "epoch": 0.05879658634432435, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6121, + "step": 2792 + }, + { + "epoch": 0.05881764529358808, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5855, + "step": 2793 + }, + { + "epoch": 0.0588387042428518, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6039, + "step": 2794 + }, + { + "epoch": 0.05885976319211553, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5732, + "step": 2795 + }, + { + "epoch": 0.058880822141379255, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.5777, + "step": 2796 + }, + { + "epoch": 0.05890188109064298, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 2797 + }, + { + "epoch": 0.05892294003990671, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5689, + "step": 2798 + }, + { + "epoch": 0.05894399898917044, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 2799 + }, + { + "epoch": 0.058965057938434165, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5746, + "step": 2800 + }, + { + "epoch": 0.058986116887697886, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 1.636, + "step": 2801 + }, + { + "epoch": 0.05900717583696161, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5808, + "step": 2802 + }, + { + "epoch": 0.05902823478622534, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5998, + "step": 2803 + }, + { + "epoch": 0.05904929373548907, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 2804 + }, + { + "epoch": 0.059070352684752796, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5949, + "step": 2805 + }, + { + "epoch": 0.05909141163401652, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6258, + "step": 2806 + }, + { + "epoch": 0.059112470583280244, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 2807 + }, + { + "epoch": 0.05913352953254397, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5979, + "step": 2808 + }, + { + "epoch": 0.0591545884818077, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5856, + "step": 2809 + }, + { + "epoch": 0.059175647431071426, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.595, + "step": 2810 + }, + { + "epoch": 0.059196706380335154, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5934, + "step": 2811 + }, + { + "epoch": 0.05921776532959888, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6112, + "step": 2812 + }, + { + "epoch": 0.05923882427886261, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6359, + "step": 2813 + }, + { + "epoch": 0.05925988322812633, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6083, + "step": 2814 + }, + { + "epoch": 0.05928094217739006, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5733, + "step": 2815 + }, + { + "epoch": 0.059302001126653785, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 1.585, + "step": 2816 + }, + { + "epoch": 0.05932306007591751, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5912, + "step": 2817 + }, + { + "epoch": 0.05934411902518124, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6004, + "step": 2818 + }, + { + "epoch": 0.05936517797444497, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5871, + "step": 2819 + }, + { + "epoch": 0.059386236923708695, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.5781, + "step": 2820 + }, + { + "epoch": 0.059407295872972415, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6671, + "step": 2821 + }, + { + "epoch": 0.05942835482223614, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 2822 + }, + { + "epoch": 0.05944941377149987, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6136, + "step": 2823 + }, + { + "epoch": 0.0594704727207636, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5822, + "step": 2824 + }, + { + "epoch": 0.059491531670027326, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5868, + "step": 2825 + }, + { + "epoch": 0.05951259061929105, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5675, + "step": 2826 + }, + { + "epoch": 0.059533649568554774, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6064, + "step": 2827 + }, + { + "epoch": 0.0595547085178185, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 2828 + }, + { + "epoch": 0.05957576746708223, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5764, + "step": 2829 + }, + { + "epoch": 0.059596826416345956, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5262, + "step": 2830 + }, + { + "epoch": 0.059617885365609684, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5867, + "step": 2831 + }, + { + "epoch": 0.05963894431487341, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5631, + "step": 2832 + }, + { + "epoch": 0.05966000326413714, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5727, + "step": 2833 + }, + { + "epoch": 0.05968106221340086, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6295, + "step": 2834 + }, + { + "epoch": 0.05970212116266459, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 2835 + }, + { + "epoch": 0.059723180111928315, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5513, + "step": 2836 + }, + { + "epoch": 0.05974423906119204, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 2837 + }, + { + "epoch": 0.05976529801045577, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5922, + "step": 2838 + }, + { + "epoch": 0.0597863569597195, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6122, + "step": 2839 + }, + { + "epoch": 0.059807415908983225, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5758, + "step": 2840 + }, + { + "epoch": 0.059828474858246945, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 2841 + }, + { + "epoch": 0.05984953380751067, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5862, + "step": 2842 + }, + { + "epoch": 0.0598705927567744, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 2843 + }, + { + "epoch": 0.05989165170603813, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 2844 + }, + { + "epoch": 0.059912710655301855, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 2845 + }, + { + "epoch": 0.05993376960456558, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6107, + "step": 2846 + }, + { + "epoch": 0.059954828553829304, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5781, + "step": 2847 + }, + { + "epoch": 0.05997588750309303, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6101, + "step": 2848 + }, + { + "epoch": 0.05999694645235676, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5965, + "step": 2849 + }, + { + "epoch": 0.060018005401620486, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 2850 + }, + { + "epoch": 0.060039064350884214, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5428, + "step": 2851 + }, + { + "epoch": 0.06006012330014794, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5873, + "step": 2852 + }, + { + "epoch": 0.06008118224941167, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.578, + "step": 2853 + }, + { + "epoch": 0.06010224119867539, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6043, + "step": 2854 + }, + { + "epoch": 0.06012330014793912, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 2855 + }, + { + "epoch": 0.060144359097202844, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6123, + "step": 2856 + }, + { + "epoch": 0.06016541804646657, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5845, + "step": 2857 + }, + { + "epoch": 0.0601864769957303, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 2858 + }, + { + "epoch": 0.06020753594499403, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5843, + "step": 2859 + }, + { + "epoch": 0.06022859489425775, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5734, + "step": 2860 + }, + { + "epoch": 0.060249653843521475, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 2861 + }, + { + "epoch": 0.0602707127927852, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 2862 + }, + { + "epoch": 0.06029177174204893, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 1.6011, + "step": 2863 + }, + { + "epoch": 0.06031283069131266, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 2864 + }, + { + "epoch": 0.060333889640576385, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 2865 + }, + { + "epoch": 0.06035494858984011, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6319, + "step": 2866 + }, + { + "epoch": 0.06037600753910383, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5671, + "step": 2867 + }, + { + "epoch": 0.06039706648836756, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 2868 + }, + { + "epoch": 0.06041812543763129, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 2869 + }, + { + "epoch": 0.060439184386895016, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6095, + "step": 2870 + }, + { + "epoch": 0.060460243336158744, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 2871 + }, + { + "epoch": 0.06048130228542247, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.6086, + "step": 2872 + }, + { + "epoch": 0.0605023612346862, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 2873 + }, + { + "epoch": 0.06052342018394992, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5553, + "step": 2874 + }, + { + "epoch": 0.06054447913321365, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6046, + "step": 2875 + }, + { + "epoch": 0.060565538082477374, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 2876 + }, + { + "epoch": 0.0605865970317411, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5865, + "step": 2877 + }, + { + "epoch": 0.06060765598100483, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6128, + "step": 2878 + }, + { + "epoch": 0.06062871493026856, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5626, + "step": 2879 + }, + { + "epoch": 0.06064977387953228, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6007, + "step": 2880 + }, + { + "epoch": 0.060670832828796005, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5916, + "step": 2881 + }, + { + "epoch": 0.06069189177805973, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 2882 + }, + { + "epoch": 0.06071295072732346, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5954, + "step": 2883 + }, + { + "epoch": 0.06073400967658719, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5966, + "step": 2884 + }, + { + "epoch": 0.060755068625850915, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 2885 + }, + { + "epoch": 0.06077612757511464, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.561, + "step": 2886 + }, + { + "epoch": 0.06079718652437836, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.581, + "step": 2887 + }, + { + "epoch": 0.06081824547364209, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 2888 + }, + { + "epoch": 0.06083930442290582, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5896, + "step": 2889 + }, + { + "epoch": 0.060860363372169546, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5899, + "step": 2890 + }, + { + "epoch": 0.06088142232143327, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.561, + "step": 2891 + }, + { + "epoch": 0.060902481270697, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5747, + "step": 2892 + }, + { + "epoch": 0.06092354021996073, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6181, + "step": 2893 + }, + { + "epoch": 0.06094459916922445, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 2894 + }, + { + "epoch": 0.06096565811848818, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 2895 + }, + { + "epoch": 0.060986717067751904, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5755, + "step": 2896 + }, + { + "epoch": 0.06100777601701563, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 2897 + }, + { + "epoch": 0.06102883496627936, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5916, + "step": 2898 + }, + { + "epoch": 0.06104989391554309, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6061, + "step": 2899 + }, + { + "epoch": 0.06107095286480681, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 2900 + }, + { + "epoch": 0.061092011814070535, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5545, + "step": 2901 + }, + { + "epoch": 0.06111307076333426, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6013, + "step": 2902 + }, + { + "epoch": 0.06113412971259799, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 2903 + }, + { + "epoch": 0.06115518866186172, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.5584, + "step": 2904 + }, + { + "epoch": 0.061176247611125445, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5952, + "step": 2905 + }, + { + "epoch": 0.06119730656038917, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6214, + "step": 2906 + }, + { + "epoch": 0.06121836550965289, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5832, + "step": 2907 + }, + { + "epoch": 0.06123942445891662, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5581, + "step": 2908 + }, + { + "epoch": 0.06126048340818035, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5858, + "step": 2909 + }, + { + "epoch": 0.061281542357444076, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5741, + "step": 2910 + }, + { + "epoch": 0.0613026013067078, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5896, + "step": 2911 + }, + { + "epoch": 0.06132366025597153, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6074, + "step": 2912 + }, + { + "epoch": 0.06134471920523525, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5767, + "step": 2913 + }, + { + "epoch": 0.06136577815449898, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5643, + "step": 2914 + }, + { + "epoch": 0.061386837103762706, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5622, + "step": 2915 + }, + { + "epoch": 0.061407896053026434, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6049, + "step": 2916 + }, + { + "epoch": 0.06142895500229016, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5876, + "step": 2917 + }, + { + "epoch": 0.06145001395155389, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 2918 + }, + { + "epoch": 0.06147107290081762, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5735, + "step": 2919 + }, + { + "epoch": 0.06149213185008134, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6117, + "step": 2920 + }, + { + "epoch": 0.061513190799345065, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5737, + "step": 2921 + }, + { + "epoch": 0.06153424974860879, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5987, + "step": 2922 + }, + { + "epoch": 0.06155530869787252, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 2923 + }, + { + "epoch": 0.06157636764713625, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6035, + "step": 2924 + }, + { + "epoch": 0.061597426596399975, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5918, + "step": 2925 + }, + { + "epoch": 0.0616184855456637, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 2926 + }, + { + "epoch": 0.06163954449492742, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5628, + "step": 2927 + }, + { + "epoch": 0.06166060344419115, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5971, + "step": 2928 + }, + { + "epoch": 0.06168166239345488, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5758, + "step": 2929 + }, + { + "epoch": 0.061702721342718606, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 2930 + }, + { + "epoch": 0.06172378029198233, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 2931 + }, + { + "epoch": 0.06174483924124606, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 2932 + }, + { + "epoch": 0.06176589819050978, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5697, + "step": 2933 + }, + { + "epoch": 0.06178695713977351, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5948, + "step": 2934 + }, + { + "epoch": 0.061808016089037236, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 2935 + }, + { + "epoch": 0.061829075038300964, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 2936 + }, + { + "epoch": 0.06185013398756469, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.6189, + "step": 2937 + }, + { + "epoch": 0.06187119293682842, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.593, + "step": 2938 + }, + { + "epoch": 0.061892251886092146, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 2939 + }, + { + "epoch": 0.06191331083535587, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 2940 + }, + { + "epoch": 0.061934369784619595, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5938, + "step": 2941 + }, + { + "epoch": 0.06195542873388332, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6212, + "step": 2942 + }, + { + "epoch": 0.06197648768314705, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 2943 + }, + { + "epoch": 0.06199754663241078, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6083, + "step": 2944 + }, + { + "epoch": 0.062018605581674505, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5916, + "step": 2945 + }, + { + "epoch": 0.06203966453093823, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 2946 + }, + { + "epoch": 0.06206072348020195, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6118, + "step": 2947 + }, + { + "epoch": 0.06208178242946568, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5965, + "step": 2948 + }, + { + "epoch": 0.06210284137872941, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 2949 + }, + { + "epoch": 0.062123900327993135, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 2950 + }, + { + "epoch": 0.06214495927725686, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5891, + "step": 2951 + }, + { + "epoch": 0.06216601822652059, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5869, + "step": 2952 + }, + { + "epoch": 0.06218707717578431, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 2953 + }, + { + "epoch": 0.06220813612504804, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6096, + "step": 2954 + }, + { + "epoch": 0.062229195074311766, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6331, + "step": 2955 + }, + { + "epoch": 0.062250254023575494, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.574, + "step": 2956 + }, + { + "epoch": 0.06227131297283922, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6117, + "step": 2957 + }, + { + "epoch": 0.06229237192210295, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5479, + "step": 2958 + }, + { + "epoch": 0.062313430871366676, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 2959 + }, + { + "epoch": 0.0623344898206304, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5694, + "step": 2960 + }, + { + "epoch": 0.062355548769894124, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 2961 + }, + { + "epoch": 0.06237660771915785, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5592, + "step": 2962 + }, + { + "epoch": 0.06239766666842158, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 2963 + }, + { + "epoch": 0.06241872561768531, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.568, + "step": 2964 + }, + { + "epoch": 0.062439784566949035, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5848, + "step": 2965 + }, + { + "epoch": 0.062460843516212755, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5861, + "step": 2966 + }, + { + "epoch": 0.06248190246547648, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6007, + "step": 2967 + }, + { + "epoch": 0.06250296141474021, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5867, + "step": 2968 + }, + { + "epoch": 0.06252402036400394, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5906, + "step": 2969 + }, + { + "epoch": 0.06254507931326767, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 1.6045, + "step": 2970 + }, + { + "epoch": 0.06256613826253139, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.626, + "step": 2971 + }, + { + "epoch": 0.06258719721179512, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 2972 + }, + { + "epoch": 0.06260825616105885, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 2973 + }, + { + "epoch": 0.06262931511032258, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6152, + "step": 2974 + }, + { + "epoch": 0.0626503740595863, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5555, + "step": 2975 + }, + { + "epoch": 0.06267143300885002, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5855, + "step": 2976 + }, + { + "epoch": 0.06269249195811374, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 2977 + }, + { + "epoch": 0.06271355090737747, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5872, + "step": 2978 + }, + { + "epoch": 0.0627346098566412, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.637, + "step": 2979 + }, + { + "epoch": 0.06275566880590493, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5912, + "step": 2980 + }, + { + "epoch": 0.06277672775516865, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5965, + "step": 2981 + }, + { + "epoch": 0.06279778670443238, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5999, + "step": 2982 + }, + { + "epoch": 0.06281884565369611, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5987, + "step": 2983 + }, + { + "epoch": 0.06283990460295984, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 2984 + }, + { + "epoch": 0.06286096355222356, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5902, + "step": 2985 + }, + { + "epoch": 0.06288202250148729, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 2986 + }, + { + "epoch": 0.06290308145075102, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 2987 + }, + { + "epoch": 0.06292414040001475, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 2988 + }, + { + "epoch": 0.06294519934927847, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5466, + "step": 2989 + }, + { + "epoch": 0.06296625829854219, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 2990 + }, + { + "epoch": 0.06298731724780592, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.572, + "step": 2991 + }, + { + "epoch": 0.06300837619706964, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6028, + "step": 2992 + }, + { + "epoch": 0.06302943514633337, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6016, + "step": 2993 + }, + { + "epoch": 0.0630504940955971, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 2994 + }, + { + "epoch": 0.06307155304486083, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 1.5902, + "step": 2995 + }, + { + "epoch": 0.06309261199412455, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.596, + "step": 2996 + }, + { + "epoch": 0.06311367094338828, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5985, + "step": 2997 + }, + { + "epoch": 0.06313472989265201, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 2998 + }, + { + "epoch": 0.06315578884191574, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6203, + "step": 2999 + }, + { + "epoch": 0.06317684779117946, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.6133, + "step": 3000 + }, + { + "epoch": 0.06317684779117946, + "eval_loss": 1.9333325624465942, + "eval_runtime": 897.2967, + "eval_samples_per_second": 68.874, + "eval_steps_per_second": 2.153, + "step": 3000 + }, + { + "epoch": 0.06319790674044319, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5906, + "step": 3001 + }, + { + "epoch": 0.06321896568970692, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 3002 + }, + { + "epoch": 0.06324002463897063, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 3003 + }, + { + "epoch": 0.06326108358823436, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 3004 + }, + { + "epoch": 0.06328214253749809, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 3005 + }, + { + "epoch": 0.06330320148676181, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5531, + "step": 3006 + }, + { + "epoch": 0.06332426043602554, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5673, + "step": 3007 + }, + { + "epoch": 0.06334531938528927, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6109, + "step": 3008 + }, + { + "epoch": 0.063366378334553, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 3009 + }, + { + "epoch": 0.06338743728381672, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5615, + "step": 3010 + }, + { + "epoch": 0.06340849623308045, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5999, + "step": 3011 + }, + { + "epoch": 0.06342955518234418, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6145, + "step": 3012 + }, + { + "epoch": 0.06345061413160791, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6371, + "step": 3013 + }, + { + "epoch": 0.06347167308087164, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 3014 + }, + { + "epoch": 0.06349273203013536, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5836, + "step": 3015 + }, + { + "epoch": 0.06351379097939908, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5772, + "step": 3016 + }, + { + "epoch": 0.0635348499286628, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6209, + "step": 3017 + }, + { + "epoch": 0.06355590887792653, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6118, + "step": 3018 + }, + { + "epoch": 0.06357696782719026, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5861, + "step": 3019 + }, + { + "epoch": 0.06359802677645399, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.593, + "step": 3020 + }, + { + "epoch": 0.06361908572571771, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5714, + "step": 3021 + }, + { + "epoch": 0.06364014467498144, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 3022 + }, + { + "epoch": 0.06366120362424517, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 3023 + }, + { + "epoch": 0.0636822625735089, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 3024 + }, + { + "epoch": 0.06370332152277262, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5562, + "step": 3025 + }, + { + "epoch": 0.06372438047203635, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5904, + "step": 3026 + }, + { + "epoch": 0.06374543942130008, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5971, + "step": 3027 + }, + { + "epoch": 0.0637664983705638, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5802, + "step": 3028 + }, + { + "epoch": 0.06378755731982752, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6065, + "step": 3029 + }, + { + "epoch": 0.06380861626909125, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 1.5643, + "step": 3030 + }, + { + "epoch": 0.06382967521835498, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 3031 + }, + { + "epoch": 0.0638507341676187, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6016, + "step": 3032 + }, + { + "epoch": 0.06387179311688243, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5743, + "step": 3033 + }, + { + "epoch": 0.06389285206614616, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5864, + "step": 3034 + }, + { + "epoch": 0.06391391101540989, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5741, + "step": 3035 + }, + { + "epoch": 0.06393496996467361, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6237, + "step": 3036 + }, + { + "epoch": 0.06395602891393734, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5835, + "step": 3037 + }, + { + "epoch": 0.06397708786320107, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.58, + "step": 3038 + }, + { + "epoch": 0.0639981468124648, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 3039 + }, + { + "epoch": 0.06401920576172852, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5824, + "step": 3040 + }, + { + "epoch": 0.06404026471099225, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5974, + "step": 3041 + }, + { + "epoch": 0.06406132366025598, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 3042 + }, + { + "epoch": 0.06408238260951969, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5584, + "step": 3043 + }, + { + "epoch": 0.06410344155878342, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6098, + "step": 3044 + }, + { + "epoch": 0.06412450050804715, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5999, + "step": 3045 + }, + { + "epoch": 0.06414555945731087, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 3046 + }, + { + "epoch": 0.0641666184065746, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5648, + "step": 3047 + }, + { + "epoch": 0.06418767735583833, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5723, + "step": 3048 + }, + { + "epoch": 0.06420873630510206, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6345, + "step": 3049 + }, + { + "epoch": 0.06422979525436578, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 1.6192, + "step": 3050 + }, + { + "epoch": 0.06425085420362951, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5963, + "step": 3051 + }, + { + "epoch": 0.06427191315289324, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 3052 + }, + { + "epoch": 0.06429297210215697, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 3053 + }, + { + "epoch": 0.0643140310514207, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 3054 + }, + { + "epoch": 0.06433509000068442, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6289, + "step": 3055 + }, + { + "epoch": 0.06435614894994814, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5666, + "step": 3056 + }, + { + "epoch": 0.06437720789921186, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5682, + "step": 3057 + }, + { + "epoch": 0.06439826684847559, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 3058 + }, + { + "epoch": 0.06441932579773932, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6054, + "step": 3059 + }, + { + "epoch": 0.06444038474700305, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5862, + "step": 3060 + }, + { + "epoch": 0.06446144369626677, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5831, + "step": 3061 + }, + { + "epoch": 0.0644825026455305, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 3062 + }, + { + "epoch": 0.06450356159479423, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5685, + "step": 3063 + }, + { + "epoch": 0.06452462054405796, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6278, + "step": 3064 + }, + { + "epoch": 0.06454567949332168, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 3065 + }, + { + "epoch": 0.06456673844258541, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 3066 + }, + { + "epoch": 0.06458779739184914, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.595, + "step": 3067 + }, + { + "epoch": 0.06460885634111287, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 3068 + }, + { + "epoch": 0.06462991529037658, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.59, + "step": 3069 + }, + { + "epoch": 0.06465097423964031, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5732, + "step": 3070 + }, + { + "epoch": 0.06467203318890404, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 3071 + }, + { + "epoch": 0.06469309213816776, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.578, + "step": 3072 + }, + { + "epoch": 0.06471415108743149, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6227, + "step": 3073 + }, + { + "epoch": 0.06473521003669522, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6012, + "step": 3074 + }, + { + "epoch": 0.06475626898595895, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 1.6134, + "step": 3075 + }, + { + "epoch": 0.06477732793522267, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 3076 + }, + { + "epoch": 0.0647983868844864, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5393, + "step": 3077 + }, + { + "epoch": 0.06481944583375013, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 3078 + }, + { + "epoch": 0.06484050478301386, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.567, + "step": 3079 + }, + { + "epoch": 0.06486156373227758, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5892, + "step": 3080 + }, + { + "epoch": 0.06488262268154131, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 3081 + }, + { + "epoch": 0.06490368163080502, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5929, + "step": 3082 + }, + { + "epoch": 0.06492474058006875, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6184, + "step": 3083 + }, + { + "epoch": 0.06494579952933248, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5705, + "step": 3084 + }, + { + "epoch": 0.0649668584785962, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5949, + "step": 3085 + }, + { + "epoch": 0.06498791742785993, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6382, + "step": 3086 + }, + { + "epoch": 0.06500897637712366, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6078, + "step": 3087 + }, + { + "epoch": 0.06503003532638739, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 3088 + }, + { + "epoch": 0.06505109427565112, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6111, + "step": 3089 + }, + { + "epoch": 0.06507215322491484, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.583, + "step": 3090 + }, + { + "epoch": 0.06509321217417857, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 3091 + }, + { + "epoch": 0.0651142711234423, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5945, + "step": 3092 + }, + { + "epoch": 0.06513533007270603, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5741, + "step": 3093 + }, + { + "epoch": 0.06515638902196975, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6234, + "step": 3094 + }, + { + "epoch": 0.06517744797123348, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5584, + "step": 3095 + }, + { + "epoch": 0.0651985069204972, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 3096 + }, + { + "epoch": 0.06521956586976092, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6119, + "step": 3097 + }, + { + "epoch": 0.06524062481902465, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5983, + "step": 3098 + }, + { + "epoch": 0.06526168376828838, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 3099 + }, + { + "epoch": 0.0652827427175521, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 3100 + }, + { + "epoch": 0.06530380166681583, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 3101 + }, + { + "epoch": 0.06532486061607956, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 3102 + }, + { + "epoch": 0.06534591956534329, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6095, + "step": 3103 + }, + { + "epoch": 0.06536697851460702, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5979, + "step": 3104 + }, + { + "epoch": 0.06538803746387074, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5933, + "step": 3105 + }, + { + "epoch": 0.06540909641313447, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5868, + "step": 3106 + }, + { + "epoch": 0.0654301553623982, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5832, + "step": 3107 + }, + { + "epoch": 0.06545121431166193, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5897, + "step": 3108 + }, + { + "epoch": 0.06547227326092564, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6334, + "step": 3109 + }, + { + "epoch": 0.06549333221018937, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5847, + "step": 3110 + }, + { + "epoch": 0.0655143911594531, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6056, + "step": 3111 + }, + { + "epoch": 0.06553545010871682, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5805, + "step": 3112 + }, + { + "epoch": 0.06555650905798055, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5843, + "step": 3113 + }, + { + "epoch": 0.06557756800724428, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6168, + "step": 3114 + }, + { + "epoch": 0.065598626956508, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 3115 + }, + { + "epoch": 0.06561968590577173, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6039, + "step": 3116 + }, + { + "epoch": 0.06564074485503546, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5845, + "step": 3117 + }, + { + "epoch": 0.06566180380429919, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5681, + "step": 3118 + }, + { + "epoch": 0.06568286275356292, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5683, + "step": 3119 + }, + { + "epoch": 0.06570392170282664, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 1.5582, + "step": 3120 + }, + { + "epoch": 0.06572498065209037, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6254, + "step": 3121 + }, + { + "epoch": 0.06574603960135408, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5739, + "step": 3122 + }, + { + "epoch": 0.06576709855061781, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5972, + "step": 3123 + }, + { + "epoch": 0.06578815749988154, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5846, + "step": 3124 + }, + { + "epoch": 0.06580921644914527, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 3125 + }, + { + "epoch": 0.065830275398409, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5985, + "step": 3126 + }, + { + "epoch": 0.06585133434767272, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5916, + "step": 3127 + }, + { + "epoch": 0.06587239329693645, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5687, + "step": 3128 + }, + { + "epoch": 0.06589345224620018, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5815, + "step": 3129 + }, + { + "epoch": 0.0659145111954639, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5674, + "step": 3130 + }, + { + "epoch": 0.06593557014472763, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.573, + "step": 3131 + }, + { + "epoch": 0.06595662909399136, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5584, + "step": 3132 + }, + { + "epoch": 0.06597768804325509, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5609, + "step": 3133 + }, + { + "epoch": 0.06599874699251881, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6015, + "step": 3134 + }, + { + "epoch": 0.06601980594178254, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5862, + "step": 3135 + }, + { + "epoch": 0.06604086489104626, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5978, + "step": 3136 + }, + { + "epoch": 0.06606192384030998, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5902, + "step": 3137 + }, + { + "epoch": 0.06608298278957371, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 3138 + }, + { + "epoch": 0.06610404173883744, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 3139 + }, + { + "epoch": 0.06612510068810117, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 3140 + }, + { + "epoch": 0.06614615963736489, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5535, + "step": 3141 + }, + { + "epoch": 0.06616721858662862, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5695, + "step": 3142 + }, + { + "epoch": 0.06618827753589235, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5851, + "step": 3143 + }, + { + "epoch": 0.06620933648515608, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5686, + "step": 3144 + }, + { + "epoch": 0.0662303954344198, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5837, + "step": 3145 + }, + { + "epoch": 0.06625145438368353, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6189, + "step": 3146 + }, + { + "epoch": 0.06627251333294726, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5518, + "step": 3147 + }, + { + "epoch": 0.06629357228221099, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 3148 + }, + { + "epoch": 0.0663146312314747, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 3149 + }, + { + "epoch": 0.06633569018073843, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6209, + "step": 3150 + }, + { + "epoch": 0.06635674913000215, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 3151 + }, + { + "epoch": 0.06637780807926588, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.578, + "step": 3152 + }, + { + "epoch": 0.06639886702852961, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5621, + "step": 3153 + }, + { + "epoch": 0.06641992597779334, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5551, + "step": 3154 + }, + { + "epoch": 0.06644098492705706, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5914, + "step": 3155 + }, + { + "epoch": 0.06646204387632079, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 3156 + }, + { + "epoch": 0.06648310282558452, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5987, + "step": 3157 + }, + { + "epoch": 0.06650416177484825, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.5284, + "step": 3158 + }, + { + "epoch": 0.06652522072411197, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 3159 + }, + { + "epoch": 0.0665462796733757, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5987, + "step": 3160 + }, + { + "epoch": 0.06656733862263943, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5567, + "step": 3161 + }, + { + "epoch": 0.06658839757190314, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5952, + "step": 3162 + }, + { + "epoch": 0.06660945652116687, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 3163 + }, + { + "epoch": 0.0666305154704306, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5741, + "step": 3164 + }, + { + "epoch": 0.06665157441969433, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5805, + "step": 3165 + }, + { + "epoch": 0.06667263336895805, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5575, + "step": 3166 + }, + { + "epoch": 0.06669369231822178, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 3167 + }, + { + "epoch": 0.06671475126748551, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5642, + "step": 3168 + }, + { + "epoch": 0.06673581021674924, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5953, + "step": 3169 + }, + { + "epoch": 0.06675686916601296, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5967, + "step": 3170 + }, + { + "epoch": 0.06677792811527669, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 3171 + }, + { + "epoch": 0.06679898706454042, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 3172 + }, + { + "epoch": 0.06682004601380415, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5334, + "step": 3173 + }, + { + "epoch": 0.06684110496306787, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5771, + "step": 3174 + }, + { + "epoch": 0.06686216391233159, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5455, + "step": 3175 + }, + { + "epoch": 0.06688322286159532, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 3176 + }, + { + "epoch": 0.06690428181085904, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5846, + "step": 3177 + }, + { + "epoch": 0.06692534076012277, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6238, + "step": 3178 + }, + { + "epoch": 0.0669463997093865, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6122, + "step": 3179 + }, + { + "epoch": 0.06696745865865023, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5924, + "step": 3180 + }, + { + "epoch": 0.06698851760791395, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 3181 + }, + { + "epoch": 0.06700957655717768, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5815, + "step": 3182 + }, + { + "epoch": 0.06703063550644141, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6021, + "step": 3183 + }, + { + "epoch": 0.06705169445570514, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.596, + "step": 3184 + }, + { + "epoch": 0.06707275340496886, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 3185 + }, + { + "epoch": 0.06709381235423259, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 3186 + }, + { + "epoch": 0.06711487130349632, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6112, + "step": 3187 + }, + { + "epoch": 0.06713593025276005, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5256, + "step": 3188 + }, + { + "epoch": 0.06715698920202376, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6053, + "step": 3189 + }, + { + "epoch": 0.06717804815128749, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5716, + "step": 3190 + }, + { + "epoch": 0.06719910710055121, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6144, + "step": 3191 + }, + { + "epoch": 0.06722016604981494, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5854, + "step": 3192 + }, + { + "epoch": 0.06724122499907867, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5918, + "step": 3193 + }, + { + "epoch": 0.0672622839483424, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5865, + "step": 3194 + }, + { + "epoch": 0.06728334289760612, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 3195 + }, + { + "epoch": 0.06730440184686985, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.5988, + "step": 3196 + }, + { + "epoch": 0.06732546079613358, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.628, + "step": 3197 + }, + { + "epoch": 0.0673465197453973, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5663, + "step": 3198 + }, + { + "epoch": 0.06736757869466103, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5896, + "step": 3199 + }, + { + "epoch": 0.06738863764392476, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 3200 + }, + { + "epoch": 0.06740969659318849, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5523, + "step": 3201 + }, + { + "epoch": 0.0674307555424522, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5864, + "step": 3202 + }, + { + "epoch": 0.06745181449171593, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 3203 + }, + { + "epoch": 0.06747287344097966, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5868, + "step": 3204 + }, + { + "epoch": 0.06749393239024339, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 3205 + }, + { + "epoch": 0.06751499133950711, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.565, + "step": 3206 + }, + { + "epoch": 0.06753605028877084, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6388, + "step": 3207 + }, + { + "epoch": 0.06755710923803457, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 3208 + }, + { + "epoch": 0.0675781681872983, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6203, + "step": 3209 + }, + { + "epoch": 0.06759922713656202, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6211, + "step": 3210 + }, + { + "epoch": 0.06762028608582575, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 1.6208, + "step": 3211 + }, + { + "epoch": 0.06764134503508948, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 3212 + }, + { + "epoch": 0.0676624039843532, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 3213 + }, + { + "epoch": 0.06768346293361693, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5747, + "step": 3214 + }, + { + "epoch": 0.06770452188288065, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 3215 + }, + { + "epoch": 0.06772558083214437, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.576, + "step": 3216 + }, + { + "epoch": 0.0677466397814081, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 3217 + }, + { + "epoch": 0.06776769873067183, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 3218 + }, + { + "epoch": 0.06778875767993556, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5709, + "step": 3219 + }, + { + "epoch": 0.06780981662919928, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5727, + "step": 3220 + }, + { + "epoch": 0.06783087557846301, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6391, + "step": 3221 + }, + { + "epoch": 0.06785193452772674, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5911, + "step": 3222 + }, + { + "epoch": 0.06787299347699047, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 3223 + }, + { + "epoch": 0.0678940524262542, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.542, + "step": 3224 + }, + { + "epoch": 0.06791511137551792, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5873, + "step": 3225 + }, + { + "epoch": 0.06793617032478165, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6205, + "step": 3226 + }, + { + "epoch": 0.06795722927404538, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5749, + "step": 3227 + }, + { + "epoch": 0.06797828822330909, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5544, + "step": 3228 + }, + { + "epoch": 0.06799934717257282, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5965, + "step": 3229 + }, + { + "epoch": 0.06802040612183655, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5878, + "step": 3230 + }, + { + "epoch": 0.06804146507110027, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5614, + "step": 3231 + }, + { + "epoch": 0.068062524020364, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 3232 + }, + { + "epoch": 0.06808358296962773, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.6164, + "step": 3233 + }, + { + "epoch": 0.06810464191889146, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5852, + "step": 3234 + }, + { + "epoch": 0.06812570086815518, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 3235 + }, + { + "epoch": 0.06814675981741891, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5763, + "step": 3236 + }, + { + "epoch": 0.06816781876668264, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5924, + "step": 3237 + }, + { + "epoch": 0.06818887771594637, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5841, + "step": 3238 + }, + { + "epoch": 0.0682099366652101, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5696, + "step": 3239 + }, + { + "epoch": 0.06823099561447382, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5658, + "step": 3240 + }, + { + "epoch": 0.06825205456373755, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5994, + "step": 3241 + }, + { + "epoch": 0.06827311351300126, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 3242 + }, + { + "epoch": 0.06829417246226499, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5845, + "step": 3243 + }, + { + "epoch": 0.06831523141152872, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 3244 + }, + { + "epoch": 0.06833629036079245, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5567, + "step": 3245 + }, + { + "epoch": 0.06835734931005617, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.623, + "step": 3246 + }, + { + "epoch": 0.0683784082593199, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5745, + "step": 3247 + }, + { + "epoch": 0.06839946720858363, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5872, + "step": 3248 + }, + { + "epoch": 0.06842052615784736, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6086, + "step": 3249 + }, + { + "epoch": 0.06844158510711108, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.576, + "step": 3250 + }, + { + "epoch": 0.06846264405637481, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5546, + "step": 3251 + }, + { + "epoch": 0.06848370300563854, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.593, + "step": 3252 + }, + { + "epoch": 0.06850476195490227, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 3253 + }, + { + "epoch": 0.068525820904166, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5743, + "step": 3254 + }, + { + "epoch": 0.0685468798534297, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5772, + "step": 3255 + }, + { + "epoch": 0.06856793880269343, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5424, + "step": 3256 + }, + { + "epoch": 0.06858899775195716, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5404, + "step": 3257 + }, + { + "epoch": 0.06861005670122089, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 3258 + }, + { + "epoch": 0.06863111565048462, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5763, + "step": 3259 + }, + { + "epoch": 0.06865217459974834, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5986, + "step": 3260 + }, + { + "epoch": 0.06867323354901207, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5683, + "step": 3261 + }, + { + "epoch": 0.0686942924982758, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.601, + "step": 3262 + }, + { + "epoch": 0.06871535144753953, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5778, + "step": 3263 + }, + { + "epoch": 0.06873641039680325, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 3264 + }, + { + "epoch": 0.06875746934606698, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6188, + "step": 3265 + }, + { + "epoch": 0.06877852829533071, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5628, + "step": 3266 + }, + { + "epoch": 0.06879958724459444, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 3267 + }, + { + "epoch": 0.06882064619385815, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5544, + "step": 3268 + }, + { + "epoch": 0.06884170514312188, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 3269 + }, + { + "epoch": 0.0688627640923856, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5971, + "step": 3270 + }, + { + "epoch": 0.06888382304164933, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5722, + "step": 3271 + }, + { + "epoch": 0.06890488199091306, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 3272 + }, + { + "epoch": 0.06892594094017679, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5724, + "step": 3273 + }, + { + "epoch": 0.06894699988944052, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5803, + "step": 3274 + }, + { + "epoch": 0.06896805883870424, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 3275 + }, + { + "epoch": 0.06898911778796797, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6164, + "step": 3276 + }, + { + "epoch": 0.0690101767372317, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 3277 + }, + { + "epoch": 0.06903123568649543, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5726, + "step": 3278 + }, + { + "epoch": 0.06905229463575915, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 1.5867, + "step": 3279 + }, + { + "epoch": 0.06907335358502288, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5925, + "step": 3280 + }, + { + "epoch": 0.0690944125342866, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5906, + "step": 3281 + }, + { + "epoch": 0.06911547148355032, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 3282 + }, + { + "epoch": 0.06913653043281405, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6078, + "step": 3283 + }, + { + "epoch": 0.06915758938207778, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 3284 + }, + { + "epoch": 0.0691786483313415, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5782, + "step": 3285 + }, + { + "epoch": 0.06919970728060523, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6235, + "step": 3286 + }, + { + "epoch": 0.06922076622986896, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.568, + "step": 3287 + }, + { + "epoch": 0.06924182517913269, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.611, + "step": 3288 + }, + { + "epoch": 0.06926288412839642, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5989, + "step": 3289 + }, + { + "epoch": 0.06928394307766014, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 3290 + }, + { + "epoch": 0.06930500202692387, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5604, + "step": 3291 + }, + { + "epoch": 0.0693260609761876, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 3292 + }, + { + "epoch": 0.06934711992545133, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5919, + "step": 3293 + }, + { + "epoch": 0.06936817887471505, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6296, + "step": 3294 + }, + { + "epoch": 0.06938923782397877, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5508, + "step": 3295 + }, + { + "epoch": 0.0694102967732425, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5985, + "step": 3296 + }, + { + "epoch": 0.06943135572250622, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6122, + "step": 3297 + }, + { + "epoch": 0.06945241467176995, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6016, + "step": 3298 + }, + { + "epoch": 0.06947347362103368, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 1.561, + "step": 3299 + }, + { + "epoch": 0.0694945325702974, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5835, + "step": 3300 + }, + { + "epoch": 0.06951559151956113, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 3301 + }, + { + "epoch": 0.06953665046882486, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 3302 + }, + { + "epoch": 0.06955770941808859, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5795, + "step": 3303 + }, + { + "epoch": 0.06957876836735231, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5838, + "step": 3304 + }, + { + "epoch": 0.06959982731661604, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6013, + "step": 3305 + }, + { + "epoch": 0.06962088626587977, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6131, + "step": 3306 + }, + { + "epoch": 0.0696419452151435, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5872, + "step": 3307 + }, + { + "epoch": 0.06966300416440721, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6032, + "step": 3308 + }, + { + "epoch": 0.06968406311367094, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5984, + "step": 3309 + }, + { + "epoch": 0.06970512206293467, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5868, + "step": 3310 + }, + { + "epoch": 0.0697261810121984, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5732, + "step": 3311 + }, + { + "epoch": 0.06974723996146212, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 3312 + }, + { + "epoch": 0.06976829891072585, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.578, + "step": 3313 + }, + { + "epoch": 0.06978935785998958, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 3314 + }, + { + "epoch": 0.0698104168092533, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5945, + "step": 3315 + }, + { + "epoch": 0.06983147575851703, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 3316 + }, + { + "epoch": 0.06985253470778076, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5871, + "step": 3317 + }, + { + "epoch": 0.06987359365704449, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.565, + "step": 3318 + }, + { + "epoch": 0.06989465260630821, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6194, + "step": 3319 + }, + { + "epoch": 0.06991571155557194, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5778, + "step": 3320 + }, + { + "epoch": 0.06993677050483565, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 3321 + }, + { + "epoch": 0.06995782945409938, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 3322 + }, + { + "epoch": 0.06997888840336311, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5512, + "step": 3323 + }, + { + "epoch": 0.06999994735262684, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 3324 + }, + { + "epoch": 0.07002100630189056, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5496, + "step": 3325 + }, + { + "epoch": 0.07004206525115429, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5985, + "step": 3326 + }, + { + "epoch": 0.07006312420041802, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.5819, + "step": 3327 + }, + { + "epoch": 0.07008418314968175, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 3328 + }, + { + "epoch": 0.07010524209894547, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5975, + "step": 3329 + }, + { + "epoch": 0.0701263010482092, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5582, + "step": 3330 + }, + { + "epoch": 0.07014735999747293, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 3331 + }, + { + "epoch": 0.07016841894673666, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 3332 + }, + { + "epoch": 0.07018947789600039, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 3333 + }, + { + "epoch": 0.0702105368452641, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5703, + "step": 3334 + }, + { + "epoch": 0.07023159579452783, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5241, + "step": 3335 + }, + { + "epoch": 0.07025265474379155, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5873, + "step": 3336 + }, + { + "epoch": 0.07027371369305528, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6145, + "step": 3337 + }, + { + "epoch": 0.07029477264231901, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 3338 + }, + { + "epoch": 0.07031583159158274, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 3339 + }, + { + "epoch": 0.07033689054084646, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6024, + "step": 3340 + }, + { + "epoch": 0.07035794949011019, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5954, + "step": 3341 + }, + { + "epoch": 0.07037900843937392, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6043, + "step": 3342 + }, + { + "epoch": 0.07040006738863765, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5852, + "step": 3343 + }, + { + "epoch": 0.07042112633790137, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5989, + "step": 3344 + }, + { + "epoch": 0.0704421852871651, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5929, + "step": 3345 + }, + { + "epoch": 0.07046324423642883, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 3346 + }, + { + "epoch": 0.07048430318569256, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5675, + "step": 3347 + }, + { + "epoch": 0.07050536213495627, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 1.6384, + "step": 3348 + }, + { + "epoch": 0.07052642108422, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6158, + "step": 3349 + }, + { + "epoch": 0.07054748003348373, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 3350 + }, + { + "epoch": 0.07056853898274745, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 3351 + }, + { + "epoch": 0.07058959793201118, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5845, + "step": 3352 + }, + { + "epoch": 0.07061065688127491, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 3353 + }, + { + "epoch": 0.07063171583053864, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5643, + "step": 3354 + }, + { + "epoch": 0.07065277477980236, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 3355 + }, + { + "epoch": 0.07067383372906609, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 3356 + }, + { + "epoch": 0.07069489267832982, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5473, + "step": 3357 + }, + { + "epoch": 0.07071595162759355, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5495, + "step": 3358 + }, + { + "epoch": 0.07073701057685727, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5824, + "step": 3359 + }, + { + "epoch": 0.070758069526121, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5883, + "step": 3360 + }, + { + "epoch": 0.07077912847538471, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6201, + "step": 3361 + }, + { + "epoch": 0.07080018742464844, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6182, + "step": 3362 + }, + { + "epoch": 0.07082124637391217, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6024, + "step": 3363 + }, + { + "epoch": 0.0708423053231759, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5826, + "step": 3364 + }, + { + "epoch": 0.07086336427243962, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6057, + "step": 3365 + }, + { + "epoch": 0.07088442322170335, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 3366 + }, + { + "epoch": 0.07090548217096708, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.605, + "step": 3367 + }, + { + "epoch": 0.07092654112023081, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 3368 + }, + { + "epoch": 0.07094760006949453, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.58, + "step": 3369 + }, + { + "epoch": 0.07096865901875826, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.6024, + "step": 3370 + }, + { + "epoch": 0.07098971796802199, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.55, + "step": 3371 + }, + { + "epoch": 0.07101077691728572, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5934, + "step": 3372 + }, + { + "epoch": 0.07103183586654944, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5746, + "step": 3373 + }, + { + "epoch": 0.07105289481581316, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 3374 + }, + { + "epoch": 0.07107395376507689, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5752, + "step": 3375 + }, + { + "epoch": 0.07109501271434061, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 3376 + }, + { + "epoch": 0.07111607166360434, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5962, + "step": 3377 + }, + { + "epoch": 0.07113713061286807, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.558, + "step": 3378 + }, + { + "epoch": 0.0711581895621318, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5894, + "step": 3379 + }, + { + "epoch": 0.07117924851139552, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5858, + "step": 3380 + }, + { + "epoch": 0.07120030746065925, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5549, + "step": 3381 + }, + { + "epoch": 0.07122136640992298, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5861, + "step": 3382 + }, + { + "epoch": 0.0712424253591867, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5841, + "step": 3383 + }, + { + "epoch": 0.07126348430845043, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5844, + "step": 3384 + }, + { + "epoch": 0.07128454325771416, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5963, + "step": 3385 + }, + { + "epoch": 0.07130560220697789, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5755, + "step": 3386 + }, + { + "epoch": 0.0713266611562416, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 3387 + }, + { + "epoch": 0.07134772010550533, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6011, + "step": 3388 + }, + { + "epoch": 0.07136877905476906, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 3389 + }, + { + "epoch": 0.07138983800403279, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 3390 + }, + { + "epoch": 0.07141089695329651, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.5543, + "step": 3391 + }, + { + "epoch": 0.07143195590256024, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5871, + "step": 3392 + }, + { + "epoch": 0.07145301485182397, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6066, + "step": 3393 + }, + { + "epoch": 0.0714740738010877, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6039, + "step": 3394 + }, + { + "epoch": 0.07149513275035142, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.609, + "step": 3395 + }, + { + "epoch": 0.07151619169961515, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.535, + "step": 3396 + }, + { + "epoch": 0.07153725064887888, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6119, + "step": 3397 + }, + { + "epoch": 0.0715583095981426, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5568, + "step": 3398 + }, + { + "epoch": 0.07157936854740633, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6563, + "step": 3399 + }, + { + "epoch": 0.07160042749667006, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 3400 + }, + { + "epoch": 0.07162148644593377, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5728, + "step": 3401 + }, + { + "epoch": 0.0716425453951975, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5521, + "step": 3402 + }, + { + "epoch": 0.07166360434446123, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5606, + "step": 3403 + }, + { + "epoch": 0.07168466329372496, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5802, + "step": 3404 + }, + { + "epoch": 0.07170572224298868, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 3405 + }, + { + "epoch": 0.07172678119225241, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 3406 + }, + { + "epoch": 0.07174784014151614, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 3407 + }, + { + "epoch": 0.07176889909077987, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5521, + "step": 3408 + }, + { + "epoch": 0.0717899580400436, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 3409 + }, + { + "epoch": 0.07181101698930732, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5813, + "step": 3410 + }, + { + "epoch": 0.07183207593857105, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5846, + "step": 3411 + }, + { + "epoch": 0.07185313488783478, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5972, + "step": 3412 + }, + { + "epoch": 0.0718741938370985, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 3413 + }, + { + "epoch": 0.07189525278636222, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 3414 + }, + { + "epoch": 0.07191631173562595, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5843, + "step": 3415 + }, + { + "epoch": 0.07193737068488967, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5417, + "step": 3416 + }, + { + "epoch": 0.0719584296341534, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 3417 + }, + { + "epoch": 0.07197948858341713, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5716, + "step": 3418 + }, + { + "epoch": 0.07200054753268086, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 3419 + }, + { + "epoch": 0.07202160648194458, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6093, + "step": 3420 + }, + { + "epoch": 0.07204266543120831, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5577, + "step": 3421 + }, + { + "epoch": 0.07206372438047204, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.619, + "step": 3422 + }, + { + "epoch": 0.07208478332973577, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5855, + "step": 3423 + }, + { + "epoch": 0.0721058422789995, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 3424 + }, + { + "epoch": 0.07212690122826322, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5688, + "step": 3425 + }, + { + "epoch": 0.07214796017752695, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 3426 + }, + { + "epoch": 0.07216901912679066, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 3427 + }, + { + "epoch": 0.07219007807605439, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5786, + "step": 3428 + }, + { + "epoch": 0.07221113702531812, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5435, + "step": 3429 + }, + { + "epoch": 0.07223219597458184, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 3430 + }, + { + "epoch": 0.07225325492384557, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 3431 + }, + { + "epoch": 0.0722743138731093, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 3432 + }, + { + "epoch": 0.07229537282237303, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 3433 + }, + { + "epoch": 0.07231643177163675, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5792, + "step": 3434 + }, + { + "epoch": 0.07233749072090048, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 3435 + }, + { + "epoch": 0.07235854967016421, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 3436 + }, + { + "epoch": 0.07237960861942794, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5845, + "step": 3437 + }, + { + "epoch": 0.07240066756869167, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5587, + "step": 3438 + }, + { + "epoch": 0.07242172651795539, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.6045, + "step": 3439 + }, + { + "epoch": 0.0724427854672191, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 3440 + }, + { + "epoch": 0.07246384441648283, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5625, + "step": 3441 + }, + { + "epoch": 0.07248490336574656, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5778, + "step": 3442 + }, + { + "epoch": 0.07250596231501029, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5831, + "step": 3443 + }, + { + "epoch": 0.07252702126427402, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5807, + "step": 3444 + }, + { + "epoch": 0.07254808021353774, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5553, + "step": 3445 + }, + { + "epoch": 0.07256913916280147, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 1.5624, + "step": 3446 + }, + { + "epoch": 0.0725901981120652, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5435, + "step": 3447 + }, + { + "epoch": 0.07261125706132893, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 3448 + }, + { + "epoch": 0.07263231601059265, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6157, + "step": 3449 + }, + { + "epoch": 0.07265337495985638, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 3450 + }, + { + "epoch": 0.07267443390912011, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 3451 + }, + { + "epoch": 0.07269549285838384, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 3452 + }, + { + "epoch": 0.07271655180764756, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5955, + "step": 3453 + }, + { + "epoch": 0.07273761075691128, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5912, + "step": 3454 + }, + { + "epoch": 0.072758669706175, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5777, + "step": 3455 + }, + { + "epoch": 0.07277972865543873, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 3456 + }, + { + "epoch": 0.07280078760470246, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5901, + "step": 3457 + }, + { + "epoch": 0.07282184655396619, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6113, + "step": 3458 + }, + { + "epoch": 0.07284290550322992, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5924, + "step": 3459 + }, + { + "epoch": 0.07286396445249364, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6108, + "step": 3460 + }, + { + "epoch": 0.07288502340175737, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5556, + "step": 3461 + }, + { + "epoch": 0.0729060823510211, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.581, + "step": 3462 + }, + { + "epoch": 0.07292714130028483, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 3463 + }, + { + "epoch": 0.07294820024954855, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6085, + "step": 3464 + }, + { + "epoch": 0.07296925919881228, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5569, + "step": 3465 + }, + { + "epoch": 0.07299031814807601, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.615, + "step": 3466 + }, + { + "epoch": 0.07301137709733972, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6093, + "step": 3467 + }, + { + "epoch": 0.07303243604660345, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.586, + "step": 3468 + }, + { + "epoch": 0.07305349499586718, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5704, + "step": 3469 + }, + { + "epoch": 0.0730745539451309, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 3470 + }, + { + "epoch": 0.07309561289439463, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 3471 + }, + { + "epoch": 0.07311667184365836, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5746, + "step": 3472 + }, + { + "epoch": 0.07313773079292209, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6073, + "step": 3473 + }, + { + "epoch": 0.07315878974218581, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 3474 + }, + { + "epoch": 0.07317984869144954, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5391, + "step": 3475 + }, + { + "epoch": 0.07320090764071327, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5845, + "step": 3476 + }, + { + "epoch": 0.073221966589977, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6046, + "step": 3477 + }, + { + "epoch": 0.07324302553924072, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5583, + "step": 3478 + }, + { + "epoch": 0.07326408448850445, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.636, + "step": 3479 + }, + { + "epoch": 0.07328514343776817, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5836, + "step": 3480 + }, + { + "epoch": 0.0733062023870319, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.594, + "step": 3481 + }, + { + "epoch": 0.07332726133629562, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 3482 + }, + { + "epoch": 0.07334832028555935, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5682, + "step": 3483 + }, + { + "epoch": 0.07336937923482308, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 3484 + }, + { + "epoch": 0.0733904381840868, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6066, + "step": 3485 + }, + { + "epoch": 0.07341149713335053, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5663, + "step": 3486 + }, + { + "epoch": 0.07343255608261426, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.5665, + "step": 3487 + }, + { + "epoch": 0.07345361503187799, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6001, + "step": 3488 + }, + { + "epoch": 0.07347467398114171, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6255, + "step": 3489 + }, + { + "epoch": 0.07349573293040544, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 3490 + }, + { + "epoch": 0.07351679187966917, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 3491 + }, + { + "epoch": 0.0735378508289329, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.5614, + "step": 3492 + }, + { + "epoch": 0.07355890977819661, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5852, + "step": 3493 + }, + { + "epoch": 0.07357996872746034, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 3494 + }, + { + "epoch": 0.07360102767672407, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 1.596, + "step": 3495 + }, + { + "epoch": 0.07362208662598779, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6275, + "step": 3496 + }, + { + "epoch": 0.07364314557525152, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5641, + "step": 3497 + }, + { + "epoch": 0.07366420452451525, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6325, + "step": 3498 + }, + { + "epoch": 0.07368526347377898, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5639, + "step": 3499 + }, + { + "epoch": 0.0737063224230427, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5621, + "step": 3500 + }, + { + "epoch": 0.0737063224230427, + "eval_loss": 2.018354892730713, + "eval_runtime": 897.5659, + "eval_samples_per_second": 68.853, + "eval_steps_per_second": 2.152, + "step": 3500 + }, + { + "epoch": 0.07372738137230643, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6097, + "step": 3501 + }, + { + "epoch": 0.07374844032157016, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 3502 + }, + { + "epoch": 0.07376949927083389, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6112, + "step": 3503 + }, + { + "epoch": 0.07379055822009761, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5799, + "step": 3504 + }, + { + "epoch": 0.07381161716936134, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6066, + "step": 3505 + }, + { + "epoch": 0.07383267611862507, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5433, + "step": 3506 + }, + { + "epoch": 0.07385373506788878, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5748, + "step": 3507 + }, + { + "epoch": 0.07387479401715251, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 3508 + }, + { + "epoch": 0.07389585296641624, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6073, + "step": 3509 + }, + { + "epoch": 0.07391691191567996, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5724, + "step": 3510 + }, + { + "epoch": 0.07393797086494369, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5596, + "step": 3511 + }, + { + "epoch": 0.07395902981420742, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.5845, + "step": 3512 + }, + { + "epoch": 0.07398008876347115, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.6012, + "step": 3513 + }, + { + "epoch": 0.07400114771273487, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.573, + "step": 3514 + }, + { + "epoch": 0.0740222066619986, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6054, + "step": 3515 + }, + { + "epoch": 0.07404326561126233, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 3516 + }, + { + "epoch": 0.07406432456052606, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6318, + "step": 3517 + }, + { + "epoch": 0.07408538350978978, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 3518 + }, + { + "epoch": 0.07410644245905351, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 3519 + }, + { + "epoch": 0.07412750140831723, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5955, + "step": 3520 + }, + { + "epoch": 0.07414856035758095, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 3521 + }, + { + "epoch": 0.07416961930684468, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 3522 + }, + { + "epoch": 0.07419067825610841, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6095, + "step": 3523 + }, + { + "epoch": 0.07421173720537214, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6186, + "step": 3524 + }, + { + "epoch": 0.07423279615463586, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5757, + "step": 3525 + }, + { + "epoch": 0.07425385510389959, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 3526 + }, + { + "epoch": 0.07427491405316332, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5753, + "step": 3527 + }, + { + "epoch": 0.07429597300242705, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6104, + "step": 3528 + }, + { + "epoch": 0.07431703195169077, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 3529 + }, + { + "epoch": 0.0743380909009545, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6155, + "step": 3530 + }, + { + "epoch": 0.07435914985021823, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5737, + "step": 3531 + }, + { + "epoch": 0.07438020879948196, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 3532 + }, + { + "epoch": 0.07440126774874567, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.581, + "step": 3533 + }, + { + "epoch": 0.0744223266980094, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.6153, + "step": 3534 + }, + { + "epoch": 0.07444338564727312, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 3535 + }, + { + "epoch": 0.07446444459653685, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6121, + "step": 3536 + }, + { + "epoch": 0.07448550354580058, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5517, + "step": 3537 + }, + { + "epoch": 0.07450656249506431, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 3538 + }, + { + "epoch": 0.07452762144432803, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5925, + "step": 3539 + }, + { + "epoch": 0.07454868039359176, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5817, + "step": 3540 + }, + { + "epoch": 0.07456973934285549, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5599, + "step": 3541 + }, + { + "epoch": 0.07459079829211922, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.6037, + "step": 3542 + }, + { + "epoch": 0.07461185724138295, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6374, + "step": 3543 + }, + { + "epoch": 0.07463291619064667, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6131, + "step": 3544 + }, + { + "epoch": 0.0746539751399104, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5945, + "step": 3545 + }, + { + "epoch": 0.07467503408917411, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5538, + "step": 3546 + }, + { + "epoch": 0.07469609303843784, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6129, + "step": 3547 + }, + { + "epoch": 0.07471715198770157, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 3548 + }, + { + "epoch": 0.0747382109369653, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5839, + "step": 3549 + }, + { + "epoch": 0.07475926988622902, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5767, + "step": 3550 + }, + { + "epoch": 0.07478032883549275, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5561, + "step": 3551 + }, + { + "epoch": 0.07480138778475648, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 3552 + }, + { + "epoch": 0.0748224467340202, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 3553 + }, + { + "epoch": 0.07484350568328393, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5643, + "step": 3554 + }, + { + "epoch": 0.07486456463254766, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5759, + "step": 3555 + }, + { + "epoch": 0.07488562358181139, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5893, + "step": 3556 + }, + { + "epoch": 0.07490668253107512, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5945, + "step": 3557 + }, + { + "epoch": 0.07492774148033884, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5925, + "step": 3558 + }, + { + "epoch": 0.07494880042960257, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5613, + "step": 3559 + }, + { + "epoch": 0.07496985937886629, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5672, + "step": 3560 + }, + { + "epoch": 0.07499091832813001, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 3561 + }, + { + "epoch": 0.07501197727739374, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 3562 + }, + { + "epoch": 0.07503303622665747, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6131, + "step": 3563 + }, + { + "epoch": 0.0750540951759212, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 1.5747, + "step": 3564 + }, + { + "epoch": 0.07507515412518492, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5847, + "step": 3565 + }, + { + "epoch": 0.07509621307444865, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6031, + "step": 3566 + }, + { + "epoch": 0.07511727202371238, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5539, + "step": 3567 + }, + { + "epoch": 0.0751383309729761, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5912, + "step": 3568 + }, + { + "epoch": 0.07515938992223983, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 3569 + }, + { + "epoch": 0.07518044887150356, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5391, + "step": 3570 + }, + { + "epoch": 0.07520150782076729, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5792, + "step": 3571 + }, + { + "epoch": 0.07522256677003102, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5544, + "step": 3572 + }, + { + "epoch": 0.07524362571929473, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5911, + "step": 3573 + }, + { + "epoch": 0.07526468466855846, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6121, + "step": 3574 + }, + { + "epoch": 0.07528574361782218, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5803, + "step": 3575 + }, + { + "epoch": 0.07530680256708591, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5749, + "step": 3576 + }, + { + "epoch": 0.07532786151634964, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5404, + "step": 3577 + }, + { + "epoch": 0.07534892046561337, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.536, + "step": 3578 + }, + { + "epoch": 0.0753699794148771, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 3579 + }, + { + "epoch": 0.07539103836414082, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 3580 + }, + { + "epoch": 0.07541209731340455, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5584, + "step": 3581 + }, + { + "epoch": 0.07543315626266828, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5525, + "step": 3582 + }, + { + "epoch": 0.075454215211932, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5753, + "step": 3583 + }, + { + "epoch": 0.07547527416119573, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6083, + "step": 3584 + }, + { + "epoch": 0.07549633311045946, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6094, + "step": 3585 + }, + { + "epoch": 0.07551739205972317, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5954, + "step": 3586 + }, + { + "epoch": 0.0755384510089869, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5987, + "step": 3587 + }, + { + "epoch": 0.07555950995825063, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5873, + "step": 3588 + }, + { + "epoch": 0.07558056890751436, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 3589 + }, + { + "epoch": 0.07560162785677808, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.551, + "step": 3590 + }, + { + "epoch": 0.07562268680604181, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 3591 + }, + { + "epoch": 0.07564374575530554, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5997, + "step": 3592 + }, + { + "epoch": 0.07566480470456927, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 3593 + }, + { + "epoch": 0.075685863653833, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 1.5779, + "step": 3594 + }, + { + "epoch": 0.07570692260309672, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5499, + "step": 3595 + }, + { + "epoch": 0.07572798155236045, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 3596 + }, + { + "epoch": 0.07574904050162418, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 3597 + }, + { + "epoch": 0.0757700994508879, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 3598 + }, + { + "epoch": 0.07579115840015162, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5896, + "step": 3599 + }, + { + "epoch": 0.07581221734941535, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5758, + "step": 3600 + }, + { + "epoch": 0.07583327629867907, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5685, + "step": 3601 + }, + { + "epoch": 0.0758543352479428, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.6113, + "step": 3602 + }, + { + "epoch": 0.07587539419720653, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5567, + "step": 3603 + }, + { + "epoch": 0.07589645314647026, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5755, + "step": 3604 + }, + { + "epoch": 0.07591751209573398, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 3605 + }, + { + "epoch": 0.07593857104499771, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 3606 + }, + { + "epoch": 0.07595962999426144, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5823, + "step": 3607 + }, + { + "epoch": 0.07598068894352517, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.576, + "step": 3608 + }, + { + "epoch": 0.07600174789278889, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 3609 + }, + { + "epoch": 0.07602280684205262, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5696, + "step": 3610 + }, + { + "epoch": 0.07604386579131635, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6064, + "step": 3611 + }, + { + "epoch": 0.07606492474058008, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 3612 + }, + { + "epoch": 0.07608598368984379, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5595, + "step": 3613 + }, + { + "epoch": 0.07610704263910752, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5595, + "step": 3614 + }, + { + "epoch": 0.07612810158837124, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5954, + "step": 3615 + }, + { + "epoch": 0.07614916053763497, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6064, + "step": 3616 + }, + { + "epoch": 0.0761702194868987, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 3617 + }, + { + "epoch": 0.07619127843616243, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5642, + "step": 3618 + }, + { + "epoch": 0.07621233738542615, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6129, + "step": 3619 + }, + { + "epoch": 0.07623339633468988, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5795, + "step": 3620 + }, + { + "epoch": 0.07625445528395361, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 3621 + }, + { + "epoch": 0.07627551423321734, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5679, + "step": 3622 + }, + { + "epoch": 0.07629657318248106, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 3623 + }, + { + "epoch": 0.07631763213174479, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 3624 + }, + { + "epoch": 0.07633869108100852, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 3625 + }, + { + "epoch": 0.07635975003027223, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5778, + "step": 3626 + }, + { + "epoch": 0.07638080897953596, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5805, + "step": 3627 + }, + { + "epoch": 0.07640186792879969, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5558, + "step": 3628 + }, + { + "epoch": 0.07642292687806342, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.605, + "step": 3629 + }, + { + "epoch": 0.07644398582732714, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 3630 + }, + { + "epoch": 0.07646504477659087, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.601, + "step": 3631 + }, + { + "epoch": 0.0764861037258546, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.5688, + "step": 3632 + }, + { + "epoch": 0.07650716267511833, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5878, + "step": 3633 + }, + { + "epoch": 0.07652822162438205, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5862, + "step": 3634 + }, + { + "epoch": 0.07654928057364578, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5576, + "step": 3635 + }, + { + "epoch": 0.07657033952290951, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5689, + "step": 3636 + }, + { + "epoch": 0.07659139847217324, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 3637 + }, + { + "epoch": 0.07661245742143696, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.596, + "step": 3638 + }, + { + "epoch": 0.07663351637070068, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6129, + "step": 3639 + }, + { + "epoch": 0.0766545753199644, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5989, + "step": 3640 + }, + { + "epoch": 0.07667563426922813, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6102, + "step": 3641 + }, + { + "epoch": 0.07669669321849186, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 3642 + }, + { + "epoch": 0.07671775216775559, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5988, + "step": 3643 + }, + { + "epoch": 0.07673881111701931, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6051, + "step": 3644 + }, + { + "epoch": 0.07675987006628304, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5876, + "step": 3645 + }, + { + "epoch": 0.07678092901554677, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5593, + "step": 3646 + }, + { + "epoch": 0.0768019879648105, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5714, + "step": 3647 + }, + { + "epoch": 0.07682304691407423, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6031, + "step": 3648 + }, + { + "epoch": 0.07684410586333795, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5732, + "step": 3649 + }, + { + "epoch": 0.07686516481260168, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5712, + "step": 3650 + }, + { + "epoch": 0.07688622376186541, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 3651 + }, + { + "epoch": 0.07690728271112912, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.633, + "step": 3652 + }, + { + "epoch": 0.07692834166039285, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5984, + "step": 3653 + }, + { + "epoch": 0.07694940060965658, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 3654 + }, + { + "epoch": 0.0769704595589203, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 3655 + }, + { + "epoch": 0.07699151850818403, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5681, + "step": 3656 + }, + { + "epoch": 0.07701257745744776, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.59, + "step": 3657 + }, + { + "epoch": 0.07703363640671149, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.575, + "step": 3658 + }, + { + "epoch": 0.07705469535597521, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 3659 + }, + { + "epoch": 0.07707575430523894, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5835, + "step": 3660 + }, + { + "epoch": 0.07709681325450267, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5651, + "step": 3661 + }, + { + "epoch": 0.0771178722037664, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5599, + "step": 3662 + }, + { + "epoch": 0.07713893115303012, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5873, + "step": 3663 + }, + { + "epoch": 0.07715999010229385, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 3664 + }, + { + "epoch": 0.07718104905155758, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 3665 + }, + { + "epoch": 0.07720210800082129, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6053, + "step": 3666 + }, + { + "epoch": 0.07722316695008502, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5485, + "step": 3667 + }, + { + "epoch": 0.07724422589934875, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5739, + "step": 3668 + }, + { + "epoch": 0.07726528484861248, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5713, + "step": 3669 + }, + { + "epoch": 0.0772863437978762, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 3670 + }, + { + "epoch": 0.07730740274713993, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 3671 + }, + { + "epoch": 0.07732846169640366, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 3672 + }, + { + "epoch": 0.07734952064566739, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6056, + "step": 3673 + }, + { + "epoch": 0.07737057959493111, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5823, + "step": 3674 + }, + { + "epoch": 0.07739163854419484, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5945, + "step": 3675 + }, + { + "epoch": 0.07741269749345857, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 3676 + }, + { + "epoch": 0.0774337564427223, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6021, + "step": 3677 + }, + { + "epoch": 0.07745481539198602, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5784, + "step": 3678 + }, + { + "epoch": 0.07747587434124974, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 3679 + }, + { + "epoch": 0.07749693329051346, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5781, + "step": 3680 + }, + { + "epoch": 0.07751799223977719, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 3681 + }, + { + "epoch": 0.07753905118904092, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5975, + "step": 3682 + }, + { + "epoch": 0.07756011013830465, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6273, + "step": 3683 + }, + { + "epoch": 0.07758116908756837, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5863, + "step": 3684 + }, + { + "epoch": 0.0776022280368321, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5913, + "step": 3685 + }, + { + "epoch": 0.07762328698609583, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5759, + "step": 3686 + }, + { + "epoch": 0.07764434593535956, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5976, + "step": 3687 + }, + { + "epoch": 0.07766540488462328, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 3688 + }, + { + "epoch": 0.07768646383388701, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5528, + "step": 3689 + }, + { + "epoch": 0.07770752278315074, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5809, + "step": 3690 + }, + { + "epoch": 0.07772858173241447, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5863, + "step": 3691 + }, + { + "epoch": 0.07774964068167818, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.5802, + "step": 3692 + }, + { + "epoch": 0.07777069963094191, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6001, + "step": 3693 + }, + { + "epoch": 0.07779175858020564, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6108, + "step": 3694 + }, + { + "epoch": 0.07781281752946936, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5979, + "step": 3695 + }, + { + "epoch": 0.07783387647873309, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 3696 + }, + { + "epoch": 0.07785493542799682, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6079, + "step": 3697 + }, + { + "epoch": 0.07787599437726055, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 1.5785, + "step": 3698 + }, + { + "epoch": 0.07789705332652427, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 3699 + }, + { + "epoch": 0.077918112275788, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5838, + "step": 3700 + }, + { + "epoch": 0.07793917122505173, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 3701 + }, + { + "epoch": 0.07796023017431546, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6104, + "step": 3702 + }, + { + "epoch": 0.07798128912357918, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.576, + "step": 3703 + }, + { + "epoch": 0.07800234807284291, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6191, + "step": 3704 + }, + { + "epoch": 0.07802340702210664, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5742, + "step": 3705 + }, + { + "epoch": 0.07804446597137035, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5669, + "step": 3706 + }, + { + "epoch": 0.07806552492063408, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5614, + "step": 3707 + }, + { + "epoch": 0.07808658386989781, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6113, + "step": 3708 + }, + { + "epoch": 0.07810764281916154, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.6016, + "step": 3709 + }, + { + "epoch": 0.07812870176842526, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 3710 + }, + { + "epoch": 0.07814976071768899, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5678, + "step": 3711 + }, + { + "epoch": 0.07817081966695272, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5952, + "step": 3712 + }, + { + "epoch": 0.07819187861621645, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6148, + "step": 3713 + }, + { + "epoch": 0.07821293756548017, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 3714 + }, + { + "epoch": 0.0782339965147439, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 3715 + }, + { + "epoch": 0.07825505546400763, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.578, + "step": 3716 + }, + { + "epoch": 0.07827611441327136, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5699, + "step": 3717 + }, + { + "epoch": 0.07829717336253508, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 3718 + }, + { + "epoch": 0.0783182323117988, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5995, + "step": 3719 + }, + { + "epoch": 0.07833929126106252, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5918, + "step": 3720 + }, + { + "epoch": 0.07836035021032625, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5779, + "step": 3721 + }, + { + "epoch": 0.07838140915958998, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5815, + "step": 3722 + }, + { + "epoch": 0.0784024681088537, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5629, + "step": 3723 + }, + { + "epoch": 0.07842352705811743, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.616, + "step": 3724 + }, + { + "epoch": 0.07844458600738116, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6054, + "step": 3725 + }, + { + "epoch": 0.07846564495664489, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5746, + "step": 3726 + }, + { + "epoch": 0.07848670390590862, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 3727 + }, + { + "epoch": 0.07850776285517234, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 3728 + }, + { + "epoch": 0.07852882180443607, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5515, + "step": 3729 + }, + { + "epoch": 0.0785498807536998, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.585, + "step": 3730 + }, + { + "epoch": 0.07857093970296353, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5717, + "step": 3731 + }, + { + "epoch": 0.07859199865222724, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5535, + "step": 3732 + }, + { + "epoch": 0.07861305760149097, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.596, + "step": 3733 + }, + { + "epoch": 0.0786341165507547, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.58, + "step": 3734 + }, + { + "epoch": 0.07865517550001842, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5906, + "step": 3735 + }, + { + "epoch": 0.07867623444928215, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6056, + "step": 3736 + }, + { + "epoch": 0.07869729339854588, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5606, + "step": 3737 + }, + { + "epoch": 0.0787183523478096, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 3738 + }, + { + "epoch": 0.07873941129707333, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5736, + "step": 3739 + }, + { + "epoch": 0.07876047024633706, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5978, + "step": 3740 + }, + { + "epoch": 0.07878152919560079, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6203, + "step": 3741 + }, + { + "epoch": 0.07880258814486452, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5674, + "step": 3742 + }, + { + "epoch": 0.07882364709412824, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.576, + "step": 3743 + }, + { + "epoch": 0.07884470604339197, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 3744 + }, + { + "epoch": 0.07886576499265568, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5984, + "step": 3745 + }, + { + "epoch": 0.07888682394191941, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5794, + "step": 3746 + }, + { + "epoch": 0.07890788289118314, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5576, + "step": 3747 + }, + { + "epoch": 0.07892894184044687, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5756, + "step": 3748 + }, + { + "epoch": 0.0789500007897106, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.5734, + "step": 3749 + }, + { + "epoch": 0.07897105973897432, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5882, + "step": 3750 + }, + { + "epoch": 0.07899211868823805, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5962, + "step": 3751 + }, + { + "epoch": 0.07901317763750178, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5565, + "step": 3752 + }, + { + "epoch": 0.0790342365867655, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5731, + "step": 3753 + }, + { + "epoch": 0.07905529553602923, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 3754 + }, + { + "epoch": 0.07907635448529296, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5807, + "step": 3755 + }, + { + "epoch": 0.07909741343455669, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5608, + "step": 3756 + }, + { + "epoch": 0.07911847238382042, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 3757 + }, + { + "epoch": 0.07913953133308414, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6233, + "step": 3758 + }, + { + "epoch": 0.07916059028234786, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.57, + "step": 3759 + }, + { + "epoch": 0.07918164923161158, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.573, + "step": 3760 + }, + { + "epoch": 0.07920270818087531, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6179, + "step": 3761 + }, + { + "epoch": 0.07922376713013904, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6117, + "step": 3762 + }, + { + "epoch": 0.07924482607940277, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 3763 + }, + { + "epoch": 0.0792658850286665, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5852, + "step": 3764 + }, + { + "epoch": 0.07928694397793022, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 3765 + }, + { + "epoch": 0.07930800292719395, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 3766 + }, + { + "epoch": 0.07932906187645768, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5704, + "step": 3767 + }, + { + "epoch": 0.0793501208257214, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 3768 + }, + { + "epoch": 0.07937117977498513, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5887, + "step": 3769 + }, + { + "epoch": 0.07939223872424886, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5995, + "step": 3770 + }, + { + "epoch": 0.07941329767351259, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 3771 + }, + { + "epoch": 0.0794343566227763, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 3772 + }, + { + "epoch": 0.07945541557204003, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.6053, + "step": 3773 + }, + { + "epoch": 0.07947647452130376, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 3774 + }, + { + "epoch": 0.07949753347056748, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.5861, + "step": 3775 + }, + { + "epoch": 0.07951859241983121, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 3776 + }, + { + "epoch": 0.07953965136909494, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 3777 + }, + { + "epoch": 0.07956071031835867, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6108, + "step": 3778 + }, + { + "epoch": 0.0795817692676224, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5484, + "step": 3779 + }, + { + "epoch": 0.07960282821688612, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 3780 + }, + { + "epoch": 0.07962388716614985, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5864, + "step": 3781 + }, + { + "epoch": 0.07964494611541358, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 3782 + }, + { + "epoch": 0.0796660050646773, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5994, + "step": 3783 + }, + { + "epoch": 0.07968706401394103, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 3784 + }, + { + "epoch": 0.07970812296320474, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 3785 + }, + { + "epoch": 0.07972918191246847, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5424, + "step": 3786 + }, + { + "epoch": 0.0797502408617322, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5644, + "step": 3787 + }, + { + "epoch": 0.07977129981099593, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5977, + "step": 3788 + }, + { + "epoch": 0.07979235876025965, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5727, + "step": 3789 + }, + { + "epoch": 0.07981341770952338, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 1.5634, + "step": 3790 + }, + { + "epoch": 0.07983447665878711, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5651, + "step": 3791 + }, + { + "epoch": 0.07985553560805084, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6226, + "step": 3792 + }, + { + "epoch": 0.07987659455731456, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5542, + "step": 3793 + }, + { + "epoch": 0.07989765350657829, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 3794 + }, + { + "epoch": 0.07991871245584202, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 3795 + }, + { + "epoch": 0.07993977140510575, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5623, + "step": 3796 + }, + { + "epoch": 0.07996083035436947, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 3797 + }, + { + "epoch": 0.07998188930363319, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5873, + "step": 3798 + }, + { + "epoch": 0.08000294825289692, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5929, + "step": 3799 + }, + { + "epoch": 0.08002400720216064, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5595, + "step": 3800 + }, + { + "epoch": 0.08004506615142437, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.586, + "step": 3801 + }, + { + "epoch": 0.0800661251006881, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 3802 + }, + { + "epoch": 0.08008718404995183, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 3803 + }, + { + "epoch": 0.08010824299921555, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 3804 + }, + { + "epoch": 0.08012930194847928, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 1.5689, + "step": 3805 + }, + { + "epoch": 0.08015036089774301, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6086, + "step": 3806 + }, + { + "epoch": 0.08017141984700674, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.5863, + "step": 3807 + }, + { + "epoch": 0.08019247879627046, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 3808 + }, + { + "epoch": 0.08021353774553419, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5858, + "step": 3809 + }, + { + "epoch": 0.08023459669479792, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5851, + "step": 3810 + }, + { + "epoch": 0.08025565564406165, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 3811 + }, + { + "epoch": 0.08027671459332536, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3812 + }, + { + "epoch": 0.08029777354258909, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6083, + "step": 3813 + }, + { + "epoch": 0.08031883249185282, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5893, + "step": 3814 + }, + { + "epoch": 0.08033989144111654, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5803, + "step": 3815 + }, + { + "epoch": 0.08036095039038027, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.594, + "step": 3816 + }, + { + "epoch": 0.080382009339644, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5744, + "step": 3817 + }, + { + "epoch": 0.08040306828890773, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5767, + "step": 3818 + }, + { + "epoch": 0.08042412723817145, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 1.5805, + "step": 3819 + }, + { + "epoch": 0.08044518618743518, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5754, + "step": 3820 + }, + { + "epoch": 0.08046624513669891, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 3821 + }, + { + "epoch": 0.08048730408596264, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 3822 + }, + { + "epoch": 0.08050836303522636, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5916, + "step": 3823 + }, + { + "epoch": 0.08052942198449009, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 3824 + }, + { + "epoch": 0.0805504809337538, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 3825 + }, + { + "epoch": 0.08057153988301753, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5571, + "step": 3826 + }, + { + "epoch": 0.08059259883228126, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6108, + "step": 3827 + }, + { + "epoch": 0.08061365778154499, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5882, + "step": 3828 + }, + { + "epoch": 0.08063471673080871, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5555, + "step": 3829 + }, + { + "epoch": 0.08065577568007244, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6218, + "step": 3830 + }, + { + "epoch": 0.08067683462933617, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6215, + "step": 3831 + }, + { + "epoch": 0.0806978935785999, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 3832 + }, + { + "epoch": 0.08071895252786362, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 3833 + }, + { + "epoch": 0.08074001147712735, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5808, + "step": 3834 + }, + { + "epoch": 0.08076107042639108, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.5703, + "step": 3835 + }, + { + "epoch": 0.08078212937565481, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5615, + "step": 3836 + }, + { + "epoch": 0.08080318832491853, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.6016, + "step": 3837 + }, + { + "epoch": 0.08082424727418225, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 3838 + }, + { + "epoch": 0.08084530622344598, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5893, + "step": 3839 + }, + { + "epoch": 0.0808663651727097, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 3840 + }, + { + "epoch": 0.08088742412197343, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5764, + "step": 3841 + }, + { + "epoch": 0.08090848307123716, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 3842 + }, + { + "epoch": 0.08092954202050089, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5667, + "step": 3843 + }, + { + "epoch": 0.08095060096976461, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6, + "step": 3844 + }, + { + "epoch": 0.08097165991902834, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 1.6131, + "step": 3845 + }, + { + "epoch": 0.08099271886829207, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 3846 + }, + { + "epoch": 0.0810137778175558, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 3847 + }, + { + "epoch": 0.08103483676681952, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 1.5858, + "step": 3848 + }, + { + "epoch": 0.08105589571608325, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5835, + "step": 3849 + }, + { + "epoch": 0.08107695466534698, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5482, + "step": 3850 + }, + { + "epoch": 0.08109801361461069, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6136, + "step": 3851 + }, + { + "epoch": 0.08111907256387442, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 3852 + }, + { + "epoch": 0.08114013151313815, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5347, + "step": 3853 + }, + { + "epoch": 0.08116119046240187, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5785, + "step": 3854 + }, + { + "epoch": 0.0811822494116656, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5818, + "step": 3855 + }, + { + "epoch": 0.08120330836092933, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.572, + "step": 3856 + }, + { + "epoch": 0.08122436731019306, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5732, + "step": 3857 + }, + { + "epoch": 0.08124542625945678, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5929, + "step": 3858 + }, + { + "epoch": 0.08126648520872051, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 3859 + }, + { + "epoch": 0.08128754415798424, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5869, + "step": 3860 + }, + { + "epoch": 0.08130860310724797, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5925, + "step": 3861 + }, + { + "epoch": 0.0813296620565117, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5722, + "step": 3862 + }, + { + "epoch": 0.08135072100577542, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 3863 + }, + { + "epoch": 0.08137177995503915, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6138, + "step": 3864 + }, + { + "epoch": 0.08139283890430286, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 3865 + }, + { + "epoch": 0.08141389785356659, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 3866 + }, + { + "epoch": 0.08143495680283032, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5822, + "step": 3867 + }, + { + "epoch": 0.08145601575209405, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6299, + "step": 3868 + }, + { + "epoch": 0.08147707470135777, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.6222, + "step": 3869 + }, + { + "epoch": 0.0814981336506215, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.616, + "step": 3870 + }, + { + "epoch": 0.08151919259988523, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5648, + "step": 3871 + }, + { + "epoch": 0.08154025154914896, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 3872 + }, + { + "epoch": 0.08156131049841268, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 3873 + }, + { + "epoch": 0.08158236944767641, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.545, + "step": 3874 + }, + { + "epoch": 0.08160342839694014, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 3875 + }, + { + "epoch": 0.08162448734620387, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.574, + "step": 3876 + }, + { + "epoch": 0.0816455462954676, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5823, + "step": 3877 + }, + { + "epoch": 0.08166660524473131, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5827, + "step": 3878 + }, + { + "epoch": 0.08168766419399504, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5786, + "step": 3879 + }, + { + "epoch": 0.08170872314325876, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.5983, + "step": 3880 + }, + { + "epoch": 0.08172978209252249, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5559, + "step": 3881 + }, + { + "epoch": 0.08175084104178622, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5897, + "step": 3882 + }, + { + "epoch": 0.08177189999104995, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5838, + "step": 3883 + }, + { + "epoch": 0.08179295894031367, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5518, + "step": 3884 + }, + { + "epoch": 0.0818140178895774, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.5711, + "step": 3885 + }, + { + "epoch": 0.08183507683884113, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 3886 + }, + { + "epoch": 0.08185613578810486, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5575, + "step": 3887 + }, + { + "epoch": 0.08187719473736858, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5799, + "step": 3888 + }, + { + "epoch": 0.08189825368663231, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.554, + "step": 3889 + }, + { + "epoch": 0.08191931263589604, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6094, + "step": 3890 + }, + { + "epoch": 0.08194037158515975, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5863, + "step": 3891 + }, + { + "epoch": 0.08196143053442348, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5628, + "step": 3892 + }, + { + "epoch": 0.08198248948368721, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6101, + "step": 3893 + }, + { + "epoch": 0.08200354843295093, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 3894 + }, + { + "epoch": 0.08202460738221466, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5739, + "step": 3895 + }, + { + "epoch": 0.08204566633147839, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6073, + "step": 3896 + }, + { + "epoch": 0.08206672528074212, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.594, + "step": 3897 + }, + { + "epoch": 0.08208778423000584, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 3898 + }, + { + "epoch": 0.08210884317926957, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6135, + "step": 3899 + }, + { + "epoch": 0.0821299021285333, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 3900 + }, + { + "epoch": 0.08215096107779703, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 3901 + }, + { + "epoch": 0.08217202002706075, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6095, + "step": 3902 + }, + { + "epoch": 0.08219307897632448, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 3903 + }, + { + "epoch": 0.0822141379255882, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 3904 + }, + { + "epoch": 0.08223519687485192, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5922, + "step": 3905 + }, + { + "epoch": 0.08225625582411565, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5749, + "step": 3906 + }, + { + "epoch": 0.08227731477337938, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5844, + "step": 3907 + }, + { + "epoch": 0.0822983737226431, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5683, + "step": 3908 + }, + { + "epoch": 0.08231943267190683, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5987, + "step": 3909 + }, + { + "epoch": 0.08234049162117056, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 3910 + }, + { + "epoch": 0.08236155057043429, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.572, + "step": 3911 + }, + { + "epoch": 0.08238260951969802, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5661, + "step": 3912 + }, + { + "epoch": 0.08240366846896174, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6054, + "step": 3913 + }, + { + "epoch": 0.08242472741822547, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5797, + "step": 3914 + }, + { + "epoch": 0.0824457863674892, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 3915 + }, + { + "epoch": 0.08246684531675293, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5916, + "step": 3916 + }, + { + "epoch": 0.08248790426601665, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6077, + "step": 3917 + }, + { + "epoch": 0.08250896321528037, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.593, + "step": 3918 + }, + { + "epoch": 0.0825300221645441, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5967, + "step": 3919 + }, + { + "epoch": 0.08255108111380782, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6174, + "step": 3920 + }, + { + "epoch": 0.08257214006307155, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 3921 + }, + { + "epoch": 0.08259319901233528, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 3922 + }, + { + "epoch": 0.082614257961599, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5557, + "step": 3923 + }, + { + "epoch": 0.08263531691086273, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 3924 + }, + { + "epoch": 0.08265637586012646, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5856, + "step": 3925 + }, + { + "epoch": 0.08267743480939019, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 3926 + }, + { + "epoch": 0.08269849375865392, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5838, + "step": 3927 + }, + { + "epoch": 0.08271955270791764, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5953, + "step": 3928 + }, + { + "epoch": 0.08274061165718137, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.593, + "step": 3929 + }, + { + "epoch": 0.0827616706064451, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 3930 + }, + { + "epoch": 0.08278272955570881, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5749, + "step": 3931 + }, + { + "epoch": 0.08280378850497254, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 1.5896, + "step": 3932 + }, + { + "epoch": 0.08282484745423627, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 3933 + }, + { + "epoch": 0.0828459064035, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6298, + "step": 3934 + }, + { + "epoch": 0.08286696535276372, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5974, + "step": 3935 + }, + { + "epoch": 0.08288802430202745, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 3936 + }, + { + "epoch": 0.08290908325129118, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 3937 + }, + { + "epoch": 0.0829301422005549, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5679, + "step": 3938 + }, + { + "epoch": 0.08295120114981863, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6073, + "step": 3939 + }, + { + "epoch": 0.08297226009908236, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5631, + "step": 3940 + }, + { + "epoch": 0.08299331904834609, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5829, + "step": 3941 + }, + { + "epoch": 0.08301437799760981, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5922, + "step": 3942 + }, + { + "epoch": 0.08303543694687354, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5625, + "step": 3943 + }, + { + "epoch": 0.08305649589613726, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6263, + "step": 3944 + }, + { + "epoch": 0.08307755484540098, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6159, + "step": 3945 + }, + { + "epoch": 0.08309861379466471, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5987, + "step": 3946 + }, + { + "epoch": 0.08311967274392844, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.5685, + "step": 3947 + }, + { + "epoch": 0.08314073169319217, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5815, + "step": 3948 + }, + { + "epoch": 0.0831617906424559, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5767, + "step": 3949 + }, + { + "epoch": 0.08318284959171962, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 3950 + }, + { + "epoch": 0.08320390854098335, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5815, + "step": 3951 + }, + { + "epoch": 0.08322496749024708, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 1.5727, + "step": 3952 + }, + { + "epoch": 0.0832460264395108, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 3953 + }, + { + "epoch": 0.08326708538877453, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6272, + "step": 3954 + }, + { + "epoch": 0.08328814433803826, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5731, + "step": 3955 + }, + { + "epoch": 0.08330920328730199, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5644, + "step": 3956 + }, + { + "epoch": 0.0833302622365657, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5596, + "step": 3957 + }, + { + "epoch": 0.08335132118582943, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6111, + "step": 3958 + }, + { + "epoch": 0.08337238013509315, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6061, + "step": 3959 + }, + { + "epoch": 0.08339343908435688, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.581, + "step": 3960 + }, + { + "epoch": 0.08341449803362061, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6083, + "step": 3961 + }, + { + "epoch": 0.08343555698288434, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5434, + "step": 3962 + }, + { + "epoch": 0.08345661593214806, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 3963 + }, + { + "epoch": 0.08347767488141179, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5912, + "step": 3964 + }, + { + "epoch": 0.08349873383067552, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5998, + "step": 3965 + }, + { + "epoch": 0.08351979277993925, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5314, + "step": 3966 + }, + { + "epoch": 0.08354085172920298, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.6056, + "step": 3967 + }, + { + "epoch": 0.0835619106784667, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5697, + "step": 3968 + }, + { + "epoch": 0.08358296962773043, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5678, + "step": 3969 + }, + { + "epoch": 0.08360402857699416, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6399, + "step": 3970 + }, + { + "epoch": 0.08362508752625787, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5924, + "step": 3971 + }, + { + "epoch": 0.0836461464755216, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6072, + "step": 3972 + }, + { + "epoch": 0.08366720542478533, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 3973 + }, + { + "epoch": 0.08368826437404905, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5933, + "step": 3974 + }, + { + "epoch": 0.08370932332331278, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5721, + "step": 3975 + }, + { + "epoch": 0.08373038227257651, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5739, + "step": 3976 + }, + { + "epoch": 0.08375144122184024, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5979, + "step": 3977 + }, + { + "epoch": 0.08377250017110396, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 3978 + }, + { + "epoch": 0.08379355912036769, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.5868, + "step": 3979 + }, + { + "epoch": 0.08381461806963142, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5782, + "step": 3980 + }, + { + "epoch": 0.08383567701889515, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5918, + "step": 3981 + }, + { + "epoch": 0.08385673596815887, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5816, + "step": 3982 + }, + { + "epoch": 0.0838777949174226, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5691, + "step": 3983 + }, + { + "epoch": 0.08389885386668632, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5709, + "step": 3984 + }, + { + "epoch": 0.08391991281595004, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6117, + "step": 3985 + }, + { + "epoch": 0.08394097176521377, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 3986 + }, + { + "epoch": 0.0839620307144775, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5504, + "step": 3987 + }, + { + "epoch": 0.08398308966374123, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5408, + "step": 3988 + }, + { + "epoch": 0.08400414861300495, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 3989 + }, + { + "epoch": 0.08402520756226868, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5785, + "step": 3990 + }, + { + "epoch": 0.08404626651153241, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5939, + "step": 3991 + }, + { + "epoch": 0.08406732546079614, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5608, + "step": 3992 + }, + { + "epoch": 0.08408838441005986, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 1.5855, + "step": 3993 + }, + { + "epoch": 0.08410944335932359, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5583, + "step": 3994 + }, + { + "epoch": 0.08413050230858732, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5972, + "step": 3995 + }, + { + "epoch": 0.08415156125785105, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 1.5906, + "step": 3996 + }, + { + "epoch": 0.08417262020711476, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 3997 + }, + { + "epoch": 0.08419367915637849, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5777, + "step": 3998 + }, + { + "epoch": 0.08421473810564221, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 3999 + }, + { + "epoch": 0.08423579705490594, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 4000 + }, + { + "epoch": 0.08423579705490594, + "eval_loss": 2.1100332736968994, + "eval_runtime": 990.4764, + "eval_samples_per_second": 62.394, + "eval_steps_per_second": 1.951, + "step": 4000 + }, + { + "epoch": 0.08425685600416967, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5891, + "step": 4001 + }, + { + "epoch": 0.0842779149534334, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6125, + "step": 4002 + }, + { + "epoch": 0.08429897390269712, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5656, + "step": 4003 + }, + { + "epoch": 0.08432003285196085, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 4004 + }, + { + "epoch": 0.08434109180122458, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 4005 + }, + { + "epoch": 0.08436215075048831, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.5553, + "step": 4006 + }, + { + "epoch": 0.08438320969975203, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5919, + "step": 4007 + }, + { + "epoch": 0.08440426864901576, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5849, + "step": 4008 + }, + { + "epoch": 0.08442532759827949, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6342, + "step": 4009 + }, + { + "epoch": 0.0844463865475432, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5912, + "step": 4010 + }, + { + "epoch": 0.08446744549680693, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 4011 + }, + { + "epoch": 0.08448850444607066, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5954, + "step": 4012 + }, + { + "epoch": 0.08450956339533439, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 4013 + }, + { + "epoch": 0.08453062234459811, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 4014 + }, + { + "epoch": 0.08455168129386184, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 1.6005, + "step": 4015 + }, + { + "epoch": 0.08457274024312557, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 4016 + }, + { + "epoch": 0.0845937991923893, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6411, + "step": 4017 + }, + { + "epoch": 0.08461485814165302, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5379, + "step": 4018 + }, + { + "epoch": 0.08463591709091675, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5945, + "step": 4019 + }, + { + "epoch": 0.08465697604018048, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6066, + "step": 4020 + }, + { + "epoch": 0.0846780349894442, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5596, + "step": 4021 + }, + { + "epoch": 0.08469909393870793, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6042, + "step": 4022 + }, + { + "epoch": 0.08472015288797166, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6043, + "step": 4023 + }, + { + "epoch": 0.08474121183723538, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 4024 + }, + { + "epoch": 0.0847622707864991, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6272, + "step": 4025 + }, + { + "epoch": 0.08478332973576283, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.582, + "step": 4026 + }, + { + "epoch": 0.08480438868502656, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5457, + "step": 4027 + }, + { + "epoch": 0.08482544763429029, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5613, + "step": 4028 + }, + { + "epoch": 0.08484650658355401, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6101, + "step": 4029 + }, + { + "epoch": 0.08486756553281774, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5534, + "step": 4030 + }, + { + "epoch": 0.08488862448208147, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5849, + "step": 4031 + }, + { + "epoch": 0.0849096834313452, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 4032 + }, + { + "epoch": 0.08493074238060892, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5405, + "step": 4033 + }, + { + "epoch": 0.08495180132987265, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 4034 + }, + { + "epoch": 0.08497286027913638, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6134, + "step": 4035 + }, + { + "epoch": 0.0849939192284001, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.5778, + "step": 4036 + }, + { + "epoch": 0.08501497817766382, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5787, + "step": 4037 + }, + { + "epoch": 0.08503603712692755, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6093, + "step": 4038 + }, + { + "epoch": 0.08505709607619127, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6064, + "step": 4039 + }, + { + "epoch": 0.085078155025455, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 4040 + }, + { + "epoch": 0.08509921397471873, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5795, + "step": 4041 + }, + { + "epoch": 0.08512027292398246, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6022, + "step": 4042 + }, + { + "epoch": 0.08514133187324618, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 4043 + }, + { + "epoch": 0.08516239082250991, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6174, + "step": 4044 + }, + { + "epoch": 0.08518344977177364, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6026, + "step": 4045 + }, + { + "epoch": 0.08520450872103737, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5504, + "step": 4046 + }, + { + "epoch": 0.0852255676703011, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6043, + "step": 4047 + }, + { + "epoch": 0.08524662661956482, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5691, + "step": 4048 + }, + { + "epoch": 0.08526768556882855, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5868, + "step": 4049 + }, + { + "epoch": 0.08528874451809226, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.554, + "step": 4050 + }, + { + "epoch": 0.08530980346735599, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5782, + "step": 4051 + }, + { + "epoch": 0.08533086241661972, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6496, + "step": 4052 + }, + { + "epoch": 0.08535192136588345, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5904, + "step": 4053 + }, + { + "epoch": 0.08537298031514717, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5592, + "step": 4054 + }, + { + "epoch": 0.0853940392644109, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 4055 + }, + { + "epoch": 0.08541509821367463, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5647, + "step": 4056 + }, + { + "epoch": 0.08543615716293836, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5597, + "step": 4057 + }, + { + "epoch": 0.08545721611220208, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5685, + "step": 4058 + }, + { + "epoch": 0.08547827506146581, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.6284, + "step": 4059 + }, + { + "epoch": 0.08549933401072954, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5666, + "step": 4060 + }, + { + "epoch": 0.08552039295999327, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5728, + "step": 4061 + }, + { + "epoch": 0.085541451909257, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 4062 + }, + { + "epoch": 0.08556251085852071, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5571, + "step": 4063 + }, + { + "epoch": 0.08558356980778443, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 4064 + }, + { + "epoch": 0.08560462875704816, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 4065 + }, + { + "epoch": 0.08562568770631189, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5675, + "step": 4066 + }, + { + "epoch": 0.08564674665557562, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5699, + "step": 4067 + }, + { + "epoch": 0.08566780560483934, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.554, + "step": 4068 + }, + { + "epoch": 0.08568886455410307, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5507, + "step": 4069 + }, + { + "epoch": 0.0857099235033668, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5855, + "step": 4070 + }, + { + "epoch": 0.08573098245263053, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5741, + "step": 4071 + }, + { + "epoch": 0.08575204140189426, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5787, + "step": 4072 + }, + { + "epoch": 0.08577310035115798, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5488, + "step": 4073 + }, + { + "epoch": 0.08579415930042171, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5575, + "step": 4074 + }, + { + "epoch": 0.08581521824968544, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5774, + "step": 4075 + }, + { + "epoch": 0.08583627719894917, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5579, + "step": 4076 + }, + { + "epoch": 0.08585733614821288, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5807, + "step": 4077 + }, + { + "epoch": 0.0858783950974766, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5799, + "step": 4078 + }, + { + "epoch": 0.08589945404674033, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5634, + "step": 4079 + }, + { + "epoch": 0.08592051299600406, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6134, + "step": 4080 + }, + { + "epoch": 0.08594157194526779, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5569, + "step": 4081 + }, + { + "epoch": 0.08596263089453152, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5557, + "step": 4082 + }, + { + "epoch": 0.08598368984379524, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6015, + "step": 4083 + }, + { + "epoch": 0.08600474879305897, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5807, + "step": 4084 + }, + { + "epoch": 0.0860258077423227, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5863, + "step": 4085 + }, + { + "epoch": 0.08604686669158643, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.5584, + "step": 4086 + }, + { + "epoch": 0.08606792564085015, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.638, + "step": 4087 + }, + { + "epoch": 0.08608898459011388, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5633, + "step": 4088 + }, + { + "epoch": 0.08611004353937761, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5691, + "step": 4089 + }, + { + "epoch": 0.08613110248864132, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 4090 + }, + { + "epoch": 0.08615216143790505, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.5972, + "step": 4091 + }, + { + "epoch": 0.08617322038716878, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5719, + "step": 4092 + }, + { + "epoch": 0.0861942793364325, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5868, + "step": 4093 + }, + { + "epoch": 0.08621533828569623, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6196, + "step": 4094 + }, + { + "epoch": 0.08623639723495996, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5609, + "step": 4095 + }, + { + "epoch": 0.08625745618422369, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5594, + "step": 4096 + }, + { + "epoch": 0.08627851513348742, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5484, + "step": 4097 + }, + { + "epoch": 0.08629957408275114, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 4098 + }, + { + "epoch": 0.08632063303201487, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 4099 + }, + { + "epoch": 0.0863416919812786, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 4100 + }, + { + "epoch": 0.08636275093054233, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5569, + "step": 4101 + }, + { + "epoch": 0.08638380987980605, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5532, + "step": 4102 + }, + { + "epoch": 0.08640486882906977, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5848, + "step": 4103 + }, + { + "epoch": 0.0864259277783335, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 4104 + }, + { + "epoch": 0.08644698672759722, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 4105 + }, + { + "epoch": 0.08646804567686095, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5558, + "step": 4106 + }, + { + "epoch": 0.08648910462612468, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6024, + "step": 4107 + }, + { + "epoch": 0.0865101635753884, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5963, + "step": 4108 + }, + { + "epoch": 0.08653122252465213, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6019, + "step": 4109 + }, + { + "epoch": 0.08655228147391586, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5948, + "step": 4110 + }, + { + "epoch": 0.08657334042317959, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5713, + "step": 4111 + }, + { + "epoch": 0.08659439937244331, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5913, + "step": 4112 + }, + { + "epoch": 0.08661545832170704, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5665, + "step": 4113 + }, + { + "epoch": 0.08663651727097077, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5586, + "step": 4114 + }, + { + "epoch": 0.0866575762202345, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5852, + "step": 4115 + }, + { + "epoch": 0.08667863516949821, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5899, + "step": 4116 + }, + { + "epoch": 0.08669969411876194, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6012, + "step": 4117 + }, + { + "epoch": 0.08672075306802567, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5713, + "step": 4118 + }, + { + "epoch": 0.0867418120172894, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 4119 + }, + { + "epoch": 0.08676287096655312, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6097, + "step": 4120 + }, + { + "epoch": 0.08678392991581685, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6431, + "step": 4121 + }, + { + "epoch": 0.08680498886508058, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5615, + "step": 4122 + }, + { + "epoch": 0.0868260478143443, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6194, + "step": 4123 + }, + { + "epoch": 0.08684710676360803, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 4124 + }, + { + "epoch": 0.08686816571287176, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5808, + "step": 4125 + }, + { + "epoch": 0.08688922466213549, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5967, + "step": 4126 + }, + { + "epoch": 0.08691028361139921, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6032, + "step": 4127 + }, + { + "epoch": 0.08693134256066294, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5887, + "step": 4128 + }, + { + "epoch": 0.08695240150992667, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6248, + "step": 4129 + }, + { + "epoch": 0.08697346045919038, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 4130 + }, + { + "epoch": 0.08699451940845411, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6043, + "step": 4131 + }, + { + "epoch": 0.08701557835771784, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6094, + "step": 4132 + }, + { + "epoch": 0.08703663730698157, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 4133 + }, + { + "epoch": 0.08705769625624529, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6055, + "step": 4134 + }, + { + "epoch": 0.08707875520550902, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5837, + "step": 4135 + }, + { + "epoch": 0.08709981415477275, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 4136 + }, + { + "epoch": 0.08712087310403648, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5562, + "step": 4137 + }, + { + "epoch": 0.0871419320533002, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5651, + "step": 4138 + }, + { + "epoch": 0.08716299100256393, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 4139 + }, + { + "epoch": 0.08718404995182766, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5675, + "step": 4140 + }, + { + "epoch": 0.08720510890109139, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 4141 + }, + { + "epoch": 0.08722616785035511, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5881, + "step": 4142 + }, + { + "epoch": 0.08724722679961883, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5876, + "step": 4143 + }, + { + "epoch": 0.08726828574888255, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 4144 + }, + { + "epoch": 0.08728934469814628, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 1.5777, + "step": 4145 + }, + { + "epoch": 0.08731040364741001, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5809, + "step": 4146 + }, + { + "epoch": 0.08733146259667374, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 4147 + }, + { + "epoch": 0.08735252154593746, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 4148 + }, + { + "epoch": 0.08737358049520119, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 4149 + }, + { + "epoch": 0.08739463944446492, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5858, + "step": 4150 + }, + { + "epoch": 0.08741569839372865, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 4151 + }, + { + "epoch": 0.08743675734299237, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5734, + "step": 4152 + }, + { + "epoch": 0.0874578162922561, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.5629, + "step": 4153 + }, + { + "epoch": 0.08747887524151983, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5914, + "step": 4154 + }, + { + "epoch": 0.08749993419078356, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.594, + "step": 4155 + }, + { + "epoch": 0.08752099314004727, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5676, + "step": 4156 + }, + { + "epoch": 0.087542052089311, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5723, + "step": 4157 + }, + { + "epoch": 0.08756311103857473, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 4158 + }, + { + "epoch": 0.08758416998783845, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5767, + "step": 4159 + }, + { + "epoch": 0.08760522893710218, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5894, + "step": 4160 + }, + { + "epoch": 0.08762628788636591, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 1.6121, + "step": 4161 + }, + { + "epoch": 0.08764734683562964, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5989, + "step": 4162 + }, + { + "epoch": 0.08766840578489336, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6408, + "step": 4163 + }, + { + "epoch": 0.08768946473415709, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6001, + "step": 4164 + }, + { + "epoch": 0.08771052368342082, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5846, + "step": 4165 + }, + { + "epoch": 0.08773158263268455, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5863, + "step": 4166 + }, + { + "epoch": 0.08775264158194827, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6249, + "step": 4167 + }, + { + "epoch": 0.087773700531212, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 4168 + }, + { + "epoch": 0.08779475948047571, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.656, + "step": 4169 + }, + { + "epoch": 0.08781581842973944, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6073, + "step": 4170 + }, + { + "epoch": 0.08783687737900317, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5695, + "step": 4171 + }, + { + "epoch": 0.0878579363282669, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5621, + "step": 4172 + }, + { + "epoch": 0.08787899527753062, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 4173 + }, + { + "epoch": 0.08790005422679435, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6283, + "step": 4174 + }, + { + "epoch": 0.08792111317605808, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 4175 + }, + { + "epoch": 0.08794217212532181, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5893, + "step": 4176 + }, + { + "epoch": 0.08796323107458554, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5719, + "step": 4177 + }, + { + "epoch": 0.08798429002384926, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5876, + "step": 4178 + }, + { + "epoch": 0.08800534897311299, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5818, + "step": 4179 + }, + { + "epoch": 0.08802640792237672, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 4180 + }, + { + "epoch": 0.08804746687164045, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5782, + "step": 4181 + }, + { + "epoch": 0.08806852582090417, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5731, + "step": 4182 + }, + { + "epoch": 0.08808958477016789, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5683, + "step": 4183 + }, + { + "epoch": 0.08811064371943161, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5691, + "step": 4184 + }, + { + "epoch": 0.08813170266869534, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 4185 + }, + { + "epoch": 0.08815276161795907, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5764, + "step": 4186 + }, + { + "epoch": 0.0881738205672228, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 1.6485, + "step": 4187 + }, + { + "epoch": 0.08819487951648652, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5936, + "step": 4188 + }, + { + "epoch": 0.08821593846575025, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 4189 + }, + { + "epoch": 0.08823699741501398, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 4190 + }, + { + "epoch": 0.0882580563642777, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5715, + "step": 4191 + }, + { + "epoch": 0.08827911531354143, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5753, + "step": 4192 + }, + { + "epoch": 0.08830017426280516, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5899, + "step": 4193 + }, + { + "epoch": 0.08832123321206889, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 1.592, + "step": 4194 + }, + { + "epoch": 0.08834229216133262, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 4195 + }, + { + "epoch": 0.08836335111059633, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6238, + "step": 4196 + }, + { + "epoch": 0.08838441005986006, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5699, + "step": 4197 + }, + { + "epoch": 0.08840546900912379, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5642, + "step": 4198 + }, + { + "epoch": 0.08842652795838751, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 4199 + }, + { + "epoch": 0.08844758690765124, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5811, + "step": 4200 + }, + { + "epoch": 0.08846864585691497, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.5978, + "step": 4201 + }, + { + "epoch": 0.0884897048061787, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 4202 + }, + { + "epoch": 0.08851076375544242, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 4203 + }, + { + "epoch": 0.08853182270470615, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5625, + "step": 4204 + }, + { + "epoch": 0.08855288165396988, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5882, + "step": 4205 + }, + { + "epoch": 0.0885739406032336, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5826, + "step": 4206 + }, + { + "epoch": 0.08859499955249733, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.6048, + "step": 4207 + }, + { + "epoch": 0.08861605850176106, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6066, + "step": 4208 + }, + { + "epoch": 0.08863711745102477, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6181, + "step": 4209 + }, + { + "epoch": 0.0886581764002885, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.576, + "step": 4210 + }, + { + "epoch": 0.08867923534955223, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 4211 + }, + { + "epoch": 0.08870029429881596, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5811, + "step": 4212 + }, + { + "epoch": 0.08872135324807968, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5938, + "step": 4213 + }, + { + "epoch": 0.08874241219734341, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 4214 + }, + { + "epoch": 0.08876347114660714, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5544, + "step": 4215 + }, + { + "epoch": 0.08878453009587087, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6046, + "step": 4216 + }, + { + "epoch": 0.0888055890451346, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 4217 + }, + { + "epoch": 0.08882664799439832, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5939, + "step": 4218 + }, + { + "epoch": 0.08884770694366205, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 4219 + }, + { + "epoch": 0.08886876589292578, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5596, + "step": 4220 + }, + { + "epoch": 0.0888898248421895, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 4221 + }, + { + "epoch": 0.08891088379145322, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5398, + "step": 4222 + }, + { + "epoch": 0.08893194274071695, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5826, + "step": 4223 + }, + { + "epoch": 0.08895300168998067, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5677, + "step": 4224 + }, + { + "epoch": 0.0889740606392444, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5407, + "step": 4225 + }, + { + "epoch": 0.08899511958850813, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5561, + "step": 4226 + }, + { + "epoch": 0.08901617853777186, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5918, + "step": 4227 + }, + { + "epoch": 0.08903723748703558, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5475, + "step": 4228 + }, + { + "epoch": 0.08905829643629931, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5749, + "step": 4229 + }, + { + "epoch": 0.08907935538556304, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5699, + "step": 4230 + }, + { + "epoch": 0.08910041433482677, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5815, + "step": 4231 + }, + { + "epoch": 0.0891214732840905, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6148, + "step": 4232 + }, + { + "epoch": 0.08914253223335422, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 4233 + }, + { + "epoch": 0.08916359118261795, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 4234 + }, + { + "epoch": 0.08918465013188168, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5916, + "step": 4235 + }, + { + "epoch": 0.08920570908114539, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5795, + "step": 4236 + }, + { + "epoch": 0.08922676803040912, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6065, + "step": 4237 + }, + { + "epoch": 0.08924782697967285, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5985, + "step": 4238 + }, + { + "epoch": 0.08926888592893657, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 4239 + }, + { + "epoch": 0.0892899448782003, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6073, + "step": 4240 + }, + { + "epoch": 0.08931100382746403, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 4241 + }, + { + "epoch": 0.08933206277672776, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6209, + "step": 4242 + }, + { + "epoch": 0.08935312172599148, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 4243 + }, + { + "epoch": 0.08937418067525521, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5733, + "step": 4244 + }, + { + "epoch": 0.08939523962451894, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 4245 + }, + { + "epoch": 0.08941629857378267, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.578, + "step": 4246 + }, + { + "epoch": 0.08943735752304639, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5456, + "step": 4247 + }, + { + "epoch": 0.08945841647231012, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 4248 + }, + { + "epoch": 0.08947947542157383, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6015, + "step": 4249 + }, + { + "epoch": 0.08950053437083756, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.6103, + "step": 4250 + }, + { + "epoch": 0.08952159332010129, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5965, + "step": 4251 + }, + { + "epoch": 0.08954265226936502, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5916, + "step": 4252 + }, + { + "epoch": 0.08956371121862874, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6145, + "step": 4253 + }, + { + "epoch": 0.08958477016789247, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5745, + "step": 4254 + }, + { + "epoch": 0.0896058291171562, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5524, + "step": 4255 + }, + { + "epoch": 0.08962688806641993, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 4256 + }, + { + "epoch": 0.08964794701568365, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5805, + "step": 4257 + }, + { + "epoch": 0.08966900596494738, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 4258 + }, + { + "epoch": 0.08969006491421111, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5863, + "step": 4259 + }, + { + "epoch": 0.08971112386347484, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6026, + "step": 4260 + }, + { + "epoch": 0.08973218281273856, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5858, + "step": 4261 + }, + { + "epoch": 0.08975324176200228, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5867, + "step": 4262 + }, + { + "epoch": 0.089774300711266, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5746, + "step": 4263 + }, + { + "epoch": 0.08979535966052973, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.5785, + "step": 4264 + }, + { + "epoch": 0.08981641860979346, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5737, + "step": 4265 + }, + { + "epoch": 0.08983747755905719, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5686, + "step": 4266 + }, + { + "epoch": 0.08985853650832092, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5777, + "step": 4267 + }, + { + "epoch": 0.08987959545758464, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6159, + "step": 4268 + }, + { + "epoch": 0.08990065440684837, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 4269 + }, + { + "epoch": 0.0899217133561121, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 4270 + }, + { + "epoch": 0.08994277230537583, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5892, + "step": 4271 + }, + { + "epoch": 0.08996383125463955, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 4272 + }, + { + "epoch": 0.08998489020390328, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5704, + "step": 4273 + }, + { + "epoch": 0.09000594915316701, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5332, + "step": 4274 + }, + { + "epoch": 0.09002700810243074, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.582, + "step": 4275 + }, + { + "epoch": 0.09004806705169445, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5712, + "step": 4276 + }, + { + "epoch": 0.09006912600095818, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5871, + "step": 4277 + }, + { + "epoch": 0.0900901849502219, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 4278 + }, + { + "epoch": 0.09011124389948563, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5682, + "step": 4279 + }, + { + "epoch": 0.09013230284874936, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 4280 + }, + { + "epoch": 0.09015336179801309, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5954, + "step": 4281 + }, + { + "epoch": 0.09017442074727682, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 4282 + }, + { + "epoch": 0.09019547969654054, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 4283 + }, + { + "epoch": 0.09021653864580427, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5681, + "step": 4284 + }, + { + "epoch": 0.090237597595068, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5682, + "step": 4285 + }, + { + "epoch": 0.09025865654433173, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5689, + "step": 4286 + }, + { + "epoch": 0.09027971549359545, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5682, + "step": 4287 + }, + { + "epoch": 0.09030077444285918, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5518, + "step": 4288 + }, + { + "epoch": 0.0903218333921229, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5745, + "step": 4289 + }, + { + "epoch": 0.09034289234138662, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5594, + "step": 4290 + }, + { + "epoch": 0.09036395129065035, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 4291 + }, + { + "epoch": 0.09038501023991408, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6053, + "step": 4292 + }, + { + "epoch": 0.0904060691891778, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5704, + "step": 4293 + }, + { + "epoch": 0.09042712813844153, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 4294 + }, + { + "epoch": 0.09044818708770526, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5498, + "step": 4295 + }, + { + "epoch": 0.09046924603696899, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5899, + "step": 4296 + }, + { + "epoch": 0.09049030498623271, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5929, + "step": 4297 + }, + { + "epoch": 0.09051136393549644, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5682, + "step": 4298 + }, + { + "epoch": 0.09053242288476017, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6108, + "step": 4299 + }, + { + "epoch": 0.0905534818340239, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 4300 + }, + { + "epoch": 0.09057454078328762, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 4301 + }, + { + "epoch": 0.09059559973255134, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 4302 + }, + { + "epoch": 0.09061665868181507, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6013, + "step": 4303 + }, + { + "epoch": 0.09063771763107879, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 4304 + }, + { + "epoch": 0.09065877658034252, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 4305 + }, + { + "epoch": 0.09067983552960625, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5848, + "step": 4306 + }, + { + "epoch": 0.09070089447886998, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5971, + "step": 4307 + }, + { + "epoch": 0.0907219534281337, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5663, + "step": 4308 + }, + { + "epoch": 0.09074301237739743, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6073, + "step": 4309 + }, + { + "epoch": 0.09076407132666116, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5471, + "step": 4310 + }, + { + "epoch": 0.09078513027592489, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5635, + "step": 4311 + }, + { + "epoch": 0.09080618922518861, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5767, + "step": 4312 + }, + { + "epoch": 0.09082724817445234, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5722, + "step": 4313 + }, + { + "epoch": 0.09084830712371607, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5631, + "step": 4314 + }, + { + "epoch": 0.09086936607297978, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 4315 + }, + { + "epoch": 0.09089042502224351, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 4316 + }, + { + "epoch": 0.09091148397150724, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5608, + "step": 4317 + }, + { + "epoch": 0.09093254292077096, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 4318 + }, + { + "epoch": 0.09095360187003469, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5869, + "step": 4319 + }, + { + "epoch": 0.09097466081929842, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 4320 + }, + { + "epoch": 0.09099571976856215, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 4321 + }, + { + "epoch": 0.09101677871782587, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6072, + "step": 4322 + }, + { + "epoch": 0.0910378376670896, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5565, + "step": 4323 + }, + { + "epoch": 0.09105889661635333, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5873, + "step": 4324 + }, + { + "epoch": 0.09107995556561706, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 4325 + }, + { + "epoch": 0.09110101451488078, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5642, + "step": 4326 + }, + { + "epoch": 0.09112207346414451, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.5861, + "step": 4327 + }, + { + "epoch": 0.09114313241340824, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6046, + "step": 4328 + }, + { + "epoch": 0.09116419136267195, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5692, + "step": 4329 + }, + { + "epoch": 0.09118525031193568, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5746, + "step": 4330 + }, + { + "epoch": 0.09120630926119941, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 4331 + }, + { + "epoch": 0.09122736821046314, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5596, + "step": 4332 + }, + { + "epoch": 0.09124842715972686, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5994, + "step": 4333 + }, + { + "epoch": 0.09126948610899059, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5531, + "step": 4334 + }, + { + "epoch": 0.09129054505825432, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5756, + "step": 4335 + }, + { + "epoch": 0.09131160400751805, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5302, + "step": 4336 + }, + { + "epoch": 0.09133266295678177, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 4337 + }, + { + "epoch": 0.0913537219060455, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.561, + "step": 4338 + }, + { + "epoch": 0.09137478085530923, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 4339 + }, + { + "epoch": 0.09139583980457296, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 4340 + }, + { + "epoch": 0.09141689875383668, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5925, + "step": 4341 + }, + { + "epoch": 0.0914379577031004, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.604, + "step": 4342 + }, + { + "epoch": 0.09145901665236413, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5561, + "step": 4343 + }, + { + "epoch": 0.09148007560162785, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5743, + "step": 4344 + }, + { + "epoch": 0.09150113455089158, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5661, + "step": 4345 + }, + { + "epoch": 0.09152219350015531, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5908, + "step": 4346 + }, + { + "epoch": 0.09154325244941904, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6173, + "step": 4347 + }, + { + "epoch": 0.09156431139868276, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6097, + "step": 4348 + }, + { + "epoch": 0.09158537034794649, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5715, + "step": 4349 + }, + { + "epoch": 0.09160642929721022, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 4350 + }, + { + "epoch": 0.09162748824647395, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5826, + "step": 4351 + }, + { + "epoch": 0.09164854719573767, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5455, + "step": 4352 + }, + { + "epoch": 0.0916696061450014, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6097, + "step": 4353 + }, + { + "epoch": 0.09169066509426513, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5569, + "step": 4354 + }, + { + "epoch": 0.09171172404352884, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.552, + "step": 4355 + }, + { + "epoch": 0.09173278299279257, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 4356 + }, + { + "epoch": 0.0917538419420563, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 4357 + }, + { + "epoch": 0.09177490089132002, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.576, + "step": 4358 + }, + { + "epoch": 0.09179595984058375, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5621, + "step": 4359 + }, + { + "epoch": 0.09181701878984748, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5843, + "step": 4360 + }, + { + "epoch": 0.0918380777391112, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5881, + "step": 4361 + }, + { + "epoch": 0.09185913668837493, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5735, + "step": 4362 + }, + { + "epoch": 0.09188019563763866, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6037, + "step": 4363 + }, + { + "epoch": 0.09190125458690239, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5835, + "step": 4364 + }, + { + "epoch": 0.09192231353616612, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.644, + "step": 4365 + }, + { + "epoch": 0.09194337248542984, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5896, + "step": 4366 + }, + { + "epoch": 0.09196443143469357, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6055, + "step": 4367 + }, + { + "epoch": 0.09198549038395729, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5924, + "step": 4368 + }, + { + "epoch": 0.09200654933322101, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5692, + "step": 4369 + }, + { + "epoch": 0.09202760828248474, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5646, + "step": 4370 + }, + { + "epoch": 0.09204866723174847, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5821, + "step": 4371 + }, + { + "epoch": 0.0920697261810122, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5779, + "step": 4372 + }, + { + "epoch": 0.09209078513027592, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 4373 + }, + { + "epoch": 0.09211184407953965, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5565, + "step": 4374 + }, + { + "epoch": 0.09213290302880338, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6021, + "step": 4375 + }, + { + "epoch": 0.0921539619780671, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5647, + "step": 4376 + }, + { + "epoch": 0.09217502092733083, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5942, + "step": 4377 + }, + { + "epoch": 0.09219607987659456, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.572, + "step": 4378 + }, + { + "epoch": 0.09221713882585829, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5381, + "step": 4379 + }, + { + "epoch": 0.09223819777512202, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 4380 + }, + { + "epoch": 0.09225925672438574, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6024, + "step": 4381 + }, + { + "epoch": 0.09228031567364946, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 4382 + }, + { + "epoch": 0.09230137462291318, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5902, + "step": 4383 + }, + { + "epoch": 0.09232243357217691, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 4384 + }, + { + "epoch": 0.09234349252144064, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 4385 + }, + { + "epoch": 0.09236455147070437, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 4386 + }, + { + "epoch": 0.0923856104199681, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5944, + "step": 4387 + }, + { + "epoch": 0.09240666936923182, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5446, + "step": 4388 + }, + { + "epoch": 0.09242772831849555, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5563, + "step": 4389 + }, + { + "epoch": 0.09244878726775928, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5849, + "step": 4390 + }, + { + "epoch": 0.092469846217023, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.5658, + "step": 4391 + }, + { + "epoch": 0.09249090516628673, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.592, + "step": 4392 + }, + { + "epoch": 0.09251196411555046, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5988, + "step": 4393 + }, + { + "epoch": 0.09253302306481419, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5676, + "step": 4394 + }, + { + "epoch": 0.0925540820140779, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 4395 + }, + { + "epoch": 0.09257514096334163, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6029, + "step": 4396 + }, + { + "epoch": 0.09259619991260536, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5774, + "step": 4397 + }, + { + "epoch": 0.09261725886186908, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5541, + "step": 4398 + }, + { + "epoch": 0.09263831781113281, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5892, + "step": 4399 + }, + { + "epoch": 0.09265937676039654, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5816, + "step": 4400 + }, + { + "epoch": 0.09268043570966027, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 4401 + }, + { + "epoch": 0.092701494658924, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6011, + "step": 4402 + }, + { + "epoch": 0.09272255360818772, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 4403 + }, + { + "epoch": 0.09274361255745145, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5817, + "step": 4404 + }, + { + "epoch": 0.09276467150671518, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6085, + "step": 4405 + }, + { + "epoch": 0.0927857304559789, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6069, + "step": 4406 + }, + { + "epoch": 0.09280678940524263, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 4407 + }, + { + "epoch": 0.09282784835450635, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6021, + "step": 4408 + }, + { + "epoch": 0.09284890730377007, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.557, + "step": 4409 + }, + { + "epoch": 0.0928699662530338, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.595, + "step": 4410 + }, + { + "epoch": 0.09289102520229753, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5878, + "step": 4411 + }, + { + "epoch": 0.09291208415156126, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.577, + "step": 4412 + }, + { + "epoch": 0.09293314310082498, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6027, + "step": 4413 + }, + { + "epoch": 0.09295420205008871, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5565, + "step": 4414 + }, + { + "epoch": 0.09297526099935244, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5787, + "step": 4415 + }, + { + "epoch": 0.09299631994861617, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 4416 + }, + { + "epoch": 0.0930173788978799, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6056, + "step": 4417 + }, + { + "epoch": 0.09303843784714362, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5709, + "step": 4418 + }, + { + "epoch": 0.09305949679640735, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5695, + "step": 4419 + }, + { + "epoch": 0.09308055574567108, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5839, + "step": 4420 + }, + { + "epoch": 0.09310161469493479, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5561, + "step": 4421 + }, + { + "epoch": 0.09312267364419852, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 4422 + }, + { + "epoch": 0.09314373259346224, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5913, + "step": 4423 + }, + { + "epoch": 0.09316479154272597, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 4424 + }, + { + "epoch": 0.0931858504919897, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5916, + "step": 4425 + }, + { + "epoch": 0.09320690944125343, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 4426 + }, + { + "epoch": 0.09322796839051715, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 4427 + }, + { + "epoch": 0.09324902733978088, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5809, + "step": 4428 + }, + { + "epoch": 0.09327008628904461, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 1.5798, + "step": 4429 + }, + { + "epoch": 0.09329114523830834, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6048, + "step": 4430 + }, + { + "epoch": 0.09331220418757206, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6299, + "step": 4431 + }, + { + "epoch": 0.09333326313683579, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.5517, + "step": 4432 + }, + { + "epoch": 0.09335432208609952, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 4433 + }, + { + "epoch": 0.09337538103536325, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5896, + "step": 4434 + }, + { + "epoch": 0.09339643998462696, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5919, + "step": 4435 + }, + { + "epoch": 0.09341749893389069, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6042, + "step": 4436 + }, + { + "epoch": 0.09343855788315442, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5872, + "step": 4437 + }, + { + "epoch": 0.09345961683241814, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6116, + "step": 4438 + }, + { + "epoch": 0.09348067578168187, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5594, + "step": 4439 + }, + { + "epoch": 0.0935017347309456, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5887, + "step": 4440 + }, + { + "epoch": 0.09352279368020933, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5727, + "step": 4441 + }, + { + "epoch": 0.09354385262947305, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5568, + "step": 4442 + }, + { + "epoch": 0.09356491157873678, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6175, + "step": 4443 + }, + { + "epoch": 0.09358597052800051, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 1.5872, + "step": 4444 + }, + { + "epoch": 0.09360702947726424, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5557, + "step": 4445 + }, + { + "epoch": 0.09362808842652796, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.596, + "step": 4446 + }, + { + "epoch": 0.09364914737579169, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5563, + "step": 4447 + }, + { + "epoch": 0.0936702063250554, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5726, + "step": 4448 + }, + { + "epoch": 0.09369126527431913, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6028, + "step": 4449 + }, + { + "epoch": 0.09371232422358286, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6025, + "step": 4450 + }, + { + "epoch": 0.09373338317284659, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6078, + "step": 4451 + }, + { + "epoch": 0.09375444212211032, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5658, + "step": 4452 + }, + { + "epoch": 0.09377550107137404, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5285, + "step": 4453 + }, + { + "epoch": 0.09379656002063777, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6155, + "step": 4454 + }, + { + "epoch": 0.0938176189699015, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5906, + "step": 4455 + }, + { + "epoch": 0.09383867791916523, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.598, + "step": 4456 + }, + { + "epoch": 0.09385973686842895, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5892, + "step": 4457 + }, + { + "epoch": 0.09388079581769268, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5531, + "step": 4458 + }, + { + "epoch": 0.09390185476695641, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5717, + "step": 4459 + }, + { + "epoch": 0.09392291371622014, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5693, + "step": 4460 + }, + { + "epoch": 0.09394397266548385, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5939, + "step": 4461 + }, + { + "epoch": 0.09396503161474758, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 4462 + }, + { + "epoch": 0.0939860905640113, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5753, + "step": 4463 + }, + { + "epoch": 0.09400714951327503, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.6176, + "step": 4464 + }, + { + "epoch": 0.09402820846253876, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5771, + "step": 4465 + }, + { + "epoch": 0.09404926741180249, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5715, + "step": 4466 + }, + { + "epoch": 0.09407032636106621, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5499, + "step": 4467 + }, + { + "epoch": 0.09409138531032994, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 4468 + }, + { + "epoch": 0.09411244425959367, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5719, + "step": 4469 + }, + { + "epoch": 0.0941335032088574, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.5692, + "step": 4470 + }, + { + "epoch": 0.09415456215812112, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 4471 + }, + { + "epoch": 0.09417562110738485, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5693, + "step": 4472 + }, + { + "epoch": 0.09419668005664858, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5748, + "step": 4473 + }, + { + "epoch": 0.0942177390059123, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 4474 + }, + { + "epoch": 0.09423879795517602, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 4475 + }, + { + "epoch": 0.09425985690443975, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5678, + "step": 4476 + }, + { + "epoch": 0.09428091585370348, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 4477 + }, + { + "epoch": 0.0943019748029672, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.593, + "step": 4478 + }, + { + "epoch": 0.09432303375223093, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6094, + "step": 4479 + }, + { + "epoch": 0.09434409270149466, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 4480 + }, + { + "epoch": 0.09436515165075839, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5767, + "step": 4481 + }, + { + "epoch": 0.09438621060002211, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.6065, + "step": 4482 + }, + { + "epoch": 0.09440726954928584, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 4483 + }, + { + "epoch": 0.09442832849854957, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5912, + "step": 4484 + }, + { + "epoch": 0.0944493874478133, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5867, + "step": 4485 + }, + { + "epoch": 0.09447044639707702, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6074, + "step": 4486 + }, + { + "epoch": 0.09449150534634075, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5674, + "step": 4487 + }, + { + "epoch": 0.09451256429560446, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 4488 + }, + { + "epoch": 0.09453362324486819, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5848, + "step": 4489 + }, + { + "epoch": 0.09455468219413192, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 4490 + }, + { + "epoch": 0.09457574114339565, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.597, + "step": 4491 + }, + { + "epoch": 0.09459680009265937, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.5679, + "step": 4492 + }, + { + "epoch": 0.0946178590419231, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 4493 + }, + { + "epoch": 0.09463891799118683, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5845, + "step": 4494 + }, + { + "epoch": 0.09465997694045056, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5778, + "step": 4495 + }, + { + "epoch": 0.09468103588971429, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5665, + "step": 4496 + }, + { + "epoch": 0.09470209483897801, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 1.5804, + "step": 4497 + }, + { + "epoch": 0.09472315378824174, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.596, + "step": 4498 + }, + { + "epoch": 0.09474421273750547, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6045, + "step": 4499 + }, + { + "epoch": 0.0947652716867692, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6209, + "step": 4500 + }, + { + "epoch": 0.0947652716867692, + "eval_loss": 2.20310378074646, + "eval_runtime": 897.7013, + "eval_samples_per_second": 68.842, + "eval_steps_per_second": 2.152, + "step": 4500 + }, + { + "epoch": 0.09478633063603291, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5777, + "step": 4501 + }, + { + "epoch": 0.09480738958529664, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 4502 + }, + { + "epoch": 0.09482844853456036, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5977, + "step": 4503 + }, + { + "epoch": 0.09484950748382409, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6173, + "step": 4504 + }, + { + "epoch": 0.09487056643308782, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.568, + "step": 4505 + }, + { + "epoch": 0.09489162538235155, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 4506 + }, + { + "epoch": 0.09491268433161527, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5593, + "step": 4507 + }, + { + "epoch": 0.094933743280879, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6217, + "step": 4508 + }, + { + "epoch": 0.09495480223014273, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5943, + "step": 4509 + }, + { + "epoch": 0.09497586117940646, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5811, + "step": 4510 + }, + { + "epoch": 0.09499692012867018, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5914, + "step": 4511 + }, + { + "epoch": 0.09501797907793391, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5892, + "step": 4512 + }, + { + "epoch": 0.09503903802719764, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5772, + "step": 4513 + }, + { + "epoch": 0.09506009697646135, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5777, + "step": 4514 + }, + { + "epoch": 0.09508115592572508, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 4515 + }, + { + "epoch": 0.09510221487498881, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6134, + "step": 4516 + }, + { + "epoch": 0.09512327382425254, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.6021, + "step": 4517 + }, + { + "epoch": 0.09514433277351626, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5804, + "step": 4518 + }, + { + "epoch": 0.09516539172277999, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5752, + "step": 4519 + }, + { + "epoch": 0.09518645067204372, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 1.5396, + "step": 4520 + }, + { + "epoch": 0.09520750962130745, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5962, + "step": 4521 + }, + { + "epoch": 0.09522856857057117, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5922, + "step": 4522 + }, + { + "epoch": 0.0952496275198349, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5322, + "step": 4523 + }, + { + "epoch": 0.09527068646909863, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 4524 + }, + { + "epoch": 0.09529174541836236, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 4525 + }, + { + "epoch": 0.09531280436762608, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 4526 + }, + { + "epoch": 0.0953338633168898, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5684, + "step": 4527 + }, + { + "epoch": 0.09535492226615352, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6035, + "step": 4528 + }, + { + "epoch": 0.09537598121541725, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5392, + "step": 4529 + }, + { + "epoch": 0.09539704016468098, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6136, + "step": 4530 + }, + { + "epoch": 0.09541809911394471, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.548, + "step": 4531 + }, + { + "epoch": 0.09543915806320843, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5683, + "step": 4532 + }, + { + "epoch": 0.09546021701247216, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5754, + "step": 4533 + }, + { + "epoch": 0.09548127596173589, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6265, + "step": 4534 + }, + { + "epoch": 0.09550233491099962, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 4535 + }, + { + "epoch": 0.09552339386026334, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 4536 + }, + { + "epoch": 0.09554445280952707, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.564, + "step": 4537 + }, + { + "epoch": 0.0955655117587908, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5721, + "step": 4538 + }, + { + "epoch": 0.09558657070805453, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5735, + "step": 4539 + }, + { + "epoch": 0.09560762965731825, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5704, + "step": 4540 + }, + { + "epoch": 0.09562868860658197, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5604, + "step": 4541 + }, + { + "epoch": 0.0956497475558457, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.6378, + "step": 4542 + }, + { + "epoch": 0.09567080650510942, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5357, + "step": 4543 + }, + { + "epoch": 0.09569186545437315, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5763, + "step": 4544 + }, + { + "epoch": 0.09571292440363688, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5945, + "step": 4545 + }, + { + "epoch": 0.0957339833529006, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 4546 + }, + { + "epoch": 0.09575504230216433, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 4547 + }, + { + "epoch": 0.09577610125142806, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5555, + "step": 4548 + }, + { + "epoch": 0.09579716020069179, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5852, + "step": 4549 + }, + { + "epoch": 0.09581821914995552, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.6237, + "step": 4550 + }, + { + "epoch": 0.09583927809921924, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.58, + "step": 4551 + }, + { + "epoch": 0.09586033704848297, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5872, + "step": 4552 + }, + { + "epoch": 0.0958813959977467, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5679, + "step": 4553 + }, + { + "epoch": 0.09590245494701041, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5715, + "step": 4554 + }, + { + "epoch": 0.09592351389627414, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5677, + "step": 4555 + }, + { + "epoch": 0.09594457284553787, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 4556 + }, + { + "epoch": 0.0959656317948016, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 4557 + }, + { + "epoch": 0.09598669074406532, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5865, + "step": 4558 + }, + { + "epoch": 0.09600774969332905, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5759, + "step": 4559 + }, + { + "epoch": 0.09602880864259278, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5873, + "step": 4560 + }, + { + "epoch": 0.0960498675918565, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.576, + "step": 4561 + }, + { + "epoch": 0.09607092654112023, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.582, + "step": 4562 + }, + { + "epoch": 0.09609198549038396, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5849, + "step": 4563 + }, + { + "epoch": 0.09611304443964769, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 1.5598, + "step": 4564 + }, + { + "epoch": 0.09613410338891142, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5787, + "step": 4565 + }, + { + "epoch": 0.09615516233817514, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.577, + "step": 4566 + }, + { + "epoch": 0.09617622128743886, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 4567 + }, + { + "epoch": 0.09619728023670258, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5258, + "step": 4568 + }, + { + "epoch": 0.09621833918596631, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5818, + "step": 4569 + }, + { + "epoch": 0.09623939813523004, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5556, + "step": 4570 + }, + { + "epoch": 0.09626045708449377, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5896, + "step": 4571 + }, + { + "epoch": 0.0962815160337575, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 4572 + }, + { + "epoch": 0.09630257498302122, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5954, + "step": 4573 + }, + { + "epoch": 0.09632363393228495, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5748, + "step": 4574 + }, + { + "epoch": 0.09634469288154868, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6214, + "step": 4575 + }, + { + "epoch": 0.0963657518308124, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5914, + "step": 4576 + }, + { + "epoch": 0.09638681078007613, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 4577 + }, + { + "epoch": 0.09640786972933986, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5502, + "step": 4578 + }, + { + "epoch": 0.09642892867860359, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5459, + "step": 4579 + }, + { + "epoch": 0.0964499876278673, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5479, + "step": 4580 + }, + { + "epoch": 0.09647104657713103, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5754, + "step": 4581 + }, + { + "epoch": 0.09649210552639476, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6126, + "step": 4582 + }, + { + "epoch": 0.09651316447565848, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 4583 + }, + { + "epoch": 0.09653422342492221, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5848, + "step": 4584 + }, + { + "epoch": 0.09655528237418594, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 4585 + }, + { + "epoch": 0.09657634132344967, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5953, + "step": 4586 + }, + { + "epoch": 0.0965974002727134, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6005, + "step": 4587 + }, + { + "epoch": 0.09661845922197712, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5955, + "step": 4588 + }, + { + "epoch": 0.09663951817124085, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 4589 + }, + { + "epoch": 0.09666057712050458, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6168, + "step": 4590 + }, + { + "epoch": 0.0966816360697683, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 4591 + }, + { + "epoch": 0.09670269501903203, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5675, + "step": 4592 + }, + { + "epoch": 0.09672375396829576, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 4593 + }, + { + "epoch": 0.09674481291755947, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6295, + "step": 4594 + }, + { + "epoch": 0.0967658718668232, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5569, + "step": 4595 + }, + { + "epoch": 0.09678693081608693, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5529, + "step": 4596 + }, + { + "epoch": 0.09680798976535065, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5818, + "step": 4597 + }, + { + "epoch": 0.09682904871461438, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.594, + "step": 4598 + }, + { + "epoch": 0.09685010766387811, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6204, + "step": 4599 + }, + { + "epoch": 0.09687116661314184, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5707, + "step": 4600 + }, + { + "epoch": 0.09689222556240557, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5689, + "step": 4601 + }, + { + "epoch": 0.09691328451166929, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6196, + "step": 4602 + }, + { + "epoch": 0.09693434346093302, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 4603 + }, + { + "epoch": 0.09695540241019675, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5882, + "step": 4604 + }, + { + "epoch": 0.09697646135946048, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 1.5801, + "step": 4605 + }, + { + "epoch": 0.0969975203087242, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6349, + "step": 4606 + }, + { + "epoch": 0.09701857925798792, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.616, + "step": 4607 + }, + { + "epoch": 0.09703963820725164, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5849, + "step": 4608 + }, + { + "epoch": 0.09706069715651537, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6066, + "step": 4609 + }, + { + "epoch": 0.0970817561057791, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5787, + "step": 4610 + }, + { + "epoch": 0.09710281505504283, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5864, + "step": 4611 + }, + { + "epoch": 0.09712387400430655, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5608, + "step": 4612 + }, + { + "epoch": 0.09714493295357028, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5856, + "step": 4613 + }, + { + "epoch": 0.09716599190283401, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 4614 + }, + { + "epoch": 0.09718705085209774, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6034, + "step": 4615 + }, + { + "epoch": 0.09720810980136146, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 4616 + }, + { + "epoch": 0.09722916875062519, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.582, + "step": 4617 + }, + { + "epoch": 0.09725022769988892, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5759, + "step": 4618 + }, + { + "epoch": 0.09727128664915265, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5954, + "step": 4619 + }, + { + "epoch": 0.09729234559841636, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6123, + "step": 4620 + }, + { + "epoch": 0.09731340454768009, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5896, + "step": 4621 + }, + { + "epoch": 0.09733446349694382, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5558, + "step": 4622 + }, + { + "epoch": 0.09735552244620754, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5939, + "step": 4623 + }, + { + "epoch": 0.09737658139547127, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5532, + "step": 4624 + }, + { + "epoch": 0.097397640344735, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.573, + "step": 4625 + }, + { + "epoch": 0.09741869929399873, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.6013, + "step": 4626 + }, + { + "epoch": 0.09743975824326245, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5944, + "step": 4627 + }, + { + "epoch": 0.09746081719252618, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.6027, + "step": 4628 + }, + { + "epoch": 0.09748187614178991, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5818, + "step": 4629 + }, + { + "epoch": 0.09750293509105364, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 4630 + }, + { + "epoch": 0.09752399404031736, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5585, + "step": 4631 + }, + { + "epoch": 0.09754505298958109, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6072, + "step": 4632 + }, + { + "epoch": 0.0975661119388448, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5901, + "step": 4633 + }, + { + "epoch": 0.09758717088810853, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 4634 + }, + { + "epoch": 0.09760822983737226, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5717, + "step": 4635 + }, + { + "epoch": 0.09762928878663599, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5918, + "step": 4636 + }, + { + "epoch": 0.09765034773589971, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.609, + "step": 4637 + }, + { + "epoch": 0.09767140668516344, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5811, + "step": 4638 + }, + { + "epoch": 0.09769246563442717, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5918, + "step": 4639 + }, + { + "epoch": 0.0977135245836909, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 4640 + }, + { + "epoch": 0.09773458353295462, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5984, + "step": 4641 + }, + { + "epoch": 0.09775564248221835, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6066, + "step": 4642 + }, + { + "epoch": 0.09777670143148208, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 4643 + }, + { + "epoch": 0.09779776038074581, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5615, + "step": 4644 + }, + { + "epoch": 0.09781881933000953, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 4645 + }, + { + "epoch": 0.09783987827927326, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5764, + "step": 4646 + }, + { + "epoch": 0.09786093722853698, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5461, + "step": 4647 + }, + { + "epoch": 0.0978819961778007, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 4648 + }, + { + "epoch": 0.09790305512706443, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5613, + "step": 4649 + }, + { + "epoch": 0.09792411407632816, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5715, + "step": 4650 + }, + { + "epoch": 0.09794517302559189, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5733, + "step": 4651 + }, + { + "epoch": 0.09796623197485561, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 4652 + }, + { + "epoch": 0.09798729092411934, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 4653 + }, + { + "epoch": 0.09800834987338307, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 4654 + }, + { + "epoch": 0.0980294088226468, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5894, + "step": 4655 + }, + { + "epoch": 0.09805046777191052, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5726, + "step": 4656 + }, + { + "epoch": 0.09807152672117425, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 4657 + }, + { + "epoch": 0.09809258567043798, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5716, + "step": 4658 + }, + { + "epoch": 0.0981136446197017, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5963, + "step": 4659 + }, + { + "epoch": 0.09813470356896542, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.585, + "step": 4660 + }, + { + "epoch": 0.09815576251822915, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5869, + "step": 4661 + }, + { + "epoch": 0.09817682146749288, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 4662 + }, + { + "epoch": 0.0981978804167566, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5731, + "step": 4663 + }, + { + "epoch": 0.09821893936602033, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 1.57, + "step": 4664 + }, + { + "epoch": 0.09823999831528406, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 4665 + }, + { + "epoch": 0.09826105726454779, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 4666 + }, + { + "epoch": 0.09828211621381151, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 4667 + }, + { + "epoch": 0.09830317516307524, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 4668 + }, + { + "epoch": 0.09832423411233897, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 4669 + }, + { + "epoch": 0.0983452930616027, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5924, + "step": 4670 + }, + { + "epoch": 0.09836635201086642, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 4671 + }, + { + "epoch": 0.09838741096013015, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5759, + "step": 4672 + }, + { + "epoch": 0.09840846990939386, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6126, + "step": 4673 + }, + { + "epoch": 0.09842952885865759, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5952, + "step": 4674 + }, + { + "epoch": 0.09845058780792132, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6463, + "step": 4675 + }, + { + "epoch": 0.09847164675718505, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5535, + "step": 4676 + }, + { + "epoch": 0.09849270570644877, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 4677 + }, + { + "epoch": 0.0985137646557125, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6102, + "step": 4678 + }, + { + "epoch": 0.09853482360497623, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5682, + "step": 4679 + }, + { + "epoch": 0.09855588255423996, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5942, + "step": 4680 + }, + { + "epoch": 0.09857694150350368, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5953, + "step": 4681 + }, + { + "epoch": 0.09859800045276741, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 4682 + }, + { + "epoch": 0.09861905940203114, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5425, + "step": 4683 + }, + { + "epoch": 0.09864011835129487, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 1.57, + "step": 4684 + }, + { + "epoch": 0.0986611773005586, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5671, + "step": 4685 + }, + { + "epoch": 0.09868223624982231, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 4686 + }, + { + "epoch": 0.09870329519908604, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 1.5621, + "step": 4687 + }, + { + "epoch": 0.09872435414834976, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5489, + "step": 4688 + }, + { + "epoch": 0.09874541309761349, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6083, + "step": 4689 + }, + { + "epoch": 0.09876647204687722, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6168, + "step": 4690 + }, + { + "epoch": 0.09878753099614095, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5597, + "step": 4691 + }, + { + "epoch": 0.09880858994540467, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.5965, + "step": 4692 + }, + { + "epoch": 0.0988296488946684, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.5732, + "step": 4693 + }, + { + "epoch": 0.09885070784393213, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6013, + "step": 4694 + }, + { + "epoch": 0.09887176679319586, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 4695 + }, + { + "epoch": 0.09889282574245958, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5682, + "step": 4696 + }, + { + "epoch": 0.09891388469172331, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5736, + "step": 4697 + }, + { + "epoch": 0.09893494364098704, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5798, + "step": 4698 + }, + { + "epoch": 0.09895600259025077, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5542, + "step": 4699 + }, + { + "epoch": 0.09897706153951448, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5595, + "step": 4700 + }, + { + "epoch": 0.09899812048877821, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 4701 + }, + { + "epoch": 0.09901917943804193, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5723, + "step": 4702 + }, + { + "epoch": 0.09904023838730566, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5936, + "step": 4703 + }, + { + "epoch": 0.09906129733656939, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5798, + "step": 4704 + }, + { + "epoch": 0.09908235628583312, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5809, + "step": 4705 + }, + { + "epoch": 0.09910341523509685, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6059, + "step": 4706 + }, + { + "epoch": 0.09912447418436057, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5966, + "step": 4707 + }, + { + "epoch": 0.0991455331336243, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 4708 + }, + { + "epoch": 0.09916659208288803, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6041, + "step": 4709 + }, + { + "epoch": 0.09918765103215176, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6137, + "step": 4710 + }, + { + "epoch": 0.09920870998141548, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6007, + "step": 4711 + }, + { + "epoch": 0.09922976893067921, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 4712 + }, + { + "epoch": 0.09925082787994292, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 4713 + }, + { + "epoch": 0.09927188682920665, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5933, + "step": 4714 + }, + { + "epoch": 0.09929294577847038, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 4715 + }, + { + "epoch": 0.0993140047277341, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 4716 + }, + { + "epoch": 0.09933506367699783, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6282, + "step": 4717 + }, + { + "epoch": 0.09935612262626156, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5573, + "step": 4718 + }, + { + "epoch": 0.09937718157552529, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.543, + "step": 4719 + }, + { + "epoch": 0.09939824052478902, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 4720 + }, + { + "epoch": 0.09941929947405274, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5742, + "step": 4721 + }, + { + "epoch": 0.09944035842331647, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5938, + "step": 4722 + }, + { + "epoch": 0.0994614173725802, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5726, + "step": 4723 + }, + { + "epoch": 0.09948247632184393, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.6096, + "step": 4724 + }, + { + "epoch": 0.09950353527110765, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5797, + "step": 4725 + }, + { + "epoch": 0.09952459422037137, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5553, + "step": 4726 + }, + { + "epoch": 0.0995456531696351, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.575, + "step": 4727 + }, + { + "epoch": 0.09956671211889882, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5962, + "step": 4728 + }, + { + "epoch": 0.09958777106816255, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 4729 + }, + { + "epoch": 0.09960883001742628, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5727, + "step": 4730 + }, + { + "epoch": 0.09962988896669, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5721, + "step": 4731 + }, + { + "epoch": 0.09965094791595373, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5733, + "step": 4732 + }, + { + "epoch": 0.09967200686521746, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5696, + "step": 4733 + }, + { + "epoch": 0.09969306581448119, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5922, + "step": 4734 + }, + { + "epoch": 0.09971412476374492, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5753, + "step": 4735 + }, + { + "epoch": 0.09973518371300864, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5872, + "step": 4736 + }, + { + "epoch": 0.09975624266227237, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6019, + "step": 4737 + }, + { + "epoch": 0.0997773016115361, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.582, + "step": 4738 + }, + { + "epoch": 0.09979836056079981, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5436, + "step": 4739 + }, + { + "epoch": 0.09981941951006354, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6001, + "step": 4740 + }, + { + "epoch": 0.09984047845932727, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5918, + "step": 4741 + }, + { + "epoch": 0.099861537408591, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 4742 + }, + { + "epoch": 0.09988259635785472, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 4743 + }, + { + "epoch": 0.09990365530711845, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5778, + "step": 4744 + }, + { + "epoch": 0.09992471425638218, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5532, + "step": 4745 + }, + { + "epoch": 0.0999457732056459, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5815, + "step": 4746 + }, + { + "epoch": 0.09996683215490963, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 4747 + }, + { + "epoch": 0.09998789110417336, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 4748 + }, + { + "epoch": 0.10000895005343709, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6022, + "step": 4749 + }, + { + "epoch": 0.10003000900270081, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 4750 + }, + { + "epoch": 0.10005106795196454, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5666, + "step": 4751 + }, + { + "epoch": 0.10007212690122827, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5604, + "step": 4752 + }, + { + "epoch": 0.10009318585049198, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 4753 + }, + { + "epoch": 0.10011424479975571, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.564, + "step": 4754 + }, + { + "epoch": 0.10013530374901944, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6102, + "step": 4755 + }, + { + "epoch": 0.10015636269828317, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 4756 + }, + { + "epoch": 0.1001774216475469, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5757, + "step": 4757 + }, + { + "epoch": 0.10019848059681062, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 4758 + }, + { + "epoch": 0.10021953954607435, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5774, + "step": 4759 + }, + { + "epoch": 0.10024059849533808, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5845, + "step": 4760 + }, + { + "epoch": 0.1002616574446018, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5893, + "step": 4761 + }, + { + "epoch": 0.10028271639386553, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5514, + "step": 4762 + }, + { + "epoch": 0.10030377534312926, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5684, + "step": 4763 + }, + { + "epoch": 0.10032483429239299, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5916, + "step": 4764 + }, + { + "epoch": 0.10034589324165671, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5913, + "step": 4765 + }, + { + "epoch": 0.10036695219092043, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5664, + "step": 4766 + }, + { + "epoch": 0.10038801114018416, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5855, + "step": 4767 + }, + { + "epoch": 0.10040907008944788, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6303, + "step": 4768 + }, + { + "epoch": 0.10043012903871161, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5444, + "step": 4769 + }, + { + "epoch": 0.10045118798797534, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5704, + "step": 4770 + }, + { + "epoch": 0.10047224693723907, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5639, + "step": 4771 + }, + { + "epoch": 0.10049330588650279, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5744, + "step": 4772 + }, + { + "epoch": 0.10051436483576652, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 4773 + }, + { + "epoch": 0.10053542378503025, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5711, + "step": 4774 + }, + { + "epoch": 0.10055648273429398, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 1.6102, + "step": 4775 + }, + { + "epoch": 0.1005775416835577, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 4776 + }, + { + "epoch": 0.10059860063282143, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5531, + "step": 4777 + }, + { + "epoch": 0.10061965958208516, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5953, + "step": 4778 + }, + { + "epoch": 0.10064071853134887, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 4779 + }, + { + "epoch": 0.1006617774806126, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5454, + "step": 4780 + }, + { + "epoch": 0.10068283642987633, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5686, + "step": 4781 + }, + { + "epoch": 0.10070389537914005, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 4782 + }, + { + "epoch": 0.10072495432840378, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5818, + "step": 4783 + }, + { + "epoch": 0.10074601327766751, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6072, + "step": 4784 + }, + { + "epoch": 0.10076707222693124, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5546, + "step": 4785 + }, + { + "epoch": 0.10078813117619496, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5741, + "step": 4786 + }, + { + "epoch": 0.10080919012545869, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6379, + "step": 4787 + }, + { + "epoch": 0.10083024907472242, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6138, + "step": 4788 + }, + { + "epoch": 0.10085130802398615, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5782, + "step": 4789 + }, + { + "epoch": 0.10087236697324987, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5807, + "step": 4790 + }, + { + "epoch": 0.1008934259225136, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5546, + "step": 4791 + }, + { + "epoch": 0.10091448487177732, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 4792 + }, + { + "epoch": 0.10093554382104104, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5676, + "step": 4793 + }, + { + "epoch": 0.10095660277030477, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 4794 + }, + { + "epoch": 0.1009776617195685, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 4795 + }, + { + "epoch": 0.10099872066883223, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5671, + "step": 4796 + }, + { + "epoch": 0.10101977961809595, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.6046, + "step": 4797 + }, + { + "epoch": 0.10104083856735968, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 4798 + }, + { + "epoch": 0.10106189751662341, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5986, + "step": 4799 + }, + { + "epoch": 0.10108295646588714, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6103, + "step": 4800 + }, + { + "epoch": 0.10110401541515086, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5506, + "step": 4801 + }, + { + "epoch": 0.10112507436441459, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5817, + "step": 4802 + }, + { + "epoch": 0.10114613331367832, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 4803 + }, + { + "epoch": 0.10116719226294205, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5832, + "step": 4804 + }, + { + "epoch": 0.10118825121220577, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.58, + "step": 4805 + }, + { + "epoch": 0.10120931016146949, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5767, + "step": 4806 + }, + { + "epoch": 0.10123036911073321, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5936, + "step": 4807 + }, + { + "epoch": 0.10125142805999694, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 4808 + }, + { + "epoch": 0.10127248700926067, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 4809 + }, + { + "epoch": 0.1012935459585244, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5674, + "step": 4810 + }, + { + "epoch": 0.10131460490778812, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 4811 + }, + { + "epoch": 0.10133566385705185, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.622, + "step": 4812 + }, + { + "epoch": 0.10135672280631558, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 1.575, + "step": 4813 + }, + { + "epoch": 0.10137778175557931, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5611, + "step": 4814 + }, + { + "epoch": 0.10139884070484304, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 4815 + }, + { + "epoch": 0.10141989965410676, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 1.5743, + "step": 4816 + }, + { + "epoch": 0.10144095860337049, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5478, + "step": 4817 + }, + { + "epoch": 0.10146201755263422, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5565, + "step": 4818 + }, + { + "epoch": 0.10148307650189793, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5672, + "step": 4819 + }, + { + "epoch": 0.10150413545116166, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5675, + "step": 4820 + }, + { + "epoch": 0.10152519440042539, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5764, + "step": 4821 + }, + { + "epoch": 0.10154625334968911, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 4822 + }, + { + "epoch": 0.10156731229895284, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5642, + "step": 4823 + }, + { + "epoch": 0.10158837124821657, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5429, + "step": 4824 + }, + { + "epoch": 0.1016094301974803, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.583, + "step": 4825 + }, + { + "epoch": 0.10163048914674402, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5974, + "step": 4826 + }, + { + "epoch": 0.10165154809600775, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.594, + "step": 4827 + }, + { + "epoch": 0.10167260704527148, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 4828 + }, + { + "epoch": 0.1016936659945352, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 4829 + }, + { + "epoch": 0.10171472494379893, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5943, + "step": 4830 + }, + { + "epoch": 0.10173578389306266, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 1.5532, + "step": 4831 + }, + { + "epoch": 0.10175684284232638, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6086, + "step": 4832 + }, + { + "epoch": 0.1017779017915901, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5628, + "step": 4833 + }, + { + "epoch": 0.10179896074085383, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.592, + "step": 4834 + }, + { + "epoch": 0.10182001969011756, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5747, + "step": 4835 + }, + { + "epoch": 0.10184107863938129, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5881, + "step": 4836 + }, + { + "epoch": 0.10186213758864501, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6124, + "step": 4837 + }, + { + "epoch": 0.10188319653790874, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5608, + "step": 4838 + }, + { + "epoch": 0.10190425548717247, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 4839 + }, + { + "epoch": 0.1019253144364362, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5581, + "step": 4840 + }, + { + "epoch": 0.10194637338569992, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 4841 + }, + { + "epoch": 0.10196743233496365, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5977, + "step": 4842 + }, + { + "epoch": 0.10198849128422738, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5574, + "step": 4843 + }, + { + "epoch": 0.1020095502334911, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.603, + "step": 4844 + }, + { + "epoch": 0.10203060918275483, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5731, + "step": 4845 + }, + { + "epoch": 0.10205166813201855, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6153, + "step": 4846 + }, + { + "epoch": 0.10207272708128227, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5863, + "step": 4847 + }, + { + "epoch": 0.102093786030546, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 4848 + }, + { + "epoch": 0.10211484497980973, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5815, + "step": 4849 + }, + { + "epoch": 0.10213590392907346, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5704, + "step": 4850 + }, + { + "epoch": 0.10215696287833718, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5774, + "step": 4851 + }, + { + "epoch": 0.10217802182760091, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5596, + "step": 4852 + }, + { + "epoch": 0.10219908077686464, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5873, + "step": 4853 + }, + { + "epoch": 0.10222013972612837, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 4854 + }, + { + "epoch": 0.1022411986753921, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5588, + "step": 4855 + }, + { + "epoch": 0.10226225762465582, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 4856 + }, + { + "epoch": 0.10228331657391955, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.597, + "step": 4857 + }, + { + "epoch": 0.10230437552318328, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.578, + "step": 4858 + }, + { + "epoch": 0.10232543447244699, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.567, + "step": 4859 + }, + { + "epoch": 0.10234649342171072, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 4860 + }, + { + "epoch": 0.10236755237097445, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 4861 + }, + { + "epoch": 0.10238861132023817, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.609, + "step": 4862 + }, + { + "epoch": 0.1024096702695019, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5454, + "step": 4863 + }, + { + "epoch": 0.10243072921876563, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5611, + "step": 4864 + }, + { + "epoch": 0.10245178816802936, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6253, + "step": 4865 + }, + { + "epoch": 0.10247284711729308, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5831, + "step": 4866 + }, + { + "epoch": 0.10249390606655681, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5754, + "step": 4867 + }, + { + "epoch": 0.10251496501582054, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 4868 + }, + { + "epoch": 0.10253602396508427, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5743, + "step": 4869 + }, + { + "epoch": 0.102557082914348, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5945, + "step": 4870 + }, + { + "epoch": 0.10257814186361172, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 1.5677, + "step": 4871 + }, + { + "epoch": 0.10259920081287544, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6344, + "step": 4872 + }, + { + "epoch": 0.10262025976213916, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6317, + "step": 4873 + }, + { + "epoch": 0.10264131871140289, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5644, + "step": 4874 + }, + { + "epoch": 0.10266237766066662, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6169, + "step": 4875 + }, + { + "epoch": 0.10268343660993035, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5953, + "step": 4876 + }, + { + "epoch": 0.10270449555919407, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5414, + "step": 4877 + }, + { + "epoch": 0.1027255545084578, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5563, + "step": 4878 + }, + { + "epoch": 0.10274661345772153, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5852, + "step": 4879 + }, + { + "epoch": 0.10276767240698526, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5693, + "step": 4880 + }, + { + "epoch": 0.10278873135624898, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5896, + "step": 4881 + }, + { + "epoch": 0.10280979030551271, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 4882 + }, + { + "epoch": 0.10283084925477644, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 4883 + }, + { + "epoch": 0.10285190820404017, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5803, + "step": 4884 + }, + { + "epoch": 0.10287296715330388, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5848, + "step": 4885 + }, + { + "epoch": 0.1028940261025676, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5559, + "step": 4886 + }, + { + "epoch": 0.10291508505183133, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5638, + "step": 4887 + }, + { + "epoch": 0.10293614400109506, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5466, + "step": 4888 + }, + { + "epoch": 0.10295720295035879, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 1.6031, + "step": 4889 + }, + { + "epoch": 0.10297826189962252, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.5845, + "step": 4890 + }, + { + "epoch": 0.10299932084888624, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.602, + "step": 4891 + }, + { + "epoch": 0.10302037979814997, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5532, + "step": 4892 + }, + { + "epoch": 0.1030414387474137, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5861, + "step": 4893 + }, + { + "epoch": 0.10306249769667743, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.6094, + "step": 4894 + }, + { + "epoch": 0.10308355664594115, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 4895 + }, + { + "epoch": 0.10310461559520488, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 1.6062, + "step": 4896 + }, + { + "epoch": 0.10312567454446861, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.558, + "step": 4897 + }, + { + "epoch": 0.10314673349373234, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 4898 + }, + { + "epoch": 0.10316779244299605, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 1.5385, + "step": 4899 + }, + { + "epoch": 0.10318885139225978, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5819, + "step": 4900 + }, + { + "epoch": 0.1032099103415235, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5955, + "step": 4901 + }, + { + "epoch": 0.10323096929078723, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.5784, + "step": 4902 + }, + { + "epoch": 0.10325202824005096, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5531, + "step": 4903 + }, + { + "epoch": 0.10327308718931469, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5678, + "step": 4904 + }, + { + "epoch": 0.10329414613857842, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.585, + "step": 4905 + }, + { + "epoch": 0.10331520508784214, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 4906 + }, + { + "epoch": 0.10333626403710587, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 4907 + }, + { + "epoch": 0.1033573229863696, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6001, + "step": 4908 + }, + { + "epoch": 0.10337838193563333, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5692, + "step": 4909 + }, + { + "epoch": 0.10339944088489705, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5904, + "step": 4910 + }, + { + "epoch": 0.10342049983416078, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.594, + "step": 4911 + }, + { + "epoch": 0.1034415587834245, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5801, + "step": 4912 + }, + { + "epoch": 0.10346261773268822, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5826, + "step": 4913 + }, + { + "epoch": 0.10348367668195195, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.569, + "step": 4914 + }, + { + "epoch": 0.10350473563121568, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5578, + "step": 4915 + }, + { + "epoch": 0.1035257945804794, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5837, + "step": 4916 + }, + { + "epoch": 0.10354685352974313, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 4917 + }, + { + "epoch": 0.10356791247900686, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5623, + "step": 4918 + }, + { + "epoch": 0.10358897142827059, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5646, + "step": 4919 + }, + { + "epoch": 0.10361003037753432, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5646, + "step": 4920 + }, + { + "epoch": 0.10363108932679804, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.596, + "step": 4921 + }, + { + "epoch": 0.10365214827606177, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 4922 + }, + { + "epoch": 0.1036732072253255, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5872, + "step": 4923 + }, + { + "epoch": 0.10369426617458923, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5703, + "step": 4924 + }, + { + "epoch": 0.10371532512385294, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5795, + "step": 4925 + }, + { + "epoch": 0.10373638407311667, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5715, + "step": 4926 + }, + { + "epoch": 0.1037574430223804, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 4927 + }, + { + "epoch": 0.10377850197164412, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 4928 + }, + { + "epoch": 0.10379956092090785, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5746, + "step": 4929 + }, + { + "epoch": 0.10382061987017158, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5695, + "step": 4930 + }, + { + "epoch": 0.1038416788194353, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5428, + "step": 4931 + }, + { + "epoch": 0.10386273776869903, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 4932 + }, + { + "epoch": 0.10388379671796276, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5604, + "step": 4933 + }, + { + "epoch": 0.10390485566722649, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5352, + "step": 4934 + }, + { + "epoch": 0.10392591461649021, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5679, + "step": 4935 + }, + { + "epoch": 0.10394697356575394, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 1.5609, + "step": 4936 + }, + { + "epoch": 0.10396803251501767, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5896, + "step": 4937 + }, + { + "epoch": 0.10398909146428138, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 4938 + }, + { + "epoch": 0.10401015041354511, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5784, + "step": 4939 + }, + { + "epoch": 0.10403120936280884, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6108, + "step": 4940 + }, + { + "epoch": 0.10405226831207257, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 4941 + }, + { + "epoch": 0.1040733272613363, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 4942 + }, + { + "epoch": 0.10409438621060002, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.6042, + "step": 4943 + }, + { + "epoch": 0.10411544515986375, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5785, + "step": 4944 + }, + { + "epoch": 0.10413650410912748, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5824, + "step": 4945 + }, + { + "epoch": 0.1041575630583912, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5756, + "step": 4946 + }, + { + "epoch": 0.10417862200765493, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.592, + "step": 4947 + }, + { + "epoch": 0.10419968095691866, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 4948 + }, + { + "epoch": 0.10422073990618239, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6209, + "step": 4949 + }, + { + "epoch": 0.10424179885544611, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5519, + "step": 4950 + }, + { + "epoch": 0.10426285780470984, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5642, + "step": 4951 + }, + { + "epoch": 0.10428391675397355, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.593, + "step": 4952 + }, + { + "epoch": 0.10430497570323728, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 4953 + }, + { + "epoch": 0.10432603465250101, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 4954 + }, + { + "epoch": 0.10434709360176474, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5626, + "step": 4955 + }, + { + "epoch": 0.10436815255102846, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5514, + "step": 4956 + }, + { + "epoch": 0.10438921150029219, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5662, + "step": 4957 + }, + { + "epoch": 0.10441027044955592, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6007, + "step": 4958 + }, + { + "epoch": 0.10443132939881965, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5734, + "step": 4959 + }, + { + "epoch": 0.10445238834808337, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5737, + "step": 4960 + }, + { + "epoch": 0.1044734472973471, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5901, + "step": 4961 + }, + { + "epoch": 0.10449450624661083, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5684, + "step": 4962 + }, + { + "epoch": 0.10451556519587456, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5872, + "step": 4963 + }, + { + "epoch": 0.10453662414513828, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 4964 + }, + { + "epoch": 0.104557683094402, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 4965 + }, + { + "epoch": 0.10457874204366573, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6145, + "step": 4966 + }, + { + "epoch": 0.10459980099292945, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6245, + "step": 4967 + }, + { + "epoch": 0.10462085994219318, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 1.5643, + "step": 4968 + }, + { + "epoch": 0.10464191889145691, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5588, + "step": 4969 + }, + { + "epoch": 0.10466297784072064, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6144, + "step": 4970 + }, + { + "epoch": 0.10468403678998436, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5883, + "step": 4971 + }, + { + "epoch": 0.10470509573924809, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6145, + "step": 4972 + }, + { + "epoch": 0.10472615468851182, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.577, + "step": 4973 + }, + { + "epoch": 0.10474721363777555, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5911, + "step": 4974 + }, + { + "epoch": 0.10476827258703927, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6318, + "step": 4975 + }, + { + "epoch": 0.104789331536303, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5619, + "step": 4976 + }, + { + "epoch": 0.10481039048556673, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5573, + "step": 4977 + }, + { + "epoch": 0.10483144943483044, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5739, + "step": 4978 + }, + { + "epoch": 0.10485250838409417, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5615, + "step": 4979 + }, + { + "epoch": 0.1048735673333579, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5528, + "step": 4980 + }, + { + "epoch": 0.10489462628262163, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5809, + "step": 4981 + }, + { + "epoch": 0.10491568523188535, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 4982 + }, + { + "epoch": 0.10493674418114908, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5721, + "step": 4983 + }, + { + "epoch": 0.10495780313041281, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6004, + "step": 4984 + }, + { + "epoch": 0.10497886207967654, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.619, + "step": 4985 + }, + { + "epoch": 0.10499992102894026, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 4986 + }, + { + "epoch": 0.10502097997820399, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5847, + "step": 4987 + }, + { + "epoch": 0.10504203892746772, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5467, + "step": 4988 + }, + { + "epoch": 0.10506309787673145, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.566, + "step": 4989 + }, + { + "epoch": 0.10508415682599517, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 4990 + }, + { + "epoch": 0.10510521577525889, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5939, + "step": 4991 + }, + { + "epoch": 0.10512627472452261, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5902, + "step": 4992 + }, + { + "epoch": 0.10514733367378634, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6007, + "step": 4993 + }, + { + "epoch": 0.10516839262305007, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 4994 + }, + { + "epoch": 0.1051894515723138, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 1.5839, + "step": 4995 + }, + { + "epoch": 0.10521051052157752, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 4996 + }, + { + "epoch": 0.10523156947084125, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5983, + "step": 4997 + }, + { + "epoch": 0.10525262842010498, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5936, + "step": 4998 + }, + { + "epoch": 0.10527368736936871, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5952, + "step": 4999 + }, + { + "epoch": 0.10529474631863243, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5795, + "step": 5000 + }, + { + "epoch": 0.10529474631863243, + "eval_loss": 2.2983527183532715, + "eval_runtime": 908.9901, + "eval_samples_per_second": 67.988, + "eval_steps_per_second": 2.125, + "step": 5000 + }, + { + "epoch": 0.10531580526789616, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 5001 + }, + { + "epoch": 0.10533686421715989, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5572, + "step": 5002 + }, + { + "epoch": 0.10535792316642362, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 5003 + }, + { + "epoch": 0.10537898211568734, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5688, + "step": 5004 + }, + { + "epoch": 0.10540004106495106, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.552, + "step": 5005 + }, + { + "epoch": 0.10542110001421479, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5849, + "step": 5006 + }, + { + "epoch": 0.10544215896347851, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 5007 + }, + { + "epoch": 0.10546321791274224, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 5008 + }, + { + "epoch": 0.10548427686200597, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 5009 + }, + { + "epoch": 0.1055053358112697, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6083, + "step": 5010 + }, + { + "epoch": 0.10552639476053342, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.6053, + "step": 5011 + }, + { + "epoch": 0.10554745370979715, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 5012 + }, + { + "epoch": 0.10556851265906088, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.5442, + "step": 5013 + }, + { + "epoch": 0.1055895716083246, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5651, + "step": 5014 + }, + { + "epoch": 0.10561063055758833, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 5015 + }, + { + "epoch": 0.10563168950685206, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5547, + "step": 5016 + }, + { + "epoch": 0.10565274845611579, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5629, + "step": 5017 + }, + { + "epoch": 0.1056738074053795, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5629, + "step": 5018 + }, + { + "epoch": 0.10569486635464323, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 5019 + }, + { + "epoch": 0.10571592530390696, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 5020 + }, + { + "epoch": 0.10573698425317068, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5965, + "step": 5021 + }, + { + "epoch": 0.10575804320243441, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5732, + "step": 5022 + }, + { + "epoch": 0.10577910215169814, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6001, + "step": 5023 + }, + { + "epoch": 0.10580016110096187, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6072, + "step": 5024 + }, + { + "epoch": 0.1058212200502256, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5861, + "step": 5025 + }, + { + "epoch": 0.10584227899948932, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.574, + "step": 5026 + }, + { + "epoch": 0.10586333794875305, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5816, + "step": 5027 + }, + { + "epoch": 0.10588439689801678, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.597, + "step": 5028 + }, + { + "epoch": 0.1059054558472805, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5803, + "step": 5029 + }, + { + "epoch": 0.10592651479654423, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6053, + "step": 5030 + }, + { + "epoch": 0.10594757374580795, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5483, + "step": 5031 + }, + { + "epoch": 0.10596863269507167, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 1.5434, + "step": 5032 + }, + { + "epoch": 0.1059896916443354, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5883, + "step": 5033 + }, + { + "epoch": 0.10601075059359913, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.592, + "step": 5034 + }, + { + "epoch": 0.10603180954286286, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 5035 + }, + { + "epoch": 0.10605286849212658, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5742, + "step": 5036 + }, + { + "epoch": 0.10607392744139031, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 5037 + }, + { + "epoch": 0.10609498639065404, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 5038 + }, + { + "epoch": 0.10611604533991777, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5774, + "step": 5039 + }, + { + "epoch": 0.1061371042891815, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.556, + "step": 5040 + }, + { + "epoch": 0.10615816323844522, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.57, + "step": 5041 + }, + { + "epoch": 0.10617922218770895, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.599, + "step": 5042 + }, + { + "epoch": 0.10620028113697268, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 5043 + }, + { + "epoch": 0.10622134008623639, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5436, + "step": 5044 + }, + { + "epoch": 0.10624239903550012, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5619, + "step": 5045 + }, + { + "epoch": 0.10626345798476385, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 5046 + }, + { + "epoch": 0.10628451693402757, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6059, + "step": 5047 + }, + { + "epoch": 0.1063055758832913, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.6124, + "step": 5048 + }, + { + "epoch": 0.10632663483255503, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 5049 + }, + { + "epoch": 0.10634769378181876, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5583, + "step": 5050 + }, + { + "epoch": 0.10636875273108248, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5633, + "step": 5051 + }, + { + "epoch": 0.10638981168034621, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 5052 + }, + { + "epoch": 0.10641087062960994, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5357, + "step": 5053 + }, + { + "epoch": 0.10643192957887367, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5512, + "step": 5054 + }, + { + "epoch": 0.1064529885281374, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5994, + "step": 5055 + }, + { + "epoch": 0.10647404747740112, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5752, + "step": 5056 + }, + { + "epoch": 0.10649510642666485, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 5057 + }, + { + "epoch": 0.10651616537592856, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5898, + "step": 5058 + }, + { + "epoch": 0.10653722432519229, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 5059 + }, + { + "epoch": 0.10655828327445602, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.568, + "step": 5060 + }, + { + "epoch": 0.10657934222371974, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.594, + "step": 5061 + }, + { + "epoch": 0.10660040117298347, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5896, + "step": 5062 + }, + { + "epoch": 0.1066214601222472, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5987, + "step": 5063 + }, + { + "epoch": 0.10664251907151093, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5697, + "step": 5064 + }, + { + "epoch": 0.10666357802077465, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.568, + "step": 5065 + }, + { + "epoch": 0.10668463697003838, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5795, + "step": 5066 + }, + { + "epoch": 0.10670569591930211, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5785, + "step": 5067 + }, + { + "epoch": 0.10672675486856584, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.543, + "step": 5068 + }, + { + "epoch": 0.10674781381782956, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5794, + "step": 5069 + }, + { + "epoch": 0.10676887276709329, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.5456, + "step": 5070 + }, + { + "epoch": 0.106789931716357, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.572, + "step": 5071 + }, + { + "epoch": 0.10681099066562073, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5465, + "step": 5072 + }, + { + "epoch": 0.10683204961488446, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5574, + "step": 5073 + }, + { + "epoch": 0.10685310856414819, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.5703, + "step": 5074 + }, + { + "epoch": 0.10687416751341192, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 5075 + }, + { + "epoch": 0.10689522646267564, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 5076 + }, + { + "epoch": 0.10691628541193937, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5698, + "step": 5077 + }, + { + "epoch": 0.1069373443612031, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5978, + "step": 5078 + }, + { + "epoch": 0.10695840331046683, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 5079 + }, + { + "epoch": 0.10697946225973055, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5883, + "step": 5080 + }, + { + "epoch": 0.10700052120899428, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 5081 + }, + { + "epoch": 0.10702158015825801, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 5082 + }, + { + "epoch": 0.10704263910752174, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5717, + "step": 5083 + }, + { + "epoch": 0.10706369805678545, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5728, + "step": 5084 + }, + { + "epoch": 0.10708475700604918, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5631, + "step": 5085 + }, + { + "epoch": 0.1071058159553129, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 5086 + }, + { + "epoch": 0.10712687490457663, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 5087 + }, + { + "epoch": 0.10714793385384036, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5647, + "step": 5088 + }, + { + "epoch": 0.10716899280310409, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5824, + "step": 5089 + }, + { + "epoch": 0.10719005175236782, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5736, + "step": 5090 + }, + { + "epoch": 0.10721111070163154, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 5091 + }, + { + "epoch": 0.10723216965089527, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5461, + "step": 5092 + }, + { + "epoch": 0.107253228600159, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5999, + "step": 5093 + }, + { + "epoch": 0.10727428754942273, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.576, + "step": 5094 + }, + { + "epoch": 0.10729534649868645, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5385, + "step": 5095 + }, + { + "epoch": 0.10731640544795018, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 5096 + }, + { + "epoch": 0.1073374643972139, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5614, + "step": 5097 + }, + { + "epoch": 0.10735852334647762, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6028, + "step": 5098 + }, + { + "epoch": 0.10737958229574135, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5764, + "step": 5099 + }, + { + "epoch": 0.10740064124500508, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5836, + "step": 5100 + }, + { + "epoch": 0.1074217001942688, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5847, + "step": 5101 + }, + { + "epoch": 0.10744275914353253, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 5102 + }, + { + "epoch": 0.10746381809279626, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6194, + "step": 5103 + }, + { + "epoch": 0.10748487704205999, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5639, + "step": 5104 + }, + { + "epoch": 0.10750593599132371, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 5105 + }, + { + "epoch": 0.10752699494058744, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5944, + "step": 5106 + }, + { + "epoch": 0.10754805388985117, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 5107 + }, + { + "epoch": 0.1075691128391149, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5355, + "step": 5108 + }, + { + "epoch": 0.10759017178837862, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 1.5677, + "step": 5109 + }, + { + "epoch": 0.10761123073764235, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5873, + "step": 5110 + }, + { + "epoch": 0.10763228968690607, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.5757, + "step": 5111 + }, + { + "epoch": 0.1076533486361698, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 5112 + }, + { + "epoch": 0.10767440758543352, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 5113 + }, + { + "epoch": 0.10769546653469725, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 5114 + }, + { + "epoch": 0.10771652548396098, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5635, + "step": 5115 + }, + { + "epoch": 0.1077375844332247, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 5116 + }, + { + "epoch": 0.10775864338248843, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5818, + "step": 5117 + }, + { + "epoch": 0.10777970233175216, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 5118 + }, + { + "epoch": 0.10780076128101589, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5724, + "step": 5119 + }, + { + "epoch": 0.10782182023027961, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5715, + "step": 5120 + }, + { + "epoch": 0.10784287917954334, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5324, + "step": 5121 + }, + { + "epoch": 0.10786393812880707, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5722, + "step": 5122 + }, + { + "epoch": 0.1078849970780708, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5473, + "step": 5123 + }, + { + "epoch": 0.10790605602733451, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5598, + "step": 5124 + }, + { + "epoch": 0.10792711497659824, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5657, + "step": 5125 + }, + { + "epoch": 0.10794817392586196, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5816, + "step": 5126 + }, + { + "epoch": 0.10796923287512569, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.577, + "step": 5127 + }, + { + "epoch": 0.10799029182438942, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5868, + "step": 5128 + }, + { + "epoch": 0.10801135077365315, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5805, + "step": 5129 + }, + { + "epoch": 0.10803240972291688, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 5130 + }, + { + "epoch": 0.1080534686721806, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5661, + "step": 5131 + }, + { + "epoch": 0.10807452762144433, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6286, + "step": 5132 + }, + { + "epoch": 0.10809558657070806, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.599, + "step": 5133 + }, + { + "epoch": 0.10811664551997179, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 5134 + }, + { + "epoch": 0.10813770446923551, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.5705, + "step": 5135 + }, + { + "epoch": 0.10815876341849924, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5694, + "step": 5136 + }, + { + "epoch": 0.10817982236776295, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.6127, + "step": 5137 + }, + { + "epoch": 0.10820088131702668, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.553, + "step": 5138 + }, + { + "epoch": 0.10822194026629041, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5719, + "step": 5139 + }, + { + "epoch": 0.10824299921555414, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5743, + "step": 5140 + }, + { + "epoch": 0.10826405816481786, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 5141 + }, + { + "epoch": 0.10828511711408159, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5593, + "step": 5142 + }, + { + "epoch": 0.10830617606334532, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.5555, + "step": 5143 + }, + { + "epoch": 0.10832723501260905, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5755, + "step": 5144 + }, + { + "epoch": 0.10834829396187277, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 5145 + }, + { + "epoch": 0.1083693529111365, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.548, + "step": 5146 + }, + { + "epoch": 0.10839041186040023, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5994, + "step": 5147 + }, + { + "epoch": 0.10841147080966396, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5818, + "step": 5148 + }, + { + "epoch": 0.10843252975892768, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5822, + "step": 5149 + }, + { + "epoch": 0.1084535887081914, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 1.5955, + "step": 5150 + }, + { + "epoch": 0.10847464765745513, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 1.5635, + "step": 5151 + }, + { + "epoch": 0.10849570660671885, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5854, + "step": 5152 + }, + { + "epoch": 0.10851676555598258, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5592, + "step": 5153 + }, + { + "epoch": 0.10853782450524631, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5739, + "step": 5154 + }, + { + "epoch": 0.10855888345451004, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.578, + "step": 5155 + }, + { + "epoch": 0.10857994240377376, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5922, + "step": 5156 + }, + { + "epoch": 0.10860100135303749, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6204, + "step": 5157 + }, + { + "epoch": 0.10862206030230122, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5832, + "step": 5158 + }, + { + "epoch": 0.10864311925156495, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5465, + "step": 5159 + }, + { + "epoch": 0.10866417820082867, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5998, + "step": 5160 + }, + { + "epoch": 0.1086852371500924, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5541, + "step": 5161 + }, + { + "epoch": 0.10870629609935613, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.5642, + "step": 5162 + }, + { + "epoch": 0.10872735504861986, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5657, + "step": 5163 + }, + { + "epoch": 0.10874841399788357, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5815, + "step": 5164 + }, + { + "epoch": 0.1087694729471473, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.59, + "step": 5165 + }, + { + "epoch": 0.10879053189641102, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 5166 + }, + { + "epoch": 0.10881159084567475, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5954, + "step": 5167 + }, + { + "epoch": 0.10883264979493848, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5988, + "step": 5168 + }, + { + "epoch": 0.10885370874420221, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5494, + "step": 5169 + }, + { + "epoch": 0.10887476769346593, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5965, + "step": 5170 + }, + { + "epoch": 0.10889582664272966, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 5171 + }, + { + "epoch": 0.10891688559199339, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5797, + "step": 5172 + }, + { + "epoch": 0.10893794454125712, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5661, + "step": 5173 + }, + { + "epoch": 0.10895900349052084, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 1.611, + "step": 5174 + }, + { + "epoch": 0.10898006243978457, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6259, + "step": 5175 + }, + { + "epoch": 0.1090011213890483, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5817, + "step": 5176 + }, + { + "epoch": 0.10902218033831201, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5861, + "step": 5177 + }, + { + "epoch": 0.10904323928757574, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.552, + "step": 5178 + }, + { + "epoch": 0.10906429823683947, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5803, + "step": 5179 + }, + { + "epoch": 0.1090853571861032, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5778, + "step": 5180 + }, + { + "epoch": 0.10910641613536692, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 5181 + }, + { + "epoch": 0.10912747508463065, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 5182 + }, + { + "epoch": 0.10914853403389438, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5881, + "step": 5183 + }, + { + "epoch": 0.1091695929831581, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.5666, + "step": 5184 + }, + { + "epoch": 0.10919065193242183, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5943, + "step": 5185 + }, + { + "epoch": 0.10921171088168556, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6, + "step": 5186 + }, + { + "epoch": 0.10923276983094929, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5662, + "step": 5187 + }, + { + "epoch": 0.10925382878021302, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 5188 + }, + { + "epoch": 0.10927488772947674, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5836, + "step": 5189 + }, + { + "epoch": 0.10929594667874046, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5714, + "step": 5190 + }, + { + "epoch": 0.10931700562800419, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5867, + "step": 5191 + }, + { + "epoch": 0.10933806457726791, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5771, + "step": 5192 + }, + { + "epoch": 0.10935912352653164, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5627, + "step": 5193 + }, + { + "epoch": 0.10938018247579537, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5891, + "step": 5194 + }, + { + "epoch": 0.1094012414250591, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 5195 + }, + { + "epoch": 0.10942230037432282, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5678, + "step": 5196 + }, + { + "epoch": 0.10944335932358655, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 5197 + }, + { + "epoch": 0.10946441827285028, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6273, + "step": 5198 + }, + { + "epoch": 0.109485477222114, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.572, + "step": 5199 + }, + { + "epoch": 0.10950653617137773, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5797, + "step": 5200 + }, + { + "epoch": 0.10952759512064146, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5816, + "step": 5201 + }, + { + "epoch": 0.10954865406990519, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5849, + "step": 5202 + }, + { + "epoch": 0.1095697130191689, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5933, + "step": 5203 + }, + { + "epoch": 0.10959077196843263, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5954, + "step": 5204 + }, + { + "epoch": 0.10961183091769636, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5984, + "step": 5205 + }, + { + "epoch": 0.10963288986696008, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 5206 + }, + { + "epoch": 0.10965394881622381, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5774, + "step": 5207 + }, + { + "epoch": 0.10967500776548754, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 5208 + }, + { + "epoch": 0.10969606671475127, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5948, + "step": 5209 + }, + { + "epoch": 0.109717125664015, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.63, + "step": 5210 + }, + { + "epoch": 0.10973818461327872, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.6111, + "step": 5211 + }, + { + "epoch": 0.10975924356254245, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5785, + "step": 5212 + }, + { + "epoch": 0.10978030251180618, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 5213 + }, + { + "epoch": 0.1098013614610699, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.5499, + "step": 5214 + }, + { + "epoch": 0.10982242041033363, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5681, + "step": 5215 + }, + { + "epoch": 0.10984347935959736, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5947, + "step": 5216 + }, + { + "epoch": 0.10986453830886107, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.581, + "step": 5217 + }, + { + "epoch": 0.1098855972581248, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5667, + "step": 5218 + }, + { + "epoch": 0.10990665620738853, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 5219 + }, + { + "epoch": 0.10992771515665226, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5794, + "step": 5220 + }, + { + "epoch": 0.10994877410591598, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5737, + "step": 5221 + }, + { + "epoch": 0.10996983305517971, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 5222 + }, + { + "epoch": 0.10999089200444344, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5648, + "step": 5223 + }, + { + "epoch": 0.11001195095370717, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 5224 + }, + { + "epoch": 0.1100330099029709, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5727, + "step": 5225 + }, + { + "epoch": 0.11005406885223462, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5657, + "step": 5226 + }, + { + "epoch": 0.11007512780149835, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 5227 + }, + { + "epoch": 0.11009618675076208, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 5228 + }, + { + "epoch": 0.1101172457000258, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5689, + "step": 5229 + }, + { + "epoch": 0.11013830464928952, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5683, + "step": 5230 + }, + { + "epoch": 0.11015936359855324, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6193, + "step": 5231 + }, + { + "epoch": 0.11018042254781697, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 5232 + }, + { + "epoch": 0.1102014814970807, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 5233 + }, + { + "epoch": 0.11022254044634443, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5677, + "step": 5234 + }, + { + "epoch": 0.11024359939560816, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5716, + "step": 5235 + }, + { + "epoch": 0.11026465834487188, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 1.5938, + "step": 5236 + }, + { + "epoch": 0.11028571729413561, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 5237 + }, + { + "epoch": 0.11030677624339934, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5624, + "step": 5238 + }, + { + "epoch": 0.11032783519266307, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6113, + "step": 5239 + }, + { + "epoch": 0.11034889414192679, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6147, + "step": 5240 + }, + { + "epoch": 0.11036995309119052, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6148, + "step": 5241 + }, + { + "epoch": 0.11039101204045425, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 5242 + }, + { + "epoch": 0.11041207098971796, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5944, + "step": 5243 + }, + { + "epoch": 0.11043312993898169, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5752, + "step": 5244 + }, + { + "epoch": 0.11045418888824542, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5438, + "step": 5245 + }, + { + "epoch": 0.11047524783750914, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5494, + "step": 5246 + }, + { + "epoch": 0.11049630678677287, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5801, + "step": 5247 + }, + { + "epoch": 0.1105173657360366, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.559, + "step": 5248 + }, + { + "epoch": 0.11053842468530033, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5978, + "step": 5249 + }, + { + "epoch": 0.11055948363456405, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.55, + "step": 5250 + }, + { + "epoch": 0.11058054258382778, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5341, + "step": 5251 + }, + { + "epoch": 0.11060160153309151, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.5734, + "step": 5252 + }, + { + "epoch": 0.11062266048235524, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 5253 + }, + { + "epoch": 0.11064371943161896, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5763, + "step": 5254 + }, + { + "epoch": 0.11066477838088269, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 5255 + }, + { + "epoch": 0.1106858373301464, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5602, + "step": 5256 + }, + { + "epoch": 0.11070689627941013, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5588, + "step": 5257 + }, + { + "epoch": 0.11072795522867386, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5588, + "step": 5258 + }, + { + "epoch": 0.11074901417793759, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6229, + "step": 5259 + }, + { + "epoch": 0.11077007312720132, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5807, + "step": 5260 + }, + { + "epoch": 0.11079113207646504, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5508, + "step": 5261 + }, + { + "epoch": 0.11081219102572877, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5502, + "step": 5262 + }, + { + "epoch": 0.1108332499749925, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5654, + "step": 5263 + }, + { + "epoch": 0.11085430892425623, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5988, + "step": 5264 + }, + { + "epoch": 0.11087536787351995, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 5265 + }, + { + "epoch": 0.11089642682278368, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 5266 + }, + { + "epoch": 0.11091748577204741, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 5267 + }, + { + "epoch": 0.11093854472131114, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 1.598, + "step": 5268 + }, + { + "epoch": 0.11095960367057486, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 5269 + }, + { + "epoch": 0.11098066261983858, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 5270 + }, + { + "epoch": 0.1110017215691023, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5933, + "step": 5271 + }, + { + "epoch": 0.11102278051836603, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5867, + "step": 5272 + }, + { + "epoch": 0.11104383946762976, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5912, + "step": 5273 + }, + { + "epoch": 0.11106489841689349, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5827, + "step": 5274 + }, + { + "epoch": 0.11108595736615721, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 5275 + }, + { + "epoch": 0.11110701631542094, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5671, + "step": 5276 + }, + { + "epoch": 0.11112807526468467, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 1.5716, + "step": 5277 + }, + { + "epoch": 0.1111491342139484, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 5278 + }, + { + "epoch": 0.11117019316321212, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5805, + "step": 5279 + }, + { + "epoch": 0.11119125211247585, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5933, + "step": 5280 + }, + { + "epoch": 0.11121231106173958, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5723, + "step": 5281 + }, + { + "epoch": 0.11123337001100331, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6037, + "step": 5282 + }, + { + "epoch": 0.11125442896026702, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5609, + "step": 5283 + }, + { + "epoch": 0.11127548790953075, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 5284 + }, + { + "epoch": 0.11129654685879448, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5795, + "step": 5285 + }, + { + "epoch": 0.1113176058080582, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5911, + "step": 5286 + }, + { + "epoch": 0.11133866475732193, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5958, + "step": 5287 + }, + { + "epoch": 0.11135972370658566, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5768, + "step": 5288 + }, + { + "epoch": 0.11138078265584939, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5656, + "step": 5289 + }, + { + "epoch": 0.11140184160511311, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5752, + "step": 5290 + }, + { + "epoch": 0.11142290055437684, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 5291 + }, + { + "epoch": 0.11144395950364057, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 5292 + }, + { + "epoch": 0.1114650184529043, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.572, + "step": 5293 + }, + { + "epoch": 0.11148607740216802, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 5294 + }, + { + "epoch": 0.11150713635143175, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6001, + "step": 5295 + }, + { + "epoch": 0.11152819530069547, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5634, + "step": 5296 + }, + { + "epoch": 0.11154925424995919, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5846, + "step": 5297 + }, + { + "epoch": 0.11157031319922292, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5638, + "step": 5298 + }, + { + "epoch": 0.11159137214848665, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5392, + "step": 5299 + }, + { + "epoch": 0.11161243109775038, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 5300 + }, + { + "epoch": 0.1116334900470141, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 5301 + }, + { + "epoch": 0.11165454899627783, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6205, + "step": 5302 + }, + { + "epoch": 0.11167560794554156, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.569, + "step": 5303 + }, + { + "epoch": 0.11169666689480529, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5713, + "step": 5304 + }, + { + "epoch": 0.11171772584406901, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6086, + "step": 5305 + }, + { + "epoch": 0.11173878479333274, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6032, + "step": 5306 + }, + { + "epoch": 0.11175984374259647, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5887, + "step": 5307 + }, + { + "epoch": 0.1117809026918602, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5904, + "step": 5308 + }, + { + "epoch": 0.11180196164112391, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.6022, + "step": 5309 + }, + { + "epoch": 0.11182302059038764, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 5310 + }, + { + "epoch": 0.11184407953965136, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5822, + "step": 5311 + }, + { + "epoch": 0.11186513848891509, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5844, + "step": 5312 + }, + { + "epoch": 0.11188619743817882, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.6124, + "step": 5313 + }, + { + "epoch": 0.11190725638744255, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5607, + "step": 5314 + }, + { + "epoch": 0.11192831533670627, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5705, + "step": 5315 + }, + { + "epoch": 0.11194937428597, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 5316 + }, + { + "epoch": 0.11197043323523373, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6148, + "step": 5317 + }, + { + "epoch": 0.11199149218449746, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5705, + "step": 5318 + }, + { + "epoch": 0.11201255113376118, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5304, + "step": 5319 + }, + { + "epoch": 0.11203361008302491, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5561, + "step": 5320 + }, + { + "epoch": 0.11205466903228864, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5424, + "step": 5321 + }, + { + "epoch": 0.11207572798155237, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.5863, + "step": 5322 + }, + { + "epoch": 0.11209678693081608, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 5323 + }, + { + "epoch": 0.11211784588007981, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 5324 + }, + { + "epoch": 0.11213890482934354, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5568, + "step": 5325 + }, + { + "epoch": 0.11215996377860726, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5975, + "step": 5326 + }, + { + "epoch": 0.11218102272787099, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 5327 + }, + { + "epoch": 0.11220208167713472, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 5328 + }, + { + "epoch": 0.11222314062639845, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5807, + "step": 5329 + }, + { + "epoch": 0.11224419957566217, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5695, + "step": 5330 + }, + { + "epoch": 0.1122652585249259, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.5798, + "step": 5331 + }, + { + "epoch": 0.11228631747418963, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.595, + "step": 5332 + }, + { + "epoch": 0.11230737642345336, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.595, + "step": 5333 + }, + { + "epoch": 0.11232843537271708, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5856, + "step": 5334 + }, + { + "epoch": 0.11234949432198081, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.603, + "step": 5335 + }, + { + "epoch": 0.11237055327124452, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 5336 + }, + { + "epoch": 0.11239161222050825, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5916, + "step": 5337 + }, + { + "epoch": 0.11241267116977198, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 5338 + }, + { + "epoch": 0.11243373011903571, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5823, + "step": 5339 + }, + { + "epoch": 0.11245478906829943, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5692, + "step": 5340 + }, + { + "epoch": 0.11247584801756316, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 5341 + }, + { + "epoch": 0.11249690696682689, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5844, + "step": 5342 + }, + { + "epoch": 0.11251796591609062, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5735, + "step": 5343 + }, + { + "epoch": 0.11253902486535435, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 5344 + }, + { + "epoch": 0.11256008381461807, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 5345 + }, + { + "epoch": 0.1125811427638818, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 5346 + }, + { + "epoch": 0.11260220171314553, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.6068, + "step": 5347 + }, + { + "epoch": 0.11262326066240926, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5468, + "step": 5348 + }, + { + "epoch": 0.11264431961167297, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5454, + "step": 5349 + }, + { + "epoch": 0.1126653785609367, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5777, + "step": 5350 + }, + { + "epoch": 0.11268643751020042, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5436, + "step": 5351 + }, + { + "epoch": 0.11270749645946415, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5846, + "step": 5352 + }, + { + "epoch": 0.11272855540872788, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5699, + "step": 5353 + }, + { + "epoch": 0.1127496143579916, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 5354 + }, + { + "epoch": 0.11277067330725533, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5748, + "step": 5355 + }, + { + "epoch": 0.11279173225651906, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5741, + "step": 5356 + }, + { + "epoch": 0.11281279120578279, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6056, + "step": 5357 + }, + { + "epoch": 0.11283385015504652, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5673, + "step": 5358 + }, + { + "epoch": 0.11285490910431024, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6194, + "step": 5359 + }, + { + "epoch": 0.11287596805357397, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5633, + "step": 5360 + }, + { + "epoch": 0.1128970270028377, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 1.5882, + "step": 5361 + }, + { + "epoch": 0.11291808595210141, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 5362 + }, + { + "epoch": 0.11293914490136514, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5739, + "step": 5363 + }, + { + "epoch": 0.11296020385062887, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5563, + "step": 5364 + }, + { + "epoch": 0.1129812627998926, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5849, + "step": 5365 + }, + { + "epoch": 0.11300232174915632, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5887, + "step": 5366 + }, + { + "epoch": 0.11302338069842005, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.551, + "step": 5367 + }, + { + "epoch": 0.11304443964768378, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5716, + "step": 5368 + }, + { + "epoch": 0.1130654985969475, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5929, + "step": 5369 + }, + { + "epoch": 0.11308655754621123, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 5370 + }, + { + "epoch": 0.11310761649547496, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.596, + "step": 5371 + }, + { + "epoch": 0.11312867544473869, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.6188, + "step": 5372 + }, + { + "epoch": 0.11314973439400242, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 5373 + }, + { + "epoch": 0.11317079334326614, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6004, + "step": 5374 + }, + { + "epoch": 0.11319185229252987, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 5375 + }, + { + "epoch": 0.11321291124179358, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6098, + "step": 5376 + }, + { + "epoch": 0.11323397019105731, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5986, + "step": 5377 + }, + { + "epoch": 0.11325502914032104, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 5378 + }, + { + "epoch": 0.11327608808958477, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 1.5505, + "step": 5379 + }, + { + "epoch": 0.1132971470388485, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5682, + "step": 5380 + }, + { + "epoch": 0.11331820598811222, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.6159, + "step": 5381 + }, + { + "epoch": 0.11333926493737595, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 5382 + }, + { + "epoch": 0.11336032388663968, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5449, + "step": 5383 + }, + { + "epoch": 0.1133813828359034, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5492, + "step": 5384 + }, + { + "epoch": 0.11340244178516713, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5663, + "step": 5385 + }, + { + "epoch": 0.11342350073443086, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5579, + "step": 5386 + }, + { + "epoch": 0.11344455968369459, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5808, + "step": 5387 + }, + { + "epoch": 0.11346561863295831, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 5388 + }, + { + "epoch": 0.11348667758222203, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5899, + "step": 5389 + }, + { + "epoch": 0.11350773653148576, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5711, + "step": 5390 + }, + { + "epoch": 0.11352879548074948, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5641, + "step": 5391 + }, + { + "epoch": 0.11354985443001321, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5746, + "step": 5392 + }, + { + "epoch": 0.11357091337927694, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5703, + "step": 5393 + }, + { + "epoch": 0.11359197232854067, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5952, + "step": 5394 + }, + { + "epoch": 0.1136130312778044, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5547, + "step": 5395 + }, + { + "epoch": 0.11363409022706812, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.5511, + "step": 5396 + }, + { + "epoch": 0.11365514917633185, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 5397 + }, + { + "epoch": 0.11367620812559558, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 5398 + }, + { + "epoch": 0.1136972670748593, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5658, + "step": 5399 + }, + { + "epoch": 0.11371832602412303, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5584, + "step": 5400 + }, + { + "epoch": 0.11373938497338676, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.5737, + "step": 5401 + }, + { + "epoch": 0.11376044392265047, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 5402 + }, + { + "epoch": 0.1137815028719142, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5585, + "step": 5403 + }, + { + "epoch": 0.11380256182117793, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5867, + "step": 5404 + }, + { + "epoch": 0.11382362077044166, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.572, + "step": 5405 + }, + { + "epoch": 0.11384467971970538, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5758, + "step": 5406 + }, + { + "epoch": 0.11386573866896911, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.5436, + "step": 5407 + }, + { + "epoch": 0.11388679761823284, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5863, + "step": 5408 + }, + { + "epoch": 0.11390785656749657, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5911, + "step": 5409 + }, + { + "epoch": 0.11392891551676029, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5873, + "step": 5410 + }, + { + "epoch": 0.11394997446602402, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 5411 + }, + { + "epoch": 0.11397103341528775, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5908, + "step": 5412 + }, + { + "epoch": 0.11399209236455148, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5507, + "step": 5413 + }, + { + "epoch": 0.1140131513138152, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5821, + "step": 5414 + }, + { + "epoch": 0.11403421026307893, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.6239, + "step": 5415 + }, + { + "epoch": 0.11405526921234264, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 5416 + }, + { + "epoch": 0.11407632816160637, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 1.5531, + "step": 5417 + }, + { + "epoch": 0.1140973871108701, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.573, + "step": 5418 + }, + { + "epoch": 0.11411844606013383, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5759, + "step": 5419 + }, + { + "epoch": 0.11413950500939755, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 5420 + }, + { + "epoch": 0.11416056395866128, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 5421 + }, + { + "epoch": 0.11418162290792501, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 5422 + }, + { + "epoch": 0.11420268185718874, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.5615, + "step": 5423 + }, + { + "epoch": 0.11422374080645246, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.6026, + "step": 5424 + }, + { + "epoch": 0.11424479975571619, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 5425 + }, + { + "epoch": 0.11426585870497992, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5704, + "step": 5426 + }, + { + "epoch": 0.11428691765424365, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 5427 + }, + { + "epoch": 0.11430797660350737, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.548, + "step": 5428 + }, + { + "epoch": 0.11432903555277109, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6072, + "step": 5429 + }, + { + "epoch": 0.11435009450203482, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 5430 + }, + { + "epoch": 0.11437115345129854, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5584, + "step": 5431 + }, + { + "epoch": 0.11439221240056227, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.5815, + "step": 5432 + }, + { + "epoch": 0.114413271349826, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.5428, + "step": 5433 + }, + { + "epoch": 0.11443433029908973, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5716, + "step": 5434 + }, + { + "epoch": 0.11445538924835345, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.5553, + "step": 5435 + }, + { + "epoch": 0.11447644819761718, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5829, + "step": 5436 + }, + { + "epoch": 0.11449750714688091, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5924, + "step": 5437 + }, + { + "epoch": 0.11451856609614464, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5867, + "step": 5438 + }, + { + "epoch": 0.11453962504540836, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 5439 + }, + { + "epoch": 0.11456068399467209, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5278, + "step": 5440 + }, + { + "epoch": 0.11458174294393582, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.6249, + "step": 5441 + }, + { + "epoch": 0.11460280189319953, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5847, + "step": 5442 + }, + { + "epoch": 0.11462386084246326, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 5443 + }, + { + "epoch": 0.11464491979172699, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5911, + "step": 5444 + }, + { + "epoch": 0.11466597874099071, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6062, + "step": 5445 + }, + { + "epoch": 0.11468703769025444, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5879, + "step": 5446 + }, + { + "epoch": 0.11470809663951817, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5824, + "step": 5447 + }, + { + "epoch": 0.1147291555887819, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.594, + "step": 5448 + }, + { + "epoch": 0.11475021453804563, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 1.5621, + "step": 5449 + }, + { + "epoch": 0.11477127348730935, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.551, + "step": 5450 + }, + { + "epoch": 0.11479233243657308, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6232, + "step": 5451 + }, + { + "epoch": 0.11481339138583681, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 5452 + }, + { + "epoch": 0.11483445033510054, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 5453 + }, + { + "epoch": 0.11485550928436426, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 5454 + }, + { + "epoch": 0.11487656823362798, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5871, + "step": 5455 + }, + { + "epoch": 0.1148976271828917, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 5456 + }, + { + "epoch": 0.11491868613215543, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.5406, + "step": 5457 + }, + { + "epoch": 0.11493974508141916, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.5492, + "step": 5458 + }, + { + "epoch": 0.11496080403068289, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.5629, + "step": 5459 + }, + { + "epoch": 0.11498186297994661, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6103, + "step": 5460 + }, + { + "epoch": 0.11500292192921034, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5556, + "step": 5461 + }, + { + "epoch": 0.11502398087847407, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 5462 + }, + { + "epoch": 0.1150450398277378, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.5568, + "step": 5463 + }, + { + "epoch": 0.11506609877700152, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6126, + "step": 5464 + }, + { + "epoch": 0.11508715772626525, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.5748, + "step": 5465 + }, + { + "epoch": 0.11510821667552898, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 5466 + }, + { + "epoch": 0.1151292756247927, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 5467 + }, + { + "epoch": 0.11515033457405643, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.5485, + "step": 5468 + }, + { + "epoch": 0.11517139352332015, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 5469 + }, + { + "epoch": 0.11519245247258388, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 1.5913, + "step": 5470 + }, + { + "epoch": 0.1152135114218476, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5698, + "step": 5471 + }, + { + "epoch": 0.11523457037111133, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5892, + "step": 5472 + }, + { + "epoch": 0.11525562932037506, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.592, + "step": 5473 + }, + { + "epoch": 0.11527668826963879, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.578, + "step": 5474 + }, + { + "epoch": 0.11529774721890251, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 5475 + }, + { + "epoch": 0.11531880616816624, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.6153, + "step": 5476 + }, + { + "epoch": 0.11533986511742997, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 5477 + }, + { + "epoch": 0.1153609240666937, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5879, + "step": 5478 + }, + { + "epoch": 0.11538198301595742, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.5511, + "step": 5479 + }, + { + "epoch": 0.11540304196522115, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.5615, + "step": 5480 + }, + { + "epoch": 0.11542410091448488, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5635, + "step": 5481 + }, + { + "epoch": 0.11544515986374859, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5894, + "step": 5482 + }, + { + "epoch": 0.11546621881301232, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.5455, + "step": 5483 + }, + { + "epoch": 0.11548727776227605, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6208, + "step": 5484 + }, + { + "epoch": 0.11550833671153977, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5363, + "step": 5485 + }, + { + "epoch": 0.1155293956608035, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 5486 + }, + { + "epoch": 0.11555045461006723, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 5487 + }, + { + "epoch": 0.11557151355933096, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.6124, + "step": 5488 + }, + { + "epoch": 0.11559257250859468, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 5489 + }, + { + "epoch": 0.11561363145785841, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.5642, + "step": 5490 + }, + { + "epoch": 0.11563469040712214, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5958, + "step": 5491 + }, + { + "epoch": 0.11565574935638587, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.5623, + "step": 5492 + }, + { + "epoch": 0.1156768083056496, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 5493 + }, + { + "epoch": 0.11569786725491332, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.5397, + "step": 5494 + }, + { + "epoch": 0.11571892620417704, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.592, + "step": 5495 + }, + { + "epoch": 0.11573998515344076, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5626, + "step": 5496 + }, + { + "epoch": 0.11576104410270449, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5662, + "step": 5497 + }, + { + "epoch": 0.11578210305196822, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 1.5576, + "step": 5498 + }, + { + "epoch": 0.11580316200123195, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5579, + "step": 5499 + }, + { + "epoch": 0.11582422095049567, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5457, + "step": 5500 + }, + { + "epoch": 0.11582422095049567, + "eval_loss": 2.3854761123657227, + "eval_runtime": 1003.3844, + "eval_samples_per_second": 61.592, + "eval_steps_per_second": 1.925, + "step": 5500 + } + ], + "logging_steps": 1.0, + "max_steps": 47485, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.7218310856704e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}