{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9576188935619541, "eval_steps": 386, "global_step": 740, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012940795858945326, "grad_norm": 1.7405146360397339, "learning_rate": 2.0000000000000003e-06, "loss": 2.4269, "step": 1 }, { "epoch": 0.0012940795858945326, "eval_loss": 2.247628688812256, "eval_runtime": 189.8853, "eval_samples_per_second": 3.428, "eval_steps_per_second": 0.858, "step": 1 }, { "epoch": 0.002588159171789065, "grad_norm": 1.6643542051315308, "learning_rate": 4.000000000000001e-06, "loss": 2.2583, "step": 2 }, { "epoch": 0.0038822387576835974, "grad_norm": 1.8690767288208008, "learning_rate": 6e-06, "loss": 2.2696, "step": 3 }, { "epoch": 0.00517631834357813, "grad_norm": 1.828118085861206, "learning_rate": 8.000000000000001e-06, "loss": 2.3646, "step": 4 }, { "epoch": 0.006470397929472662, "grad_norm": 1.9319926500320435, "learning_rate": 1e-05, "loss": 2.4196, "step": 5 }, { "epoch": 0.007764477515367195, "grad_norm": 1.7723782062530518, "learning_rate": 1.2e-05, "loss": 2.4177, "step": 6 }, { "epoch": 0.009058557101261728, "grad_norm": 1.9500815868377686, "learning_rate": 1.4000000000000001e-05, "loss": 2.3497, "step": 7 }, { "epoch": 0.01035263668715626, "grad_norm": 2.3909075260162354, "learning_rate": 1.6000000000000003e-05, "loss": 2.405, "step": 8 }, { "epoch": 0.011646716273050793, "grad_norm": 2.0620856285095215, "learning_rate": 1.8e-05, "loss": 2.4098, "step": 9 }, { "epoch": 0.012940795858945324, "grad_norm": 1.8054910898208618, "learning_rate": 2e-05, "loss": 2.1233, "step": 10 }, { "epoch": 0.014234875444839857, "grad_norm": 2.190964937210083, "learning_rate": 2.2000000000000003e-05, "loss": 2.3985, "step": 11 }, { "epoch": 0.01552895503073439, "grad_norm": 1.9412921667099, "learning_rate": 2.4e-05, "loss": 2.462, "step": 12 }, { "epoch": 0.016823034616628922, "grad_norm": 1.9161555767059326, "learning_rate": 2.6000000000000002e-05, "loss": 2.2118, "step": 13 }, { "epoch": 0.018117114202523456, "grad_norm": 1.7161599397659302, "learning_rate": 2.8000000000000003e-05, "loss": 2.2175, "step": 14 }, { "epoch": 0.019411193788417987, "grad_norm": 2.173877000808716, "learning_rate": 3e-05, "loss": 2.2521, "step": 15 }, { "epoch": 0.02070527337431252, "grad_norm": 2.0000555515289307, "learning_rate": 3.2000000000000005e-05, "loss": 2.1615, "step": 16 }, { "epoch": 0.021999352960207053, "grad_norm": 1.5915080308914185, "learning_rate": 3.4000000000000007e-05, "loss": 1.9522, "step": 17 }, { "epoch": 0.023293432546101587, "grad_norm": 1.6972448825836182, "learning_rate": 3.6e-05, "loss": 1.7224, "step": 18 }, { "epoch": 0.024587512131996118, "grad_norm": 1.7509772777557373, "learning_rate": 3.8e-05, "loss": 2.0414, "step": 19 }, { "epoch": 0.02588159171789065, "grad_norm": 1.697340488433838, "learning_rate": 4e-05, "loss": 2.0427, "step": 20 }, { "epoch": 0.027175671303785183, "grad_norm": 1.8733758926391602, "learning_rate": 4.2e-05, "loss": 1.6772, "step": 21 }, { "epoch": 0.028469750889679714, "grad_norm": 1.6085255146026611, "learning_rate": 4.4000000000000006e-05, "loss": 1.6527, "step": 22 }, { "epoch": 0.029763830475574248, "grad_norm": 1.5792337656021118, "learning_rate": 4.600000000000001e-05, "loss": 1.6567, "step": 23 }, { "epoch": 0.03105791006146878, "grad_norm": 1.4392567873001099, "learning_rate": 4.8e-05, "loss": 1.508, "step": 24 }, { "epoch": 0.03235198964736331, "grad_norm": 1.5222433805465698, "learning_rate": 5e-05, "loss": 1.4606, "step": 25 }, { "epoch": 0.033646069233257844, "grad_norm": 1.5480064153671265, "learning_rate": 5.2000000000000004e-05, "loss": 1.5027, "step": 26 }, { "epoch": 0.034940148819152375, "grad_norm": 1.6736445426940918, "learning_rate": 5.4000000000000005e-05, "loss": 1.2426, "step": 27 }, { "epoch": 0.03623422840504691, "grad_norm": 1.7392551898956299, "learning_rate": 5.6000000000000006e-05, "loss": 1.4703, "step": 28 }, { "epoch": 0.037528307990941444, "grad_norm": 1.6173359155654907, "learning_rate": 5.8e-05, "loss": 1.4546, "step": 29 }, { "epoch": 0.038822387576835975, "grad_norm": 1.3955802917480469, "learning_rate": 6e-05, "loss": 1.3808, "step": 30 }, { "epoch": 0.040116467162730506, "grad_norm": 1.353873372077942, "learning_rate": 6.2e-05, "loss": 1.229, "step": 31 }, { "epoch": 0.04141054674862504, "grad_norm": 1.2547746896743774, "learning_rate": 6.400000000000001e-05, "loss": 1.1668, "step": 32 }, { "epoch": 0.042704626334519574, "grad_norm": 1.3806778192520142, "learning_rate": 6.6e-05, "loss": 1.0691, "step": 33 }, { "epoch": 0.043998705920414105, "grad_norm": 1.2815773487091064, "learning_rate": 6.800000000000001e-05, "loss": 1.2409, "step": 34 }, { "epoch": 0.045292785506308636, "grad_norm": 1.3677266836166382, "learning_rate": 7e-05, "loss": 0.9668, "step": 35 }, { "epoch": 0.046586865092203174, "grad_norm": 1.5457032918930054, "learning_rate": 7.2e-05, "loss": 1.1385, "step": 36 }, { "epoch": 0.047880944678097705, "grad_norm": 1.5587060451507568, "learning_rate": 7.4e-05, "loss": 1.1707, "step": 37 }, { "epoch": 0.049175024263992236, "grad_norm": 1.079053282737732, "learning_rate": 7.6e-05, "loss": 1.0655, "step": 38 }, { "epoch": 0.050469103849886766, "grad_norm": 1.1773897409439087, "learning_rate": 7.800000000000001e-05, "loss": 1.0465, "step": 39 }, { "epoch": 0.0517631834357813, "grad_norm": 1.2437673807144165, "learning_rate": 8e-05, "loss": 1.2779, "step": 40 }, { "epoch": 0.053057263021675835, "grad_norm": 1.254847526550293, "learning_rate": 8.2e-05, "loss": 1.0898, "step": 41 }, { "epoch": 0.054351342607570366, "grad_norm": 1.1771515607833862, "learning_rate": 8.4e-05, "loss": 1.1827, "step": 42 }, { "epoch": 0.0556454221934649, "grad_norm": 1.1400648355484009, "learning_rate": 8.6e-05, "loss": 1.1066, "step": 43 }, { "epoch": 0.05693950177935943, "grad_norm": 1.2047138214111328, "learning_rate": 8.800000000000001e-05, "loss": 0.8974, "step": 44 }, { "epoch": 0.058233581365253966, "grad_norm": 1.1269346475601196, "learning_rate": 9e-05, "loss": 1.0146, "step": 45 }, { "epoch": 0.059527660951148496, "grad_norm": 1.169231653213501, "learning_rate": 9.200000000000001e-05, "loss": 1.1266, "step": 46 }, { "epoch": 0.06082174053704303, "grad_norm": 0.9771779179573059, "learning_rate": 9.4e-05, "loss": 0.8351, "step": 47 }, { "epoch": 0.06211582012293756, "grad_norm": 1.2849314212799072, "learning_rate": 9.6e-05, "loss": 1.1822, "step": 48 }, { "epoch": 0.0634098997088321, "grad_norm": 1.023181676864624, "learning_rate": 9.8e-05, "loss": 0.9082, "step": 49 }, { "epoch": 0.06470397929472663, "grad_norm": 1.135751724243164, "learning_rate": 0.0001, "loss": 0.9407, "step": 50 }, { "epoch": 0.06599805888062116, "grad_norm": 0.9701154828071594, "learning_rate": 9.999998300231494e-05, "loss": 0.9423, "step": 51 }, { "epoch": 0.06729213846651569, "grad_norm": 1.2891143560409546, "learning_rate": 9.999993200927133e-05, "loss": 0.9757, "step": 52 }, { "epoch": 0.06858621805241022, "grad_norm": 1.3360975980758667, "learning_rate": 9.999984702090383e-05, "loss": 1.0158, "step": 53 }, { "epoch": 0.06988029763830475, "grad_norm": 0.977446436882019, "learning_rate": 9.999972803727024e-05, "loss": 0.8175, "step": 54 }, { "epoch": 0.0711743772241993, "grad_norm": 0.9943827390670776, "learning_rate": 9.999957505845144e-05, "loss": 0.8627, "step": 55 }, { "epoch": 0.07246845681009383, "grad_norm": 1.1531224250793457, "learning_rate": 9.999938808455145e-05, "loss": 1.143, "step": 56 }, { "epoch": 0.07376253639598836, "grad_norm": 1.287972092628479, "learning_rate": 9.99991671156974e-05, "loss": 1.2342, "step": 57 }, { "epoch": 0.07505661598188289, "grad_norm": 1.1554590463638306, "learning_rate": 9.999891215203949e-05, "loss": 0.9692, "step": 58 }, { "epoch": 0.07635069556777742, "grad_norm": 1.0786008834838867, "learning_rate": 9.999862319375113e-05, "loss": 1.1254, "step": 59 }, { "epoch": 0.07764477515367195, "grad_norm": 1.0764508247375488, "learning_rate": 9.999830024102874e-05, "loss": 0.9312, "step": 60 }, { "epoch": 0.07893885473956648, "grad_norm": 1.1909526586532593, "learning_rate": 9.999794329409194e-05, "loss": 0.9959, "step": 61 }, { "epoch": 0.08023293432546101, "grad_norm": 0.9989166259765625, "learning_rate": 9.999755235318337e-05, "loss": 0.934, "step": 62 }, { "epoch": 0.08152701391135554, "grad_norm": 1.0302046537399292, "learning_rate": 9.999712741856889e-05, "loss": 1.1017, "step": 63 }, { "epoch": 0.08282109349725009, "grad_norm": 0.9583478569984436, "learning_rate": 9.999666849053738e-05, "loss": 1.1384, "step": 64 }, { "epoch": 0.08411517308314462, "grad_norm": 1.001126766204834, "learning_rate": 9.999617556940085e-05, "loss": 0.9279, "step": 65 }, { "epoch": 0.08540925266903915, "grad_norm": 1.0130903720855713, "learning_rate": 9.999564865549449e-05, "loss": 0.9381, "step": 66 }, { "epoch": 0.08670333225493368, "grad_norm": 1.1210829019546509, "learning_rate": 9.999508774917652e-05, "loss": 0.9607, "step": 67 }, { "epoch": 0.08799741184082821, "grad_norm": 1.045749545097351, "learning_rate": 9.999449285082831e-05, "loss": 1.0037, "step": 68 }, { "epoch": 0.08929149142672274, "grad_norm": 1.1308139562606812, "learning_rate": 9.999386396085434e-05, "loss": 0.9086, "step": 69 }, { "epoch": 0.09058557101261727, "grad_norm": 1.1013413667678833, "learning_rate": 9.999320107968219e-05, "loss": 1.0712, "step": 70 }, { "epoch": 0.0918796505985118, "grad_norm": 1.0830566883087158, "learning_rate": 9.999250420776258e-05, "loss": 1.0326, "step": 71 }, { "epoch": 0.09317373018440635, "grad_norm": 1.0673171281814575, "learning_rate": 9.999177334556929e-05, "loss": 1.0034, "step": 72 }, { "epoch": 0.09446780977030088, "grad_norm": 1.1546461582183838, "learning_rate": 9.999100849359926e-05, "loss": 1.059, "step": 73 }, { "epoch": 0.09576188935619541, "grad_norm": 0.9139528870582581, "learning_rate": 9.999020965237249e-05, "loss": 0.8596, "step": 74 }, { "epoch": 0.09705596894208994, "grad_norm": 1.1570812463760376, "learning_rate": 9.998937682243215e-05, "loss": 1.0456, "step": 75 }, { "epoch": 0.09835004852798447, "grad_norm": 1.3232612609863281, "learning_rate": 9.998851000434448e-05, "loss": 0.9994, "step": 76 }, { "epoch": 0.099644128113879, "grad_norm": 1.2017115354537964, "learning_rate": 9.998760919869883e-05, "loss": 1.2664, "step": 77 }, { "epoch": 0.10093820769977353, "grad_norm": 1.0694175958633423, "learning_rate": 9.998667440610765e-05, "loss": 0.9483, "step": 78 }, { "epoch": 0.10223228728566806, "grad_norm": 0.9963059425354004, "learning_rate": 9.998570562720654e-05, "loss": 0.9577, "step": 79 }, { "epoch": 0.1035263668715626, "grad_norm": 0.8873535394668579, "learning_rate": 9.998470286265416e-05, "loss": 0.8498, "step": 80 }, { "epoch": 0.10482044645745714, "grad_norm": 1.1350760459899902, "learning_rate": 9.99836661131323e-05, "loss": 1.0024, "step": 81 }, { "epoch": 0.10611452604335167, "grad_norm": 0.8355389833450317, "learning_rate": 9.998259537934586e-05, "loss": 0.7399, "step": 82 }, { "epoch": 0.1074086056292462, "grad_norm": 0.9935446381568909, "learning_rate": 9.998149066202284e-05, "loss": 0.9809, "step": 83 }, { "epoch": 0.10870268521514073, "grad_norm": 1.0571558475494385, "learning_rate": 9.998035196191435e-05, "loss": 1.0144, "step": 84 }, { "epoch": 0.10999676480103526, "grad_norm": 0.9860286116600037, "learning_rate": 9.99791792797946e-05, "loss": 1.0467, "step": 85 }, { "epoch": 0.1112908443869298, "grad_norm": 1.1422507762908936, "learning_rate": 9.997797261646089e-05, "loss": 0.9535, "step": 86 }, { "epoch": 0.11258492397282432, "grad_norm": 0.8561545014381409, "learning_rate": 9.997673197273365e-05, "loss": 1.007, "step": 87 }, { "epoch": 0.11387900355871886, "grad_norm": 1.0027543306350708, "learning_rate": 9.997545734945639e-05, "loss": 0.9861, "step": 88 }, { "epoch": 0.1151730831446134, "grad_norm": 0.8489773273468018, "learning_rate": 9.997414874749575e-05, "loss": 0.9672, "step": 89 }, { "epoch": 0.11646716273050793, "grad_norm": 1.0517115592956543, "learning_rate": 9.997280616774147e-05, "loss": 1.1672, "step": 90 }, { "epoch": 0.11776124231640246, "grad_norm": 1.0035395622253418, "learning_rate": 9.997142961110634e-05, "loss": 0.9294, "step": 91 }, { "epoch": 0.11905532190229699, "grad_norm": 1.1194915771484375, "learning_rate": 9.997001907852635e-05, "loss": 1.0857, "step": 92 }, { "epoch": 0.12034940148819152, "grad_norm": 1.5234825611114502, "learning_rate": 9.996857457096047e-05, "loss": 1.027, "step": 93 }, { "epoch": 0.12164348107408605, "grad_norm": 0.949878454208374, "learning_rate": 9.996709608939088e-05, "loss": 0.8173, "step": 94 }, { "epoch": 0.12293756065998059, "grad_norm": 0.8736472129821777, "learning_rate": 9.996558363482277e-05, "loss": 0.855, "step": 95 }, { "epoch": 0.12423164024587512, "grad_norm": 0.8604567050933838, "learning_rate": 9.996403720828449e-05, "loss": 0.9485, "step": 96 }, { "epoch": 0.12552571983176966, "grad_norm": 1.020851492881775, "learning_rate": 9.996245681082748e-05, "loss": 1.0024, "step": 97 }, { "epoch": 0.1268197994176642, "grad_norm": 1.0704892873764038, "learning_rate": 9.996084244352623e-05, "loss": 0.9246, "step": 98 }, { "epoch": 0.12811387900355872, "grad_norm": 0.8441987037658691, "learning_rate": 9.99591941074784e-05, "loss": 1.0343, "step": 99 }, { "epoch": 0.12940795858945325, "grad_norm": 1.0280612707138062, "learning_rate": 9.995751180380466e-05, "loss": 0.9644, "step": 100 }, { "epoch": 0.13070203817534778, "grad_norm": 0.9827906489372253, "learning_rate": 9.995579553364887e-05, "loss": 0.9583, "step": 101 }, { "epoch": 0.13199611776124232, "grad_norm": 1.035618543624878, "learning_rate": 9.995404529817791e-05, "loss": 1.0366, "step": 102 }, { "epoch": 0.13329019734713685, "grad_norm": 1.2775524854660034, "learning_rate": 9.995226109858178e-05, "loss": 0.9353, "step": 103 }, { "epoch": 0.13458427693303138, "grad_norm": 1.0101919174194336, "learning_rate": 9.995044293607355e-05, "loss": 0.9045, "step": 104 }, { "epoch": 0.1358783565189259, "grad_norm": 0.8396942019462585, "learning_rate": 9.994859081188943e-05, "loss": 0.867, "step": 105 }, { "epoch": 0.13717243610482044, "grad_norm": 1.04515540599823, "learning_rate": 9.99467047272887e-05, "loss": 0.9693, "step": 106 }, { "epoch": 0.13846651569071497, "grad_norm": 1.099042534828186, "learning_rate": 9.994478468355369e-05, "loss": 0.8879, "step": 107 }, { "epoch": 0.1397605952766095, "grad_norm": 0.8710360527038574, "learning_rate": 9.994283068198988e-05, "loss": 0.9018, "step": 108 }, { "epoch": 0.14105467486250403, "grad_norm": 0.961025059223175, "learning_rate": 9.99408427239258e-05, "loss": 0.8806, "step": 109 }, { "epoch": 0.1423487544483986, "grad_norm": 0.915665328502655, "learning_rate": 9.993882081071306e-05, "loss": 0.8628, "step": 110 }, { "epoch": 0.14364283403429312, "grad_norm": 1.2776648998260498, "learning_rate": 9.993676494372642e-05, "loss": 0.9742, "step": 111 }, { "epoch": 0.14493691362018765, "grad_norm": 1.1270071268081665, "learning_rate": 9.993467512436364e-05, "loss": 0.9729, "step": 112 }, { "epoch": 0.14623099320608218, "grad_norm": 0.8188664317131042, "learning_rate": 9.99325513540456e-05, "loss": 0.9428, "step": 113 }, { "epoch": 0.1475250727919767, "grad_norm": 1.0760393142700195, "learning_rate": 9.993039363421627e-05, "loss": 0.9482, "step": 114 }, { "epoch": 0.14881915237787124, "grad_norm": 1.019920825958252, "learning_rate": 9.992820196634273e-05, "loss": 0.9785, "step": 115 }, { "epoch": 0.15011323196376578, "grad_norm": 0.8342046737670898, "learning_rate": 9.992597635191509e-05, "loss": 0.9291, "step": 116 }, { "epoch": 0.1514073115496603, "grad_norm": 0.8460632562637329, "learning_rate": 9.992371679244658e-05, "loss": 0.8797, "step": 117 }, { "epoch": 0.15270139113555484, "grad_norm": 0.933060348033905, "learning_rate": 9.992142328947345e-05, "loss": 0.9657, "step": 118 }, { "epoch": 0.15399547072144937, "grad_norm": 0.8822593688964844, "learning_rate": 9.991909584455511e-05, "loss": 0.8872, "step": 119 }, { "epoch": 0.1552895503073439, "grad_norm": 0.9599350094795227, "learning_rate": 9.991673445927398e-05, "loss": 0.9064, "step": 120 }, { "epoch": 0.15658362989323843, "grad_norm": 0.8505874872207642, "learning_rate": 9.99143391352356e-05, "loss": 0.9966, "step": 121 }, { "epoch": 0.15787770947913296, "grad_norm": 1.3977786302566528, "learning_rate": 9.991190987406857e-05, "loss": 0.9145, "step": 122 }, { "epoch": 0.1591717890650275, "grad_norm": 0.8947294354438782, "learning_rate": 9.990944667742455e-05, "loss": 0.9569, "step": 123 }, { "epoch": 0.16046586865092202, "grad_norm": 0.7973839044570923, "learning_rate": 9.990694954697828e-05, "loss": 0.8853, "step": 124 }, { "epoch": 0.16175994823681655, "grad_norm": 0.9481159448623657, "learning_rate": 9.99044184844276e-05, "loss": 1.04, "step": 125 }, { "epoch": 0.16305402782271108, "grad_norm": 1.3568611145019531, "learning_rate": 9.990185349149339e-05, "loss": 1.1104, "step": 126 }, { "epoch": 0.16434810740860564, "grad_norm": 0.900867223739624, "learning_rate": 9.98992545699196e-05, "loss": 0.8427, "step": 127 }, { "epoch": 0.16564218699450017, "grad_norm": 0.9025059938430786, "learning_rate": 9.989662172147326e-05, "loss": 0.9671, "step": 128 }, { "epoch": 0.1669362665803947, "grad_norm": 0.944692850112915, "learning_rate": 9.989395494794446e-05, "loss": 1.0966, "step": 129 }, { "epoch": 0.16823034616628924, "grad_norm": 1.196311116218567, "learning_rate": 9.989125425114638e-05, "loss": 1.0888, "step": 130 }, { "epoch": 0.16952442575218377, "grad_norm": 0.9069584608078003, "learning_rate": 9.988851963291522e-05, "loss": 0.8579, "step": 131 }, { "epoch": 0.1708185053380783, "grad_norm": 0.8150789141654968, "learning_rate": 9.988575109511026e-05, "loss": 0.7622, "step": 132 }, { "epoch": 0.17211258492397283, "grad_norm": 1.0844395160675049, "learning_rate": 9.988294863961387e-05, "loss": 0.9284, "step": 133 }, { "epoch": 0.17340666450986736, "grad_norm": 1.0463049411773682, "learning_rate": 9.988011226833146e-05, "loss": 0.9185, "step": 134 }, { "epoch": 0.1747007440957619, "grad_norm": 0.9481234550476074, "learning_rate": 9.987724198319148e-05, "loss": 0.8631, "step": 135 }, { "epoch": 0.17599482368165642, "grad_norm": 0.882074773311615, "learning_rate": 9.987433778614549e-05, "loss": 0.8997, "step": 136 }, { "epoch": 0.17728890326755095, "grad_norm": 0.9853332042694092, "learning_rate": 9.987139967916805e-05, "loss": 0.9226, "step": 137 }, { "epoch": 0.17858298285344548, "grad_norm": 1.151941180229187, "learning_rate": 9.98684276642568e-05, "loss": 1.0486, "step": 138 }, { "epoch": 0.17987706243934, "grad_norm": 1.0128459930419922, "learning_rate": 9.986542174343245e-05, "loss": 1.0797, "step": 139 }, { "epoch": 0.18117114202523454, "grad_norm": 0.9798718094825745, "learning_rate": 9.986238191873874e-05, "loss": 0.875, "step": 140 }, { "epoch": 0.18246522161112907, "grad_norm": 0.8143295645713806, "learning_rate": 9.985930819224247e-05, "loss": 0.8454, "step": 141 }, { "epoch": 0.1837593011970236, "grad_norm": 0.8755755424499512, "learning_rate": 9.985620056603348e-05, "loss": 0.8029, "step": 142 }, { "epoch": 0.18505338078291814, "grad_norm": 0.899174690246582, "learning_rate": 9.985305904222469e-05, "loss": 0.9608, "step": 143 }, { "epoch": 0.1863474603688127, "grad_norm": 0.920137882232666, "learning_rate": 9.984988362295203e-05, "loss": 0.9022, "step": 144 }, { "epoch": 0.18764153995470723, "grad_norm": 1.1012908220291138, "learning_rate": 9.984667431037447e-05, "loss": 0.9621, "step": 145 }, { "epoch": 0.18893561954060176, "grad_norm": 0.8609358668327332, "learning_rate": 9.98434311066741e-05, "loss": 0.917, "step": 146 }, { "epoch": 0.1902296991264963, "grad_norm": 0.8248727321624756, "learning_rate": 9.984015401405594e-05, "loss": 0.7864, "step": 147 }, { "epoch": 0.19152377871239082, "grad_norm": 0.8680225610733032, "learning_rate": 9.983684303474815e-05, "loss": 0.9288, "step": 148 }, { "epoch": 0.19281785829828535, "grad_norm": 1.0807067155838013, "learning_rate": 9.983349817100188e-05, "loss": 0.9842, "step": 149 }, { "epoch": 0.19411193788417988, "grad_norm": 0.9310898780822754, "learning_rate": 9.983011942509131e-05, "loss": 1.0568, "step": 150 }, { "epoch": 0.1954060174700744, "grad_norm": 0.8052242398262024, "learning_rate": 9.98267067993137e-05, "loss": 0.8235, "step": 151 }, { "epoch": 0.19670009705596894, "grad_norm": 0.9700384140014648, "learning_rate": 9.982326029598931e-05, "loss": 0.8611, "step": 152 }, { "epoch": 0.19799417664186347, "grad_norm": 0.8437764048576355, "learning_rate": 9.981977991746142e-05, "loss": 0.83, "step": 153 }, { "epoch": 0.199288256227758, "grad_norm": 0.930636465549469, "learning_rate": 9.98162656660964e-05, "loss": 1.0892, "step": 154 }, { "epoch": 0.20058233581365253, "grad_norm": 0.9111954569816589, "learning_rate": 9.98127175442836e-05, "loss": 0.9962, "step": 155 }, { "epoch": 0.20187641539954707, "grad_norm": 0.9521974921226501, "learning_rate": 9.980913555443541e-05, "loss": 0.911, "step": 156 }, { "epoch": 0.2031704949854416, "grad_norm": 0.8516745567321777, "learning_rate": 9.980551969898727e-05, "loss": 0.9009, "step": 157 }, { "epoch": 0.20446457457133613, "grad_norm": 0.8302998542785645, "learning_rate": 9.98018699803976e-05, "loss": 0.8854, "step": 158 }, { "epoch": 0.20575865415723066, "grad_norm": 0.814391016960144, "learning_rate": 9.979818640114789e-05, "loss": 0.9601, "step": 159 }, { "epoch": 0.2070527337431252, "grad_norm": 0.8938564658164978, "learning_rate": 9.979446896374262e-05, "loss": 0.8834, "step": 160 }, { "epoch": 0.20834681332901975, "grad_norm": 0.9066985249519348, "learning_rate": 9.979071767070932e-05, "loss": 0.7427, "step": 161 }, { "epoch": 0.20964089291491428, "grad_norm": 0.7866595983505249, "learning_rate": 9.978693252459851e-05, "loss": 0.8556, "step": 162 }, { "epoch": 0.2109349725008088, "grad_norm": 0.9159708023071289, "learning_rate": 9.978311352798374e-05, "loss": 0.8101, "step": 163 }, { "epoch": 0.21222905208670334, "grad_norm": 1.1350793838500977, "learning_rate": 9.977926068346157e-05, "loss": 0.9374, "step": 164 }, { "epoch": 0.21352313167259787, "grad_norm": 1.0535932779312134, "learning_rate": 9.977537399365159e-05, "loss": 1.0238, "step": 165 }, { "epoch": 0.2148172112584924, "grad_norm": 0.8717033267021179, "learning_rate": 9.977145346119637e-05, "loss": 1.0265, "step": 166 }, { "epoch": 0.21611129084438693, "grad_norm": 0.8357003927230835, "learning_rate": 9.976749908876152e-05, "loss": 0.9016, "step": 167 }, { "epoch": 0.21740537043028146, "grad_norm": 0.8369495868682861, "learning_rate": 9.976351087903568e-05, "loss": 0.8764, "step": 168 }, { "epoch": 0.218699450016176, "grad_norm": 0.912352979183197, "learning_rate": 9.97594888347304e-05, "loss": 0.9078, "step": 169 }, { "epoch": 0.21999352960207053, "grad_norm": 0.8475804328918457, "learning_rate": 9.975543295858035e-05, "loss": 0.8836, "step": 170 }, { "epoch": 0.22128760918796506, "grad_norm": 0.8391397595405579, "learning_rate": 9.97513432533431e-05, "loss": 0.9003, "step": 171 }, { "epoch": 0.2225816887738596, "grad_norm": 0.9666821360588074, "learning_rate": 9.974721972179931e-05, "loss": 0.9528, "step": 172 }, { "epoch": 0.22387576835975412, "grad_norm": 0.9321691393852234, "learning_rate": 9.974306236675259e-05, "loss": 0.9575, "step": 173 }, { "epoch": 0.22516984794564865, "grad_norm": 0.8022271990776062, "learning_rate": 9.973887119102957e-05, "loss": 0.8731, "step": 174 }, { "epoch": 0.22646392753154318, "grad_norm": 1.1056872606277466, "learning_rate": 9.973464619747983e-05, "loss": 0.9925, "step": 175 }, { "epoch": 0.2277580071174377, "grad_norm": 0.810420036315918, "learning_rate": 9.9730387388976e-05, "loss": 1.0073, "step": 176 }, { "epoch": 0.22905208670333224, "grad_norm": 0.9536454677581787, "learning_rate": 9.972609476841367e-05, "loss": 0.9595, "step": 177 }, { "epoch": 0.2303461662892268, "grad_norm": 0.8205066919326782, "learning_rate": 9.972176833871142e-05, "loss": 0.8146, "step": 178 }, { "epoch": 0.23164024587512133, "grad_norm": 0.9716495275497437, "learning_rate": 9.971740810281083e-05, "loss": 1.0377, "step": 179 }, { "epoch": 0.23293432546101586, "grad_norm": 0.828642725944519, "learning_rate": 9.971301406367644e-05, "loss": 0.8619, "step": 180 }, { "epoch": 0.2342284050469104, "grad_norm": 0.6980477571487427, "learning_rate": 9.970858622429579e-05, "loss": 0.8271, "step": 181 }, { "epoch": 0.23552248463280492, "grad_norm": 0.954387903213501, "learning_rate": 9.970412458767943e-05, "loss": 0.8465, "step": 182 }, { "epoch": 0.23681656421869945, "grad_norm": 0.8425692915916443, "learning_rate": 9.969962915686083e-05, "loss": 0.8893, "step": 183 }, { "epoch": 0.23811064380459399, "grad_norm": 0.8565071225166321, "learning_rate": 9.969509993489647e-05, "loss": 0.939, "step": 184 }, { "epoch": 0.23940472339048852, "grad_norm": 0.8831691145896912, "learning_rate": 9.969053692486583e-05, "loss": 0.8907, "step": 185 }, { "epoch": 0.24069880297638305, "grad_norm": 0.9661678075790405, "learning_rate": 9.96859401298713e-05, "loss": 0.9191, "step": 186 }, { "epoch": 0.24199288256227758, "grad_norm": 0.8784729838371277, "learning_rate": 9.968130955303828e-05, "loss": 1.0393, "step": 187 }, { "epoch": 0.2432869621481721, "grad_norm": 0.8830071091651917, "learning_rate": 9.967664519751515e-05, "loss": 0.9837, "step": 188 }, { "epoch": 0.24458104173406664, "grad_norm": 0.862108588218689, "learning_rate": 9.967194706647322e-05, "loss": 0.7871, "step": 189 }, { "epoch": 0.24587512131996117, "grad_norm": 1.0068063735961914, "learning_rate": 9.966721516310682e-05, "loss": 0.9526, "step": 190 }, { "epoch": 0.2471692009058557, "grad_norm": 0.9828710556030273, "learning_rate": 9.966244949063316e-05, "loss": 0.8923, "step": 191 }, { "epoch": 0.24846328049175023, "grad_norm": 1.0729883909225464, "learning_rate": 9.965765005229248e-05, "loss": 1.0115, "step": 192 }, { "epoch": 0.24975736007764476, "grad_norm": 0.9844326972961426, "learning_rate": 9.965281685134796e-05, "loss": 0.9855, "step": 193 }, { "epoch": 0.2510514396635393, "grad_norm": 1.1593172550201416, "learning_rate": 9.96479498910857e-05, "loss": 1.0912, "step": 194 }, { "epoch": 0.2523455192494338, "grad_norm": 0.8835370540618896, "learning_rate": 9.964304917481482e-05, "loss": 0.9951, "step": 195 }, { "epoch": 0.2536395988353284, "grad_norm": 0.9553850889205933, "learning_rate": 9.963811470586733e-05, "loss": 0.9335, "step": 196 }, { "epoch": 0.2549336784212229, "grad_norm": 0.863814115524292, "learning_rate": 9.963314648759823e-05, "loss": 1.0203, "step": 197 }, { "epoch": 0.25622775800711745, "grad_norm": 0.9639378786087036, "learning_rate": 9.962814452338542e-05, "loss": 1.0357, "step": 198 }, { "epoch": 0.25752183759301195, "grad_norm": 0.880519688129425, "learning_rate": 9.96231088166298e-05, "loss": 0.9964, "step": 199 }, { "epoch": 0.2588159171789065, "grad_norm": 0.8445360064506531, "learning_rate": 9.961803937075516e-05, "loss": 0.9766, "step": 200 }, { "epoch": 0.260109996764801, "grad_norm": 0.8204835057258606, "learning_rate": 9.961293618920826e-05, "loss": 0.8864, "step": 201 }, { "epoch": 0.26140407635069557, "grad_norm": 0.9315406084060669, "learning_rate": 9.960779927545883e-05, "loss": 1.0388, "step": 202 }, { "epoch": 0.2626981559365901, "grad_norm": 0.9286762475967407, "learning_rate": 9.960262863299943e-05, "loss": 0.9653, "step": 203 }, { "epoch": 0.26399223552248463, "grad_norm": 0.8037816882133484, "learning_rate": 9.959742426534566e-05, "loss": 0.765, "step": 204 }, { "epoch": 0.2652863151083792, "grad_norm": 0.9435904622077942, "learning_rate": 9.9592186176036e-05, "loss": 0.8889, "step": 205 }, { "epoch": 0.2665803946942737, "grad_norm": 1.0072762966156006, "learning_rate": 9.958691436863188e-05, "loss": 0.8358, "step": 206 }, { "epoch": 0.26787447428016825, "grad_norm": 0.9463568329811096, "learning_rate": 9.958160884671761e-05, "loss": 0.8815, "step": 207 }, { "epoch": 0.26916855386606275, "grad_norm": 0.9203188419342041, "learning_rate": 9.957626961390047e-05, "loss": 0.9312, "step": 208 }, { "epoch": 0.2704626334519573, "grad_norm": 1.0614677667617798, "learning_rate": 9.957089667381064e-05, "loss": 0.9822, "step": 209 }, { "epoch": 0.2717567130378518, "grad_norm": 0.8971818089485168, "learning_rate": 9.956549003010123e-05, "loss": 0.9421, "step": 210 }, { "epoch": 0.2730507926237464, "grad_norm": 0.9978768825531006, "learning_rate": 9.956004968644825e-05, "loss": 0.9539, "step": 211 }, { "epoch": 0.2743448722096409, "grad_norm": 0.7017274498939514, "learning_rate": 9.955457564655064e-05, "loss": 0.665, "step": 212 }, { "epoch": 0.27563895179553544, "grad_norm": 0.8292055726051331, "learning_rate": 9.954906791413023e-05, "loss": 0.922, "step": 213 }, { "epoch": 0.27693303138142994, "grad_norm": 0.978084146976471, "learning_rate": 9.954352649293178e-05, "loss": 0.9465, "step": 214 }, { "epoch": 0.2782271109673245, "grad_norm": 1.0260313749313354, "learning_rate": 9.953795138672291e-05, "loss": 0.9093, "step": 215 }, { "epoch": 0.279521190553219, "grad_norm": 1.162850022315979, "learning_rate": 9.95323425992942e-05, "loss": 1.0372, "step": 216 }, { "epoch": 0.28081527013911356, "grad_norm": 0.9785279631614685, "learning_rate": 9.952670013445913e-05, "loss": 0.8818, "step": 217 }, { "epoch": 0.28210934972500806, "grad_norm": 0.9386499524116516, "learning_rate": 9.9521023996054e-05, "loss": 0.8711, "step": 218 }, { "epoch": 0.2834034293109026, "grad_norm": 0.8620506525039673, "learning_rate": 9.951531418793812e-05, "loss": 1.011, "step": 219 }, { "epoch": 0.2846975088967972, "grad_norm": 0.9523435831069946, "learning_rate": 9.950957071399357e-05, "loss": 0.8541, "step": 220 }, { "epoch": 0.2859915884826917, "grad_norm": 0.8993477821350098, "learning_rate": 9.950379357812543e-05, "loss": 1.0253, "step": 221 }, { "epoch": 0.28728566806858624, "grad_norm": 1.073880910873413, "learning_rate": 9.949798278426158e-05, "loss": 1.115, "step": 222 }, { "epoch": 0.28857974765448074, "grad_norm": 0.7941976189613342, "learning_rate": 9.949213833635285e-05, "loss": 0.9398, "step": 223 }, { "epoch": 0.2898738272403753, "grad_norm": 0.798089325428009, "learning_rate": 9.948626023837291e-05, "loss": 0.8523, "step": 224 }, { "epoch": 0.2911679068262698, "grad_norm": 1.0251280069351196, "learning_rate": 9.948034849431831e-05, "loss": 0.939, "step": 225 }, { "epoch": 0.29246198641216437, "grad_norm": 0.9793195724487305, "learning_rate": 9.947440310820852e-05, "loss": 1.0998, "step": 226 }, { "epoch": 0.29375606599805887, "grad_norm": 0.8190125823020935, "learning_rate": 9.946842408408583e-05, "loss": 0.9606, "step": 227 }, { "epoch": 0.2950501455839534, "grad_norm": 0.8229602575302124, "learning_rate": 9.946241142601543e-05, "loss": 0.7944, "step": 228 }, { "epoch": 0.29634422516984793, "grad_norm": 0.8640865683555603, "learning_rate": 9.945636513808537e-05, "loss": 1.112, "step": 229 }, { "epoch": 0.2976383047557425, "grad_norm": 0.774501621723175, "learning_rate": 9.945028522440653e-05, "loss": 0.8986, "step": 230 }, { "epoch": 0.298932384341637, "grad_norm": 0.9039688110351562, "learning_rate": 9.944417168911275e-05, "loss": 1.0461, "step": 231 }, { "epoch": 0.30022646392753155, "grad_norm": 0.8048250675201416, "learning_rate": 9.943802453636065e-05, "loss": 0.89, "step": 232 }, { "epoch": 0.30152054351342605, "grad_norm": 0.8166521787643433, "learning_rate": 9.94318437703297e-05, "loss": 0.9149, "step": 233 }, { "epoch": 0.3028146230993206, "grad_norm": 0.7571333646774292, "learning_rate": 9.942562939522228e-05, "loss": 0.9655, "step": 234 }, { "epoch": 0.3041087026852151, "grad_norm": 0.6913223266601562, "learning_rate": 9.941938141526354e-05, "loss": 0.869, "step": 235 }, { "epoch": 0.3054027822711097, "grad_norm": 0.8476676344871521, "learning_rate": 9.94130998347016e-05, "loss": 0.8849, "step": 236 }, { "epoch": 0.30669686185700423, "grad_norm": 0.8454031944274902, "learning_rate": 9.940678465780728e-05, "loss": 0.9102, "step": 237 }, { "epoch": 0.30799094144289874, "grad_norm": 0.8514583706855774, "learning_rate": 9.940043588887438e-05, "loss": 0.9723, "step": 238 }, { "epoch": 0.3092850210287933, "grad_norm": 0.7330415844917297, "learning_rate": 9.939405353221942e-05, "loss": 0.9537, "step": 239 }, { "epoch": 0.3105791006146878, "grad_norm": 0.9652897715568542, "learning_rate": 9.938763759218185e-05, "loss": 0.9736, "step": 240 }, { "epoch": 0.31187318020058236, "grad_norm": 0.7517886161804199, "learning_rate": 9.93811880731239e-05, "loss": 0.8436, "step": 241 }, { "epoch": 0.31316725978647686, "grad_norm": 0.8159210681915283, "learning_rate": 9.937470497943064e-05, "loss": 0.7521, "step": 242 }, { "epoch": 0.3144613393723714, "grad_norm": 0.9554911851882935, "learning_rate": 9.936818831550998e-05, "loss": 1.1076, "step": 243 }, { "epoch": 0.3157554189582659, "grad_norm": 0.8745877742767334, "learning_rate": 9.936163808579266e-05, "loss": 0.8908, "step": 244 }, { "epoch": 0.3170494985441605, "grad_norm": 0.8050674200057983, "learning_rate": 9.93550542947322e-05, "loss": 1.0134, "step": 245 }, { "epoch": 0.318343578130055, "grad_norm": 1.009790062904358, "learning_rate": 9.9348436946805e-05, "loss": 1.0264, "step": 246 }, { "epoch": 0.31963765771594954, "grad_norm": 0.8702448606491089, "learning_rate": 9.934178604651023e-05, "loss": 1.0067, "step": 247 }, { "epoch": 0.32093173730184404, "grad_norm": 0.8105303049087524, "learning_rate": 9.933510159836989e-05, "loss": 0.8121, "step": 248 }, { "epoch": 0.3222258168877386, "grad_norm": 0.7680085897445679, "learning_rate": 9.932838360692878e-05, "loss": 0.8951, "step": 249 }, { "epoch": 0.3235198964736331, "grad_norm": 0.8338052034378052, "learning_rate": 9.93216320767545e-05, "loss": 0.8878, "step": 250 }, { "epoch": 0.32481397605952766, "grad_norm": 0.8709661960601807, "learning_rate": 9.93148470124375e-05, "loss": 0.8786, "step": 251 }, { "epoch": 0.32610805564542217, "grad_norm": 0.9593453407287598, "learning_rate": 9.930802841859095e-05, "loss": 1.0659, "step": 252 }, { "epoch": 0.3274021352313167, "grad_norm": 1.0130974054336548, "learning_rate": 9.93011762998509e-05, "loss": 0.9626, "step": 253 }, { "epoch": 0.3286962148172113, "grad_norm": 0.9949910640716553, "learning_rate": 9.929429066087616e-05, "loss": 1.0499, "step": 254 }, { "epoch": 0.3299902944031058, "grad_norm": 0.927542507648468, "learning_rate": 9.92873715063483e-05, "loss": 0.9571, "step": 255 }, { "epoch": 0.33128437398900035, "grad_norm": 1.0638468265533447, "learning_rate": 9.92804188409717e-05, "loss": 1.0293, "step": 256 }, { "epoch": 0.33257845357489485, "grad_norm": 0.7083877325057983, "learning_rate": 9.927343266947356e-05, "loss": 0.875, "step": 257 }, { "epoch": 0.3338725331607894, "grad_norm": 0.7915517091751099, "learning_rate": 9.92664129966038e-05, "loss": 0.8848, "step": 258 }, { "epoch": 0.3351666127466839, "grad_norm": 0.8054295182228088, "learning_rate": 9.925935982713518e-05, "loss": 0.8981, "step": 259 }, { "epoch": 0.33646069233257847, "grad_norm": 1.012574553489685, "learning_rate": 9.925227316586316e-05, "loss": 0.8119, "step": 260 }, { "epoch": 0.337754771918473, "grad_norm": 0.8329979181289673, "learning_rate": 9.924515301760606e-05, "loss": 0.8467, "step": 261 }, { "epoch": 0.33904885150436753, "grad_norm": 0.801017701625824, "learning_rate": 9.923799938720488e-05, "loss": 0.8333, "step": 262 }, { "epoch": 0.34034293109026204, "grad_norm": 0.9083892703056335, "learning_rate": 9.923081227952347e-05, "loss": 0.8727, "step": 263 }, { "epoch": 0.3416370106761566, "grad_norm": 0.7917154431343079, "learning_rate": 9.922359169944834e-05, "loss": 1.0341, "step": 264 }, { "epoch": 0.3429310902620511, "grad_norm": 0.6865798234939575, "learning_rate": 9.921633765188886e-05, "loss": 0.9117, "step": 265 }, { "epoch": 0.34422516984794566, "grad_norm": 0.9448872208595276, "learning_rate": 9.92090501417771e-05, "loss": 1.1399, "step": 266 }, { "epoch": 0.34551924943384016, "grad_norm": 0.8711137175559998, "learning_rate": 9.920172917406789e-05, "loss": 0.8158, "step": 267 }, { "epoch": 0.3468133290197347, "grad_norm": 0.7883780598640442, "learning_rate": 9.919437475373882e-05, "loss": 1.0259, "step": 268 }, { "epoch": 0.3481074086056292, "grad_norm": 0.7776859402656555, "learning_rate": 9.91869868857902e-05, "loss": 0.9145, "step": 269 }, { "epoch": 0.3494014881915238, "grad_norm": 0.8383511900901794, "learning_rate": 9.91795655752451e-05, "loss": 0.8769, "step": 270 }, { "epoch": 0.35069556777741834, "grad_norm": 0.8562968373298645, "learning_rate": 9.917211082714933e-05, "loss": 1.0078, "step": 271 }, { "epoch": 0.35198964736331284, "grad_norm": 0.810517430305481, "learning_rate": 9.916462264657142e-05, "loss": 0.8847, "step": 272 }, { "epoch": 0.3532837269492074, "grad_norm": 0.8934103846549988, "learning_rate": 9.915710103860263e-05, "loss": 0.8633, "step": 273 }, { "epoch": 0.3545778065351019, "grad_norm": 0.8963167667388916, "learning_rate": 9.914954600835699e-05, "loss": 0.8411, "step": 274 }, { "epoch": 0.35587188612099646, "grad_norm": 0.774557888507843, "learning_rate": 9.91419575609712e-05, "loss": 0.958, "step": 275 }, { "epoch": 0.35716596570689096, "grad_norm": 0.8483523726463318, "learning_rate": 9.913433570160469e-05, "loss": 0.905, "step": 276 }, { "epoch": 0.3584600452927855, "grad_norm": 0.8082010746002197, "learning_rate": 9.912668043543964e-05, "loss": 0.8633, "step": 277 }, { "epoch": 0.35975412487868, "grad_norm": 0.8654133677482605, "learning_rate": 9.911899176768091e-05, "loss": 0.8465, "step": 278 }, { "epoch": 0.3610482044645746, "grad_norm": 0.8709694743156433, "learning_rate": 9.911126970355609e-05, "loss": 0.9286, "step": 279 }, { "epoch": 0.3623422840504691, "grad_norm": 0.8783992528915405, "learning_rate": 9.910351424831546e-05, "loss": 0.9349, "step": 280 }, { "epoch": 0.36363636363636365, "grad_norm": 0.8846459984779358, "learning_rate": 9.909572540723202e-05, "loss": 0.7986, "step": 281 }, { "epoch": 0.36493044322225815, "grad_norm": 0.8271780610084534, "learning_rate": 9.908790318560146e-05, "loss": 0.8179, "step": 282 }, { "epoch": 0.3662245228081527, "grad_norm": 0.8694506883621216, "learning_rate": 9.908004758874216e-05, "loss": 0.8453, "step": 283 }, { "epoch": 0.3675186023940472, "grad_norm": 0.8625207543373108, "learning_rate": 9.90721586219952e-05, "loss": 0.7825, "step": 284 }, { "epoch": 0.36881268197994177, "grad_norm": 0.8270084261894226, "learning_rate": 9.906423629072434e-05, "loss": 1.0889, "step": 285 }, { "epoch": 0.3701067615658363, "grad_norm": 0.8891452550888062, "learning_rate": 9.905628060031605e-05, "loss": 0.8847, "step": 286 }, { "epoch": 0.37140084115173083, "grad_norm": 0.8682214021682739, "learning_rate": 9.904829155617945e-05, "loss": 0.9311, "step": 287 }, { "epoch": 0.3726949207376254, "grad_norm": 1.2218021154403687, "learning_rate": 9.904026916374636e-05, "loss": 0.92, "step": 288 }, { "epoch": 0.3739890003235199, "grad_norm": 1.0069034099578857, "learning_rate": 9.903221342847125e-05, "loss": 1.0061, "step": 289 }, { "epoch": 0.37528307990941445, "grad_norm": 0.7723405361175537, "learning_rate": 9.902412435583128e-05, "loss": 0.8627, "step": 290 }, { "epoch": 0.37657715949530896, "grad_norm": 0.8336161971092224, "learning_rate": 9.901600195132627e-05, "loss": 0.9815, "step": 291 }, { "epoch": 0.3778712390812035, "grad_norm": 0.8170490264892578, "learning_rate": 9.90078462204787e-05, "loss": 0.8305, "step": 292 }, { "epoch": 0.379165318667098, "grad_norm": 0.7367318868637085, "learning_rate": 9.899965716883372e-05, "loss": 0.8314, "step": 293 }, { "epoch": 0.3804593982529926, "grad_norm": 0.8122355937957764, "learning_rate": 9.899143480195913e-05, "loss": 0.8369, "step": 294 }, { "epoch": 0.3817534778388871, "grad_norm": 0.7415926456451416, "learning_rate": 9.898317912544536e-05, "loss": 0.8985, "step": 295 }, { "epoch": 0.38304755742478164, "grad_norm": 0.8654418587684631, "learning_rate": 9.897489014490553e-05, "loss": 0.8752, "step": 296 }, { "epoch": 0.38434163701067614, "grad_norm": 0.7863161563873291, "learning_rate": 9.896656786597535e-05, "loss": 1.0138, "step": 297 }, { "epoch": 0.3856357165965707, "grad_norm": 0.8346667885780334, "learning_rate": 9.895821229431323e-05, "loss": 0.94, "step": 298 }, { "epoch": 0.3869297961824652, "grad_norm": 0.9577547907829285, "learning_rate": 9.894982343560016e-05, "loss": 0.9967, "step": 299 }, { "epoch": 0.38822387576835976, "grad_norm": 0.7633039951324463, "learning_rate": 9.894140129553981e-05, "loss": 0.8469, "step": 300 }, { "epoch": 0.38951795535425426, "grad_norm": 0.8275448083877563, "learning_rate": 9.893294587985843e-05, "loss": 0.8295, "step": 301 }, { "epoch": 0.3908120349401488, "grad_norm": 0.9197372198104858, "learning_rate": 9.892445719430493e-05, "loss": 0.9363, "step": 302 }, { "epoch": 0.3921061145260433, "grad_norm": 0.7137726545333862, "learning_rate": 9.891593524465083e-05, "loss": 0.8207, "step": 303 }, { "epoch": 0.3934001941119379, "grad_norm": 0.9908462762832642, "learning_rate": 9.890738003669029e-05, "loss": 0.9287, "step": 304 }, { "epoch": 0.39469427369783244, "grad_norm": 1.0070343017578125, "learning_rate": 9.889879157624002e-05, "loss": 1.2092, "step": 305 }, { "epoch": 0.39598835328372695, "grad_norm": 0.7812051177024841, "learning_rate": 9.889016986913941e-05, "loss": 0.8149, "step": 306 }, { "epoch": 0.3972824328696215, "grad_norm": 0.8907694816589355, "learning_rate": 9.888151492125039e-05, "loss": 0.9008, "step": 307 }, { "epoch": 0.398576512455516, "grad_norm": 0.960164487361908, "learning_rate": 9.887282673845754e-05, "loss": 0.9437, "step": 308 }, { "epoch": 0.39987059204141057, "grad_norm": 0.8578314185142517, "learning_rate": 9.886410532666805e-05, "loss": 0.8422, "step": 309 }, { "epoch": 0.40116467162730507, "grad_norm": 0.8671669960021973, "learning_rate": 9.885535069181162e-05, "loss": 0.924, "step": 310 }, { "epoch": 0.40245875121319963, "grad_norm": 1.037150263786316, "learning_rate": 9.884656283984062e-05, "loss": 1.0074, "step": 311 }, { "epoch": 0.40375283079909413, "grad_norm": 0.7935649156570435, "learning_rate": 9.883774177672998e-05, "loss": 0.9804, "step": 312 }, { "epoch": 0.4050469103849887, "grad_norm": 0.8249583840370178, "learning_rate": 9.882888750847717e-05, "loss": 0.9627, "step": 313 }, { "epoch": 0.4063409899708832, "grad_norm": 0.7363491058349609, "learning_rate": 9.882000004110233e-05, "loss": 0.8309, "step": 314 }, { "epoch": 0.40763506955677775, "grad_norm": 0.8029589653015137, "learning_rate": 9.881107938064806e-05, "loss": 0.8762, "step": 315 }, { "epoch": 0.40892914914267225, "grad_norm": 0.7131720781326294, "learning_rate": 9.880212553317963e-05, "loss": 0.882, "step": 316 }, { "epoch": 0.4102232287285668, "grad_norm": 0.7951234579086304, "learning_rate": 9.879313850478478e-05, "loss": 1.0095, "step": 317 }, { "epoch": 0.4115173083144613, "grad_norm": 0.7714753150939941, "learning_rate": 9.87841183015739e-05, "loss": 0.8688, "step": 318 }, { "epoch": 0.4128113879003559, "grad_norm": 0.7753300666809082, "learning_rate": 9.877506492967987e-05, "loss": 0.9669, "step": 319 }, { "epoch": 0.4141054674862504, "grad_norm": 0.9767395853996277, "learning_rate": 9.876597839525814e-05, "loss": 1.1169, "step": 320 }, { "epoch": 0.41539954707214494, "grad_norm": 0.7923420667648315, "learning_rate": 9.875685870448672e-05, "loss": 0.9942, "step": 321 }, { "epoch": 0.4166936266580395, "grad_norm": 0.7265552282333374, "learning_rate": 9.874770586356616e-05, "loss": 1.0377, "step": 322 }, { "epoch": 0.417987706243934, "grad_norm": 0.7586270570755005, "learning_rate": 9.873851987871954e-05, "loss": 0.9172, "step": 323 }, { "epoch": 0.41928178582982856, "grad_norm": 0.782192587852478, "learning_rate": 9.872930075619249e-05, "loss": 0.9219, "step": 324 }, { "epoch": 0.42057586541572306, "grad_norm": 0.8508116602897644, "learning_rate": 9.872004850225313e-05, "loss": 0.939, "step": 325 }, { "epoch": 0.4218699450016176, "grad_norm": 0.8639410138130188, "learning_rate": 9.871076312319218e-05, "loss": 1.0854, "step": 326 }, { "epoch": 0.4231640245875121, "grad_norm": 0.809862494468689, "learning_rate": 9.870144462532281e-05, "loss": 0.8283, "step": 327 }, { "epoch": 0.4244581041734067, "grad_norm": 0.7755741477012634, "learning_rate": 9.869209301498072e-05, "loss": 0.7645, "step": 328 }, { "epoch": 0.4257521837593012, "grad_norm": 0.8238282203674316, "learning_rate": 9.868270829852416e-05, "loss": 0.8858, "step": 329 }, { "epoch": 0.42704626334519574, "grad_norm": 0.810691773891449, "learning_rate": 9.867329048233387e-05, "loss": 0.7913, "step": 330 }, { "epoch": 0.42834034293109025, "grad_norm": 0.8913648128509521, "learning_rate": 9.866383957281309e-05, "loss": 0.7885, "step": 331 }, { "epoch": 0.4296344225169848, "grad_norm": 0.9160618782043457, "learning_rate": 9.865435557638757e-05, "loss": 0.7296, "step": 332 }, { "epoch": 0.4309285021028793, "grad_norm": 0.7498170137405396, "learning_rate": 9.864483849950553e-05, "loss": 0.9655, "step": 333 }, { "epoch": 0.43222258168877387, "grad_norm": 0.7315449714660645, "learning_rate": 9.863528834863773e-05, "loss": 0.6886, "step": 334 }, { "epoch": 0.43351666127466837, "grad_norm": 0.8278869390487671, "learning_rate": 9.862570513027735e-05, "loss": 0.9637, "step": 335 }, { "epoch": 0.4348107408605629, "grad_norm": 0.8372804522514343, "learning_rate": 9.861608885094012e-05, "loss": 0.8609, "step": 336 }, { "epoch": 0.43610482044645743, "grad_norm": 0.8712325096130371, "learning_rate": 9.860643951716421e-05, "loss": 0.9718, "step": 337 }, { "epoch": 0.437398900032352, "grad_norm": 0.9869045615196228, "learning_rate": 9.859675713551028e-05, "loss": 0.887, "step": 338 }, { "epoch": 0.43869297961824655, "grad_norm": 0.9166460037231445, "learning_rate": 9.858704171256145e-05, "loss": 1.0751, "step": 339 }, { "epoch": 0.43998705920414105, "grad_norm": 1.1965091228485107, "learning_rate": 9.857729325492329e-05, "loss": 1.0093, "step": 340 }, { "epoch": 0.4412811387900356, "grad_norm": 0.8646867275238037, "learning_rate": 9.856751176922388e-05, "loss": 0.9235, "step": 341 }, { "epoch": 0.4425752183759301, "grad_norm": 0.7576479315757751, "learning_rate": 9.85576972621137e-05, "loss": 0.8323, "step": 342 }, { "epoch": 0.44386929796182467, "grad_norm": 0.8257366418838501, "learning_rate": 9.854784974026572e-05, "loss": 0.8478, "step": 343 }, { "epoch": 0.4451633775477192, "grad_norm": 0.8963577747344971, "learning_rate": 9.853796921037534e-05, "loss": 0.8943, "step": 344 }, { "epoch": 0.44645745713361373, "grad_norm": 0.8696343898773193, "learning_rate": 9.85280556791604e-05, "loss": 0.991, "step": 345 }, { "epoch": 0.44775153671950824, "grad_norm": 0.8321841359138489, "learning_rate": 9.851810915336119e-05, "loss": 1.0462, "step": 346 }, { "epoch": 0.4490456163054028, "grad_norm": 0.8194176554679871, "learning_rate": 9.850812963974042e-05, "loss": 0.9484, "step": 347 }, { "epoch": 0.4503396958912973, "grad_norm": 0.9131197333335876, "learning_rate": 9.849811714508323e-05, "loss": 0.9687, "step": 348 }, { "epoch": 0.45163377547719186, "grad_norm": 0.891231894493103, "learning_rate": 9.848807167619721e-05, "loss": 0.9605, "step": 349 }, { "epoch": 0.45292785506308636, "grad_norm": 0.9574115872383118, "learning_rate": 9.847799323991234e-05, "loss": 1.077, "step": 350 }, { "epoch": 0.4542219346489809, "grad_norm": 0.9179444909095764, "learning_rate": 9.8467881843081e-05, "loss": 0.8353, "step": 351 }, { "epoch": 0.4555160142348754, "grad_norm": 0.855802595615387, "learning_rate": 9.845773749257804e-05, "loss": 0.9764, "step": 352 }, { "epoch": 0.45681009382077, "grad_norm": 0.9086332321166992, "learning_rate": 9.844756019530066e-05, "loss": 1.0526, "step": 353 }, { "epoch": 0.4581041734066645, "grad_norm": 0.890271782875061, "learning_rate": 9.843734995816848e-05, "loss": 0.8905, "step": 354 }, { "epoch": 0.45939825299255904, "grad_norm": 0.7878096699714661, "learning_rate": 9.842710678812351e-05, "loss": 0.8706, "step": 355 }, { "epoch": 0.4606923325784536, "grad_norm": 0.9886014461517334, "learning_rate": 9.841683069213017e-05, "loss": 0.9579, "step": 356 }, { "epoch": 0.4619864121643481, "grad_norm": 0.8265432119369507, "learning_rate": 9.840652167717526e-05, "loss": 0.8528, "step": 357 }, { "epoch": 0.46328049175024266, "grad_norm": 0.7354372143745422, "learning_rate": 9.839617975026793e-05, "loss": 0.775, "step": 358 }, { "epoch": 0.46457457133613717, "grad_norm": 0.8311409950256348, "learning_rate": 9.838580491843976e-05, "loss": 1.0374, "step": 359 }, { "epoch": 0.4658686509220317, "grad_norm": 0.8180521130561829, "learning_rate": 9.837539718874464e-05, "loss": 0.818, "step": 360 }, { "epoch": 0.4671627305079262, "grad_norm": 0.7231691479682922, "learning_rate": 9.83649565682589e-05, "loss": 0.9141, "step": 361 }, { "epoch": 0.4684568100938208, "grad_norm": 0.7644848227500916, "learning_rate": 9.835448306408118e-05, "loss": 0.9565, "step": 362 }, { "epoch": 0.4697508896797153, "grad_norm": 0.7103495001792908, "learning_rate": 9.83439766833325e-05, "loss": 0.7706, "step": 363 }, { "epoch": 0.47104496926560985, "grad_norm": 0.8206077218055725, "learning_rate": 9.83334374331562e-05, "loss": 1.0923, "step": 364 }, { "epoch": 0.47233904885150435, "grad_norm": 0.7409754991531372, "learning_rate": 9.832286532071802e-05, "loss": 0.8534, "step": 365 }, { "epoch": 0.4736331284373989, "grad_norm": 0.8880231380462646, "learning_rate": 9.831226035320602e-05, "loss": 0.9025, "step": 366 }, { "epoch": 0.4749272080232934, "grad_norm": 0.9805209636688232, "learning_rate": 9.830162253783058e-05, "loss": 0.9939, "step": 367 }, { "epoch": 0.47622128760918797, "grad_norm": 0.9529579281806946, "learning_rate": 9.829095188182442e-05, "loss": 0.9645, "step": 368 }, { "epoch": 0.4775153671950825, "grad_norm": 0.8122926354408264, "learning_rate": 9.828024839244263e-05, "loss": 0.9269, "step": 369 }, { "epoch": 0.47880944678097703, "grad_norm": 0.8557335734367371, "learning_rate": 9.826951207696258e-05, "loss": 0.8781, "step": 370 }, { "epoch": 0.48010352636687154, "grad_norm": 0.959077775478363, "learning_rate": 9.825874294268396e-05, "loss": 0.9448, "step": 371 }, { "epoch": 0.4813976059527661, "grad_norm": 0.7479959726333618, "learning_rate": 9.824794099692878e-05, "loss": 0.8608, "step": 372 }, { "epoch": 0.48269168553866065, "grad_norm": 0.8064723014831543, "learning_rate": 9.823710624704137e-05, "loss": 1.1222, "step": 373 }, { "epoch": 0.48398576512455516, "grad_norm": 0.8007084131240845, "learning_rate": 9.822623870038838e-05, "loss": 0.8967, "step": 374 }, { "epoch": 0.4852798447104497, "grad_norm": 0.988571286201477, "learning_rate": 9.82153383643587e-05, "loss": 0.9574, "step": 375 }, { "epoch": 0.4865739242963442, "grad_norm": 0.9317258596420288, "learning_rate": 9.820440524636356e-05, "loss": 0.929, "step": 376 }, { "epoch": 0.4878680038822388, "grad_norm": 0.8651334643363953, "learning_rate": 9.819343935383649e-05, "loss": 0.9483, "step": 377 }, { "epoch": 0.4891620834681333, "grad_norm": 0.791306734085083, "learning_rate": 9.818244069423325e-05, "loss": 0.9277, "step": 378 }, { "epoch": 0.49045616305402784, "grad_norm": 0.6706424355506897, "learning_rate": 9.817140927503192e-05, "loss": 0.9013, "step": 379 }, { "epoch": 0.49175024263992234, "grad_norm": 0.8574838042259216, "learning_rate": 9.816034510373286e-05, "loss": 0.9889, "step": 380 }, { "epoch": 0.4930443222258169, "grad_norm": 0.829309344291687, "learning_rate": 9.814924818785865e-05, "loss": 0.8139, "step": 381 }, { "epoch": 0.4943384018117114, "grad_norm": 0.83943110704422, "learning_rate": 9.81381185349542e-05, "loss": 0.9202, "step": 382 }, { "epoch": 0.49563248139760596, "grad_norm": 0.7981933355331421, "learning_rate": 9.812695615258662e-05, "loss": 0.9131, "step": 383 }, { "epoch": 0.49692656098350046, "grad_norm": 0.7930905818939209, "learning_rate": 9.81157610483453e-05, "loss": 0.769, "step": 384 }, { "epoch": 0.498220640569395, "grad_norm": 0.8699679970741272, "learning_rate": 9.81045332298419e-05, "loss": 0.9468, "step": 385 }, { "epoch": 0.4995147201552895, "grad_norm": 0.7733138799667358, "learning_rate": 9.809327270471025e-05, "loss": 0.8982, "step": 386 }, { "epoch": 0.4995147201552895, "eval_loss": 0.8872498869895935, "eval_runtime": 189.8035, "eval_samples_per_second": 3.43, "eval_steps_per_second": 0.859, "step": 386 }, { "epoch": 0.5008087997411841, "grad_norm": 0.9202268123626709, "learning_rate": 9.808197948060651e-05, "loss": 0.9664, "step": 387 }, { "epoch": 0.5021028793270786, "grad_norm": 0.7772640585899353, "learning_rate": 9.807065356520899e-05, "loss": 0.8825, "step": 388 }, { "epoch": 0.5033969589129732, "grad_norm": 0.8117741346359253, "learning_rate": 9.805929496621828e-05, "loss": 0.8166, "step": 389 }, { "epoch": 0.5046910384988677, "grad_norm": 0.7308422327041626, "learning_rate": 9.804790369135718e-05, "loss": 0.8657, "step": 390 }, { "epoch": 0.5059851180847622, "grad_norm": 0.8373358845710754, "learning_rate": 9.80364797483707e-05, "loss": 0.9175, "step": 391 }, { "epoch": 0.5072791976706568, "grad_norm": 0.8288902640342712, "learning_rate": 9.802502314502607e-05, "loss": 0.7463, "step": 392 }, { "epoch": 0.5085732772565513, "grad_norm": 0.6780114769935608, "learning_rate": 9.801353388911269e-05, "loss": 0.7973, "step": 393 }, { "epoch": 0.5098673568424458, "grad_norm": 0.9328367710113525, "learning_rate": 9.800201198844221e-05, "loss": 1.0405, "step": 394 }, { "epoch": 0.5111614364283403, "grad_norm": 0.9010327458381653, "learning_rate": 9.799045745084847e-05, "loss": 1.1194, "step": 395 }, { "epoch": 0.5124555160142349, "grad_norm": 0.7900609374046326, "learning_rate": 9.797887028418746e-05, "loss": 0.9747, "step": 396 }, { "epoch": 0.5137495956001294, "grad_norm": 0.8245125412940979, "learning_rate": 9.796725049633741e-05, "loss": 0.8594, "step": 397 }, { "epoch": 0.5150436751860239, "grad_norm": 0.7617928385734558, "learning_rate": 9.795559809519866e-05, "loss": 0.8176, "step": 398 }, { "epoch": 0.5163377547719185, "grad_norm": 0.9844444990158081, "learning_rate": 9.79439130886938e-05, "loss": 1.0116, "step": 399 }, { "epoch": 0.517631834357813, "grad_norm": 0.8849684000015259, "learning_rate": 9.793219548476753e-05, "loss": 0.8706, "step": 400 }, { "epoch": 0.5189259139437076, "grad_norm": 0.893902063369751, "learning_rate": 9.792044529138674e-05, "loss": 0.8217, "step": 401 }, { "epoch": 0.520219993529602, "grad_norm": 0.7412934899330139, "learning_rate": 9.79086625165405e-05, "loss": 0.868, "step": 402 }, { "epoch": 0.5215140731154966, "grad_norm": 0.796435534954071, "learning_rate": 9.789684716823995e-05, "loss": 0.8691, "step": 403 }, { "epoch": 0.5228081527013911, "grad_norm": 1.063193440437317, "learning_rate": 9.788499925451849e-05, "loss": 1.0085, "step": 404 }, { "epoch": 0.5241022322872857, "grad_norm": 0.952882707118988, "learning_rate": 9.787311878343157e-05, "loss": 0.8378, "step": 405 }, { "epoch": 0.5253963118731803, "grad_norm": 0.7899916768074036, "learning_rate": 9.786120576305682e-05, "loss": 0.8917, "step": 406 }, { "epoch": 0.5266903914590747, "grad_norm": 0.8242781758308411, "learning_rate": 9.784926020149398e-05, "loss": 0.9778, "step": 407 }, { "epoch": 0.5279844710449693, "grad_norm": 0.9736928343772888, "learning_rate": 9.783728210686496e-05, "loss": 1.0145, "step": 408 }, { "epoch": 0.5292785506308638, "grad_norm": 0.8070263862609863, "learning_rate": 9.782527148731372e-05, "loss": 0.9923, "step": 409 }, { "epoch": 0.5305726302167584, "grad_norm": 0.861262857913971, "learning_rate": 9.781322835100638e-05, "loss": 1.0261, "step": 410 }, { "epoch": 0.5318667098026528, "grad_norm": 0.8204033970832825, "learning_rate": 9.780115270613115e-05, "loss": 0.8865, "step": 411 }, { "epoch": 0.5331607893885474, "grad_norm": 0.8053429126739502, "learning_rate": 9.778904456089838e-05, "loss": 0.8606, "step": 412 }, { "epoch": 0.5344548689744419, "grad_norm": 0.7944838404655457, "learning_rate": 9.777690392354045e-05, "loss": 0.8925, "step": 413 }, { "epoch": 0.5357489485603365, "grad_norm": 0.9882616400718689, "learning_rate": 9.77647308023119e-05, "loss": 0.8577, "step": 414 }, { "epoch": 0.537043028146231, "grad_norm": 0.6937118768692017, "learning_rate": 9.77525252054893e-05, "loss": 0.6972, "step": 415 }, { "epoch": 0.5383371077321255, "grad_norm": 1.0071479082107544, "learning_rate": 9.774028714137133e-05, "loss": 0.9233, "step": 416 }, { "epoch": 0.5396311873180201, "grad_norm": 0.916771411895752, "learning_rate": 9.772801661827874e-05, "loss": 1.0072, "step": 417 }, { "epoch": 0.5409252669039146, "grad_norm": 0.8663278818130493, "learning_rate": 9.771571364455439e-05, "loss": 1.1011, "step": 418 }, { "epoch": 0.5422193464898091, "grad_norm": 0.7842355370521545, "learning_rate": 9.77033782285631e-05, "loss": 1.0382, "step": 419 }, { "epoch": 0.5435134260757036, "grad_norm": 0.8407487273216248, "learning_rate": 9.769101037869187e-05, "loss": 0.9612, "step": 420 }, { "epoch": 0.5448075056615982, "grad_norm": 0.7557950615882874, "learning_rate": 9.767861010334962e-05, "loss": 0.9965, "step": 421 }, { "epoch": 0.5461015852474927, "grad_norm": 0.812099814414978, "learning_rate": 9.766617741096746e-05, "loss": 0.8824, "step": 422 }, { "epoch": 0.5473956648333873, "grad_norm": 0.8438544273376465, "learning_rate": 9.765371230999843e-05, "loss": 0.8852, "step": 423 }, { "epoch": 0.5486897444192818, "grad_norm": 0.7006101608276367, "learning_rate": 9.764121480891765e-05, "loss": 0.808, "step": 424 }, { "epoch": 0.5499838240051763, "grad_norm": 0.7312132716178894, "learning_rate": 9.76286849162223e-05, "loss": 0.7964, "step": 425 }, { "epoch": 0.5512779035910709, "grad_norm": 0.939939558506012, "learning_rate": 9.76161226404315e-05, "loss": 0.9434, "step": 426 }, { "epoch": 0.5525719831769654, "grad_norm": 0.7815152406692505, "learning_rate": 9.760352799008643e-05, "loss": 1.0487, "step": 427 }, { "epoch": 0.5538660627628599, "grad_norm": 0.8934217691421509, "learning_rate": 9.759090097375032e-05, "loss": 1.0036, "step": 428 }, { "epoch": 0.5551601423487544, "grad_norm": 0.7264319658279419, "learning_rate": 9.757824160000837e-05, "loss": 0.7866, "step": 429 }, { "epoch": 0.556454221934649, "grad_norm": 0.8536062836647034, "learning_rate": 9.756554987746776e-05, "loss": 0.9949, "step": 430 }, { "epoch": 0.5577483015205436, "grad_norm": 0.8253523111343384, "learning_rate": 9.755282581475769e-05, "loss": 0.9801, "step": 431 }, { "epoch": 0.559042381106438, "grad_norm": 0.6686350703239441, "learning_rate": 9.754006942052936e-05, "loss": 0.8388, "step": 432 }, { "epoch": 0.5603364606923326, "grad_norm": 0.8560882210731506, "learning_rate": 9.752728070345591e-05, "loss": 1.0167, "step": 433 }, { "epoch": 0.5616305402782271, "grad_norm": 0.765852153301239, "learning_rate": 9.751445967223252e-05, "loss": 0.7865, "step": 434 }, { "epoch": 0.5629246198641217, "grad_norm": 0.8869863152503967, "learning_rate": 9.750160633557627e-05, "loss": 0.9424, "step": 435 }, { "epoch": 0.5642186994500161, "grad_norm": 0.8522469401359558, "learning_rate": 9.748872070222625e-05, "loss": 0.8151, "step": 436 }, { "epoch": 0.5655127790359107, "grad_norm": 0.862091064453125, "learning_rate": 9.747580278094352e-05, "loss": 1.0131, "step": 437 }, { "epoch": 0.5668068586218052, "grad_norm": 0.8809049725532532, "learning_rate": 9.746285258051104e-05, "loss": 0.9092, "step": 438 }, { "epoch": 0.5681009382076998, "grad_norm": 0.8506961464881897, "learning_rate": 9.744987010973377e-05, "loss": 0.8756, "step": 439 }, { "epoch": 0.5693950177935944, "grad_norm": 0.7440844774246216, "learning_rate": 9.743685537743856e-05, "loss": 0.7948, "step": 440 }, { "epoch": 0.5706890973794888, "grad_norm": 0.8117014169692993, "learning_rate": 9.742380839247425e-05, "loss": 0.9382, "step": 441 }, { "epoch": 0.5719831769653834, "grad_norm": 0.8515580296516418, "learning_rate": 9.741072916371157e-05, "loss": 0.9202, "step": 442 }, { "epoch": 0.5732772565512779, "grad_norm": 0.8774899244308472, "learning_rate": 9.739761770004318e-05, "loss": 0.9129, "step": 443 }, { "epoch": 0.5745713361371725, "grad_norm": 0.7189667820930481, "learning_rate": 9.738447401038367e-05, "loss": 0.7392, "step": 444 }, { "epoch": 0.5758654157230669, "grad_norm": 0.7225956916809082, "learning_rate": 9.737129810366952e-05, "loss": 0.8703, "step": 445 }, { "epoch": 0.5771594953089615, "grad_norm": 0.8351484537124634, "learning_rate": 9.735808998885915e-05, "loss": 0.9626, "step": 446 }, { "epoch": 0.578453574894856, "grad_norm": 0.860819399356842, "learning_rate": 9.734484967493282e-05, "loss": 0.9061, "step": 447 }, { "epoch": 0.5797476544807506, "grad_norm": 0.7928848266601562, "learning_rate": 9.733157717089277e-05, "loss": 0.9141, "step": 448 }, { "epoch": 0.581041734066645, "grad_norm": 0.9132540822029114, "learning_rate": 9.7318272485763e-05, "loss": 0.7613, "step": 449 }, { "epoch": 0.5823358136525396, "grad_norm": 0.7894258499145508, "learning_rate": 9.730493562858953e-05, "loss": 0.8234, "step": 450 }, { "epoch": 0.5836298932384342, "grad_norm": 0.8427807688713074, "learning_rate": 9.729156660844017e-05, "loss": 1.1023, "step": 451 }, { "epoch": 0.5849239728243287, "grad_norm": 0.8546382188796997, "learning_rate": 9.727816543440458e-05, "loss": 0.886, "step": 452 }, { "epoch": 0.5862180524102232, "grad_norm": 0.6701838970184326, "learning_rate": 9.726473211559437e-05, "loss": 0.7861, "step": 453 }, { "epoch": 0.5875121319961177, "grad_norm": 1.010311245918274, "learning_rate": 9.725126666114292e-05, "loss": 1.0393, "step": 454 }, { "epoch": 0.5888062115820123, "grad_norm": 0.8440571427345276, "learning_rate": 9.72377690802055e-05, "loss": 0.9959, "step": 455 }, { "epoch": 0.5901002911679069, "grad_norm": 0.8143338561058044, "learning_rate": 9.722423938195922e-05, "loss": 0.9954, "step": 456 }, { "epoch": 0.5913943707538014, "grad_norm": 0.6839838027954102, "learning_rate": 9.721067757560303e-05, "loss": 0.7288, "step": 457 }, { "epoch": 0.5926884503396959, "grad_norm": 0.8407920598983765, "learning_rate": 9.719708367035767e-05, "loss": 0.858, "step": 458 }, { "epoch": 0.5939825299255904, "grad_norm": 0.8388239741325378, "learning_rate": 9.718345767546576e-05, "loss": 0.8455, "step": 459 }, { "epoch": 0.595276609511485, "grad_norm": 0.7476726770401001, "learning_rate": 9.716979960019173e-05, "loss": 0.8261, "step": 460 }, { "epoch": 0.5965706890973795, "grad_norm": 0.766036331653595, "learning_rate": 9.715610945382177e-05, "loss": 0.9601, "step": 461 }, { "epoch": 0.597864768683274, "grad_norm": 0.9155464172363281, "learning_rate": 9.714238724566393e-05, "loss": 0.8716, "step": 462 }, { "epoch": 0.5991588482691685, "grad_norm": 0.8961794972419739, "learning_rate": 9.712863298504807e-05, "loss": 0.9595, "step": 463 }, { "epoch": 0.6004529278550631, "grad_norm": 0.8102921843528748, "learning_rate": 9.711484668132575e-05, "loss": 0.7174, "step": 464 }, { "epoch": 0.6017470074409577, "grad_norm": 0.7466800808906555, "learning_rate": 9.710102834387043e-05, "loss": 0.8707, "step": 465 }, { "epoch": 0.6030410870268521, "grad_norm": 0.7572315335273743, "learning_rate": 9.708717798207729e-05, "loss": 1.0715, "step": 466 }, { "epoch": 0.6043351666127467, "grad_norm": 0.944107174873352, "learning_rate": 9.707329560536328e-05, "loss": 0.9103, "step": 467 }, { "epoch": 0.6056292461986412, "grad_norm": 0.7636998891830444, "learning_rate": 9.705938122316715e-05, "loss": 1.0246, "step": 468 }, { "epoch": 0.6069233257845358, "grad_norm": 0.7484267950057983, "learning_rate": 9.704543484494937e-05, "loss": 1.0018, "step": 469 }, { "epoch": 0.6082174053704302, "grad_norm": 0.89073646068573, "learning_rate": 9.70314564801922e-05, "loss": 0.8776, "step": 470 }, { "epoch": 0.6095114849563248, "grad_norm": 0.8132034540176392, "learning_rate": 9.701744613839963e-05, "loss": 0.9575, "step": 471 }, { "epoch": 0.6108055645422193, "grad_norm": 0.8051428198814392, "learning_rate": 9.700340382909739e-05, "loss": 0.7546, "step": 472 }, { "epoch": 0.6120996441281139, "grad_norm": 0.7600153088569641, "learning_rate": 9.698932956183296e-05, "loss": 0.8556, "step": 473 }, { "epoch": 0.6133937237140085, "grad_norm": 0.9171023964881897, "learning_rate": 9.697522334617555e-05, "loss": 0.8196, "step": 474 }, { "epoch": 0.6146878032999029, "grad_norm": 1.0437992811203003, "learning_rate": 9.696108519171605e-05, "loss": 1.0997, "step": 475 }, { "epoch": 0.6159818828857975, "grad_norm": 0.8450473546981812, "learning_rate": 9.69469151080671e-05, "loss": 1.0144, "step": 476 }, { "epoch": 0.617275962471692, "grad_norm": 0.8718084692955017, "learning_rate": 9.693271310486307e-05, "loss": 1.0536, "step": 477 }, { "epoch": 0.6185700420575866, "grad_norm": 0.858589768409729, "learning_rate": 9.691847919175999e-05, "loss": 0.9851, "step": 478 }, { "epoch": 0.619864121643481, "grad_norm": 0.7903156280517578, "learning_rate": 9.69042133784356e-05, "loss": 1.2178, "step": 479 }, { "epoch": 0.6211582012293756, "grad_norm": 0.6688570976257324, "learning_rate": 9.688991567458933e-05, "loss": 0.7721, "step": 480 }, { "epoch": 0.6224522808152702, "grad_norm": 0.7499825954437256, "learning_rate": 9.687558608994232e-05, "loss": 0.8728, "step": 481 }, { "epoch": 0.6237463604011647, "grad_norm": 0.871703028678894, "learning_rate": 9.686122463423732e-05, "loss": 0.88, "step": 482 }, { "epoch": 0.6250404399870592, "grad_norm": 0.8204110860824585, "learning_rate": 9.684683131723884e-05, "loss": 0.8569, "step": 483 }, { "epoch": 0.6263345195729537, "grad_norm": 0.8779101371765137, "learning_rate": 9.683240614873294e-05, "loss": 0.901, "step": 484 }, { "epoch": 0.6276285991588483, "grad_norm": 0.8231368064880371, "learning_rate": 9.681794913852746e-05, "loss": 0.989, "step": 485 }, { "epoch": 0.6289226787447428, "grad_norm": 0.7921927571296692, "learning_rate": 9.68034602964518e-05, "loss": 0.8748, "step": 486 }, { "epoch": 0.6302167583306373, "grad_norm": 0.8911926746368408, "learning_rate": 9.678893963235704e-05, "loss": 1.0641, "step": 487 }, { "epoch": 0.6315108379165318, "grad_norm": 0.8004783391952515, "learning_rate": 9.677438715611586e-05, "loss": 0.9681, "step": 488 }, { "epoch": 0.6328049175024264, "grad_norm": 0.9230672717094421, "learning_rate": 9.675980287762263e-05, "loss": 0.9226, "step": 489 }, { "epoch": 0.634098997088321, "grad_norm": 0.7298123240470886, "learning_rate": 9.67451868067933e-05, "loss": 0.7876, "step": 490 }, { "epoch": 0.6353930766742155, "grad_norm": 0.7051340937614441, "learning_rate": 9.673053895356543e-05, "loss": 0.8534, "step": 491 }, { "epoch": 0.63668715626011, "grad_norm": 0.9023073315620422, "learning_rate": 9.671585932789821e-05, "loss": 0.8995, "step": 492 }, { "epoch": 0.6379812358460045, "grad_norm": 0.7804109454154968, "learning_rate": 9.670114793977243e-05, "loss": 0.788, "step": 493 }, { "epoch": 0.6392753154318991, "grad_norm": 0.7766551375389099, "learning_rate": 9.66864047991905e-05, "loss": 0.7799, "step": 494 }, { "epoch": 0.6405693950177936, "grad_norm": 0.8782010078430176, "learning_rate": 9.667162991617633e-05, "loss": 0.897, "step": 495 }, { "epoch": 0.6418634746036881, "grad_norm": 0.7679833173751831, "learning_rate": 9.665682330077551e-05, "loss": 0.84, "step": 496 }, { "epoch": 0.6431575541895826, "grad_norm": 0.9792498350143433, "learning_rate": 9.664198496305517e-05, "loss": 0.9597, "step": 497 }, { "epoch": 0.6444516337754772, "grad_norm": 0.9134929180145264, "learning_rate": 9.662711491310398e-05, "loss": 1.0342, "step": 498 }, { "epoch": 0.6457457133613718, "grad_norm": 0.7805169224739075, "learning_rate": 9.661221316103224e-05, "loss": 0.8463, "step": 499 }, { "epoch": 0.6470397929472662, "grad_norm": 0.8533938527107239, "learning_rate": 9.659727971697174e-05, "loss": 0.8315, "step": 500 }, { "epoch": 0.6483338725331608, "grad_norm": 0.7894063591957092, "learning_rate": 9.658231459107582e-05, "loss": 0.9049, "step": 501 }, { "epoch": 0.6496279521190553, "grad_norm": 0.8634886145591736, "learning_rate": 9.656731779351939e-05, "loss": 0.9318, "step": 502 }, { "epoch": 0.6509220317049499, "grad_norm": 0.8121569156646729, "learning_rate": 9.655228933449888e-05, "loss": 0.8475, "step": 503 }, { "epoch": 0.6522161112908443, "grad_norm": 0.8126516342163086, "learning_rate": 9.653722922423229e-05, "loss": 0.8375, "step": 504 }, { "epoch": 0.6535101908767389, "grad_norm": 0.7656330466270447, "learning_rate": 9.652213747295906e-05, "loss": 0.8605, "step": 505 }, { "epoch": 0.6548042704626335, "grad_norm": 0.7796474695205688, "learning_rate": 9.650701409094018e-05, "loss": 0.803, "step": 506 }, { "epoch": 0.656098350048528, "grad_norm": 0.8425498008728027, "learning_rate": 9.649185908845818e-05, "loss": 0.9324, "step": 507 }, { "epoch": 0.6573924296344226, "grad_norm": 0.6857015490531921, "learning_rate": 9.647667247581703e-05, "loss": 0.8701, "step": 508 }, { "epoch": 0.658686509220317, "grad_norm": 0.7896414995193481, "learning_rate": 9.646145426334223e-05, "loss": 0.7447, "step": 509 }, { "epoch": 0.6599805888062116, "grad_norm": 0.7142564058303833, "learning_rate": 9.644620446138077e-05, "loss": 0.9221, "step": 510 }, { "epoch": 0.6612746683921061, "grad_norm": 0.8938276767730713, "learning_rate": 9.643092308030109e-05, "loss": 1.0021, "step": 511 }, { "epoch": 0.6625687479780007, "grad_norm": 1.0916978120803833, "learning_rate": 9.641561013049309e-05, "loss": 0.9354, "step": 512 }, { "epoch": 0.6638628275638951, "grad_norm": 0.8139733672142029, "learning_rate": 9.64002656223682e-05, "loss": 0.923, "step": 513 }, { "epoch": 0.6651569071497897, "grad_norm": 0.8234017491340637, "learning_rate": 9.638488956635925e-05, "loss": 0.8491, "step": 514 }, { "epoch": 0.6664509867356843, "grad_norm": 0.7203138470649719, "learning_rate": 9.636948197292052e-05, "loss": 0.8313, "step": 515 }, { "epoch": 0.6677450663215788, "grad_norm": 0.7210227251052856, "learning_rate": 9.635404285252777e-05, "loss": 0.8593, "step": 516 }, { "epoch": 0.6690391459074733, "grad_norm": 0.7367059588432312, "learning_rate": 9.633857221567815e-05, "loss": 0.9743, "step": 517 }, { "epoch": 0.6703332254933678, "grad_norm": 0.8669779896736145, "learning_rate": 9.632307007289027e-05, "loss": 0.8914, "step": 518 }, { "epoch": 0.6716273050792624, "grad_norm": 0.700624406337738, "learning_rate": 9.630753643470416e-05, "loss": 0.9032, "step": 519 }, { "epoch": 0.6729213846651569, "grad_norm": 0.81085205078125, "learning_rate": 9.629197131168124e-05, "loss": 0.8612, "step": 520 }, { "epoch": 0.6742154642510514, "grad_norm": 0.760386049747467, "learning_rate": 9.627637471440436e-05, "loss": 0.8932, "step": 521 }, { "epoch": 0.675509543836946, "grad_norm": 0.9040992259979248, "learning_rate": 9.626074665347778e-05, "loss": 0.8954, "step": 522 }, { "epoch": 0.6768036234228405, "grad_norm": 0.9394627809524536, "learning_rate": 9.624508713952709e-05, "loss": 0.9641, "step": 523 }, { "epoch": 0.6780977030087351, "grad_norm": 0.7919620871543884, "learning_rate": 9.622939618319936e-05, "loss": 1.0499, "step": 524 }, { "epoch": 0.6793917825946296, "grad_norm": 0.7263360619544983, "learning_rate": 9.621367379516294e-05, "loss": 1.0297, "step": 525 }, { "epoch": 0.6806858621805241, "grad_norm": 0.7512305974960327, "learning_rate": 9.619791998610763e-05, "loss": 0.7866, "step": 526 }, { "epoch": 0.6819799417664186, "grad_norm": 0.8404532670974731, "learning_rate": 9.618213476674455e-05, "loss": 0.8134, "step": 527 }, { "epoch": 0.6832740213523132, "grad_norm": 0.8437522053718567, "learning_rate": 9.61663181478062e-05, "loss": 0.908, "step": 528 }, { "epoch": 0.6845681009382077, "grad_norm": 0.8053356409072876, "learning_rate": 9.61504701400464e-05, "loss": 0.8775, "step": 529 }, { "epoch": 0.6858621805241022, "grad_norm": 0.7193287014961243, "learning_rate": 9.613459075424034e-05, "loss": 0.8122, "step": 530 }, { "epoch": 0.6871562601099968, "grad_norm": 0.7333494424819946, "learning_rate": 9.611868000118452e-05, "loss": 0.8027, "step": 531 }, { "epoch": 0.6884503396958913, "grad_norm": 0.7772257924079895, "learning_rate": 9.61027378916968e-05, "loss": 0.8538, "step": 532 }, { "epoch": 0.6897444192817859, "grad_norm": 0.7605924606323242, "learning_rate": 9.60867644366163e-05, "loss": 0.875, "step": 533 }, { "epoch": 0.6910384988676803, "grad_norm": 0.8444223999977112, "learning_rate": 9.607075964680352e-05, "loss": 1.0179, "step": 534 }, { "epoch": 0.6923325784535749, "grad_norm": 0.7386454939842224, "learning_rate": 9.605472353314023e-05, "loss": 0.9023, "step": 535 }, { "epoch": 0.6936266580394694, "grad_norm": 0.7770008444786072, "learning_rate": 9.603865610652952e-05, "loss": 0.9006, "step": 536 }, { "epoch": 0.694920737625364, "grad_norm": 0.8127918243408203, "learning_rate": 9.60225573778957e-05, "loss": 0.8098, "step": 537 }, { "epoch": 0.6962148172112584, "grad_norm": 0.7819888591766357, "learning_rate": 9.600642735818446e-05, "loss": 0.9933, "step": 538 }, { "epoch": 0.697508896797153, "grad_norm": 0.7097641229629517, "learning_rate": 9.599026605836273e-05, "loss": 0.7463, "step": 539 }, { "epoch": 0.6988029763830476, "grad_norm": 0.7568448781967163, "learning_rate": 9.597407348941865e-05, "loss": 0.7678, "step": 540 }, { "epoch": 0.7000970559689421, "grad_norm": 0.7815922498703003, "learning_rate": 9.59578496623617e-05, "loss": 0.8866, "step": 541 }, { "epoch": 0.7013911355548367, "grad_norm": 0.9351438283920288, "learning_rate": 9.594159458822257e-05, "loss": 0.9607, "step": 542 }, { "epoch": 0.7026852151407311, "grad_norm": 0.8216230869293213, "learning_rate": 9.592530827805322e-05, "loss": 1.0062, "step": 543 }, { "epoch": 0.7039792947266257, "grad_norm": 0.8280504941940308, "learning_rate": 9.59089907429268e-05, "loss": 0.8715, "step": 544 }, { "epoch": 0.7052733743125202, "grad_norm": 0.9679670929908752, "learning_rate": 9.589264199393776e-05, "loss": 1.0543, "step": 545 }, { "epoch": 0.7065674538984148, "grad_norm": 0.7370772361755371, "learning_rate": 9.587626204220171e-05, "loss": 0.8853, "step": 546 }, { "epoch": 0.7078615334843092, "grad_norm": 0.7535960674285889, "learning_rate": 9.585985089885552e-05, "loss": 0.8419, "step": 547 }, { "epoch": 0.7091556130702038, "grad_norm": 0.7049685716629028, "learning_rate": 9.584340857505722e-05, "loss": 0.8282, "step": 548 }, { "epoch": 0.7104496926560984, "grad_norm": 0.7157800197601318, "learning_rate": 9.58269350819861e-05, "loss": 0.8282, "step": 549 }, { "epoch": 0.7117437722419929, "grad_norm": 0.7660270929336548, "learning_rate": 9.581043043084259e-05, "loss": 0.9006, "step": 550 }, { "epoch": 0.7130378518278874, "grad_norm": 0.7422590255737305, "learning_rate": 9.579389463284834e-05, "loss": 0.9591, "step": 551 }, { "epoch": 0.7143319314137819, "grad_norm": 0.7437978982925415, "learning_rate": 9.577732769924614e-05, "loss": 0.8823, "step": 552 }, { "epoch": 0.7156260109996765, "grad_norm": 0.7947098016738892, "learning_rate": 9.576072964129998e-05, "loss": 0.8161, "step": 553 }, { "epoch": 0.716920090585571, "grad_norm": 0.8041345477104187, "learning_rate": 9.574410047029502e-05, "loss": 0.9831, "step": 554 }, { "epoch": 0.7182141701714655, "grad_norm": 0.9462196826934814, "learning_rate": 9.572744019753752e-05, "loss": 0.9516, "step": 555 }, { "epoch": 0.71950824975736, "grad_norm": 0.7633935213088989, "learning_rate": 9.571074883435496e-05, "loss": 0.8415, "step": 556 }, { "epoch": 0.7208023293432546, "grad_norm": 0.7996200919151306, "learning_rate": 9.569402639209589e-05, "loss": 0.9336, "step": 557 }, { "epoch": 0.7220964089291492, "grad_norm": 0.879639208316803, "learning_rate": 9.567727288213005e-05, "loss": 0.9479, "step": 558 }, { "epoch": 0.7233904885150437, "grad_norm": 0.7905813455581665, "learning_rate": 9.566048831584826e-05, "loss": 0.7692, "step": 559 }, { "epoch": 0.7246845681009382, "grad_norm": 0.7934820055961609, "learning_rate": 9.564367270466247e-05, "loss": 0.8735, "step": 560 }, { "epoch": 0.7259786476868327, "grad_norm": 0.8444498181343079, "learning_rate": 9.562682606000575e-05, "loss": 0.9474, "step": 561 }, { "epoch": 0.7272727272727273, "grad_norm": 0.7441167831420898, "learning_rate": 9.560994839333224e-05, "loss": 0.9077, "step": 562 }, { "epoch": 0.7285668068586219, "grad_norm": 0.7276771664619446, "learning_rate": 9.55930397161172e-05, "loss": 0.8866, "step": 563 }, { "epoch": 0.7298608864445163, "grad_norm": 0.8522891402244568, "learning_rate": 9.557610003985698e-05, "loss": 0.9961, "step": 564 }, { "epoch": 0.7311549660304109, "grad_norm": 0.8116034269332886, "learning_rate": 9.555912937606895e-05, "loss": 0.8989, "step": 565 }, { "epoch": 0.7324490456163054, "grad_norm": 0.8132615089416504, "learning_rate": 9.554212773629164e-05, "loss": 0.809, "step": 566 }, { "epoch": 0.7337431252022, "grad_norm": 0.9693892002105713, "learning_rate": 9.552509513208456e-05, "loss": 0.959, "step": 567 }, { "epoch": 0.7350372047880944, "grad_norm": 0.8740898966789246, "learning_rate": 9.550803157502831e-05, "loss": 0.9758, "step": 568 }, { "epoch": 0.736331284373989, "grad_norm": 0.8451546430587769, "learning_rate": 9.549093707672453e-05, "loss": 0.9776, "step": 569 }, { "epoch": 0.7376253639598835, "grad_norm": 0.8144363760948181, "learning_rate": 9.54738116487959e-05, "loss": 0.8114, "step": 570 }, { "epoch": 0.7389194435457781, "grad_norm": 0.9224253296852112, "learning_rate": 9.545665530288612e-05, "loss": 0.8829, "step": 571 }, { "epoch": 0.7402135231316725, "grad_norm": 0.8720259666442871, "learning_rate": 9.543946805065992e-05, "loss": 0.9658, "step": 572 }, { "epoch": 0.7415076027175671, "grad_norm": 0.5992270708084106, "learning_rate": 9.542224990380304e-05, "loss": 0.8167, "step": 573 }, { "epoch": 0.7428016823034617, "grad_norm": 0.7595481276512146, "learning_rate": 9.540500087402222e-05, "loss": 0.9002, "step": 574 }, { "epoch": 0.7440957618893562, "grad_norm": 0.8269613981246948, "learning_rate": 9.538772097304521e-05, "loss": 0.8442, "step": 575 }, { "epoch": 0.7453898414752508, "grad_norm": 0.7087002992630005, "learning_rate": 9.537041021262072e-05, "loss": 0.8004, "step": 576 }, { "epoch": 0.7466839210611452, "grad_norm": 0.8022588491439819, "learning_rate": 9.535306860451849e-05, "loss": 0.9188, "step": 577 }, { "epoch": 0.7479780006470398, "grad_norm": 0.8022916316986084, "learning_rate": 9.533569616052921e-05, "loss": 0.9815, "step": 578 }, { "epoch": 0.7492720802329343, "grad_norm": 0.6670771837234497, "learning_rate": 9.531829289246452e-05, "loss": 0.812, "step": 579 }, { "epoch": 0.7505661598188289, "grad_norm": 0.7644294500350952, "learning_rate": 9.530085881215705e-05, "loss": 0.8092, "step": 580 }, { "epoch": 0.7518602394047234, "grad_norm": 0.8892885446548462, "learning_rate": 9.528339393146033e-05, "loss": 0.8422, "step": 581 }, { "epoch": 0.7531543189906179, "grad_norm": 0.7766391634941101, "learning_rate": 9.526589826224887e-05, "loss": 0.9596, "step": 582 }, { "epoch": 0.7544483985765125, "grad_norm": 0.8290911316871643, "learning_rate": 9.524837181641813e-05, "loss": 0.9624, "step": 583 }, { "epoch": 0.755742478162407, "grad_norm": 0.7832044363021851, "learning_rate": 9.523081460588444e-05, "loss": 0.8141, "step": 584 }, { "epoch": 0.7570365577483015, "grad_norm": 0.9279314875602722, "learning_rate": 9.521322664258508e-05, "loss": 0.8869, "step": 585 }, { "epoch": 0.758330637334196, "grad_norm": 0.8901260495185852, "learning_rate": 9.519560793847826e-05, "loss": 1.0233, "step": 586 }, { "epoch": 0.7596247169200906, "grad_norm": 0.7530801296234131, "learning_rate": 9.517795850554306e-05, "loss": 0.8466, "step": 587 }, { "epoch": 0.7609187965059852, "grad_norm": 0.7969221472740173, "learning_rate": 9.516027835577943e-05, "loss": 0.789, "step": 588 }, { "epoch": 0.7622128760918796, "grad_norm": 0.7585557103157043, "learning_rate": 9.514256750120828e-05, "loss": 0.8779, "step": 589 }, { "epoch": 0.7635069556777742, "grad_norm": 0.8145812749862671, "learning_rate": 9.512482595387132e-05, "loss": 0.9836, "step": 590 }, { "epoch": 0.7648010352636687, "grad_norm": 0.6959390640258789, "learning_rate": 9.510705372583118e-05, "loss": 0.8525, "step": 591 }, { "epoch": 0.7660951148495633, "grad_norm": 0.6348171234130859, "learning_rate": 9.50892508291713e-05, "loss": 0.7776, "step": 592 }, { "epoch": 0.7673891944354578, "grad_norm": 1.0133094787597656, "learning_rate": 9.507141727599602e-05, "loss": 0.8943, "step": 593 }, { "epoch": 0.7686832740213523, "grad_norm": 0.8890700340270996, "learning_rate": 9.50535530784305e-05, "loss": 0.9169, "step": 594 }, { "epoch": 0.7699773536072468, "grad_norm": 0.7059516310691833, "learning_rate": 9.503565824862076e-05, "loss": 0.7544, "step": 595 }, { "epoch": 0.7712714331931414, "grad_norm": 0.8086109161376953, "learning_rate": 9.50177327987336e-05, "loss": 0.8457, "step": 596 }, { "epoch": 0.772565512779036, "grad_norm": 0.8387449383735657, "learning_rate": 9.499977674095669e-05, "loss": 0.8291, "step": 597 }, { "epoch": 0.7738595923649304, "grad_norm": 0.7841625809669495, "learning_rate": 9.498179008749847e-05, "loss": 0.8972, "step": 598 }, { "epoch": 0.775153671950825, "grad_norm": 0.7261033058166504, "learning_rate": 9.496377285058819e-05, "loss": 0.7678, "step": 599 }, { "epoch": 0.7764477515367195, "grad_norm": 0.8468754887580872, "learning_rate": 9.494572504247593e-05, "loss": 0.9044, "step": 600 }, { "epoch": 0.7777418311226141, "grad_norm": 0.8177486062049866, "learning_rate": 9.492764667543252e-05, "loss": 0.8132, "step": 601 }, { "epoch": 0.7790359107085085, "grad_norm": 1.0061938762664795, "learning_rate": 9.490953776174955e-05, "loss": 0.9204, "step": 602 }, { "epoch": 0.7803299902944031, "grad_norm": 0.8224837779998779, "learning_rate": 9.489139831373944e-05, "loss": 0.7909, "step": 603 }, { "epoch": 0.7816240698802976, "grad_norm": 0.7812953591346741, "learning_rate": 9.48732283437353e-05, "loss": 0.9583, "step": 604 }, { "epoch": 0.7829181494661922, "grad_norm": 0.8498075604438782, "learning_rate": 9.485502786409107e-05, "loss": 1.0692, "step": 605 }, { "epoch": 0.7842122290520867, "grad_norm": 0.733768105506897, "learning_rate": 9.483679688718135e-05, "loss": 0.8835, "step": 606 }, { "epoch": 0.7855063086379812, "grad_norm": 0.7453616261482239, "learning_rate": 9.481853542540154e-05, "loss": 0.7778, "step": 607 }, { "epoch": 0.7868003882238758, "grad_norm": 0.8596757650375366, "learning_rate": 9.480024349116771e-05, "loss": 0.9711, "step": 608 }, { "epoch": 0.7880944678097703, "grad_norm": 0.6516830325126648, "learning_rate": 9.478192109691674e-05, "loss": 0.8342, "step": 609 }, { "epoch": 0.7893885473956649, "grad_norm": 0.8740130662918091, "learning_rate": 9.476356825510612e-05, "loss": 0.9179, "step": 610 }, { "epoch": 0.7906826269815593, "grad_norm": 0.7338031530380249, "learning_rate": 9.474518497821409e-05, "loss": 0.9518, "step": 611 }, { "epoch": 0.7919767065674539, "grad_norm": 0.8368909358978271, "learning_rate": 9.472677127873958e-05, "loss": 0.875, "step": 612 }, { "epoch": 0.7932707861533485, "grad_norm": 0.7648499011993408, "learning_rate": 9.47083271692022e-05, "loss": 1.0374, "step": 613 }, { "epoch": 0.794564865739243, "grad_norm": 0.767672598361969, "learning_rate": 9.468985266214223e-05, "loss": 0.8341, "step": 614 }, { "epoch": 0.7958589453251375, "grad_norm": 0.9221040606498718, "learning_rate": 9.467134777012063e-05, "loss": 1.0149, "step": 615 }, { "epoch": 0.797153024911032, "grad_norm": 0.8752298355102539, "learning_rate": 9.465281250571902e-05, "loss": 0.9948, "step": 616 }, { "epoch": 0.7984471044969266, "grad_norm": 0.8295148611068726, "learning_rate": 9.463424688153966e-05, "loss": 0.8181, "step": 617 }, { "epoch": 0.7997411840828211, "grad_norm": 0.7995513677597046, "learning_rate": 9.461565091020543e-05, "loss": 0.8851, "step": 618 }, { "epoch": 0.8010352636687156, "grad_norm": 0.7693185210227966, "learning_rate": 9.459702460435989e-05, "loss": 0.8141, "step": 619 }, { "epoch": 0.8023293432546101, "grad_norm": 0.8453662991523743, "learning_rate": 9.457836797666722e-05, "loss": 0.9608, "step": 620 }, { "epoch": 0.8036234228405047, "grad_norm": 0.8325058817863464, "learning_rate": 9.455968103981219e-05, "loss": 0.8521, "step": 621 }, { "epoch": 0.8049175024263993, "grad_norm": 0.7803249359130859, "learning_rate": 9.454096380650014e-05, "loss": 0.8598, "step": 622 }, { "epoch": 0.8062115820122937, "grad_norm": 0.731028139591217, "learning_rate": 9.452221628945713e-05, "loss": 0.8091, "step": 623 }, { "epoch": 0.8075056615981883, "grad_norm": 0.8708882927894592, "learning_rate": 9.450343850142969e-05, "loss": 0.8467, "step": 624 }, { "epoch": 0.8087997411840828, "grad_norm": 0.7343994975090027, "learning_rate": 9.448463045518499e-05, "loss": 0.8496, "step": 625 }, { "epoch": 0.8100938207699774, "grad_norm": 0.8136367797851562, "learning_rate": 9.446579216351074e-05, "loss": 0.8387, "step": 626 }, { "epoch": 0.8113879003558719, "grad_norm": 0.7161028981208801, "learning_rate": 9.444692363921528e-05, "loss": 0.839, "step": 627 }, { "epoch": 0.8126819799417664, "grad_norm": 0.9956607222557068, "learning_rate": 9.442802489512741e-05, "loss": 1.0051, "step": 628 }, { "epoch": 0.813976059527661, "grad_norm": 0.8381032943725586, "learning_rate": 9.440909594409655e-05, "loss": 0.9681, "step": 629 }, { "epoch": 0.8152701391135555, "grad_norm": 0.7452998757362366, "learning_rate": 9.439013679899262e-05, "loss": 0.8106, "step": 630 }, { "epoch": 0.8165642186994501, "grad_norm": 0.7175205945968628, "learning_rate": 9.437114747270612e-05, "loss": 0.8622, "step": 631 }, { "epoch": 0.8178582982853445, "grad_norm": 0.8083619475364685, "learning_rate": 9.435212797814798e-05, "loss": 0.8608, "step": 632 }, { "epoch": 0.8191523778712391, "grad_norm": 0.9113181233406067, "learning_rate": 9.433307832824974e-05, "loss": 0.9445, "step": 633 }, { "epoch": 0.8204464574571336, "grad_norm": 0.7915831208229065, "learning_rate": 9.431399853596336e-05, "loss": 0.8651, "step": 634 }, { "epoch": 0.8217405370430282, "grad_norm": 0.7277971506118774, "learning_rate": 9.429488861426137e-05, "loss": 0.8799, "step": 635 }, { "epoch": 0.8230346166289226, "grad_norm": 0.7446788549423218, "learning_rate": 9.427574857613672e-05, "loss": 0.8071, "step": 636 }, { "epoch": 0.8243286962148172, "grad_norm": 0.8236122131347656, "learning_rate": 9.425657843460288e-05, "loss": 0.9889, "step": 637 }, { "epoch": 0.8256227758007118, "grad_norm": 0.8189204335212708, "learning_rate": 9.423737820269376e-05, "loss": 0.9607, "step": 638 }, { "epoch": 0.8269168553866063, "grad_norm": 0.9449727535247803, "learning_rate": 9.421814789346375e-05, "loss": 0.9581, "step": 639 }, { "epoch": 0.8282109349725008, "grad_norm": 0.7527281045913696, "learning_rate": 9.419888751998767e-05, "loss": 0.7984, "step": 640 }, { "epoch": 0.8295050145583953, "grad_norm": 0.8296061754226685, "learning_rate": 9.417959709536078e-05, "loss": 0.8597, "step": 641 }, { "epoch": 0.8307990941442899, "grad_norm": 0.8495222330093384, "learning_rate": 9.416027663269881e-05, "loss": 0.8396, "step": 642 }, { "epoch": 0.8320931737301844, "grad_norm": 0.8501962423324585, "learning_rate": 9.414092614513787e-05, "loss": 0.9911, "step": 643 }, { "epoch": 0.833387253316079, "grad_norm": 0.7546764612197876, "learning_rate": 9.412154564583448e-05, "loss": 0.9043, "step": 644 }, { "epoch": 0.8346813329019734, "grad_norm": 0.8009942770004272, "learning_rate": 9.410213514796564e-05, "loss": 0.8242, "step": 645 }, { "epoch": 0.835975412487868, "grad_norm": 0.8321415185928345, "learning_rate": 9.408269466472864e-05, "loss": 0.9094, "step": 646 }, { "epoch": 0.8372694920737626, "grad_norm": 0.7805325984954834, "learning_rate": 9.406322420934123e-05, "loss": 0.837, "step": 647 }, { "epoch": 0.8385635716596571, "grad_norm": 0.8147938251495361, "learning_rate": 9.404372379504151e-05, "loss": 0.9177, "step": 648 }, { "epoch": 0.8398576512455516, "grad_norm": 0.7518514394760132, "learning_rate": 9.402419343508797e-05, "loss": 0.7562, "step": 649 }, { "epoch": 0.8411517308314461, "grad_norm": 0.7858056426048279, "learning_rate": 9.400463314275943e-05, "loss": 0.9865, "step": 650 }, { "epoch": 0.8424458104173407, "grad_norm": 0.7086622714996338, "learning_rate": 9.398504293135507e-05, "loss": 0.8451, "step": 651 }, { "epoch": 0.8437398900032352, "grad_norm": 0.8999130725860596, "learning_rate": 9.396542281419445e-05, "loss": 0.919, "step": 652 }, { "epoch": 0.8450339695891297, "grad_norm": 0.8439355492591858, "learning_rate": 9.39457728046174e-05, "loss": 0.808, "step": 653 }, { "epoch": 0.8463280491750242, "grad_norm": 0.863634467124939, "learning_rate": 9.392609291598413e-05, "loss": 1.0594, "step": 654 }, { "epoch": 0.8476221287609188, "grad_norm": 0.8126616477966309, "learning_rate": 9.390638316167512e-05, "loss": 1.0526, "step": 655 }, { "epoch": 0.8489162083468134, "grad_norm": 0.6746686697006226, "learning_rate": 9.388664355509122e-05, "loss": 0.8609, "step": 656 }, { "epoch": 0.8502102879327078, "grad_norm": 0.6596627831459045, "learning_rate": 9.386687410965349e-05, "loss": 0.8474, "step": 657 }, { "epoch": 0.8515043675186024, "grad_norm": 0.7229893207550049, "learning_rate": 9.384707483880334e-05, "loss": 0.8095, "step": 658 }, { "epoch": 0.8527984471044969, "grad_norm": 0.678260326385498, "learning_rate": 9.382724575600243e-05, "loss": 0.9453, "step": 659 }, { "epoch": 0.8540925266903915, "grad_norm": 0.7702340483665466, "learning_rate": 9.380738687473272e-05, "loss": 0.9527, "step": 660 }, { "epoch": 0.855386606276286, "grad_norm": 0.7638357877731323, "learning_rate": 9.378749820849642e-05, "loss": 1.0131, "step": 661 }, { "epoch": 0.8566806858621805, "grad_norm": 0.7495226263999939, "learning_rate": 9.376757977081594e-05, "loss": 0.8511, "step": 662 }, { "epoch": 0.857974765448075, "grad_norm": 0.8003675937652588, "learning_rate": 9.374763157523399e-05, "loss": 0.9291, "step": 663 }, { "epoch": 0.8592688450339696, "grad_norm": 0.7893322706222534, "learning_rate": 9.372765363531351e-05, "loss": 0.8206, "step": 664 }, { "epoch": 0.8605629246198642, "grad_norm": 0.7351086735725403, "learning_rate": 9.370764596463763e-05, "loss": 0.7185, "step": 665 }, { "epoch": 0.8618570042057586, "grad_norm": 0.8211125135421753, "learning_rate": 9.368760857680974e-05, "loss": 0.9613, "step": 666 }, { "epoch": 0.8631510837916532, "grad_norm": 0.9151694178581238, "learning_rate": 9.366754148545338e-05, "loss": 0.9788, "step": 667 }, { "epoch": 0.8644451633775477, "grad_norm": 0.84197998046875, "learning_rate": 9.364744470421234e-05, "loss": 0.9891, "step": 668 }, { "epoch": 0.8657392429634423, "grad_norm": 0.6799368858337402, "learning_rate": 9.362731824675056e-05, "loss": 0.7988, "step": 669 }, { "epoch": 0.8670333225493367, "grad_norm": 0.722326934337616, "learning_rate": 9.360716212675214e-05, "loss": 0.8399, "step": 670 }, { "epoch": 0.8683274021352313, "grad_norm": 0.7735809683799744, "learning_rate": 9.35869763579214e-05, "loss": 0.8677, "step": 671 }, { "epoch": 0.8696214817211259, "grad_norm": 0.7918348908424377, "learning_rate": 9.35667609539828e-05, "loss": 0.8792, "step": 672 }, { "epoch": 0.8709155613070204, "grad_norm": 0.7758587002754211, "learning_rate": 9.354651592868094e-05, "loss": 0.8844, "step": 673 }, { "epoch": 0.8722096408929149, "grad_norm": 0.8733750581741333, "learning_rate": 9.352624129578054e-05, "loss": 0.811, "step": 674 }, { "epoch": 0.8735037204788094, "grad_norm": 0.8485933542251587, "learning_rate": 9.350593706906651e-05, "loss": 0.787, "step": 675 }, { "epoch": 0.874797800064704, "grad_norm": 0.7540378570556641, "learning_rate": 9.348560326234381e-05, "loss": 0.8578, "step": 676 }, { "epoch": 0.8760918796505985, "grad_norm": 0.7626041173934937, "learning_rate": 9.346523988943758e-05, "loss": 0.9294, "step": 677 }, { "epoch": 0.8773859592364931, "grad_norm": 0.8360442519187927, "learning_rate": 9.3444846964193e-05, "loss": 0.8635, "step": 678 }, { "epoch": 0.8786800388223875, "grad_norm": 0.9039386510848999, "learning_rate": 9.342442450047537e-05, "loss": 0.83, "step": 679 }, { "epoch": 0.8799741184082821, "grad_norm": 0.7554466724395752, "learning_rate": 9.340397251217009e-05, "loss": 0.8103, "step": 680 }, { "epoch": 0.8812681979941767, "grad_norm": 0.8201668858528137, "learning_rate": 9.338349101318261e-05, "loss": 0.7959, "step": 681 }, { "epoch": 0.8825622775800712, "grad_norm": 0.814172625541687, "learning_rate": 9.336298001743846e-05, "loss": 0.8074, "step": 682 }, { "epoch": 0.8838563571659657, "grad_norm": 0.7825469374656677, "learning_rate": 9.33424395388832e-05, "loss": 0.91, "step": 683 }, { "epoch": 0.8851504367518602, "grad_norm": 0.7842270135879517, "learning_rate": 9.332186959148248e-05, "loss": 0.9331, "step": 684 }, { "epoch": 0.8864445163377548, "grad_norm": 0.8008681535720825, "learning_rate": 9.330127018922194e-05, "loss": 0.7166, "step": 685 }, { "epoch": 0.8877385959236493, "grad_norm": 0.8031767010688782, "learning_rate": 9.328064134610727e-05, "loss": 0.9718, "step": 686 }, { "epoch": 0.8890326755095438, "grad_norm": 0.7503944635391235, "learning_rate": 9.325998307616417e-05, "loss": 0.9732, "step": 687 }, { "epoch": 0.8903267550954383, "grad_norm": 0.8711128234863281, "learning_rate": 9.323929539343837e-05, "loss": 1.05, "step": 688 }, { "epoch": 0.8916208346813329, "grad_norm": 0.7816237211227417, "learning_rate": 9.321857831199554e-05, "loss": 0.9514, "step": 689 }, { "epoch": 0.8929149142672275, "grad_norm": 0.8090843558311462, "learning_rate": 9.319783184592142e-05, "loss": 0.9691, "step": 690 }, { "epoch": 0.8942089938531219, "grad_norm": 0.6824143528938293, "learning_rate": 9.317705600932164e-05, "loss": 0.7027, "step": 691 }, { "epoch": 0.8955030734390165, "grad_norm": 0.8860282301902771, "learning_rate": 9.315625081632191e-05, "loss": 0.9419, "step": 692 }, { "epoch": 0.896797153024911, "grad_norm": 0.9011827707290649, "learning_rate": 9.313541628106777e-05, "loss": 0.8468, "step": 693 }, { "epoch": 0.8980912326108056, "grad_norm": 0.836565375328064, "learning_rate": 9.311455241772482e-05, "loss": 1.0456, "step": 694 }, { "epoch": 0.8993853121967001, "grad_norm": 0.819817841053009, "learning_rate": 9.309365924047853e-05, "loss": 0.9606, "step": 695 }, { "epoch": 0.9006793917825946, "grad_norm": 0.7008129954338074, "learning_rate": 9.307273676353432e-05, "loss": 0.8531, "step": 696 }, { "epoch": 0.9019734713684892, "grad_norm": 0.8026110529899597, "learning_rate": 9.305178500111755e-05, "loss": 0.7784, "step": 697 }, { "epoch": 0.9032675509543837, "grad_norm": 0.7309970855712891, "learning_rate": 9.30308039674735e-05, "loss": 0.9284, "step": 698 }, { "epoch": 0.9045616305402783, "grad_norm": 0.801511824131012, "learning_rate": 9.300979367686729e-05, "loss": 0.8111, "step": 699 }, { "epoch": 0.9058557101261727, "grad_norm": 0.8487135767936707, "learning_rate": 9.298875414358399e-05, "loss": 0.9095, "step": 700 }, { "epoch": 0.9071497897120673, "grad_norm": 0.6941442489624023, "learning_rate": 9.296768538192853e-05, "loss": 0.8285, "step": 701 }, { "epoch": 0.9084438692979618, "grad_norm": 0.7753114700317383, "learning_rate": 9.294658740622573e-05, "loss": 0.9726, "step": 702 }, { "epoch": 0.9097379488838564, "grad_norm": 0.8224105834960938, "learning_rate": 9.292546023082025e-05, "loss": 0.8728, "step": 703 }, { "epoch": 0.9110320284697508, "grad_norm": 0.9305656552314758, "learning_rate": 9.29043038700766e-05, "loss": 0.891, "step": 704 }, { "epoch": 0.9123261080556454, "grad_norm": 0.7868937849998474, "learning_rate": 9.288311833837917e-05, "loss": 0.8883, "step": 705 }, { "epoch": 0.91362018764154, "grad_norm": 0.6708902716636658, "learning_rate": 9.286190365013217e-05, "loss": 0.8315, "step": 706 }, { "epoch": 0.9149142672274345, "grad_norm": 0.802245557308197, "learning_rate": 9.284065981975958e-05, "loss": 0.9128, "step": 707 }, { "epoch": 0.916208346813329, "grad_norm": 0.8574107885360718, "learning_rate": 9.281938686170526e-05, "loss": 0.9122, "step": 708 }, { "epoch": 0.9175024263992235, "grad_norm": 1.060194730758667, "learning_rate": 9.279808479043286e-05, "loss": 1.1447, "step": 709 }, { "epoch": 0.9187965059851181, "grad_norm": 0.8753707408905029, "learning_rate": 9.277675362042581e-05, "loss": 0.8524, "step": 710 }, { "epoch": 0.9200905855710126, "grad_norm": 1.0536813735961914, "learning_rate": 9.275539336618732e-05, "loss": 1.014, "step": 711 }, { "epoch": 0.9213846651569072, "grad_norm": 0.779331624507904, "learning_rate": 9.27340040422404e-05, "loss": 0.913, "step": 712 }, { "epoch": 0.9226787447428016, "grad_norm": 1.0297069549560547, "learning_rate": 9.27125856631278e-05, "loss": 1.0926, "step": 713 }, { "epoch": 0.9239728243286962, "grad_norm": 0.7252759337425232, "learning_rate": 9.269113824341205e-05, "loss": 0.8096, "step": 714 }, { "epoch": 0.9252669039145908, "grad_norm": 0.900393009185791, "learning_rate": 9.266966179767538e-05, "loss": 0.7856, "step": 715 }, { "epoch": 0.9265609835004853, "grad_norm": 0.8092474937438965, "learning_rate": 9.264815634051982e-05, "loss": 1.0298, "step": 716 }, { "epoch": 0.9278550630863798, "grad_norm": 0.8919221758842468, "learning_rate": 9.262662188656705e-05, "loss": 0.7962, "step": 717 }, { "epoch": 0.9291491426722743, "grad_norm": 0.771659791469574, "learning_rate": 9.260505845045854e-05, "loss": 1.0239, "step": 718 }, { "epoch": 0.9304432222581689, "grad_norm": 0.7577182054519653, "learning_rate": 9.258346604685542e-05, "loss": 0.9672, "step": 719 }, { "epoch": 0.9317373018440634, "grad_norm": 0.7373968362808228, "learning_rate": 9.256184469043851e-05, "loss": 0.7484, "step": 720 }, { "epoch": 0.9330313814299579, "grad_norm": 0.7698187232017517, "learning_rate": 9.254019439590835e-05, "loss": 0.7563, "step": 721 }, { "epoch": 0.9343254610158525, "grad_norm": 0.8331950306892395, "learning_rate": 9.251851517798514e-05, "loss": 0.8826, "step": 722 }, { "epoch": 0.935619540601747, "grad_norm": 0.7643804550170898, "learning_rate": 9.24968070514087e-05, "loss": 1.0688, "step": 723 }, { "epoch": 0.9369136201876416, "grad_norm": 0.802943229675293, "learning_rate": 9.247507003093858e-05, "loss": 0.8872, "step": 724 }, { "epoch": 0.938207699773536, "grad_norm": 0.8977981209754944, "learning_rate": 9.245330413135395e-05, "loss": 0.8946, "step": 725 }, { "epoch": 0.9395017793594306, "grad_norm": 0.7683425545692444, "learning_rate": 9.243150936745357e-05, "loss": 0.8221, "step": 726 }, { "epoch": 0.9407958589453251, "grad_norm": 0.7723820209503174, "learning_rate": 9.240968575405589e-05, "loss": 0.9058, "step": 727 }, { "epoch": 0.9420899385312197, "grad_norm": 0.7846856117248535, "learning_rate": 9.238783330599893e-05, "loss": 0.8775, "step": 728 }, { "epoch": 0.9433840181171143, "grad_norm": 0.6629738211631775, "learning_rate": 9.236595203814037e-05, "loss": 0.8753, "step": 729 }, { "epoch": 0.9446780977030087, "grad_norm": 0.7508718371391296, "learning_rate": 9.23440419653574e-05, "loss": 0.9403, "step": 730 }, { "epoch": 0.9459721772889033, "grad_norm": 0.8904352784156799, "learning_rate": 9.232210310254685e-05, "loss": 0.8514, "step": 731 }, { "epoch": 0.9472662568747978, "grad_norm": 1.004438877105713, "learning_rate": 9.230013546462512e-05, "loss": 0.9824, "step": 732 }, { "epoch": 0.9485603364606924, "grad_norm": 0.7370134592056274, "learning_rate": 9.227813906652818e-05, "loss": 0.9869, "step": 733 }, { "epoch": 0.9498544160465868, "grad_norm": 0.8597853183746338, "learning_rate": 9.225611392321153e-05, "loss": 0.8758, "step": 734 }, { "epoch": 0.9511484956324814, "grad_norm": 0.7227449417114258, "learning_rate": 9.223406004965023e-05, "loss": 0.8208, "step": 735 }, { "epoch": 0.9524425752183759, "grad_norm": 0.8053308725357056, "learning_rate": 9.221197746083887e-05, "loss": 0.8555, "step": 736 }, { "epoch": 0.9537366548042705, "grad_norm": 0.7663565278053284, "learning_rate": 9.21898661717916e-05, "loss": 0.7754, "step": 737 }, { "epoch": 0.955030734390165, "grad_norm": 0.7501769661903381, "learning_rate": 9.216772619754199e-05, "loss": 0.8798, "step": 738 }, { "epoch": 0.9563248139760595, "grad_norm": 0.7159123420715332, "learning_rate": 9.21455575531432e-05, "loss": 0.963, "step": 739 }, { "epoch": 0.9576188935619541, "grad_norm": 0.8228005766868591, "learning_rate": 9.212336025366788e-05, "loss": 0.9521, "step": 740 } ], "logging_steps": 1, "max_steps": 3860, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.272735070532403e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }