{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3299902944031058, "eval_steps": 386, "global_step": 255, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012940795858945326, "grad_norm": 1.7405146360397339, "learning_rate": 2.0000000000000003e-06, "loss": 2.4269, "step": 1 }, { "epoch": 0.0012940795858945326, "eval_loss": 2.247628688812256, "eval_runtime": 189.8853, "eval_samples_per_second": 3.428, "eval_steps_per_second": 0.858, "step": 1 }, { "epoch": 0.002588159171789065, "grad_norm": 1.6643542051315308, "learning_rate": 4.000000000000001e-06, "loss": 2.2583, "step": 2 }, { "epoch": 0.0038822387576835974, "grad_norm": 1.8690767288208008, "learning_rate": 6e-06, "loss": 2.2696, "step": 3 }, { "epoch": 0.00517631834357813, "grad_norm": 1.828118085861206, "learning_rate": 8.000000000000001e-06, "loss": 2.3646, "step": 4 }, { "epoch": 0.006470397929472662, "grad_norm": 1.9319926500320435, "learning_rate": 1e-05, "loss": 2.4196, "step": 5 }, { "epoch": 0.007764477515367195, "grad_norm": 1.7723782062530518, "learning_rate": 1.2e-05, "loss": 2.4177, "step": 6 }, { "epoch": 0.009058557101261728, "grad_norm": 1.9500815868377686, "learning_rate": 1.4000000000000001e-05, "loss": 2.3497, "step": 7 }, { "epoch": 0.01035263668715626, "grad_norm": 2.3909075260162354, "learning_rate": 1.6000000000000003e-05, "loss": 2.405, "step": 8 }, { "epoch": 0.011646716273050793, "grad_norm": 2.0620856285095215, "learning_rate": 1.8e-05, "loss": 2.4098, "step": 9 }, { "epoch": 0.012940795858945324, "grad_norm": 1.8054910898208618, "learning_rate": 2e-05, "loss": 2.1233, "step": 10 }, { "epoch": 0.014234875444839857, "grad_norm": 2.190964937210083, "learning_rate": 2.2000000000000003e-05, "loss": 2.3985, "step": 11 }, { "epoch": 0.01552895503073439, "grad_norm": 1.9412921667099, "learning_rate": 2.4e-05, "loss": 2.462, "step": 12 }, { "epoch": 0.016823034616628922, "grad_norm": 1.9161555767059326, "learning_rate": 2.6000000000000002e-05, "loss": 2.2118, "step": 13 }, { "epoch": 0.018117114202523456, "grad_norm": 1.7161599397659302, "learning_rate": 2.8000000000000003e-05, "loss": 2.2175, "step": 14 }, { "epoch": 0.019411193788417987, "grad_norm": 2.173877000808716, "learning_rate": 3e-05, "loss": 2.2521, "step": 15 }, { "epoch": 0.02070527337431252, "grad_norm": 2.0000555515289307, "learning_rate": 3.2000000000000005e-05, "loss": 2.1615, "step": 16 }, { "epoch": 0.021999352960207053, "grad_norm": 1.5915080308914185, "learning_rate": 3.4000000000000007e-05, "loss": 1.9522, "step": 17 }, { "epoch": 0.023293432546101587, "grad_norm": 1.6972448825836182, "learning_rate": 3.6e-05, "loss": 1.7224, "step": 18 }, { "epoch": 0.024587512131996118, "grad_norm": 1.7509772777557373, "learning_rate": 3.8e-05, "loss": 2.0414, "step": 19 }, { "epoch": 0.02588159171789065, "grad_norm": 1.697340488433838, "learning_rate": 4e-05, "loss": 2.0427, "step": 20 }, { "epoch": 0.027175671303785183, "grad_norm": 1.8733758926391602, "learning_rate": 4.2e-05, "loss": 1.6772, "step": 21 }, { "epoch": 0.028469750889679714, "grad_norm": 1.6085255146026611, "learning_rate": 4.4000000000000006e-05, "loss": 1.6527, "step": 22 }, { "epoch": 0.029763830475574248, "grad_norm": 1.5792337656021118, "learning_rate": 4.600000000000001e-05, "loss": 1.6567, "step": 23 }, { "epoch": 0.03105791006146878, "grad_norm": 1.4392567873001099, "learning_rate": 4.8e-05, "loss": 1.508, "step": 24 }, { "epoch": 0.03235198964736331, "grad_norm": 1.5222433805465698, "learning_rate": 5e-05, "loss": 1.4606, "step": 25 }, { "epoch": 0.033646069233257844, "grad_norm": 1.5480064153671265, "learning_rate": 5.2000000000000004e-05, "loss": 1.5027, "step": 26 }, { "epoch": 0.034940148819152375, "grad_norm": 1.6736445426940918, "learning_rate": 5.4000000000000005e-05, "loss": 1.2426, "step": 27 }, { "epoch": 0.03623422840504691, "grad_norm": 1.7392551898956299, "learning_rate": 5.6000000000000006e-05, "loss": 1.4703, "step": 28 }, { "epoch": 0.037528307990941444, "grad_norm": 1.6173359155654907, "learning_rate": 5.8e-05, "loss": 1.4546, "step": 29 }, { "epoch": 0.038822387576835975, "grad_norm": 1.3955802917480469, "learning_rate": 6e-05, "loss": 1.3808, "step": 30 }, { "epoch": 0.040116467162730506, "grad_norm": 1.353873372077942, "learning_rate": 6.2e-05, "loss": 1.229, "step": 31 }, { "epoch": 0.04141054674862504, "grad_norm": 1.2547746896743774, "learning_rate": 6.400000000000001e-05, "loss": 1.1668, "step": 32 }, { "epoch": 0.042704626334519574, "grad_norm": 1.3806778192520142, "learning_rate": 6.6e-05, "loss": 1.0691, "step": 33 }, { "epoch": 0.043998705920414105, "grad_norm": 1.2815773487091064, "learning_rate": 6.800000000000001e-05, "loss": 1.2409, "step": 34 }, { "epoch": 0.045292785506308636, "grad_norm": 1.3677266836166382, "learning_rate": 7e-05, "loss": 0.9668, "step": 35 }, { "epoch": 0.046586865092203174, "grad_norm": 1.5457032918930054, "learning_rate": 7.2e-05, "loss": 1.1385, "step": 36 }, { "epoch": 0.047880944678097705, "grad_norm": 1.5587060451507568, "learning_rate": 7.4e-05, "loss": 1.1707, "step": 37 }, { "epoch": 0.049175024263992236, "grad_norm": 1.079053282737732, "learning_rate": 7.6e-05, "loss": 1.0655, "step": 38 }, { "epoch": 0.050469103849886766, "grad_norm": 1.1773897409439087, "learning_rate": 7.800000000000001e-05, "loss": 1.0465, "step": 39 }, { "epoch": 0.0517631834357813, "grad_norm": 1.2437673807144165, "learning_rate": 8e-05, "loss": 1.2779, "step": 40 }, { "epoch": 0.053057263021675835, "grad_norm": 1.254847526550293, "learning_rate": 8.2e-05, "loss": 1.0898, "step": 41 }, { "epoch": 0.054351342607570366, "grad_norm": 1.1771515607833862, "learning_rate": 8.4e-05, "loss": 1.1827, "step": 42 }, { "epoch": 0.0556454221934649, "grad_norm": 1.1400648355484009, "learning_rate": 8.6e-05, "loss": 1.1066, "step": 43 }, { "epoch": 0.05693950177935943, "grad_norm": 1.2047138214111328, "learning_rate": 8.800000000000001e-05, "loss": 0.8974, "step": 44 }, { "epoch": 0.058233581365253966, "grad_norm": 1.1269346475601196, "learning_rate": 9e-05, "loss": 1.0146, "step": 45 }, { "epoch": 0.059527660951148496, "grad_norm": 1.169231653213501, "learning_rate": 9.200000000000001e-05, "loss": 1.1266, "step": 46 }, { "epoch": 0.06082174053704303, "grad_norm": 0.9771779179573059, "learning_rate": 9.4e-05, "loss": 0.8351, "step": 47 }, { "epoch": 0.06211582012293756, "grad_norm": 1.2849314212799072, "learning_rate": 9.6e-05, "loss": 1.1822, "step": 48 }, { "epoch": 0.0634098997088321, "grad_norm": 1.023181676864624, "learning_rate": 9.8e-05, "loss": 0.9082, "step": 49 }, { "epoch": 0.06470397929472663, "grad_norm": 1.135751724243164, "learning_rate": 0.0001, "loss": 0.9407, "step": 50 }, { "epoch": 0.06599805888062116, "grad_norm": 0.9701154828071594, "learning_rate": 9.999998300231494e-05, "loss": 0.9423, "step": 51 }, { "epoch": 0.06729213846651569, "grad_norm": 1.2891143560409546, "learning_rate": 9.999993200927133e-05, "loss": 0.9757, "step": 52 }, { "epoch": 0.06858621805241022, "grad_norm": 1.3360975980758667, "learning_rate": 9.999984702090383e-05, "loss": 1.0158, "step": 53 }, { "epoch": 0.06988029763830475, "grad_norm": 0.977446436882019, "learning_rate": 9.999972803727024e-05, "loss": 0.8175, "step": 54 }, { "epoch": 0.0711743772241993, "grad_norm": 0.9943827390670776, "learning_rate": 9.999957505845144e-05, "loss": 0.8627, "step": 55 }, { "epoch": 0.07246845681009383, "grad_norm": 1.1531224250793457, "learning_rate": 9.999938808455145e-05, "loss": 1.143, "step": 56 }, { "epoch": 0.07376253639598836, "grad_norm": 1.287972092628479, "learning_rate": 9.99991671156974e-05, "loss": 1.2342, "step": 57 }, { "epoch": 0.07505661598188289, "grad_norm": 1.1554590463638306, "learning_rate": 9.999891215203949e-05, "loss": 0.9692, "step": 58 }, { "epoch": 0.07635069556777742, "grad_norm": 1.0786008834838867, "learning_rate": 9.999862319375113e-05, "loss": 1.1254, "step": 59 }, { "epoch": 0.07764477515367195, "grad_norm": 1.0764508247375488, "learning_rate": 9.999830024102874e-05, "loss": 0.9312, "step": 60 }, { "epoch": 0.07893885473956648, "grad_norm": 1.1909526586532593, "learning_rate": 9.999794329409194e-05, "loss": 0.9959, "step": 61 }, { "epoch": 0.08023293432546101, "grad_norm": 0.9989166259765625, "learning_rate": 9.999755235318337e-05, "loss": 0.934, "step": 62 }, { "epoch": 0.08152701391135554, "grad_norm": 1.0302046537399292, "learning_rate": 9.999712741856889e-05, "loss": 1.1017, "step": 63 }, { "epoch": 0.08282109349725009, "grad_norm": 0.9583478569984436, "learning_rate": 9.999666849053738e-05, "loss": 1.1384, "step": 64 }, { "epoch": 0.08411517308314462, "grad_norm": 1.001126766204834, "learning_rate": 9.999617556940085e-05, "loss": 0.9279, "step": 65 }, { "epoch": 0.08540925266903915, "grad_norm": 1.0130903720855713, "learning_rate": 9.999564865549449e-05, "loss": 0.9381, "step": 66 }, { "epoch": 0.08670333225493368, "grad_norm": 1.1210829019546509, "learning_rate": 9.999508774917652e-05, "loss": 0.9607, "step": 67 }, { "epoch": 0.08799741184082821, "grad_norm": 1.045749545097351, "learning_rate": 9.999449285082831e-05, "loss": 1.0037, "step": 68 }, { "epoch": 0.08929149142672274, "grad_norm": 1.1308139562606812, "learning_rate": 9.999386396085434e-05, "loss": 0.9086, "step": 69 }, { "epoch": 0.09058557101261727, "grad_norm": 1.1013413667678833, "learning_rate": 9.999320107968219e-05, "loss": 1.0712, "step": 70 }, { "epoch": 0.0918796505985118, "grad_norm": 1.0830566883087158, "learning_rate": 9.999250420776258e-05, "loss": 1.0326, "step": 71 }, { "epoch": 0.09317373018440635, "grad_norm": 1.0673171281814575, "learning_rate": 9.999177334556929e-05, "loss": 1.0034, "step": 72 }, { "epoch": 0.09446780977030088, "grad_norm": 1.1546461582183838, "learning_rate": 9.999100849359926e-05, "loss": 1.059, "step": 73 }, { "epoch": 0.09576188935619541, "grad_norm": 0.9139528870582581, "learning_rate": 9.999020965237249e-05, "loss": 0.8596, "step": 74 }, { "epoch": 0.09705596894208994, "grad_norm": 1.1570812463760376, "learning_rate": 9.998937682243215e-05, "loss": 1.0456, "step": 75 }, { "epoch": 0.09835004852798447, "grad_norm": 1.3232612609863281, "learning_rate": 9.998851000434448e-05, "loss": 0.9994, "step": 76 }, { "epoch": 0.099644128113879, "grad_norm": 1.2017115354537964, "learning_rate": 9.998760919869883e-05, "loss": 1.2664, "step": 77 }, { "epoch": 0.10093820769977353, "grad_norm": 1.0694175958633423, "learning_rate": 9.998667440610765e-05, "loss": 0.9483, "step": 78 }, { "epoch": 0.10223228728566806, "grad_norm": 0.9963059425354004, "learning_rate": 9.998570562720654e-05, "loss": 0.9577, "step": 79 }, { "epoch": 0.1035263668715626, "grad_norm": 0.8873535394668579, "learning_rate": 9.998470286265416e-05, "loss": 0.8498, "step": 80 }, { "epoch": 0.10482044645745714, "grad_norm": 1.1350760459899902, "learning_rate": 9.99836661131323e-05, "loss": 1.0024, "step": 81 }, { "epoch": 0.10611452604335167, "grad_norm": 0.8355389833450317, "learning_rate": 9.998259537934586e-05, "loss": 0.7399, "step": 82 }, { "epoch": 0.1074086056292462, "grad_norm": 0.9935446381568909, "learning_rate": 9.998149066202284e-05, "loss": 0.9809, "step": 83 }, { "epoch": 0.10870268521514073, "grad_norm": 1.0571558475494385, "learning_rate": 9.998035196191435e-05, "loss": 1.0144, "step": 84 }, { "epoch": 0.10999676480103526, "grad_norm": 0.9860286116600037, "learning_rate": 9.99791792797946e-05, "loss": 1.0467, "step": 85 }, { "epoch": 0.1112908443869298, "grad_norm": 1.1422507762908936, "learning_rate": 9.997797261646089e-05, "loss": 0.9535, "step": 86 }, { "epoch": 0.11258492397282432, "grad_norm": 0.8561545014381409, "learning_rate": 9.997673197273365e-05, "loss": 1.007, "step": 87 }, { "epoch": 0.11387900355871886, "grad_norm": 1.0027543306350708, "learning_rate": 9.997545734945639e-05, "loss": 0.9861, "step": 88 }, { "epoch": 0.1151730831446134, "grad_norm": 0.8489773273468018, "learning_rate": 9.997414874749575e-05, "loss": 0.9672, "step": 89 }, { "epoch": 0.11646716273050793, "grad_norm": 1.0517115592956543, "learning_rate": 9.997280616774147e-05, "loss": 1.1672, "step": 90 }, { "epoch": 0.11776124231640246, "grad_norm": 1.0035395622253418, "learning_rate": 9.997142961110634e-05, "loss": 0.9294, "step": 91 }, { "epoch": 0.11905532190229699, "grad_norm": 1.1194915771484375, "learning_rate": 9.997001907852635e-05, "loss": 1.0857, "step": 92 }, { "epoch": 0.12034940148819152, "grad_norm": 1.5234825611114502, "learning_rate": 9.996857457096047e-05, "loss": 1.027, "step": 93 }, { "epoch": 0.12164348107408605, "grad_norm": 0.949878454208374, "learning_rate": 9.996709608939088e-05, "loss": 0.8173, "step": 94 }, { "epoch": 0.12293756065998059, "grad_norm": 0.8736472129821777, "learning_rate": 9.996558363482277e-05, "loss": 0.855, "step": 95 }, { "epoch": 0.12423164024587512, "grad_norm": 0.8604567050933838, "learning_rate": 9.996403720828449e-05, "loss": 0.9485, "step": 96 }, { "epoch": 0.12552571983176966, "grad_norm": 1.020851492881775, "learning_rate": 9.996245681082748e-05, "loss": 1.0024, "step": 97 }, { "epoch": 0.1268197994176642, "grad_norm": 1.0704892873764038, "learning_rate": 9.996084244352623e-05, "loss": 0.9246, "step": 98 }, { "epoch": 0.12811387900355872, "grad_norm": 0.8441987037658691, "learning_rate": 9.99591941074784e-05, "loss": 1.0343, "step": 99 }, { "epoch": 0.12940795858945325, "grad_norm": 1.0280612707138062, "learning_rate": 9.995751180380466e-05, "loss": 0.9644, "step": 100 }, { "epoch": 0.13070203817534778, "grad_norm": 0.9827906489372253, "learning_rate": 9.995579553364887e-05, "loss": 0.9583, "step": 101 }, { "epoch": 0.13199611776124232, "grad_norm": 1.035618543624878, "learning_rate": 9.995404529817791e-05, "loss": 1.0366, "step": 102 }, { "epoch": 0.13329019734713685, "grad_norm": 1.2775524854660034, "learning_rate": 9.995226109858178e-05, "loss": 0.9353, "step": 103 }, { "epoch": 0.13458427693303138, "grad_norm": 1.0101919174194336, "learning_rate": 9.995044293607355e-05, "loss": 0.9045, "step": 104 }, { "epoch": 0.1358783565189259, "grad_norm": 0.8396942019462585, "learning_rate": 9.994859081188943e-05, "loss": 0.867, "step": 105 }, { "epoch": 0.13717243610482044, "grad_norm": 1.04515540599823, "learning_rate": 9.99467047272887e-05, "loss": 0.9693, "step": 106 }, { "epoch": 0.13846651569071497, "grad_norm": 1.099042534828186, "learning_rate": 9.994478468355369e-05, "loss": 0.8879, "step": 107 }, { "epoch": 0.1397605952766095, "grad_norm": 0.8710360527038574, "learning_rate": 9.994283068198988e-05, "loss": 0.9018, "step": 108 }, { "epoch": 0.14105467486250403, "grad_norm": 0.961025059223175, "learning_rate": 9.99408427239258e-05, "loss": 0.8806, "step": 109 }, { "epoch": 0.1423487544483986, "grad_norm": 0.915665328502655, "learning_rate": 9.993882081071306e-05, "loss": 0.8628, "step": 110 }, { "epoch": 0.14364283403429312, "grad_norm": 1.2776648998260498, "learning_rate": 9.993676494372642e-05, "loss": 0.9742, "step": 111 }, { "epoch": 0.14493691362018765, "grad_norm": 1.1270071268081665, "learning_rate": 9.993467512436364e-05, "loss": 0.9729, "step": 112 }, { "epoch": 0.14623099320608218, "grad_norm": 0.8188664317131042, "learning_rate": 9.99325513540456e-05, "loss": 0.9428, "step": 113 }, { "epoch": 0.1475250727919767, "grad_norm": 1.0760393142700195, "learning_rate": 9.993039363421627e-05, "loss": 0.9482, "step": 114 }, { "epoch": 0.14881915237787124, "grad_norm": 1.019920825958252, "learning_rate": 9.992820196634273e-05, "loss": 0.9785, "step": 115 }, { "epoch": 0.15011323196376578, "grad_norm": 0.8342046737670898, "learning_rate": 9.992597635191509e-05, "loss": 0.9291, "step": 116 }, { "epoch": 0.1514073115496603, "grad_norm": 0.8460632562637329, "learning_rate": 9.992371679244658e-05, "loss": 0.8797, "step": 117 }, { "epoch": 0.15270139113555484, "grad_norm": 0.933060348033905, "learning_rate": 9.992142328947345e-05, "loss": 0.9657, "step": 118 }, { "epoch": 0.15399547072144937, "grad_norm": 0.8822593688964844, "learning_rate": 9.991909584455511e-05, "loss": 0.8872, "step": 119 }, { "epoch": 0.1552895503073439, "grad_norm": 0.9599350094795227, "learning_rate": 9.991673445927398e-05, "loss": 0.9064, "step": 120 }, { "epoch": 0.15658362989323843, "grad_norm": 0.8505874872207642, "learning_rate": 9.99143391352356e-05, "loss": 0.9966, "step": 121 }, { "epoch": 0.15787770947913296, "grad_norm": 1.3977786302566528, "learning_rate": 9.991190987406857e-05, "loss": 0.9145, "step": 122 }, { "epoch": 0.1591717890650275, "grad_norm": 0.8947294354438782, "learning_rate": 9.990944667742455e-05, "loss": 0.9569, "step": 123 }, { "epoch": 0.16046586865092202, "grad_norm": 0.7973839044570923, "learning_rate": 9.990694954697828e-05, "loss": 0.8853, "step": 124 }, { "epoch": 0.16175994823681655, "grad_norm": 0.9481159448623657, "learning_rate": 9.99044184844276e-05, "loss": 1.04, "step": 125 }, { "epoch": 0.16305402782271108, "grad_norm": 1.3568611145019531, "learning_rate": 9.990185349149339e-05, "loss": 1.1104, "step": 126 }, { "epoch": 0.16434810740860564, "grad_norm": 0.900867223739624, "learning_rate": 9.98992545699196e-05, "loss": 0.8427, "step": 127 }, { "epoch": 0.16564218699450017, "grad_norm": 0.9025059938430786, "learning_rate": 9.989662172147326e-05, "loss": 0.9671, "step": 128 }, { "epoch": 0.1669362665803947, "grad_norm": 0.944692850112915, "learning_rate": 9.989395494794446e-05, "loss": 1.0966, "step": 129 }, { "epoch": 0.16823034616628924, "grad_norm": 1.196311116218567, "learning_rate": 9.989125425114638e-05, "loss": 1.0888, "step": 130 }, { "epoch": 0.16952442575218377, "grad_norm": 0.9069584608078003, "learning_rate": 9.988851963291522e-05, "loss": 0.8579, "step": 131 }, { "epoch": 0.1708185053380783, "grad_norm": 0.8150789141654968, "learning_rate": 9.988575109511026e-05, "loss": 0.7622, "step": 132 }, { "epoch": 0.17211258492397283, "grad_norm": 1.0844395160675049, "learning_rate": 9.988294863961387e-05, "loss": 0.9284, "step": 133 }, { "epoch": 0.17340666450986736, "grad_norm": 1.0463049411773682, "learning_rate": 9.988011226833146e-05, "loss": 0.9185, "step": 134 }, { "epoch": 0.1747007440957619, "grad_norm": 0.9481234550476074, "learning_rate": 9.987724198319148e-05, "loss": 0.8631, "step": 135 }, { "epoch": 0.17599482368165642, "grad_norm": 0.882074773311615, "learning_rate": 9.987433778614549e-05, "loss": 0.8997, "step": 136 }, { "epoch": 0.17728890326755095, "grad_norm": 0.9853332042694092, "learning_rate": 9.987139967916805e-05, "loss": 0.9226, "step": 137 }, { "epoch": 0.17858298285344548, "grad_norm": 1.151941180229187, "learning_rate": 9.98684276642568e-05, "loss": 1.0486, "step": 138 }, { "epoch": 0.17987706243934, "grad_norm": 1.0128459930419922, "learning_rate": 9.986542174343245e-05, "loss": 1.0797, "step": 139 }, { "epoch": 0.18117114202523454, "grad_norm": 0.9798718094825745, "learning_rate": 9.986238191873874e-05, "loss": 0.875, "step": 140 }, { "epoch": 0.18246522161112907, "grad_norm": 0.8143295645713806, "learning_rate": 9.985930819224247e-05, "loss": 0.8454, "step": 141 }, { "epoch": 0.1837593011970236, "grad_norm": 0.8755755424499512, "learning_rate": 9.985620056603348e-05, "loss": 0.8029, "step": 142 }, { "epoch": 0.18505338078291814, "grad_norm": 0.899174690246582, "learning_rate": 9.985305904222469e-05, "loss": 0.9608, "step": 143 }, { "epoch": 0.1863474603688127, "grad_norm": 0.920137882232666, "learning_rate": 9.984988362295203e-05, "loss": 0.9022, "step": 144 }, { "epoch": 0.18764153995470723, "grad_norm": 1.1012908220291138, "learning_rate": 9.984667431037447e-05, "loss": 0.9621, "step": 145 }, { "epoch": 0.18893561954060176, "grad_norm": 0.8609358668327332, "learning_rate": 9.98434311066741e-05, "loss": 0.917, "step": 146 }, { "epoch": 0.1902296991264963, "grad_norm": 0.8248727321624756, "learning_rate": 9.984015401405594e-05, "loss": 0.7864, "step": 147 }, { "epoch": 0.19152377871239082, "grad_norm": 0.8680225610733032, "learning_rate": 9.983684303474815e-05, "loss": 0.9288, "step": 148 }, { "epoch": 0.19281785829828535, "grad_norm": 1.0807067155838013, "learning_rate": 9.983349817100188e-05, "loss": 0.9842, "step": 149 }, { "epoch": 0.19411193788417988, "grad_norm": 0.9310898780822754, "learning_rate": 9.983011942509131e-05, "loss": 1.0568, "step": 150 }, { "epoch": 0.1954060174700744, "grad_norm": 0.8052242398262024, "learning_rate": 9.98267067993137e-05, "loss": 0.8235, "step": 151 }, { "epoch": 0.19670009705596894, "grad_norm": 0.9700384140014648, "learning_rate": 9.982326029598931e-05, "loss": 0.8611, "step": 152 }, { "epoch": 0.19799417664186347, "grad_norm": 0.8437764048576355, "learning_rate": 9.981977991746142e-05, "loss": 0.83, "step": 153 }, { "epoch": 0.199288256227758, "grad_norm": 0.930636465549469, "learning_rate": 9.98162656660964e-05, "loss": 1.0892, "step": 154 }, { "epoch": 0.20058233581365253, "grad_norm": 0.9111954569816589, "learning_rate": 9.98127175442836e-05, "loss": 0.9962, "step": 155 }, { "epoch": 0.20187641539954707, "grad_norm": 0.9521974921226501, "learning_rate": 9.980913555443541e-05, "loss": 0.911, "step": 156 }, { "epoch": 0.2031704949854416, "grad_norm": 0.8516745567321777, "learning_rate": 9.980551969898727e-05, "loss": 0.9009, "step": 157 }, { "epoch": 0.20446457457133613, "grad_norm": 0.8302998542785645, "learning_rate": 9.98018699803976e-05, "loss": 0.8854, "step": 158 }, { "epoch": 0.20575865415723066, "grad_norm": 0.814391016960144, "learning_rate": 9.979818640114789e-05, "loss": 0.9601, "step": 159 }, { "epoch": 0.2070527337431252, "grad_norm": 0.8938564658164978, "learning_rate": 9.979446896374262e-05, "loss": 0.8834, "step": 160 }, { "epoch": 0.20834681332901975, "grad_norm": 0.9066985249519348, "learning_rate": 9.979071767070932e-05, "loss": 0.7427, "step": 161 }, { "epoch": 0.20964089291491428, "grad_norm": 0.7866595983505249, "learning_rate": 9.978693252459851e-05, "loss": 0.8556, "step": 162 }, { "epoch": 0.2109349725008088, "grad_norm": 0.9159708023071289, "learning_rate": 9.978311352798374e-05, "loss": 0.8101, "step": 163 }, { "epoch": 0.21222905208670334, "grad_norm": 1.1350793838500977, "learning_rate": 9.977926068346157e-05, "loss": 0.9374, "step": 164 }, { "epoch": 0.21352313167259787, "grad_norm": 1.0535932779312134, "learning_rate": 9.977537399365159e-05, "loss": 1.0238, "step": 165 }, { "epoch": 0.2148172112584924, "grad_norm": 0.8717033267021179, "learning_rate": 9.977145346119637e-05, "loss": 1.0265, "step": 166 }, { "epoch": 0.21611129084438693, "grad_norm": 0.8357003927230835, "learning_rate": 9.976749908876152e-05, "loss": 0.9016, "step": 167 }, { "epoch": 0.21740537043028146, "grad_norm": 0.8369495868682861, "learning_rate": 9.976351087903568e-05, "loss": 0.8764, "step": 168 }, { "epoch": 0.218699450016176, "grad_norm": 0.912352979183197, "learning_rate": 9.97594888347304e-05, "loss": 0.9078, "step": 169 }, { "epoch": 0.21999352960207053, "grad_norm": 0.8475804328918457, "learning_rate": 9.975543295858035e-05, "loss": 0.8836, "step": 170 }, { "epoch": 0.22128760918796506, "grad_norm": 0.8391397595405579, "learning_rate": 9.97513432533431e-05, "loss": 0.9003, "step": 171 }, { "epoch": 0.2225816887738596, "grad_norm": 0.9666821360588074, "learning_rate": 9.974721972179931e-05, "loss": 0.9528, "step": 172 }, { "epoch": 0.22387576835975412, "grad_norm": 0.9321691393852234, "learning_rate": 9.974306236675259e-05, "loss": 0.9575, "step": 173 }, { "epoch": 0.22516984794564865, "grad_norm": 0.8022271990776062, "learning_rate": 9.973887119102957e-05, "loss": 0.8731, "step": 174 }, { "epoch": 0.22646392753154318, "grad_norm": 1.1056872606277466, "learning_rate": 9.973464619747983e-05, "loss": 0.9925, "step": 175 }, { "epoch": 0.2277580071174377, "grad_norm": 0.810420036315918, "learning_rate": 9.9730387388976e-05, "loss": 1.0073, "step": 176 }, { "epoch": 0.22905208670333224, "grad_norm": 0.9536454677581787, "learning_rate": 9.972609476841367e-05, "loss": 0.9595, "step": 177 }, { "epoch": 0.2303461662892268, "grad_norm": 0.8205066919326782, "learning_rate": 9.972176833871142e-05, "loss": 0.8146, "step": 178 }, { "epoch": 0.23164024587512133, "grad_norm": 0.9716495275497437, "learning_rate": 9.971740810281083e-05, "loss": 1.0377, "step": 179 }, { "epoch": 0.23293432546101586, "grad_norm": 0.828642725944519, "learning_rate": 9.971301406367644e-05, "loss": 0.8619, "step": 180 }, { "epoch": 0.2342284050469104, "grad_norm": 0.6980477571487427, "learning_rate": 9.970858622429579e-05, "loss": 0.8271, "step": 181 }, { "epoch": 0.23552248463280492, "grad_norm": 0.954387903213501, "learning_rate": 9.970412458767943e-05, "loss": 0.8465, "step": 182 }, { "epoch": 0.23681656421869945, "grad_norm": 0.8425692915916443, "learning_rate": 9.969962915686083e-05, "loss": 0.8893, "step": 183 }, { "epoch": 0.23811064380459399, "grad_norm": 0.8565071225166321, "learning_rate": 9.969509993489647e-05, "loss": 0.939, "step": 184 }, { "epoch": 0.23940472339048852, "grad_norm": 0.8831691145896912, "learning_rate": 9.969053692486583e-05, "loss": 0.8907, "step": 185 }, { "epoch": 0.24069880297638305, "grad_norm": 0.9661678075790405, "learning_rate": 9.96859401298713e-05, "loss": 0.9191, "step": 186 }, { "epoch": 0.24199288256227758, "grad_norm": 0.8784729838371277, "learning_rate": 9.968130955303828e-05, "loss": 1.0393, "step": 187 }, { "epoch": 0.2432869621481721, "grad_norm": 0.8830071091651917, "learning_rate": 9.967664519751515e-05, "loss": 0.9837, "step": 188 }, { "epoch": 0.24458104173406664, "grad_norm": 0.862108588218689, "learning_rate": 9.967194706647322e-05, "loss": 0.7871, "step": 189 }, { "epoch": 0.24587512131996117, "grad_norm": 1.0068063735961914, "learning_rate": 9.966721516310682e-05, "loss": 0.9526, "step": 190 }, { "epoch": 0.2471692009058557, "grad_norm": 0.9828710556030273, "learning_rate": 9.966244949063316e-05, "loss": 0.8923, "step": 191 }, { "epoch": 0.24846328049175023, "grad_norm": 1.0729883909225464, "learning_rate": 9.965765005229248e-05, "loss": 1.0115, "step": 192 }, { "epoch": 0.24975736007764476, "grad_norm": 0.9844326972961426, "learning_rate": 9.965281685134796e-05, "loss": 0.9855, "step": 193 }, { "epoch": 0.2510514396635393, "grad_norm": 1.1593172550201416, "learning_rate": 9.96479498910857e-05, "loss": 1.0912, "step": 194 }, { "epoch": 0.2523455192494338, "grad_norm": 0.8835370540618896, "learning_rate": 9.964304917481482e-05, "loss": 0.9951, "step": 195 }, { "epoch": 0.2536395988353284, "grad_norm": 0.9553850889205933, "learning_rate": 9.963811470586733e-05, "loss": 0.9335, "step": 196 }, { "epoch": 0.2549336784212229, "grad_norm": 0.863814115524292, "learning_rate": 9.963314648759823e-05, "loss": 1.0203, "step": 197 }, { "epoch": 0.25622775800711745, "grad_norm": 0.9639378786087036, "learning_rate": 9.962814452338542e-05, "loss": 1.0357, "step": 198 }, { "epoch": 0.25752183759301195, "grad_norm": 0.880519688129425, "learning_rate": 9.96231088166298e-05, "loss": 0.9964, "step": 199 }, { "epoch": 0.2588159171789065, "grad_norm": 0.8445360064506531, "learning_rate": 9.961803937075516e-05, "loss": 0.9766, "step": 200 }, { "epoch": 0.260109996764801, "grad_norm": 0.8204835057258606, "learning_rate": 9.961293618920826e-05, "loss": 0.8864, "step": 201 }, { "epoch": 0.26140407635069557, "grad_norm": 0.9315406084060669, "learning_rate": 9.960779927545883e-05, "loss": 1.0388, "step": 202 }, { "epoch": 0.2626981559365901, "grad_norm": 0.9286762475967407, "learning_rate": 9.960262863299943e-05, "loss": 0.9653, "step": 203 }, { "epoch": 0.26399223552248463, "grad_norm": 0.8037816882133484, "learning_rate": 9.959742426534566e-05, "loss": 0.765, "step": 204 }, { "epoch": 0.2652863151083792, "grad_norm": 0.9435904622077942, "learning_rate": 9.9592186176036e-05, "loss": 0.8889, "step": 205 }, { "epoch": 0.2665803946942737, "grad_norm": 1.0072762966156006, "learning_rate": 9.958691436863188e-05, "loss": 0.8358, "step": 206 }, { "epoch": 0.26787447428016825, "grad_norm": 0.9463568329811096, "learning_rate": 9.958160884671761e-05, "loss": 0.8815, "step": 207 }, { "epoch": 0.26916855386606275, "grad_norm": 0.9203188419342041, "learning_rate": 9.957626961390047e-05, "loss": 0.9312, "step": 208 }, { "epoch": 0.2704626334519573, "grad_norm": 1.0614677667617798, "learning_rate": 9.957089667381064e-05, "loss": 0.9822, "step": 209 }, { "epoch": 0.2717567130378518, "grad_norm": 0.8971818089485168, "learning_rate": 9.956549003010123e-05, "loss": 0.9421, "step": 210 }, { "epoch": 0.2730507926237464, "grad_norm": 0.9978768825531006, "learning_rate": 9.956004968644825e-05, "loss": 0.9539, "step": 211 }, { "epoch": 0.2743448722096409, "grad_norm": 0.7017274498939514, "learning_rate": 9.955457564655064e-05, "loss": 0.665, "step": 212 }, { "epoch": 0.27563895179553544, "grad_norm": 0.8292055726051331, "learning_rate": 9.954906791413023e-05, "loss": 0.922, "step": 213 }, { "epoch": 0.27693303138142994, "grad_norm": 0.978084146976471, "learning_rate": 9.954352649293178e-05, "loss": 0.9465, "step": 214 }, { "epoch": 0.2782271109673245, "grad_norm": 1.0260313749313354, "learning_rate": 9.953795138672291e-05, "loss": 0.9093, "step": 215 }, { "epoch": 0.279521190553219, "grad_norm": 1.162850022315979, "learning_rate": 9.95323425992942e-05, "loss": 1.0372, "step": 216 }, { "epoch": 0.28081527013911356, "grad_norm": 0.9785279631614685, "learning_rate": 9.952670013445913e-05, "loss": 0.8818, "step": 217 }, { "epoch": 0.28210934972500806, "grad_norm": 0.9386499524116516, "learning_rate": 9.9521023996054e-05, "loss": 0.8711, "step": 218 }, { "epoch": 0.2834034293109026, "grad_norm": 0.8620506525039673, "learning_rate": 9.951531418793812e-05, "loss": 1.011, "step": 219 }, { "epoch": 0.2846975088967972, "grad_norm": 0.9523435831069946, "learning_rate": 9.950957071399357e-05, "loss": 0.8541, "step": 220 }, { "epoch": 0.2859915884826917, "grad_norm": 0.8993477821350098, "learning_rate": 9.950379357812543e-05, "loss": 1.0253, "step": 221 }, { "epoch": 0.28728566806858624, "grad_norm": 1.073880910873413, "learning_rate": 9.949798278426158e-05, "loss": 1.115, "step": 222 }, { "epoch": 0.28857974765448074, "grad_norm": 0.7941976189613342, "learning_rate": 9.949213833635285e-05, "loss": 0.9398, "step": 223 }, { "epoch": 0.2898738272403753, "grad_norm": 0.798089325428009, "learning_rate": 9.948626023837291e-05, "loss": 0.8523, "step": 224 }, { "epoch": 0.2911679068262698, "grad_norm": 1.0251280069351196, "learning_rate": 9.948034849431831e-05, "loss": 0.939, "step": 225 }, { "epoch": 0.29246198641216437, "grad_norm": 0.9793195724487305, "learning_rate": 9.947440310820852e-05, "loss": 1.0998, "step": 226 }, { "epoch": 0.29375606599805887, "grad_norm": 0.8190125823020935, "learning_rate": 9.946842408408583e-05, "loss": 0.9606, "step": 227 }, { "epoch": 0.2950501455839534, "grad_norm": 0.8229602575302124, "learning_rate": 9.946241142601543e-05, "loss": 0.7944, "step": 228 }, { "epoch": 0.29634422516984793, "grad_norm": 0.8640865683555603, "learning_rate": 9.945636513808537e-05, "loss": 1.112, "step": 229 }, { "epoch": 0.2976383047557425, "grad_norm": 0.774501621723175, "learning_rate": 9.945028522440653e-05, "loss": 0.8986, "step": 230 }, { "epoch": 0.298932384341637, "grad_norm": 0.9039688110351562, "learning_rate": 9.944417168911275e-05, "loss": 1.0461, "step": 231 }, { "epoch": 0.30022646392753155, "grad_norm": 0.8048250675201416, "learning_rate": 9.943802453636065e-05, "loss": 0.89, "step": 232 }, { "epoch": 0.30152054351342605, "grad_norm": 0.8166521787643433, "learning_rate": 9.94318437703297e-05, "loss": 0.9149, "step": 233 }, { "epoch": 0.3028146230993206, "grad_norm": 0.7571333646774292, "learning_rate": 9.942562939522228e-05, "loss": 0.9655, "step": 234 }, { "epoch": 0.3041087026852151, "grad_norm": 0.6913223266601562, "learning_rate": 9.941938141526354e-05, "loss": 0.869, "step": 235 }, { "epoch": 0.3054027822711097, "grad_norm": 0.8476676344871521, "learning_rate": 9.94130998347016e-05, "loss": 0.8849, "step": 236 }, { "epoch": 0.30669686185700423, "grad_norm": 0.8454031944274902, "learning_rate": 9.940678465780728e-05, "loss": 0.9102, "step": 237 }, { "epoch": 0.30799094144289874, "grad_norm": 0.8514583706855774, "learning_rate": 9.940043588887438e-05, "loss": 0.9723, "step": 238 }, { "epoch": 0.3092850210287933, "grad_norm": 0.7330415844917297, "learning_rate": 9.939405353221942e-05, "loss": 0.9537, "step": 239 }, { "epoch": 0.3105791006146878, "grad_norm": 0.9652897715568542, "learning_rate": 9.938763759218185e-05, "loss": 0.9736, "step": 240 }, { "epoch": 0.31187318020058236, "grad_norm": 0.7517886161804199, "learning_rate": 9.93811880731239e-05, "loss": 0.8436, "step": 241 }, { "epoch": 0.31316725978647686, "grad_norm": 0.8159210681915283, "learning_rate": 9.937470497943064e-05, "loss": 0.7521, "step": 242 }, { "epoch": 0.3144613393723714, "grad_norm": 0.9554911851882935, "learning_rate": 9.936818831550998e-05, "loss": 1.1076, "step": 243 }, { "epoch": 0.3157554189582659, "grad_norm": 0.8745877742767334, "learning_rate": 9.936163808579266e-05, "loss": 0.8908, "step": 244 }, { "epoch": 0.3170494985441605, "grad_norm": 0.8050674200057983, "learning_rate": 9.93550542947322e-05, "loss": 1.0134, "step": 245 }, { "epoch": 0.318343578130055, "grad_norm": 1.009790062904358, "learning_rate": 9.9348436946805e-05, "loss": 1.0264, "step": 246 }, { "epoch": 0.31963765771594954, "grad_norm": 0.8702448606491089, "learning_rate": 9.934178604651023e-05, "loss": 1.0067, "step": 247 }, { "epoch": 0.32093173730184404, "grad_norm": 0.8105303049087524, "learning_rate": 9.933510159836989e-05, "loss": 0.8121, "step": 248 }, { "epoch": 0.3222258168877386, "grad_norm": 0.7680085897445679, "learning_rate": 9.932838360692878e-05, "loss": 0.8951, "step": 249 }, { "epoch": 0.3235198964736331, "grad_norm": 0.8338052034378052, "learning_rate": 9.93216320767545e-05, "loss": 0.8878, "step": 250 }, { "epoch": 0.32481397605952766, "grad_norm": 0.8709661960601807, "learning_rate": 9.93148470124375e-05, "loss": 0.8786, "step": 251 }, { "epoch": 0.32610805564542217, "grad_norm": 0.9593453407287598, "learning_rate": 9.930802841859095e-05, "loss": 1.0659, "step": 252 }, { "epoch": 0.3274021352313167, "grad_norm": 1.0130974054336548, "learning_rate": 9.93011762998509e-05, "loss": 0.9626, "step": 253 }, { "epoch": 0.3286962148172113, "grad_norm": 0.9949910640716553, "learning_rate": 9.929429066087616e-05, "loss": 1.0499, "step": 254 }, { "epoch": 0.3299902944031058, "grad_norm": 0.927542507648468, "learning_rate": 9.92873715063483e-05, "loss": 0.9571, "step": 255 } ], "logging_steps": 1, "max_steps": 3860, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.8507397878185984e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }