{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9946949602122017, "eval_steps": 500, "global_step": 282, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007073386383731211, "grad_norm": 0.4045802652835846, "learning_rate": 0.0, "loss": 2.592, "step": 1 }, { "epoch": 0.014146772767462422, "grad_norm": 0.4087854325771332, "learning_rate": 0.00018927892607143717, "loss": 2.3663, "step": 2 }, { "epoch": 0.021220159151193633, "grad_norm": 0.391991525888443, "learning_rate": 0.0003, "loss": 2.3427, "step": 3 }, { "epoch": 0.028293545534924844, "grad_norm": 0.47497037053108215, "learning_rate": 0.0003, "loss": 2.4095, "step": 4 }, { "epoch": 0.03536693191865606, "grad_norm": 0.3936399221420288, "learning_rate": 0.0003, "loss": 1.7048, "step": 5 }, { "epoch": 0.042440318302387266, "grad_norm": 0.6155605316162109, "learning_rate": 0.0003, "loss": 1.8204, "step": 6 }, { "epoch": 0.04951370468611848, "grad_norm": 0.49080851674079895, "learning_rate": 0.0003, "loss": 1.4646, "step": 7 }, { "epoch": 0.05658709106984969, "grad_norm": 0.5759713053703308, "learning_rate": 0.0003, "loss": 1.4984, "step": 8 }, { "epoch": 0.0636604774535809, "grad_norm": 0.5349287390708923, "learning_rate": 0.0003, "loss": 1.3691, "step": 9 }, { "epoch": 0.07073386383731212, "grad_norm": 0.3948557674884796, "learning_rate": 0.0003, "loss": 1.4401, "step": 10 }, { "epoch": 0.07780725022104333, "grad_norm": 0.37507522106170654, "learning_rate": 0.0003, "loss": 1.1852, "step": 11 }, { "epoch": 0.08488063660477453, "grad_norm": 0.32405033707618713, "learning_rate": 0.0003, "loss": 1.051, "step": 12 }, { "epoch": 0.09195402298850575, "grad_norm": 0.4525175392627716, "learning_rate": 0.0003, "loss": 1.2695, "step": 13 }, { "epoch": 0.09902740937223696, "grad_norm": 0.42692625522613525, "learning_rate": 0.0003, "loss": 1.1057, "step": 14 }, { "epoch": 0.10610079575596817, "grad_norm": 0.5049455761909485, "learning_rate": 0.0003, "loss": 1.6851, "step": 15 }, { "epoch": 0.11317418213969938, "grad_norm": 0.38740119338035583, "learning_rate": 0.0003, "loss": 1.2632, "step": 16 }, { "epoch": 0.12024756852343059, "grad_norm": 0.3729807138442993, "learning_rate": 0.0003, "loss": 1.2857, "step": 17 }, { "epoch": 0.1273209549071618, "grad_norm": 0.4548921287059784, "learning_rate": 0.0003, "loss": 1.2233, "step": 18 }, { "epoch": 0.134394341290893, "grad_norm": 0.4324336051940918, "learning_rate": 0.0003, "loss": 1.1058, "step": 19 }, { "epoch": 0.14146772767462423, "grad_norm": 0.5775079727172852, "learning_rate": 0.0003, "loss": 1.0475, "step": 20 }, { "epoch": 0.14854111405835543, "grad_norm": 0.40563157200813293, "learning_rate": 0.0003, "loss": 1.1364, "step": 21 }, { "epoch": 0.15561450044208666, "grad_norm": 0.4697245657444, "learning_rate": 0.0003, "loss": 1.3599, "step": 22 }, { "epoch": 0.16268788682581786, "grad_norm": 0.42879530787467957, "learning_rate": 0.0003, "loss": 1.1086, "step": 23 }, { "epoch": 0.16976127320954906, "grad_norm": 0.42367979884147644, "learning_rate": 0.0003, "loss": 0.9705, "step": 24 }, { "epoch": 0.1768346595932803, "grad_norm": 0.3987770080566406, "learning_rate": 0.0003, "loss": 1.0087, "step": 25 }, { "epoch": 0.1839080459770115, "grad_norm": 0.3194337487220764, "learning_rate": 0.0003, "loss": 0.8143, "step": 26 }, { "epoch": 0.1909814323607427, "grad_norm": 0.3626921474933624, "learning_rate": 0.0003, "loss": 0.9763, "step": 27 }, { "epoch": 0.19805481874447392, "grad_norm": 0.38496437668800354, "learning_rate": 0.0003, "loss": 0.6315, "step": 28 }, { "epoch": 0.20512820512820512, "grad_norm": 0.41984379291534424, "learning_rate": 0.0003, "loss": 1.0303, "step": 29 }, { "epoch": 0.21220159151193635, "grad_norm": 0.4012935161590576, "learning_rate": 0.0003, "loss": 0.9862, "step": 30 }, { "epoch": 0.21927497789566755, "grad_norm": 0.40578627586364746, "learning_rate": 0.0003, "loss": 1.0094, "step": 31 }, { "epoch": 0.22634836427939875, "grad_norm": 0.41153454780578613, "learning_rate": 0.0003, "loss": 0.966, "step": 32 }, { "epoch": 0.23342175066312998, "grad_norm": 0.3835723400115967, "learning_rate": 0.0003, "loss": 0.5704, "step": 33 }, { "epoch": 0.24049513704686118, "grad_norm": 0.4588032066822052, "learning_rate": 0.0003, "loss": 0.8564, "step": 34 }, { "epoch": 0.2475685234305924, "grad_norm": 0.42644572257995605, "learning_rate": 0.0003, "loss": 0.8448, "step": 35 }, { "epoch": 0.2546419098143236, "grad_norm": 0.44491246342658997, "learning_rate": 0.0003, "loss": 1.1484, "step": 36 }, { "epoch": 0.26171529619805484, "grad_norm": 0.44271302223205566, "learning_rate": 0.0003, "loss": 0.7746, "step": 37 }, { "epoch": 0.268788682581786, "grad_norm": 0.4080619215965271, "learning_rate": 0.0003, "loss": 0.5377, "step": 38 }, { "epoch": 0.27586206896551724, "grad_norm": 0.3697488605976105, "learning_rate": 0.0003, "loss": 0.9936, "step": 39 }, { "epoch": 0.28293545534924847, "grad_norm": 0.37987953424453735, "learning_rate": 0.0003, "loss": 0.7066, "step": 40 }, { "epoch": 0.29000884173297964, "grad_norm": 0.5652127861976624, "learning_rate": 0.0003, "loss": 0.8813, "step": 41 }, { "epoch": 0.29708222811671087, "grad_norm": 0.45179855823516846, "learning_rate": 0.0003, "loss": 0.6442, "step": 42 }, { "epoch": 0.3041556145004421, "grad_norm": 0.40251022577285767, "learning_rate": 0.0003, "loss": 0.6876, "step": 43 }, { "epoch": 0.3112290008841733, "grad_norm": 0.3425946831703186, "learning_rate": 0.0003, "loss": 0.4759, "step": 44 }, { "epoch": 0.3183023872679045, "grad_norm": 0.3156929016113281, "learning_rate": 0.0003, "loss": 0.5237, "step": 45 }, { "epoch": 0.3253757736516357, "grad_norm": 0.5097647309303284, "learning_rate": 0.0003, "loss": 1.0965, "step": 46 }, { "epoch": 0.33244916003536695, "grad_norm": 0.4245418906211853, "learning_rate": 0.0003, "loss": 0.717, "step": 47 }, { "epoch": 0.3395225464190981, "grad_norm": 0.36271074414253235, "learning_rate": 0.0003, "loss": 0.925, "step": 48 }, { "epoch": 0.34659593280282935, "grad_norm": 0.3543199300765991, "learning_rate": 0.0003, "loss": 0.52, "step": 49 }, { "epoch": 0.3536693191865606, "grad_norm": 0.4760311245918274, "learning_rate": 0.0003, "loss": 0.6514, "step": 50 }, { "epoch": 0.36074270557029176, "grad_norm": 0.36290043592453003, "learning_rate": 0.0003, "loss": 0.6391, "step": 51 }, { "epoch": 0.367816091954023, "grad_norm": 0.4390805959701538, "learning_rate": 0.0003, "loss": 0.7822, "step": 52 }, { "epoch": 0.3748894783377542, "grad_norm": 0.402041494846344, "learning_rate": 0.0003, "loss": 0.5967, "step": 53 }, { "epoch": 0.3819628647214854, "grad_norm": 0.42580777406692505, "learning_rate": 0.0003, "loss": 0.7591, "step": 54 }, { "epoch": 0.3890362511052166, "grad_norm": 0.4342993199825287, "learning_rate": 0.0003, "loss": 0.9428, "step": 55 }, { "epoch": 0.39610963748894784, "grad_norm": 0.42949816584587097, "learning_rate": 0.0003, "loss": 0.6546, "step": 56 }, { "epoch": 0.40318302387267907, "grad_norm": 0.44655221700668335, "learning_rate": 0.0003, "loss": 0.6999, "step": 57 }, { "epoch": 0.41025641025641024, "grad_norm": 0.38236895203590393, "learning_rate": 0.0003, "loss": 0.5464, "step": 58 }, { "epoch": 0.41732979664014147, "grad_norm": 0.39055347442626953, "learning_rate": 0.0003, "loss": 0.8726, "step": 59 }, { "epoch": 0.4244031830238727, "grad_norm": 0.47743409872055054, "learning_rate": 0.0003, "loss": 0.6839, "step": 60 }, { "epoch": 0.43147656940760387, "grad_norm": 0.5571391582489014, "learning_rate": 0.0003, "loss": 0.6384, "step": 61 }, { "epoch": 0.4385499557913351, "grad_norm": 0.4612606465816498, "learning_rate": 0.0003, "loss": 0.8187, "step": 62 }, { "epoch": 0.44562334217506633, "grad_norm": 0.3999072313308716, "learning_rate": 0.0003, "loss": 0.6792, "step": 63 }, { "epoch": 0.4526967285587975, "grad_norm": 0.4889736771583557, "learning_rate": 0.0003, "loss": 0.7837, "step": 64 }, { "epoch": 0.45977011494252873, "grad_norm": 0.4411163628101349, "learning_rate": 0.0003, "loss": 0.7325, "step": 65 }, { "epoch": 0.46684350132625996, "grad_norm": 0.4137038588523865, "learning_rate": 0.0003, "loss": 0.5974, "step": 66 }, { "epoch": 0.4739168877099912, "grad_norm": 0.4226423501968384, "learning_rate": 0.0003, "loss": 0.6251, "step": 67 }, { "epoch": 0.48099027409372236, "grad_norm": 0.4461803734302521, "learning_rate": 0.0003, "loss": 0.5721, "step": 68 }, { "epoch": 0.4880636604774536, "grad_norm": 0.4135233461856842, "learning_rate": 0.0003, "loss": 0.708, "step": 69 }, { "epoch": 0.4951370468611848, "grad_norm": 0.40338656306266785, "learning_rate": 0.0003, "loss": 0.6943, "step": 70 }, { "epoch": 0.502210433244916, "grad_norm": 0.47266095876693726, "learning_rate": 0.0003, "loss": 0.6883, "step": 71 }, { "epoch": 0.5092838196286472, "grad_norm": 0.45008358359336853, "learning_rate": 0.0003, "loss": 0.6347, "step": 72 }, { "epoch": 0.5163572060123784, "grad_norm": 0.36589792370796204, "learning_rate": 0.0003, "loss": 0.746, "step": 73 }, { "epoch": 0.5234305923961097, "grad_norm": 0.36300450563430786, "learning_rate": 0.0003, "loss": 0.7846, "step": 74 }, { "epoch": 0.5305039787798409, "grad_norm": 0.42305129766464233, "learning_rate": 0.0003, "loss": 0.7909, "step": 75 }, { "epoch": 0.537577365163572, "grad_norm": 0.36807361245155334, "learning_rate": 0.0003, "loss": 0.578, "step": 76 }, { "epoch": 0.5446507515473032, "grad_norm": 0.3479249179363251, "learning_rate": 0.0003, "loss": 0.4358, "step": 77 }, { "epoch": 0.5517241379310345, "grad_norm": 0.4373302161693573, "learning_rate": 0.0003, "loss": 0.8263, "step": 78 }, { "epoch": 0.5587975243147657, "grad_norm": 0.5427613854408264, "learning_rate": 0.0003, "loss": 0.7728, "step": 79 }, { "epoch": 0.5658709106984969, "grad_norm": 0.4510067403316498, "learning_rate": 0.0003, "loss": 0.7188, "step": 80 }, { "epoch": 0.5729442970822282, "grad_norm": 0.3964546322822571, "learning_rate": 0.0003, "loss": 0.6707, "step": 81 }, { "epoch": 0.5800176834659593, "grad_norm": 0.40177956223487854, "learning_rate": 0.0003, "loss": 0.7056, "step": 82 }, { "epoch": 0.5870910698496905, "grad_norm": 0.4081084728240967, "learning_rate": 0.0003, "loss": 0.6588, "step": 83 }, { "epoch": 0.5941644562334217, "grad_norm": 0.3595137298107147, "learning_rate": 0.0003, "loss": 0.6469, "step": 84 }, { "epoch": 0.601237842617153, "grad_norm": 0.40407031774520874, "learning_rate": 0.0003, "loss": 0.6954, "step": 85 }, { "epoch": 0.6083112290008842, "grad_norm": 0.47531482577323914, "learning_rate": 0.0003, "loss": 0.5842, "step": 86 }, { "epoch": 0.6153846153846154, "grad_norm": 0.3669019639492035, "learning_rate": 0.0003, "loss": 0.6278, "step": 87 }, { "epoch": 0.6224580017683466, "grad_norm": 0.3638778030872345, "learning_rate": 0.0003, "loss": 0.4731, "step": 88 }, { "epoch": 0.6295313881520778, "grad_norm": 0.39883217215538025, "learning_rate": 0.0003, "loss": 0.6891, "step": 89 }, { "epoch": 0.636604774535809, "grad_norm": 0.627139687538147, "learning_rate": 0.0003, "loss": 0.58, "step": 90 }, { "epoch": 0.6436781609195402, "grad_norm": 0.5339258313179016, "learning_rate": 0.0003, "loss": 0.6198, "step": 91 }, { "epoch": 0.6507515473032714, "grad_norm": 0.4699147939682007, "learning_rate": 0.0003, "loss": 0.7175, "step": 92 }, { "epoch": 0.6578249336870027, "grad_norm": 0.3144320249557495, "learning_rate": 0.0003, "loss": 0.4438, "step": 93 }, { "epoch": 0.6648983200707339, "grad_norm": 0.47343114018440247, "learning_rate": 0.0003, "loss": 0.7511, "step": 94 }, { "epoch": 0.671971706454465, "grad_norm": 0.43690529465675354, "learning_rate": 0.0003, "loss": 0.4847, "step": 95 }, { "epoch": 0.6790450928381963, "grad_norm": 0.5092759728431702, "learning_rate": 0.0003, "loss": 0.6703, "step": 96 }, { "epoch": 0.6861184792219275, "grad_norm": 0.7045844793319702, "learning_rate": 0.0003, "loss": 0.717, "step": 97 }, { "epoch": 0.6931918656056587, "grad_norm": 0.34709087014198303, "learning_rate": 0.0003, "loss": 0.5597, "step": 98 }, { "epoch": 0.7002652519893899, "grad_norm": 0.39407986402511597, "learning_rate": 0.0003, "loss": 0.5079, "step": 99 }, { "epoch": 0.7073386383731212, "grad_norm": 0.6836314797401428, "learning_rate": 0.0003, "loss": 0.5947, "step": 100 }, { "epoch": 0.7144120247568524, "grad_norm": 0.4487530291080475, "learning_rate": 0.0003, "loss": 0.5638, "step": 101 }, { "epoch": 0.7214854111405835, "grad_norm": 0.34299322962760925, "learning_rate": 0.0003, "loss": 0.4268, "step": 102 }, { "epoch": 0.7285587975243147, "grad_norm": 0.4325425624847412, "learning_rate": 0.0003, "loss": 0.7195, "step": 103 }, { "epoch": 0.735632183908046, "grad_norm": 0.3857167959213257, "learning_rate": 0.0003, "loss": 0.5525, "step": 104 }, { "epoch": 0.7427055702917772, "grad_norm": 0.5439281463623047, "learning_rate": 0.0003, "loss": 0.8488, "step": 105 }, { "epoch": 0.7497789566755084, "grad_norm": 0.5054299831390381, "learning_rate": 0.0003, "loss": 0.5801, "step": 106 }, { "epoch": 0.7568523430592397, "grad_norm": 0.5152317881584167, "learning_rate": 0.0003, "loss": 0.6918, "step": 107 }, { "epoch": 0.7639257294429708, "grad_norm": 0.32669249176979065, "learning_rate": 0.0003, "loss": 0.5322, "step": 108 }, { "epoch": 0.770999115826702, "grad_norm": 0.4302417039871216, "learning_rate": 0.0003, "loss": 0.6439, "step": 109 }, { "epoch": 0.7780725022104332, "grad_norm": 0.4388223886489868, "learning_rate": 0.0003, "loss": 0.6196, "step": 110 }, { "epoch": 0.7851458885941645, "grad_norm": 0.42924442887306213, "learning_rate": 0.0003, "loss": 0.5175, "step": 111 }, { "epoch": 0.7922192749778957, "grad_norm": 0.4361798167228699, "learning_rate": 0.0003, "loss": 0.5342, "step": 112 }, { "epoch": 0.7992926613616269, "grad_norm": 0.4133489429950714, "learning_rate": 0.0003, "loss": 0.5639, "step": 113 }, { "epoch": 0.8063660477453581, "grad_norm": 0.34224194288253784, "learning_rate": 0.0003, "loss": 0.4695, "step": 114 }, { "epoch": 0.8134394341290893, "grad_norm": 0.4219891428947449, "learning_rate": 0.0003, "loss": 0.6307, "step": 115 }, { "epoch": 0.8205128205128205, "grad_norm": 0.44273802638053894, "learning_rate": 0.0003, "loss": 0.5475, "step": 116 }, { "epoch": 0.8275862068965517, "grad_norm": 0.42054426670074463, "learning_rate": 0.0003, "loss": 0.827, "step": 117 }, { "epoch": 0.8346595932802829, "grad_norm": 0.4792965054512024, "learning_rate": 0.0003, "loss": 0.6, "step": 118 }, { "epoch": 0.8417329796640142, "grad_norm": 0.5182773470878601, "learning_rate": 0.0003, "loss": 0.8832, "step": 119 }, { "epoch": 0.8488063660477454, "grad_norm": 0.41087284684181213, "learning_rate": 0.0003, "loss": 0.5825, "step": 120 }, { "epoch": 0.8558797524314765, "grad_norm": 0.36328765749931335, "learning_rate": 0.0003, "loss": 0.4198, "step": 121 }, { "epoch": 0.8629531388152077, "grad_norm": 0.43922775983810425, "learning_rate": 0.0003, "loss": 0.5495, "step": 122 }, { "epoch": 0.870026525198939, "grad_norm": 0.5079771876335144, "learning_rate": 0.0003, "loss": 0.6814, "step": 123 }, { "epoch": 0.8770999115826702, "grad_norm": 0.3167728781700134, "learning_rate": 0.0003, "loss": 0.5706, "step": 124 }, { "epoch": 0.8841732979664014, "grad_norm": 0.45660603046417236, "learning_rate": 0.0003, "loss": 0.7102, "step": 125 }, { "epoch": 0.8912466843501327, "grad_norm": 0.42243629693984985, "learning_rate": 0.0003, "loss": 0.5449, "step": 126 }, { "epoch": 0.8983200707338639, "grad_norm": 0.32169416546821594, "learning_rate": 0.0003, "loss": 0.3933, "step": 127 }, { "epoch": 0.905393457117595, "grad_norm": 0.32228872179985046, "learning_rate": 0.0003, "loss": 0.6444, "step": 128 }, { "epoch": 0.9124668435013262, "grad_norm": 0.47969621419906616, "learning_rate": 0.0003, "loss": 0.7959, "step": 129 }, { "epoch": 0.9195402298850575, "grad_norm": 0.35543474555015564, "learning_rate": 0.0003, "loss": 0.6535, "step": 130 }, { "epoch": 0.9266136162687887, "grad_norm": 0.4273511469364166, "learning_rate": 0.0003, "loss": 0.6058, "step": 131 }, { "epoch": 0.9336870026525199, "grad_norm": 0.3400624692440033, "learning_rate": 0.0003, "loss": 0.6066, "step": 132 }, { "epoch": 0.9407603890362511, "grad_norm": 0.3195785582065582, "learning_rate": 0.0003, "loss": 0.5878, "step": 133 }, { "epoch": 0.9478337754199824, "grad_norm": 0.34657567739486694, "learning_rate": 0.0003, "loss": 0.6462, "step": 134 }, { "epoch": 0.9549071618037135, "grad_norm": 0.4706454873085022, "learning_rate": 0.0003, "loss": 0.8299, "step": 135 }, { "epoch": 0.9619805481874447, "grad_norm": 0.41353291273117065, "learning_rate": 0.0003, "loss": 0.6372, "step": 136 }, { "epoch": 0.969053934571176, "grad_norm": 0.34282562136650085, "learning_rate": 0.0003, "loss": 0.5901, "step": 137 }, { "epoch": 0.9761273209549072, "grad_norm": 0.4154914617538452, "learning_rate": 0.0003, "loss": 0.6213, "step": 138 }, { "epoch": 0.9832007073386384, "grad_norm": 0.2933409810066223, "learning_rate": 0.0003, "loss": 0.4435, "step": 139 }, { "epoch": 0.9902740937223696, "grad_norm": 0.3763149082660675, "learning_rate": 0.0003, "loss": 0.4754, "step": 140 }, { "epoch": 0.9973474801061007, "grad_norm": 0.4369047284126282, "learning_rate": 0.0003, "loss": 0.6313, "step": 141 }, { "epoch": 1.004420866489832, "grad_norm": 0.40332600474357605, "learning_rate": 0.0003, "loss": 0.4778, "step": 142 }, { "epoch": 1.0114942528735633, "grad_norm": 0.31336432695388794, "learning_rate": 0.0003, "loss": 0.4599, "step": 143 }, { "epoch": 1.0185676392572944, "grad_norm": 0.3116231858730316, "learning_rate": 0.0003, "loss": 0.3823, "step": 144 }, { "epoch": 1.0256410256410255, "grad_norm": 0.47887638211250305, "learning_rate": 0.0003, "loss": 0.4838, "step": 145 }, { "epoch": 1.032714412024757, "grad_norm": 0.3979848325252533, "learning_rate": 0.0003, "loss": 0.3765, "step": 146 }, { "epoch": 1.039787798408488, "grad_norm": 0.3911687433719635, "learning_rate": 0.0003, "loss": 0.379, "step": 147 }, { "epoch": 1.0468611847922193, "grad_norm": 0.41035008430480957, "learning_rate": 0.0003, "loss": 0.4544, "step": 148 }, { "epoch": 1.0539345711759505, "grad_norm": 0.3448046147823334, "learning_rate": 0.0003, "loss": 0.3809, "step": 149 }, { "epoch": 1.0610079575596818, "grad_norm": 0.3258429765701294, "learning_rate": 0.0003, "loss": 0.3027, "step": 150 }, { "epoch": 1.068081343943413, "grad_norm": 0.4393693208694458, "learning_rate": 0.0003, "loss": 0.4825, "step": 151 }, { "epoch": 1.075154730327144, "grad_norm": 0.29749980568885803, "learning_rate": 0.0003, "loss": 0.2696, "step": 152 }, { "epoch": 1.0822281167108754, "grad_norm": 0.3464600741863251, "learning_rate": 0.0003, "loss": 0.2812, "step": 153 }, { "epoch": 1.0893015030946065, "grad_norm": 0.3517362177371979, "learning_rate": 0.0003, "loss": 0.4352, "step": 154 }, { "epoch": 1.0963748894783378, "grad_norm": 0.3475998640060425, "learning_rate": 0.0003, "loss": 0.3298, "step": 155 }, { "epoch": 1.103448275862069, "grad_norm": 0.41514718532562256, "learning_rate": 0.0003, "loss": 0.2779, "step": 156 }, { "epoch": 1.1105216622458003, "grad_norm": 0.38064250349998474, "learning_rate": 0.0003, "loss": 0.3552, "step": 157 }, { "epoch": 1.1175950486295314, "grad_norm": 0.48406025767326355, "learning_rate": 0.0003, "loss": 0.4691, "step": 158 }, { "epoch": 1.1246684350132625, "grad_norm": 0.3856564462184906, "learning_rate": 0.0003, "loss": 0.3817, "step": 159 }, { "epoch": 1.1317418213969939, "grad_norm": 0.40879660844802856, "learning_rate": 0.0003, "loss": 0.3555, "step": 160 }, { "epoch": 1.138815207780725, "grad_norm": 0.4073532223701477, "learning_rate": 0.0003, "loss": 0.3218, "step": 161 }, { "epoch": 1.1458885941644563, "grad_norm": 0.5433499217033386, "learning_rate": 0.0003, "loss": 0.4749, "step": 162 }, { "epoch": 1.1529619805481874, "grad_norm": 0.47047749161720276, "learning_rate": 0.0003, "loss": 0.3945, "step": 163 }, { "epoch": 1.1600353669319188, "grad_norm": 0.3000759184360504, "learning_rate": 0.0003, "loss": 0.3944, "step": 164 }, { "epoch": 1.16710875331565, "grad_norm": 0.38655105233192444, "learning_rate": 0.0003, "loss": 0.458, "step": 165 }, { "epoch": 1.174182139699381, "grad_norm": 0.3441111743450165, "learning_rate": 0.0003, "loss": 0.3388, "step": 166 }, { "epoch": 1.1812555260831124, "grad_norm": 0.5380314588546753, "learning_rate": 0.0003, "loss": 0.5506, "step": 167 }, { "epoch": 1.1883289124668435, "grad_norm": 0.2528212070465088, "learning_rate": 0.0003, "loss": 0.3144, "step": 168 }, { "epoch": 1.1954022988505748, "grad_norm": 0.3783420920372009, "learning_rate": 0.0003, "loss": 0.5596, "step": 169 }, { "epoch": 1.202475685234306, "grad_norm": 0.3812076449394226, "learning_rate": 0.0003, "loss": 0.42, "step": 170 }, { "epoch": 1.209549071618037, "grad_norm": 0.43172749876976013, "learning_rate": 0.0003, "loss": 0.4931, "step": 171 }, { "epoch": 1.2166224580017684, "grad_norm": 0.41426223516464233, "learning_rate": 0.0003, "loss": 0.2998, "step": 172 }, { "epoch": 1.2236958443854995, "grad_norm": 0.35829058289527893, "learning_rate": 0.0003, "loss": 0.4243, "step": 173 }, { "epoch": 1.2307692307692308, "grad_norm": 0.4014543294906616, "learning_rate": 0.0003, "loss": 0.3049, "step": 174 }, { "epoch": 1.237842617152962, "grad_norm": 0.3007238507270813, "learning_rate": 0.0003, "loss": 0.2005, "step": 175 }, { "epoch": 1.244916003536693, "grad_norm": 0.3595844507217407, "learning_rate": 0.0003, "loss": 0.344, "step": 176 }, { "epoch": 1.2519893899204244, "grad_norm": 0.34730204939842224, "learning_rate": 0.0003, "loss": 0.2573, "step": 177 }, { "epoch": 1.2590627763041558, "grad_norm": 0.39390042424201965, "learning_rate": 0.0003, "loss": 0.3177, "step": 178 }, { "epoch": 1.2661361626878869, "grad_norm": 0.41631364822387695, "learning_rate": 0.0003, "loss": 0.4541, "step": 179 }, { "epoch": 1.273209549071618, "grad_norm": 0.4117166996002197, "learning_rate": 0.0003, "loss": 0.4597, "step": 180 }, { "epoch": 1.2802829354553493, "grad_norm": 0.46357792615890503, "learning_rate": 0.0003, "loss": 0.3166, "step": 181 }, { "epoch": 1.2873563218390804, "grad_norm": 0.31492120027542114, "learning_rate": 0.0003, "loss": 0.2183, "step": 182 }, { "epoch": 1.2944297082228116, "grad_norm": 0.31738027930259705, "learning_rate": 0.0003, "loss": 0.3114, "step": 183 }, { "epoch": 1.301503094606543, "grad_norm": 0.37768757343292236, "learning_rate": 0.0003, "loss": 0.2977, "step": 184 }, { "epoch": 1.308576480990274, "grad_norm": 0.45224347710609436, "learning_rate": 0.0003, "loss": 0.3788, "step": 185 }, { "epoch": 1.3156498673740054, "grad_norm": 0.42707428336143494, "learning_rate": 0.0003, "loss": 0.3065, "step": 186 }, { "epoch": 1.3227232537577365, "grad_norm": 0.359110027551651, "learning_rate": 0.0003, "loss": 0.3916, "step": 187 }, { "epoch": 1.3297966401414678, "grad_norm": 0.4212663173675537, "learning_rate": 0.0003, "loss": 0.592, "step": 188 }, { "epoch": 1.336870026525199, "grad_norm": 0.4227355122566223, "learning_rate": 0.0003, "loss": 0.4278, "step": 189 }, { "epoch": 1.34394341290893, "grad_norm": 0.45795100927352905, "learning_rate": 0.0003, "loss": 0.4068, "step": 190 }, { "epoch": 1.3510167992926614, "grad_norm": 0.47883355617523193, "learning_rate": 0.0003, "loss": 0.5285, "step": 191 }, { "epoch": 1.3580901856763925, "grad_norm": 0.36151745915412903, "learning_rate": 0.0003, "loss": 0.365, "step": 192 }, { "epoch": 1.3651635720601238, "grad_norm": 0.38841187953948975, "learning_rate": 0.0003, "loss": 0.4783, "step": 193 }, { "epoch": 1.372236958443855, "grad_norm": 0.3572918772697449, "learning_rate": 0.0003, "loss": 0.4407, "step": 194 }, { "epoch": 1.3793103448275863, "grad_norm": 0.36447620391845703, "learning_rate": 0.0003, "loss": 0.3111, "step": 195 }, { "epoch": 1.3863837312113174, "grad_norm": 0.31043165922164917, "learning_rate": 0.0003, "loss": 0.3809, "step": 196 }, { "epoch": 1.3934571175950485, "grad_norm": 0.4331524670124054, "learning_rate": 0.0003, "loss": 0.3464, "step": 197 }, { "epoch": 1.4005305039787799, "grad_norm": 0.5187276005744934, "learning_rate": 0.0003, "loss": 0.4041, "step": 198 }, { "epoch": 1.407603890362511, "grad_norm": 0.3016161322593689, "learning_rate": 0.0003, "loss": 0.1315, "step": 199 }, { "epoch": 1.4146772767462423, "grad_norm": 0.3778589069843292, "learning_rate": 0.0003, "loss": 0.2563, "step": 200 }, { "epoch": 1.4217506631299734, "grad_norm": 0.4542739987373352, "learning_rate": 0.0003, "loss": 0.3676, "step": 201 }, { "epoch": 1.4288240495137048, "grad_norm": 0.37201106548309326, "learning_rate": 0.0003, "loss": 0.4023, "step": 202 }, { "epoch": 1.435897435897436, "grad_norm": 0.3098253607749939, "learning_rate": 0.0003, "loss": 0.2013, "step": 203 }, { "epoch": 1.442970822281167, "grad_norm": 0.41762611269950867, "learning_rate": 0.0003, "loss": 0.2562, "step": 204 }, { "epoch": 1.4500442086648984, "grad_norm": 0.3805309534072876, "learning_rate": 0.0003, "loss": 0.2091, "step": 205 }, { "epoch": 1.4571175950486295, "grad_norm": 0.30562469363212585, "learning_rate": 0.0003, "loss": 0.3204, "step": 206 }, { "epoch": 1.4641909814323608, "grad_norm": 0.40833625197410583, "learning_rate": 0.0003, "loss": 0.3828, "step": 207 }, { "epoch": 1.471264367816092, "grad_norm": 0.44443726539611816, "learning_rate": 0.0003, "loss": 0.3023, "step": 208 }, { "epoch": 1.4783377541998233, "grad_norm": 0.3216983675956726, "learning_rate": 0.0003, "loss": 0.148, "step": 209 }, { "epoch": 1.4854111405835544, "grad_norm": 0.49379777908325195, "learning_rate": 0.0003, "loss": 0.3597, "step": 210 }, { "epoch": 1.4924845269672855, "grad_norm": 0.41881895065307617, "learning_rate": 0.0003, "loss": 0.3724, "step": 211 }, { "epoch": 1.4995579133510168, "grad_norm": 0.37855106592178345, "learning_rate": 0.0003, "loss": 0.2177, "step": 212 }, { "epoch": 1.506631299734748, "grad_norm": 0.4481782615184784, "learning_rate": 0.0003, "loss": 0.4668, "step": 213 }, { "epoch": 1.513704686118479, "grad_norm": 0.45132726430892944, "learning_rate": 0.0003, "loss": 0.5844, "step": 214 }, { "epoch": 1.5207780725022104, "grad_norm": 0.4039032459259033, "learning_rate": 0.0003, "loss": 0.411, "step": 215 }, { "epoch": 1.5278514588859418, "grad_norm": 0.3423170745372772, "learning_rate": 0.0003, "loss": 0.3069, "step": 216 }, { "epoch": 1.5349248452696729, "grad_norm": 0.3927661180496216, "learning_rate": 0.0003, "loss": 0.5008, "step": 217 }, { "epoch": 1.541998231653404, "grad_norm": 0.43571972846984863, "learning_rate": 0.0003, "loss": 0.4626, "step": 218 }, { "epoch": 1.5490716180371353, "grad_norm": 0.370449423789978, "learning_rate": 0.0003, "loss": 0.2882, "step": 219 }, { "epoch": 1.5561450044208665, "grad_norm": 0.3305343687534332, "learning_rate": 0.0003, "loss": 0.2781, "step": 220 }, { "epoch": 1.5632183908045976, "grad_norm": 0.40083616971969604, "learning_rate": 0.0003, "loss": 0.2652, "step": 221 }, { "epoch": 1.570291777188329, "grad_norm": 0.38695937395095825, "learning_rate": 0.0003, "loss": 0.4565, "step": 222 }, { "epoch": 1.5773651635720602, "grad_norm": 0.5376386046409607, "learning_rate": 0.0003, "loss": 0.4184, "step": 223 }, { "epoch": 1.5844385499557914, "grad_norm": 0.5290461182594299, "learning_rate": 0.0003, "loss": 0.3836, "step": 224 }, { "epoch": 1.5915119363395225, "grad_norm": 0.39294925332069397, "learning_rate": 0.0003, "loss": 0.446, "step": 225 }, { "epoch": 1.5985853227232538, "grad_norm": 0.3946995139122009, "learning_rate": 0.0003, "loss": 0.3433, "step": 226 }, { "epoch": 1.605658709106985, "grad_norm": 0.3850666880607605, "learning_rate": 0.0003, "loss": 0.515, "step": 227 }, { "epoch": 1.612732095490716, "grad_norm": 0.3812507688999176, "learning_rate": 0.0003, "loss": 0.4666, "step": 228 }, { "epoch": 1.6198054818744474, "grad_norm": 0.34343773126602173, "learning_rate": 0.0003, "loss": 0.3437, "step": 229 }, { "epoch": 1.6268788682581787, "grad_norm": 0.42423132061958313, "learning_rate": 0.0003, "loss": 0.2998, "step": 230 }, { "epoch": 1.6339522546419099, "grad_norm": 0.36676838994026184, "learning_rate": 0.0003, "loss": 0.381, "step": 231 }, { "epoch": 1.641025641025641, "grad_norm": 0.45891061425209045, "learning_rate": 0.0003, "loss": 0.4426, "step": 232 }, { "epoch": 1.6480990274093723, "grad_norm": 0.4290439188480377, "learning_rate": 0.0003, "loss": 0.3475, "step": 233 }, { "epoch": 1.6551724137931034, "grad_norm": 0.3556974232196808, "learning_rate": 0.0003, "loss": 0.328, "step": 234 }, { "epoch": 1.6622458001768345, "grad_norm": 0.30578428506851196, "learning_rate": 0.0003, "loss": 0.2591, "step": 235 }, { "epoch": 1.6693191865605659, "grad_norm": 0.3522488474845886, "learning_rate": 0.0003, "loss": 0.416, "step": 236 }, { "epoch": 1.6763925729442972, "grad_norm": 0.3940620720386505, "learning_rate": 0.0003, "loss": 0.548, "step": 237 }, { "epoch": 1.6834659593280283, "grad_norm": 0.4076889455318451, "learning_rate": 0.0003, "loss": 0.5044, "step": 238 }, { "epoch": 1.6905393457117595, "grad_norm": 0.49337613582611084, "learning_rate": 0.0003, "loss": 0.4355, "step": 239 }, { "epoch": 1.6976127320954908, "grad_norm": 0.37077927589416504, "learning_rate": 0.0003, "loss": 0.4739, "step": 240 }, { "epoch": 1.704686118479222, "grad_norm": 0.4110550880432129, "learning_rate": 0.0003, "loss": 0.428, "step": 241 }, { "epoch": 1.711759504862953, "grad_norm": 0.49631252884864807, "learning_rate": 0.0003, "loss": 0.4227, "step": 242 }, { "epoch": 1.7188328912466844, "grad_norm": 0.3230995535850525, "learning_rate": 0.0003, "loss": 0.3451, "step": 243 }, { "epoch": 1.7259062776304157, "grad_norm": 0.36575183272361755, "learning_rate": 0.0003, "loss": 0.2817, "step": 244 }, { "epoch": 1.7329796640141468, "grad_norm": 0.4187852740287781, "learning_rate": 0.0003, "loss": 0.319, "step": 245 }, { "epoch": 1.740053050397878, "grad_norm": 0.3224227726459503, "learning_rate": 0.0003, "loss": 0.3406, "step": 246 }, { "epoch": 1.7471264367816093, "grad_norm": 0.379561185836792, "learning_rate": 0.0003, "loss": 0.3817, "step": 247 }, { "epoch": 1.7541998231653404, "grad_norm": 0.44703027606010437, "learning_rate": 0.0003, "loss": 0.3879, "step": 248 }, { "epoch": 1.7612732095490715, "grad_norm": 0.34053027629852295, "learning_rate": 0.0003, "loss": 0.2767, "step": 249 }, { "epoch": 1.7683465959328029, "grad_norm": 0.48519593477249146, "learning_rate": 0.0003, "loss": 0.5043, "step": 250 }, { "epoch": 1.7754199823165342, "grad_norm": 0.3466756045818329, "learning_rate": 0.0003, "loss": 0.2593, "step": 251 }, { "epoch": 1.782493368700265, "grad_norm": 0.5155137777328491, "learning_rate": 0.0003, "loss": 0.3529, "step": 252 }, { "epoch": 1.7895667550839964, "grad_norm": 0.4184979796409607, "learning_rate": 0.0003, "loss": 0.535, "step": 253 }, { "epoch": 1.7966401414677278, "grad_norm": 0.3188352882862091, "learning_rate": 0.0003, "loss": 0.2358, "step": 254 }, { "epoch": 1.8037135278514589, "grad_norm": 0.42813432216644287, "learning_rate": 0.0003, "loss": 0.374, "step": 255 }, { "epoch": 1.81078691423519, "grad_norm": 0.40070992708206177, "learning_rate": 0.0003, "loss": 0.4326, "step": 256 }, { "epoch": 1.8178603006189213, "grad_norm": 0.45408982038497925, "learning_rate": 0.0003, "loss": 0.4945, "step": 257 }, { "epoch": 1.8249336870026527, "grad_norm": 0.42870137095451355, "learning_rate": 0.0003, "loss": 0.4528, "step": 258 }, { "epoch": 1.8320070733863836, "grad_norm": 0.3272749185562134, "learning_rate": 0.0003, "loss": 0.2587, "step": 259 }, { "epoch": 1.839080459770115, "grad_norm": 0.4601209759712219, "learning_rate": 0.0003, "loss": 0.5043, "step": 260 }, { "epoch": 1.8461538461538463, "grad_norm": 0.48971623182296753, "learning_rate": 0.0003, "loss": 0.4837, "step": 261 }, { "epoch": 1.8532272325375774, "grad_norm": 0.37702813744544983, "learning_rate": 0.0003, "loss": 0.421, "step": 262 }, { "epoch": 1.8603006189213085, "grad_norm": 0.37648722529411316, "learning_rate": 0.0003, "loss": 0.2666, "step": 263 }, { "epoch": 1.8673740053050398, "grad_norm": 0.5787553787231445, "learning_rate": 0.0003, "loss": 0.2987, "step": 264 }, { "epoch": 1.874447391688771, "grad_norm": 0.4249975085258484, "learning_rate": 0.0003, "loss": 0.5577, "step": 265 }, { "epoch": 1.881520778072502, "grad_norm": 0.3846690356731415, "learning_rate": 0.0003, "loss": 0.3106, "step": 266 }, { "epoch": 1.8885941644562334, "grad_norm": 0.37595272064208984, "learning_rate": 0.0003, "loss": 0.3638, "step": 267 }, { "epoch": 1.8956675508399647, "grad_norm": 0.4609120190143585, "learning_rate": 0.0003, "loss": 0.4356, "step": 268 }, { "epoch": 1.9027409372236959, "grad_norm": 0.3405689299106598, "learning_rate": 0.0003, "loss": 0.3113, "step": 269 }, { "epoch": 1.909814323607427, "grad_norm": 0.30769774317741394, "learning_rate": 0.0003, "loss": 0.2626, "step": 270 }, { "epoch": 1.9168877099911583, "grad_norm": 0.36806437373161316, "learning_rate": 0.0003, "loss": 0.401, "step": 271 }, { "epoch": 1.9239610963748894, "grad_norm": 0.45491501688957214, "learning_rate": 0.0003, "loss": 0.4295, "step": 272 }, { "epoch": 1.9310344827586206, "grad_norm": 0.3272283971309662, "learning_rate": 0.0003, "loss": 0.3143, "step": 273 }, { "epoch": 1.938107869142352, "grad_norm": 0.32763826847076416, "learning_rate": 0.0003, "loss": 0.246, "step": 274 }, { "epoch": 1.9451812555260832, "grad_norm": 0.43065381050109863, "learning_rate": 0.0003, "loss": 0.3338, "step": 275 }, { "epoch": 1.9522546419098143, "grad_norm": 0.43713968992233276, "learning_rate": 0.0003, "loss": 0.3136, "step": 276 }, { "epoch": 1.9593280282935455, "grad_norm": 0.2735891342163086, "learning_rate": 0.0003, "loss": 0.2381, "step": 277 }, { "epoch": 1.9664014146772768, "grad_norm": 0.3156580626964569, "learning_rate": 0.0003, "loss": 0.3336, "step": 278 }, { "epoch": 1.973474801061008, "grad_norm": 0.4958134591579437, "learning_rate": 0.0003, "loss": 0.5279, "step": 279 }, { "epoch": 1.980548187444739, "grad_norm": 0.41325512528419495, "learning_rate": 0.0003, "loss": 0.3997, "step": 280 }, { "epoch": 1.9876215738284704, "grad_norm": 0.29986992478370667, "learning_rate": 0.0003, "loss": 0.2996, "step": 281 }, { "epoch": 1.9946949602122017, "grad_norm": 0.3219819962978363, "learning_rate": 0.0003, "loss": 0.2875, "step": 282 }, { "epoch": 1.9946949602122017, "step": 282, "total_flos": 1.061363392708608e+16, "train_loss": 0.5953954255327265, "train_runtime": 9564.3104, "train_samples_per_second": 0.473, "train_steps_per_second": 0.029 } ], "logging_steps": 1.0, "max_steps": 282, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "total_flos": 1.061363392708608e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }