{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.0, "eval_steps": 500, "global_step": 146, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0547945205479452, "grad_norm": 1.6586511135101318, "learning_rate": 1e-06, "loss": 2.2379, "step": 1 }, { "epoch": 0.1095890410958904, "grad_norm": 1.6110830307006836, "learning_rate": 1e-06, "loss": 2.2933, "step": 2 }, { "epoch": 0.1643835616438356, "grad_norm": 1.5261093378067017, "learning_rate": 1e-06, "loss": 2.2564, "step": 3 }, { "epoch": 0.2191780821917808, "grad_norm": 1.6366506814956665, "learning_rate": 1e-06, "loss": 2.2794, "step": 4 }, { "epoch": 0.273972602739726, "grad_norm": 1.5530800819396973, "learning_rate": 1e-06, "loss": 2.2344, "step": 5 }, { "epoch": 0.3287671232876712, "grad_norm": 1.5802958011627197, "learning_rate": 1e-06, "loss": 2.2363, "step": 6 }, { "epoch": 0.3835616438356164, "grad_norm": 1.5483659505844116, "learning_rate": 1e-06, "loss": 2.256, "step": 7 }, { "epoch": 0.4383561643835616, "grad_norm": 1.5273737907409668, "learning_rate": 1e-06, "loss": 2.2733, "step": 8 }, { "epoch": 0.4931506849315068, "grad_norm": 1.534605860710144, "learning_rate": 1e-06, "loss": 2.223, "step": 9 }, { "epoch": 0.547945205479452, "grad_norm": 1.5523834228515625, "learning_rate": 1e-06, "loss": 2.2755, "step": 10 }, { "epoch": 0.6027397260273972, "grad_norm": 1.5962920188903809, "learning_rate": 1e-06, "loss": 2.2875, "step": 11 }, { "epoch": 0.6575342465753424, "grad_norm": 1.5564601421356201, "learning_rate": 1e-06, "loss": 2.2716, "step": 12 }, { "epoch": 0.7123287671232876, "grad_norm": 1.5305095911026, "learning_rate": 1e-06, "loss": 2.2485, "step": 13 }, { "epoch": 0.7671232876712328, "grad_norm": 1.4675662517547607, "learning_rate": 1e-06, "loss": 2.2574, "step": 14 }, { "epoch": 0.821917808219178, "grad_norm": 1.4668537378311157, "learning_rate": 1e-06, "loss": 2.2226, "step": 15 }, { "epoch": 0.8767123287671232, "grad_norm": 1.5306854248046875, "learning_rate": 1e-06, "loss": 2.2798, "step": 16 }, { "epoch": 0.9315068493150684, "grad_norm": 1.5047531127929688, "learning_rate": 1e-06, "loss": 2.2486, "step": 17 }, { "epoch": 0.9863013698630136, "grad_norm": 1.4622173309326172, "learning_rate": 1e-06, "loss": 2.217, "step": 18 }, { "epoch": 1.0410958904109588, "grad_norm": 1.5452288389205933, "learning_rate": 1e-06, "loss": 2.2271, "step": 19 }, { "epoch": 1.095890410958904, "grad_norm": 1.4995627403259277, "learning_rate": 1e-06, "loss": 2.2222, "step": 20 }, { "epoch": 1.1506849315068493, "grad_norm": 1.4030557870864868, "learning_rate": 1e-06, "loss": 2.2547, "step": 21 }, { "epoch": 1.2054794520547945, "grad_norm": 1.4066240787506104, "learning_rate": 1e-06, "loss": 2.2279, "step": 22 }, { "epoch": 1.2602739726027397, "grad_norm": 1.4491875171661377, "learning_rate": 1e-06, "loss": 2.2497, "step": 23 }, { "epoch": 1.3150684931506849, "grad_norm": 1.3880819082260132, "learning_rate": 1e-06, "loss": 2.2593, "step": 24 }, { "epoch": 1.36986301369863, "grad_norm": 1.471488118171692, "learning_rate": 1e-06, "loss": 2.2496, "step": 25 }, { "epoch": 1.4246575342465753, "grad_norm": 1.388680338859558, "learning_rate": 1e-06, "loss": 2.2262, "step": 26 }, { "epoch": 1.4794520547945205, "grad_norm": 1.4523004293441772, "learning_rate": 1e-06, "loss": 2.2535, "step": 27 }, { "epoch": 1.5342465753424657, "grad_norm": 1.4338841438293457, "learning_rate": 1e-06, "loss": 2.2315, "step": 28 }, { "epoch": 1.589041095890411, "grad_norm": 1.3985637426376343, "learning_rate": 1e-06, "loss": 2.262, "step": 29 }, { "epoch": 1.643835616438356, "grad_norm": 1.3776822090148926, "learning_rate": 1e-06, "loss": 2.224, "step": 30 }, { "epoch": 1.6986301369863015, "grad_norm": 1.3197417259216309, "learning_rate": 1e-06, "loss": 2.2009, "step": 31 }, { "epoch": 1.7534246575342465, "grad_norm": 1.4159483909606934, "learning_rate": 1e-06, "loss": 2.2131, "step": 32 }, { "epoch": 1.808219178082192, "grad_norm": 1.3864014148712158, "learning_rate": 1e-06, "loss": 2.2498, "step": 33 }, { "epoch": 1.8630136986301369, "grad_norm": 1.3488203287124634, "learning_rate": 1e-06, "loss": 2.2147, "step": 34 }, { "epoch": 1.9178082191780823, "grad_norm": 1.345689296722412, "learning_rate": 1e-06, "loss": 2.2383, "step": 35 }, { "epoch": 1.9726027397260273, "grad_norm": 1.344303011894226, "learning_rate": 1e-06, "loss": 2.2159, "step": 36 }, { "epoch": 2.0273972602739727, "grad_norm": 1.3895442485809326, "learning_rate": 1e-06, "loss": 2.2265, "step": 37 }, { "epoch": 2.0821917808219177, "grad_norm": 1.3593428134918213, "learning_rate": 1e-06, "loss": 2.2063, "step": 38 }, { "epoch": 2.136986301369863, "grad_norm": 1.3060978651046753, "learning_rate": 1e-06, "loss": 2.2572, "step": 39 }, { "epoch": 2.191780821917808, "grad_norm": 1.3199517726898193, "learning_rate": 1e-06, "loss": 2.2099, "step": 40 }, { "epoch": 2.2465753424657535, "grad_norm": 1.3381460905075073, "learning_rate": 1e-06, "loss": 2.2693, "step": 41 }, { "epoch": 2.3013698630136985, "grad_norm": 1.334553599357605, "learning_rate": 1e-06, "loss": 2.2206, "step": 42 }, { "epoch": 2.356164383561644, "grad_norm": 1.3222883939743042, "learning_rate": 1e-06, "loss": 2.1851, "step": 43 }, { "epoch": 2.410958904109589, "grad_norm": 1.3213746547698975, "learning_rate": 1e-06, "loss": 2.2542, "step": 44 }, { "epoch": 2.4657534246575343, "grad_norm": 1.3214170932769775, "learning_rate": 1e-06, "loss": 2.2319, "step": 45 }, { "epoch": 2.5205479452054793, "grad_norm": 1.345453143119812, "learning_rate": 1e-06, "loss": 2.222, "step": 46 }, { "epoch": 2.5753424657534247, "grad_norm": 1.2182488441467285, "learning_rate": 1e-06, "loss": 2.2069, "step": 47 }, { "epoch": 2.6301369863013697, "grad_norm": 1.2841640710830688, "learning_rate": 1e-06, "loss": 2.2181, "step": 48 }, { "epoch": 2.684931506849315, "grad_norm": 1.270230770111084, "learning_rate": 1e-06, "loss": 2.2097, "step": 49 }, { "epoch": 2.73972602739726, "grad_norm": 1.213972806930542, "learning_rate": 1e-06, "loss": 2.218, "step": 50 }, { "epoch": 2.7945205479452055, "grad_norm": 1.2877941131591797, "learning_rate": 1e-06, "loss": 2.2055, "step": 51 }, { "epoch": 2.8493150684931505, "grad_norm": 1.273301601409912, "learning_rate": 1e-06, "loss": 2.1895, "step": 52 }, { "epoch": 2.904109589041096, "grad_norm": 1.2318782806396484, "learning_rate": 1e-06, "loss": 2.2255, "step": 53 }, { "epoch": 2.958904109589041, "grad_norm": 1.1937693357467651, "learning_rate": 1e-06, "loss": 2.1865, "step": 54 }, { "epoch": 3.0136986301369864, "grad_norm": 1.1707606315612793, "learning_rate": 1e-06, "loss": 2.2179, "step": 55 }, { "epoch": 3.0684931506849313, "grad_norm": 1.2074235677719116, "learning_rate": 1e-06, "loss": 2.155, "step": 56 }, { "epoch": 3.1232876712328768, "grad_norm": 1.1725316047668457, "learning_rate": 1e-06, "loss": 2.2011, "step": 57 }, { "epoch": 3.1780821917808217, "grad_norm": 1.1967130899429321, "learning_rate": 1e-06, "loss": 2.2155, "step": 58 }, { "epoch": 3.232876712328767, "grad_norm": 1.1932190656661987, "learning_rate": 1e-06, "loss": 2.1858, "step": 59 }, { "epoch": 3.287671232876712, "grad_norm": 1.19328773021698, "learning_rate": 1e-06, "loss": 2.2351, "step": 60 }, { "epoch": 3.3424657534246576, "grad_norm": 1.1168928146362305, "learning_rate": 1e-06, "loss": 2.2022, "step": 61 }, { "epoch": 3.3972602739726026, "grad_norm": 1.2043449878692627, "learning_rate": 1e-06, "loss": 2.1964, "step": 62 }, { "epoch": 3.452054794520548, "grad_norm": 1.2224105596542358, "learning_rate": 1e-06, "loss": 2.1919, "step": 63 }, { "epoch": 3.506849315068493, "grad_norm": 1.2362271547317505, "learning_rate": 1e-06, "loss": 2.199, "step": 64 }, { "epoch": 3.5616438356164384, "grad_norm": 1.2123560905456543, "learning_rate": 1e-06, "loss": 2.2357, "step": 65 }, { "epoch": 3.616438356164384, "grad_norm": 1.1854863166809082, "learning_rate": 1e-06, "loss": 2.1878, "step": 66 }, { "epoch": 3.671232876712329, "grad_norm": 1.1320362091064453, "learning_rate": 1e-06, "loss": 2.1872, "step": 67 }, { "epoch": 3.7260273972602738, "grad_norm": 1.1633937358856201, "learning_rate": 1e-06, "loss": 2.205, "step": 68 }, { "epoch": 3.780821917808219, "grad_norm": 1.1435497999191284, "learning_rate": 1e-06, "loss": 2.1972, "step": 69 }, { "epoch": 3.8356164383561646, "grad_norm": 1.1820743083953857, "learning_rate": 1e-06, "loss": 2.1961, "step": 70 }, { "epoch": 3.8904109589041096, "grad_norm": 1.203647255897522, "learning_rate": 1e-06, "loss": 2.2149, "step": 71 }, { "epoch": 3.9452054794520546, "grad_norm": 1.1167892217636108, "learning_rate": 1e-06, "loss": 2.197, "step": 72 }, { "epoch": 4.0, "grad_norm": 1.0951488018035889, "learning_rate": 1e-06, "loss": 2.1898, "step": 73 }, { "epoch": 4.054794520547945, "grad_norm": 1.1908702850341797, "learning_rate": 1e-06, "loss": 2.1973, "step": 74 }, { "epoch": 4.109589041095891, "grad_norm": 1.0710009336471558, "learning_rate": 1e-06, "loss": 2.2014, "step": 75 }, { "epoch": 4.164383561643835, "grad_norm": 1.1268314123153687, "learning_rate": 1e-06, "loss": 2.2125, "step": 76 }, { "epoch": 4.219178082191781, "grad_norm": 1.0808967351913452, "learning_rate": 1e-06, "loss": 2.2184, "step": 77 }, { "epoch": 4.273972602739726, "grad_norm": 1.0744292736053467, "learning_rate": 1e-06, "loss": 2.162, "step": 78 }, { "epoch": 4.328767123287671, "grad_norm": 1.0902713537216187, "learning_rate": 1e-06, "loss": 2.2045, "step": 79 }, { "epoch": 4.383561643835616, "grad_norm": 1.1404340267181396, "learning_rate": 1e-06, "loss": 2.1919, "step": 80 }, { "epoch": 4.438356164383562, "grad_norm": 1.0819721221923828, "learning_rate": 1e-06, "loss": 2.1848, "step": 81 }, { "epoch": 4.493150684931507, "grad_norm": 1.0939464569091797, "learning_rate": 1e-06, "loss": 2.197, "step": 82 }, { "epoch": 4.5479452054794525, "grad_norm": 1.1371257305145264, "learning_rate": 1e-06, "loss": 2.1802, "step": 83 }, { "epoch": 4.602739726027397, "grad_norm": 1.0913671255111694, "learning_rate": 1e-06, "loss": 2.182, "step": 84 }, { "epoch": 4.657534246575342, "grad_norm": 1.0597493648529053, "learning_rate": 1e-06, "loss": 2.1663, "step": 85 }, { "epoch": 4.712328767123288, "grad_norm": 1.040493130683899, "learning_rate": 1e-06, "loss": 2.1774, "step": 86 }, { "epoch": 4.767123287671232, "grad_norm": 1.0556532144546509, "learning_rate": 1e-06, "loss": 2.2029, "step": 87 }, { "epoch": 4.821917808219178, "grad_norm": 1.0801831483840942, "learning_rate": 1e-06, "loss": 2.1648, "step": 88 }, { "epoch": 4.876712328767123, "grad_norm": 1.073749303817749, "learning_rate": 1e-06, "loss": 2.174, "step": 89 }, { "epoch": 4.931506849315069, "grad_norm": 1.0210574865341187, "learning_rate": 1e-06, "loss": 2.1474, "step": 90 }, { "epoch": 4.986301369863014, "grad_norm": 1.0152342319488525, "learning_rate": 1e-06, "loss": 2.1629, "step": 91 }, { "epoch": 5.041095890410959, "grad_norm": 1.0388507843017578, "learning_rate": 1e-06, "loss": 2.1931, "step": 92 }, { "epoch": 5.095890410958904, "grad_norm": 1.011426329612732, "learning_rate": 1e-06, "loss": 2.204, "step": 93 }, { "epoch": 5.1506849315068495, "grad_norm": 1.0486528873443604, "learning_rate": 1e-06, "loss": 2.1908, "step": 94 }, { "epoch": 5.205479452054795, "grad_norm": 0.9501799941062927, "learning_rate": 1e-06, "loss": 2.1823, "step": 95 }, { "epoch": 5.260273972602739, "grad_norm": 1.0336531400680542, "learning_rate": 1e-06, "loss": 2.1965, "step": 96 }, { "epoch": 5.315068493150685, "grad_norm": 1.0227267742156982, "learning_rate": 1e-06, "loss": 2.1896, "step": 97 }, { "epoch": 5.36986301369863, "grad_norm": 1.0686023235321045, "learning_rate": 1e-06, "loss": 2.1496, "step": 98 }, { "epoch": 5.424657534246576, "grad_norm": 0.9931809902191162, "learning_rate": 1e-06, "loss": 2.1474, "step": 99 }, { "epoch": 5.47945205479452, "grad_norm": 0.9578049778938293, "learning_rate": 1e-06, "loss": 2.1488, "step": 100 }, { "epoch": 5.534246575342466, "grad_norm": 0.9815987944602966, "learning_rate": 1e-06, "loss": 2.1755, "step": 101 }, { "epoch": 5.589041095890411, "grad_norm": 0.9837309718132019, "learning_rate": 1e-06, "loss": 2.1559, "step": 102 }, { "epoch": 5.6438356164383565, "grad_norm": 0.9334861040115356, "learning_rate": 1e-06, "loss": 2.1773, "step": 103 }, { "epoch": 5.698630136986301, "grad_norm": 1.0627118349075317, "learning_rate": 1e-06, "loss": 2.2116, "step": 104 }, { "epoch": 5.7534246575342465, "grad_norm": 0.9978325963020325, "learning_rate": 1e-06, "loss": 2.1413, "step": 105 }, { "epoch": 5.808219178082192, "grad_norm": 0.9550198912620544, "learning_rate": 1e-06, "loss": 2.1535, "step": 106 }, { "epoch": 5.863013698630137, "grad_norm": 0.9339421987533569, "learning_rate": 1e-06, "loss": 2.1504, "step": 107 }, { "epoch": 5.917808219178082, "grad_norm": 0.9043423533439636, "learning_rate": 1e-06, "loss": 2.1469, "step": 108 }, { "epoch": 5.972602739726027, "grad_norm": 0.921292781829834, "learning_rate": 1e-06, "loss": 2.1337, "step": 109 }, { "epoch": 6.027397260273973, "grad_norm": 0.9245712757110596, "learning_rate": 1e-06, "loss": 2.1762, "step": 110 }, { "epoch": 6.082191780821918, "grad_norm": 0.9610967636108398, "learning_rate": 1e-06, "loss": 2.1618, "step": 111 }, { "epoch": 6.136986301369863, "grad_norm": 0.9136860370635986, "learning_rate": 1e-06, "loss": 2.1505, "step": 112 }, { "epoch": 6.191780821917808, "grad_norm": 0.9340102672576904, "learning_rate": 1e-06, "loss": 2.1692, "step": 113 }, { "epoch": 6.2465753424657535, "grad_norm": 0.8885300159454346, "learning_rate": 1e-06, "loss": 2.1494, "step": 114 }, { "epoch": 6.301369863013699, "grad_norm": 0.917847216129303, "learning_rate": 1e-06, "loss": 2.1503, "step": 115 }, { "epoch": 6.3561643835616435, "grad_norm": 0.9519619345664978, "learning_rate": 1e-06, "loss": 2.1766, "step": 116 }, { "epoch": 6.410958904109589, "grad_norm": 0.8926482200622559, "learning_rate": 1e-06, "loss": 2.1493, "step": 117 }, { "epoch": 6.465753424657534, "grad_norm": 0.817862868309021, "learning_rate": 1e-06, "loss": 2.166, "step": 118 }, { "epoch": 6.52054794520548, "grad_norm": 0.8948012590408325, "learning_rate": 1e-06, "loss": 2.1346, "step": 119 }, { "epoch": 6.575342465753424, "grad_norm": 0.9632709622383118, "learning_rate": 1e-06, "loss": 2.1427, "step": 120 }, { "epoch": 6.63013698630137, "grad_norm": 0.9267117381095886, "learning_rate": 1e-06, "loss": 2.1581, "step": 121 }, { "epoch": 6.684931506849315, "grad_norm": 0.9063679575920105, "learning_rate": 1e-06, "loss": 2.1453, "step": 122 }, { "epoch": 6.739726027397261, "grad_norm": 0.9395270347595215, "learning_rate": 1e-06, "loss": 2.1515, "step": 123 }, { "epoch": 6.794520547945205, "grad_norm": 0.9410396218299866, "learning_rate": 1e-06, "loss": 2.1518, "step": 124 }, { "epoch": 6.8493150684931505, "grad_norm": 0.9229517579078674, "learning_rate": 1e-06, "loss": 2.1703, "step": 125 }, { "epoch": 6.904109589041096, "grad_norm": 0.8469845652580261, "learning_rate": 1e-06, "loss": 2.1491, "step": 126 }, { "epoch": 6.958904109589041, "grad_norm": 0.9080257415771484, "learning_rate": 1e-06, "loss": 2.1472, "step": 127 }, { "epoch": 7.013698630136986, "grad_norm": 0.9071102142333984, "learning_rate": 1e-06, "loss": 2.1685, "step": 128 }, { "epoch": 7.068493150684931, "grad_norm": 0.8933852910995483, "learning_rate": 1e-06, "loss": 2.1617, "step": 129 }, { "epoch": 7.123287671232877, "grad_norm": 0.9227753281593323, "learning_rate": 1e-06, "loss": 2.1617, "step": 130 }, { "epoch": 7.178082191780822, "grad_norm": 0.8686262965202332, "learning_rate": 1e-06, "loss": 2.1546, "step": 131 }, { "epoch": 7.232876712328767, "grad_norm": 0.8385916948318481, "learning_rate": 1e-06, "loss": 2.1442, "step": 132 }, { "epoch": 7.287671232876712, "grad_norm": 0.8217021822929382, "learning_rate": 1e-06, "loss": 2.1606, "step": 133 }, { "epoch": 7.342465753424658, "grad_norm": 0.862777590751648, "learning_rate": 1e-06, "loss": 2.153, "step": 134 }, { "epoch": 7.397260273972603, "grad_norm": 0.8956757187843323, "learning_rate": 1e-06, "loss": 2.1807, "step": 135 }, { "epoch": 7.4520547945205475, "grad_norm": 0.781984806060791, "learning_rate": 1e-06, "loss": 2.1469, "step": 136 }, { "epoch": 7.506849315068493, "grad_norm": 0.8100602030754089, "learning_rate": 1e-06, "loss": 2.107, "step": 137 }, { "epoch": 7.561643835616438, "grad_norm": 0.8204404711723328, "learning_rate": 1e-06, "loss": 2.1477, "step": 138 }, { "epoch": 7.616438356164384, "grad_norm": 0.8198928236961365, "learning_rate": 1e-06, "loss": 2.1514, "step": 139 }, { "epoch": 7.671232876712329, "grad_norm": 0.8388807773590088, "learning_rate": 1e-06, "loss": 2.1265, "step": 140 }, { "epoch": 7.726027397260274, "grad_norm": 0.8662092089653015, "learning_rate": 1e-06, "loss": 2.1316, "step": 141 }, { "epoch": 7.780821917808219, "grad_norm": 0.7682031393051147, "learning_rate": 1e-06, "loss": 2.1164, "step": 142 }, { "epoch": 7.835616438356165, "grad_norm": 0.796292781829834, "learning_rate": 1e-06, "loss": 2.1342, "step": 143 }, { "epoch": 7.890410958904109, "grad_norm": 0.8075994253158569, "learning_rate": 1e-06, "loss": 2.1221, "step": 144 }, { "epoch": 7.945205479452055, "grad_norm": 0.8507598638534546, "learning_rate": 1e-06, "loss": 2.1513, "step": 145 }, { "epoch": 8.0, "grad_norm": 0.768495500087738, "learning_rate": 1e-06, "loss": 2.1369, "step": 146 } ], "logging_steps": 1, "max_steps": 540, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "total_flos": 1.0225056854153626e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }