diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,40217 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 156.25, + "eval_steps": 250, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0625, + "grad_norm": 29172.793748496217, + "learning_rate": 0.0, + "loss": 233.2141, + "num_input_tokens_seen": 71616, + "step": 1 + }, + { + "epoch": 0.0625, + "eval_synth_IoU": 0.0, + "eval_synth_MAE_x": 14.7373046875, + "eval_synth_MAE_y": 43.91310119628906, + "eval_synth_NUM_probability": 8.316255950546747e-08, + "eval_synth_inside_bbox": 0.0, + "eval_synth_loss": 9172.6328125, + "eval_synth_loss_ce": 4.6333194971084595, + "eval_synth_loss_xval": 9168.0, + "eval_synth_runtime": 62.867, + "eval_synth_samples_per_second": 2.036, + "eval_synth_steps_per_second": 0.064, + "num_input_tokens_seen": 71616, + "step": 1 + }, + { + "epoch": 0.0625, + "loss": 8836.6484375, + "loss_ce": 4.648736000061035, + "loss_xval": 8832.0, + "num_input_tokens_seen": 71616, + "step": 1 + }, + { + "epoch": 0.125, + "grad_norm": 399652.4844158417, + "learning_rate": 6.27684584633728e-06, + "loss": 9956.6553, + "num_input_tokens_seen": 143360, + "step": 2 + }, + { + "epoch": 0.125, + "loss": 8644.6943359375, + "loss_ce": 4.694034099578857, + "loss_xval": 8640.0, + "num_input_tokens_seen": 143360, + "step": 2 + }, + { + "epoch": 0.1875, + "grad_norm": 404496.1754390865, + "learning_rate": 9.948565289251939e-06, + "loss": 10052.5811, + "num_input_tokens_seen": 215040, + "step": 3 + }, + { + "epoch": 0.1875, + "loss": 10884.490234375, + "loss_ce": 4.490614891052246, + "loss_xval": 10880.0, + "num_input_tokens_seen": 215040, + "step": 3 + }, + { + "epoch": 0.25, + "grad_norm": 192550.26822256536, + "learning_rate": 1.255369169267456e-05, + "loss": 3084.5933, + "num_input_tokens_seen": 286848, + "step": 4 + }, + { + "epoch": 0.25, + "loss": 3044.640380859375, + "loss_ce": 4.640501499176025, + "loss_xval": 3040.0, + "num_input_tokens_seen": 286848, + "step": 4 + }, + { + "epoch": 0.3125, + "grad_norm": 45415.22626884781, + "learning_rate": 1.4574384717887574e-05, + "loss": 373.7348, + "num_input_tokens_seen": 358592, + "step": 5 + }, + { + "epoch": 0.3125, + "loss": 460.8260192871094, + "loss_ce": 4.826022148132324, + "loss_xval": 456.0, + "num_input_tokens_seen": 358592, + "step": 5 + }, + { + "epoch": 0.375, + "grad_norm": 78593.80926834777, + "learning_rate": 1.6225411135589218e-05, + "loss": 934.9646, + "num_input_tokens_seen": 430208, + "step": 6 + }, + { + "epoch": 0.375, + "loss": 880.810791015625, + "loss_ce": 4.810770034790039, + "loss_xval": 876.0, + "num_input_tokens_seen": 430208, + "step": 6 + }, + { + "epoch": 0.4375, + "grad_norm": 28745.67233353964, + "learning_rate": 1.762133408171179e-05, + "loss": 258.9128, + "num_input_tokens_seen": 501952, + "step": 7 + }, + { + "epoch": 0.4375, + "loss": 208.93655395507812, + "loss_ce": 4.936546325683594, + "loss_xval": 204.0, + "num_input_tokens_seen": 501952, + "step": 7 + }, + { + "epoch": 0.5, + "grad_norm": 7699.87093071136, + "learning_rate": 1.883053753901184e-05, + "loss": 57.5219, + "num_input_tokens_seen": 561088, + "step": 8 + }, + { + "epoch": 0.5, + "loss": 62.05592346191406, + "loss_ce": 4.8059234619140625, + "loss_xval": 57.25, + "num_input_tokens_seen": 561088, + "step": 8 + }, + { + "epoch": 0.5625, + "grad_norm": 16712.558684854343, + "learning_rate": 
1.9897130578503877e-05, + "loss": 95.5276, + "num_input_tokens_seen": 632768, + "step": 9 + }, + { + "epoch": 0.5625, + "loss": 103.8332290649414, + "loss_ce": 4.8332319259643555, + "loss_xval": 99.0, + "num_input_tokens_seen": 632768, + "step": 9 + }, + { + "epoch": 0.625, + "grad_norm": 17002.791469915122, + "learning_rate": 2.0851230564224858e-05, + "loss": 87.0461, + "num_input_tokens_seen": 704320, + "step": 10 + }, + { + "epoch": 0.625, + "loss": 110.79865264892578, + "loss_ce": 4.798652648925781, + "loss_xval": 106.0, + "num_input_tokens_seen": 704320, + "step": 10 + }, + { + "epoch": 0.6875, + "grad_norm": 5949.488232484429, + "learning_rate": 2.1714318986131375e-05, + "loss": 20.0366, + "num_input_tokens_seen": 776064, + "step": 11 + }, + { + "epoch": 0.6875, + "loss": 20.34089469909668, + "loss_ce": 4.7783942222595215, + "loss_xval": 15.5625, + "num_input_tokens_seen": 776064, + "step": 11 + }, + { + "epoch": 0.75, + "grad_norm": 4687.070808057291, + "learning_rate": 2.2502256981926498e-05, + "loss": 18.7925, + "num_input_tokens_seen": 835200, + "step": 12 + }, + { + "epoch": 0.75, + "loss": 21.56426429748535, + "loss_ce": 4.689263820648193, + "loss_xval": 16.875, + "num_input_tokens_seen": 835200, + "step": 12 + }, + { + "epoch": 0.8125, + "grad_norm": 3537.3965456054607, + "learning_rate": 2.3227089674435412e-05, + "loss": 22.4419, + "num_input_tokens_seen": 906944, + "step": 13 + }, + { + "epoch": 0.8125, + "loss": 25.58965301513672, + "loss_ce": 4.964653968811035, + "loss_xval": 20.625, + "num_input_tokens_seen": 906944, + "step": 13 + }, + { + "epoch": 0.875, + "grad_norm": 9298.24053084731, + "learning_rate": 2.389817992804907e-05, + "loss": 53.2494, + "num_input_tokens_seen": 978624, + "step": 14 + }, + { + "epoch": 0.875, + "loss": 74.64051818847656, + "loss_ce": 5.1405205726623535, + "loss_xval": 69.5, + "num_input_tokens_seen": 978624, + "step": 14 + }, + { + "epoch": 0.9375, + "grad_norm": 4254.288944119962, + "learning_rate": 2.4522950007139514e-05, + "loss": 22.9845, + "num_input_tokens_seen": 1050368, + "step": 15 + }, + { + "epoch": 0.9375, + "loss": 23.68242645263672, + "loss_ce": 5.182426929473877, + "loss_xval": 18.5, + "num_input_tokens_seen": 1050368, + "step": 15 + }, + { + "epoch": 1.0, + "grad_norm": 2730.8534629090714, + "learning_rate": 2.510738338534912e-05, + "loss": 11.6825, + "num_input_tokens_seen": 1122112, + "step": 16 + }, + { + "epoch": 1.0, + "loss": 12.665728569030762, + "loss_ce": 5.321978569030762, + "loss_xval": 7.34375, + "num_input_tokens_seen": 1122112, + "step": 16 + }, + { + "epoch": 1.0625, + "grad_norm": 1268.1179480482795, + "learning_rate": 2.5656374157160173e-05, + "loss": 9.2256, + "num_input_tokens_seen": 1193664, + "step": 17 + }, + { + "epoch": 1.0625, + "loss": 9.186548233032227, + "loss_ce": 5.295922756195068, + "loss_xval": 3.890625, + "num_input_tokens_seen": 1193664, + "step": 17 + }, + { + "epoch": 1.125, + "grad_norm": 1232.2475849082016, + "learning_rate": 2.6173976424841157e-05, + "loss": 8.0166, + "num_input_tokens_seen": 1265408, + "step": 18 + }, + { + "epoch": 1.125, + "loss": 8.235204696655273, + "loss_ce": 5.641454696655273, + "loss_xval": 2.59375, + "num_input_tokens_seen": 1265408, + "step": 18 + }, + { + "epoch": 1.1875, + "grad_norm": 930.3138210861072, + "learning_rate": 2.666358616830022e-05, + "loss": 7.102, + "num_input_tokens_seen": 1324608, + "step": 19 + }, + { + "epoch": 1.1875, + "loss": 6.875672817230225, + "loss_ce": 5.766297817230225, + "loss_xval": 1.109375, + "num_input_tokens_seen": 1324608, + 
"step": 19 + }, + { + "epoch": 1.25, + "grad_norm": 504.1642954319787, + "learning_rate": 2.7128076410562138e-05, + "loss": 6.509, + "num_input_tokens_seen": 1383616, + "step": 20 + }, + { + "epoch": 1.25, + "loss": 6.596158981323242, + "loss_ce": 5.393033981323242, + "loss_xval": 1.203125, + "num_input_tokens_seen": 1383616, + "step": 20 + }, + { + "epoch": 1.3125, + "grad_norm": 259.8236571378379, + "learning_rate": 2.756989937096373e-05, + "loss": 6.418, + "num_input_tokens_seen": 1455232, + "step": 21 + }, + { + "epoch": 1.3125, + "loss": 6.42617654800415, + "loss_ce": 5.99453592300415, + "loss_xval": 0.431640625, + "num_input_tokens_seen": 1455232, + "step": 21 + }, + { + "epoch": 1.375, + "grad_norm": 456.6421855261366, + "learning_rate": 2.799116483246866e-05, + "loss": 6.3956, + "num_input_tokens_seen": 1526848, + "step": 22 + }, + { + "epoch": 1.375, + "loss": 6.097779750823975, + "loss_ce": 5.468873500823975, + "loss_xval": 0.62890625, + "num_input_tokens_seen": 1526848, + "step": 22 + }, + { + "epoch": 1.4375, + "grad_norm": 566.2674289085138, + "learning_rate": 2.8393701074525802e-05, + "loss": 6.6971, + "num_input_tokens_seen": 1598528, + "step": 23 + }, + { + "epoch": 1.4375, + "loss": 6.89132833480835, + "loss_ce": 5.68820333480835, + "loss_xval": 1.203125, + "num_input_tokens_seen": 1598528, + "step": 23 + }, + { + "epoch": 1.5, + "grad_norm": 776.1354374337782, + "learning_rate": 2.877910282826378e-05, + "loss": 7.5419, + "num_input_tokens_seen": 1670336, + "step": 24 + }, + { + "epoch": 1.5, + "loss": 7.6928510665893555, + "loss_ce": 6.2162885665893555, + "loss_xval": 1.4765625, + "num_input_tokens_seen": 1670336, + "step": 24 + }, + { + "epoch": 1.5625, + "grad_norm": 487.7768849955227, + "learning_rate": 2.9148769435775147e-05, + "loss": 6.6087, + "num_input_tokens_seen": 1741952, + "step": 25 + }, + { + "epoch": 1.5625, + "loss": 6.729894638061523, + "loss_ce": 5.921300888061523, + "loss_xval": 0.80859375, + "num_input_tokens_seen": 1741952, + "step": 25 + }, + { + "epoch": 1.625, + "grad_norm": 126.59415634802222, + "learning_rate": 2.950393552077269e-05, + "loss": 5.676, + "num_input_tokens_seen": 1788608, + "step": 26 + }, + { + "epoch": 1.625, + "loss": 5.721405982971191, + "loss_ce": 5.501679420471191, + "loss_xval": 0.2197265625, + "num_input_tokens_seen": 1788608, + "step": 26 + }, + { + "epoch": 1.6875, + "grad_norm": 195.61738909493855, + "learning_rate": 2.9845695867755812e-05, + "loss": 5.8845, + "num_input_tokens_seen": 1847680, + "step": 27 + }, + { + "epoch": 1.6875, + "loss": 6.124744415283203, + "loss_ce": 5.868885040283203, + "loss_xval": 0.255859375, + "num_input_tokens_seen": 1847680, + "step": 27 + }, + { + "epoch": 1.75, + "grad_norm": 350.88859252845054, + "learning_rate": 3.017502577438635e-05, + "loss": 6.1343, + "num_input_tokens_seen": 1919360, + "step": 28 + }, + { + "epoch": 1.75, + "loss": 5.883708953857422, + "loss_ce": 5.438396453857422, + "loss_xval": 0.4453125, + "num_input_tokens_seen": 1919360, + "step": 28 + }, + { + "epoch": 1.8125, + "grad_norm": 447.93921756481274, + "learning_rate": 3.049279783085195e-05, + "loss": 6.0422, + "num_input_tokens_seen": 1990976, + "step": 29 + }, + { + "epoch": 1.8125, + "loss": 6.0027031898498535, + "loss_ce": 5.3698906898498535, + "loss_xval": 0.6328125, + "num_input_tokens_seen": 1990976, + "step": 29 + }, + { + "epoch": 1.875, + "grad_norm": 309.12130084336945, + "learning_rate": 3.079979585347679e-05, + "loss": 5.8828, + "num_input_tokens_seen": 2062656, + "step": 30 + }, + { + "epoch": 1.875, + 
"loss": 5.82305383682251, + "loss_ce": 5.50078821182251, + "loss_xval": 0.322265625, + "num_input_tokens_seen": 2062656, + "step": 30 + }, + { + "epoch": 1.9375, + "grad_norm": 274.02947169042403, + "learning_rate": 3.1096726532791335e-05, + "loss": 5.6858, + "num_input_tokens_seen": 2134400, + "step": 31 + }, + { + "epoch": 1.9375, + "loss": 5.657408237457275, + "loss_ce": 5.414244174957275, + "loss_xval": 0.2431640625, + "num_input_tokens_seen": 2134400, + "step": 31 + }, + { + "epoch": 2.0, + "grad_norm": 88.1339226566502, + "learning_rate": 3.1384229231686404e-05, + "loss": 5.118, + "num_input_tokens_seen": 2205952, + "step": 32 + }, + { + "epoch": 2.0, + "loss": 5.154168605804443, + "loss_ce": 5.036981105804443, + "loss_xval": 0.1171875, + "num_input_tokens_seen": 2205952, + "step": 32 + }, + { + "epoch": 2.0625, + "grad_norm": 230.87518409285667, + "learning_rate": 3.1662884275383314e-05, + "loss": 5.265, + "num_input_tokens_seen": 2277632, + "step": 33 + }, + { + "epoch": 2.0625, + "loss": 5.119575500488281, + "loss_ce": 4.890083312988281, + "loss_xval": 0.2294921875, + "num_input_tokens_seen": 2277632, + "step": 33 + }, + { + "epoch": 2.125, + "grad_norm": 301.8125776339725, + "learning_rate": 3.1933220003497456e-05, + "loss": 5.1474, + "num_input_tokens_seen": 2336768, + "step": 34 + }, + { + "epoch": 2.125, + "loss": 5.0475993156433105, + "loss_ce": 4.6159586906433105, + "loss_xval": 0.431640625, + "num_input_tokens_seen": 2336768, + "step": 34 + }, + { + "epoch": 2.1875, + "grad_norm": 362.43387778423926, + "learning_rate": 3.219571879959937e-05, + "loss": 5.2475, + "num_input_tokens_seen": 2408448, + "step": 35 + }, + { + "epoch": 2.1875, + "loss": 5.348223686218262, + "loss_ce": 4.916583061218262, + "loss_xval": 0.431640625, + "num_input_tokens_seen": 2408448, + "step": 35 + }, + { + "epoch": 2.25, + "grad_norm": 240.2118877111659, + "learning_rate": 3.2450822271178436e-05, + "loss": 4.8557, + "num_input_tokens_seen": 2480064, + "step": 36 + }, + { + "epoch": 2.25, + "loss": 4.754091262817383, + "loss_ce": 4.496278762817383, + "loss_xval": 0.2578125, + "num_input_tokens_seen": 2480064, + "step": 36 + }, + { + "epoch": 2.3125, + "grad_norm": 112.64967575290002, + "learning_rate": 3.269893571973584e-05, + "loss": 4.644, + "num_input_tokens_seen": 2551616, + "step": 37 + }, + { + "epoch": 2.3125, + "loss": 4.523304462432861, + "loss_ce": 4.384632587432861, + "loss_xval": 0.138671875, + "num_input_tokens_seen": 2551616, + "step": 37 + }, + { + "epoch": 2.375, + "grad_norm": 112.36137777715612, + "learning_rate": 3.29404320146375e-05, + "loss": 4.4596, + "num_input_tokens_seen": 2623360, + "step": 38 + }, + { + "epoch": 2.375, + "loss": 4.345686435699463, + "loss_ce": 4.292219638824463, + "loss_xval": 0.053466796875, + "num_input_tokens_seen": 2623360, + "step": 38 + }, + { + "epoch": 2.4375, + "grad_norm": 178.55025735379013, + "learning_rate": 3.317565496368735e-05, + "loss": 4.2165, + "num_input_tokens_seen": 2682496, + "step": 39 + }, + { + "epoch": 2.4375, + "loss": 4.244419097900391, + "loss_ce": 4.049106597900391, + "loss_xval": 0.1953125, + "num_input_tokens_seen": 2682496, + "step": 39 + }, + { + "epoch": 2.5, + "grad_norm": 315.4850255105284, + "learning_rate": 3.340492225689942e-05, + "loss": 4.4928, + "num_input_tokens_seen": 2754048, + "step": 40 + }, + { + "epoch": 2.5, + "loss": 4.407598972320557, + "loss_ce": 4.122442722320557, + "loss_xval": 0.28515625, + "num_input_tokens_seen": 2754048, + "step": 40 + }, + { + "epoch": 2.5625, + "grad_norm": 232.48313805962374, + 
"learning_rate": 3.362852804672299e-05, + "loss": 4.244, + "num_input_tokens_seen": 2825728, + "step": 41 + }, + { + "epoch": 2.5625, + "loss": 4.199363708496094, + "loss_ce": 3.9737777709960938, + "loss_xval": 0.2255859375, + "num_input_tokens_seen": 2825728, + "step": 41 + }, + { + "epoch": 2.625, + "grad_norm": 87.66313343151434, + "learning_rate": 3.384674521730101e-05, + "loss": 3.7806, + "num_input_tokens_seen": 2884800, + "step": 42 + }, + { + "epoch": 2.625, + "loss": 3.834822177886963, + "loss_ce": 3.755720615386963, + "loss_xval": 0.0791015625, + "num_input_tokens_seen": 2884800, + "step": 42 + }, + { + "epoch": 2.6875, + "grad_norm": 99.57816021937762, + "learning_rate": 3.405982738667825e-05, + "loss": 3.9589, + "num_input_tokens_seen": 2956416, + "step": 43 + }, + { + "epoch": 2.6875, + "loss": 3.855365753173828, + "loss_ce": 3.767475128173828, + "loss_xval": 0.087890625, + "num_input_tokens_seen": 2956416, + "step": 43 + }, + { + "epoch": 2.75, + "grad_norm": 157.27395398869928, + "learning_rate": 3.4268010678805934e-05, + "loss": 3.7278, + "num_input_tokens_seen": 3028032, + "step": 44 + }, + { + "epoch": 2.75, + "loss": 3.758025884628296, + "loss_ce": 3.646697759628296, + "loss_xval": 0.111328125, + "num_input_tokens_seen": 3028032, + "step": 44 + }, + { + "epoch": 2.8125, + "grad_norm": 199.10929648202688, + "learning_rate": 3.447151529639145e-05, + "loss": 3.6274, + "num_input_tokens_seen": 3099648, + "step": 45 + }, + { + "epoch": 2.8125, + "loss": 3.6010501384735107, + "loss_ce": 3.4106204509735107, + "loss_xval": 0.1904296875, + "num_input_tokens_seen": 3099648, + "step": 45 + }, + { + "epoch": 2.875, + "grad_norm": 188.23721951133405, + "learning_rate": 3.467054692086308e-05, + "loss": 3.4598, + "num_input_tokens_seen": 3158720, + "step": 46 + }, + { + "epoch": 2.875, + "loss": 3.4753458499908447, + "loss_ce": 3.3415567874908447, + "loss_xval": 0.1337890625, + "num_input_tokens_seen": 3158720, + "step": 46 + }, + { + "epoch": 2.9375, + "grad_norm": 52.48630854608108, + "learning_rate": 3.486529796176414e-05, + "loss": 3.3459, + "num_input_tokens_seen": 3230400, + "step": 47 + }, + { + "epoch": 2.9375, + "loss": 3.3899075984954834, + "loss_ce": 3.3230130672454834, + "loss_xval": 0.06689453125, + "num_input_tokens_seen": 3230400, + "step": 47 + }, + { + "epoch": 3.0, + "grad_norm": 78.14763974150075, + "learning_rate": 3.505594867460106e-05, + "loss": 3.2297, + "num_input_tokens_seen": 3301952, + "step": 48 + }, + { + "epoch": 3.0, + "loss": 3.199742555618286, + "loss_ce": 3.146764039993286, + "loss_xval": 0.052978515625, + "num_input_tokens_seen": 3301952, + "step": 48 + }, + { + "epoch": 3.0625, + "grad_norm": 107.09323318280654, + "learning_rate": 3.524266816342358e-05, + "loss": 3.1971, + "num_input_tokens_seen": 3361088, + "step": 49 + }, + { + "epoch": 3.0625, + "loss": 3.155874013900757, + "loss_ce": 3.062124013900757, + "loss_xval": 0.09375, + "num_input_tokens_seen": 3361088, + "step": 49 + }, + { + "epoch": 3.125, + "grad_norm": 175.2975707620488, + "learning_rate": 3.542561528211243e-05, + "loss": 3.178, + "num_input_tokens_seen": 3432704, + "step": 50 + }, + { + "epoch": 3.125, + "loss": 3.104872941970825, + "loss_ce": 3.011611223220825, + "loss_xval": 0.09326171875, + "num_input_tokens_seen": 3432704, + "step": 50 + }, + { + "epoch": 3.1875, + "grad_norm": 127.36496606890086, + "learning_rate": 3.560493944641211e-05, + "loss": 2.9993, + "num_input_tokens_seen": 3504320, + "step": 51 + }, + { + "epoch": 3.1875, + "eval_synth_IoU": 0.0066440212685847655, + 
"eval_synth_MAE_x": 0.12607574462890625, + "eval_synth_MAE_y": 0.268218994140625, + "eval_synth_NUM_probability": 2.8884072733603716e-07, + "eval_synth_inside_bbox": 0.0625, + "eval_synth_loss": 2.902937412261963, + "eval_synth_loss_ce": 2.8528276681900024, + "eval_synth_loss_xval": 0.05010986328125, + "eval_synth_runtime": 61.7514, + "eval_synth_samples_per_second": 2.073, + "eval_synth_steps_per_second": 0.065, + "num_input_tokens_seen": 3504320, + "step": 51 + }, + { + "epoch": 3.1875, + "loss": 2.87408447265625, + "loss_ce": 2.84063720703125, + "loss_xval": 0.033447265625, + "num_input_tokens_seen": 3504320, + "step": 51 + }, + { + "epoch": 3.25, + "grad_norm": 25.963210609727025, + "learning_rate": 3.578078136710997e-05, + "loss": 2.9028, + "num_input_tokens_seen": 3576000, + "step": 52 + }, + { + "epoch": 3.25, + "loss": 2.9049689769744873, + "loss_ce": 2.8453986644744873, + "loss_xval": 0.0595703125, + "num_input_tokens_seen": 3576000, + "step": 52 + }, + { + "epoch": 3.3125, + "grad_norm": 42.52841656307943, + "learning_rate": 3.595327371337536e-05, + "loss": 2.8712, + "num_input_tokens_seen": 3647616, + "step": 53 + }, + { + "epoch": 3.3125, + "loss": 2.880415678024292, + "loss_ce": 2.812544584274292, + "loss_xval": 0.06787109375, + "num_input_tokens_seen": 3647616, + "step": 53 + }, + { + "epoch": 3.375, + "grad_norm": 169.11167670262176, + "learning_rate": 3.6122541714093095e-05, + "loss": 2.845, + "num_input_tokens_seen": 3719168, + "step": 54 + }, + { + "epoch": 3.375, + "loss": 2.8389875888824463, + "loss_ce": 2.7012922763824463, + "loss_xval": 0.1376953125, + "num_input_tokens_seen": 3719168, + "step": 54 + }, + { + "epoch": 3.4375, + "grad_norm": 116.0472794869621, + "learning_rate": 3.628870370401895e-05, + "loss": 2.7699, + "num_input_tokens_seen": 3790784, + "step": 55 + }, + { + "epoch": 3.4375, + "loss": 2.774611234664917, + "loss_ce": 2.682814359664917, + "loss_xval": 0.091796875, + "num_input_tokens_seen": 3790784, + "step": 55 + }, + { + "epoch": 3.5, + "grad_norm": 69.64980837437214, + "learning_rate": 3.6451871620723636e-05, + "loss": 2.6313, + "num_input_tokens_seen": 3849792, + "step": 56 + }, + { + "epoch": 3.5, + "loss": 2.609570026397705, + "loss_ce": 2.552441120147705, + "loss_xval": 0.05712890625, + "num_input_tokens_seen": 3849792, + "step": 56 + }, + { + "epoch": 3.5625, + "grad_norm": 27.048681544392675, + "learning_rate": 3.661215145755216e-05, + "loss": 2.5762, + "num_input_tokens_seen": 3921536, + "step": 57 + }, + { + "epoch": 3.5625, + "loss": 2.5791242122650146, + "loss_ce": 2.5366437435150146, + "loss_xval": 0.04248046875, + "num_input_tokens_seen": 3921536, + "step": 57 + }, + { + "epoch": 3.625, + "grad_norm": 93.63238587072217, + "learning_rate": 3.6769643677189227e-05, + "loss": 2.5727, + "num_input_tokens_seen": 3993280, + "step": 58 + }, + { + "epoch": 3.625, + "loss": 2.563948154449463, + "loss_ce": 2.504377841949463, + "loss_xval": 0.0595703125, + "num_input_tokens_seen": 3993280, + "step": 58 + }, + { + "epoch": 3.6875, + "grad_norm": 126.16773744523488, + "learning_rate": 3.692444358987175e-05, + "loss": 2.5464, + "num_input_tokens_seen": 4065024, + "step": 59 + }, + { + "epoch": 3.6875, + "loss": 2.493313789367676, + "loss_ce": 2.430325508117676, + "loss_xval": 0.06298828125, + "num_input_tokens_seen": 4065024, + "step": 59 + }, + { + "epoch": 3.75, + "grad_norm": 117.97609678952398, + "learning_rate": 3.707664169981407e-05, + "loss": 2.4898, + "num_input_tokens_seen": 4136768, + "step": 60 + }, + { + "epoch": 3.75, + "loss": 
2.492903232574463, + "loss_ce": 2.384016513824463, + "loss_xval": 0.10888671875, + "num_input_tokens_seen": 4136768, + "step": 60 + }, + { + "epoch": 3.8125, + "grad_norm": 33.00295945775187, + "learning_rate": 3.7226324022999023e-05, + "loss": 2.335, + "num_input_tokens_seen": 4208512, + "step": 61 + }, + { + "epoch": 3.8125, + "loss": 2.3376686573028564, + "loss_ce": 2.2946999073028564, + "loss_xval": 0.04296875, + "num_input_tokens_seen": 4208512, + "step": 61 + }, + { + "epoch": 3.875, + "grad_norm": 117.30328549034672, + "learning_rate": 3.737357237912862e-05, + "loss": 2.401, + "num_input_tokens_seen": 4280192, + "step": 62 + }, + { + "epoch": 3.875, + "loss": 2.417743682861328, + "loss_ce": 2.324970245361328, + "loss_xval": 0.0927734375, + "num_input_tokens_seen": 4280192, + "step": 62 + }, + { + "epoch": 3.9375, + "grad_norm": 111.69349851089615, + "learning_rate": 3.751846466021567e-05, + "loss": 2.2598, + "num_input_tokens_seen": 4351872, + "step": 63 + }, + { + "epoch": 3.9375, + "loss": 2.269806385040283, + "loss_ce": 2.207306385040283, + "loss_xval": 0.0625, + "num_input_tokens_seen": 4351872, + "step": 63 + }, + { + "epoch": 4.0, + "grad_norm": 28.60956298923437, + "learning_rate": 3.766107507802368e-05, + "loss": 2.2539, + "num_input_tokens_seen": 4423424, + "step": 64 + }, + { + "epoch": 4.0, + "loss": 2.2942304611206055, + "loss_ce": 2.2344160079956055, + "loss_xval": 0.059814453125, + "num_input_tokens_seen": 4423424, + "step": 64 + }, + { + "epoch": 4.0625, + "grad_norm": 18.836018192038598, + "learning_rate": 3.7801474392322984e-05, + "loss": 2.1486, + "num_input_tokens_seen": 4495040, + "step": 65 + }, + { + "epoch": 4.0625, + "loss": 2.1568853855133057, + "loss_ce": 2.1168463230133057, + "loss_xval": 0.0400390625, + "num_input_tokens_seen": 4495040, + "step": 65 + }, + { + "epoch": 4.125, + "grad_norm": 56.66330982290737, + "learning_rate": 3.7939730121720594e-05, + "loss": 2.1409, + "num_input_tokens_seen": 4566720, + "step": 66 + }, + { + "epoch": 4.125, + "loss": 2.1396796703338623, + "loss_ce": 2.1030585765838623, + "loss_xval": 0.03662109375, + "num_input_tokens_seen": 4566720, + "step": 66 + }, + { + "epoch": 4.1875, + "grad_norm": 70.88389886337272, + "learning_rate": 3.807590673863634e-05, + "loss": 2.0626, + "num_input_tokens_seen": 4625792, + "step": 67 + }, + { + "epoch": 4.1875, + "loss": 2.050058364868164, + "loss_ce": 2.017343521118164, + "loss_xval": 0.03271484375, + "num_input_tokens_seen": 4625792, + "step": 67 + }, + { + "epoch": 4.25, + "grad_norm": 55.513793058827545, + "learning_rate": 3.8210065849834735e-05, + "loss": 2.0257, + "num_input_tokens_seen": 4684800, + "step": 68 + }, + { + "epoch": 4.25, + "loss": 2.0458872318267822, + "loss_ce": 2.0043833255767822, + "loss_xval": 0.04150390625, + "num_input_tokens_seen": 4684800, + "step": 68 + }, + { + "epoch": 4.3125, + "grad_norm": 28.58848301667242, + "learning_rate": 3.834226636377775e-05, + "loss": 2.0008, + "num_input_tokens_seen": 4756608, + "step": 69 + }, + { + "epoch": 4.3125, + "loss": 2.0231857299804688, + "loss_ce": 1.9741133451461792, + "loss_xval": 0.049072265625, + "num_input_tokens_seen": 4756608, + "step": 69 + }, + { + "epoch": 4.375, + "grad_norm": 31.1682609533774, + "learning_rate": 3.847256464593665e-05, + "loss": 2.0347, + "num_input_tokens_seen": 4828352, + "step": 70 + }, + { + "epoch": 4.375, + "loss": 1.989190936088562, + "loss_ce": 1.929376482963562, + "loss_xval": 0.059814453125, + "num_input_tokens_seen": 4828352, + "step": 70 + }, + { + "epoch": 4.4375, + "grad_norm": 
122.59637628732717, + "learning_rate": 3.860101466308762e-05, + "loss": 1.94, + "num_input_tokens_seen": 4900160, + "step": 71 + }, + { + "epoch": 4.4375, + "loss": 1.920021414756775, + "loss_ce": 1.859474539756775, + "loss_xval": 0.060546875, + "num_input_tokens_seen": 4900160, + "step": 71 + }, + { + "epoch": 4.5, + "grad_norm": 84.61405063476768, + "learning_rate": 3.872766811751572e-05, + "loss": 1.8873, + "num_input_tokens_seen": 4971840, + "step": 72 + }, + { + "epoch": 4.5, + "loss": 1.9320684671401978, + "loss_ce": 1.8661504983901978, + "loss_xval": 0.06591796875, + "num_input_tokens_seen": 4971840, + "step": 72 + }, + { + "epoch": 4.5625, + "grad_norm": 31.840479159637216, + "learning_rate": 3.8852574571962525e-05, + "loss": 1.8108, + "num_input_tokens_seen": 5043584, + "step": 73 + }, + { + "epoch": 4.5625, + "loss": 1.7685670852661133, + "loss_ce": 1.7431764602661133, + "loss_xval": 0.025390625, + "num_input_tokens_seen": 5043584, + "step": 73 + }, + { + "epoch": 4.625, + "grad_norm": 25.983925310667896, + "learning_rate": 3.897578156607312e-05, + "loss": 1.8146, + "num_input_tokens_seen": 5102720, + "step": 74 + }, + { + "epoch": 4.625, + "loss": 1.8115485906600952, + "loss_ce": 1.7851814031600952, + "loss_xval": 0.0263671875, + "num_input_tokens_seen": 5102720, + "step": 74 + }, + { + "epoch": 4.6875, + "grad_norm": 74.29589067086889, + "learning_rate": 3.909733472502708e-05, + "loss": 1.7631, + "num_input_tokens_seen": 5174336, + "step": 75 + }, + { + "epoch": 4.6875, + "loss": 1.7935320138931274, + "loss_ce": 1.7349382638931274, + "loss_xval": 0.05859375, + "num_input_tokens_seen": 5174336, + "step": 75 + }, + { + "epoch": 4.75, + "grad_norm": 88.02846317965282, + "learning_rate": 3.921727786097478e-05, + "loss": 1.7123, + "num_input_tokens_seen": 5246016, + "step": 76 + }, + { + "epoch": 4.75, + "loss": 1.746011734008789, + "loss_ce": 1.680093765258789, + "loss_xval": 0.06591796875, + "num_input_tokens_seen": 5246016, + "step": 76 + }, + { + "epoch": 4.8125, + "grad_norm": 16.541784065292656, + "learning_rate": 3.933565306784317e-05, + "loss": 1.6954, + "num_input_tokens_seen": 5317568, + "step": 77 + }, + { + "epoch": 4.8125, + "loss": 1.6550248861312866, + "loss_ce": 1.6351274251937866, + "loss_xval": 0.0198974609375, + "num_input_tokens_seen": 5317568, + "step": 77 + }, + { + "epoch": 4.875, + "grad_norm": 47.77120385942182, + "learning_rate": 3.945250081002463e-05, + "loss": 1.6259, + "num_input_tokens_seen": 5389248, + "step": 78 + }, + { + "epoch": 4.875, + "loss": 1.6267539262771606, + "loss_ce": 1.5662070512771606, + "loss_xval": 0.060546875, + "num_input_tokens_seen": 5389248, + "step": 78 + }, + { + "epoch": 4.9375, + "grad_norm": 102.39096093646586, + "learning_rate": 3.9567860005416364e-05, + "loss": 1.5948, + "num_input_tokens_seen": 5460864, + "step": 79 + }, + { + "epoch": 4.9375, + "loss": 1.622156023979187, + "loss_ce": 1.566003680229187, + "loss_xval": 0.05615234375, + "num_input_tokens_seen": 5460864, + "step": 79 + }, + { + "epoch": 5.0, + "grad_norm": 58.587913611747304, + "learning_rate": 3.96817681032367e-05, + "loss": 1.5486, + "num_input_tokens_seen": 5532544, + "step": 80 + }, + { + "epoch": 5.0, + "loss": 1.5358805656433105, + "loss_ce": 1.5047526359558105, + "loss_xval": 0.0311279296875, + "num_input_tokens_seen": 5532544, + "step": 80 + }, + { + "epoch": 5.0625, + "grad_norm": 48.16275511376751, + "learning_rate": 3.9794261157007754e-05, + "loss": 1.5092, + "num_input_tokens_seen": 5604096, + "step": 81 + }, + { + "epoch": 5.0625, + "loss": 
1.4976553916931152, + "loss_ce": 1.4751944541931152, + "loss_xval": 0.0224609375, + "num_input_tokens_seen": 5604096, + "step": 81 + }, + { + "epoch": 5.125, + "grad_norm": 82.27402561349479, + "learning_rate": 3.990537389306027e-05, + "loss": 1.4799, + "num_input_tokens_seen": 5663168, + "step": 82 + }, + { + "epoch": 5.125, + "loss": 1.4801445007324219, + "loss_ce": 1.4425468444824219, + "loss_xval": 0.03759765625, + "num_input_tokens_seen": 5663168, + "step": 82 + }, + { + "epoch": 5.1875, + "grad_norm": 66.20344925699517, + "learning_rate": 4.001513977488632e-05, + "loss": 1.4557, + "num_input_tokens_seen": 5722240, + "step": 83 + }, + { + "epoch": 5.1875, + "loss": 1.4894322156906128, + "loss_ce": 1.4515904188156128, + "loss_xval": 0.037841796875, + "num_input_tokens_seen": 5722240, + "step": 83 + }, + { + "epoch": 5.25, + "grad_norm": 26.081096904088774, + "learning_rate": 4.012359106363829e-05, + "loss": 1.3982, + "num_input_tokens_seen": 5793984, + "step": 84 + }, + { + "epoch": 5.25, + "loss": 1.409085750579834, + "loss_ce": 1.382108211517334, + "loss_xval": 0.0269775390625, + "num_input_tokens_seen": 5793984, + "step": 84 + }, + { + "epoch": 5.3125, + "grad_norm": 82.13112661521336, + "learning_rate": 4.023075887504775e-05, + "loss": 1.3883, + "num_input_tokens_seen": 5865600, + "step": 85 + }, + { + "epoch": 5.3125, + "loss": 1.3842965364456177, + "loss_ce": 1.3257027864456177, + "loss_xval": 0.05859375, + "num_input_tokens_seen": 5865600, + "step": 85 + }, + { + "epoch": 5.375, + "grad_norm": 58.38568800603289, + "learning_rate": 4.033667323301552e-05, + "loss": 1.358, + "num_input_tokens_seen": 5937152, + "step": 86 + }, + { + "epoch": 5.375, + "loss": 1.357524037361145, + "loss_ce": 1.327738881111145, + "loss_xval": 0.02978515625, + "num_input_tokens_seen": 5937152, + "step": 86 + }, + { + "epoch": 5.4375, + "grad_norm": 29.264244807039464, + "learning_rate": 4.0441363120103886e-05, + "loss": 1.295, + "num_input_tokens_seen": 6008960, + "step": 87 + }, + { + "epoch": 5.4375, + "loss": 1.2950177192687988, + "loss_ce": 1.2564435005187988, + "loss_xval": 0.03857421875, + "num_input_tokens_seen": 6008960, + "step": 87 + }, + { + "epoch": 5.5, + "grad_norm": 58.26095726094696, + "learning_rate": 4.054485652514322e-05, + "loss": 1.2911, + "num_input_tokens_seen": 6080640, + "step": 88 + }, + { + "epoch": 5.5, + "loss": 1.3091117143630981, + "loss_ce": 1.2700492143630981, + "loss_xval": 0.0390625, + "num_input_tokens_seen": 6080640, + "step": 88 + }, + { + "epoch": 5.5625, + "grad_norm": 52.30535783546792, + "learning_rate": 4.0647180488148894e-05, + "loss": 1.2402, + "num_input_tokens_seen": 6152192, + "step": 89 + }, + { + "epoch": 5.5625, + "loss": 1.2014875411987305, + "loss_ce": 1.1710920333862305, + "loss_xval": 0.0303955078125, + "num_input_tokens_seen": 6152192, + "step": 89 + }, + { + "epoch": 5.625, + "grad_norm": 33.50300709768344, + "learning_rate": 4.074836114272873e-05, + "loss": 1.2258, + "num_input_tokens_seen": 6223872, + "step": 90 + }, + { + "epoch": 5.625, + "loss": 1.2106300592422485, + "loss_ce": 1.1835304498672485, + "loss_xval": 0.027099609375, + "num_input_tokens_seen": 6223872, + "step": 90 + }, + { + "epoch": 5.6875, + "grad_norm": 42.07708626662781, + "learning_rate": 4.08484237561472e-05, + "loss": 1.1905, + "num_input_tokens_seen": 6295552, + "step": 91 + }, + { + "epoch": 5.6875, + "loss": 1.1875571012496948, + "loss_ce": 1.1519125699996948, + "loss_xval": 0.03564453125, + "num_input_tokens_seen": 6295552, + "step": 91 + }, + { + "epoch": 5.75, + 
"grad_norm": 26.631374919077388, + "learning_rate": 4.094739276720037e-05, + "loss": 1.1531, + "num_input_tokens_seen": 6367232, + "step": 92 + }, + { + "epoch": 5.75, + "loss": 1.143760323524475, + "loss_ce": 1.108848214149475, + "loss_xval": 0.034912109375, + "num_input_tokens_seen": 6367232, + "step": 92 + }, + { + "epoch": 5.8125, + "grad_norm": 11.203893962892462, + "learning_rate": 4.104529182204328e-05, + "loss": 1.1346, + "num_input_tokens_seen": 6438848, + "step": 93 + }, + { + "epoch": 5.8125, + "loss": 1.1466225385665894, + "loss_ce": 1.1186684370040894, + "loss_xval": 0.0279541015625, + "num_input_tokens_seen": 6438848, + "step": 93 + }, + { + "epoch": 5.875, + "grad_norm": 16.897054232717412, + "learning_rate": 4.114214380810143e-05, + "loss": 1.1241, + "num_input_tokens_seen": 6510528, + "step": 94 + }, + { + "epoch": 5.875, + "loss": 1.109265923500061, + "loss_ce": 1.076795220375061, + "loss_xval": 0.032470703125, + "num_input_tokens_seen": 6510528, + "step": 94 + }, + { + "epoch": 5.9375, + "grad_norm": 20.24250335227694, + "learning_rate": 4.1237970886187796e-05, + "loss": 1.0701, + "num_input_tokens_seen": 6582144, + "step": 95 + }, + { + "epoch": 5.9375, + "loss": 1.067662000656128, + "loss_ce": 1.053135633468628, + "loss_xval": 0.0145263671875, + "num_input_tokens_seen": 6582144, + "step": 95 + }, + { + "epoch": 6.0, + "grad_norm": 45.49070360548281, + "learning_rate": 4.1332794520938336e-05, + "loss": 1.0772, + "num_input_tokens_seen": 6653696, + "step": 96 + }, + { + "epoch": 6.0, + "loss": 1.0987242460250854, + "loss_ce": 1.0538023710250854, + "loss_xval": 0.044921875, + "num_input_tokens_seen": 6653696, + "step": 96 + }, + { + "epoch": 6.0625, + "grad_norm": 35.75220054268847, + "learning_rate": 4.1426635509670346e-05, + "loss": 1.0282, + "num_input_tokens_seen": 6725312, + "step": 97 + }, + { + "epoch": 6.0625, + "loss": 1.017682433128357, + "loss_ce": 0.9949773550033569, + "loss_xval": 0.022705078125, + "num_input_tokens_seen": 6725312, + "step": 97 + }, + { + "epoch": 6.125, + "grad_norm": 47.313168802314216, + "learning_rate": 4.151951400976087e-05, + "loss": 1.0307, + "num_input_tokens_seen": 6796864, + "step": 98 + }, + { + "epoch": 6.125, + "loss": 1.0477159023284912, + "loss_ce": 0.990831196308136, + "loss_xval": 0.056884765625, + "num_input_tokens_seen": 6796864, + "step": 98 + }, + { + "epoch": 6.1875, + "grad_norm": 76.72953585821251, + "learning_rate": 4.1611449564635246e-05, + "loss": 1.0313, + "num_input_tokens_seen": 6855936, + "step": 99 + }, + { + "epoch": 6.1875, + "loss": 1.008699893951416, + "loss_ce": 0.965242862701416, + "loss_xval": 0.04345703125, + "num_input_tokens_seen": 6855936, + "step": 99 + }, + { + "epoch": 6.25, + "grad_norm": 17.94697157489963, + "learning_rate": 4.1702461128449717e-05, + "loss": 0.9989, + "num_input_tokens_seen": 6915136, + "step": 100 + }, + { + "epoch": 6.25, + "loss": 0.9928085803985596, + "loss_ce": 0.9674179553985596, + "loss_xval": 0.025390625, + "num_input_tokens_seen": 6915136, + "step": 100 + }, + { + "epoch": 6.3125, + "grad_norm": 33.2144512243145, + "learning_rate": 4.179256708954579e-05, + "loss": 0.9649, + "num_input_tokens_seen": 6986752, + "step": 101 + }, + { + "epoch": 6.3125, + "loss": 0.9760175943374634, + "loss_ce": 0.9579511880874634, + "loss_xval": 0.01806640625, + "num_input_tokens_seen": 6986752, + "step": 101 + }, + { + "epoch": 6.375, + "grad_norm": 83.07224534219694, + "learning_rate": 4.188178529274939e-05, + "loss": 0.9689, + "num_input_tokens_seen": 7058496, + "step": 102 + }, + { + 
"epoch": 6.375, + "loss": 0.9563613533973694, + "loss_ce": 0.9209609627723694, + "loss_xval": 0.035400390625, + "num_input_tokens_seen": 7058496, + "step": 102 + }, + { + "epoch": 6.4375, + "grad_norm": 11.471453835567821, + "learning_rate": 4.197013306058202e-05, + "loss": 0.9529, + "num_input_tokens_seen": 7130048, + "step": 103 + }, + { + "epoch": 6.4375, + "loss": 0.9473816752433777, + "loss_ce": 0.9284607768058777, + "loss_xval": 0.0189208984375, + "num_input_tokens_seen": 7130048, + "step": 103 + }, + { + "epoch": 6.5, + "grad_norm": 20.514656398784428, + "learning_rate": 4.205762721344725e-05, + "loss": 0.9252, + "num_input_tokens_seen": 7201664, + "step": 104 + }, + { + "epoch": 6.5, + "loss": 0.9248759746551514, + "loss_ce": 0.9055888652801514, + "loss_xval": 0.019287109375, + "num_input_tokens_seen": 7201664, + "step": 104 + }, + { + "epoch": 6.5625, + "grad_norm": 54.333103205036785, + "learning_rate": 4.21442840888513e-05, + "loss": 0.9309, + "num_input_tokens_seen": 7273344, + "step": 105 + }, + { + "epoch": 6.5625, + "loss": 0.9247029423713684, + "loss_ce": 0.8912556767463684, + "loss_xval": 0.033447265625, + "num_input_tokens_seen": 7273344, + "step": 105 + }, + { + "epoch": 6.625, + "grad_norm": 42.01503117611703, + "learning_rate": 4.223011955971264e-05, + "loss": 0.8987, + "num_input_tokens_seen": 7345152, + "step": 106 + }, + { + "epoch": 6.625, + "loss": 0.8918097019195557, + "loss_ce": 0.8678839206695557, + "loss_xval": 0.02392578125, + "num_input_tokens_seen": 7345152, + "step": 106 + }, + { + "epoch": 6.6875, + "grad_norm": 64.62949480171359, + "learning_rate": 4.231514905181194e-05, + "loss": 0.8985, + "num_input_tokens_seen": 7416832, + "step": 107 + }, + { + "epoch": 6.6875, + "loss": 0.8956446051597595, + "loss_ce": 0.8673242926597595, + "loss_xval": 0.0283203125, + "num_input_tokens_seen": 7416832, + "step": 107 + }, + { + "epoch": 6.75, + "grad_norm": 54.310221908362855, + "learning_rate": 4.2399387560430375e-05, + "loss": 0.8738, + "num_input_tokens_seen": 7488576, + "step": 108 + }, + { + "epoch": 6.75, + "loss": 0.8796428442001343, + "loss_ce": 0.8507121801376343, + "loss_xval": 0.0289306640625, + "num_input_tokens_seen": 7488576, + "step": 108 + }, + { + "epoch": 6.8125, + "grad_norm": 15.868960973624445, + "learning_rate": 4.2482849666221134e-05, + "loss": 0.8873, + "num_input_tokens_seen": 7560128, + "step": 109 + }, + { + "epoch": 6.8125, + "loss": 0.8466143012046814, + "loss_ce": 0.8327593207359314, + "loss_xval": 0.01385498046875, + "num_input_tokens_seen": 7560128, + "step": 109 + }, + { + "epoch": 6.875, + "grad_norm": 49.92541947914144, + "learning_rate": 4.2565549550356234e-05, + "loss": 0.8584, + "num_input_tokens_seen": 7631808, + "step": 110 + }, + { + "epoch": 6.875, + "loss": 0.840313732624054, + "loss_ce": 0.815655529499054, + "loss_xval": 0.024658203125, + "num_input_tokens_seen": 7631808, + "step": 110 + }, + { + "epoch": 6.9375, + "grad_norm": 36.27667896033985, + "learning_rate": 4.2647501008987776e-05, + "loss": 0.8169, + "num_input_tokens_seen": 7703488, + "step": 111 + }, + { + "epoch": 6.9375, + "loss": 0.8194260001182556, + "loss_ce": 0.7958664298057556, + "loss_xval": 0.0235595703125, + "num_input_tokens_seen": 7703488, + "step": 111 + }, + { + "epoch": 7.0, + "grad_norm": 28.119935449380055, + "learning_rate": 4.272871746706091e-05, + "loss": 0.8164, + "num_input_tokens_seen": 7762496, + "step": 112 + }, + { + "epoch": 7.0, + "loss": 0.8153378367424011, + "loss_ce": 0.7873837351799011, + "loss_xval": 0.0279541015625, + 
"num_input_tokens_seen": 7762496, + "step": 112 + }, + { + "epoch": 7.0625, + "grad_norm": 39.909688575941395, + "learning_rate": 4.280921199151268e-05, + "loss": 0.8144, + "num_input_tokens_seen": 7834240, + "step": 113 + }, + { + "epoch": 7.0625, + "loss": 0.8192792534828186, + "loss_ce": 0.7809491753578186, + "loss_xval": 0.038330078125, + "num_input_tokens_seen": 7834240, + "step": 113 + }, + { + "epoch": 7.125, + "grad_norm": 18.717218960055316, + "learning_rate": 4.288899730388944e-05, + "loss": 0.788, + "num_input_tokens_seen": 7905984, + "step": 114 + }, + { + "epoch": 7.125, + "loss": 0.7716240882873535, + "loss_ce": 0.7491631507873535, + "loss_xval": 0.0224609375, + "num_input_tokens_seen": 7905984, + "step": 114 + }, + { + "epoch": 7.1875, + "grad_norm": 60.9610528810964, + "learning_rate": 4.296808579241338e-05, + "loss": 0.7927, + "num_input_tokens_seen": 7977600, + "step": 115 + }, + { + "epoch": 7.1875, + "loss": 0.7663066387176514, + "loss_ce": 0.7440898418426514, + "loss_xval": 0.022216796875, + "num_input_tokens_seen": 7977600, + "step": 115 + }, + { + "epoch": 7.25, + "grad_norm": 9.439264469153885, + "learning_rate": 4.3046489523526506e-05, + "loss": 0.7562, + "num_input_tokens_seen": 8036672, + "step": 116 + }, + { + "epoch": 7.25, + "loss": 0.7505704760551453, + "loss_ce": 0.7373258471488953, + "loss_xval": 0.01324462890625, + "num_input_tokens_seen": 8036672, + "step": 116 + }, + { + "epoch": 7.3125, + "grad_norm": 14.835568869475779, + "learning_rate": 4.312422025293929e-05, + "loss": 0.7503, + "num_input_tokens_seen": 8108352, + "step": 117 + }, + { + "epoch": 7.3125, + "loss": 0.758650004863739, + "loss_ce": 0.728498637676239, + "loss_xval": 0.0301513671875, + "num_input_tokens_seen": 8108352, + "step": 117 + }, + { + "epoch": 7.375, + "grad_norm": 36.075858696561966, + "learning_rate": 4.320128943620903e-05, + "loss": 0.7338, + "num_input_tokens_seen": 8180096, + "step": 118 + }, + { + "epoch": 7.375, + "loss": 0.7231059670448303, + "loss_ce": 0.7030864357948303, + "loss_xval": 0.02001953125, + "num_input_tokens_seen": 8180096, + "step": 118 + }, + { + "epoch": 7.4375, + "grad_norm": 16.369509919688415, + "learning_rate": 4.327770823887197e-05, + "loss": 0.711, + "num_input_tokens_seen": 8251648, + "step": 119 + }, + { + "epoch": 7.4375, + "loss": 0.6975811123847961, + "loss_ce": 0.6894634366035461, + "loss_xval": 0.00811767578125, + "num_input_tokens_seen": 8251648, + "step": 119 + }, + { + "epoch": 7.5, + "grad_norm": 23.42065778482628, + "learning_rate": 4.335348754615135e-05, + "loss": 0.7175, + "num_input_tokens_seen": 8323264, + "step": 120 + }, + { + "epoch": 7.5, + "loss": 0.7115622758865356, + "loss_ce": 0.6920310258865356, + "loss_xval": 0.01953125, + "num_input_tokens_seen": 8323264, + "step": 120 + }, + { + "epoch": 7.5625, + "grad_norm": 11.954631803701059, + "learning_rate": 4.342863797226275e-05, + "loss": 0.6866, + "num_input_tokens_seen": 8394880, + "step": 121 + }, + { + "epoch": 7.5625, + "loss": 0.687512993812561, + "loss_ce": 0.666028618812561, + "loss_xval": 0.021484375, + "num_input_tokens_seen": 8394880, + "step": 121 + }, + { + "epoch": 7.625, + "grad_norm": 10.161558994787343, + "learning_rate": 4.350316986933631e-05, + "loss": 0.6903, + "num_input_tokens_seen": 8466496, + "step": 122 + }, + { + "epoch": 7.625, + "loss": 0.6849098801612854, + "loss_ce": 0.6639137864112854, + "loss_xval": 0.02099609375, + "num_input_tokens_seen": 8466496, + "step": 122 + }, + { + "epoch": 7.6875, + "grad_norm": 41.66894560375791, + "learning_rate": 
4.357709333597492e-05, + "loss": 0.6819, + "num_input_tokens_seen": 8538048, + "step": 123 + }, + { + "epoch": 7.6875, + "loss": 0.6672234535217285, + "loss_ce": 0.6472039222717285, + "loss_xval": 0.02001953125, + "num_input_tokens_seen": 8538048, + "step": 123 + }, + { + "epoch": 7.75, + "grad_norm": 19.22585236986661, + "learning_rate": 4.36504182254659e-05, + "loss": 0.6671, + "num_input_tokens_seen": 8609728, + "step": 124 + }, + { + "epoch": 7.75, + "loss": 0.6756678819656372, + "loss_ce": 0.6427088975906372, + "loss_xval": 0.032958984375, + "num_input_tokens_seen": 8609728, + "step": 124 + }, + { + "epoch": 7.8125, + "grad_norm": 13.173396588019653, + "learning_rate": 4.372315415366273e-05, + "loss": 0.6541, + "num_input_tokens_seen": 8668800, + "step": 125 + }, + { + "epoch": 7.8125, + "loss": 0.6586058139801025, + "loss_ce": 0.6227171421051025, + "loss_xval": 0.035888671875, + "num_input_tokens_seen": 8668800, + "step": 125 + }, + { + "epoch": 7.875, + "grad_norm": 73.83611704448327, + "learning_rate": 4.379531050655295e-05, + "loss": 0.6407, + "num_input_tokens_seen": 8740480, + "step": 126 + }, + { + "epoch": 7.875, + "loss": 0.6286131739616394, + "loss_ce": 0.6065184473991394, + "loss_xval": 0.0220947265625, + "num_input_tokens_seen": 8740480, + "step": 126 + }, + { + "epoch": 7.9375, + "grad_norm": 20.34502886757861, + "learning_rate": 4.386689644752683e-05, + "loss": 0.6079, + "num_input_tokens_seen": 8812032, + "step": 127 + }, + { + "epoch": 7.9375, + "loss": 0.6007564067840576, + "loss_ce": 0.5874507427215576, + "loss_xval": 0.0133056640625, + "num_input_tokens_seen": 8812032, + "step": 127 + }, + { + "epoch": 8.0, + "grad_norm": 53.69771774292302, + "learning_rate": 4.3937920924360956e-05, + "loss": 0.6143, + "num_input_tokens_seen": 8871232, + "step": 128 + }, + { + "epoch": 8.0, + "loss": 0.6093257665634155, + "loss_ce": 0.5786861181259155, + "loss_xval": 0.0306396484375, + "num_input_tokens_seen": 8871232, + "step": 128 + }, + { + "epoch": 8.0625, + "grad_norm": 39.90429875950801, + "learning_rate": 4.400839267593018e-05, + "loss": 0.5816, + "num_input_tokens_seen": 8930432, + "step": 129 + }, + { + "epoch": 8.0625, + "loss": 0.5776582360267639, + "loss_ce": 0.5531221032142639, + "loss_xval": 0.0245361328125, + "num_input_tokens_seen": 8930432, + "step": 129 + }, + { + "epoch": 8.125, + "grad_norm": 42.8106886505879, + "learning_rate": 4.407832023866026e-05, + "loss": 0.5801, + "num_input_tokens_seen": 9002176, + "step": 130 + }, + { + "epoch": 8.125, + "loss": 0.564851701259613, + "loss_ce": 0.541170060634613, + "loss_xval": 0.023681640625, + "num_input_tokens_seen": 9002176, + "step": 130 + }, + { + "epoch": 8.1875, + "grad_norm": 74.80583526977246, + "learning_rate": 4.414771195273343e-05, + "loss": 0.5773, + "num_input_tokens_seen": 9073856, + "step": 131 + }, + { + "epoch": 8.1875, + "loss": 0.5918972492218018, + "loss_ce": 0.5333034992218018, + "loss_xval": 0.05859375, + "num_input_tokens_seen": 9073856, + "step": 131 + }, + { + "epoch": 8.25, + "grad_norm": 36.13689427412808, + "learning_rate": 4.421657596805787e-05, + "loss": 0.5285, + "num_input_tokens_seen": 9145536, + "step": 132 + }, + { + "epoch": 8.25, + "loss": 0.5381279587745667, + "loss_ce": 0.5160332322120667, + "loss_xval": 0.0220947265625, + "num_input_tokens_seen": 9145536, + "step": 132 + }, + { + "epoch": 8.3125, + "grad_norm": 36.18830775008529, + "learning_rate": 4.4284920250012015e-05, + "loss": 0.5293, + "num_input_tokens_seen": 9217088, + "step": 133 + }, + { + "epoch": 8.3125, + "loss": 
0.5235571265220642, + "loss_ce": 0.5017065405845642, + "loss_xval": 0.0218505859375, + "num_input_tokens_seen": 9217088, + "step": 133 + }, + { + "epoch": 8.375, + "grad_norm": 29.863124074710544, + "learning_rate": 4.435275258497362e-05, + "loss": 0.5279, + "num_input_tokens_seen": 9288704, + "step": 134 + }, + { + "epoch": 8.375, + "loss": 0.5201796293258667, + "loss_ce": 0.5000380277633667, + "loss_xval": 0.0201416015625, + "num_input_tokens_seen": 9288704, + "step": 134 + }, + { + "epoch": 8.4375, + "grad_norm": 26.175864599633144, + "learning_rate": 4.4420080585643395e-05, + "loss": 0.5015, + "num_input_tokens_seen": 9347904, + "step": 135 + }, + { + "epoch": 8.4375, + "loss": 0.48246899247169495, + "loss_ce": 0.46836987137794495, + "loss_xval": 0.01409912109375, + "num_input_tokens_seen": 9347904, + "step": 135 + }, + { + "epoch": 8.5, + "grad_norm": 41.789246513263144, + "learning_rate": 4.4486911696172015e-05, + "loss": 0.4809, + "num_input_tokens_seen": 9419648, + "step": 136 + }, + { + "epoch": 8.5, + "loss": 0.48507535457611084, + "loss_ce": 0.46285855770111084, + "loss_xval": 0.022216796875, + "num_input_tokens_seen": 9419648, + "step": 136 + }, + { + "epoch": 8.5625, + "grad_norm": 12.475484859727866, + "learning_rate": 4.4553253197099536e-05, + "loss": 0.4717, + "num_input_tokens_seen": 9491200, + "step": 137 + }, + { + "epoch": 8.5625, + "loss": 0.4641610085964203, + "loss_ce": 0.4516488015651703, + "loss_xval": 0.01251220703125, + "num_input_tokens_seen": 9491200, + "step": 137 + }, + { + "epoch": 8.625, + "grad_norm": 57.1829240329704, + "learning_rate": 4.461911221011503e-05, + "loss": 0.4917, + "num_input_tokens_seen": 9550400, + "step": 138 + }, + { + "epoch": 8.625, + "loss": 0.500900149345398, + "loss_ce": 0.45500174164772034, + "loss_xval": 0.0458984375, + "num_input_tokens_seen": 9550400, + "step": 138 + }, + { + "epoch": 8.6875, + "grad_norm": 15.812633321792019, + "learning_rate": 4.4684495702644406e-05, + "loss": 0.4351, + "num_input_tokens_seen": 9622080, + "step": 139 + }, + { + "epoch": 8.6875, + "loss": 0.4303787052631378, + "loss_ce": 0.4198806583881378, + "loss_xval": 0.010498046875, + "num_input_tokens_seen": 9622080, + "step": 139 + }, + { + "epoch": 8.75, + "grad_norm": 33.83518875566213, + "learning_rate": 4.474941049227392e-05, + "loss": 0.4621, + "num_input_tokens_seen": 9693760, + "step": 140 + }, + { + "epoch": 8.75, + "loss": 0.45098790526390076, + "loss_ce": 0.42376622557640076, + "loss_xval": 0.0272216796875, + "num_input_tokens_seen": 9693760, + "step": 140 + }, + { + "epoch": 8.8125, + "grad_norm": 40.96321239486865, + "learning_rate": 4.481386325101608e-05, + "loss": 0.4164, + "num_input_tokens_seen": 9765440, + "step": 141 + }, + { + "epoch": 8.8125, + "loss": 0.41145414113998413, + "loss_ce": 0.38545316457748413, + "loss_xval": 0.0260009765625, + "num_input_tokens_seen": 9765440, + "step": 141 + }, + { + "epoch": 8.875, + "grad_norm": 37.39520904229126, + "learning_rate": 4.48778605094249e-05, + "loss": 0.3979, + "num_input_tokens_seen": 9837184, + "step": 142 + }, + { + "epoch": 8.875, + "loss": 0.4080401062965393, + "loss_ce": 0.3820391297340393, + "loss_xval": 0.0260009765625, + "num_input_tokens_seen": 9837184, + "step": 142 + }, + { + "epoch": 8.9375, + "grad_norm": 34.52606007669613, + "learning_rate": 4.494140866056678e-05, + "loss": 0.3795, + "num_input_tokens_seen": 9908864, + "step": 143 + }, + { + "epoch": 8.9375, + "loss": 0.3671216368675232, + "loss_ce": 0.3521069884300232, + "loss_xval": 0.0150146484375, + 
"num_input_tokens_seen": 9908864, + "step": 143 + }, + { + "epoch": 9.0, + "grad_norm": 30.32849867940823, + "learning_rate": 4.5004513963852995e-05, + "loss": 0.373, + "num_input_tokens_seen": 9967936, + "step": 144 + }, + { + "epoch": 9.0, + "loss": 0.37388211488723755, + "loss_ce": 0.34702664613723755, + "loss_xval": 0.02685546875, + "num_input_tokens_seen": 9967936, + "step": 144 + }, + { + "epoch": 9.0625, + "grad_norm": 33.853748342915004, + "learning_rate": 4.5067182548739526e-05, + "loss": 0.3726, + "num_input_tokens_seen": 10039552, + "step": 145 + }, + { + "epoch": 9.0625, + "loss": 0.370542973279953, + "loss_ce": 0.351500004529953, + "loss_xval": 0.01904296875, + "num_input_tokens_seen": 10039552, + "step": 145 + }, + { + "epoch": 9.125, + "grad_norm": 38.626009081815425, + "learning_rate": 4.5129420418299804e-05, + "loss": 0.3515, + "num_input_tokens_seen": 10111168, + "step": 146 + }, + { + "epoch": 9.125, + "loss": 0.3514055907726288, + "loss_ce": 0.3271135985851288, + "loss_xval": 0.0242919921875, + "num_input_tokens_seen": 10111168, + "step": 146 + }, + { + "epoch": 9.1875, + "grad_norm": 28.328723972964454, + "learning_rate": 4.519123345267552e-05, + "loss": 0.3439, + "num_input_tokens_seen": 10182848, + "step": 147 + }, + { + "epoch": 9.1875, + "loss": 0.3309248387813568, + "loss_ce": 0.3159712255001068, + "loss_xval": 0.01495361328125, + "num_input_tokens_seen": 10182848, + "step": 147 + }, + { + "epoch": 9.25, + "grad_norm": 35.40827826279614, + "learning_rate": 4.5252627412410396e-05, + "loss": 0.3319, + "num_input_tokens_seen": 10241856, + "step": 148 + }, + { + "epoch": 9.25, + "loss": 0.33163246512413025, + "loss_ce": 0.31014809012413025, + "loss_xval": 0.021484375, + "num_input_tokens_seen": 10241856, + "step": 148 + }, + { + "epoch": 9.3125, + "grad_norm": 30.87887626218836, + "learning_rate": 4.531360794167177e-05, + "loss": 0.3193, + "num_input_tokens_seen": 10313664, + "step": 149 + }, + { + "epoch": 9.3125, + "loss": 0.33782634139060974, + "loss_ce": 0.30852946639060974, + "loss_xval": 0.029296875, + "num_input_tokens_seen": 10313664, + "step": 149 + }, + { + "epoch": 9.375, + "grad_norm": 34.69416994013918, + "learning_rate": 4.537418057136437e-05, + "loss": 0.3064, + "num_input_tokens_seen": 10385408, + "step": 150 + }, + { + "epoch": 9.375, + "loss": 0.30474743247032166, + "loss_ce": 0.29302868247032166, + "loss_xval": 0.01171875, + "num_input_tokens_seen": 10385408, + "step": 150 + }, + { + "epoch": 9.4375, + "grad_norm": 16.01660241849197, + "learning_rate": 4.543435072214071e-05, + "loss": 0.2817, + "num_input_tokens_seen": 10456960, + "step": 151 + }, + { + "epoch": 9.4375, + "loss": 0.2802174985408783, + "loss_ce": 0.2626393735408783, + "loss_xval": 0.017578125, + "num_input_tokens_seen": 10456960, + "step": 151 + }, + { + "epoch": 9.5, + "grad_norm": 12.08386534630288, + "learning_rate": 4.549412370731206e-05, + "loss": 0.2964, + "num_input_tokens_seen": 10528576, + "step": 152 + }, + { + "epoch": 9.5, + "loss": 0.2941746115684509, + "loss_ce": 0.2757419943809509, + "loss_xval": 0.0184326171875, + "num_input_tokens_seen": 10528576, + "step": 152 + }, + { + "epoch": 9.5625, + "grad_norm": 46.64162266885969, + "learning_rate": 4.555350473566404e-05, + "loss": 0.2922, + "num_input_tokens_seen": 10600192, + "step": 153 + }, + { + "epoch": 9.5625, + "loss": 0.29297852516174316, + "loss_ce": 0.26148438453674316, + "loss_xval": 0.031494140625, + "num_input_tokens_seen": 10600192, + "step": 153 + }, + { + "epoch": 9.625, + "grad_norm": 18.386857979962915, + 
"learning_rate": 4.561249891418045e-05, + "loss": 0.2874, + "num_input_tokens_seen": 10671872, + "step": 154 + }, + { + "epoch": 9.625, + "loss": 0.2867387533187866, + "loss_ce": 0.2618364095687866, + "loss_xval": 0.02490234375, + "num_input_tokens_seen": 10671872, + "step": 154 + }, + { + "epoch": 9.6875, + "grad_norm": 95.87776763441651, + "learning_rate": 4.5671111250678913e-05, + "loss": 0.291, + "num_input_tokens_seen": 10743552, + "step": 155 + }, + { + "epoch": 9.6875, + "loss": 0.28771263360977173, + "loss_ce": 0.24157007038593292, + "loss_xval": 0.046142578125, + "num_input_tokens_seen": 10743552, + "step": 155 + }, + { + "epoch": 9.75, + "grad_norm": 44.96972476442496, + "learning_rate": 4.572934665636191e-05, + "loss": 0.2742, + "num_input_tokens_seen": 10815296, + "step": 156 + }, + { + "epoch": 9.75, + "loss": 0.27817845344543457, + "loss_ce": 0.24399876594543457, + "loss_xval": 0.0341796875, + "num_input_tokens_seen": 10815296, + "step": 156 + }, + { + "epoch": 9.8125, + "grad_norm": 56.099205952139485, + "learning_rate": 4.5787209948286147e-05, + "loss": 0.2734, + "num_input_tokens_seen": 10886976, + "step": 157 + }, + { + "epoch": 9.8125, + "loss": 0.26927852630615234, + "loss_ce": 0.23485469818115234, + "loss_xval": 0.034423828125, + "num_input_tokens_seen": 10886976, + "step": 157 + }, + { + "epoch": 9.875, + "grad_norm": 41.24759861660678, + "learning_rate": 4.5844705851753643e-05, + "loss": 0.2707, + "num_input_tokens_seen": 10958720, + "step": 158 + }, + { + "epoch": 9.875, + "loss": 0.2965546250343323, + "loss_ce": 0.23698429763317108, + "loss_xval": 0.0595703125, + "num_input_tokens_seen": 10958720, + "step": 158 + }, + { + "epoch": 9.9375, + "grad_norm": 45.405964399299954, + "learning_rate": 4.59018390026273e-05, + "loss": 0.2736, + "num_input_tokens_seen": 11030272, + "step": 159 + }, + { + "epoch": 9.9375, + "loss": 0.25298789143562317, + "loss_ce": 0.22674277424812317, + "loss_xval": 0.0262451171875, + "num_input_tokens_seen": 11030272, + "step": 159 + }, + { + "epoch": 10.0, + "grad_norm": 76.86054524094995, + "learning_rate": 4.5958613949573976e-05, + "loss": 0.2655, + "num_input_tokens_seen": 11101952, + "step": 160 + }, + { + "epoch": 10.0, + "loss": 0.2796672284603119, + "loss_ce": 0.2396281659603119, + "loss_xval": 0.0400390625, + "num_input_tokens_seen": 11101952, + "step": 160 + }, + { + "epoch": 10.0625, + "grad_norm": 19.932279617630357, + "learning_rate": 4.6015035156237594e-05, + "loss": 0.2454, + "num_input_tokens_seen": 11161024, + "step": 161 + }, + { + "epoch": 10.0625, + "loss": 0.2661590576171875, + "loss_ce": 0.2231903076171875, + "loss_xval": 0.04296875, + "num_input_tokens_seen": 11161024, + "step": 161 + }, + { + "epoch": 10.125, + "grad_norm": 135.34396180408, + "learning_rate": 4.607110700334503e-05, + "loss": 0.2876, + "num_input_tokens_seen": 11232704, + "step": 162 + }, + { + "epoch": 10.125, + "loss": 0.2808167040348053, + "loss_ce": 0.2114807665348053, + "loss_xval": 0.0693359375, + "num_input_tokens_seen": 11232704, + "step": 162 + }, + { + "epoch": 10.1875, + "grad_norm": 70.37309365651035, + "learning_rate": 4.612683379074717e-05, + "loss": 0.2476, + "num_input_tokens_seen": 11304320, + "step": 163 + }, + { + "epoch": 10.1875, + "loss": 0.25540295243263245, + "loss_ce": 0.21634045243263245, + "loss_xval": 0.0390625, + "num_input_tokens_seen": 11304320, + "step": 163 + }, + { + "epoch": 10.25, + "grad_norm": 102.89804491544115, + "learning_rate": 4.6182219739397555e-05, + "loss": 0.2656, + "num_input_tokens_seen": 11376000, + 
"step": 164 + }, + { + "epoch": 10.25, + "loss": 0.2687593102455139, + "loss_ce": 0.21871048212051392, + "loss_xval": 0.050048828125, + "num_input_tokens_seen": 11376000, + "step": 164 + }, + { + "epoch": 10.3125, + "grad_norm": 137.28397466828608, + "learning_rate": 4.623726899327088e-05, + "loss": 0.2716, + "num_input_tokens_seen": 11447616, + "step": 165 + }, + { + "epoch": 10.3125, + "loss": 0.2575090527534485, + "loss_ce": 0.1998918503522873, + "loss_xval": 0.0576171875, + "num_input_tokens_seen": 11447616, + "step": 165 + }, + { + "epoch": 10.375, + "grad_norm": 26.430941270572102, + "learning_rate": 4.62919856212236e-05, + "loss": 0.2221, + "num_input_tokens_seen": 11519232, + "step": 166 + }, + { + "epoch": 10.375, + "loss": 0.21056154370307922, + "loss_ce": 0.19273927807807922, + "loss_xval": 0.017822265625, + "num_input_tokens_seen": 11519232, + "step": 166 + }, + { + "epoch": 10.4375, + "grad_norm": 130.14886600445317, + "learning_rate": 4.6346373618798503e-05, + "loss": 0.2633, + "num_input_tokens_seen": 11591040, + "step": 167 + }, + { + "epoch": 10.4375, + "loss": 0.277815043926239, + "loss_ce": 0.201643168926239, + "loss_xval": 0.076171875, + "num_input_tokens_seen": 11591040, + "step": 167 + }, + { + "epoch": 10.5, + "grad_norm": 15.36888208828194, + "learning_rate": 4.640043690997557e-05, + "loss": 0.2105, + "num_input_tokens_seen": 11650048, + "step": 168 + }, + { + "epoch": 10.5, + "loss": 0.21321672201156616, + "loss_ce": 0.19209855794906616, + "loss_xval": 0.0211181640625, + "num_input_tokens_seen": 11650048, + "step": 168 + }, + { + "epoch": 10.5625, + "grad_norm": 129.1002672969776, + "learning_rate": 4.6454179348870823e-05, + "loss": 0.2569, + "num_input_tokens_seen": 11721664, + "step": 169 + }, + { + "epoch": 10.5625, + "loss": 0.25425177812576294, + "loss_ce": 0.19126349687576294, + "loss_xval": 0.06298828125, + "num_input_tokens_seen": 11721664, + "step": 169 + }, + { + "epoch": 10.625, + "grad_norm": 113.95606982901108, + "learning_rate": 4.650760472138503e-05, + "loss": 0.2451, + "num_input_tokens_seen": 11793408, + "step": 170 + }, + { + "epoch": 10.625, + "loss": 0.2487194985151291, + "loss_ce": 0.1932995766401291, + "loss_xval": 0.055419921875, + "num_input_tokens_seen": 11793408, + "step": 170 + }, + { + "epoch": 10.6875, + "grad_norm": 9.33210610394722, + "learning_rate": 4.65607167468041e-05, + "loss": 0.2122, + "num_input_tokens_seen": 11852480, + "step": 171 + }, + { + "epoch": 10.6875, + "loss": 0.20928899943828583, + "loss_ce": 0.18218939006328583, + "loss_xval": 0.027099609375, + "num_input_tokens_seen": 11852480, + "step": 171 + }, + { + "epoch": 10.75, + "grad_norm": 72.11959352369618, + "learning_rate": 4.66135190793528e-05, + "loss": 0.2087, + "num_input_tokens_seen": 11924032, + "step": 172 + }, + { + "epoch": 10.75, + "loss": 0.2318447381258011, + "loss_ce": 0.1837490350008011, + "loss_xval": 0.048095703125, + "num_input_tokens_seen": 11924032, + "step": 172 + }, + { + "epoch": 10.8125, + "grad_norm": 47.15140710616073, + "learning_rate": 4.666601530970347e-05, + "loss": 0.1902, + "num_input_tokens_seen": 11995712, + "step": 173 + }, + { + "epoch": 10.8125, + "loss": 0.1904245764017105, + "loss_ce": 0.1722360998392105, + "loss_xval": 0.0181884765625, + "num_input_tokens_seen": 11995712, + "step": 173 + }, + { + "epoch": 10.875, + "grad_norm": 10.571062509972561, + "learning_rate": 4.6718208966441165e-05, + "loss": 0.1882, + "num_input_tokens_seen": 12067392, + "step": 174 + }, + { + "epoch": 10.875, + "loss": 0.1886989027261734, + "loss_ce": 
0.1755153089761734, + "loss_xval": 0.01318359375, + "num_input_tokens_seen": 12067392, + "step": 174 + }, + { + "epoch": 10.9375, + "grad_norm": 46.46707211476998, + "learning_rate": 4.677010351748695e-05, + "loss": 0.1882, + "num_input_tokens_seen": 12139072, + "step": 175 + }, + { + "epoch": 10.9375, + "loss": 0.1885238140821457, + "loss_ce": 0.1647201031446457, + "loss_xval": 0.0238037109375, + "num_input_tokens_seen": 12139072, + "step": 175 + }, + { + "epoch": 11.0, + "grad_norm": 42.51412908712922, + "learning_rate": 4.682170237148049e-05, + "loss": 0.1922, + "num_input_tokens_seen": 12210624, + "step": 176 + }, + { + "epoch": 11.0, + "loss": 0.1938706487417221, + "loss_ce": 0.1682358831167221, + "loss_xval": 0.025634765625, + "num_input_tokens_seen": 12210624, + "step": 176 + }, + { + "epoch": 11.0625, + "grad_norm": 20.645324063063473, + "learning_rate": 4.6873008879123683e-05, + "loss": 0.1954, + "num_input_tokens_seen": 12269696, + "step": 177 + }, + { + "epoch": 11.0625, + "loss": 0.1937786191701889, + "loss_ce": 0.1749797910451889, + "loss_xval": 0.018798828125, + "num_input_tokens_seen": 12269696, + "step": 177 + }, + { + "epoch": 11.125, + "grad_norm": 75.87090591068548, + "learning_rate": 4.692402633448617e-05, + "loss": 0.1903, + "num_input_tokens_seen": 12341376, + "step": 178 + }, + { + "epoch": 11.125, + "loss": 0.1830907016992569, + "loss_ce": 0.1630711704492569, + "loss_xval": 0.02001953125, + "num_input_tokens_seen": 12341376, + "step": 178 + }, + { + "epoch": 11.1875, + "grad_norm": 39.05028784592837, + "learning_rate": 4.6974757976274554e-05, + "loss": 0.1839, + "num_input_tokens_seen": 12387840, + "step": 179 + }, + { + "epoch": 11.1875, + "loss": 0.19825224578380585, + "loss_ce": 0.17273955047130585, + "loss_xval": 0.0255126953125, + "num_input_tokens_seen": 12387840, + "step": 179 + }, + { + "epoch": 11.25, + "grad_norm": 59.302577487431336, + "learning_rate": 4.7025206989066015e-05, + "loss": 0.1898, + "num_input_tokens_seen": 12446912, + "step": 180 + }, + { + "epoch": 11.25, + "loss": 0.18680866062641144, + "loss_ce": 0.16556842625141144, + "loss_xval": 0.021240234375, + "num_input_tokens_seen": 12446912, + "step": 180 + }, + { + "epoch": 11.3125, + "grad_norm": 132.3733658251234, + "learning_rate": 4.7075376504507956e-05, + "loss": 0.2344, + "num_input_tokens_seen": 12518528, + "step": 181 + }, + { + "epoch": 11.3125, + "loss": 0.22184261679649353, + "loss_ce": 0.16031917929649353, + "loss_xval": 0.0615234375, + "num_input_tokens_seen": 12518528, + "step": 181 + }, + { + "epoch": 11.375, + "grad_norm": 123.63188526561277, + "learning_rate": 4.7125269602484475e-05, + "loss": 0.2202, + "num_input_tokens_seen": 12590272, + "step": 182 + }, + { + "epoch": 11.375, + "loss": 0.1961844116449356, + "loss_ce": 0.1507742553949356, + "loss_xval": 0.04541015625, + "num_input_tokens_seen": 12590272, + "step": 182 + }, + { + "epoch": 11.4375, + "grad_norm": 28.488808197631567, + "learning_rate": 4.717488931225096e-05, + "loss": 0.1636, + "num_input_tokens_seen": 12661952, + "step": 183 + }, + { + "epoch": 11.4375, + "loss": 0.15562567114830017, + "loss_ce": 0.14347967505455017, + "loss_xval": 0.01214599609375, + "num_input_tokens_seen": 12661952, + "step": 183 + }, + { + "epoch": 11.5, + "grad_norm": 84.56249611227895, + "learning_rate": 4.722423861353765e-05, + "loss": 0.1893, + "num_input_tokens_seen": 12733504, + "step": 184 + }, + { + "epoch": 11.5, + "loss": 0.19987738132476807, + "loss_ce": 0.15349066257476807, + "loss_xval": 0.04638671875, + "num_input_tokens_seen": 
12733504, + "step": 184 + }, + { + "epoch": 11.5625, + "grad_norm": 152.90061856756552, + "learning_rate": 4.727332043762341e-05, + "loss": 0.2362, + "num_input_tokens_seen": 12805120, + "step": 185 + }, + { + "epoch": 11.5625, + "loss": 0.22141288220882416, + "loss_ce": 0.14963553845882416, + "loss_xval": 0.07177734375, + "num_input_tokens_seen": 12805120, + "step": 185 + }, + { + "epoch": 11.625, + "grad_norm": 110.11923740163819, + "learning_rate": 4.732213766838056e-05, + "loss": 0.2016, + "num_input_tokens_seen": 12876800, + "step": 186 + }, + { + "epoch": 11.625, + "loss": 0.197799414396286, + "loss_ce": 0.158004492521286, + "loss_xval": 0.039794921875, + "num_input_tokens_seen": 12876800, + "step": 186 + }, + { + "epoch": 11.6875, + "grad_norm": 14.134948821400592, + "learning_rate": 4.7370693143291545e-05, + "loss": 0.1698, + "num_input_tokens_seen": 12936000, + "step": 187 + }, + { + "epoch": 11.6875, + "loss": 0.17875821888446808, + "loss_ce": 0.15483243763446808, + "loss_xval": 0.02392578125, + "num_input_tokens_seen": 12936000, + "step": 187 + }, + { + "epoch": 11.75, + "grad_norm": 97.72533976836097, + "learning_rate": 4.74189896544387e-05, + "loss": 0.1821, + "num_input_tokens_seen": 13007680, + "step": 188 + }, + { + "epoch": 11.75, + "loss": 0.17285184562206268, + "loss_ce": 0.14892606437206268, + "loss_xval": 0.02392578125, + "num_input_tokens_seen": 13007680, + "step": 188 + }, + { + "epoch": 11.8125, + "grad_norm": 101.84921282322784, + "learning_rate": 4.746702994946761e-05, + "loss": 0.1832, + "num_input_tokens_seen": 13066880, + "step": 189 + }, + { + "epoch": 11.8125, + "loss": 0.19510142505168915, + "loss_ce": 0.13504283130168915, + "loss_xval": 0.06005859375, + "num_input_tokens_seen": 13066880, + "step": 189 + }, + { + "epoch": 11.875, + "grad_norm": 24.2688075939221, + "learning_rate": 4.7514816732525075e-05, + "loss": 0.1571, + "num_input_tokens_seen": 13138432, + "step": 190 + }, + { + "epoch": 11.875, + "loss": 0.158127099275589, + "loss_ce": 0.143905907869339, + "loss_xval": 0.01422119140625, + "num_input_tokens_seen": 13138432, + "step": 190 + }, + { + "epoch": 11.9375, + "grad_norm": 78.76517648907875, + "learning_rate": 4.7562352665172554e-05, + "loss": 0.175, + "num_input_tokens_seen": 13209984, + "step": 191 + }, + { + "epoch": 11.9375, + "loss": 0.18069201707839966, + "loss_ce": 0.15285998582839966, + "loss_xval": 0.02783203125, + "num_input_tokens_seen": 13209984, + "step": 191 + }, + { + "epoch": 12.0, + "grad_norm": 149.9895595333202, + "learning_rate": 4.760964036727562e-05, + "loss": 0.2194, + "num_input_tokens_seen": 13281664, + "step": 192 + }, + { + "epoch": 12.0, + "loss": 0.2099841833114624, + "loss_ce": 0.1377185583114624, + "loss_xval": 0.072265625, + "num_input_tokens_seen": 13281664, + "step": 192 + }, + { + "epoch": 12.0625, + "grad_norm": 146.9176107213647, + "learning_rate": 4.765668241787041e-05, + "loss": 0.2137, + "num_input_tokens_seen": 13353216, + "step": 193 + }, + { + "epoch": 12.0625, + "loss": 0.2261110544204712, + "loss_ce": 0.1435915231704712, + "loss_xval": 0.08251953125, + "num_input_tokens_seen": 13353216, + "step": 193 + }, + { + "epoch": 12.125, + "grad_norm": 56.898685367206404, + "learning_rate": 4.7703481356007625e-05, + "loss": 0.1468, + "num_input_tokens_seen": 13424832, + "step": 194 + }, + { + "epoch": 12.125, + "loss": 0.14375604689121246, + "loss_ce": 0.12715448439121246, + "loss_xval": 0.0166015625, + "num_input_tokens_seen": 13424832, + "step": 194 + }, + { + "epoch": 12.1875, + "grad_norm": 
39.22445120701935, + "learning_rate": 4.775003968157493e-05, + "loss": 0.1552, + "num_input_tokens_seen": 13496512, + "step": 195 + }, + { + "epoch": 12.1875, + "loss": 0.14577074348926544, + "loss_ce": 0.13057298958301544, + "loss_xval": 0.01519775390625, + "num_input_tokens_seen": 13496512, + "step": 195 + }, + { + "epoch": 12.25, + "grad_norm": 93.17339739911861, + "learning_rate": 4.779635985609814e-05, + "loss": 0.1683, + "num_input_tokens_seen": 13568128, + "step": 196 + }, + { + "epoch": 12.25, + "loss": 0.16052621603012085, + "loss_ce": 0.12854379415512085, + "loss_xval": 0.031982421875, + "num_input_tokens_seen": 13568128, + "step": 196 + }, + { + "epoch": 12.3125, + "grad_norm": 102.75877130957683, + "learning_rate": 4.7842444303522264e-05, + "loss": 0.1737, + "num_input_tokens_seen": 13639808, + "step": 197 + }, + { + "epoch": 12.3125, + "loss": 0.17900289595127106, + "loss_ce": 0.13115133345127106, + "loss_xval": 0.0478515625, + "num_input_tokens_seen": 13639808, + "step": 197 + }, + { + "epoch": 12.375, + "grad_norm": 84.31936070008948, + "learning_rate": 4.7888295410972525e-05, + "loss": 0.1567, + "num_input_tokens_seen": 13711488, + "step": 198 + }, + { + "epoch": 12.375, + "loss": 0.15471437573432922, + "loss_ce": 0.11906984448432922, + "loss_xval": 0.03564453125, + "num_input_tokens_seen": 13711488, + "step": 198 + }, + { + "epoch": 12.4375, + "grad_norm": 71.28851686781569, + "learning_rate": 4.793391552949641e-05, + "loss": 0.1591, + "num_input_tokens_seen": 13770560, + "step": 199 + }, + { + "epoch": 12.4375, + "loss": 0.15467874705791473, + "loss_ce": 0.13002054393291473, + "loss_xval": 0.024658203125, + "num_input_tokens_seen": 13770560, + "step": 199 + }, + { + "epoch": 12.5, + "grad_norm": 76.02012107203956, + "learning_rate": 4.797930697478699e-05, + "loss": 0.1488, + "num_input_tokens_seen": 13842176, + "step": 200 + }, + { + "epoch": 12.5, + "loss": 0.1352391541004181, + "loss_ce": 0.1154637560248375, + "loss_xval": 0.019775390625, + "num_input_tokens_seen": 13842176, + "step": 200 + }, + { + "epoch": 12.5625, + "grad_norm": 87.7457533680535, + "learning_rate": 4.8024472027888286e-05, + "loss": 0.1727, + "num_input_tokens_seen": 13901248, + "step": 201 + }, + { + "epoch": 12.5625, + "loss": 0.16884461045265198, + "loss_ce": 0.12782898545265198, + "loss_xval": 0.041015625, + "num_input_tokens_seen": 13901248, + "step": 201 + }, + { + "epoch": 12.625, + "grad_norm": 92.68577177289325, + "learning_rate": 4.806941293588307e-05, + "loss": 0.1663, + "num_input_tokens_seen": 13972928, + "step": 202 + }, + { + "epoch": 12.625, + "loss": 0.1651129573583603, + "loss_ce": 0.1262945979833603, + "loss_xval": 0.038818359375, + "num_input_tokens_seen": 13972928, + "step": 202 + }, + { + "epoch": 12.6875, + "grad_norm": 130.08538896402837, + "learning_rate": 4.811413191256374e-05, + "loss": 0.1892, + "num_input_tokens_seen": 14044480, + "step": 203 + }, + { + "epoch": 12.6875, + "loss": 0.17750969529151917, + "loss_ce": 0.12038079649209976, + "loss_xval": 0.05712890625, + "num_input_tokens_seen": 14044480, + "step": 203 + }, + { + "epoch": 12.75, + "grad_norm": 182.930050494289, + "learning_rate": 4.815863113908667e-05, + "loss": 0.2198, + "num_input_tokens_seen": 14116160, + "step": 204 + }, + { + "epoch": 12.75, + "loss": 0.21291296184062958, + "loss_ce": 0.11086218059062958, + "loss_xval": 0.10205078125, + "num_input_tokens_seen": 14116160, + "step": 204 + }, + { + "epoch": 12.8125, + "grad_norm": 169.78449355568006, + "learning_rate": 4.820291276461056e-05, + "loss": 0.2232, 
+ "num_input_tokens_seen": 14187840, + "step": 205 + }, + { + "epoch": 12.8125, + "loss": 0.22619017958641052, + "loss_ce": 0.13732299208641052, + "loss_xval": 0.0888671875, + "num_input_tokens_seen": 14187840, + "step": 205 + }, + { + "epoch": 12.875, + "grad_norm": 77.39150620613387, + "learning_rate": 4.82469789069193e-05, + "loss": 0.1645, + "num_input_tokens_seen": 14259584, + "step": 206 + }, + { + "epoch": 12.875, + "loss": 0.16739881038665771, + "loss_ce": 0.12980115413665771, + "loss_xval": 0.03759765625, + "num_input_tokens_seen": 14259584, + "step": 206 + }, + { + "epoch": 12.9375, + "grad_norm": 59.71568867665327, + "learning_rate": 4.829083165302968e-05, + "loss": 0.138, + "num_input_tokens_seen": 14318656, + "step": 207 + }, + { + "epoch": 12.9375, + "loss": 0.14700518548488617, + "loss_ce": 0.12198077142238617, + "loss_xval": 0.0250244140625, + "num_input_tokens_seen": 14318656, + "step": 207 + }, + { + "epoch": 13.0, + "grad_norm": 176.08463177627178, + "learning_rate": 4.833447305978453e-05, + "loss": 0.2277, + "num_input_tokens_seen": 14390272, + "step": 208 + }, + { + "epoch": 13.0, + "loss": 0.21075105667114258, + "loss_ce": 0.12530183792114258, + "loss_xval": 0.08544921875, + "num_input_tokens_seen": 14390272, + "step": 208 + }, + { + "epoch": 13.0625, + "grad_norm": 207.07064896762472, + "learning_rate": 4.83779051544316e-05, + "loss": 0.2544, + "num_input_tokens_seen": 14462080, + "step": 209 + }, + { + "epoch": 13.0625, + "loss": 0.2749811112880707, + "loss_ce": 0.12752017378807068, + "loss_xval": 0.1474609375, + "num_input_tokens_seen": 14462080, + "step": 209 + }, + { + "epoch": 13.125, + "grad_norm": 152.6246485649904, + "learning_rate": 4.842112993518858e-05, + "loss": 0.2035, + "num_input_tokens_seen": 14521152, + "step": 210 + }, + { + "epoch": 13.125, + "loss": 0.21896807849407196, + "loss_ce": 0.13303057849407196, + "loss_xval": 0.0859375, + "num_input_tokens_seen": 14521152, + "step": 210 + }, + { + "epoch": 13.1875, + "grad_norm": 59.82558956835435, + "learning_rate": 4.846414937179484e-05, + "loss": 0.1316, + "num_input_tokens_seen": 14592832, + "step": 211 + }, + { + "epoch": 13.1875, + "loss": 0.12381850928068161, + "loss_ce": 0.10794936865568161, + "loss_xval": 0.015869140625, + "num_input_tokens_seen": 14592832, + "step": 211 + }, + { + "epoch": 13.25, + "grad_norm": 25.64597703986726, + "learning_rate": 4.850696540604993e-05, + "loss": 0.1209, + "num_input_tokens_seen": 14651840, + "step": 212 + }, + { + "epoch": 13.25, + "loss": 0.11560696363449097, + "loss_ce": 0.10480374097824097, + "loss_xval": 0.01080322265625, + "num_input_tokens_seen": 14651840, + "step": 212 + }, + { + "epoch": 13.3125, + "grad_norm": 69.3914547109082, + "learning_rate": 4.8549579952339555e-05, + "loss": 0.1522, + "num_input_tokens_seen": 14723456, + "step": 213 + }, + { + "epoch": 13.3125, + "loss": 0.15366515517234802, + "loss_ce": 0.11753234267234802, + "loss_xval": 0.0361328125, + "num_input_tokens_seen": 14723456, + "step": 213 + }, + { + "epoch": 13.375, + "grad_norm": 71.26381233983959, + "learning_rate": 4.859199489814922e-05, + "loss": 0.1408, + "num_input_tokens_seen": 14782720, + "step": 214 + }, + { + "epoch": 13.375, + "loss": 0.13961592316627502, + "loss_ce": 0.11434736102819443, + "loss_xval": 0.0252685546875, + "num_input_tokens_seen": 14782720, + "step": 214 + }, + { + "epoch": 13.4375, + "grad_norm": 87.29598783104798, + "learning_rate": 4.863421210456582e-05, + "loss": 0.1467, + "num_input_tokens_seen": 14854400, + "step": 215 + }, + { + "epoch": 13.4375, + 
"loss": 0.15197014808654785, + "loss_ce": 0.11217523366212845, + "loss_xval": 0.039794921875, + "num_input_tokens_seen": 14854400, + "step": 215 + }, + { + "epoch": 13.5, + "grad_norm": 137.423748091418, + "learning_rate": 4.8676233406767654e-05, + "loss": 0.183, + "num_input_tokens_seen": 14913536, + "step": 216 + }, + { + "epoch": 13.5, + "loss": 0.16861703991889954, + "loss_ce": 0.10514048486948013, + "loss_xval": 0.0634765625, + "num_input_tokens_seen": 14913536, + "step": 216 + }, + { + "epoch": 13.5625, + "grad_norm": 194.04712419667922, + "learning_rate": 4.871806061450313e-05, + "loss": 0.2344, + "num_input_tokens_seen": 14985152, + "step": 217 + }, + { + "epoch": 13.5625, + "loss": 0.23106953501701355, + "loss_ce": 0.11144062131643295, + "loss_xval": 0.11962890625, + "num_input_tokens_seen": 14985152, + "step": 217 + }, + { + "epoch": 13.625, + "grad_norm": 236.6670233746774, + "learning_rate": 4.875969551255842e-05, + "loss": 0.3039, + "num_input_tokens_seen": 15044224, + "step": 218 + }, + { + "epoch": 13.625, + "loss": 0.329833984375, + "loss_ce": 0.1237793043255806, + "loss_xval": 0.2060546875, + "num_input_tokens_seen": 15044224, + "step": 218 + }, + { + "epoch": 13.6875, + "grad_norm": 222.73052377666872, + "learning_rate": 4.8801139861214464e-05, + "loss": 0.2694, + "num_input_tokens_seen": 15103296, + "step": 219 + }, + { + "epoch": 13.6875, + "loss": 0.2580621540546417, + "loss_ce": 0.10669496655464172, + "loss_xval": 0.1513671875, + "num_input_tokens_seen": 15103296, + "step": 219 + }, + { + "epoch": 13.75, + "grad_norm": 134.04747220394003, + "learning_rate": 4.884239539669351e-05, + "loss": 0.1776, + "num_input_tokens_seen": 15162368, + "step": 220 + }, + { + "epoch": 13.75, + "loss": 0.1807592511177063, + "loss_ce": 0.1089818999171257, + "loss_xval": 0.07177734375, + "num_input_tokens_seen": 15162368, + "step": 220 + }, + { + "epoch": 13.8125, + "grad_norm": 8.718688250244332, + "learning_rate": 4.8883463831595575e-05, + "loss": 0.1181, + "num_input_tokens_seen": 15221376, + "step": 221 + }, + { + "epoch": 13.8125, + "loss": 0.11192178726196289, + "loss_ce": 0.09953165054321289, + "loss_xval": 0.01239013671875, + "num_input_tokens_seen": 15221376, + "step": 221 + }, + { + "epoch": 13.875, + "grad_norm": 156.2092295245801, + "learning_rate": 4.8924346855325055e-05, + "loss": 0.1788, + "num_input_tokens_seen": 15292992, + "step": 222 + }, + { + "epoch": 13.875, + "loss": 0.17930957674980164, + "loss_ce": 0.09581348299980164, + "loss_xval": 0.08349609375, + "num_input_tokens_seen": 15292992, + "step": 222 + }, + { + "epoch": 13.9375, + "grad_norm": 257.807381270319, + "learning_rate": 4.896504613450767e-05, + "loss": 0.3138, + "num_input_tokens_seen": 15364672, + "step": 223 + }, + { + "epoch": 13.9375, + "loss": 0.3046586215496063, + "loss_ce": 0.10641643404960632, + "loss_xval": 0.1982421875, + "num_input_tokens_seen": 15364672, + "step": 223 + }, + { + "epoch": 14.0, + "grad_norm": 261.8889355993732, + "learning_rate": 4.900556331339819e-05, + "loss": 0.3191, + "num_input_tokens_seen": 15436352, + "step": 224 + }, + { + "epoch": 14.0, + "loss": 0.25728708505630493, + "loss_ce": 0.10103708505630493, + "loss_xval": 0.15625, + "num_input_tokens_seen": 15436352, + "step": 224 + }, + { + "epoch": 14.0625, + "grad_norm": 194.10209172383972, + "learning_rate": 4.904590001427903e-05, + "loss": 0.2386, + "num_input_tokens_seen": 15507904, + "step": 225 + }, + { + "epoch": 14.0625, + "loss": 0.24090451002120972, + "loss_ce": 0.10418576747179031, + "loss_xval": 0.13671875, + 
"num_input_tokens_seen": 15507904, + "step": 225 + }, + { + "epoch": 14.125, + "grad_norm": 128.9053385322778, + "learning_rate": 4.908605783784996e-05, + "loss": 0.1716, + "num_input_tokens_seen": 15579584, + "step": 226 + }, + { + "epoch": 14.125, + "loss": 0.17217063903808594, + "loss_ce": 0.09697532653808594, + "loss_xval": 0.0751953125, + "num_input_tokens_seen": 15579584, + "step": 226 + }, + { + "epoch": 14.1875, + "grad_norm": 87.59732886995556, + "learning_rate": 4.9126038363609304e-05, + "loss": 0.1359, + "num_input_tokens_seen": 15651200, + "step": 227 + }, + { + "epoch": 14.1875, + "loss": 0.12465231120586395, + "loss_ce": 0.09169332683086395, + "loss_xval": 0.032958984375, + "num_input_tokens_seen": 15651200, + "step": 227 + }, + { + "epoch": 14.25, + "grad_norm": 70.56237270820588, + "learning_rate": 4.916584315022672e-05, + "loss": 0.145, + "num_input_tokens_seen": 15722752, + "step": 228 + }, + { + "epoch": 14.25, + "loss": 0.16532674431800842, + "loss_ce": 0.09989706426858902, + "loss_xval": 0.0654296875, + "num_input_tokens_seen": 15722752, + "step": 228 + }, + { + "epoch": 14.3125, + "grad_norm": 62.93559938979091, + "learning_rate": 4.920547373590778e-05, + "loss": 0.1368, + "num_input_tokens_seen": 15781760, + "step": 229 + }, + { + "epoch": 14.3125, + "loss": 0.13593743741512299, + "loss_ce": 0.09785149991512299, + "loss_xval": 0.0380859375, + "num_input_tokens_seen": 15781760, + "step": 229 + }, + { + "epoch": 14.375, + "grad_norm": 21.157191577955352, + "learning_rate": 4.924493163875066e-05, + "loss": 0.1233, + "num_input_tokens_seen": 15853376, + "step": 230 + }, + { + "epoch": 14.375, + "loss": 0.11231095343828201, + "loss_ce": 0.09735734015703201, + "loss_xval": 0.01495361328125, + "num_input_tokens_seen": 15853376, + "step": 230 + }, + { + "epoch": 14.4375, + "grad_norm": 59.78704566708404, + "learning_rate": 4.9284218357095105e-05, + "loss": 0.1293, + "num_input_tokens_seen": 15924928, + "step": 231 + }, + { + "epoch": 14.4375, + "loss": 0.13710638880729675, + "loss_ce": 0.10219427198171616, + "loss_xval": 0.034912109375, + "num_input_tokens_seen": 15924928, + "step": 231 + }, + { + "epoch": 14.5, + "grad_norm": 172.68273765724646, + "learning_rate": 4.9323335369863785e-05, + "loss": 0.2016, + "num_input_tokens_seen": 15996608, + "step": 232 + }, + { + "epoch": 14.5, + "loss": 0.21267253160476685, + "loss_ce": 0.09206705540418625, + "loss_xval": 0.12060546875, + "num_input_tokens_seen": 15996608, + "step": 232 + }, + { + "epoch": 14.5625, + "grad_norm": 322.95712013391943, + "learning_rate": 4.936228413689641e-05, + "loss": 0.4173, + "num_input_tokens_seen": 16068288, + "step": 233 + }, + { + "epoch": 14.5625, + "loss": 0.4134458899497986, + "loss_ce": 0.08532088994979858, + "loss_xval": 0.328125, + "num_input_tokens_seen": 16068288, + "step": 233 + }, + { + "epoch": 14.625, + "grad_norm": 451.15748506196144, + "learning_rate": 4.940106609927657e-05, + "loss": 0.7367, + "num_input_tokens_seen": 16139968, + "step": 234 + }, + { + "epoch": 14.625, + "loss": 0.7196348905563354, + "loss_ce": 0.09072863310575485, + "loss_xval": 0.62890625, + "num_input_tokens_seen": 16139968, + "step": 234 + }, + { + "epoch": 14.6875, + "grad_norm": 489.1108218774364, + "learning_rate": 4.943968267965172e-05, + "loss": 0.8289, + "num_input_tokens_seen": 16211648, + "step": 235 + }, + { + "epoch": 14.6875, + "loss": 0.8319761157035828, + "loss_ce": 0.08588234335184097, + "loss_xval": 0.74609375, + "num_input_tokens_seen": 16211648, + "step": 235 + }, + { + "epoch": 14.75, + 
"grad_norm": 388.11242092173126, + "learning_rate": 4.947813528254631e-05, + "loss": 0.5546, + "num_input_tokens_seen": 16283264, + "step": 236 + }, + { + "epoch": 14.75, + "loss": 0.5663637518882751, + "loss_ce": 0.08003561198711395, + "loss_xval": 0.486328125, + "num_input_tokens_seen": 16283264, + "step": 236 + }, + { + "epoch": 14.8125, + "grad_norm": 162.58093825153682, + "learning_rate": 4.95164252946683e-05, + "loss": 0.1844, + "num_input_tokens_seen": 16342400, + "step": 237 + }, + { + "epoch": 14.8125, + "loss": 0.18206077814102173, + "loss_ce": 0.08831078559160233, + "loss_xval": 0.09375, + "num_input_tokens_seen": 16342400, + "step": 237 + }, + { + "epoch": 14.875, + "grad_norm": 108.88258484145715, + "learning_rate": 4.955455408520925e-05, + "loss": 0.1338, + "num_input_tokens_seen": 16414016, + "step": 238 + }, + { + "epoch": 14.875, + "loss": 0.13403305411338806, + "loss_ce": 0.08447249978780746, + "loss_xval": 0.049560546875, + "num_input_tokens_seen": 16414016, + "step": 238 + }, + { + "epoch": 14.9375, + "grad_norm": 312.07861032238304, + "learning_rate": 4.9592523006138054e-05, + "loss": 0.4097, + "num_input_tokens_seen": 16485632, + "step": 239 + }, + { + "epoch": 14.9375, + "loss": 0.3987405002117157, + "loss_ce": 0.0901467576622963, + "loss_xval": 0.30859375, + "num_input_tokens_seen": 16485632, + "step": 239 + }, + { + "epoch": 15.0, + "grad_norm": 350.5215392823211, + "learning_rate": 4.963033339248863e-05, + "loss": 0.5014, + "num_input_tokens_seen": 16557248, + "step": 240 + }, + { + "epoch": 15.0, + "loss": 0.5261030197143555, + "loss_ce": 0.08079053461551666, + "loss_xval": 0.4453125, + "num_input_tokens_seen": 16557248, + "step": 240 + }, + { + "epoch": 15.0625, + "grad_norm": 192.24256534435753, + "learning_rate": 4.9667986562641596e-05, + "loss": 0.2191, + "num_input_tokens_seen": 16628864, + "step": 241 + }, + { + "epoch": 15.0625, + "loss": 0.22793100774288177, + "loss_ce": 0.08730600774288177, + "loss_xval": 0.140625, + "num_input_tokens_seen": 16628864, + "step": 241 + }, + { + "epoch": 15.125, + "grad_norm": 74.93058959493273, + "learning_rate": 4.970548381860003e-05, + "loss": 0.1164, + "num_input_tokens_seen": 16700416, + "step": 242 + }, + { + "epoch": 15.125, + "loss": 0.10929420590400696, + "loss_ce": 0.08488014340400696, + "loss_xval": 0.0244140625, + "num_input_tokens_seen": 16700416, + "step": 242 + }, + { + "epoch": 15.1875, + "grad_norm": 260.9104255182291, + "learning_rate": 4.9742826446259686e-05, + "loss": 0.3269, + "num_input_tokens_seen": 16772032, + "step": 243 + }, + { + "epoch": 15.1875, + "loss": 0.33294883370399475, + "loss_ce": 0.08099570125341415, + "loss_xval": 0.251953125, + "num_input_tokens_seen": 16772032, + "step": 243 + }, + { + "epoch": 15.25, + "grad_norm": 227.83423730454186, + "learning_rate": 4.978001571567359e-05, + "loss": 0.2694, + "num_input_tokens_seen": 16843712, + "step": 244 + }, + { + "epoch": 15.25, + "loss": 0.26590994000434875, + "loss_ce": 0.08036306500434875, + "loss_xval": 0.185546875, + "num_input_tokens_seen": 16843712, + "step": 244 + }, + { + "epoch": 15.3125, + "grad_norm": 13.556589572008898, + "learning_rate": 4.981705288131116e-05, + "loss": 0.1122, + "num_input_tokens_seen": 16902784, + "step": 245 + }, + { + "epoch": 15.3125, + "loss": 0.12127488106489182, + "loss_ce": 0.09576218575239182, + "loss_xval": 0.0255126953125, + "num_input_tokens_seen": 16902784, + "step": 245 + }, + { + "epoch": 15.375, + "grad_norm": 191.0737255553766, + "learning_rate": 4.98539391823122e-05, + "loss": 0.2406, + 
"num_input_tokens_seen": 16974528, + "step": 246 + }, + { + "epoch": 15.375, + "loss": 0.2360803782939911, + "loss_ce": 0.09252569824457169, + "loss_xval": 0.1435546875, + "num_input_tokens_seen": 16974528, + "step": 246 + }, + { + "epoch": 15.4375, + "grad_norm": 225.96150851651885, + "learning_rate": 4.9890675842735636e-05, + "loss": 0.288, + "num_input_tokens_seen": 17046272, + "step": 247 + }, + { + "epoch": 15.4375, + "loss": 0.28033164143562317, + "loss_ce": 0.09185508638620377, + "loss_xval": 0.1884765625, + "num_input_tokens_seen": 17046272, + "step": 247 + }, + { + "epoch": 15.5, + "grad_norm": 127.88649841176745, + "learning_rate": 4.992726407180318e-05, + "loss": 0.1589, + "num_input_tokens_seen": 17117952, + "step": 248 + }, + { + "epoch": 15.5, + "loss": 0.1766410917043686, + "loss_ce": 0.08582077920436859, + "loss_xval": 0.0908203125, + "num_input_tokens_seen": 17117952, + "step": 248 + }, + { + "epoch": 15.5625, + "grad_norm": 20.415464624373833, + "learning_rate": 4.996370506413826e-05, + "loss": 0.1034, + "num_input_tokens_seen": 17189568, + "step": 249 + }, + { + "epoch": 15.5625, + "loss": 0.09446126967668533, + "loss_ce": 0.07938558608293533, + "loss_xval": 0.01507568359375, + "num_input_tokens_seen": 17189568, + "step": 249 + }, + { + "epoch": 15.625, + "grad_norm": 143.26601409703582, + "learning_rate": 5e-05, + "loss": 0.1694, + "num_input_tokens_seen": 17248704, + "step": 250 + }, + { + "epoch": 15.625, + "eval_synth_IoU": 0.0, + "eval_synth_MAE_x": 0.261452853679657, + "eval_synth_MAE_y": 0.4066009521484375, + "eval_synth_NUM_probability": 0.8209518492221832, + "eval_synth_inside_bbox": 0.0, + "eval_synth_loss": 0.20519158244132996, + "eval_synth_loss_ce": 0.08092402294278145, + "eval_synth_loss_xval": 0.124267578125, + "eval_synth_runtime": 53.3374, + "eval_synth_samples_per_second": 2.4, + "eval_synth_steps_per_second": 0.075, + "num_input_tokens_seen": 17248704, + "step": 250 + }, + { + "epoch": 15.625, + "loss": 0.23013810813426971, + "loss_ce": 0.08072404563426971, + "loss_xval": 0.1494140625, + "num_input_tokens_seen": 17248704, + "step": 250 + }, + { + "epoch": 15.6875, + "grad_norm": 169.85650761281596, + "learning_rate": 5e-05, + "loss": 0.1935, + "num_input_tokens_seen": 17320512, + "step": 251 + }, + { + "epoch": 15.6875, + "loss": 0.1724509298801422, + "loss_ce": 0.08065404742956161, + "loss_xval": 0.091796875, + "num_input_tokens_seen": 17320512, + "step": 251 + }, + { + "epoch": 15.75, + "grad_norm": 64.15008625818228, + "learning_rate": 5e-05, + "loss": 0.1049, + "num_input_tokens_seen": 17392192, + "step": 252 + }, + { + "epoch": 15.75, + "loss": 0.09971656650304794, + "loss_ce": 0.07945289462804794, + "loss_xval": 0.020263671875, + "num_input_tokens_seen": 17392192, + "step": 252 + }, + { + "epoch": 15.8125, + "grad_norm": 103.9891196216456, + "learning_rate": 5e-05, + "loss": 0.1254, + "num_input_tokens_seen": 17463936, + "step": 253 + }, + { + "epoch": 15.8125, + "loss": 0.12425937503576279, + "loss_ce": 0.07323398441076279, + "loss_xval": 0.051025390625, + "num_input_tokens_seen": 17463936, + "step": 253 + }, + { + "epoch": 15.875, + "grad_norm": 204.58315274712749, + "learning_rate": 5e-05, + "loss": 0.2338, + "num_input_tokens_seen": 17535488, + "step": 254 + }, + { + "epoch": 15.875, + "loss": 0.2364085167646408, + "loss_ce": 0.08406476676464081, + "loss_xval": 0.15234375, + "num_input_tokens_seen": 17535488, + "step": 254 + }, + { + "epoch": 15.9375, + "grad_norm": 162.52441420980293, + "learning_rate": 5e-05, + "loss": 0.1916, + 
"num_input_tokens_seen": 17607104, + "step": 255 + }, + { + "epoch": 15.9375, + "loss": 0.20207734405994415, + "loss_ce": 0.08098359405994415, + "loss_xval": 0.12109375, + "num_input_tokens_seen": 17607104, + "step": 255 + }, + { + "epoch": 16.0, + "grad_norm": 11.653784354334272, + "learning_rate": 5e-05, + "loss": 0.0978, + "num_input_tokens_seen": 17678720, + "step": 256 + }, + { + "epoch": 16.0, + "loss": 0.08942156285047531, + "loss_ce": 0.07111101597547531, + "loss_xval": 0.018310546875, + "num_input_tokens_seen": 17678720, + "step": 256 + }, + { + "epoch": 16.0625, + "grad_norm": 158.11169008732344, + "learning_rate": 5e-05, + "loss": 0.1773, + "num_input_tokens_seen": 17750400, + "step": 257 + }, + { + "epoch": 16.0625, + "loss": 0.17226354777812958, + "loss_ce": 0.07460729777812958, + "loss_xval": 0.09765625, + "num_input_tokens_seen": 17750400, + "step": 257 + }, + { + "epoch": 16.125, + "grad_norm": 229.81576815742696, + "learning_rate": 5e-05, + "loss": 0.2733, + "num_input_tokens_seen": 17822080, + "step": 258 + }, + { + "epoch": 16.125, + "loss": 0.27170130610466003, + "loss_ce": 0.07736537605524063, + "loss_xval": 0.1943359375, + "num_input_tokens_seen": 17822080, + "step": 258 + }, + { + "epoch": 16.1875, + "grad_norm": 159.28923838871276, + "learning_rate": 5e-05, + "loss": 0.1674, + "num_input_tokens_seen": 17881280, + "step": 259 + }, + { + "epoch": 16.1875, + "loss": 0.1720913052558899, + "loss_ce": 0.06857568025588989, + "loss_xval": 0.103515625, + "num_input_tokens_seen": 17881280, + "step": 259 + }, + { + "epoch": 16.25, + "grad_norm": 5.26556670728744, + "learning_rate": 5e-05, + "loss": 0.0864, + "num_input_tokens_seen": 17940416, + "step": 260 + }, + { + "epoch": 16.25, + "loss": 0.08316627144813538, + "loss_ce": 0.07922950387001038, + "loss_xval": 0.003936767578125, + "num_input_tokens_seen": 17940416, + "step": 260 + }, + { + "epoch": 16.3125, + "grad_norm": 155.7591091021434, + "learning_rate": 5e-05, + "loss": 0.1733, + "num_input_tokens_seen": 18012096, + "step": 261 + }, + { + "epoch": 16.3125, + "loss": 0.16722114384174347, + "loss_ce": 0.07688911259174347, + "loss_xval": 0.09033203125, + "num_input_tokens_seen": 18012096, + "step": 261 + }, + { + "epoch": 16.375, + "grad_norm": 233.63878569347762, + "learning_rate": 5e-05, + "loss": 0.2803, + "num_input_tokens_seen": 18083776, + "step": 262 + }, + { + "epoch": 16.375, + "loss": 0.27516964077949524, + "loss_ce": 0.07692745327949524, + "loss_xval": 0.1982421875, + "num_input_tokens_seen": 18083776, + "step": 262 + }, + { + "epoch": 16.4375, + "grad_norm": 186.41794854520438, + "learning_rate": 5e-05, + "loss": 0.2068, + "num_input_tokens_seen": 18155392, + "step": 263 + }, + { + "epoch": 16.4375, + "loss": 0.21879586577415466, + "loss_ce": 0.07524118572473526, + "loss_xval": 0.1435546875, + "num_input_tokens_seen": 18155392, + "step": 263 + }, + { + "epoch": 16.5, + "grad_norm": 38.078416547409816, + "learning_rate": 5e-05, + "loss": 0.1028, + "num_input_tokens_seen": 18226944, + "step": 264 + }, + { + "epoch": 16.5, + "loss": 0.11282212287187576, + "loss_ce": 0.08389145880937576, + "loss_xval": 0.0289306640625, + "num_input_tokens_seen": 18226944, + "step": 264 + }, + { + "epoch": 16.5625, + "grad_norm": 131.60459627807813, + "learning_rate": 5e-05, + "loss": 0.1424, + "num_input_tokens_seen": 18298496, + "step": 265 + }, + { + "epoch": 16.5625, + "loss": 0.13576087355613708, + "loss_ce": 0.07448158413171768, + "loss_xval": 0.061279296875, + "num_input_tokens_seen": 18298496, + "step": 265 + }, + { + 
"epoch": 16.625, + "grad_norm": 223.23466176441613, + "learning_rate": 5e-05, + "loss": 0.2574, + "num_input_tokens_seen": 18357632, + "step": 266 + }, + { + "epoch": 16.625, + "loss": 0.27536073327064514, + "loss_ce": 0.06832948327064514, + "loss_xval": 0.20703125, + "num_input_tokens_seen": 18357632, + "step": 266 + }, + { + "epoch": 16.6875, + "grad_norm": 204.3555030951728, + "learning_rate": 5e-05, + "loss": 0.2297, + "num_input_tokens_seen": 18429376, + "step": 267 + }, + { + "epoch": 16.6875, + "loss": 0.225807785987854, + "loss_ce": 0.073464035987854, + "loss_xval": 0.15234375, + "num_input_tokens_seen": 18429376, + "step": 267 + }, + { + "epoch": 16.75, + "grad_norm": 91.23908279974412, + "learning_rate": 5e-05, + "loss": 0.1098, + "num_input_tokens_seen": 18501056, + "step": 268 + }, + { + "epoch": 16.75, + "loss": 0.11241857707500458, + "loss_ce": 0.06822912395000458, + "loss_xval": 0.044189453125, + "num_input_tokens_seen": 18501056, + "step": 268 + }, + { + "epoch": 16.8125, + "grad_norm": 59.27150232738392, + "learning_rate": 5e-05, + "loss": 0.0899, + "num_input_tokens_seen": 18572736, + "step": 269 + }, + { + "epoch": 16.8125, + "loss": 0.0962166041135788, + "loss_ce": 0.0699714869260788, + "loss_xval": 0.0262451171875, + "num_input_tokens_seen": 18572736, + "step": 269 + }, + { + "epoch": 16.875, + "grad_norm": 183.1381956429717, + "learning_rate": 5e-05, + "loss": 0.2093, + "num_input_tokens_seen": 18632000, + "step": 270 + }, + { + "epoch": 16.875, + "loss": 0.22140049934387207, + "loss_ce": 0.08468174934387207, + "loss_xval": 0.13671875, + "num_input_tokens_seen": 18632000, + "step": 270 + }, + { + "epoch": 16.9375, + "grad_norm": 214.36639578000916, + "learning_rate": 5e-05, + "loss": 0.2495, + "num_input_tokens_seen": 18691136, + "step": 271 + }, + { + "epoch": 16.9375, + "loss": 0.25600141286849976, + "loss_ce": 0.07338422536849976, + "loss_xval": 0.1826171875, + "num_input_tokens_seen": 18691136, + "step": 271 + }, + { + "epoch": 17.0, + "grad_norm": 127.84713265646646, + "learning_rate": 5e-05, + "loss": 0.1332, + "num_input_tokens_seen": 18762816, + "step": 272 + }, + { + "epoch": 17.0, + "loss": 0.139483243227005, + "loss_ce": 0.067705899477005, + "loss_xval": 0.07177734375, + "num_input_tokens_seen": 18762816, + "step": 272 + }, + { + "epoch": 17.0625, + "grad_norm": 4.898120648292572, + "learning_rate": 5e-05, + "loss": 0.0764, + "num_input_tokens_seen": 18834432, + "step": 273 + }, + { + "epoch": 17.0625, + "loss": 0.08028127998113632, + "loss_ce": 0.06990530341863632, + "loss_xval": 0.0103759765625, + "num_input_tokens_seen": 18834432, + "step": 273 + }, + { + "epoch": 17.125, + "grad_norm": 95.27997261371199, + "learning_rate": 5e-05, + "loss": 0.1044, + "num_input_tokens_seen": 18906176, + "step": 274 + }, + { + "epoch": 17.125, + "loss": 0.10349054634571075, + "loss_ce": 0.06662531197071075, + "loss_xval": 0.036865234375, + "num_input_tokens_seen": 18906176, + "step": 274 + }, + { + "epoch": 17.1875, + "grad_norm": 146.32212640213385, + "learning_rate": 5e-05, + "loss": 0.152, + "num_input_tokens_seen": 18977920, + "step": 275 + }, + { + "epoch": 17.1875, + "loss": 0.1592569351196289, + "loss_ce": 0.0664834976196289, + "loss_xval": 0.0927734375, + "num_input_tokens_seen": 18977920, + "step": 275 + }, + { + "epoch": 17.25, + "grad_norm": 142.43308471222042, + "learning_rate": 5e-05, + "loss": 0.1569, + "num_input_tokens_seen": 19049536, + "step": 276 + }, + { + "epoch": 17.25, + "loss": 0.15653030574321747, + "loss_ce": 0.06229202449321747, + "loss_xval": 
0.09423828125, + "num_input_tokens_seen": 19049536, + "step": 276 + }, + { + "epoch": 17.3125, + "grad_norm": 91.99449362446872, + "learning_rate": 5e-05, + "loss": 0.0961, + "num_input_tokens_seen": 19121280, + "step": 277 + }, + { + "epoch": 17.3125, + "loss": 0.09561696648597717, + "loss_ce": 0.055577900260686874, + "loss_xval": 0.0400390625, + "num_input_tokens_seen": 19121280, + "step": 277 + }, + { + "epoch": 17.375, + "grad_norm": 21.1872764546701, + "learning_rate": 5e-05, + "loss": 0.0708, + "num_input_tokens_seen": 19192896, + "step": 278 + }, + { + "epoch": 17.375, + "loss": 0.07196405529975891, + "loss_ce": 0.06250360608100891, + "loss_xval": 0.00946044921875, + "num_input_tokens_seen": 19192896, + "step": 278 + }, + { + "epoch": 17.4375, + "grad_norm": 39.6768526052572, + "learning_rate": 5e-05, + "loss": 0.0671, + "num_input_tokens_seen": 19264576, + "step": 279 + }, + { + "epoch": 17.4375, + "loss": 0.07075974345207214, + "loss_ce": 0.05574509873986244, + "loss_xval": 0.0150146484375, + "num_input_tokens_seen": 19264576, + "step": 279 + }, + { + "epoch": 17.5, + "grad_norm": 77.921892367936, + "learning_rate": 5e-05, + "loss": 0.0893, + "num_input_tokens_seen": 19336128, + "step": 280 + }, + { + "epoch": 17.5, + "loss": 0.09789624810218811, + "loss_ce": 0.06396070122718811, + "loss_xval": 0.033935546875, + "num_input_tokens_seen": 19336128, + "step": 280 + }, + { + "epoch": 17.5625, + "grad_norm": 82.02369908492679, + "learning_rate": 5e-05, + "loss": 0.0943, + "num_input_tokens_seen": 19395328, + "step": 281 + }, + { + "epoch": 17.5625, + "loss": 0.09304691106081009, + "loss_ce": 0.06655765324831009, + "loss_xval": 0.0264892578125, + "num_input_tokens_seen": 19395328, + "step": 281 + }, + { + "epoch": 17.625, + "grad_norm": 42.31537870755725, + "learning_rate": 5e-05, + "loss": 0.0771, + "num_input_tokens_seen": 19466880, + "step": 282 + }, + { + "epoch": 17.625, + "loss": 0.0733356922864914, + "loss_ce": 0.05801587179303169, + "loss_xval": 0.01531982421875, + "num_input_tokens_seen": 19466880, + "step": 282 + }, + { + "epoch": 17.6875, + "grad_norm": 17.771330987667906, + "learning_rate": 5e-05, + "loss": 0.066, + "num_input_tokens_seen": 19538624, + "step": 283 + }, + { + "epoch": 17.6875, + "loss": 0.0675898939371109, + "loss_ce": 0.0558101125061512, + "loss_xval": 0.01177978515625, + "num_input_tokens_seen": 19538624, + "step": 283 + }, + { + "epoch": 17.75, + "grad_norm": 50.51106428941197, + "learning_rate": 5e-05, + "loss": 0.0796, + "num_input_tokens_seen": 19610176, + "step": 284 + }, + { + "epoch": 17.75, + "loss": 0.08379670232534409, + "loss_ce": 0.06304474920034409, + "loss_xval": 0.020751953125, + "num_input_tokens_seen": 19610176, + "step": 284 + }, + { + "epoch": 17.8125, + "grad_norm": 59.38081278481981, + "learning_rate": 5e-05, + "loss": 0.0827, + "num_input_tokens_seen": 19669248, + "step": 285 + }, + { + "epoch": 17.8125, + "loss": 0.08613239228725433, + "loss_ce": 0.06074177101254463, + "loss_xval": 0.025390625, + "num_input_tokens_seen": 19669248, + "step": 285 + }, + { + "epoch": 17.875, + "grad_norm": 86.85566163114785, + "learning_rate": 5e-05, + "loss": 0.0945, + "num_input_tokens_seen": 19740864, + "step": 286 + }, + { + "epoch": 17.875, + "loss": 0.0980193167924881, + "loss_ce": 0.0587126798927784, + "loss_xval": 0.039306640625, + "num_input_tokens_seen": 19740864, + "step": 286 + }, + { + "epoch": 17.9375, + "grad_norm": 131.48187852125864, + "learning_rate": 5e-05, + "loss": 0.127, + "num_input_tokens_seen": 19812416, + "step": 287 + }, + { + 
"epoch": 17.9375, + "loss": 0.12445548176765442, + "loss_ce": 0.05951407551765442, + "loss_xval": 0.06494140625, + "num_input_tokens_seen": 19812416, + "step": 287 + }, + { + "epoch": 18.0, + "grad_norm": 186.15300597167504, + "learning_rate": 5e-05, + "loss": 0.1944, + "num_input_tokens_seen": 19884032, + "step": 288 + }, + { + "epoch": 18.0, + "loss": 0.19328424334526062, + "loss_ce": 0.05754205584526062, + "loss_xval": 0.1357421875, + "num_input_tokens_seen": 19884032, + "step": 288 + }, + { + "epoch": 18.0625, + "grad_norm": 273.5012601742529, + "learning_rate": 5e-05, + "loss": 0.3323, + "num_input_tokens_seen": 19955648, + "step": 289 + }, + { + "epoch": 18.0625, + "loss": 0.33222687244415283, + "loss_ce": 0.05292999744415283, + "loss_xval": 0.279296875, + "num_input_tokens_seen": 19955648, + "step": 289 + }, + { + "epoch": 18.125, + "grad_norm": 388.36705185798957, + "learning_rate": 5e-05, + "loss": 0.6139, + "num_input_tokens_seen": 20027328, + "step": 290 + }, + { + "epoch": 18.125, + "loss": 0.6378867626190186, + "loss_ce": 0.05194929242134094, + "loss_xval": 0.5859375, + "num_input_tokens_seen": 20027328, + "step": 290 + }, + { + "epoch": 18.1875, + "grad_norm": 499.00867924698656, + "learning_rate": 5e-05, + "loss": 0.9746, + "num_input_tokens_seen": 20098944, + "step": 291 + }, + { + "epoch": 18.1875, + "loss": 0.9852690696716309, + "loss_ce": 0.05167528986930847, + "loss_xval": 0.93359375, + "num_input_tokens_seen": 20098944, + "step": 291 + }, + { + "epoch": 18.25, + "grad_norm": 554.1905120885209, + "learning_rate": 5e-05, + "loss": 1.1936, + "num_input_tokens_seen": 20170496, + "step": 292 + }, + { + "epoch": 18.25, + "loss": 1.2322349548339844, + "loss_ce": 0.05254746600985527, + "loss_xval": 1.1796875, + "num_input_tokens_seen": 20170496, + "step": 292 + }, + { + "epoch": 18.3125, + "grad_norm": 457.7168361435202, + "learning_rate": 5e-05, + "loss": 0.8529, + "num_input_tokens_seen": 20242176, + "step": 293 + }, + { + "epoch": 18.3125, + "loss": 0.8144187927246094, + "loss_ce": 0.05660631135106087, + "loss_xval": 0.7578125, + "num_input_tokens_seen": 20242176, + "step": 293 + }, + { + "epoch": 18.375, + "grad_norm": 152.99752414373336, + "learning_rate": 5e-05, + "loss": 0.157, + "num_input_tokens_seen": 20313728, + "step": 294 + }, + { + "epoch": 18.375, + "loss": 0.16795939207077026, + "loss_ce": 0.054678138345479965, + "loss_xval": 0.11328125, + "num_input_tokens_seen": 20313728, + "step": 294 + }, + { + "epoch": 18.4375, + "grad_norm": 233.74530405929963, + "learning_rate": 5e-05, + "loss": 0.2758, + "num_input_tokens_seen": 20385344, + "step": 295 + }, + { + "epoch": 18.4375, + "loss": 0.2857314646244049, + "loss_ce": 0.04842676967382431, + "loss_xval": 0.2373046875, + "num_input_tokens_seen": 20385344, + "step": 295 + }, + { + "epoch": 18.5, + "grad_norm": 475.0087558704663, + "learning_rate": 5e-05, + "loss": 0.9749, + "num_input_tokens_seen": 20456960, + "step": 296 + }, + { + "epoch": 18.5, + "loss": 0.9483780264854431, + "loss_ce": 0.06947175413370132, + "loss_xval": 0.87890625, + "num_input_tokens_seen": 20456960, + "step": 296 + }, + { + "epoch": 18.5625, + "grad_norm": 398.2800226521687, + "learning_rate": 5e-05, + "loss": 0.7156, + "num_input_tokens_seen": 20528576, + "step": 297 + }, + { + "epoch": 18.5625, + "loss": 0.6812264919281006, + "loss_ce": 0.06403897702693939, + "loss_xval": 0.6171875, + "num_input_tokens_seen": 20528576, + "step": 297 + }, + { + "epoch": 18.625, + "grad_norm": 44.51041055843072, + "learning_rate": 5e-05, + "loss": 0.09, + 
"num_input_tokens_seen": 20600128, + "step": 298 + }, + { + "epoch": 18.625, + "loss": 0.08670137822628021, + "loss_ce": 0.05520723760128021, + "loss_xval": 0.031494140625, + "num_input_tokens_seen": 20600128, + "step": 298 + }, + { + "epoch": 18.6875, + "grad_norm": 324.861214138624, + "learning_rate": 5e-05, + "loss": 0.5103, + "num_input_tokens_seen": 20671808, + "step": 299 + }, + { + "epoch": 18.6875, + "loss": 0.48987704515457153, + "loss_ce": 0.06214268505573273, + "loss_xval": 0.427734375, + "num_input_tokens_seen": 20671808, + "step": 299 + }, + { + "epoch": 18.75, + "grad_norm": 385.2579654281278, + "learning_rate": 5e-05, + "loss": 0.6728, + "num_input_tokens_seen": 20743424, + "step": 300 + }, + { + "epoch": 18.75, + "loss": 0.6869125366210938, + "loss_ce": 0.05410004034638405, + "loss_xval": 0.6328125, + "num_input_tokens_seen": 20743424, + "step": 300 + }, + { + "epoch": 18.8125, + "grad_norm": 77.86073528701853, + "learning_rate": 5e-05, + "loss": 0.1027, + "num_input_tokens_seen": 20802496, + "step": 301 + }, + { + "epoch": 18.8125, + "loss": 0.10991650074720383, + "loss_ce": 0.06963329762220383, + "loss_xval": 0.040283203125, + "num_input_tokens_seen": 20802496, + "step": 301 + }, + { + "epoch": 18.875, + "grad_norm": 292.0502032364089, + "learning_rate": 5e-05, + "loss": 0.432, + "num_input_tokens_seen": 20874112, + "step": 302 + }, + { + "epoch": 18.875, + "loss": 0.40421196818351746, + "loss_ce": 0.060461968183517456, + "loss_xval": 0.34375, + "num_input_tokens_seen": 20874112, + "step": 302 + }, + { + "epoch": 18.9375, + "grad_norm": 344.01605003997935, + "learning_rate": 5e-05, + "loss": 0.5894, + "num_input_tokens_seen": 20933376, + "step": 303 + }, + { + "epoch": 18.9375, + "loss": 0.5786466598510742, + "loss_ce": 0.05911543220281601, + "loss_xval": 0.51953125, + "num_input_tokens_seen": 20933376, + "step": 303 + }, + { + "epoch": 19.0, + "grad_norm": 46.29534379699135, + "learning_rate": 5e-05, + "loss": 0.0928, + "num_input_tokens_seen": 21005056, + "step": 304 + }, + { + "epoch": 19.0, + "loss": 0.08590758591890335, + "loss_ce": 0.05978453904390335, + "loss_xval": 0.026123046875, + "num_input_tokens_seen": 21005056, + "step": 304 + }, + { + "epoch": 19.0625, + "grad_norm": 258.9382904270406, + "learning_rate": 5e-05, + "loss": 0.3661, + "num_input_tokens_seen": 21076736, + "step": 305 + }, + { + "epoch": 19.0625, + "loss": 0.3459300398826599, + "loss_ce": 0.06077377498149872, + "loss_xval": 0.28515625, + "num_input_tokens_seen": 21076736, + "step": 305 + }, + { + "epoch": 19.125, + "grad_norm": 238.5090180219324, + "learning_rate": 5e-05, + "loss": 0.3337, + "num_input_tokens_seen": 21148288, + "step": 306 + }, + { + "epoch": 19.125, + "loss": 0.3305038511753082, + "loss_ce": 0.06487884372472763, + "loss_xval": 0.265625, + "num_input_tokens_seen": 21148288, + "step": 306 + }, + { + "epoch": 19.1875, + "grad_norm": 64.6083728076613, + "learning_rate": 5e-05, + "loss": 0.1134, + "num_input_tokens_seen": 21219840, + "step": 307 + }, + { + "epoch": 19.1875, + "loss": 0.10510671883821487, + "loss_ce": 0.07019460946321487, + "loss_xval": 0.034912109375, + "num_input_tokens_seen": 21219840, + "step": 307 + }, + { + "epoch": 19.25, + "grad_norm": 259.0824739486409, + "learning_rate": 5e-05, + "loss": 0.3807, + "num_input_tokens_seen": 21291520, + "step": 308 + }, + { + "epoch": 19.25, + "loss": 0.3581124544143677, + "loss_ce": 0.06514371186494827, + "loss_xval": 0.29296875, + "num_input_tokens_seen": 21291520, + "step": 308 + }, + { + "epoch": 19.3125, + "grad_norm": 
104.19307087054078, + "learning_rate": 5e-05, + "loss": 0.1216, + "num_input_tokens_seen": 21363072, + "step": 309 + }, + { + "epoch": 19.3125, + "loss": 0.11511696875095367, + "loss_ce": 0.056279078125953674, + "loss_xval": 0.058837890625, + "num_input_tokens_seen": 21363072, + "step": 309 + }, + { + "epoch": 19.375, + "grad_norm": 169.93634652801455, + "learning_rate": 5e-05, + "loss": 0.2042, + "num_input_tokens_seen": 21434624, + "step": 310 + }, + { + "epoch": 19.375, + "loss": 0.22216372191905975, + "loss_ce": 0.05419497564435005, + "loss_xval": 0.16796875, + "num_input_tokens_seen": 21434624, + "step": 310 + }, + { + "epoch": 19.4375, + "grad_norm": 203.94069831695467, + "learning_rate": 5e-05, + "loss": 0.2734, + "num_input_tokens_seen": 21506304, + "step": 311 + }, + { + "epoch": 19.4375, + "loss": 0.2772751450538635, + "loss_ce": 0.06438452750444412, + "loss_xval": 0.212890625, + "num_input_tokens_seen": 21506304, + "step": 311 + }, + { + "epoch": 19.5, + "grad_norm": 18.024055507046423, + "learning_rate": 5e-05, + "loss": 0.0643, + "num_input_tokens_seen": 21577920, + "step": 312 + }, + { + "epoch": 19.5, + "loss": 0.06432458758354187, + "loss_ce": 0.05309411510825157, + "loss_xval": 0.01123046875, + "num_input_tokens_seen": 21577920, + "step": 312 + }, + { + "epoch": 19.5625, + "grad_norm": 184.21039649224565, + "learning_rate": 5e-05, + "loss": 0.237, + "num_input_tokens_seen": 21649600, + "step": 313 + }, + { + "epoch": 19.5625, + "loss": 0.2161283791065216, + "loss_ce": 0.0559721365571022, + "loss_xval": 0.16015625, + "num_input_tokens_seen": 21649600, + "step": 313 + }, + { + "epoch": 19.625, + "grad_norm": 108.20152056306442, + "learning_rate": 5e-05, + "loss": 0.1182, + "num_input_tokens_seen": 21721408, + "step": 314 + }, + { + "epoch": 19.625, + "loss": 0.12597306072711945, + "loss_ce": 0.055660564452409744, + "loss_xval": 0.0703125, + "num_input_tokens_seen": 21721408, + "step": 314 + }, + { + "epoch": 19.6875, + "grad_norm": 83.49633562498352, + "learning_rate": 5e-05, + "loss": 0.1044, + "num_input_tokens_seen": 21793024, + "step": 315 + }, + { + "epoch": 19.6875, + "loss": 0.1037326231598854, + "loss_ce": 0.0629611387848854, + "loss_xval": 0.040771484375, + "num_input_tokens_seen": 21793024, + "step": 315 + }, + { + "epoch": 19.75, + "grad_norm": 166.61290953107724, + "learning_rate": 5e-05, + "loss": 0.2003, + "num_input_tokens_seen": 21864640, + "step": 316 + }, + { + "epoch": 19.75, + "loss": 0.21483193337917328, + "loss_ce": 0.05076942965388298, + "loss_xval": 0.1640625, + "num_input_tokens_seen": 21864640, + "step": 316 + }, + { + "epoch": 19.8125, + "grad_norm": 71.69323940555286, + "learning_rate": 5e-05, + "loss": 0.0875, + "num_input_tokens_seen": 21936320, + "step": 317 + }, + { + "epoch": 19.8125, + "loss": 0.08265180885791779, + "loss_ce": 0.048960406333208084, + "loss_xval": 0.03369140625, + "num_input_tokens_seen": 21936320, + "step": 317 + }, + { + "epoch": 19.875, + "grad_norm": 82.73207269507293, + "learning_rate": 5e-05, + "loss": 0.1014, + "num_input_tokens_seen": 22008000, + "step": 318 + }, + { + "epoch": 19.875, + "loss": 0.09214404225349426, + "loss_ce": 0.05527880787849426, + "loss_xval": 0.036865234375, + "num_input_tokens_seen": 22008000, + "step": 318 + }, + { + "epoch": 19.9375, + "grad_norm": 129.00645154565726, + "learning_rate": 5e-05, + "loss": 0.1439, + "num_input_tokens_seen": 22079744, + "step": 319 + }, + { + "epoch": 19.9375, + "loss": 0.1347639113664627, + "loss_ce": 0.051267821341753006, + "loss_xval": 0.08349609375, + 
"num_input_tokens_seen": 22079744, + "step": 319 + }, + { + "epoch": 20.0, + "grad_norm": 53.870059829152936, + "learning_rate": 5e-05, + "loss": 0.0771, + "num_input_tokens_seen": 22138816, + "step": 320 + }, + { + "epoch": 20.0, + "loss": 0.07210846245288849, + "loss_ce": 0.05465241149067879, + "loss_xval": 0.0174560546875, + "num_input_tokens_seen": 22138816, + "step": 320 + }, + { + "epoch": 20.0625, + "grad_norm": 42.41968507185895, + "learning_rate": 5e-05, + "loss": 0.0691, + "num_input_tokens_seen": 22210496, + "step": 321 + }, + { + "epoch": 20.0625, + "loss": 0.06588022410869598, + "loss_ce": 0.04927866533398628, + "loss_xval": 0.0166015625, + "num_input_tokens_seen": 22210496, + "step": 321 + }, + { + "epoch": 20.125, + "grad_norm": 81.37358201753432, + "learning_rate": 5e-05, + "loss": 0.0889, + "num_input_tokens_seen": 22282304, + "step": 322 + }, + { + "epoch": 20.125, + "loss": 0.08118662238121033, + "loss_ce": 0.05079111456871033, + "loss_xval": 0.0303955078125, + "num_input_tokens_seen": 22282304, + "step": 322 + }, + { + "epoch": 20.1875, + "grad_norm": 31.96465123047603, + "learning_rate": 5e-05, + "loss": 0.0593, + "num_input_tokens_seen": 22353856, + "step": 323 + }, + { + "epoch": 20.1875, + "loss": 0.05360583961009979, + "loss_ce": 0.04493884742259979, + "loss_xval": 0.0086669921875, + "num_input_tokens_seen": 22353856, + "step": 323 + }, + { + "epoch": 20.25, + "grad_norm": 54.30901644053265, + "learning_rate": 5e-05, + "loss": 0.074, + "num_input_tokens_seen": 22412928, + "step": 324 + }, + { + "epoch": 20.25, + "loss": 0.07150323688983917, + "loss_ce": 0.044525694102048874, + "loss_xval": 0.0269775390625, + "num_input_tokens_seen": 22412928, + "step": 324 + }, + { + "epoch": 20.3125, + "grad_norm": 100.10696752851614, + "learning_rate": 5e-05, + "loss": 0.1114, + "num_input_tokens_seen": 22484544, + "step": 325 + }, + { + "epoch": 20.3125, + "loss": 0.11652743816375732, + "loss_ce": 0.04767978563904762, + "loss_xval": 0.06884765625, + "num_input_tokens_seen": 22484544, + "step": 325 + }, + { + "epoch": 20.375, + "grad_norm": 86.5660911617045, + "learning_rate": 5e-05, + "loss": 0.0969, + "num_input_tokens_seen": 22556224, + "step": 326 + }, + { + "epoch": 20.375, + "loss": 0.09553231298923492, + "loss_ce": 0.048169028013944626, + "loss_xval": 0.04736328125, + "num_input_tokens_seen": 22556224, + "step": 326 + }, + { + "epoch": 20.4375, + "grad_norm": 6.751561284312927, + "learning_rate": 5e-05, + "loss": 0.0512, + "num_input_tokens_seen": 22627904, + "step": 327 + }, + { + "epoch": 20.4375, + "loss": 0.0450252965092659, + "loss_ce": 0.0397762730717659, + "loss_xval": 0.0052490234375, + "num_input_tokens_seen": 22627904, + "step": 327 + }, + { + "epoch": 20.5, + "grad_norm": 102.19788973198584, + "learning_rate": 5e-05, + "loss": 0.1133, + "num_input_tokens_seen": 22699520, + "step": 328 + }, + { + "epoch": 20.5, + "loss": 0.10728923976421356, + "loss_ce": 0.049672048538923264, + "loss_xval": 0.0576171875, + "num_input_tokens_seen": 22699520, + "step": 328 + }, + { + "epoch": 20.5625, + "grad_norm": 129.43090976643876, + "learning_rate": 5e-05, + "loss": 0.1355, + "num_input_tokens_seen": 22771136, + "step": 329 + }, + { + "epoch": 20.5625, + "loss": 0.1352676898241043, + "loss_ce": 0.04054112359881401, + "loss_xval": 0.0947265625, + "num_input_tokens_seen": 22771136, + "step": 329 + }, + { + "epoch": 20.625, + "grad_norm": 24.348588574348376, + "learning_rate": 5e-05, + "loss": 0.0479, + "num_input_tokens_seen": 22842688, + "step": 330 + }, + { + "epoch": 20.625, 
+ "loss": 0.04819801449775696, + "loss_ce": 0.04111793637275696, + "loss_xval": 0.007080078125, + "num_input_tokens_seen": 22842688, + "step": 330 + }, + { + "epoch": 20.6875, + "grad_norm": 138.02485486652458, + "learning_rate": 5e-05, + "loss": 0.1444, + "num_input_tokens_seen": 22914368, + "step": 331 + }, + { + "epoch": 20.6875, + "loss": 0.1612803339958191, + "loss_ce": 0.04116315022110939, + "loss_xval": 0.1201171875, + "num_input_tokens_seen": 22914368, + "step": 331 + }, + { + "epoch": 20.75, + "grad_norm": 177.826118184589, + "learning_rate": 5e-05, + "loss": 0.2107, + "num_input_tokens_seen": 22986112, + "step": 332 + }, + { + "epoch": 20.75, + "loss": 0.20077544450759888, + "loss_ce": 0.039642639458179474, + "loss_xval": 0.1611328125, + "num_input_tokens_seen": 22986112, + "step": 332 + }, + { + "epoch": 20.8125, + "grad_norm": 32.575859879051436, + "learning_rate": 5e-05, + "loss": 0.0547, + "num_input_tokens_seen": 23057728, + "step": 333 + }, + { + "epoch": 20.8125, + "loss": 0.053013481199741364, + "loss_ce": 0.036167778074741364, + "loss_xval": 0.016845703125, + "num_input_tokens_seen": 23057728, + "step": 333 + }, + { + "epoch": 20.875, + "grad_norm": 145.04876372864413, + "learning_rate": 5e-05, + "loss": 0.1634, + "num_input_tokens_seen": 23129344, + "step": 334 + }, + { + "epoch": 20.875, + "loss": 0.1619020253419876, + "loss_ce": 0.04129656031727791, + "loss_xval": 0.12060546875, + "num_input_tokens_seen": 23129344, + "step": 334 + }, + { + "epoch": 20.9375, + "grad_norm": 168.3578247321365, + "learning_rate": 5e-05, + "loss": 0.1975, + "num_input_tokens_seen": 23188416, + "step": 335 + }, + { + "epoch": 20.9375, + "loss": 0.1955825239419937, + "loss_ce": 0.041285645216703415, + "loss_xval": 0.154296875, + "num_input_tokens_seen": 23188416, + "step": 335 + }, + { + "epoch": 21.0, + "grad_norm": 12.803234575367828, + "learning_rate": 5e-05, + "loss": 0.0448, + "num_input_tokens_seen": 23260160, + "step": 336 + }, + { + "epoch": 21.0, + "loss": 0.04127725958824158, + "loss_ce": 0.03199991583824158, + "loss_xval": 0.00927734375, + "num_input_tokens_seen": 23260160, + "step": 336 + }, + { + "epoch": 21.0625, + "grad_norm": 159.70923416573038, + "learning_rate": 5e-05, + "loss": 0.1813, + "num_input_tokens_seen": 23331712, + "step": 337 + }, + { + "epoch": 21.0625, + "loss": 0.1771336793899536, + "loss_ce": 0.034555550664663315, + "loss_xval": 0.142578125, + "num_input_tokens_seen": 23331712, + "step": 337 + }, + { + "epoch": 21.125, + "grad_norm": 178.92852100380634, + "learning_rate": 5e-05, + "loss": 0.2198, + "num_input_tokens_seen": 23403392, + "step": 338 + }, + { + "epoch": 21.125, + "loss": 0.22689706087112427, + "loss_ce": 0.03451424837112427, + "loss_xval": 0.1923828125, + "num_input_tokens_seen": 23403392, + "step": 338 + }, + { + "epoch": 21.1875, + "grad_norm": 58.222787015758534, + "learning_rate": 5e-05, + "loss": 0.0678, + "num_input_tokens_seen": 23474944, + "step": 339 + }, + { + "epoch": 21.1875, + "loss": 0.060029055923223495, + "loss_ce": 0.038422610610723495, + "loss_xval": 0.0216064453125, + "num_input_tokens_seen": 23474944, + "step": 339 + }, + { + "epoch": 21.25, + "grad_norm": 85.83316430272406, + "learning_rate": 5e-05, + "loss": 0.0826, + "num_input_tokens_seen": 23546560, + "step": 340 + }, + { + "epoch": 21.25, + "loss": 0.092320516705513, + "loss_ce": 0.0334826223552227, + "loss_xval": 0.058837890625, + "num_input_tokens_seen": 23546560, + "step": 340 + }, + { + "epoch": 21.3125, + "grad_norm": 153.34178081295283, + "learning_rate": 5e-05, + 
"loss": 0.1732, + "num_input_tokens_seen": 23605632, + "step": 341 + }, + { + "epoch": 21.3125, + "loss": 0.18945997953414917, + "loss_ce": 0.03223340958356857, + "loss_xval": 0.1572265625, + "num_input_tokens_seen": 23605632, + "step": 341 + }, + { + "epoch": 21.375, + "grad_norm": 113.06064593355569, + "learning_rate": 5e-05, + "loss": 0.1126, + "num_input_tokens_seen": 23652224, + "step": 342 + }, + { + "epoch": 21.375, + "loss": 0.114508718252182, + "loss_ce": 0.03150090202689171, + "loss_xval": 0.0830078125, + "num_input_tokens_seen": 23652224, + "step": 342 + }, + { + "epoch": 21.4375, + "grad_norm": 9.502862995495352, + "learning_rate": 5e-05, + "loss": 0.0391, + "num_input_tokens_seen": 23724032, + "step": 343 + }, + { + "epoch": 21.4375, + "loss": 0.041819460690021515, + "loss_ce": 0.032542116940021515, + "loss_xval": 0.00927734375, + "num_input_tokens_seen": 23724032, + "step": 343 + }, + { + "epoch": 21.5, + "grad_norm": 93.54614585882905, + "learning_rate": 5e-05, + "loss": 0.0871, + "num_input_tokens_seen": 23783104, + "step": 344 + }, + { + "epoch": 21.5, + "loss": 0.07588471472263336, + "loss_ce": 0.027300726622343063, + "loss_xval": 0.048583984375, + "num_input_tokens_seen": 23783104, + "step": 344 + }, + { + "epoch": 21.5625, + "grad_norm": 125.48052241576953, + "learning_rate": 5e-05, + "loss": 0.1224, + "num_input_tokens_seen": 23854720, + "step": 345 + }, + { + "epoch": 21.5625, + "loss": 0.12017424404621124, + "loss_ce": 0.02886565402150154, + "loss_xval": 0.09130859375, + "num_input_tokens_seen": 23854720, + "step": 345 + }, + { + "epoch": 21.625, + "grad_norm": 57.39062178859967, + "learning_rate": 5e-05, + "loss": 0.0623, + "num_input_tokens_seen": 23926336, + "step": 346 + }, + { + "epoch": 21.625, + "loss": 0.06358948349952698, + "loss_ce": 0.03258362039923668, + "loss_xval": 0.031005859375, + "num_input_tokens_seen": 23926336, + "step": 346 + }, + { + "epoch": 21.6875, + "grad_norm": 42.191072428656106, + "learning_rate": 5e-05, + "loss": 0.0509, + "num_input_tokens_seen": 23997952, + "step": 347 + }, + { + "epoch": 21.6875, + "loss": 0.04895569011569023, + "loss_ce": 0.03375793620944023, + "loss_xval": 0.01519775390625, + "num_input_tokens_seen": 23997952, + "step": 347 + }, + { + "epoch": 21.75, + "grad_norm": 91.60524796764251, + "learning_rate": 5e-05, + "loss": 0.0813, + "num_input_tokens_seen": 24069696, + "step": 348 + }, + { + "epoch": 21.75, + "loss": 0.07642795890569687, + "loss_ce": 0.02808811329305172, + "loss_xval": 0.04833984375, + "num_input_tokens_seen": 24069696, + "step": 348 + }, + { + "epoch": 21.8125, + "grad_norm": 65.79991588306397, + "learning_rate": 5e-05, + "loss": 0.0599, + "num_input_tokens_seen": 24141312, + "step": 349 + }, + { + "epoch": 21.8125, + "loss": 0.04916411638259888, + "loss_ce": 0.023041069507598877, + "loss_xval": 0.026123046875, + "num_input_tokens_seen": 24141312, + "step": 349 + }, + { + "epoch": 21.875, + "grad_norm": 7.238730719380323, + "learning_rate": 5e-05, + "loss": 0.0391, + "num_input_tokens_seen": 24212928, + "step": 350 + }, + { + "epoch": 21.875, + "loss": 0.0389084629714489, + "loss_ce": 0.0302414707839489, + "loss_xval": 0.0086669921875, + "num_input_tokens_seen": 24212928, + "step": 350 + }, + { + "epoch": 21.9375, + "grad_norm": 67.62441618992753, + "learning_rate": 5e-05, + "loss": 0.058, + "num_input_tokens_seen": 24259584, + "step": 351 + }, + { + "epoch": 21.9375, + "loss": 0.061608459800481796, + "loss_ce": 0.025231506675481796, + "loss_xval": 0.036376953125, + "num_input_tokens_seen": 24259584, + 
"step": 351 + }, + { + "epoch": 22.0, + "grad_norm": 78.57853165326333, + "learning_rate": 5e-05, + "loss": 0.0642, + "num_input_tokens_seen": 24331200, + "step": 352 + }, + { + "epoch": 22.0, + "loss": 0.056535474956035614, + "loss_ce": 0.022355787456035614, + "loss_xval": 0.0341796875, + "num_input_tokens_seen": 24331200, + "step": 352 + }, + { + "epoch": 22.0625, + "grad_norm": 48.73817646103603, + "learning_rate": 5e-05, + "loss": 0.0446, + "num_input_tokens_seen": 24377920, + "step": 353 + }, + { + "epoch": 22.0625, + "loss": 0.04055093228816986, + "loss_ce": 0.02523110806941986, + "loss_xval": 0.01531982421875, + "num_input_tokens_seen": 24377920, + "step": 353 + }, + { + "epoch": 22.125, + "grad_norm": 22.41809225651893, + "learning_rate": 5e-05, + "loss": 0.0334, + "num_input_tokens_seen": 24437120, + "step": 354 + }, + { + "epoch": 22.125, + "loss": 0.03857526183128357, + "loss_ce": 0.02502545714378357, + "loss_xval": 0.0135498046875, + "num_input_tokens_seen": 24437120, + "step": 354 + }, + { + "epoch": 22.1875, + "grad_norm": 21.286114934711435, + "learning_rate": 5e-05, + "loss": 0.0337, + "num_input_tokens_seen": 24508800, + "step": 355 + }, + { + "epoch": 22.1875, + "loss": 0.03242785111069679, + "loss_ce": 0.024066034704446793, + "loss_xval": 0.00836181640625, + "num_input_tokens_seen": 24508800, + "step": 355 + }, + { + "epoch": 22.25, + "grad_norm": 27.768584251920654, + "learning_rate": 5e-05, + "loss": 0.0346, + "num_input_tokens_seen": 24580480, + "step": 356 + }, + { + "epoch": 22.25, + "loss": 0.03272373229265213, + "loss_ce": 0.02051670290529728, + "loss_xval": 0.01220703125, + "num_input_tokens_seen": 24580480, + "step": 356 + }, + { + "epoch": 22.3125, + "grad_norm": 33.0264274440666, + "learning_rate": 5e-05, + "loss": 0.0394, + "num_input_tokens_seen": 24652096, + "step": 357 + }, + { + "epoch": 22.3125, + "loss": 0.03478675335645676, + "loss_ce": 0.023434214293956757, + "loss_xval": 0.0113525390625, + "num_input_tokens_seen": 24652096, + "step": 357 + }, + { + "epoch": 22.375, + "grad_norm": 30.680450549270756, + "learning_rate": 5e-05, + "loss": 0.0307, + "num_input_tokens_seen": 24723712, + "step": 358 + }, + { + "epoch": 22.375, + "loss": 0.0311301089823246, + "loss_ce": 0.0179465152323246, + "loss_xval": 0.01318359375, + "num_input_tokens_seen": 24723712, + "step": 358 + }, + { + "epoch": 22.4375, + "grad_norm": 1.6152731376540712, + "learning_rate": 5e-05, + "loss": 0.0256, + "num_input_tokens_seen": 24795264, + "step": 359 + }, + { + "epoch": 22.4375, + "loss": 0.02527458965778351, + "loss_ce": 0.02133782207965851, + "loss_xval": 0.003936767578125, + "num_input_tokens_seen": 24795264, + "step": 359 + }, + { + "epoch": 22.5, + "grad_norm": 39.24958933219172, + "learning_rate": 5e-05, + "loss": 0.035, + "num_input_tokens_seen": 24866880, + "step": 360 + }, + { + "epoch": 22.5, + "loss": 0.04027906805276871, + "loss_ce": 0.021114028990268707, + "loss_xval": 0.0191650390625, + "num_input_tokens_seen": 24866880, + "step": 360 + }, + { + "epoch": 22.5625, + "grad_norm": 49.680752477148026, + "learning_rate": 5e-05, + "loss": 0.0416, + "num_input_tokens_seen": 24938432, + "step": 361 + }, + { + "epoch": 22.5625, + "loss": 0.04497775062918663, + "loss_ce": 0.02337130531668663, + "loss_xval": 0.0216064453125, + "num_input_tokens_seen": 24938432, + "step": 361 + }, + { + "epoch": 22.625, + "grad_norm": 30.514751566658266, + "learning_rate": 5e-05, + "loss": 0.0308, + "num_input_tokens_seen": 25010112, + "step": 362 + }, + { + "epoch": 22.625, + "loss": 
0.03016595169901848, + "loss_ce": 0.015151304192841053, + "loss_xval": 0.0150146484375, + "num_input_tokens_seen": 25010112, + "step": 362 + }, + { + "epoch": 22.6875, + "grad_norm": 3.0108457596333102, + "learning_rate": 5e-05, + "loss": 0.0265, + "num_input_tokens_seen": 25081728, + "step": 363 + }, + { + "epoch": 22.6875, + "loss": 0.026027997955679893, + "loss_ce": 0.017910322174429893, + "loss_xval": 0.00811767578125, + "num_input_tokens_seen": 25081728, + "step": 363 + }, + { + "epoch": 22.75, + "grad_norm": 34.720989573408396, + "learning_rate": 5e-05, + "loss": 0.0276, + "num_input_tokens_seen": 25153280, + "step": 364 + }, + { + "epoch": 22.75, + "loss": 0.03153940662741661, + "loss_ce": 0.014937843196094036, + "loss_xval": 0.0166015625, + "num_input_tokens_seen": 25153280, + "step": 364 + }, + { + "epoch": 22.8125, + "grad_norm": 54.55908300103559, + "learning_rate": 5e-05, + "loss": 0.0403, + "num_input_tokens_seen": 25212416, + "step": 365 + }, + { + "epoch": 22.8125, + "loss": 0.04323313385248184, + "loss_ce": 0.013936257921159267, + "loss_xval": 0.029296875, + "num_input_tokens_seen": 25212416, + "step": 365 + }, + { + "epoch": 22.875, + "grad_norm": 58.489593389674056, + "learning_rate": 5e-05, + "loss": 0.0417, + "num_input_tokens_seen": 25271488, + "step": 366 + }, + { + "epoch": 22.875, + "loss": 0.04008274897933006, + "loss_ce": 0.014081771485507488, + "loss_xval": 0.0260009765625, + "num_input_tokens_seen": 25271488, + "step": 366 + }, + { + "epoch": 22.9375, + "grad_norm": 48.24943727129867, + "learning_rate": 5e-05, + "loss": 0.0368, + "num_input_tokens_seen": 25330688, + "step": 367 + }, + { + "epoch": 22.9375, + "loss": 0.041829679161310196, + "loss_ce": 0.018025968223810196, + "loss_xval": 0.0238037109375, + "num_input_tokens_seen": 25330688, + "step": 367 + }, + { + "epoch": 23.0, + "grad_norm": 28.82296636692516, + "learning_rate": 5e-05, + "loss": 0.0284, + "num_input_tokens_seen": 25402240, + "step": 368 + }, + { + "epoch": 23.0, + "loss": 0.026926452293992043, + "loss_ce": 0.017343932762742043, + "loss_xval": 0.00958251953125, + "num_input_tokens_seen": 25402240, + "step": 368 + }, + { + "epoch": 23.0625, + "grad_norm": 3.7841073776005842, + "learning_rate": 5e-05, + "loss": 0.019, + "num_input_tokens_seen": 25461312, + "step": 369 + }, + { + "epoch": 23.0625, + "loss": 0.018005449324846268, + "loss_ce": 0.010284501127898693, + "loss_xval": 0.007720947265625, + "num_input_tokens_seen": 25461312, + "step": 369 + }, + { + "epoch": 23.125, + "grad_norm": 35.588070766280204, + "learning_rate": 5e-05, + "loss": 0.0255, + "num_input_tokens_seen": 25532992, + "step": 370 + }, + { + "epoch": 23.125, + "loss": 0.02573539689183235, + "loss_ce": 0.012612837366759777, + "loss_xval": 0.01312255859375, + "num_input_tokens_seen": 25532992, + "step": 370 + }, + { + "epoch": 23.1875, + "grad_norm": 65.26191511837887, + "learning_rate": 5e-05, + "loss": 0.0442, + "num_input_tokens_seen": 25592192, + "step": 371 + }, + { + "epoch": 23.1875, + "loss": 0.04462482035160065, + "loss_ce": 0.010689273476600647, + "loss_xval": 0.033935546875, + "num_input_tokens_seen": 25592192, + "step": 371 + }, + { + "epoch": 23.25, + "grad_norm": 97.58296827423466, + "learning_rate": 5e-05, + "loss": 0.0662, + "num_input_tokens_seen": 25651200, + "step": 372 + }, + { + "epoch": 23.25, + "loss": 0.06839506328105927, + "loss_ce": 0.010533735156059265, + "loss_xval": 0.057861328125, + "num_input_tokens_seen": 25651200, + "step": 372 + }, + { + "epoch": 23.3125, + "grad_norm": 151.49134967464235, + 
"learning_rate": 5e-05, + "loss": 0.1385, + "num_input_tokens_seen": 25722880, + "step": 373 + }, + { + "epoch": 23.3125, + "loss": 0.14277715981006622, + "loss_ce": 0.009964662604033947, + "loss_xval": 0.1328125, + "num_input_tokens_seen": 25722880, + "step": 373 + }, + { + "epoch": 23.375, + "grad_norm": 243.7000987041097, + "learning_rate": 5e-05, + "loss": 0.339, + "num_input_tokens_seen": 25794560, + "step": 374 + }, + { + "epoch": 23.375, + "loss": 0.34568315744400024, + "loss_ce": 0.009745652787387371, + "loss_xval": 0.3359375, + "num_input_tokens_seen": 25794560, + "step": 374 + }, + { + "epoch": 23.4375, + "grad_norm": 387.16968867611166, + "learning_rate": 5e-05, + "loss": 0.8597, + "num_input_tokens_seen": 25866304, + "step": 375 + }, + { + "epoch": 23.4375, + "loss": 0.8643907308578491, + "loss_ce": 0.008921988308429718, + "loss_xval": 0.85546875, + "num_input_tokens_seen": 25866304, + "step": 375 + }, + { + "epoch": 23.5, + "grad_norm": 523.2692863493334, + "learning_rate": 5e-05, + "loss": 1.5678, + "num_input_tokens_seen": 25925312, + "step": 376 + }, + { + "epoch": 23.5, + "loss": 1.5135647058486938, + "loss_ce": 0.013564717024564743, + "loss_xval": 1.5, + "num_input_tokens_seen": 25925312, + "step": 376 + }, + { + "epoch": 23.5625, + "grad_norm": 482.0184380618385, + "learning_rate": 5e-05, + "loss": 1.4084, + "num_input_tokens_seen": 25984512, + "step": 377 + }, + { + "epoch": 23.5625, + "loss": 1.487014889717102, + "loss_ce": 0.018264926970005035, + "loss_xval": 1.46875, + "num_input_tokens_seen": 25984512, + "step": 377 + }, + { + "epoch": 23.625, + "grad_norm": 205.0593908826293, + "learning_rate": 5e-05, + "loss": 0.3361, + "num_input_tokens_seen": 26056192, + "step": 378 + }, + { + "epoch": 23.625, + "loss": 0.34420228004455566, + "loss_ce": 0.02584289014339447, + "loss_xval": 0.318359375, + "num_input_tokens_seen": 26056192, + "step": 378 + }, + { + "epoch": 23.6875, + "grad_norm": 132.83934380697207, + "learning_rate": 5e-05, + "loss": 0.2223, + "num_input_tokens_seen": 26127872, + "step": 379 + }, + { + "epoch": 23.6875, + "loss": 0.2154698669910431, + "loss_ce": 0.04457143321633339, + "loss_xval": 0.1708984375, + "num_input_tokens_seen": 26127872, + "step": 379 + }, + { + "epoch": 23.75, + "grad_norm": 306.0324430341869, + "learning_rate": 5e-05, + "loss": 0.8102, + "num_input_tokens_seen": 26187008, + "step": 380 + }, + { + "epoch": 23.75, + "loss": 0.7603037357330322, + "loss_ce": 0.053272493183612823, + "loss_xval": 0.70703125, + "num_input_tokens_seen": 26187008, + "step": 380 + }, + { + "epoch": 23.8125, + "grad_norm": 243.68404104415083, + "learning_rate": 5e-05, + "loss": 0.5698, + "num_input_tokens_seen": 26258560, + "step": 381 + }, + { + "epoch": 23.8125, + "loss": 0.5247402787208557, + "loss_ce": 0.04622465744614601, + "loss_xval": 0.478515625, + "num_input_tokens_seen": 26258560, + "step": 381 + }, + { + "epoch": 23.875, + "grad_norm": 37.919739354322424, + "learning_rate": 5e-05, + "loss": 0.0741, + "num_input_tokens_seen": 26330368, + "step": 382 + }, + { + "epoch": 23.875, + "loss": 0.07584744691848755, + "loss_ce": 0.03605252504348755, + "loss_xval": 0.039794921875, + "num_input_tokens_seen": 26330368, + "step": 382 + }, + { + "epoch": 23.9375, + "grad_norm": 191.36875903807635, + "learning_rate": 5e-05, + "loss": 0.3273, + "num_input_tokens_seen": 26401920, + "step": 383 + }, + { + "epoch": 23.9375, + "loss": 0.33099889755249023, + "loss_ce": 0.026311399415135384, + "loss_xval": 0.3046875, + "num_input_tokens_seen": 26401920, + "step": 383 + }, + 
{ + "epoch": 24.0, + "grad_norm": 242.1428011928719, + "learning_rate": 5e-05, + "loss": 0.4589, + "num_input_tokens_seen": 26473664, + "step": 384 + }, + { + "epoch": 24.0, + "loss": 0.44113773107528687, + "loss_ce": 0.023168986663222313, + "loss_xval": 0.41796875, + "num_input_tokens_seen": 26473664, + "step": 384 + }, + { + "epoch": 24.0625, + "grad_norm": 30.636740642057283, + "learning_rate": 5e-05, + "loss": 0.0483, + "num_input_tokens_seen": 26545344, + "step": 385 + }, + { + "epoch": 24.0625, + "loss": 0.04922012984752655, + "loss_ce": 0.0277357567101717, + "loss_xval": 0.021484375, + "num_input_tokens_seen": 26545344, + "step": 385 + }, + { + "epoch": 24.125, + "grad_norm": 184.2297319008558, + "learning_rate": 5e-05, + "loss": 0.3037, + "num_input_tokens_seen": 26617024, + "step": 386 + }, + { + "epoch": 24.125, + "loss": 0.30065447092056274, + "loss_ce": 0.0311232078820467, + "loss_xval": 0.26953125, + "num_input_tokens_seen": 26617024, + "step": 386 + }, + { + "epoch": 24.1875, + "grad_norm": 148.02728447044228, + "learning_rate": 5e-05, + "loss": 0.2284, + "num_input_tokens_seen": 26676096, + "step": 387 + }, + { + "epoch": 24.1875, + "loss": 0.20117712020874023, + "loss_ce": 0.026372438296675682, + "loss_xval": 0.1748046875, + "num_input_tokens_seen": 26676096, + "step": 387 + }, + { + "epoch": 24.25, + "grad_norm": 48.7182625078638, + "learning_rate": 5e-05, + "loss": 0.0775, + "num_input_tokens_seen": 26747712, + "step": 388 + }, + { + "epoch": 24.25, + "loss": 0.0815923810005188, + "loss_ce": 0.0259283185005188, + "loss_xval": 0.0556640625, + "num_input_tokens_seen": 26747712, + "step": 388 + }, + { + "epoch": 24.3125, + "grad_norm": 159.84480277227877, + "learning_rate": 5e-05, + "loss": 0.2301, + "num_input_tokens_seen": 26806848, + "step": 389 + }, + { + "epoch": 24.3125, + "loss": 0.2049870491027832, + "loss_ce": 0.025299543514847755, + "loss_xval": 0.1796875, + "num_input_tokens_seen": 26806848, + "step": 389 + }, + { + "epoch": 24.375, + "grad_norm": 90.66012488974275, + "learning_rate": 5e-05, + "loss": 0.1106, + "num_input_tokens_seen": 26865984, + "step": 390 + }, + { + "epoch": 24.375, + "loss": 0.1021375060081482, + "loss_ce": 0.022059379145503044, + "loss_xval": 0.080078125, + "num_input_tokens_seen": 26865984, + "step": 390 + }, + { + "epoch": 24.4375, + "grad_norm": 91.60761564913557, + "learning_rate": 5e-05, + "loss": 0.1192, + "num_input_tokens_seen": 26925120, + "step": 391 + }, + { + "epoch": 24.4375, + "loss": 0.10391199588775635, + "loss_ce": 0.028716687113046646, + "loss_xval": 0.0751953125, + "num_input_tokens_seen": 26925120, + "step": 391 + }, + { + "epoch": 24.5, + "grad_norm": 192.7878292026773, + "learning_rate": 5e-05, + "loss": 0.3131, + "num_input_tokens_seen": 26984320, + "step": 392 + }, + { + "epoch": 24.5, + "loss": 0.31701016426086426, + "loss_ce": 0.022088276222348213, + "loss_xval": 0.294921875, + "num_input_tokens_seen": 26984320, + "step": 392 + }, + { + "epoch": 24.5625, + "grad_norm": 111.49127185827285, + "learning_rate": 5e-05, + "loss": 0.1325, + "num_input_tokens_seen": 27055936, + "step": 393 + }, + { + "epoch": 24.5625, + "loss": 0.14198513329029083, + "loss_ce": 0.023332787677645683, + "loss_xval": 0.11865234375, + "num_input_tokens_seen": 27055936, + "step": 393 + }, + { + "epoch": 24.625, + "grad_norm": 73.49139601180414, + "learning_rate": 5e-05, + "loss": 0.0894, + "num_input_tokens_seen": 27115072, + "step": 394 + }, + { + "epoch": 24.625, + "loss": 0.10783809423446655, + "loss_ce": 0.026783406734466553, + "loss_xval": 
0.0810546875, + "num_input_tokens_seen": 27115072, + "step": 394 + }, + { + "epoch": 24.6875, + "grad_norm": 167.08639423336632, + "learning_rate": 5e-05, + "loss": 0.2457, + "num_input_tokens_seen": 27186624, + "step": 395 + }, + { + "epoch": 24.6875, + "loss": 0.24250024557113647, + "loss_ce": 0.022773688659071922, + "loss_xval": 0.2197265625, + "num_input_tokens_seen": 27186624, + "step": 395 + }, + { + "epoch": 24.75, + "grad_norm": 57.70197502018322, + "learning_rate": 5e-05, + "loss": 0.0598, + "num_input_tokens_seen": 27245696, + "step": 396 + }, + { + "epoch": 24.75, + "loss": 0.06535350531339645, + "loss_ce": 0.030929675325751305, + "loss_xval": 0.034423828125, + "num_input_tokens_seen": 27245696, + "step": 396 + }, + { + "epoch": 24.8125, + "grad_norm": 118.89098168669285, + "learning_rate": 5e-05, + "loss": 0.1317, + "num_input_tokens_seen": 27304768, + "step": 397 + }, + { + "epoch": 24.8125, + "loss": 0.125514954328537, + "loss_ce": 0.019069643691182137, + "loss_xval": 0.1064453125, + "num_input_tokens_seen": 27304768, + "step": 397 + }, + { + "epoch": 24.875, + "grad_norm": 169.94360101936388, + "learning_rate": 5e-05, + "loss": 0.2503, + "num_input_tokens_seen": 27376320, + "step": 398 + }, + { + "epoch": 24.875, + "loss": 0.2484622746706009, + "loss_ce": 0.02287633717060089, + "loss_xval": 0.2255859375, + "num_input_tokens_seen": 27376320, + "step": 398 + }, + { + "epoch": 24.9375, + "grad_norm": 63.0037046223998, + "learning_rate": 5e-05, + "loss": 0.0637, + "num_input_tokens_seen": 27435456, + "step": 399 + }, + { + "epoch": 24.9375, + "loss": 0.0754106268286705, + "loss_ce": 0.01901414431631565, + "loss_xval": 0.056396484375, + "num_input_tokens_seen": 27435456, + "step": 399 + }, + { + "epoch": 25.0, + "grad_norm": 83.98852289312464, + "learning_rate": 5e-05, + "loss": 0.0777, + "num_input_tokens_seen": 27507072, + "step": 400 + }, + { + "epoch": 25.0, + "loss": 0.07177116721868515, + "loss_ce": 0.014642258174717426, + "loss_xval": 0.05712890625, + "num_input_tokens_seen": 27507072, + "step": 400 + }, + { + "epoch": 25.0625, + "grad_norm": 96.74470968630307, + "learning_rate": 5e-05, + "loss": 0.0943, + "num_input_tokens_seen": 27578752, + "step": 401 + }, + { + "epoch": 25.0625, + "loss": 0.09034855663776398, + "loss_ce": 0.018571211025118828, + "loss_xval": 0.07177734375, + "num_input_tokens_seen": 27578752, + "step": 401 + }, + { + "epoch": 25.125, + "grad_norm": 28.095202197446476, + "learning_rate": 5e-05, + "loss": 0.0358, + "num_input_tokens_seen": 27650368, + "step": 402 + }, + { + "epoch": 25.125, + "loss": 0.034102149307727814, + "loss_ce": 0.018110936507582664, + "loss_xval": 0.0159912109375, + "num_input_tokens_seen": 27650368, + "step": 402 + }, + { + "epoch": 25.1875, + "grad_norm": 135.5835375799599, + "learning_rate": 5e-05, + "loss": 0.1754, + "num_input_tokens_seen": 27722048, + "step": 403 + }, + { + "epoch": 25.1875, + "loss": 0.17733702063560486, + "loss_ce": 0.015227638185024261, + "loss_xval": 0.162109375, + "num_input_tokens_seen": 27722048, + "step": 403 + }, + { + "epoch": 25.25, + "grad_norm": 82.8180389489728, + "learning_rate": 5e-05, + "loss": 0.0893, + "num_input_tokens_seen": 27768640, + "step": 404 + }, + { + "epoch": 25.25, + "loss": 0.09928497672080994, + "loss_ce": 0.022624818608164787, + "loss_xval": 0.07666015625, + "num_input_tokens_seen": 27768640, + "step": 404 + }, + { + "epoch": 25.3125, + "grad_norm": 84.89081123788657, + "learning_rate": 5e-05, + "loss": 0.0907, + "num_input_tokens_seen": 27840256, + "step": 405 + }, + { + 
"epoch": 25.3125, + "loss": 0.08249988406896591, + "loss_ce": 0.012675662524998188, + "loss_xval": 0.06982421875, + "num_input_tokens_seen": 27840256, + "step": 405 + }, + { + "epoch": 25.375, + "grad_norm": 155.09101131078668, + "learning_rate": 5e-05, + "loss": 0.237, + "num_input_tokens_seen": 27899328, + "step": 406 + }, + { + "epoch": 25.375, + "loss": 0.22836892306804657, + "loss_ce": 0.015478293411433697, + "loss_xval": 0.212890625, + "num_input_tokens_seen": 27899328, + "step": 406 + }, + { + "epoch": 25.4375, + "grad_norm": 46.687669868373305, + "learning_rate": 5e-05, + "loss": 0.0399, + "num_input_tokens_seen": 27958400, + "step": 407 + }, + { + "epoch": 25.4375, + "loss": 0.029898736625909805, + "loss_ce": 0.01378545444458723, + "loss_xval": 0.01611328125, + "num_input_tokens_seen": 27958400, + "step": 407 + }, + { + "epoch": 25.5, + "grad_norm": 106.1049029587732, + "learning_rate": 5e-05, + "loss": 0.1194, + "num_input_tokens_seen": 28030080, + "step": 408 + }, + { + "epoch": 25.5, + "loss": 0.10383109748363495, + "loss_ce": 0.012522506527602673, + "loss_xval": 0.09130859375, + "num_input_tokens_seen": 28030080, + "step": 408 + }, + { + "epoch": 25.5625, + "grad_norm": 96.26835574978439, + "learning_rate": 5e-05, + "loss": 0.1106, + "num_input_tokens_seen": 28101696, + "step": 409 + }, + { + "epoch": 25.5625, + "loss": 0.10796400904655457, + "loss_ce": 0.011284318752586842, + "loss_xval": 0.0966796875, + "num_input_tokens_seen": 28101696, + "step": 409 + }, + { + "epoch": 25.625, + "grad_norm": 70.72911312420754, + "learning_rate": 5e-05, + "loss": 0.0676, + "num_input_tokens_seen": 28173248, + "step": 410 + }, + { + "epoch": 25.625, + "loss": 0.06852303445339203, + "loss_ce": 0.013347254134714603, + "loss_xval": 0.05517578125, + "num_input_tokens_seen": 28173248, + "step": 410 + }, + { + "epoch": 25.6875, + "grad_norm": 142.01249737081872, + "learning_rate": 5e-05, + "loss": 0.1973, + "num_input_tokens_seen": 28232320, + "step": 411 + }, + { + "epoch": 25.6875, + "loss": 0.18970969319343567, + "loss_ce": 0.011975325644016266, + "loss_xval": 0.177734375, + "num_input_tokens_seen": 28232320, + "step": 411 + }, + { + "epoch": 25.75, + "grad_norm": 30.679026667393117, + "learning_rate": 5e-05, + "loss": 0.0309, + "num_input_tokens_seen": 28304000, + "step": 412 + }, + { + "epoch": 25.75, + "loss": 0.03227870911359787, + "loss_ce": 0.01360194943845272, + "loss_xval": 0.0186767578125, + "num_input_tokens_seen": 28304000, + "step": 412 + }, + { + "epoch": 25.8125, + "grad_norm": 107.5003547141714, + "learning_rate": 5e-05, + "loss": 0.1255, + "num_input_tokens_seen": 28363136, + "step": 413 + }, + { + "epoch": 25.8125, + "loss": 0.13056893646717072, + "loss_ce": 0.010451752692461014, + "loss_xval": 0.1201171875, + "num_input_tokens_seen": 28363136, + "step": 413 + }, + { + "epoch": 25.875, + "grad_norm": 108.86184718629066, + "learning_rate": 5e-05, + "loss": 0.122, + "num_input_tokens_seen": 28434816, + "step": 414 + }, + { + "epoch": 25.875, + "loss": 0.11903096735477448, + "loss_ce": 0.0130739351734519, + "loss_xval": 0.10595703125, + "num_input_tokens_seen": 28434816, + "step": 414 + }, + { + "epoch": 25.9375, + "grad_norm": 26.118808281545533, + "learning_rate": 5e-05, + "loss": 0.0236, + "num_input_tokens_seen": 28493824, + "step": 415 + }, + { + "epoch": 25.9375, + "loss": 0.02844841778278351, + "loss_ce": 0.008184745907783508, + "loss_xval": 0.020263671875, + "num_input_tokens_seen": 28493824, + "step": 415 + }, + { + "epoch": 26.0, + "grad_norm": 128.17331848976858, + 
"learning_rate": 5e-05, + "loss": 0.1698, + "num_input_tokens_seen": 28565568, + "step": 416 + }, + { + "epoch": 26.0, + "loss": 0.1835230439901352, + "loss_ce": 0.01164803933352232, + "loss_xval": 0.171875, + "num_input_tokens_seen": 28565568, + "step": 416 + }, + { + "epoch": 26.0625, + "grad_norm": 77.50573567485803, + "learning_rate": 5e-05, + "loss": 0.0784, + "num_input_tokens_seen": 28637248, + "step": 417 + }, + { + "epoch": 26.0625, + "loss": 0.07487931102514267, + "loss_ce": 0.00847306102514267, + "loss_xval": 0.06640625, + "num_input_tokens_seen": 28637248, + "step": 417 + }, + { + "epoch": 26.125, + "grad_norm": 46.99801436201133, + "learning_rate": 5e-05, + "loss": 0.0385, + "num_input_tokens_seen": 28708800, + "step": 418 + }, + { + "epoch": 26.125, + "loss": 0.02982092648744583, + "loss_ce": 0.010411746799945831, + "loss_xval": 0.0194091796875, + "num_input_tokens_seen": 28708800, + "step": 418 + }, + { + "epoch": 26.1875, + "grad_norm": 102.79239572865913, + "learning_rate": 5e-05, + "loss": 0.1212, + "num_input_tokens_seen": 28780480, + "step": 419 + }, + { + "epoch": 26.1875, + "loss": 0.12018196284770966, + "loss_ce": 0.00738899689167738, + "loss_xval": 0.11279296875, + "num_input_tokens_seen": 28780480, + "step": 419 + }, + { + "epoch": 26.25, + "grad_norm": 37.28295120694252, + "learning_rate": 5e-05, + "loss": 0.0298, + "num_input_tokens_seen": 28852032, + "step": 420 + }, + { + "epoch": 26.25, + "loss": 0.033372994512319565, + "loss_ce": 0.0076161595061421394, + "loss_xval": 0.0257568359375, + "num_input_tokens_seen": 28852032, + "step": 420 + }, + { + "epoch": 26.3125, + "grad_norm": 62.84732112776997, + "learning_rate": 5e-05, + "loss": 0.0532, + "num_input_tokens_seen": 28911296, + "step": 421 + }, + { + "epoch": 26.3125, + "loss": 0.04969404637813568, + "loss_ce": 0.00892256386578083, + "loss_xval": 0.040771484375, + "num_input_tokens_seen": 28911296, + "step": 421 + }, + { + "epoch": 26.375, + "grad_norm": 63.9094773158963, + "learning_rate": 5e-05, + "loss": 0.0522, + "num_input_tokens_seen": 28982912, + "step": 422 + }, + { + "epoch": 26.375, + "loss": 0.0577460378408432, + "loss_ce": 0.006476507056504488, + "loss_xval": 0.05126953125, + "num_input_tokens_seen": 28982912, + "step": 422 + }, + { + "epoch": 26.4375, + "grad_norm": 13.533038052106159, + "learning_rate": 5e-05, + "loss": 0.0189, + "num_input_tokens_seen": 29054464, + "step": 423 + }, + { + "epoch": 26.4375, + "loss": 0.020495440810918808, + "loss_ce": 0.007678058464080095, + "loss_xval": 0.0128173828125, + "num_input_tokens_seen": 29054464, + "step": 423 + }, + { + "epoch": 26.5, + "grad_norm": 73.36663422996799, + "learning_rate": 5e-05, + "loss": 0.078, + "num_input_tokens_seen": 29113664, + "step": 424 + }, + { + "epoch": 26.5, + "loss": 0.06784259527921677, + "loss_ce": 0.0077840001322329044, + "loss_xval": 0.06005859375, + "num_input_tokens_seen": 29113664, + "step": 424 + }, + { + "epoch": 26.5625, + "grad_norm": 73.33448547583748, + "learning_rate": 5e-05, + "loss": 0.0695, + "num_input_tokens_seen": 29185472, + "step": 425 + }, + { + "epoch": 26.5625, + "loss": 0.07413546741008759, + "loss_ce": 0.0072409361600875854, + "loss_xval": 0.06689453125, + "num_input_tokens_seen": 29185472, + "step": 425 + }, + { + "epoch": 26.625, + "grad_norm": 5.3644065133097625, + "learning_rate": 5e-05, + "loss": 0.0141, + "num_input_tokens_seen": 29244672, + "step": 426 + }, + { + "epoch": 26.625, + "loss": 0.01347382366657257, + "loss_ce": 0.006485298741608858, + "loss_xval": 0.006988525390625, + 
"num_input_tokens_seen": 29244672, + "step": 426 + }, + { + "epoch": 26.6875, + "grad_norm": 72.55059146303088, + "learning_rate": 5e-05, + "loss": 0.0645, + "num_input_tokens_seen": 29316288, + "step": 427 + }, + { + "epoch": 26.6875, + "loss": 0.07400896400213242, + "loss_ce": 0.006626151502132416, + "loss_xval": 0.0673828125, + "num_input_tokens_seen": 29316288, + "step": 427 + }, + { + "epoch": 26.75, + "grad_norm": 70.20787953586438, + "learning_rate": 5e-05, + "loss": 0.064, + "num_input_tokens_seen": 29387840, + "step": 428 + }, + { + "epoch": 26.75, + "loss": 0.06021607294678688, + "loss_ce": 0.0069934166967868805, + "loss_xval": 0.05322265625, + "num_input_tokens_seen": 29387840, + "step": 428 + }, + { + "epoch": 26.8125, + "grad_norm": 11.058123030300388, + "learning_rate": 5e-05, + "loss": 0.0194, + "num_input_tokens_seen": 29459584, + "step": 429 + }, + { + "epoch": 26.8125, + "loss": 0.02217293716967106, + "loss_ce": 0.006059656385332346, + "loss_xval": 0.01611328125, + "num_input_tokens_seen": 29459584, + "step": 429 + }, + { + "epoch": 26.875, + "grad_norm": 47.16874607456602, + "learning_rate": 5e-05, + "loss": 0.0409, + "num_input_tokens_seen": 29518720, + "step": 430 + }, + { + "epoch": 26.875, + "loss": 0.0351792611181736, + "loss_ce": 0.0041734022088348866, + "loss_xval": 0.031005859375, + "num_input_tokens_seen": 29518720, + "step": 430 + }, + { + "epoch": 26.9375, + "grad_norm": 52.91000873684464, + "learning_rate": 5e-05, + "loss": 0.0371, + "num_input_tokens_seen": 29590400, + "step": 431 + }, + { + "epoch": 26.9375, + "loss": 0.036790959537029266, + "loss_ce": 0.004076115787029266, + "loss_xval": 0.03271484375, + "num_input_tokens_seen": 29590400, + "step": 431 + }, + { + "epoch": 27.0, + "grad_norm": 5.600692805884, + "learning_rate": 5e-05, + "loss": 0.0128, + "num_input_tokens_seen": 29661952, + "step": 432 + }, + { + "epoch": 27.0, + "loss": 0.013566261157393456, + "loss_ce": 0.007004982326179743, + "loss_xval": 0.006561279296875, + "num_input_tokens_seen": 29661952, + "step": 432 + }, + { + "epoch": 27.0625, + "grad_norm": 48.03720973950958, + "learning_rate": 5e-05, + "loss": 0.0387, + "num_input_tokens_seen": 29733504, + "step": 433 + }, + { + "epoch": 27.0625, + "loss": 0.045741401612758636, + "loss_ce": 0.006923042703419924, + "loss_xval": 0.038818359375, + "num_input_tokens_seen": 29733504, + "step": 433 + }, + { + "epoch": 27.125, + "grad_norm": 44.271188473177446, + "learning_rate": 5e-05, + "loss": 0.0336, + "num_input_tokens_seen": 29805248, + "step": 434 + }, + { + "epoch": 27.125, + "loss": 0.028378482908010483, + "loss_ce": 0.004696843214333057, + "loss_xval": 0.023681640625, + "num_input_tokens_seen": 29805248, + "step": 434 + }, + { + "epoch": 27.1875, + "grad_norm": 2.5845658910605147, + "learning_rate": 5e-05, + "loss": 0.0095, + "num_input_tokens_seen": 29876992, + "step": 435 + }, + { + "epoch": 27.1875, + "loss": 0.009095221757888794, + "loss_ce": 0.004914313089102507, + "loss_xval": 0.004180908203125, + "num_input_tokens_seen": 29876992, + "step": 435 + }, + { + "epoch": 27.25, + "grad_norm": 54.92103062539981, + "learning_rate": 5e-05, + "loss": 0.0417, + "num_input_tokens_seen": 29948544, + "step": 436 + }, + { + "epoch": 27.25, + "loss": 0.03424233943223953, + "loss_ce": 0.0042130411602556705, + "loss_xval": 0.030029296875, + "num_input_tokens_seen": 29948544, + "step": 436 + }, + { + "epoch": 27.3125, + "grad_norm": 62.51473236387241, + "learning_rate": 5e-05, + "loss": 0.0471, + "num_input_tokens_seen": 30020160, + "step": 437 + }, + { 
+ "epoch": 27.3125, + "loss": 0.051075927913188934, + "loss_ce": 0.004689209628850222, + "loss_xval": 0.04638671875, + "num_input_tokens_seen": 30020160, + "step": 437 + }, + { + "epoch": 27.375, + "grad_norm": 9.014784016109035, + "learning_rate": 5e-05, + "loss": 0.012, + "num_input_tokens_seen": 30091840, + "step": 438 + }, + { + "epoch": 27.375, + "loss": 0.011384607292711735, + "loss_ce": 0.003968835808336735, + "loss_xval": 0.007415771484375, + "num_input_tokens_seen": 30091840, + "step": 438 + }, + { + "epoch": 27.4375, + "grad_norm": 44.83606998875092, + "learning_rate": 5e-05, + "loss": 0.0279, + "num_input_tokens_seen": 30151040, + "step": 439 + }, + { + "epoch": 27.4375, + "loss": 0.029954617843031883, + "loss_ce": 0.0046860636211931705, + "loss_xval": 0.0252685546875, + "num_input_tokens_seen": 30151040, + "step": 439 + }, + { + "epoch": 27.5, + "grad_norm": 54.64195232542024, + "learning_rate": 5e-05, + "loss": 0.0437, + "num_input_tokens_seen": 30222656, + "step": 440 + }, + { + "epoch": 27.5, + "loss": 0.046291086822748184, + "loss_ce": 0.004543041344732046, + "loss_xval": 0.041748046875, + "num_input_tokens_seen": 30222656, + "step": 440 + }, + { + "epoch": 27.5625, + "grad_norm": 29.63204149481485, + "learning_rate": 5e-05, + "loss": 0.0154, + "num_input_tokens_seen": 30294208, + "step": 441 + }, + { + "epoch": 27.5625, + "loss": 0.018965570256114006, + "loss_ce": 0.0038898871280252934, + "loss_xval": 0.01507568359375, + "num_input_tokens_seen": 30294208, + "step": 441 + }, + { + "epoch": 27.625, + "grad_norm": 12.016289337786652, + "learning_rate": 5e-05, + "loss": 0.0084, + "num_input_tokens_seen": 30353408, + "step": 442 + }, + { + "epoch": 27.625, + "loss": 0.008693516254425049, + "loss_ce": 0.002590001095086336, + "loss_xval": 0.006103515625, + "num_input_tokens_seen": 30353408, + "step": 442 + }, + { + "epoch": 27.6875, + "grad_norm": 49.116878889158016, + "learning_rate": 5e-05, + "loss": 0.0294, + "num_input_tokens_seen": 30412544, + "step": 443 + }, + { + "epoch": 27.6875, + "loss": 0.023649927228689194, + "loss_ce": 0.0030200453475117683, + "loss_xval": 0.0206298828125, + "num_input_tokens_seen": 30412544, + "step": 443 + }, + { + "epoch": 27.75, + "grad_norm": 59.35697128703133, + "learning_rate": 5e-05, + "loss": 0.0421, + "num_input_tokens_seen": 30484352, + "step": 444 + }, + { + "epoch": 27.75, + "loss": 0.03642663359642029, + "loss_ce": 0.00297936936840415, + "loss_xval": 0.033447265625, + "num_input_tokens_seen": 30484352, + "step": 444 + }, + { + "epoch": 27.8125, + "grad_norm": 41.49363581547399, + "learning_rate": 5e-05, + "loss": 0.0252, + "num_input_tokens_seen": 30555968, + "step": 445 + }, + { + "epoch": 27.8125, + "loss": 0.021612796932458878, + "loss_ce": 0.004156741313636303, + "loss_xval": 0.0174560546875, + "num_input_tokens_seen": 30555968, + "step": 445 + }, + { + "epoch": 27.875, + "grad_norm": 4.900475318391131, + "learning_rate": 5e-05, + "loss": 0.0079, + "num_input_tokens_seen": 30627648, + "step": 446 + }, + { + "epoch": 27.875, + "loss": 0.008801901713013649, + "loss_ce": 0.0037970193661749363, + "loss_xval": 0.0050048828125, + "num_input_tokens_seen": 30627648, + "step": 446 + }, + { + "epoch": 27.9375, + "grad_norm": 29.975252846709004, + "learning_rate": 5e-05, + "loss": 0.0153, + "num_input_tokens_seen": 30686848, + "step": 447 + }, + { + "epoch": 27.9375, + "loss": 0.012868822552263737, + "loss_ce": 0.003713548881933093, + "loss_xval": 0.0091552734375, + "num_input_tokens_seen": 30686848, + "step": 447 + }, + { + "epoch": 28.0, + 
"grad_norm": 38.18920800664839, + "learning_rate": 5e-05, + "loss": 0.0244, + "num_input_tokens_seen": 30758592, + "step": 448 + }, + { + "epoch": 28.0, + "loss": 0.025430506095290184, + "loss_ce": 0.002847498282790184, + "loss_xval": 0.0225830078125, + "num_input_tokens_seen": 30758592, + "step": 448 + }, + { + "epoch": 28.0625, + "grad_norm": 22.862525129157383, + "learning_rate": 5e-05, + "loss": 0.0111, + "num_input_tokens_seen": 30817728, + "step": 449 + }, + { + "epoch": 28.0625, + "loss": 0.011791002005338669, + "loss_ce": 0.0031850445084273815, + "loss_xval": 0.00860595703125, + "num_input_tokens_seen": 30817728, + "step": 449 + }, + { + "epoch": 28.125, + "grad_norm": 9.087768044891721, + "learning_rate": 5e-05, + "loss": 0.0081, + "num_input_tokens_seen": 30889280, + "step": 450 + }, + { + "epoch": 28.125, + "loss": 0.005972502753138542, + "loss_ce": 0.003103850409388542, + "loss_xval": 0.00286865234375, + "num_input_tokens_seen": 30889280, + "step": 450 + }, + { + "epoch": 28.1875, + "grad_norm": 43.22152195751561, + "learning_rate": 5e-05, + "loss": 0.0248, + "num_input_tokens_seen": 30960960, + "step": 451 + }, + { + "epoch": 28.1875, + "loss": 0.025609012693166733, + "loss_ce": 0.002659793710336089, + "loss_xval": 0.02294921875, + "num_input_tokens_seen": 30960960, + "step": 451 + }, + { + "epoch": 28.25, + "grad_norm": 56.557234628273605, + "learning_rate": 5e-05, + "loss": 0.0364, + "num_input_tokens_seen": 31032576, + "step": 452 + }, + { + "epoch": 28.25, + "loss": 0.03853260725736618, + "loss_ce": 0.003376358188688755, + "loss_xval": 0.03515625, + "num_input_tokens_seen": 31032576, + "step": 452 + }, + { + "epoch": 28.3125, + "grad_norm": 49.96014160420832, + "learning_rate": 5e-05, + "loss": 0.0308, + "num_input_tokens_seen": 31091584, + "step": 453 + }, + { + "epoch": 28.3125, + "loss": 0.031207242980599403, + "loss_ce": 0.0031310718040913343, + "loss_xval": 0.028076171875, + "num_input_tokens_seen": 31091584, + "step": 453 + }, + { + "epoch": 28.375, + "grad_norm": 38.69272253503625, + "learning_rate": 5e-05, + "loss": 0.0205, + "num_input_tokens_seen": 31163200, + "step": 454 + }, + { + "epoch": 28.375, + "loss": 0.022096609696745872, + "loss_ce": 0.002809500088915229, + "loss_xval": 0.019287109375, + "num_input_tokens_seen": 31163200, + "step": 454 + }, + { + "epoch": 28.4375, + "grad_norm": 30.798340846655005, + "learning_rate": 5e-05, + "loss": 0.0167, + "num_input_tokens_seen": 31234752, + "step": 455 + }, + { + "epoch": 28.4375, + "loss": 0.0183558352291584, + "loss_ce": 0.0032801516354084015, + "loss_xval": 0.01507568359375, + "num_input_tokens_seen": 31234752, + "step": 455 + }, + { + "epoch": 28.5, + "grad_norm": 30.85733710728687, + "learning_rate": 5e-05, + "loss": 0.0146, + "num_input_tokens_seen": 31306496, + "step": 456 + }, + { + "epoch": 28.5, + "loss": 0.016646770760416985, + "loss_ce": 0.0023645448964089155, + "loss_xval": 0.0142822265625, + "num_input_tokens_seen": 31306496, + "step": 456 + }, + { + "epoch": 28.5625, + "grad_norm": 26.032608242571797, + "learning_rate": 5e-05, + "loss": 0.0133, + "num_input_tokens_seen": 31365632, + "step": 457 + }, + { + "epoch": 28.5625, + "loss": 0.01072744745761156, + "loss_ce": 0.0023656312841922045, + "loss_xval": 0.00836181640625, + "num_input_tokens_seen": 31365632, + "step": 457 + }, + { + "epoch": 28.625, + "grad_norm": 15.7544578200061, + "learning_rate": 5e-05, + "loss": 0.009, + "num_input_tokens_seen": 31437248, + "step": 458 + }, + { + "epoch": 28.625, + "loss": 0.008288905955851078, + "loss_ce": 
0.0020633197855204344, + "loss_xval": 0.0062255859375, + "num_input_tokens_seen": 31437248, + "step": 458 + }, + { + "epoch": 28.6875, + "grad_norm": 11.918111014755505, + "learning_rate": 5e-05, + "loss": 0.0063, + "num_input_tokens_seen": 31508928, + "step": 459 + }, + { + "epoch": 28.6875, + "loss": 0.006751437671482563, + "loss_ce": 0.002845187671482563, + "loss_xval": 0.00390625, + "num_input_tokens_seen": 31508928, + "step": 459 + }, + { + "epoch": 28.75, + "grad_norm": 10.662391058200697, + "learning_rate": 5e-05, + "loss": 0.0061, + "num_input_tokens_seen": 31568064, + "step": 460 + }, + { + "epoch": 28.75, + "loss": 0.005551275797188282, + "loss_ce": 0.0018586487276479602, + "loss_xval": 0.003692626953125, + "num_input_tokens_seen": 31568064, + "step": 460 + }, + { + "epoch": 28.8125, + "grad_norm": 9.083266060823656, + "learning_rate": 5e-05, + "loss": 0.0063, + "num_input_tokens_seen": 31639744, + "step": 461 + }, + { + "epoch": 28.8125, + "loss": 0.007424565963447094, + "loss_ce": 0.0026333059649914503, + "loss_xval": 0.004791259765625, + "num_input_tokens_seen": 31639744, + "step": 461 + }, + { + "epoch": 28.875, + "grad_norm": 6.863640803613258, + "learning_rate": 5e-05, + "loss": 0.0051, + "num_input_tokens_seen": 31698880, + "step": 462 + }, + { + "epoch": 28.875, + "loss": 0.0041747791692614555, + "loss_ce": 0.0018859605770558119, + "loss_xval": 0.002288818359375, + "num_input_tokens_seen": 31698880, + "step": 462 + }, + { + "epoch": 28.9375, + "grad_norm": 2.9356043389315865, + "learning_rate": 5e-05, + "loss": 0.0058, + "num_input_tokens_seen": 31770560, + "step": 463 + }, + { + "epoch": 28.9375, + "loss": 0.005293050315231085, + "loss_ce": 0.0015699057839810848, + "loss_xval": 0.00372314453125, + "num_input_tokens_seen": 31770560, + "step": 463 + }, + { + "epoch": 29.0, + "grad_norm": 5.050925803376897, + "learning_rate": 5e-05, + "loss": 0.0067, + "num_input_tokens_seen": 31842112, + "step": 464 + }, + { + "epoch": 29.0, + "loss": 0.006694018840789795, + "loss_ce": 0.001750171068124473, + "loss_xval": 0.00494384765625, + "num_input_tokens_seen": 31842112, + "step": 464 + }, + { + "epoch": 29.0625, + "grad_norm": 22.310520898540933, + "learning_rate": 5e-05, + "loss": 0.0093, + "num_input_tokens_seen": 31913728, + "step": 465 + }, + { + "epoch": 29.0625, + "loss": 0.009156938642263412, + "loss_ce": 0.002259966218844056, + "loss_xval": 0.00689697265625, + "num_input_tokens_seen": 31913728, + "step": 465 + }, + { + "epoch": 29.125, + "grad_norm": 54.42750478439811, + "learning_rate": 5e-05, + "loss": 0.0321, + "num_input_tokens_seen": 31985344, + "step": 466 + }, + { + "epoch": 29.125, + "loss": 0.029004469513893127, + "loss_ce": 0.0019048596732318401, + "loss_xval": 0.027099609375, + "num_input_tokens_seen": 31985344, + "step": 466 + }, + { + "epoch": 29.1875, + "grad_norm": 89.64622913235212, + "learning_rate": 5e-05, + "loss": 0.0845, + "num_input_tokens_seen": 32057024, + "step": 467 + }, + { + "epoch": 29.1875, + "loss": 0.08670613914728165, + "loss_ce": 0.0017451988533139229, + "loss_xval": 0.0849609375, + "num_input_tokens_seen": 32057024, + "step": 467 + }, + { + "epoch": 29.25, + "grad_norm": 132.36809969049492, + "learning_rate": 5e-05, + "loss": 0.1769, + "num_input_tokens_seen": 32128768, + "step": 468 + }, + { + "epoch": 29.25, + "loss": 0.17079618573188782, + "loss_ce": 0.0018508683424443007, + "loss_xval": 0.1689453125, + "num_input_tokens_seen": 32128768, + "step": 468 + }, + { + "epoch": 29.3125, + "grad_norm": 198.67521722432534, + "learning_rate": 
5e-05, + "loss": 0.3852, + "num_input_tokens_seen": 32200320, + "step": 469 + }, + { + "epoch": 29.3125, + "loss": 0.3854331970214844, + "loss_ce": 0.002620699815452099, + "loss_xval": 0.3828125, + "num_input_tokens_seen": 32200320, + "step": 469 + }, + { + "epoch": 29.375, + "grad_norm": 276.7772498319319, + "learning_rate": 5e-05, + "loss": 0.7342, + "num_input_tokens_seen": 32272064, + "step": 470 + }, + { + "epoch": 29.375, + "loss": 0.7204177379608154, + "loss_ce": 0.0016677571693435311, + "loss_xval": 0.71875, + "num_input_tokens_seen": 32272064, + "step": 470 + }, + { + "epoch": 29.4375, + "grad_norm": 323.3059130938597, + "learning_rate": 5e-05, + "loss": 1.1, + "num_input_tokens_seen": 32343680, + "step": 471 + }, + { + "epoch": 29.4375, + "loss": 1.1041285991668701, + "loss_ce": 0.0025661007966846228, + "loss_xval": 1.1015625, + "num_input_tokens_seen": 32343680, + "step": 471 + }, + { + "epoch": 29.5, + "grad_norm": 247.84388629445735, + "learning_rate": 5e-05, + "loss": 0.7679, + "num_input_tokens_seen": 32415296, + "step": 472 + }, + { + "epoch": 29.5, + "loss": 0.7610682845115662, + "loss_ce": 0.011068296618759632, + "loss_xval": 0.75, + "num_input_tokens_seen": 32415296, + "step": 472 + }, + { + "epoch": 29.5625, + "grad_norm": 79.33459788940463, + "learning_rate": 5e-05, + "loss": 0.1329, + "num_input_tokens_seen": 32486976, + "step": 473 + }, + { + "epoch": 29.5625, + "loss": 0.14590966701507568, + "loss_ce": 0.01700342260301113, + "loss_xval": 0.12890625, + "num_input_tokens_seen": 32486976, + "step": 473 + }, + { + "epoch": 29.625, + "grad_norm": 92.30159962135761, + "learning_rate": 5e-05, + "loss": 0.1728, + "num_input_tokens_seen": 32546112, + "step": 474 + }, + { + "epoch": 29.625, + "loss": 0.1546037346124649, + "loss_ce": 0.020814666524529457, + "loss_xval": 0.1337890625, + "num_input_tokens_seen": 32546112, + "step": 474 + }, + { + "epoch": 29.6875, + "grad_norm": 206.2393649928487, + "learning_rate": 5e-05, + "loss": 0.6537, + "num_input_tokens_seen": 32617792, + "step": 475 + }, + { + "epoch": 29.6875, + "loss": 0.6552282571792603, + "loss_ce": 0.022415766492486, + "loss_xval": 0.6328125, + "num_input_tokens_seen": 32617792, + "step": 475 + }, + { + "epoch": 29.75, + "grad_norm": 174.12080752050923, + "learning_rate": 5e-05, + "loss": 0.4548, + "num_input_tokens_seen": 32689344, + "step": 476 + }, + { + "epoch": 29.75, + "loss": 0.4094059467315674, + "loss_ce": 0.018780935555696487, + "loss_xval": 0.390625, + "num_input_tokens_seen": 32689344, + "step": 476 + }, + { + "epoch": 29.8125, + "grad_norm": 16.614748602973496, + "learning_rate": 5e-05, + "loss": 0.0486, + "num_input_tokens_seen": 32761024, + "step": 477 + }, + { + "epoch": 29.8125, + "loss": 0.05317946523427963, + "loss_ce": 0.010210715234279633, + "loss_xval": 0.04296875, + "num_input_tokens_seen": 32761024, + "step": 477 + }, + { + "epoch": 29.875, + "grad_norm": 155.42975100857723, + "learning_rate": 5e-05, + "loss": 0.2962, + "num_input_tokens_seen": 32820224, + "step": 478 + }, + { + "epoch": 29.875, + "loss": 0.3082675635814667, + "loss_ce": 0.011392563581466675, + "loss_xval": 0.296875, + "num_input_tokens_seen": 32820224, + "step": 478 + }, + { + "epoch": 29.9375, + "grad_norm": 175.08209465974755, + "learning_rate": 5e-05, + "loss": 0.4032, + "num_input_tokens_seen": 32891904, + "step": 479 + }, + { + "epoch": 29.9375, + "loss": 0.37343543767929077, + "loss_ce": 0.008201051503419876, + "loss_xval": 0.365234375, + "num_input_tokens_seen": 32891904, + "step": 479 + }, + { + "epoch": 30.0, + 
"grad_norm": 72.59815396689685, + "learning_rate": 5e-05, + "loss": 0.0902, + "num_input_tokens_seen": 32963520, + "step": 480 + }, + { + "epoch": 30.0, + "loss": 0.08620818704366684, + "loss_ce": 0.01198943704366684, + "loss_xval": 0.07421875, + "num_input_tokens_seen": 32963520, + "step": 480 + }, + { + "epoch": 30.0625, + "grad_norm": 84.90202896547324, + "learning_rate": 5e-05, + "loss": 0.1233, + "num_input_tokens_seen": 33035072, + "step": 481 + }, + { + "epoch": 30.0625, + "loss": 0.11216428875923157, + "loss_ce": 0.02671506628394127, + "loss_xval": 0.08544921875, + "num_input_tokens_seen": 33035072, + "step": 481 + }, + { + "epoch": 30.125, + "grad_norm": 185.0124554464278, + "learning_rate": 5e-05, + "loss": 0.4448, + "num_input_tokens_seen": 33094144, + "step": 482 + }, + { + "epoch": 30.125, + "loss": 0.4436917006969452, + "loss_ce": 0.05306670442223549, + "loss_xval": 0.390625, + "num_input_tokens_seen": 33094144, + "step": 482 + }, + { + "epoch": 30.1875, + "grad_norm": 160.28780193792343, + "learning_rate": 5e-05, + "loss": 0.3597, + "num_input_tokens_seen": 33165760, + "step": 483 + }, + { + "epoch": 30.1875, + "loss": 0.40504151582717896, + "loss_ce": 0.03590088710188866, + "loss_xval": 0.369140625, + "num_input_tokens_seen": 33165760, + "step": 483 + }, + { + "epoch": 30.25, + "grad_norm": 24.348194390421313, + "learning_rate": 5e-05, + "loss": 0.0453, + "num_input_tokens_seen": 33237376, + "step": 484 + }, + { + "epoch": 30.25, + "loss": 0.04609951376914978, + "loss_ce": 0.01948818564414978, + "loss_xval": 0.026611328125, + "num_input_tokens_seen": 33237376, + "step": 484 + }, + { + "epoch": 30.3125, + "grad_norm": 115.32401946114567, + "learning_rate": 5e-05, + "loss": 0.2025, + "num_input_tokens_seen": 33309056, + "step": 485 + }, + { + "epoch": 30.3125, + "loss": 0.1775987148284912, + "loss_ce": 0.02037215791642666, + "loss_xval": 0.1572265625, + "num_input_tokens_seen": 33309056, + "step": 485 + }, + { + "epoch": 30.375, + "grad_norm": 129.3704814022795, + "learning_rate": 5e-05, + "loss": 0.2717, + "num_input_tokens_seen": 33380736, + "step": 486 + }, + { + "epoch": 30.375, + "loss": 0.27861320972442627, + "loss_ce": 0.026660069823265076, + "loss_xval": 0.251953125, + "num_input_tokens_seen": 33380736, + "step": 486 + }, + { + "epoch": 30.4375, + "grad_norm": 18.90543693800268, + "learning_rate": 5e-05, + "loss": 0.0492, + "num_input_tokens_seen": 33452352, + "step": 487 + }, + { + "epoch": 30.4375, + "loss": 0.05479831621050835, + "loss_ce": 0.018177222460508347, + "loss_xval": 0.03662109375, + "num_input_tokens_seen": 33452352, + "step": 487 + }, + { + "epoch": 30.5, + "grad_norm": 98.72747960845389, + "learning_rate": 5e-05, + "loss": 0.1848, + "num_input_tokens_seen": 33524032, + "step": 488 + }, + { + "epoch": 30.5, + "loss": 0.1754639744758606, + "loss_ce": 0.020190536975860596, + "loss_xval": 0.1552734375, + "num_input_tokens_seen": 33524032, + "step": 488 + }, + { + "epoch": 30.5625, + "grad_norm": 108.46431537751594, + "learning_rate": 5e-05, + "loss": 0.244, + "num_input_tokens_seen": 33595648, + "step": 489 + }, + { + "epoch": 30.5625, + "loss": 0.24445466697216034, + "loss_ce": 0.019845297560095787, + "loss_xval": 0.224609375, + "num_input_tokens_seen": 33595648, + "step": 489 + }, + { + "epoch": 30.625, + "grad_norm": 16.58390160532647, + "learning_rate": 5e-05, + "loss": 0.0316, + "num_input_tokens_seen": 33667328, + "step": 490 + }, + { + "epoch": 30.625, + "loss": 0.027875222265720367, + "loss_ce": 0.015485084615647793, + "loss_xval": 
0.01239013671875, + "num_input_tokens_seen": 33667328, + "step": 490 + }, + { + "epoch": 30.6875, + "grad_norm": 89.30391448559037, + "learning_rate": 5e-05, + "loss": 0.1477, + "num_input_tokens_seen": 33738944, + "step": 491 + }, + { + "epoch": 30.6875, + "loss": 0.14867781102657318, + "loss_ce": 0.01684187538921833, + "loss_xval": 0.1318359375, + "num_input_tokens_seen": 33738944, + "step": 491 + }, + { + "epoch": 30.75, + "grad_norm": 76.51561580318129, + "learning_rate": 5e-05, + "loss": 0.105, + "num_input_tokens_seen": 33810688, + "step": 492 + }, + { + "epoch": 30.75, + "loss": 0.09136239439249039, + "loss_ce": 0.016655363142490387, + "loss_xval": 0.07470703125, + "num_input_tokens_seen": 33810688, + "step": 492 + }, + { + "epoch": 30.8125, + "grad_norm": 47.40846111493751, + "learning_rate": 5e-05, + "loss": 0.0548, + "num_input_tokens_seen": 33882304, + "step": 493 + }, + { + "epoch": 30.8125, + "loss": 0.051707200706005096, + "loss_ce": 0.015818530693650246, + "loss_xval": 0.035888671875, + "num_input_tokens_seen": 33882304, + "step": 493 + }, + { + "epoch": 30.875, + "grad_norm": 116.85312574162306, + "learning_rate": 5e-05, + "loss": 0.2196, + "num_input_tokens_seen": 33953856, + "step": 494 + }, + { + "epoch": 30.875, + "loss": 0.21689002215862274, + "loss_ce": 0.012788456864655018, + "loss_xval": 0.2041015625, + "num_input_tokens_seen": 33953856, + "step": 494 + }, + { + "epoch": 30.9375, + "grad_norm": 33.56625863849918, + "learning_rate": 5e-05, + "loss": 0.066, + "num_input_tokens_seen": 34025536, + "step": 495 + }, + { + "epoch": 30.9375, + "loss": 0.07113610208034515, + "loss_ce": 0.016936881467700005, + "loss_xval": 0.05419921875, + "num_input_tokens_seen": 34025536, + "step": 495 + }, + { + "epoch": 31.0, + "grad_norm": 89.30015482102085, + "learning_rate": 5e-05, + "loss": 0.1394, + "num_input_tokens_seen": 34097280, + "step": 496 + }, + { + "epoch": 31.0, + "loss": 0.1428450495004654, + "loss_ce": 0.011985674500465393, + "loss_xval": 0.130859375, + "num_input_tokens_seen": 34097280, + "step": 496 + }, + { + "epoch": 31.0625, + "grad_norm": 111.67380604143827, + "learning_rate": 5e-05, + "loss": 0.1921, + "num_input_tokens_seen": 34156352, + "step": 497 + }, + { + "epoch": 31.0625, + "loss": 0.19294708967208862, + "loss_ce": 0.01032990776002407, + "loss_xval": 0.1826171875, + "num_input_tokens_seen": 34156352, + "step": 497 + }, + { + "epoch": 31.125, + "grad_norm": 15.672979639453287, + "learning_rate": 5e-05, + "loss": 0.04, + "num_input_tokens_seen": 34228096, + "step": 498 + }, + { + "epoch": 31.125, + "loss": 0.04462538659572601, + "loss_ce": 0.010201560333371162, + "loss_xval": 0.034423828125, + "num_input_tokens_seen": 34228096, + "step": 498 + }, + { + "epoch": 31.1875, + "grad_norm": 101.23147864018928, + "learning_rate": 5e-05, + "loss": 0.1685, + "num_input_tokens_seen": 34299648, + "step": 499 + }, + { + "epoch": 31.1875, + "loss": 0.14524954557418823, + "loss_ce": 0.007554229814559221, + "loss_xval": 0.1376953125, + "num_input_tokens_seen": 34299648, + "step": 499 + }, + { + "epoch": 31.25, + "grad_norm": 79.7314578785406, + "learning_rate": 5e-05, + "loss": 0.1155, + "num_input_tokens_seen": 34371456, + "step": 500 + }, + { + "epoch": 31.25, + "eval_synth_IoU": 0.00709187425673008, + "eval_synth_MAE_x": 0.183349609375, + "eval_synth_MAE_y": 0.22174072265625, + "eval_synth_NUM_probability": 0.9508986622095108, + "eval_synth_inside_bbox": 0.0, + "eval_synth_loss": 0.03743727132678032, + "eval_synth_loss_ce": 0.0061567522352561355, + 
"eval_synth_loss_xval": 0.031280517578125, + "eval_synth_runtime": 60.0762, + "eval_synth_samples_per_second": 2.131, + "eval_synth_steps_per_second": 0.067, + "num_input_tokens_seen": 34371456, + "step": 500 + }, + { + "epoch": 31.25, + "loss": 0.038767192512750626, + "loss_ce": 0.004831646103411913, + "loss_xval": 0.033935546875, + "num_input_tokens_seen": 34371456, + "step": 500 + }, + { + "epoch": 31.3125, + "grad_norm": 33.35997221627081, + "learning_rate": 5e-05, + "loss": 0.0416, + "num_input_tokens_seen": 34443136, + "step": 501 + }, + { + "epoch": 31.3125, + "loss": 0.03074466995894909, + "loss_ce": 0.006330606993287802, + "loss_xval": 0.0244140625, + "num_input_tokens_seen": 34443136, + "step": 501 + }, + { + "epoch": 31.375, + "grad_norm": 101.3020414172374, + "learning_rate": 5e-05, + "loss": 0.1878, + "num_input_tokens_seen": 34514688, + "step": 502 + }, + { + "epoch": 31.375, + "loss": 0.18065997958183289, + "loss_ce": 0.0039021666161715984, + "loss_xval": 0.1767578125, + "num_input_tokens_seen": 34514688, + "step": 502 + }, + { + "epoch": 31.4375, + "grad_norm": 58.21370222284822, + "learning_rate": 5e-05, + "loss": 0.0698, + "num_input_tokens_seen": 34586432, + "step": 503 + }, + { + "epoch": 31.4375, + "loss": 0.0838276594877243, + "loss_ce": 0.004237812012434006, + "loss_xval": 0.07958984375, + "num_input_tokens_seen": 34586432, + "step": 503 + }, + { + "epoch": 31.5, + "grad_norm": 41.29086127352125, + "learning_rate": 5e-05, + "loss": 0.0441, + "num_input_tokens_seen": 34658048, + "step": 504 + }, + { + "epoch": 31.5, + "loss": 0.03785308822989464, + "loss_ce": 0.005138244479894638, + "loss_xval": 0.03271484375, + "num_input_tokens_seen": 34658048, + "step": 504 + }, + { + "epoch": 31.5625, + "grad_norm": 85.45688443812506, + "learning_rate": 5e-05, + "loss": 0.1334, + "num_input_tokens_seen": 34717184, + "step": 505 + }, + { + "epoch": 31.5625, + "loss": 0.14185205101966858, + "loss_ce": 0.006109867710620165, + "loss_xval": 0.1357421875, + "num_input_tokens_seen": 34717184, + "step": 505 + }, + { + "epoch": 31.625, + "grad_norm": 32.42188566199593, + "learning_rate": 5e-05, + "loss": 0.0356, + "num_input_tokens_seen": 34788736, + "step": 506 + }, + { + "epoch": 31.625, + "loss": 0.03629305213689804, + "loss_ce": 0.005287191364914179, + "loss_xval": 0.031005859375, + "num_input_tokens_seen": 34788736, + "step": 506 + }, + { + "epoch": 31.6875, + "grad_norm": 49.39589605562242, + "learning_rate": 5e-05, + "loss": 0.0519, + "num_input_tokens_seen": 34860352, + "step": 507 + }, + { + "epoch": 31.6875, + "loss": 0.05294110253453255, + "loss_ce": 0.0028922753408551216, + "loss_xval": 0.050048828125, + "num_input_tokens_seen": 34860352, + "step": 507 + }, + { + "epoch": 31.75, + "grad_norm": 75.95056302700068, + "learning_rate": 5e-05, + "loss": 0.1077, + "num_input_tokens_seen": 34931968, + "step": 508 + }, + { + "epoch": 31.75, + "loss": 0.11320182681083679, + "loss_ce": 0.0033385478891432285, + "loss_xval": 0.10986328125, + "num_input_tokens_seen": 34931968, + "step": 508 + }, + { + "epoch": 31.8125, + "grad_norm": 19.239734144914703, + "learning_rate": 5e-05, + "loss": 0.016, + "num_input_tokens_seen": 35003520, + "step": 509 + }, + { + "epoch": 31.8125, + "loss": 0.01861836388707161, + "loss_ce": 0.002383012091740966, + "loss_xval": 0.0162353515625, + "num_input_tokens_seen": 35003520, + "step": 509 + }, + { + "epoch": 31.875, + "grad_norm": 63.359210101062615, + "learning_rate": 5e-05, + "loss": 0.0702, + "num_input_tokens_seen": 35075200, + "step": 510 + }, + { + 
"epoch": 31.875, + "loss": 0.06541571021080017, + "loss_ce": 0.0024274298921227455, + "loss_xval": 0.06298828125, + "num_input_tokens_seen": 35075200, + "step": 510 + }, + { + "epoch": 31.9375, + "grad_norm": 72.4722047412447, + "learning_rate": 5e-05, + "loss": 0.0872, + "num_input_tokens_seen": 35146944, + "step": 511 + }, + { + "epoch": 31.9375, + "loss": 0.09188695251941681, + "loss_ce": 0.0025314840022474527, + "loss_xval": 0.08935546875, + "num_input_tokens_seen": 35146944, + "step": 511 + }, + { + "epoch": 32.0, + "grad_norm": 3.152057996325055, + "learning_rate": 5e-05, + "loss": 0.0102, + "num_input_tokens_seen": 35218560, + "step": 512 + }, + { + "epoch": 32.0, + "loss": 0.009740998968482018, + "loss_ce": 0.002325227949768305, + "loss_xval": 0.007415771484375, + "num_input_tokens_seen": 35218560, + "step": 512 + }, + { + "epoch": 32.0625, + "grad_norm": 64.23363309997048, + "learning_rate": 5e-05, + "loss": 0.0727, + "num_input_tokens_seen": 35290112, + "step": 513 + }, + { + "epoch": 32.0625, + "loss": 0.07529156655073166, + "loss_ce": 0.002049379050731659, + "loss_xval": 0.0732421875, + "num_input_tokens_seen": 35290112, + "step": 513 + }, + { + "epoch": 32.125, + "grad_norm": 60.79170962146762, + "learning_rate": 5e-05, + "loss": 0.0633, + "num_input_tokens_seen": 35361856, + "step": 514 + }, + { + "epoch": 32.125, + "loss": 0.062045708298683167, + "loss_ce": 0.002231256105005741, + "loss_xval": 0.059814453125, + "num_input_tokens_seen": 35361856, + "step": 514 + }, + { + "epoch": 32.1875, + "grad_norm": 3.960201489811377, + "learning_rate": 5e-05, + "loss": 0.0083, + "num_input_tokens_seen": 35433472, + "step": 515 + }, + { + "epoch": 32.1875, + "loss": 0.008324277587234974, + "loss_ce": 0.0019461041083559394, + "loss_xval": 0.006378173828125, + "num_input_tokens_seen": 35433472, + "step": 515 + }, + { + "epoch": 32.25, + "grad_norm": 51.90847741220123, + "learning_rate": 5e-05, + "loss": 0.05, + "num_input_tokens_seen": 35505088, + "step": 516 + }, + { + "epoch": 32.25, + "loss": 0.047266796231269836, + "loss_ce": 0.002344920299947262, + "loss_xval": 0.044921875, + "num_input_tokens_seen": 35505088, + "step": 516 + }, + { + "epoch": 32.3125, + "grad_norm": 33.40482968325523, + "learning_rate": 5e-05, + "loss": 0.0274, + "num_input_tokens_seen": 35576640, + "step": 517 + }, + { + "epoch": 32.3125, + "loss": 0.028853023424744606, + "loss_ce": 0.0019975542090833187, + "loss_xval": 0.02685546875, + "num_input_tokens_seen": 35576640, + "step": 517 + }, + { + "epoch": 32.375, + "grad_norm": 19.400593021580494, + "learning_rate": 5e-05, + "loss": 0.0125, + "num_input_tokens_seen": 35648192, + "step": 518 + }, + { + "epoch": 32.375, + "loss": 0.013727393001317978, + "loss_ce": 0.002252783626317978, + "loss_xval": 0.011474609375, + "num_input_tokens_seen": 35648192, + "step": 518 + }, + { + "epoch": 32.4375, + "grad_norm": 46.919959828504645, + "learning_rate": 5e-05, + "loss": 0.0414, + "num_input_tokens_seen": 35719936, + "step": 519 + }, + { + "epoch": 32.4375, + "loss": 0.034388408064842224, + "loss_ce": 0.0019177051726728678, + "loss_xval": 0.032470703125, + "num_input_tokens_seen": 35719936, + "step": 519 + }, + { + "epoch": 32.5, + "grad_norm": 25.058903049742, + "learning_rate": 5e-05, + "loss": 0.0189, + "num_input_tokens_seen": 35791680, + "step": 520 + }, + { + "epoch": 32.5, + "loss": 0.017532430589199066, + "loss_ce": 0.00196846597827971, + "loss_xval": 0.01556396484375, + "num_input_tokens_seen": 35791680, + "step": 520 + }, + { + "epoch": 32.5625, + "grad_norm": 
13.897068822398493, + "learning_rate": 5e-05, + "loss": 0.0139, + "num_input_tokens_seen": 35863232, + "step": 521 + }, + { + "epoch": 32.5625, + "loss": 0.013923520222306252, + "loss_ce": 0.002876157173886895, + "loss_xval": 0.01104736328125, + "num_input_tokens_seen": 35863232, + "step": 521 + }, + { + "epoch": 32.625, + "grad_norm": 26.52398400048985, + "learning_rate": 5e-05, + "loss": 0.0198, + "num_input_tokens_seen": 35934784, + "step": 522 + }, + { + "epoch": 32.625, + "loss": 0.019185151904821396, + "loss_ce": 0.0023394490126520395, + "loss_xval": 0.016845703125, + "num_input_tokens_seen": 35934784, + "step": 522 + }, + { + "epoch": 32.6875, + "grad_norm": 18.375483995081037, + "learning_rate": 5e-05, + "loss": 0.0114, + "num_input_tokens_seen": 36006464, + "step": 523 + }, + { + "epoch": 32.6875, + "loss": 0.01426710095256567, + "loss_ce": 0.0019379997393116355, + "loss_xval": 0.0123291015625, + "num_input_tokens_seen": 36006464, + "step": 523 + }, + { + "epoch": 32.75, + "grad_norm": 6.104764631350985, + "learning_rate": 5e-05, + "loss": 0.0082, + "num_input_tokens_seen": 36078016, + "step": 524 + }, + { + "epoch": 32.75, + "loss": 0.006929485592991114, + "loss_ce": 0.0029622004367411137, + "loss_xval": 0.00396728515625, + "num_input_tokens_seen": 36078016, + "step": 524 + }, + { + "epoch": 32.8125, + "grad_norm": 3.738792946367771, + "learning_rate": 5e-05, + "loss": 0.0067, + "num_input_tokens_seen": 36149696, + "step": 525 + }, + { + "epoch": 32.8125, + "loss": 0.004459495190531015, + "loss_ce": 0.0013924787053838372, + "loss_xval": 0.0030670166015625, + "num_input_tokens_seen": 36149696, + "step": 525 + }, + { + "epoch": 32.875, + "grad_norm": 4.412211012903223, + "learning_rate": 5e-05, + "loss": 0.0054, + "num_input_tokens_seen": 36221312, + "step": 526 + }, + { + "epoch": 32.875, + "loss": 0.005546785891056061, + "loss_ce": 0.0018236414762213826, + "loss_xval": 0.00372314453125, + "num_input_tokens_seen": 36221312, + "step": 526 + }, + { + "epoch": 32.9375, + "grad_norm": 5.271021579259849, + "learning_rate": 5e-05, + "loss": 0.006, + "num_input_tokens_seen": 36280448, + "step": 527 + }, + { + "epoch": 32.9375, + "loss": 0.006776329595595598, + "loss_ce": 0.0017409290885552764, + "loss_xval": 0.005035400390625, + "num_input_tokens_seen": 36280448, + "step": 527 + }, + { + "epoch": 33.0, + "grad_norm": 8.210751475709698, + "learning_rate": 5e-05, + "loss": 0.0064, + "num_input_tokens_seen": 36352128, + "step": 528 + }, + { + "epoch": 33.0, + "loss": 0.005801305174827576, + "loss_ce": 0.0019866079092025757, + "loss_xval": 0.003814697265625, + "num_input_tokens_seen": 36352128, + "step": 528 + }, + { + "epoch": 33.0625, + "grad_norm": 9.888229864178133, + "learning_rate": 5e-05, + "loss": 0.0075, + "num_input_tokens_seen": 36411264, + "step": 529 + }, + { + "epoch": 33.0625, + "loss": 0.00706527940928936, + "loss_ce": 0.0013890096452087164, + "loss_xval": 0.00567626953125, + "num_input_tokens_seen": 36411264, + "step": 529 + }, + { + "epoch": 33.125, + "grad_norm": 14.21697984091777, + "learning_rate": 5e-05, + "loss": 0.0084, + "num_input_tokens_seen": 36483008, + "step": 530 + }, + { + "epoch": 33.125, + "loss": 0.008422967046499252, + "loss_ce": 0.0016175472410395741, + "loss_xval": 0.006805419921875, + "num_input_tokens_seen": 36483008, + "step": 530 + }, + { + "epoch": 33.1875, + "grad_norm": 11.435123159471466, + "learning_rate": 5e-05, + "loss": 0.0074, + "num_input_tokens_seen": 36542080, + "step": 531 + }, + { + "epoch": 33.1875, + "loss": 0.00780049292370677, + 
"loss_ce": 0.0013002488994970918, + "loss_xval": 0.006500244140625, + "num_input_tokens_seen": 36542080, + "step": 531 + }, + { + "epoch": 33.25, + "grad_norm": 2.0343152338608244, + "learning_rate": 5e-05, + "loss": 0.0051, + "num_input_tokens_seen": 36613696, + "step": 532 + }, + { + "epoch": 33.25, + "loss": 0.005734146572649479, + "loss_ce": 0.001064957003109157, + "loss_xval": 0.004669189453125, + "num_input_tokens_seen": 36613696, + "step": 532 + }, + { + "epoch": 33.3125, + "grad_norm": 5.0903071876766806, + "learning_rate": 5e-05, + "loss": 0.0053, + "num_input_tokens_seen": 36685312, + "step": 533 + }, + { + "epoch": 33.3125, + "loss": 0.004778821021318436, + "loss_ce": 0.001009899890050292, + "loss_xval": 0.0037689208984375, + "num_input_tokens_seen": 36685312, + "step": 533 + }, + { + "epoch": 33.375, + "grad_norm": 7.323274835426254, + "learning_rate": 5e-05, + "loss": 0.0051, + "num_input_tokens_seen": 36757056, + "step": 534 + }, + { + "epoch": 33.375, + "loss": 0.0067153251729905605, + "loss_ce": 0.0011916436487808824, + "loss_xval": 0.005523681640625, + "num_input_tokens_seen": 36757056, + "step": 534 + }, + { + "epoch": 33.4375, + "grad_norm": 8.30632649383284, + "learning_rate": 5e-05, + "loss": 0.0061, + "num_input_tokens_seen": 36816256, + "step": 535 + }, + { + "epoch": 33.4375, + "loss": 0.0058524045161902905, + "loss_ce": 0.0010916623286902905, + "loss_xval": 0.0047607421875, + "num_input_tokens_seen": 36816256, + "step": 535 + }, + { + "epoch": 33.5, + "grad_norm": 14.708122951546155, + "learning_rate": 5e-05, + "loss": 0.0068, + "num_input_tokens_seen": 36887936, + "step": 536 + }, + { + "epoch": 33.5, + "loss": 0.007070864085108042, + "loss_ce": 0.0012725242413580418, + "loss_xval": 0.00579833984375, + "num_input_tokens_seen": 36887936, + "step": 536 + }, + { + "epoch": 33.5625, + "grad_norm": 16.30334818048439, + "learning_rate": 5e-05, + "loss": 0.0071, + "num_input_tokens_seen": 36959552, + "step": 537 + }, + { + "epoch": 33.5625, + "loss": 0.00768459215760231, + "loss_ce": 0.0009707247372716665, + "loss_xval": 0.0067138671875, + "num_input_tokens_seen": 36959552, + "step": 537 + }, + { + "epoch": 33.625, + "grad_norm": 5.956799141206181, + "learning_rate": 5e-05, + "loss": 0.0051, + "num_input_tokens_seen": 37018624, + "step": 538 + }, + { + "epoch": 33.625, + "loss": 0.0054640574380755424, + "loss_ce": 0.0009169381810352206, + "loss_xval": 0.004547119140625, + "num_input_tokens_seen": 37018624, + "step": 538 + }, + { + "epoch": 33.6875, + "grad_norm": 5.938199449973524, + "learning_rate": 5e-05, + "loss": 0.006, + "num_input_tokens_seen": 37077696, + "step": 539 + }, + { + "epoch": 33.6875, + "loss": 0.0063777221366763115, + "loss_ce": 0.0008235230925492942, + "loss_xval": 0.00555419921875, + "num_input_tokens_seen": 37077696, + "step": 539 + }, + { + "epoch": 33.75, + "grad_norm": 9.67535270082586, + "learning_rate": 5e-05, + "loss": 0.0053, + "num_input_tokens_seen": 37149312, + "step": 540 + }, + { + "epoch": 33.75, + "loss": 0.0036917172838002443, + "loss_ce": 0.0008230649982579052, + "loss_xval": 0.00286865234375, + "num_input_tokens_seen": 37149312, + "step": 540 + }, + { + "epoch": 33.8125, + "grad_norm": 7.6229017292072285, + "learning_rate": 5e-05, + "loss": 0.007, + "num_input_tokens_seen": 37220992, + "step": 541 + }, + { + "epoch": 33.8125, + "loss": 0.0077782683074474335, + "loss_ce": 0.0009118132875300944, + "loss_xval": 0.006866455078125, + "num_input_tokens_seen": 37220992, + "step": 541 + }, + { + "epoch": 33.875, + "grad_norm": 
2.035502392885776, + "learning_rate": 5e-05, + "loss": 0.0031, + "num_input_tokens_seen": 37292544, + "step": 542 + }, + { + "epoch": 33.875, + "loss": 0.0037268272135406733, + "loss_ce": 0.0008581749279983342, + "loss_xval": 0.00286865234375, + "num_input_tokens_seen": 37292544, + "step": 542 + }, + { + "epoch": 33.9375, + "grad_norm": 2.7394241925506413, + "learning_rate": 5e-05, + "loss": 0.003, + "num_input_tokens_seen": 37364096, + "step": 543 + }, + { + "epoch": 33.9375, + "loss": 0.0030486832838505507, + "loss_ce": 0.0009429703350178897, + "loss_xval": 0.002105712890625, + "num_input_tokens_seen": 37364096, + "step": 543 + }, + { + "epoch": 34.0, + "grad_norm": 6.268802781769899, + "learning_rate": 5e-05, + "loss": 0.0032, + "num_input_tokens_seen": 37435776, + "step": 544 + }, + { + "epoch": 34.0, + "loss": 0.002665142063051462, + "loss_ce": 0.0008188285282813013, + "loss_xval": 0.0018463134765625, + "num_input_tokens_seen": 37435776, + "step": 544 + }, + { + "epoch": 34.0625, + "grad_norm": 5.029103856315326, + "learning_rate": 5e-05, + "loss": 0.004, + "num_input_tokens_seen": 37507392, + "step": 545 + }, + { + "epoch": 34.0625, + "loss": 0.004083414562046528, + "loss_ce": 0.0006807046011090279, + "loss_xval": 0.0034027099609375, + "num_input_tokens_seen": 37507392, + "step": 545 + }, + { + "epoch": 34.125, + "grad_norm": 1.2852869284529702, + "learning_rate": 5e-05, + "loss": 0.0023, + "num_input_tokens_seen": 37566528, + "step": 546 + }, + { + "epoch": 34.125, + "loss": 0.001781684928573668, + "loss_ce": 0.000744087272323668, + "loss_xval": 0.00103759765625, + "num_input_tokens_seen": 37566528, + "step": 546 + }, + { + "epoch": 34.1875, + "grad_norm": 4.627840663233622, + "learning_rate": 5e-05, + "loss": 0.0024, + "num_input_tokens_seen": 37638208, + "step": 547 + }, + { + "epoch": 34.1875, + "loss": 0.0028193281032145023, + "loss_ce": 0.0006983564235270023, + "loss_xval": 0.0021209716796875, + "num_input_tokens_seen": 37638208, + "step": 547 + }, + { + "epoch": 34.25, + "grad_norm": 7.831680161882376, + "learning_rate": 5e-05, + "loss": 0.0037, + "num_input_tokens_seen": 37709760, + "step": 548 + }, + { + "epoch": 34.25, + "loss": 0.0046610175631940365, + "loss_ce": 0.0008615790284238756, + "loss_xval": 0.0037994384765625, + "num_input_tokens_seen": 37709760, + "step": 548 + }, + { + "epoch": 34.3125, + "grad_norm": 3.0887274586067153, + "learning_rate": 5e-05, + "loss": 0.0033, + "num_input_tokens_seen": 37781376, + "step": 549 + }, + { + "epoch": 34.3125, + "loss": 0.0022106545511633158, + "loss_ce": 0.0007686989265494049, + "loss_xval": 0.00144195556640625, + "num_input_tokens_seen": 37781376, + "step": 549 + }, + { + "epoch": 34.375, + "grad_norm": 3.0136299204466135, + "learning_rate": 5e-05, + "loss": 0.0029, + "num_input_tokens_seen": 37840512, + "step": 550 + }, + { + "epoch": 34.375, + "loss": 0.002881981898099184, + "loss_ce": 0.000547387229744345, + "loss_xval": 0.0023345947265625, + "num_input_tokens_seen": 37840512, + "step": 550 + }, + { + "epoch": 34.4375, + "grad_norm": 5.8966589422552955, + "learning_rate": 5e-05, + "loss": 0.0033, + "num_input_tokens_seen": 37912256, + "step": 551 + }, + { + "epoch": 34.4375, + "loss": 0.003600254189223051, + "loss_ce": 0.000594272802118212, + "loss_xval": 0.0030059814453125, + "num_input_tokens_seen": 37912256, + "step": 551 + }, + { + "epoch": 34.5, + "grad_norm": 6.267751347166229, + "learning_rate": 5e-05, + "loss": 0.0033, + "num_input_tokens_seen": 37971392, + "step": 552 + }, + { + "epoch": 34.5, + "loss": 
0.003300173208117485, + "loss_ce": 0.0008282493217848241, + "loss_xval": 0.002471923828125, + "num_input_tokens_seen": 37971392, + "step": 552 + }, + { + "epoch": 34.5625, + "grad_norm": 4.66913309713646, + "learning_rate": 5e-05, + "loss": 0.0032, + "num_input_tokens_seen": 38030336, + "step": 553 + }, + { + "epoch": 34.5625, + "loss": 0.003951496444642544, + "loss_ce": 0.000747150566894561, + "loss_xval": 0.003204345703125, + "num_input_tokens_seen": 38030336, + "step": 553 + }, + { + "epoch": 34.625, + "grad_norm": 4.052412059769818, + "learning_rate": 5e-05, + "loss": 0.0028, + "num_input_tokens_seen": 38089408, + "step": 554 + }, + { + "epoch": 34.625, + "loss": 0.0021589105017483234, + "loss_ce": 0.0005872553447261453, + "loss_xval": 0.0015716552734375, + "num_input_tokens_seen": 38089408, + "step": 554 + }, + { + "epoch": 34.6875, + "grad_norm": 2.423240420462405, + "learning_rate": 5e-05, + "loss": 0.0027, + "num_input_tokens_seen": 38161024, + "step": 555 + }, + { + "epoch": 34.6875, + "loss": 0.0019102065125480294, + "loss_ce": 0.0006437270203605294, + "loss_xval": 0.0012664794921875, + "num_input_tokens_seen": 38161024, + "step": 555 + }, + { + "epoch": 34.75, + "grad_norm": 2.3341505500246424, + "learning_rate": 5e-05, + "loss": 0.0023, + "num_input_tokens_seen": 38232640, + "step": 556 + }, + { + "epoch": 34.75, + "loss": 0.00247854832559824, + "loss_ce": 0.0007237876416184008, + "loss_xval": 0.0017547607421875, + "num_input_tokens_seen": 38232640, + "step": 556 + }, + { + "epoch": 34.8125, + "grad_norm": 4.626196368155673, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_input_tokens_seen": 38291712, + "step": 557 + }, + { + "epoch": 34.8125, + "loss": 0.0018908249912783504, + "loss_ce": 0.0005175339756533504, + "loss_xval": 0.001373291015625, + "num_input_tokens_seen": 38291712, + "step": 557 + }, + { + "epoch": 34.875, + "grad_norm": 9.620699963063549, + "learning_rate": 5e-05, + "loss": 0.0027, + "num_input_tokens_seen": 38363392, + "step": 558 + }, + { + "epoch": 34.875, + "loss": 0.0025881037581712008, + "loss_ce": 0.0006197200273163617, + "loss_xval": 0.0019683837890625, + "num_input_tokens_seen": 38363392, + "step": 558 + }, + { + "epoch": 34.9375, + "grad_norm": 20.22670183452228, + "learning_rate": 5e-05, + "loss": 0.0078, + "num_input_tokens_seen": 38435008, + "step": 559 + }, + { + "epoch": 34.9375, + "loss": 0.007559607736766338, + "loss_ce": 0.0006626348476856947, + "loss_xval": 0.00689697265625, + "num_input_tokens_seen": 38435008, + "step": 559 + }, + { + "epoch": 35.0, + "grad_norm": 41.86820033044202, + "learning_rate": 5e-05, + "loss": 0.0284, + "num_input_tokens_seen": 38494016, + "step": 560 + }, + { + "epoch": 35.0, + "loss": 0.03127765655517578, + "loss_ce": 0.0005159361753612757, + "loss_xval": 0.03076171875, + "num_input_tokens_seen": 38494016, + "step": 560 + }, + { + "epoch": 35.0625, + "grad_norm": 90.8855239560365, + "learning_rate": 5e-05, + "loss": 0.1259, + "num_input_tokens_seen": 38565760, + "step": 561 + }, + { + "epoch": 35.0625, + "loss": 0.13142003118991852, + "loss_ce": 0.0005606548511423171, + "loss_xval": 0.130859375, + "num_input_tokens_seen": 38565760, + "step": 561 + }, + { + "epoch": 35.125, + "grad_norm": 191.29554767759743, + "learning_rate": 5e-05, + "loss": 0.5571, + "num_input_tokens_seen": 38637312, + "step": 562 + }, + { + "epoch": 35.125, + "loss": 0.5629037618637085, + "loss_ce": 0.0004037575563415885, + "loss_xval": 0.5625, + "num_input_tokens_seen": 38637312, + "step": 562 + }, + { + "epoch": 35.1875, + "grad_norm": 
350.50346982799715, + "learning_rate": 5e-05, + "loss": 1.8912, + "num_input_tokens_seen": 38709056, + "step": 563 + }, + { + "epoch": 35.1875, + "loss": 1.883461833000183, + "loss_ce": 0.0006492895190604031, + "loss_xval": 1.8828125, + "num_input_tokens_seen": 38709056, + "step": 563 + }, + { + "epoch": 35.25, + "grad_norm": 366.21743192231634, + "learning_rate": 5e-05, + "loss": 2.7019, + "num_input_tokens_seen": 38768128, + "step": 564 + }, + { + "epoch": 35.25, + "loss": 2.739633321762085, + "loss_ce": 0.0052583953365683556, + "loss_xval": 2.734375, + "num_input_tokens_seen": 38768128, + "step": 564 + }, + { + "epoch": 35.3125, + "grad_norm": 551.2575174025078, + "learning_rate": 5e-05, + "loss": 5.1266, + "num_input_tokens_seen": 38839872, + "step": 565 + }, + { + "epoch": 35.3125, + "loss": 5.231325149536133, + "loss_ce": 0.10632535815238953, + "loss_xval": 5.125, + "num_input_tokens_seen": 38839872, + "step": 565 + }, + { + "epoch": 35.375, + "grad_norm": 274.40929267866454, + "learning_rate": 5e-05, + "loss": 1.5613, + "num_input_tokens_seen": 38911488, + "step": 566 + }, + { + "epoch": 35.375, + "loss": 1.4925724267959595, + "loss_ce": 0.04725995287299156, + "loss_xval": 1.4453125, + "num_input_tokens_seen": 38911488, + "step": 566 + }, + { + "epoch": 35.4375, + "grad_norm": 770.5286052127881, + "learning_rate": 5e-05, + "loss": 5.75, + "num_input_tokens_seen": 38970624, + "step": 567 + }, + { + "epoch": 35.4375, + "loss": 5.361253261566162, + "loss_ce": 0.3300033211708069, + "loss_xval": 5.03125, + "num_input_tokens_seen": 38970624, + "step": 567 + }, + { + "epoch": 35.5, + "grad_norm": 1251.903653622121, + "learning_rate": 5e-05, + "loss": 14.7884, + "num_input_tokens_seen": 39042304, + "step": 568 + }, + { + "epoch": 35.5, + "loss": 14.415470123291016, + "loss_ce": 0.5404703617095947, + "loss_xval": 13.875, + "num_input_tokens_seen": 39042304, + "step": 568 + }, + { + "epoch": 35.5625, + "grad_norm": 938.7359719425729, + "learning_rate": 5e-05, + "loss": 10.5851, + "num_input_tokens_seen": 39113984, + "step": 569 + }, + { + "epoch": 35.5625, + "loss": 10.34769058227539, + "loss_ce": 1.5351901054382324, + "loss_xval": 8.8125, + "num_input_tokens_seen": 39113984, + "step": 569 + }, + { + "epoch": 35.625, + "grad_norm": 1230.5398739652574, + "learning_rate": 5e-05, + "loss": 22.0495, + "num_input_tokens_seen": 39185728, + "step": 570 + }, + { + "epoch": 35.625, + "loss": 21.479244232177734, + "loss_ce": 0.4792449176311493, + "loss_xval": 21.0, + "num_input_tokens_seen": 39185728, + "step": 570 + }, + { + "epoch": 35.6875, + "grad_norm": 696.4657801334389, + "learning_rate": 5e-05, + "loss": 5.2556, + "num_input_tokens_seen": 39257280, + "step": 571 + }, + { + "epoch": 35.6875, + "loss": 5.062924385070801, + "loss_ce": 0.46917423605918884, + "loss_xval": 4.59375, + "num_input_tokens_seen": 39257280, + "step": 571 + }, + { + "epoch": 35.75, + "grad_norm": 869.8728621056691, + "learning_rate": 5e-05, + "loss": 5.5111, + "num_input_tokens_seen": 39328896, + "step": 572 + }, + { + "epoch": 35.75, + "loss": 5.267719745635986, + "loss_ce": 0.7052197456359863, + "loss_xval": 4.5625, + "num_input_tokens_seen": 39328896, + "step": 572 + }, + { + "epoch": 35.8125, + "grad_norm": 3549.498225625143, + "learning_rate": 5e-05, + "loss": 31.3322, + "num_input_tokens_seen": 39400512, + "step": 573 + }, + { + "epoch": 35.8125, + "loss": 35.222599029541016, + "loss_ce": 0.7225989699363708, + "loss_xval": 34.5, + "num_input_tokens_seen": 39400512, + "step": 573 + }, + { + "epoch": 35.875, + 
"grad_norm": 1180.2634837582707, + "learning_rate": 5e-05, + "loss": 9.5381, + "num_input_tokens_seen": 39472192, + "step": 574 + }, + { + "epoch": 35.875, + "loss": 9.109892845153809, + "loss_ce": 0.5473928451538086, + "loss_xval": 8.5625, + "num_input_tokens_seen": 39472192, + "step": 574 + }, + { + "epoch": 35.9375, + "grad_norm": 116.29402504782887, + "learning_rate": 5e-05, + "loss": 0.9392, + "num_input_tokens_seen": 39543744, + "step": 575 + }, + { + "epoch": 35.9375, + "loss": 0.8281427621841431, + "loss_ce": 0.5703302621841431, + "loss_xval": 0.2578125, + "num_input_tokens_seen": 39543744, + "step": 575 + }, + { + "epoch": 36.0, + "grad_norm": 918.9252968800259, + "learning_rate": 5e-05, + "loss": 7.6593, + "num_input_tokens_seen": 39615296, + "step": 576 + }, + { + "epoch": 36.0, + "loss": 7.5638298988342285, + "loss_ce": 1.0325798988342285, + "loss_xval": 6.53125, + "num_input_tokens_seen": 39615296, + "step": 576 + }, + { + "epoch": 36.0625, + "grad_norm": 358.6239331313297, + "learning_rate": 5e-05, + "loss": 2.1741, + "num_input_tokens_seen": 39686912, + "step": 577 + }, + { + "epoch": 36.0625, + "loss": 2.2433958053588867, + "loss_ce": 1.1730833053588867, + "loss_xval": 1.0703125, + "num_input_tokens_seen": 39686912, + "step": 577 + }, + { + "epoch": 36.125, + "grad_norm": 702.9413630456082, + "learning_rate": 5e-05, + "loss": 4.5602, + "num_input_tokens_seen": 39758592, + "step": 578 + }, + { + "epoch": 36.125, + "loss": 4.851856231689453, + "loss_ce": 0.961230993270874, + "loss_xval": 3.890625, + "num_input_tokens_seen": 39758592, + "step": 578 + }, + { + "epoch": 36.1875, + "grad_norm": 602.1591127964381, + "learning_rate": 5e-05, + "loss": 3.6184, + "num_input_tokens_seen": 39830144, + "step": 579 + }, + { + "epoch": 36.1875, + "loss": 3.764401435852051, + "loss_ce": 0.545651376247406, + "loss_xval": 3.21875, + "num_input_tokens_seen": 39830144, + "step": 579 + }, + { + "epoch": 36.25, + "grad_norm": 401.1552345637159, + "learning_rate": 5e-05, + "loss": 1.8375, + "num_input_tokens_seen": 39901760, + "step": 580 + }, + { + "epoch": 36.25, + "loss": 1.8537099361419678, + "loss_ce": 0.38495996594429016, + "loss_xval": 1.46875, + "num_input_tokens_seen": 39901760, + "step": 580 + }, + { + "epoch": 36.3125, + "grad_norm": 744.8830304932653, + "learning_rate": 5e-05, + "loss": 4.9107, + "num_input_tokens_seen": 39973312, + "step": 581 + }, + { + "epoch": 36.3125, + "loss": 4.981729984283447, + "loss_ce": 0.35673001408576965, + "loss_xval": 4.625, + "num_input_tokens_seen": 39973312, + "step": 581 + }, + { + "epoch": 36.375, + "grad_norm": 41.1117289556165, + "learning_rate": 5e-05, + "loss": 0.3746, + "num_input_tokens_seen": 40019840, + "step": 582 + }, + { + "epoch": 36.375, + "loss": 0.372890442609787, + "loss_ce": 0.291347473859787, + "loss_xval": 0.08154296875, + "num_input_tokens_seen": 40019840, + "step": 582 + }, + { + "epoch": 36.4375, + "grad_norm": 522.7553105033345, + "learning_rate": 5e-05, + "loss": 2.9091, + "num_input_tokens_seen": 40091456, + "step": 583 + }, + { + "epoch": 36.4375, + "loss": 2.9245858192443848, + "loss_ce": 0.2683357298374176, + "loss_xval": 2.65625, + "num_input_tokens_seen": 40091456, + "step": 583 + }, + { + "epoch": 36.5, + "grad_norm": 100.93606363922494, + "learning_rate": 5e-05, + "loss": 0.386, + "num_input_tokens_seen": 40163136, + "step": 584 + }, + { + "epoch": 36.5, + "loss": 0.4024519622325897, + "loss_ce": 0.2530378997325897, + "loss_xval": 0.1494140625, + "num_input_tokens_seen": 40163136, + "step": 584 + }, + { + "epoch": 
36.5625, + "grad_norm": 455.6989539550385, + "learning_rate": 5e-05, + "loss": 2.2411, + "num_input_tokens_seen": 40222336, + "step": 585 + }, + { + "epoch": 36.5625, + "loss": 2.1417577266693115, + "loss_ce": 0.2589452564716339, + "loss_xval": 1.8828125, + "num_input_tokens_seen": 40222336, + "step": 585 + }, + { + "epoch": 36.625, + "grad_norm": 164.02950173346085, + "learning_rate": 5e-05, + "loss": 0.5776, + "num_input_tokens_seen": 40281408, + "step": 586 + }, + { + "epoch": 36.625, + "loss": 0.591841995716095, + "loss_ce": 0.25004512071609497, + "loss_xval": 0.341796875, + "num_input_tokens_seen": 40281408, + "step": 586 + }, + { + "epoch": 36.6875, + "grad_norm": 286.35296257900984, + "learning_rate": 5e-05, + "loss": 1.1763, + "num_input_tokens_seen": 40353152, + "step": 587 + }, + { + "epoch": 36.6875, + "loss": 1.0982005596160889, + "loss_ce": 0.25445055961608887, + "loss_xval": 0.84375, + "num_input_tokens_seen": 40353152, + "step": 587 + }, + { + "epoch": 36.75, + "grad_norm": 263.32879398104865, + "learning_rate": 5e-05, + "loss": 1.0375, + "num_input_tokens_seen": 40412224, + "step": 588 + }, + { + "epoch": 36.75, + "loss": 1.0531641244888306, + "loss_ce": 0.24457040429115295, + "loss_xval": 0.80859375, + "num_input_tokens_seen": 40412224, + "step": 588 + }, + { + "epoch": 36.8125, + "grad_norm": 151.29925845397975, + "learning_rate": 5e-05, + "loss": 0.513, + "num_input_tokens_seen": 40483904, + "step": 589 + }, + { + "epoch": 36.8125, + "loss": 0.5355762243270874, + "loss_ce": 0.2328418642282486, + "loss_xval": 0.302734375, + "num_input_tokens_seen": 40483904, + "step": 589 + }, + { + "epoch": 36.875, + "grad_norm": 311.1048830892808, + "learning_rate": 5e-05, + "loss": 1.2992, + "num_input_tokens_seen": 40555648, + "step": 590 + }, + { + "epoch": 36.875, + "loss": 1.2941535711288452, + "loss_ce": 0.23165352642536163, + "loss_xval": 1.0625, + "num_input_tokens_seen": 40555648, + "step": 590 + }, + { + "epoch": 36.9375, + "grad_norm": 25.12563712108501, + "learning_rate": 5e-05, + "loss": 0.2608, + "num_input_tokens_seen": 40614848, + "step": 591 + }, + { + "epoch": 36.9375, + "loss": 0.2656458020210266, + "loss_ce": 0.22218875586986542, + "loss_xval": 0.04345703125, + "num_input_tokens_seen": 40614848, + "step": 591 + }, + { + "epoch": 37.0, + "grad_norm": 265.0904914487781, + "learning_rate": 5e-05, + "loss": 1.0982, + "num_input_tokens_seen": 40686400, + "step": 592 + }, + { + "epoch": 37.0, + "loss": 1.1134018898010254, + "loss_ce": 0.2149643748998642, + "loss_xval": 0.8984375, + "num_input_tokens_seen": 40686400, + "step": 592 + }, + { + "epoch": 37.0625, + "grad_norm": 98.20828086212038, + "learning_rate": 5e-05, + "loss": 0.3804, + "num_input_tokens_seen": 40758016, + "step": 593 + }, + { + "epoch": 37.0625, + "loss": 0.37509244680404663, + "loss_ce": 0.20614711940288544, + "loss_xval": 0.1689453125, + "num_input_tokens_seen": 40758016, + "step": 593 + }, + { + "epoch": 37.125, + "grad_norm": 156.23063382829764, + "learning_rate": 5e-05, + "loss": 0.5599, + "num_input_tokens_seen": 40817088, + "step": 594 + }, + { + "epoch": 37.125, + "loss": 0.5345146656036377, + "loss_ce": 0.1985771507024765, + "loss_xval": 0.3359375, + "num_input_tokens_seen": 40817088, + "step": 594 + }, + { + "epoch": 37.1875, + "grad_norm": 205.7645154073751, + "learning_rate": 5e-05, + "loss": 0.7541, + "num_input_tokens_seen": 40888704, + "step": 595 + }, + { + "epoch": 37.1875, + "loss": 0.7232792973518372, + "loss_ce": 0.19593554735183716, + "loss_xval": 0.52734375, + 
"num_input_tokens_seen": 40888704, + "step": 595 + }, + { + "epoch": 37.25, + "grad_norm": 54.92131899982118, + "learning_rate": 5e-05, + "loss": 0.2654, + "num_input_tokens_seen": 40947776, + "step": 596 + }, + { + "epoch": 37.25, + "loss": 0.27080121636390686, + "loss_ce": 0.20146527886390686, + "loss_xval": 0.0693359375, + "num_input_tokens_seen": 40947776, + "step": 596 + }, + { + "epoch": 37.3125, + "grad_norm": 205.73367835848026, + "learning_rate": 5e-05, + "loss": 0.7528, + "num_input_tokens_seen": 41019456, + "step": 597 + }, + { + "epoch": 37.3125, + "loss": 0.7497467994689941, + "loss_ce": 0.18334054946899414, + "loss_xval": 0.56640625, + "num_input_tokens_seen": 41019456, + "step": 597 + }, + { + "epoch": 37.375, + "grad_norm": 60.87860178293434, + "learning_rate": 5e-05, + "loss": 0.2509, + "num_input_tokens_seen": 41078528, + "step": 598 + }, + { + "epoch": 37.375, + "loss": 0.238224059343338, + "loss_ce": 0.179630309343338, + "loss_xval": 0.05859375, + "num_input_tokens_seen": 41078528, + "step": 598 + }, + { + "epoch": 37.4375, + "grad_norm": 140.53096665849083, + "learning_rate": 5e-05, + "loss": 0.4592, + "num_input_tokens_seen": 41150080, + "step": 599 + }, + { + "epoch": 37.4375, + "loss": 0.4615119397640228, + "loss_ce": 0.17440256476402283, + "loss_xval": 0.287109375, + "num_input_tokens_seen": 41150080, + "step": 599 + }, + { + "epoch": 37.5, + "grad_norm": 141.48099157738406, + "learning_rate": 5e-05, + "loss": 0.4551, + "num_input_tokens_seen": 41221760, + "step": 600 + }, + { + "epoch": 37.5, + "loss": 0.44427230954170227, + "loss_ce": 0.17278793454170227, + "loss_xval": 0.271484375, + "num_input_tokens_seen": 41221760, + "step": 600 + }, + { + "epoch": 37.5625, + "grad_norm": 56.75102315773255, + "learning_rate": 5e-05, + "loss": 0.2472, + "num_input_tokens_seen": 41293568, + "step": 601 + }, + { + "epoch": 37.5625, + "loss": 0.23851189017295837, + "loss_ce": 0.16771110892295837, + "loss_xval": 0.07080078125, + "num_input_tokens_seen": 41293568, + "step": 601 + }, + { + "epoch": 37.625, + "grad_norm": 149.9406290414657, + "learning_rate": 5e-05, + "loss": 0.4996, + "num_input_tokens_seen": 41365184, + "step": 602 + }, + { + "epoch": 37.625, + "loss": 0.4996224045753479, + "loss_ce": 0.1714974194765091, + "loss_xval": 0.328125, + "num_input_tokens_seen": 41365184, + "step": 602 + }, + { + "epoch": 37.6875, + "grad_norm": 29.47624024193657, + "learning_rate": 5e-05, + "loss": 0.1971, + "num_input_tokens_seen": 41436800, + "step": 603 + }, + { + "epoch": 37.6875, + "loss": 0.19674646854400635, + "loss_ce": 0.16159021854400635, + "loss_xval": 0.03515625, + "num_input_tokens_seen": 41436800, + "step": 603 + }, + { + "epoch": 37.75, + "grad_norm": 125.28948276890881, + "learning_rate": 5e-05, + "loss": 0.3971, + "num_input_tokens_seen": 41508480, + "step": 604 + }, + { + "epoch": 37.75, + "loss": 0.45287567377090454, + "loss_ce": 0.16186004877090454, + "loss_xval": 0.291015625, + "num_input_tokens_seen": 41508480, + "step": 604 + }, + { + "epoch": 37.8125, + "grad_norm": 95.54904903701473, + "learning_rate": 5e-05, + "loss": 0.3014, + "num_input_tokens_seen": 41580224, + "step": 605 + }, + { + "epoch": 37.8125, + "loss": 0.31282493472099304, + "loss_ce": 0.16145774722099304, + "loss_xval": 0.1513671875, + "num_input_tokens_seen": 41580224, + "step": 605 + }, + { + "epoch": 37.875, + "grad_norm": 78.91158457455924, + "learning_rate": 5e-05, + "loss": 0.2589, + "num_input_tokens_seen": 41651904, + "step": 606 + }, + { + "epoch": 37.875, + "loss": 0.2595600187778473, + 
"loss_ce": 0.1628803312778473, + "loss_xval": 0.0966796875, + "num_input_tokens_seen": 41651904, + "step": 606 + }, + { + "epoch": 37.9375, + "grad_norm": 117.50889229373615, + "learning_rate": 5e-05, + "loss": 0.3626, + "num_input_tokens_seen": 41723520, + "step": 607 + }, + { + "epoch": 37.9375, + "loss": 0.3645981252193451, + "loss_ce": 0.1546371877193451, + "loss_xval": 0.2099609375, + "num_input_tokens_seen": 41723520, + "step": 607 + }, + { + "epoch": 38.0, + "grad_norm": 11.030866607250644, + "learning_rate": 5e-05, + "loss": 0.1699, + "num_input_tokens_seen": 41795136, + "step": 608 + }, + { + "epoch": 38.0, + "loss": 0.16384030878543854, + "loss_ce": 0.15279294550418854, + "loss_xval": 0.01104736328125, + "num_input_tokens_seen": 41795136, + "step": 608 + }, + { + "epoch": 38.0625, + "grad_norm": 115.85949003037487, + "learning_rate": 5e-05, + "loss": 0.3556, + "num_input_tokens_seen": 41866752, + "step": 609 + }, + { + "epoch": 38.0625, + "loss": 0.373709499835968, + "loss_ce": 0.15105323493480682, + "loss_xval": 0.22265625, + "num_input_tokens_seen": 41866752, + "step": 609 + }, + { + "epoch": 38.125, + "grad_norm": 35.99753609550145, + "learning_rate": 5e-05, + "loss": 0.1821, + "num_input_tokens_seen": 41938432, + "step": 610 + }, + { + "epoch": 38.125, + "loss": 0.18259596824645996, + "loss_ce": 0.14841628074645996, + "loss_xval": 0.0341796875, + "num_input_tokens_seen": 41938432, + "step": 610 + }, + { + "epoch": 38.1875, + "grad_norm": 67.90140780338362, + "learning_rate": 5e-05, + "loss": 0.228, + "num_input_tokens_seen": 42009984, + "step": 611 + }, + { + "epoch": 38.1875, + "loss": 0.23608772456645966, + "loss_ce": 0.14966194331645966, + "loss_xval": 0.08642578125, + "num_input_tokens_seen": 42009984, + "step": 611 + }, + { + "epoch": 38.25, + "grad_norm": 76.47480758466817, + "learning_rate": 5e-05, + "loss": 0.2357, + "num_input_tokens_seen": 42081664, + "step": 612 + }, + { + "epoch": 38.25, + "loss": 0.2421186864376068, + "loss_ce": 0.1469038426876068, + "loss_xval": 0.09521484375, + "num_input_tokens_seen": 42081664, + "step": 612 + }, + { + "epoch": 38.3125, + "grad_norm": 16.84212415909238, + "learning_rate": 5e-05, + "loss": 0.156, + "num_input_tokens_seen": 42153216, + "step": 613 + }, + { + "epoch": 38.3125, + "loss": 0.1534736603498459, + "loss_ce": 0.1434028595685959, + "loss_xval": 0.01007080078125, + "num_input_tokens_seen": 42153216, + "step": 613 + }, + { + "epoch": 38.375, + "grad_norm": 88.89162301325827, + "learning_rate": 5e-05, + "loss": 0.2588, + "num_input_tokens_seen": 42224832, + "step": 614 + }, + { + "epoch": 38.375, + "loss": 0.268532931804657, + "loss_ce": 0.14255638420581818, + "loss_xval": 0.1259765625, + "num_input_tokens_seen": 42224832, + "step": 614 + }, + { + "epoch": 38.4375, + "grad_norm": 23.5430509471996, + "learning_rate": 5e-05, + "loss": 0.158, + "num_input_tokens_seen": 42296512, + "step": 615 + }, + { + "epoch": 38.4375, + "loss": 0.16037799417972565, + "loss_ce": 0.14243365824222565, + "loss_xval": 0.0179443359375, + "num_input_tokens_seen": 42296512, + "step": 615 + }, + { + "epoch": 38.5, + "grad_norm": 60.88480572029057, + "learning_rate": 5e-05, + "loss": 0.2048, + "num_input_tokens_seen": 42368064, + "step": 616 + }, + { + "epoch": 38.5, + "loss": 0.21100540459156036, + "loss_ce": 0.14362259209156036, + "loss_xval": 0.0673828125, + "num_input_tokens_seen": 42368064, + "step": 616 + }, + { + "epoch": 38.5625, + "grad_norm": 66.58093227045465, + "learning_rate": 5e-05, + "loss": 0.2093, + "num_input_tokens_seen": 42427072, 
+ "step": 617 + }, + { + "epoch": 38.5625, + "loss": 0.21216745674610138, + "loss_ce": 0.13746042549610138, + "loss_xval": 0.07470703125, + "num_input_tokens_seen": 42427072, + "step": 617 + }, + { + "epoch": 38.625, + "grad_norm": 41.32810840119761, + "learning_rate": 5e-05, + "loss": 0.1741, + "num_input_tokens_seen": 42498752, + "step": 618 + }, + { + "epoch": 38.625, + "loss": 0.17929083108901978, + "loss_ce": 0.13827520608901978, + "loss_xval": 0.041015625, + "num_input_tokens_seen": 42498752, + "step": 618 + }, + { + "epoch": 38.6875, + "grad_norm": 71.79194449299779, + "learning_rate": 5e-05, + "loss": 0.2116, + "num_input_tokens_seen": 42570496, + "step": 619 + }, + { + "epoch": 38.6875, + "loss": 0.21144595742225647, + "loss_ce": 0.13234439492225647, + "loss_xval": 0.0791015625, + "num_input_tokens_seen": 42570496, + "step": 619 + }, + { + "epoch": 38.75, + "grad_norm": 7.438002395643336, + "learning_rate": 5e-05, + "loss": 0.14, + "num_input_tokens_seen": 42642176, + "step": 620 + }, + { + "epoch": 38.75, + "loss": 0.14000988006591797, + "loss_ce": 0.13289928436279297, + "loss_xval": 0.007110595703125, + "num_input_tokens_seen": 42642176, + "step": 620 + }, + { + "epoch": 38.8125, + "grad_norm": 58.768235228479554, + "learning_rate": 5e-05, + "loss": 0.1878, + "num_input_tokens_seen": 42713728, + "step": 621 + }, + { + "epoch": 38.8125, + "loss": 0.17987307906150818, + "loss_ce": 0.13421878218650818, + "loss_xval": 0.045654296875, + "num_input_tokens_seen": 42713728, + "step": 621 + }, + { + "epoch": 38.875, + "grad_norm": 31.55788652090228, + "learning_rate": 5e-05, + "loss": 0.149, + "num_input_tokens_seen": 42772864, + "step": 622 + }, + { + "epoch": 38.875, + "loss": 0.14639230072498322, + "loss_ce": 0.13143868744373322, + "loss_xval": 0.01495361328125, + "num_input_tokens_seen": 42772864, + "step": 622 + }, + { + "epoch": 38.9375, + "grad_norm": 50.2062141649387, + "learning_rate": 5e-05, + "loss": 0.174, + "num_input_tokens_seen": 42844416, + "step": 623 + }, + { + "epoch": 38.9375, + "loss": 0.17014646530151367, + "loss_ce": 0.13181638717651367, + "loss_xval": 0.038330078125, + "num_input_tokens_seen": 42844416, + "step": 623 + }, + { + "epoch": 39.0, + "grad_norm": 39.727476739538965, + "learning_rate": 5e-05, + "loss": 0.1575, + "num_input_tokens_seen": 42903552, + "step": 624 + }, + { + "epoch": 39.0, + "loss": 0.15018722414970398, + "loss_ce": 0.12662765383720398, + "loss_xval": 0.0235595703125, + "num_input_tokens_seen": 42903552, + "step": 624 + }, + { + "epoch": 39.0625, + "grad_norm": 19.034774434189423, + "learning_rate": 5e-05, + "loss": 0.1347, + "num_input_tokens_seen": 42975296, + "step": 625 + }, + { + "epoch": 39.0625, + "loss": 0.13135939836502075, + "loss_ce": 0.12476760149002075, + "loss_xval": 0.006591796875, + "num_input_tokens_seen": 42975296, + "step": 625 + }, + { + "epoch": 39.125, + "grad_norm": 56.67017709198661, + "learning_rate": 5e-05, + "loss": 0.1763, + "num_input_tokens_seen": 43034432, + "step": 626 + }, + { + "epoch": 39.125, + "loss": 0.17652033269405365, + "loss_ce": 0.12305353581905365, + "loss_xval": 0.053466796875, + "num_input_tokens_seen": 43034432, + "step": 626 + }, + { + "epoch": 39.1875, + "grad_norm": 3.7113754650312445, + "learning_rate": 5e-05, + "loss": 0.1303, + "num_input_tokens_seen": 43106112, + "step": 627 + }, + { + "epoch": 39.1875, + "loss": 0.12929722666740417, + "loss_ce": 0.12178990989923477, + "loss_xval": 0.00750732421875, + "num_input_tokens_seen": 43106112, + "step": 627 + }, + { + "epoch": 39.25, + 
"grad_norm": 54.51094923701809, + "learning_rate": 5e-05, + "loss": 0.1703, + "num_input_tokens_seen": 43177792, + "step": 628 + }, + { + "epoch": 39.25, + "loss": 0.18063339591026306, + "loss_ce": 0.12106308341026306, + "loss_xval": 0.0595703125, + "num_input_tokens_seen": 43177792, + "step": 628 + }, + { + "epoch": 39.3125, + "grad_norm": 19.084346976100914, + "learning_rate": 5e-05, + "loss": 0.134, + "num_input_tokens_seen": 43249472, + "step": 629 + }, + { + "epoch": 39.3125, + "loss": 0.13764870166778564, + "loss_ce": 0.12574684619903564, + "loss_xval": 0.01190185546875, + "num_input_tokens_seen": 43249472, + "step": 629 + }, + { + "epoch": 39.375, + "grad_norm": 58.28364550359889, + "learning_rate": 5e-05, + "loss": 0.1704, + "num_input_tokens_seen": 43321152, + "step": 630 + }, + { + "epoch": 39.375, + "loss": 0.17048946022987366, + "loss_ce": 0.11946406215429306, + "loss_xval": 0.051025390625, + "num_input_tokens_seen": 43321152, + "step": 630 + }, + { + "epoch": 39.4375, + "grad_norm": 26.589554454909614, + "learning_rate": 5e-05, + "loss": 0.1319, + "num_input_tokens_seen": 43392768, + "step": 631 + }, + { + "epoch": 39.4375, + "loss": 0.1374756097793579, + "loss_ce": 0.11879885196685791, + "loss_xval": 0.0186767578125, + "num_input_tokens_seen": 43392768, + "step": 631 + }, + { + "epoch": 39.5, + "grad_norm": 48.98562574688163, + "learning_rate": 5e-05, + "loss": 0.153, + "num_input_tokens_seen": 43464448, + "step": 632 + }, + { + "epoch": 39.5, + "loss": 0.1484987884759903, + "loss_ce": 0.1145632416009903, + "loss_xval": 0.033935546875, + "num_input_tokens_seen": 43464448, + "step": 632 + }, + { + "epoch": 39.5625, + "grad_norm": 38.604355288622315, + "learning_rate": 5e-05, + "loss": 0.1457, + "num_input_tokens_seen": 43536064, + "step": 633 + }, + { + "epoch": 39.5625, + "loss": 0.144775390625, + "loss_ce": 0.1132812425494194, + "loss_xval": 0.031494140625, + "num_input_tokens_seen": 43536064, + "step": 633 + }, + { + "epoch": 39.625, + "grad_norm": 39.82814144958052, + "learning_rate": 5e-05, + "loss": 0.1402, + "num_input_tokens_seen": 43607872, + "step": 634 + }, + { + "epoch": 39.625, + "loss": 0.1378423571586609, + "loss_ce": 0.11391657590866089, + "loss_xval": 0.02392578125, + "num_input_tokens_seen": 43607872, + "step": 634 + }, + { + "epoch": 39.6875, + "grad_norm": 42.46176254037517, + "learning_rate": 5e-05, + "loss": 0.1436, + "num_input_tokens_seen": 43679488, + "step": 635 + }, + { + "epoch": 39.6875, + "loss": 0.1384507417678833, + "loss_ce": 0.1140366792678833, + "loss_xval": 0.0244140625, + "num_input_tokens_seen": 43679488, + "step": 635 + }, + { + "epoch": 39.75, + "grad_norm": 24.345686051123515, + "learning_rate": 5e-05, + "loss": 0.1243, + "num_input_tokens_seen": 43738560, + "step": 636 + }, + { + "epoch": 39.75, + "loss": 0.12430451065301895, + "loss_ce": 0.10898468643426895, + "loss_xval": 0.01531982421875, + "num_input_tokens_seen": 43738560, + "step": 636 + }, + { + "epoch": 39.8125, + "grad_norm": 43.87853861651247, + "learning_rate": 5e-05, + "loss": 0.1395, + "num_input_tokens_seen": 43797568, + "step": 637 + }, + { + "epoch": 39.8125, + "loss": 0.1355985701084137, + "loss_ce": 0.1086210310459137, + "loss_xval": 0.0269775390625, + "num_input_tokens_seen": 43797568, + "step": 637 + }, + { + "epoch": 39.875, + "grad_norm": 6.588837352574171, + "learning_rate": 5e-05, + "loss": 0.1169, + "num_input_tokens_seen": 43869184, + "step": 638 + }, + { + "epoch": 39.875, + "loss": 0.11672370135784149, + "loss_ce": 0.11235968768596649, + "loss_xval": 
0.004364013671875, + "num_input_tokens_seen": 43869184, + "step": 638 + }, + { + "epoch": 39.9375, + "grad_norm": 48.67125834183872, + "learning_rate": 5e-05, + "loss": 0.1465, + "num_input_tokens_seen": 43940800, + "step": 639 + }, + { + "epoch": 39.9375, + "loss": 0.14488358795642853, + "loss_ce": 0.10997147858142853, + "loss_xval": 0.034912109375, + "num_input_tokens_seen": 43940800, + "step": 639 + }, + { + "epoch": 40.0, + "grad_norm": 4.979504580343616, + "learning_rate": 5e-05, + "loss": 0.1124, + "num_input_tokens_seen": 44012416, + "step": 640 + }, + { + "epoch": 40.0, + "loss": 0.11065511405467987, + "loss_ce": 0.10659627616405487, + "loss_xval": 0.004058837890625, + "num_input_tokens_seen": 44012416, + "step": 640 + }, + { + "epoch": 40.0625, + "grad_norm": 51.66416460471596, + "learning_rate": 5e-05, + "loss": 0.1475, + "num_input_tokens_seen": 44084160, + "step": 641 + }, + { + "epoch": 40.0625, + "loss": 0.14437633752822876, + "loss_ce": 0.10360485315322876, + "loss_xval": 0.040771484375, + "num_input_tokens_seen": 44084160, + "step": 641 + }, + { + "epoch": 40.125, + "grad_norm": 14.566490513755292, + "learning_rate": 5e-05, + "loss": 0.1104, + "num_input_tokens_seen": 44155712, + "step": 642 + }, + { + "epoch": 40.125, + "loss": 0.10976142436265945, + "loss_ce": 0.10231513530015945, + "loss_xval": 0.0074462890625, + "num_input_tokens_seen": 44155712, + "step": 642 + }, + { + "epoch": 40.1875, + "grad_norm": 61.00209116862136, + "learning_rate": 5e-05, + "loss": 0.1549, + "num_input_tokens_seen": 44227392, + "step": 643 + }, + { + "epoch": 40.1875, + "loss": 0.15537142753601074, + "loss_ce": 0.09921907633543015, + "loss_xval": 0.05615234375, + "num_input_tokens_seen": 44227392, + "step": 643 + }, + { + "epoch": 40.25, + "grad_norm": 11.347214980938771, + "learning_rate": 5e-05, + "loss": 0.1063, + "num_input_tokens_seen": 44286336, + "step": 644 + }, + { + "epoch": 40.25, + "loss": 0.10684747993946075, + "loss_ce": 0.10077448189258575, + "loss_xval": 0.006072998046875, + "num_input_tokens_seen": 44286336, + "step": 644 + }, + { + "epoch": 40.3125, + "grad_norm": 57.017450004203646, + "learning_rate": 5e-05, + "loss": 0.148, + "num_input_tokens_seen": 44358016, + "step": 645 + }, + { + "epoch": 40.3125, + "loss": 0.14290070533752441, + "loss_ce": 0.09773469716310501, + "loss_xval": 0.045166015625, + "num_input_tokens_seen": 44358016, + "step": 645 + }, + { + "epoch": 40.375, + "grad_norm": 16.615742633522416, + "learning_rate": 5e-05, + "loss": 0.1075, + "num_input_tokens_seen": 44429760, + "step": 646 + }, + { + "epoch": 40.375, + "loss": 0.10806427896022797, + "loss_ce": 0.09756623208522797, + "loss_xval": 0.010498046875, + "num_input_tokens_seen": 44429760, + "step": 646 + }, + { + "epoch": 40.4375, + "grad_norm": 58.16047818456758, + "learning_rate": 5e-05, + "loss": 0.1494, + "num_input_tokens_seen": 44488832, + "step": 647 + }, + { + "epoch": 40.4375, + "loss": 0.1497732698917389, + "loss_ce": 0.09679476171731949, + "loss_xval": 0.052978515625, + "num_input_tokens_seen": 44488832, + "step": 647 + }, + { + "epoch": 40.5, + "grad_norm": 16.82982616979513, + "learning_rate": 5e-05, + "loss": 0.104, + "num_input_tokens_seen": 44560448, + "step": 648 + }, + { + "epoch": 40.5, + "loss": 0.10390922427177429, + "loss_ce": 0.09426566958427429, + "loss_xval": 0.0096435546875, + "num_input_tokens_seen": 44560448, + "step": 648 + }, + { + "epoch": 40.5625, + "grad_norm": 59.937320299463295, + "learning_rate": 5e-05, + "loss": 0.1482, + "num_input_tokens_seen": 44632064, + "step": 
649 + }, + { + "epoch": 40.5625, + "loss": 0.14854532480239868, + "loss_ce": 0.09263712167739868, + "loss_xval": 0.055908203125, + "num_input_tokens_seen": 44632064, + "step": 649 + }, + { + "epoch": 40.625, + "grad_norm": 8.174049550372988, + "learning_rate": 5e-05, + "loss": 0.0969, + "num_input_tokens_seen": 44703680, + "step": 650 + }, + { + "epoch": 40.625, + "loss": 0.09525209665298462, + "loss_ce": 0.09174257516860962, + "loss_xval": 0.003509521484375, + "num_input_tokens_seen": 44703680, + "step": 650 + }, + { + "epoch": 40.6875, + "grad_norm": 55.966905853940496, + "learning_rate": 5e-05, + "loss": 0.1393, + "num_input_tokens_seen": 44775296, + "step": 651 + }, + { + "epoch": 40.6875, + "loss": 0.1412389874458313, + "loss_ce": 0.0909460112452507, + "loss_xval": 0.05029296875, + "num_input_tokens_seen": 44775296, + "step": 651 + }, + { + "epoch": 40.75, + "grad_norm": 0.8837807743869168, + "learning_rate": 5e-05, + "loss": 0.0934, + "num_input_tokens_seen": 44846848, + "step": 652 + }, + { + "epoch": 40.75, + "loss": 0.09453385323286057, + "loss_ce": 0.08836930245161057, + "loss_xval": 0.00616455078125, + "num_input_tokens_seen": 44846848, + "step": 652 + }, + { + "epoch": 40.8125, + "grad_norm": 46.75400872468516, + "learning_rate": 5e-05, + "loss": 0.1207, + "num_input_tokens_seen": 44918464, + "step": 653 + }, + { + "epoch": 40.8125, + "loss": 0.1174541562795639, + "loss_ce": 0.0849834531545639, + "loss_xval": 0.032470703125, + "num_input_tokens_seen": 44918464, + "step": 653 + }, + { + "epoch": 40.875, + "grad_norm": 5.538176318189277, + "learning_rate": 5e-05, + "loss": 0.0958, + "num_input_tokens_seen": 44990016, + "step": 654 + }, + { + "epoch": 40.875, + "loss": 0.10027727484703064, + "loss_ce": 0.09420427680015564, + "loss_xval": 0.006072998046875, + "num_input_tokens_seen": 44990016, + "step": 654 + }, + { + "epoch": 40.9375, + "grad_norm": 50.93686152495069, + "learning_rate": 5e-05, + "loss": 0.1246, + "num_input_tokens_seen": 45061632, + "step": 655 + }, + { + "epoch": 40.9375, + "loss": 0.11831493675708771, + "loss_ce": 0.08560009300708771, + "loss_xval": 0.03271484375, + "num_input_tokens_seen": 45061632, + "step": 655 + }, + { + "epoch": 41.0, + "grad_norm": 6.387264661998158, + "learning_rate": 5e-05, + "loss": 0.0879, + "num_input_tokens_seen": 45133312, + "step": 656 + }, + { + "epoch": 41.0, + "loss": 0.08795207738876343, + "loss_ce": 0.08313030004501343, + "loss_xval": 0.00482177734375, + "num_input_tokens_seen": 45133312, + "step": 656 + }, + { + "epoch": 41.0625, + "grad_norm": 38.77820198456228, + "learning_rate": 5e-05, + "loss": 0.1052, + "num_input_tokens_seen": 45204992, + "step": 657 + }, + { + "epoch": 41.0625, + "loss": 0.10423070937395096, + "loss_ce": 0.08054906874895096, + "loss_xval": 0.023681640625, + "num_input_tokens_seen": 45204992, + "step": 657 + }, + { + "epoch": 41.125, + "grad_norm": 7.696398023612668, + "learning_rate": 5e-05, + "loss": 0.0888, + "num_input_tokens_seen": 45276736, + "step": 658 + }, + { + "epoch": 41.125, + "loss": 0.08869247138500214, + "loss_ce": 0.08310775458812714, + "loss_xval": 0.005584716796875, + "num_input_tokens_seen": 45276736, + "step": 658 + }, + { + "epoch": 41.1875, + "grad_norm": 42.066906629424764, + "learning_rate": 5e-05, + "loss": 0.1093, + "num_input_tokens_seen": 45348352, + "step": 659 + }, + { + "epoch": 41.1875, + "loss": 0.10560226440429688, + "loss_ce": 0.07996749877929688, + "loss_xval": 0.025634765625, + "num_input_tokens_seen": 45348352, + "step": 659 + }, + { + "epoch": 41.25, + 
"grad_norm": 24.179182286884714, + "learning_rate": 5e-05, + "loss": 0.0898, + "num_input_tokens_seen": 45420096, + "step": 660 + }, + { + "epoch": 41.25, + "loss": 0.08728927373886108, + "loss_ce": 0.07892745733261108, + "loss_xval": 0.00836181640625, + "num_input_tokens_seen": 45420096, + "step": 660 + }, + { + "epoch": 41.3125, + "grad_norm": 37.78980795468517, + "learning_rate": 5e-05, + "loss": 0.1027, + "num_input_tokens_seen": 45491712, + "step": 661 + }, + { + "epoch": 41.3125, + "loss": 0.09915024042129517, + "loss_ce": 0.07766586542129517, + "loss_xval": 0.021484375, + "num_input_tokens_seen": 45491712, + "step": 661 + }, + { + "epoch": 41.375, + "grad_norm": 33.059393467942876, + "learning_rate": 5e-05, + "loss": 0.0992, + "num_input_tokens_seen": 45563392, + "step": 662 + }, + { + "epoch": 41.375, + "loss": 0.10039538890123367, + "loss_ce": 0.08000964671373367, + "loss_xval": 0.0203857421875, + "num_input_tokens_seen": 45563392, + "step": 662 + }, + { + "epoch": 41.4375, + "grad_norm": 32.69051951121018, + "learning_rate": 5e-05, + "loss": 0.0912, + "num_input_tokens_seen": 45635008, + "step": 663 + }, + { + "epoch": 41.4375, + "loss": 0.09104805439710617, + "loss_ce": 0.07212715595960617, + "loss_xval": 0.0189208984375, + "num_input_tokens_seen": 45635008, + "step": 663 + }, + { + "epoch": 41.5, + "grad_norm": 44.721200169778065, + "learning_rate": 5e-05, + "loss": 0.1053, + "num_input_tokens_seen": 45694016, + "step": 664 + }, + { + "epoch": 41.5, + "loss": 0.10693207383155823, + "loss_ce": 0.07580414414405823, + "loss_xval": 0.0311279296875, + "num_input_tokens_seen": 45694016, + "step": 664 + }, + { + "epoch": 41.5625, + "grad_norm": 10.834594654550962, + "learning_rate": 5e-05, + "loss": 0.0782, + "num_input_tokens_seen": 45765824, + "step": 665 + }, + { + "epoch": 41.5625, + "loss": 0.07746880501508713, + "loss_ce": 0.07350151985883713, + "loss_xval": 0.00396728515625, + "num_input_tokens_seen": 45765824, + "step": 665 + }, + { + "epoch": 41.625, + "grad_norm": 42.53339591806573, + "learning_rate": 5e-05, + "loss": 0.1004, + "num_input_tokens_seen": 45837504, + "step": 666 + }, + { + "epoch": 41.625, + "loss": 0.1011945903301239, + "loss_ce": 0.0706770122051239, + "loss_xval": 0.030517578125, + "num_input_tokens_seen": 45837504, + "step": 666 + }, + { + "epoch": 41.6875, + "grad_norm": 6.314878555402224, + "learning_rate": 5e-05, + "loss": 0.0723, + "num_input_tokens_seen": 45909120, + "step": 667 + }, + { + "epoch": 41.6875, + "loss": 0.07241284847259521, + "loss_ce": 0.06925427913665771, + "loss_xval": 0.0031585693359375, + "num_input_tokens_seen": 45909120, + "step": 667 + }, + { + "epoch": 41.75, + "grad_norm": 38.929875736582176, + "learning_rate": 5e-05, + "loss": 0.0937, + "num_input_tokens_seen": 45980736, + "step": 668 + }, + { + "epoch": 41.75, + "loss": 0.09472913295030594, + "loss_ce": 0.07043714076280594, + "loss_xval": 0.0242919921875, + "num_input_tokens_seen": 45980736, + "step": 668 + }, + { + "epoch": 41.8125, + "grad_norm": 24.81714823157948, + "learning_rate": 5e-05, + "loss": 0.0811, + "num_input_tokens_seen": 46052416, + "step": 669 + }, + { + "epoch": 41.8125, + "loss": 0.08008313924074173, + "loss_ce": 0.06952405720949173, + "loss_xval": 0.01055908203125, + "num_input_tokens_seen": 46052416, + "step": 669 + }, + { + "epoch": 41.875, + "grad_norm": 27.465588929973283, + "learning_rate": 5e-05, + "loss": 0.0793, + "num_input_tokens_seen": 46111424, + "step": 670 + }, + { + "epoch": 41.875, + "loss": 0.07748164236545563, + "loss_ce": 
0.06686152517795563, + "loss_xval": 0.0106201171875, + "num_input_tokens_seen": 46111424, + "step": 670 + }, + { + "epoch": 41.9375, + "grad_norm": 46.14633889565823, + "learning_rate": 5e-05, + "loss": 0.0991, + "num_input_tokens_seen": 46183104, + "step": 671 + }, + { + "epoch": 41.9375, + "loss": 0.10552562028169632, + "loss_ce": 0.06890452653169632, + "loss_xval": 0.03662109375, + "num_input_tokens_seen": 46183104, + "step": 671 + }, + { + "epoch": 42.0, + "grad_norm": 12.898889615697131, + "learning_rate": 5e-05, + "loss": 0.0699, + "num_input_tokens_seen": 46242176, + "step": 672 + }, + { + "epoch": 42.0, + "loss": 0.07312614470720291, + "loss_ce": 0.06470329314470291, + "loss_xval": 0.0084228515625, + "num_input_tokens_seen": 46242176, + "step": 672 + }, + { + "epoch": 42.0625, + "grad_norm": 22.250142954382216, + "learning_rate": 5e-05, + "loss": 0.0728, + "num_input_tokens_seen": 46313920, + "step": 673 + }, + { + "epoch": 42.0625, + "loss": 0.0726141631603241, + "loss_ce": 0.0619330070912838, + "loss_xval": 0.01068115234375, + "num_input_tokens_seen": 46313920, + "step": 673 + }, + { + "epoch": 42.125, + "grad_norm": 15.733860421093425, + "learning_rate": 5e-05, + "loss": 0.0683, + "num_input_tokens_seen": 46385664, + "step": 674 + }, + { + "epoch": 42.125, + "loss": 0.06907017529010773, + "loss_ce": 0.06220372021198273, + "loss_xval": 0.006866455078125, + "num_input_tokens_seen": 46385664, + "step": 674 + }, + { + "epoch": 42.1875, + "grad_norm": 16.41276651460441, + "learning_rate": 5e-05, + "loss": 0.0708, + "num_input_tokens_seen": 46457408, + "step": 675 + }, + { + "epoch": 42.1875, + "loss": 0.0687999352812767, + "loss_ce": 0.0634288415312767, + "loss_xval": 0.00537109375, + "num_input_tokens_seen": 46457408, + "step": 675 + }, + { + "epoch": 42.25, + "grad_norm": 32.91585484605843, + "learning_rate": 5e-05, + "loss": 0.0785, + "num_input_tokens_seen": 46528960, + "step": 676 + }, + { + "epoch": 42.25, + "loss": 0.07857609540224075, + "loss_ce": 0.06258488446474075, + "loss_xval": 0.0159912109375, + "num_input_tokens_seen": 46528960, + "step": 676 + }, + { + "epoch": 42.3125, + "grad_norm": 13.792516811760255, + "learning_rate": 5e-05, + "loss": 0.0672, + "num_input_tokens_seen": 46588096, + "step": 677 + }, + { + "epoch": 42.3125, + "loss": 0.07050401717424393, + "loss_ce": 0.06086046248674393, + "loss_xval": 0.0096435546875, + "num_input_tokens_seen": 46588096, + "step": 677 + }, + { + "epoch": 42.375, + "grad_norm": 15.583831144343767, + "learning_rate": 5e-05, + "loss": 0.0654, + "num_input_tokens_seen": 46659648, + "step": 678 + }, + { + "epoch": 42.375, + "loss": 0.06597653776407242, + "loss_ce": 0.05791989713907242, + "loss_xval": 0.008056640625, + "num_input_tokens_seen": 46659648, + "step": 678 + }, + { + "epoch": 42.4375, + "grad_norm": 13.894096468825236, + "learning_rate": 5e-05, + "loss": 0.067, + "num_input_tokens_seen": 46731264, + "step": 679 + }, + { + "epoch": 42.4375, + "loss": 0.06740245968103409, + "loss_ce": 0.06007824093103409, + "loss_xval": 0.00732421875, + "num_input_tokens_seen": 46731264, + "step": 679 + }, + { + "epoch": 42.5, + "grad_norm": 11.877728077317133, + "learning_rate": 5e-05, + "loss": 0.0619, + "num_input_tokens_seen": 46802944, + "step": 680 + }, + { + "epoch": 42.5, + "loss": 0.06254500895738602, + "loss_ce": 0.05720443278551102, + "loss_xval": 0.005340576171875, + "num_input_tokens_seen": 46802944, + "step": 680 + }, + { + "epoch": 42.5625, + "grad_norm": 27.267415158619585, + "learning_rate": 5e-05, + "loss": 0.0697, + 
"num_input_tokens_seen": 46874496, + "step": 681 + }, + { + "epoch": 42.5625, + "loss": 0.06751132011413574, + "loss_ce": 0.05512118339538574, + "loss_xval": 0.01239013671875, + "num_input_tokens_seen": 46874496, + "step": 681 + }, + { + "epoch": 42.625, + "grad_norm": 16.835417062638697, + "learning_rate": 5e-05, + "loss": 0.0592, + "num_input_tokens_seen": 46946176, + "step": 682 + }, + { + "epoch": 42.625, + "loss": 0.06013280153274536, + "loss_ce": 0.05412083864212036, + "loss_xval": 0.006011962890625, + "num_input_tokens_seen": 46946176, + "step": 682 + }, + { + "epoch": 42.6875, + "grad_norm": 12.117372223184038, + "learning_rate": 5e-05, + "loss": 0.0611, + "num_input_tokens_seen": 47017792, + "step": 683 + }, + { + "epoch": 42.6875, + "loss": 0.05970475450158119, + "loss_ce": 0.05415055528283119, + "loss_xval": 0.00555419921875, + "num_input_tokens_seen": 47017792, + "step": 683 + }, + { + "epoch": 42.75, + "grad_norm": 42.62280927507315, + "learning_rate": 5e-05, + "loss": 0.0806, + "num_input_tokens_seen": 47089344, + "step": 684 + }, + { + "epoch": 42.75, + "loss": 0.08083441853523254, + "loss_ce": 0.053246524184942245, + "loss_xval": 0.027587890625, + "num_input_tokens_seen": 47089344, + "step": 684 + }, + { + "epoch": 42.8125, + "grad_norm": 48.03632893846273, + "learning_rate": 5e-05, + "loss": 0.085, + "num_input_tokens_seen": 47148352, + "step": 685 + }, + { + "epoch": 42.8125, + "loss": 0.08726750314235687, + "loss_ce": 0.05333195626735687, + "loss_xval": 0.033935546875, + "num_input_tokens_seen": 47148352, + "step": 685 + }, + { + "epoch": 42.875, + "grad_norm": 18.319266290733662, + "learning_rate": 5e-05, + "loss": 0.058, + "num_input_tokens_seen": 47220032, + "step": 686 + }, + { + "epoch": 42.875, + "loss": 0.05868493393063545, + "loss_ce": 0.05191003158688545, + "loss_xval": 0.00677490234375, + "num_input_tokens_seen": 47220032, + "step": 686 + }, + { + "epoch": 42.9375, + "grad_norm": 20.324921894587835, + "learning_rate": 5e-05, + "loss": 0.0575, + "num_input_tokens_seen": 47291648, + "step": 687 + }, + { + "epoch": 42.9375, + "loss": 0.057490069419145584, + "loss_ce": 0.049769122153520584, + "loss_xval": 0.007720947265625, + "num_input_tokens_seen": 47291648, + "step": 687 + }, + { + "epoch": 43.0, + "grad_norm": 35.3073557714966, + "learning_rate": 5e-05, + "loss": 0.0695, + "num_input_tokens_seen": 47363456, + "step": 688 + }, + { + "epoch": 43.0, + "loss": 0.07189452648162842, + "loss_ce": 0.04870116710662842, + "loss_xval": 0.023193359375, + "num_input_tokens_seen": 47363456, + "step": 688 + }, + { + "epoch": 43.0625, + "grad_norm": 24.857812599646923, + "learning_rate": 5e-05, + "loss": 0.0629, + "num_input_tokens_seen": 47435008, + "step": 689 + }, + { + "epoch": 43.0625, + "loss": 0.06210353970527649, + "loss_ce": 0.04714992642402649, + "loss_xval": 0.01495361328125, + "num_input_tokens_seen": 47435008, + "step": 689 + }, + { + "epoch": 43.125, + "grad_norm": 0.8296193514274374, + "learning_rate": 5e-05, + "loss": 0.0515, + "num_input_tokens_seen": 47506752, + "step": 690 + }, + { + "epoch": 43.125, + "loss": 0.05312083289027214, + "loss_ce": 0.04974864050745964, + "loss_xval": 0.0033721923828125, + "num_input_tokens_seen": 47506752, + "step": 690 + }, + { + "epoch": 43.1875, + "grad_norm": 26.00094135796779, + "learning_rate": 5e-05, + "loss": 0.0599, + "num_input_tokens_seen": 47578368, + "step": 691 + }, + { + "epoch": 43.1875, + "loss": 0.06024426594376564, + "loss_ce": 0.04614514485001564, + "loss_xval": 0.01409912109375, + "num_input_tokens_seen": 
47578368, + "step": 691 + }, + { + "epoch": 43.25, + "grad_norm": 32.288835474595736, + "learning_rate": 5e-05, + "loss": 0.0623, + "num_input_tokens_seen": 47637504, + "step": 692 + }, + { + "epoch": 43.25, + "loss": 0.06244288384914398, + "loss_ce": 0.04535304009914398, + "loss_xval": 0.01708984375, + "num_input_tokens_seen": 47637504, + "step": 692 + }, + { + "epoch": 43.3125, + "grad_norm": 10.261597614874594, + "learning_rate": 5e-05, + "loss": 0.0508, + "num_input_tokens_seen": 47709248, + "step": 693 + }, + { + "epoch": 43.3125, + "loss": 0.051792070269584656, + "loss_ce": 0.043308183550834656, + "loss_xval": 0.00848388671875, + "num_input_tokens_seen": 47709248, + "step": 693 + }, + { + "epoch": 43.375, + "grad_norm": 21.415064135225904, + "learning_rate": 5e-05, + "loss": 0.0521, + "num_input_tokens_seen": 47780928, + "step": 694 + }, + { + "epoch": 43.375, + "loss": 0.053237855434417725, + "loss_ce": 0.042983949184417725, + "loss_xval": 0.01025390625, + "num_input_tokens_seen": 47780928, + "step": 694 + }, + { + "epoch": 43.4375, + "grad_norm": 43.97768401612803, + "learning_rate": 5e-05, + "loss": 0.072, + "num_input_tokens_seen": 47852544, + "step": 695 + }, + { + "epoch": 43.4375, + "loss": 0.0703013688325882, + "loss_ce": 0.043201759457588196, + "loss_xval": 0.027099609375, + "num_input_tokens_seen": 47852544, + "step": 695 + }, + { + "epoch": 43.5, + "grad_norm": 50.50532393833621, + "learning_rate": 5e-05, + "loss": 0.0806, + "num_input_tokens_seen": 47924096, + "step": 696 + }, + { + "epoch": 43.5, + "loss": 0.08449646830558777, + "loss_ce": 0.04226014390587807, + "loss_xval": 0.042236328125, + "num_input_tokens_seen": 47924096, + "step": 696 + }, + { + "epoch": 43.5625, + "grad_norm": 39.98741018375859, + "learning_rate": 5e-05, + "loss": 0.0663, + "num_input_tokens_seen": 47995648, + "step": 697 + }, + { + "epoch": 43.5625, + "loss": 0.0640028789639473, + "loss_ce": 0.041175730526447296, + "loss_xval": 0.0228271484375, + "num_input_tokens_seen": 47995648, + "step": 697 + }, + { + "epoch": 43.625, + "grad_norm": 19.82453199076641, + "learning_rate": 5e-05, + "loss": 0.0557, + "num_input_tokens_seen": 48067392, + "step": 698 + }, + { + "epoch": 43.625, + "loss": 0.06127607077360153, + "loss_ce": 0.04015790671110153, + "loss_xval": 0.0211181640625, + "num_input_tokens_seen": 48067392, + "step": 698 + }, + { + "epoch": 43.6875, + "grad_norm": 3.178978415114982, + "learning_rate": 5e-05, + "loss": 0.0444, + "num_input_tokens_seen": 48139008, + "step": 699 + }, + { + "epoch": 43.6875, + "loss": 0.0443594753742218, + "loss_ce": 0.0406668484210968, + "loss_xval": 0.003692626953125, + "num_input_tokens_seen": 48139008, + "step": 699 + }, + { + "epoch": 43.75, + "grad_norm": 19.867462089830223, + "learning_rate": 5e-05, + "loss": 0.0472, + "num_input_tokens_seen": 48210688, + "step": 700 + }, + { + "epoch": 43.75, + "loss": 0.04552736133337021, + "loss_ce": 0.03808107227087021, + "loss_xval": 0.0074462890625, + "num_input_tokens_seen": 48210688, + "step": 700 + }, + { + "epoch": 43.8125, + "grad_norm": 30.11916544309693, + "learning_rate": 5e-05, + "loss": 0.0548, + "num_input_tokens_seen": 48282304, + "step": 701 + }, + { + "epoch": 43.8125, + "loss": 0.05572984740138054, + "loss_ce": 0.03973863646388054, + "loss_xval": 0.0159912109375, + "num_input_tokens_seen": 48282304, + "step": 701 + }, + { + "epoch": 43.875, + "grad_norm": 33.42329124347825, + "learning_rate": 5e-05, + "loss": 0.0578, + "num_input_tokens_seen": 48353856, + "step": 702 + }, + { + "epoch": 43.875, + "loss": 
0.059752244502305984, + "loss_ce": 0.038878221064805984, + "loss_xval": 0.0208740234375, + "num_input_tokens_seen": 48353856, + "step": 702 + }, + { + "epoch": 43.9375, + "grad_norm": 34.79651760572601, + "learning_rate": 5e-05, + "loss": 0.0606, + "num_input_tokens_seen": 48412992, + "step": 703 + }, + { + "epoch": 43.9375, + "loss": 0.057222314178943634, + "loss_ce": 0.034273095428943634, + "loss_xval": 0.02294921875, + "num_input_tokens_seen": 48412992, + "step": 703 + }, + { + "epoch": 44.0, + "grad_norm": 42.63367419447805, + "learning_rate": 5e-05, + "loss": 0.0643, + "num_input_tokens_seen": 48484672, + "step": 704 + }, + { + "epoch": 44.0, + "loss": 0.061634887009859085, + "loss_ce": 0.038319457322359085, + "loss_xval": 0.0233154296875, + "num_input_tokens_seen": 48484672, + "step": 704 + }, + { + "epoch": 44.0625, + "grad_norm": 56.91280811688065, + "learning_rate": 5e-05, + "loss": 0.0814, + "num_input_tokens_seen": 48556352, + "step": 705 + }, + { + "epoch": 44.0625, + "loss": 0.07580693066120148, + "loss_ce": 0.034791309386491776, + "loss_xval": 0.041015625, + "num_input_tokens_seen": 48556352, + "step": 705 + }, + { + "epoch": 44.125, + "grad_norm": 78.36616983589818, + "learning_rate": 5e-05, + "loss": 0.125, + "num_input_tokens_seen": 48627968, + "step": 706 + }, + { + "epoch": 44.125, + "loss": 0.12948673963546753, + "loss_ce": 0.03671329841017723, + "loss_xval": 0.0927734375, + "num_input_tokens_seen": 48627968, + "step": 706 + }, + { + "epoch": 44.1875, + "grad_norm": 106.25507358737354, + "learning_rate": 5e-05, + "loss": 0.1908, + "num_input_tokens_seen": 48699584, + "step": 707 + }, + { + "epoch": 44.1875, + "loss": 0.19240882992744446, + "loss_ce": 0.03615882247686386, + "loss_xval": 0.15625, + "num_input_tokens_seen": 48699584, + "step": 707 + }, + { + "epoch": 44.25, + "grad_norm": 139.4602745129167, + "learning_rate": 5e-05, + "loss": 0.3064, + "num_input_tokens_seen": 48771328, + "step": 708 + }, + { + "epoch": 44.25, + "loss": 0.31655657291412354, + "loss_ce": 0.035306572914123535, + "loss_xval": 0.28125, + "num_input_tokens_seen": 48771328, + "step": 708 + }, + { + "epoch": 44.3125, + "grad_norm": 175.85977776893148, + "learning_rate": 5e-05, + "loss": 0.4576, + "num_input_tokens_seen": 48843008, + "step": 709 + }, + { + "epoch": 44.3125, + "loss": 0.45811718702316284, + "loss_ce": 0.03428905829787254, + "loss_xval": 0.423828125, + "num_input_tokens_seen": 48843008, + "step": 709 + }, + { + "epoch": 44.375, + "grad_norm": 192.97655324199405, + "learning_rate": 5e-05, + "loss": 0.5435, + "num_input_tokens_seen": 48914560, + "step": 710 + }, + { + "epoch": 44.375, + "loss": 0.5439357757568359, + "loss_ce": 0.036123279482126236, + "loss_xval": 0.5078125, + "num_input_tokens_seen": 48914560, + "step": 710 + }, + { + "epoch": 44.4375, + "grad_norm": 165.50638734522764, + "learning_rate": 5e-05, + "loss": 0.4227, + "num_input_tokens_seen": 48986240, + "step": 711 + }, + { + "epoch": 44.4375, + "loss": 0.41279712319374084, + "loss_ce": 0.03193773701786995, + "loss_xval": 0.380859375, + "num_input_tokens_seen": 48986240, + "step": 711 + }, + { + "epoch": 44.5, + "grad_norm": 84.90065968277804, + "learning_rate": 5e-05, + "loss": 0.1522, + "num_input_tokens_seen": 49057856, + "step": 712 + }, + { + "epoch": 44.5, + "loss": 0.15466076135635376, + "loss_ce": 0.03844981640577316, + "loss_xval": 0.1162109375, + "num_input_tokens_seen": 49057856, + "step": 712 + }, + { + "epoch": 44.5625, + "grad_norm": 35.09701586106571, + "learning_rate": 5e-05, + "loss": 0.0663, + 
"num_input_tokens_seen": 49129408, + "step": 713 + }, + { + "epoch": 44.5625, + "loss": 0.0639449805021286, + "loss_ce": 0.0364791564643383, + "loss_xval": 0.0274658203125, + "num_input_tokens_seen": 49129408, + "step": 713 + }, + { + "epoch": 44.625, + "grad_norm": 129.3152090152038, + "learning_rate": 5e-05, + "loss": 0.2909, + "num_input_tokens_seen": 49201088, + "step": 714 + }, + { + "epoch": 44.625, + "loss": 0.2906899154186249, + "loss_ce": 0.04166646674275398, + "loss_xval": 0.2490234375, + "num_input_tokens_seen": 49201088, + "step": 714 + }, + { + "epoch": 44.6875, + "grad_norm": 136.13492585050543, + "learning_rate": 5e-05, + "loss": 0.3203, + "num_input_tokens_seen": 49272832, + "step": 715 + }, + { + "epoch": 44.6875, + "loss": 0.315001517534256, + "loss_ce": 0.043517131358385086, + "loss_xval": 0.271484375, + "num_input_tokens_seen": 49272832, + "step": 715 + }, + { + "epoch": 44.75, + "grad_norm": 54.00318687230522, + "learning_rate": 5e-05, + "loss": 0.0901, + "num_input_tokens_seen": 49344576, + "step": 716 + }, + { + "epoch": 44.75, + "loss": 0.09366574883460999, + "loss_ce": 0.039466530084609985, + "loss_xval": 0.05419921875, + "num_input_tokens_seen": 49344576, + "step": 716 + }, + { + "epoch": 44.8125, + "grad_norm": 59.18192953802604, + "learning_rate": 5e-05, + "loss": 0.1019, + "num_input_tokens_seen": 49416192, + "step": 717 + }, + { + "epoch": 44.8125, + "loss": 0.10018584132194519, + "loss_ce": 0.04037138447165489, + "loss_xval": 0.059814453125, + "num_input_tokens_seen": 49416192, + "step": 717 + }, + { + "epoch": 44.875, + "grad_norm": 116.34551079767814, + "learning_rate": 5e-05, + "loss": 0.256, + "num_input_tokens_seen": 49487744, + "step": 718 + }, + { + "epoch": 44.875, + "loss": 0.24698016047477722, + "loss_ce": 0.042878590524196625, + "loss_xval": 0.2041015625, + "num_input_tokens_seen": 49487744, + "step": 718 + }, + { + "epoch": 44.9375, + "grad_norm": 72.96403398639748, + "learning_rate": 5e-05, + "loss": 0.1235, + "num_input_tokens_seen": 49546880, + "step": 719 + }, + { + "epoch": 44.9375, + "loss": 0.1306336671113968, + "loss_ce": 0.03688367083668709, + "loss_xval": 0.09375, + "num_input_tokens_seen": 49546880, + "step": 719 + }, + { + "epoch": 45.0, + "grad_norm": 34.10621333807227, + "learning_rate": 5e-05, + "loss": 0.062, + "num_input_tokens_seen": 49618496, + "step": 720 + }, + { + "epoch": 45.0, + "loss": 0.06327150762081146, + "loss_ce": 0.03885744512081146, + "loss_xval": 0.0244140625, + "num_input_tokens_seen": 49618496, + "step": 720 + }, + { + "epoch": 45.0625, + "grad_norm": 106.42200449605606, + "learning_rate": 5e-05, + "loss": 0.2205, + "num_input_tokens_seen": 49690048, + "step": 721 + }, + { + "epoch": 45.0625, + "loss": 0.23118966817855835, + "loss_ce": 0.03685373440384865, + "loss_xval": 0.1943359375, + "num_input_tokens_seen": 49690048, + "step": 721 + }, + { + "epoch": 45.125, + "grad_norm": 83.45709473306052, + "learning_rate": 5e-05, + "loss": 0.1568, + "num_input_tokens_seen": 49761728, + "step": 722 + }, + { + "epoch": 45.125, + "loss": 0.14840547740459442, + "loss_ce": 0.03658906742930412, + "loss_xval": 0.11181640625, + "num_input_tokens_seen": 49761728, + "step": 722 + }, + { + "epoch": 45.1875, + "grad_norm": 2.3389858812521966, + "learning_rate": 5e-05, + "loss": 0.0446, + "num_input_tokens_seen": 49833280, + "step": 723 + }, + { + "epoch": 45.1875, + "loss": 0.04393548145890236, + "loss_ce": 0.03792351856827736, + "loss_xval": 0.006011962890625, + "num_input_tokens_seen": 49833280, + "step": 723 + }, + { + "epoch": 
45.25, + "grad_norm": 72.71402722285599, + "learning_rate": 5e-05, + "loss": 0.1276, + "num_input_tokens_seen": 49904896, + "step": 724 + }, + { + "epoch": 45.25, + "loss": 0.1313050538301468, + "loss_ce": 0.03267224133014679, + "loss_xval": 0.0986328125, + "num_input_tokens_seen": 49904896, + "step": 724 + }, + { + "epoch": 45.3125, + "grad_norm": 86.73820778200127, + "learning_rate": 5e-05, + "loss": 0.166, + "num_input_tokens_seen": 49976448, + "step": 725 + }, + { + "epoch": 45.3125, + "loss": 0.17679466307163239, + "loss_ce": 0.039099350571632385, + "loss_xval": 0.1376953125, + "num_input_tokens_seen": 49976448, + "step": 725 + }, + { + "epoch": 45.375, + "grad_norm": 41.67951897794212, + "learning_rate": 5e-05, + "loss": 0.0656, + "num_input_tokens_seen": 50048000, + "step": 726 + }, + { + "epoch": 45.375, + "loss": 0.06182122975587845, + "loss_ce": 0.03350091725587845, + "loss_xval": 0.0283203125, + "num_input_tokens_seen": 50048000, + "step": 726 + }, + { + "epoch": 45.4375, + "grad_norm": 25.223047984935373, + "learning_rate": 5e-05, + "loss": 0.0496, + "num_input_tokens_seen": 50119680, + "step": 727 + }, + { + "epoch": 45.4375, + "loss": 0.04861375689506531, + "loss_ce": 0.03701707720756531, + "loss_xval": 0.0115966796875, + "num_input_tokens_seen": 50119680, + "step": 727 + }, + { + "epoch": 45.5, + "grad_norm": 58.577934750268255, + "learning_rate": 5e-05, + "loss": 0.0958, + "num_input_tokens_seen": 50191424, + "step": 728 + }, + { + "epoch": 45.5, + "loss": 0.09291598200798035, + "loss_ce": 0.03432223200798035, + "loss_xval": 0.05859375, + "num_input_tokens_seen": 50191424, + "step": 728 + }, + { + "epoch": 45.5625, + "grad_norm": 39.21227207563384, + "learning_rate": 5e-05, + "loss": 0.0598, + "num_input_tokens_seen": 50263168, + "step": 729 + }, + { + "epoch": 45.5625, + "loss": 0.061409905552864075, + "loss_ce": 0.028695063665509224, + "loss_xval": 0.03271484375, + "num_input_tokens_seen": 50263168, + "step": 729 + }, + { + "epoch": 45.625, + "grad_norm": 9.434706828644057, + "learning_rate": 5e-05, + "loss": 0.0343, + "num_input_tokens_seen": 50334720, + "step": 730 + }, + { + "epoch": 45.625, + "loss": 0.03275488317012787, + "loss_ce": 0.027292238548398018, + "loss_xval": 0.005462646484375, + "num_input_tokens_seen": 50334720, + "step": 730 + }, + { + "epoch": 45.6875, + "grad_norm": 44.995512555068565, + "learning_rate": 5e-05, + "loss": 0.0675, + "num_input_tokens_seen": 50393856, + "step": 731 + }, + { + "epoch": 45.6875, + "loss": 0.0645364299416542, + "loss_ce": 0.029868463054299355, + "loss_xval": 0.03466796875, + "num_input_tokens_seen": 50393856, + "step": 731 + }, + { + "epoch": 45.75, + "grad_norm": 41.09193323924135, + "learning_rate": 5e-05, + "loss": 0.0608, + "num_input_tokens_seen": 50452928, + "step": 732 + }, + { + "epoch": 45.75, + "loss": 0.05605471879243851, + "loss_ce": 0.026513701304793358, + "loss_xval": 0.029541015625, + "num_input_tokens_seen": 50452928, + "step": 732 + }, + { + "epoch": 45.8125, + "grad_norm": 7.2153893512299145, + "learning_rate": 5e-05, + "loss": 0.0316, + "num_input_tokens_seen": 50512064, + "step": 733 + }, + { + "epoch": 45.8125, + "loss": 0.030238093808293343, + "loss_ce": 0.026057185605168343, + "loss_xval": 0.004180908203125, + "num_input_tokens_seen": 50512064, + "step": 733 + }, + { + "epoch": 45.875, + "grad_norm": 29.270123435698675, + "learning_rate": 5e-05, + "loss": 0.0448, + "num_input_tokens_seen": 50583616, + "step": 734 + }, + { + "epoch": 45.875, + "loss": 0.04943709075450897, + "loss_ce": 
0.026609940454363823, + "loss_xval": 0.0228271484375, + "num_input_tokens_seen": 50583616, + "step": 734 + }, + { + "epoch": 45.9375, + "grad_norm": 43.11011675650782, + "learning_rate": 5e-05, + "loss": 0.0611, + "num_input_tokens_seen": 50655360, + "step": 735 + }, + { + "epoch": 45.9375, + "loss": 0.05930478125810623, + "loss_ce": 0.02610165812075138, + "loss_xval": 0.033203125, + "num_input_tokens_seen": 50655360, + "step": 735 + }, + { + "epoch": 46.0, + "grad_norm": 26.0936617683548, + "learning_rate": 5e-05, + "loss": 0.0423, + "num_input_tokens_seen": 50726912, + "step": 736 + }, + { + "epoch": 46.0, + "loss": 0.045453645288944244, + "loss_ce": 0.024213409051299095, + "loss_xval": 0.021240234375, + "num_input_tokens_seen": 50726912, + "step": 736 + }, + { + "epoch": 46.0625, + "grad_norm": 7.645360379252364, + "learning_rate": 5e-05, + "loss": 0.0248, + "num_input_tokens_seen": 50786112, + "step": 737 + }, + { + "epoch": 46.0625, + "loss": 0.024106912314891815, + "loss_ce": 0.019864968955516815, + "loss_xval": 0.004241943359375, + "num_input_tokens_seen": 50786112, + "step": 737 + }, + { + "epoch": 46.125, + "grad_norm": 36.16606847044462, + "learning_rate": 5e-05, + "loss": 0.0496, + "num_input_tokens_seen": 50857792, + "step": 738 + }, + { + "epoch": 46.125, + "loss": 0.048444587737321854, + "loss_ce": 0.025495368987321854, + "loss_xval": 0.02294921875, + "num_input_tokens_seen": 50857792, + "step": 738 + }, + { + "epoch": 46.1875, + "grad_norm": 43.482696946571494, + "learning_rate": 5e-05, + "loss": 0.059, + "num_input_tokens_seen": 50929344, + "step": 739 + }, + { + "epoch": 46.1875, + "loss": 0.057134151458740234, + "loss_ce": 0.024175165221095085, + "loss_xval": 0.032958984375, + "num_input_tokens_seen": 50929344, + "step": 739 + }, + { + "epoch": 46.25, + "grad_norm": 22.770786901935196, + "learning_rate": 5e-05, + "loss": 0.0308, + "num_input_tokens_seen": 50988480, + "step": 740 + }, + { + "epoch": 46.25, + "loss": 0.029636967927217484, + "loss_ce": 0.017124760895967484, + "loss_xval": 0.01251220703125, + "num_input_tokens_seen": 50988480, + "step": 740 + }, + { + "epoch": 46.3125, + "grad_norm": 16.404867876806033, + "learning_rate": 5e-05, + "loss": 0.0272, + "num_input_tokens_seen": 51060160, + "step": 741 + }, + { + "epoch": 46.3125, + "loss": 0.026602361351251602, + "loss_ce": 0.019644353538751602, + "loss_xval": 0.0069580078125, + "num_input_tokens_seen": 51060160, + "step": 741 + }, + { + "epoch": 46.375, + "grad_norm": 54.91413940931545, + "learning_rate": 5e-05, + "loss": 0.0718, + "num_input_tokens_seen": 51119296, + "step": 742 + }, + { + "epoch": 46.375, + "loss": 0.07138977199792862, + "loss_ce": 0.01670227386057377, + "loss_xval": 0.0546875, + "num_input_tokens_seen": 51119296, + "step": 742 + }, + { + "epoch": 46.4375, + "grad_norm": 71.93891750065025, + "learning_rate": 5e-05, + "loss": 0.1097, + "num_input_tokens_seen": 51190912, + "step": 743 + }, + { + "epoch": 46.4375, + "loss": 0.11477219313383102, + "loss_ce": 0.017115944996476173, + "loss_xval": 0.09765625, + "num_input_tokens_seen": 51190912, + "step": 743 + }, + { + "epoch": 46.5, + "grad_norm": 59.730350011881576, + "learning_rate": 5e-05, + "loss": 0.0768, + "num_input_tokens_seen": 51262656, + "step": 744 + }, + { + "epoch": 46.5, + "loss": 0.07721823453903198, + "loss_ce": 0.013253391720354557, + "loss_xval": 0.06396484375, + "num_input_tokens_seen": 51262656, + "step": 744 + }, + { + "epoch": 46.5625, + "grad_norm": 24.58129799871284, + "learning_rate": 5e-05, + "loss": 0.0272, + 
"num_input_tokens_seen": 51334208, + "step": 745 + }, + { + "epoch": 46.5625, + "loss": 0.02467159926891327, + "loss_ce": 0.014539762400090694, + "loss_xval": 0.0101318359375, + "num_input_tokens_seen": 51334208, + "step": 745 + }, + { + "epoch": 46.625, + "grad_norm": 18.53325748451815, + "learning_rate": 5e-05, + "loss": 0.0218, + "num_input_tokens_seen": 51405824, + "step": 746 + }, + { + "epoch": 46.625, + "loss": 0.017218932509422302, + "loss_ce": 0.011847837828099728, + "loss_xval": 0.00537109375, + "num_input_tokens_seen": 51405824, + "step": 746 + }, + { + "epoch": 46.6875, + "grad_norm": 50.9913159022942, + "learning_rate": 5e-05, + "loss": 0.0579, + "num_input_tokens_seen": 51477440, + "step": 747 + }, + { + "epoch": 46.6875, + "loss": 0.059509001672267914, + "loss_ce": 0.009216032922267914, + "loss_xval": 0.05029296875, + "num_input_tokens_seen": 51477440, + "step": 747 + }, + { + "epoch": 46.75, + "grad_norm": 63.66315135326273, + "learning_rate": 5e-05, + "loss": 0.0821, + "num_input_tokens_seen": 51549120, + "step": 748 + }, + { + "epoch": 46.75, + "loss": 0.0909615084528923, + "loss_ce": 0.008930260315537453, + "loss_xval": 0.08203125, + "num_input_tokens_seen": 51549120, + "step": 748 + }, + { + "epoch": 46.8125, + "grad_norm": 53.49089256030126, + "learning_rate": 5e-05, + "loss": 0.0614, + "num_input_tokens_seen": 51608256, + "step": 749 + }, + { + "epoch": 46.8125, + "loss": 0.05685190483927727, + "loss_ce": 0.008023779839277267, + "loss_xval": 0.048828125, + "num_input_tokens_seen": 51608256, + "step": 749 + }, + { + "epoch": 46.875, + "grad_norm": 22.96802504271489, + "learning_rate": 5e-05, + "loss": 0.0204, + "num_input_tokens_seen": 51679936, + "step": 750 + }, + { + "epoch": 46.875, + "eval_synth_IoU": 0.0007812945987097919, + "eval_synth_MAE_x": 0.0928192138671875, + "eval_synth_MAE_y": 0.088897705078125, + "eval_synth_NUM_probability": 0.9719446450471878, + "eval_synth_inside_bbox": 0.0, + "eval_synth_loss": 0.015559088438749313, + "eval_synth_loss_ce": 0.006327520357444882, + "eval_synth_loss_xval": 0.0092315673828125, + "eval_synth_runtime": 55.1359, + "eval_synth_samples_per_second": 2.322, + "eval_synth_steps_per_second": 0.073, + "num_input_tokens_seen": 51679936, + "step": 750 + }, + { + "epoch": 46.875, + "loss": 0.018887830898165703, + "loss_ce": 0.0071080452762544155, + "loss_xval": 0.01177978515625, + "num_input_tokens_seen": 51679936, + "step": 750 + }, + { + "epoch": 46.9375, + "grad_norm": 19.42051608033023, + "learning_rate": 5e-05, + "loss": 0.0179, + "num_input_tokens_seen": 51751680, + "step": 751 + }, + { + "epoch": 46.9375, + "loss": 0.012810613960027695, + "loss_ce": 0.007103826384991407, + "loss_xval": 0.005706787109375, + "num_input_tokens_seen": 51751680, + "step": 751 + }, + { + "epoch": 47.0, + "grad_norm": 54.658150025767185, + "learning_rate": 5e-05, + "loss": 0.0623, + "num_input_tokens_seen": 51823232, + "step": 752 + }, + { + "epoch": 47.0, + "loss": 0.06768595427274704, + "loss_ce": 0.006650795694440603, + "loss_xval": 0.06103515625, + "num_input_tokens_seen": 51823232, + "step": 752 + }, + { + "epoch": 47.0625, + "grad_norm": 69.07348114452765, + "learning_rate": 5e-05, + "loss": 0.0919, + "num_input_tokens_seen": 51894912, + "step": 753 + }, + { + "epoch": 47.0625, + "loss": 0.09093870967626572, + "loss_ce": 0.005489490460604429, + "loss_xval": 0.08544921875, + "num_input_tokens_seen": 51894912, + "step": 753 + }, + { + "epoch": 47.125, + "grad_norm": 60.03217462112842, + "learning_rate": 5e-05, + "loss": 0.0746, + 
"num_input_tokens_seen": 51966656, + "step": 754 + }, + { + "epoch": 47.125, + "loss": 0.07557681947946548, + "loss_ce": 0.004776036832481623, + "loss_xval": 0.07080078125, + "num_input_tokens_seen": 51966656, + "step": 754 + }, + { + "epoch": 47.1875, + "grad_norm": 43.325007150716914, + "learning_rate": 5e-05, + "loss": 0.0462, + "num_input_tokens_seen": 52038336, + "step": 755 + }, + { + "epoch": 47.1875, + "loss": 0.04101795330643654, + "loss_ce": 0.004396860953420401, + "loss_xval": 0.03662109375, + "num_input_tokens_seen": 52038336, + "step": 755 + }, + { + "epoch": 47.25, + "grad_norm": 27.442651321523833, + "learning_rate": 5e-05, + "loss": 0.0226, + "num_input_tokens_seen": 52109888, + "step": 756 + }, + { + "epoch": 47.25, + "loss": 0.017444264143705368, + "loss_ce": 0.004260670859366655, + "loss_xval": 0.01318359375, + "num_input_tokens_seen": 52109888, + "step": 756 + }, + { + "epoch": 47.3125, + "grad_norm": 8.615957725807633, + "learning_rate": 5e-05, + "loss": 0.013, + "num_input_tokens_seen": 52181632, + "step": 757 + }, + { + "epoch": 47.3125, + "loss": 0.014540612697601318, + "loss_ce": 0.004530847072601318, + "loss_xval": 0.010009765625, + "num_input_tokens_seen": 52181632, + "step": 757 + }, + { + "epoch": 47.375, + "grad_norm": 10.469872066339313, + "learning_rate": 5e-05, + "loss": 0.011, + "num_input_tokens_seen": 52240640, + "step": 758 + }, + { + "epoch": 47.375, + "loss": 0.011658506467938423, + "loss_ce": 0.003540830686688423, + "loss_xval": 0.00811767578125, + "num_input_tokens_seen": 52240640, + "step": 758 + }, + { + "epoch": 47.4375, + "grad_norm": 22.950363116153852, + "learning_rate": 5e-05, + "loss": 0.0188, + "num_input_tokens_seen": 52312320, + "step": 759 + }, + { + "epoch": 47.4375, + "loss": 0.01578526385128498, + "loss_ce": 0.003212022129446268, + "loss_xval": 0.0125732421875, + "num_input_tokens_seen": 52312320, + "step": 759 + }, + { + "epoch": 47.5, + "grad_norm": 24.16847740208771, + "learning_rate": 5e-05, + "loss": 0.0171, + "num_input_tokens_seen": 52384000, + "step": 760 + }, + { + "epoch": 47.5, + "loss": 0.01860148459672928, + "loss_ce": 0.0032206252217292786, + "loss_xval": 0.015380859375, + "num_input_tokens_seen": 52384000, + "step": 760 + }, + { + "epoch": 47.5625, + "grad_norm": 17.844345643001546, + "learning_rate": 5e-05, + "loss": 0.0128, + "num_input_tokens_seen": 52455680, + "step": 761 + }, + { + "epoch": 47.5625, + "loss": 0.013861645944416523, + "loss_ce": 0.003424634225666523, + "loss_xval": 0.01043701171875, + "num_input_tokens_seen": 52455680, + "step": 761 + }, + { + "epoch": 47.625, + "grad_norm": 18.057741465889073, + "learning_rate": 5e-05, + "loss": 0.0106, + "num_input_tokens_seen": 52527296, + "step": 762 + }, + { + "epoch": 47.625, + "loss": 0.011138019151985645, + "loss_ce": 0.002715167822316289, + "loss_xval": 0.0084228515625, + "num_input_tokens_seen": 52527296, + "step": 762 + }, + { + "epoch": 47.6875, + "grad_norm": 25.332474387451597, + "learning_rate": 5e-05, + "loss": 0.0169, + "num_input_tokens_seen": 52598976, + "step": 763 + }, + { + "epoch": 47.6875, + "loss": 0.017574511468410492, + "loss_ce": 0.0027429680339992046, + "loss_xval": 0.01483154296875, + "num_input_tokens_seen": 52598976, + "step": 763 + }, + { + "epoch": 47.75, + "grad_norm": 35.70284428611629, + "learning_rate": 5e-05, + "loss": 0.0292, + "num_input_tokens_seen": 52658240, + "step": 764 + }, + { + "epoch": 47.75, + "loss": 0.029903825372457504, + "loss_ce": 0.0024380050599575043, + "loss_xval": 0.0274658203125, + "num_input_tokens_seen": 
52658240, + "step": 764 + }, + { + "epoch": 47.8125, + "grad_norm": 49.93003045007072, + "learning_rate": 5e-05, + "loss": 0.0493, + "num_input_tokens_seen": 52729920, + "step": 765 + }, + { + "epoch": 47.8125, + "loss": 0.047652266919612885, + "loss_ce": 0.002486249664798379, + "loss_xval": 0.045166015625, + "num_input_tokens_seen": 52729920, + "step": 765 + }, + { + "epoch": 47.875, + "grad_norm": 67.404459413659, + "learning_rate": 5e-05, + "loss": 0.0829, + "num_input_tokens_seen": 52801728, + "step": 766 + }, + { + "epoch": 47.875, + "loss": 0.0774126872420311, + "loss_ce": 0.0017290961695834994, + "loss_xval": 0.07568359375, + "num_input_tokens_seen": 52801728, + "step": 766 + }, + { + "epoch": 47.9375, + "grad_norm": 90.78005663657689, + "learning_rate": 5e-05, + "loss": 0.149, + "num_input_tokens_seen": 52860864, + "step": 767 + }, + { + "epoch": 47.9375, + "loss": 0.14267286658287048, + "loss_ce": 0.0020478684455156326, + "loss_xval": 0.140625, + "num_input_tokens_seen": 52860864, + "step": 767 + }, + { + "epoch": 48.0, + "grad_norm": 120.70776113463316, + "learning_rate": 5e-05, + "loss": 0.258, + "num_input_tokens_seen": 52920064, + "step": 768 + }, + { + "epoch": 48.0, + "loss": 0.2656416893005371, + "loss_ce": 0.0019698101095855236, + "loss_xval": 0.263671875, + "num_input_tokens_seen": 52920064, + "step": 768 + }, + { + "epoch": 48.0625, + "grad_norm": 154.26644625699004, + "learning_rate": 5e-05, + "loss": 0.4307, + "num_input_tokens_seen": 52991744, + "step": 769 + }, + { + "epoch": 48.0625, + "loss": 0.42958393692970276, + "loss_ce": 0.0018495743861421943, + "loss_xval": 0.427734375, + "num_input_tokens_seen": 52991744, + "step": 769 + }, + { + "epoch": 48.125, + "grad_norm": 183.65303034626527, + "learning_rate": 5e-05, + "loss": 0.6016, + "num_input_tokens_seen": 53063424, + "step": 770 + }, + { + "epoch": 48.125, + "loss": 0.5957886576652527, + "loss_ce": 0.002038649283349514, + "loss_xval": 0.59375, + "num_input_tokens_seen": 53063424, + "step": 770 + }, + { + "epoch": 48.1875, + "grad_norm": 180.1180856007345, + "learning_rate": 5e-05, + "loss": 0.5999, + "num_input_tokens_seen": 53135104, + "step": 771 + }, + { + "epoch": 48.1875, + "loss": 0.5726340413093567, + "loss_ce": 0.0023215236142277718, + "loss_xval": 0.5703125, + "num_input_tokens_seen": 53135104, + "step": 771 + }, + { + "epoch": 48.25, + "grad_norm": 125.47094176348381, + "learning_rate": 5e-05, + "loss": 0.2987, + "num_input_tokens_seen": 53206784, + "step": 772 + }, + { + "epoch": 48.25, + "loss": 0.2898358702659607, + "loss_ce": 0.002726496197283268, + "loss_xval": 0.287109375, + "num_input_tokens_seen": 53206784, + "step": 772 + }, + { + "epoch": 48.3125, + "grad_norm": 24.422705550921062, + "learning_rate": 5e-05, + "loss": 0.0202, + "num_input_tokens_seen": 53278528, + "step": 773 + }, + { + "epoch": 48.3125, + "loss": 0.015276818536221981, + "loss_ce": 0.003924279473721981, + "loss_xval": 0.0113525390625, + "num_input_tokens_seen": 53278528, + "step": 773 + }, + { + "epoch": 48.375, + "grad_norm": 81.92908246583572, + "learning_rate": 5e-05, + "loss": 0.1362, + "num_input_tokens_seen": 53350144, + "step": 774 + }, + { + "epoch": 48.375, + "loss": 0.1354779452085495, + "loss_ce": 0.003642010036855936, + "loss_xval": 0.1318359375, + "num_input_tokens_seen": 53350144, + "step": 774 + }, + { + "epoch": 48.4375, + "grad_norm": 132.71379220552564, + "learning_rate": 5e-05, + "loss": 0.3499, + "num_input_tokens_seen": 53421824, + "step": 775 + }, + { + "epoch": 48.4375, + "loss": 0.3635503947734833, + 
"loss_ce": 0.00417538033798337, + "loss_xval": 0.359375, + "num_input_tokens_seen": 53421824, + "step": 775 + }, + { + "epoch": 48.5, + "grad_norm": 103.21943678223388, + "learning_rate": 5e-05, + "loss": 0.2241, + "num_input_tokens_seen": 53493504, + "step": 776 + }, + { + "epoch": 48.5, + "loss": 0.22680360078811646, + "loss_ce": 0.005123913753777742, + "loss_xval": 0.2216796875, + "num_input_tokens_seen": 53493504, + "step": 776 + }, + { + "epoch": 48.5625, + "grad_norm": 20.388756780472935, + "learning_rate": 5e-05, + "loss": 0.0227, + "num_input_tokens_seen": 53565120, + "step": 777 + }, + { + "epoch": 48.5625, + "loss": 0.016845600679516792, + "loss_ce": 0.00463856989517808, + "loss_xval": 0.01220703125, + "num_input_tokens_seen": 53565120, + "step": 777 + }, + { + "epoch": 48.625, + "grad_norm": 54.35238290952437, + "learning_rate": 5e-05, + "loss": 0.0736, + "num_input_tokens_seen": 53624320, + "step": 778 + }, + { + "epoch": 48.625, + "loss": 0.08257697522640228, + "loss_ce": 0.005916816648095846, + "loss_xval": 0.07666015625, + "num_input_tokens_seen": 53624320, + "step": 778 + }, + { + "epoch": 48.6875, + "grad_norm": 69.92504449745852, + "learning_rate": 5e-05, + "loss": 0.1096, + "num_input_tokens_seen": 53683520, + "step": 779 + }, + { + "epoch": 48.6875, + "loss": 0.11179614067077637, + "loss_ce": 0.005350826773792505, + "loss_xval": 0.1064453125, + "num_input_tokens_seen": 53683520, + "step": 779 + }, + { + "epoch": 48.75, + "grad_norm": 31.00750248115586, + "learning_rate": 5e-05, + "loss": 0.0312, + "num_input_tokens_seen": 53755072, + "step": 780 + }, + { + "epoch": 48.75, + "loss": 0.03486330807209015, + "loss_ce": 0.006176783703267574, + "loss_xval": 0.0286865234375, + "num_input_tokens_seen": 53755072, + "step": 780 + }, + { + "epoch": 48.8125, + "grad_norm": 26.982223682099708, + "learning_rate": 5e-05, + "loss": 0.0294, + "num_input_tokens_seen": 53826752, + "step": 781 + }, + { + "epoch": 48.8125, + "loss": 0.02688555046916008, + "loss_ce": 0.005401174537837505, + "loss_xval": 0.021484375, + "num_input_tokens_seen": 53826752, + "step": 781 + }, + { + "epoch": 48.875, + "grad_norm": 55.79318187391173, + "learning_rate": 5e-05, + "loss": 0.0742, + "num_input_tokens_seen": 53898496, + "step": 782 + }, + { + "epoch": 48.875, + "loss": 0.08878543227910995, + "loss_ce": 0.004801060538738966, + "loss_xval": 0.083984375, + "num_input_tokens_seen": 53898496, + "step": 782 + }, + { + "epoch": 48.9375, + "grad_norm": 31.524279884792552, + "learning_rate": 5e-05, + "loss": 0.0291, + "num_input_tokens_seen": 53970176, + "step": 783 + }, + { + "epoch": 48.9375, + "loss": 0.024802180007100105, + "loss_ce": 0.004416438285261393, + "loss_xval": 0.0203857421875, + "num_input_tokens_seen": 53970176, + "step": 783 + }, + { + "epoch": 49.0, + "grad_norm": 18.41228108143884, + "learning_rate": 5e-05, + "loss": 0.0153, + "num_input_tokens_seen": 54041792, + "step": 784 + }, + { + "epoch": 49.0, + "loss": 0.017882753163576126, + "loss_ce": 0.0043329475447535515, + "loss_xval": 0.0135498046875, + "num_input_tokens_seen": 54041792, + "step": 784 + }, + { + "epoch": 49.0625, + "grad_norm": 55.099401878604795, + "learning_rate": 5e-05, + "loss": 0.0738, + "num_input_tokens_seen": 54113344, + "step": 785 + }, + { + "epoch": 49.0625, + "loss": 0.0743025466799736, + "loss_ce": 0.0039900485426187515, + "loss_xval": 0.0703125, + "num_input_tokens_seen": 54113344, + "step": 785 + }, + { + "epoch": 49.125, + "grad_norm": 54.48862222878034, + "learning_rate": 5e-05, + "loss": 0.0694, + 
"num_input_tokens_seen": 54184896, + "step": 786 + }, + { + "epoch": 49.125, + "loss": 0.06297452747821808, + "loss_ce": 0.004136639181524515, + "loss_xval": 0.058837890625, + "num_input_tokens_seen": 54184896, + "step": 786 + }, + { + "epoch": 49.1875, + "grad_norm": 25.23820781380214, + "learning_rate": 5e-05, + "loss": 0.0204, + "num_input_tokens_seen": 54256576, + "step": 787 + }, + { + "epoch": 49.1875, + "loss": 0.01891055330634117, + "loss_ce": 0.0038959055673331022, + "loss_xval": 0.0150146484375, + "num_input_tokens_seen": 54256576, + "step": 787 + }, + { + "epoch": 49.25, + "grad_norm": 6.017261666530016, + "learning_rate": 5e-05, + "loss": 0.0074, + "num_input_tokens_seen": 54328384, + "step": 788 + }, + { + "epoch": 49.25, + "loss": 0.006588565185666084, + "loss_ce": 0.0036131010856479406, + "loss_xval": 0.0029754638671875, + "num_input_tokens_seen": 54328384, + "step": 788 + }, + { + "epoch": 49.3125, + "grad_norm": 27.539892076431858, + "learning_rate": 5e-05, + "loss": 0.022, + "num_input_tokens_seen": 54387520, + "step": 789 + }, + { + "epoch": 49.3125, + "loss": 0.021436044946312904, + "loss_ce": 0.004712412133812904, + "loss_xval": 0.0167236328125, + "num_input_tokens_seen": 54387520, + "step": 789 + }, + { + "epoch": 49.375, + "grad_norm": 32.542488691104325, + "learning_rate": 5e-05, + "loss": 0.0303, + "num_input_tokens_seen": 54459136, + "step": 790 + }, + { + "epoch": 49.375, + "loss": 0.030650347471237183, + "loss_ce": 0.005992144346237183, + "loss_xval": 0.024658203125, + "num_input_tokens_seen": 54459136, + "step": 790 + }, + { + "epoch": 49.4375, + "grad_norm": 20.238510613982434, + "learning_rate": 5e-05, + "loss": 0.0137, + "num_input_tokens_seen": 54530752, + "step": 791 + }, + { + "epoch": 49.4375, + "loss": 0.011637124232947826, + "loss_ce": 0.0035194484516978264, + "loss_xval": 0.00811767578125, + "num_input_tokens_seen": 54530752, + "step": 791 + }, + { + "epoch": 49.5, + "grad_norm": 1.5253884409622565, + "learning_rate": 5e-05, + "loss": 0.0056, + "num_input_tokens_seen": 54602368, + "step": 792 + }, + { + "epoch": 49.5, + "loss": 0.005845654755830765, + "loss_ce": 0.002915967023000121, + "loss_xval": 0.0029296875, + "num_input_tokens_seen": 54602368, + "step": 792 + }, + { + "epoch": 49.5625, + "grad_norm": 21.408122372907325, + "learning_rate": 5e-05, + "loss": 0.0151, + "num_input_tokens_seen": 54673920, + "step": 793 + }, + { + "epoch": 49.5625, + "loss": 0.017070988193154335, + "loss_ce": 0.0031549730338156223, + "loss_xval": 0.013916015625, + "num_input_tokens_seen": 54673920, + "step": 793 + }, + { + "epoch": 49.625, + "grad_norm": 32.00954556671291, + "learning_rate": 5e-05, + "loss": 0.0274, + "num_input_tokens_seen": 54733120, + "step": 794 + }, + { + "epoch": 49.625, + "loss": 0.026889847591519356, + "loss_ce": 0.0027199252508580685, + "loss_xval": 0.024169921875, + "num_input_tokens_seen": 54733120, + "step": 794 + }, + { + "epoch": 49.6875, + "grad_norm": 32.08281797834498, + "learning_rate": 5e-05, + "loss": 0.0261, + "num_input_tokens_seen": 54804800, + "step": 795 + }, + { + "epoch": 49.6875, + "loss": 0.02954932302236557, + "loss_ce": 0.00257178395986557, + "loss_xval": 0.0269775390625, + "num_input_tokens_seen": 54804800, + "step": 795 + }, + { + "epoch": 49.75, + "grad_norm": 23.6610997266782, + "learning_rate": 5e-05, + "loss": 0.0171, + "num_input_tokens_seen": 54876352, + "step": 796 + }, + { + "epoch": 49.75, + "loss": 0.016563788056373596, + "loss_ce": 0.002464667893946171, + "loss_xval": 0.01409912109375, + 
"num_input_tokens_seen": 54876352, + "step": 796 + }, + { + "epoch": 49.8125, + "grad_norm": 10.80261321499895, + "learning_rate": 5e-05, + "loss": 0.0069, + "num_input_tokens_seen": 54947968, + "step": 797 + }, + { + "epoch": 49.8125, + "loss": 0.008322998881340027, + "loss_ce": 0.0021584476344287395, + "loss_xval": 0.00616455078125, + "num_input_tokens_seen": 54947968, + "step": 797 + }, + { + "epoch": 49.875, + "grad_norm": 1.9500645700249208, + "learning_rate": 5e-05, + "loss": 0.0046, + "num_input_tokens_seen": 55007168, + "step": 798 + }, + { + "epoch": 49.875, + "loss": 0.004437156952917576, + "loss_ce": 0.0020873036701232195, + "loss_xval": 0.002349853515625, + "num_input_tokens_seen": 55007168, + "step": 798 + }, + { + "epoch": 49.9375, + "grad_norm": 11.979543997613995, + "learning_rate": 5e-05, + "loss": 0.0073, + "num_input_tokens_seen": 55066176, + "step": 799 + }, + { + "epoch": 49.9375, + "loss": 0.00749744800850749, + "loss_ce": 0.00227894214913249, + "loss_xval": 0.005218505859375, + "num_input_tokens_seen": 55066176, + "step": 799 + }, + { + "epoch": 50.0, + "grad_norm": 19.932370797577512, + "learning_rate": 5e-05, + "loss": 0.0134, + "num_input_tokens_seen": 55137984, + "step": 800 + }, + { + "epoch": 50.0, + "loss": 0.013562907464802265, + "loss_ce": 0.0020272627007216215, + "loss_xval": 0.01153564453125, + "num_input_tokens_seen": 55137984, + "step": 800 + }, + { + "epoch": 50.0625, + "grad_norm": 26.848126234216654, + "learning_rate": 5e-05, + "loss": 0.021, + "num_input_tokens_seen": 55197120, + "step": 801 + }, + { + "epoch": 50.0625, + "loss": 0.018240634351968765, + "loss_ce": 0.0020052818581461906, + "loss_xval": 0.0162353515625, + "num_input_tokens_seen": 55197120, + "step": 801 + }, + { + "epoch": 50.125, + "grad_norm": 26.692372871133674, + "learning_rate": 5e-05, + "loss": 0.0185, + "num_input_tokens_seen": 55268736, + "step": 802 + }, + { + "epoch": 50.125, + "loss": 0.01761055923998356, + "loss_ce": 0.0016193479532375932, + "loss_xval": 0.0159912109375, + "num_input_tokens_seen": 55268736, + "step": 802 + }, + { + "epoch": 50.1875, + "grad_norm": 18.658201527903525, + "learning_rate": 5e-05, + "loss": 0.0113, + "num_input_tokens_seen": 55340480, + "step": 803 + }, + { + "epoch": 50.1875, + "loss": 0.012265665456652641, + "loss_ce": 0.0016455486183986068, + "loss_xval": 0.0106201171875, + "num_input_tokens_seen": 55340480, + "step": 803 + }, + { + "epoch": 50.25, + "grad_norm": 8.941056023600712, + "learning_rate": 5e-05, + "loss": 0.0051, + "num_input_tokens_seen": 55412032, + "step": 804 + }, + { + "epoch": 50.25, + "loss": 0.005654824897646904, + "loss_ce": 0.001626504585146904, + "loss_xval": 0.0040283203125, + "num_input_tokens_seen": 55412032, + "step": 804 + }, + { + "epoch": 50.3125, + "grad_norm": 0.7011671163304171, + "learning_rate": 5e-05, + "loss": 0.0029, + "num_input_tokens_seen": 55483712, + "step": 805 + }, + { + "epoch": 50.3125, + "loss": 0.0031639118678867817, + "loss_ce": 0.0016151447780430317, + "loss_xval": 0.00154876708984375, + "num_input_tokens_seen": 55483712, + "step": 805 + }, + { + "epoch": 50.375, + "grad_norm": 11.37878069005779, + "learning_rate": 5e-05, + "loss": 0.0061, + "num_input_tokens_seen": 55555328, + "step": 806 + }, + { + "epoch": 50.375, + "loss": 0.005364611279219389, + "loss_ce": 0.0017330195987597108, + "loss_xval": 0.003631591796875, + "num_input_tokens_seen": 55555328, + "step": 806 + }, + { + "epoch": 50.4375, + "grad_norm": 22.824887834021848, + "learning_rate": 5e-05, + "loss": 0.0151, + 
"num_input_tokens_seen": 55626944, + "step": 807 + }, + { + "epoch": 50.4375, + "loss": 0.01386871188879013, + "loss_ce": 0.0016006450168788433, + "loss_xval": 0.01226806640625, + "num_input_tokens_seen": 55626944, + "step": 807 + }, + { + "epoch": 50.5, + "grad_norm": 32.869491648733195, + "learning_rate": 5e-05, + "loss": 0.0267, + "num_input_tokens_seen": 55698496, + "step": 808 + }, + { + "epoch": 50.5, + "loss": 0.024559233337640762, + "loss_ce": 0.001487944507971406, + "loss_xval": 0.0230712890625, + "num_input_tokens_seen": 55698496, + "step": 808 + }, + { + "epoch": 50.5625, + "grad_norm": 42.62504396274984, + "learning_rate": 5e-05, + "loss": 0.0419, + "num_input_tokens_seen": 55770112, + "step": 809 + }, + { + "epoch": 50.5625, + "loss": 0.03527969866991043, + "loss_ce": 0.0013441527262330055, + "loss_xval": 0.033935546875, + "num_input_tokens_seen": 55770112, + "step": 809 + }, + { + "epoch": 50.625, + "grad_norm": 55.812846213983434, + "learning_rate": 5e-05, + "loss": 0.0697, + "num_input_tokens_seen": 55841792, + "step": 810 + }, + { + "epoch": 50.625, + "loss": 0.07067803293466568, + "loss_ce": 0.0013420972973108292, + "loss_xval": 0.0693359375, + "num_input_tokens_seen": 55841792, + "step": 810 + }, + { + "epoch": 50.6875, + "grad_norm": 73.57934830845704, + "learning_rate": 5e-05, + "loss": 0.1194, + "num_input_tokens_seen": 55913344, + "step": 811 + }, + { + "epoch": 50.6875, + "loss": 0.12286809086799622, + "loss_ce": 0.0012860629940405488, + "loss_xval": 0.12158203125, + "num_input_tokens_seen": 55913344, + "step": 811 + }, + { + "epoch": 50.75, + "grad_norm": 92.5057681700446, + "learning_rate": 5e-05, + "loss": 0.1853, + "num_input_tokens_seen": 55985024, + "step": 812 + }, + { + "epoch": 50.75, + "loss": 0.17899751663208008, + "loss_ce": 0.0012631455902010202, + "loss_xval": 0.177734375, + "num_input_tokens_seen": 55985024, + "step": 812 + }, + { + "epoch": 50.8125, + "grad_norm": 110.50358208916595, + "learning_rate": 5e-05, + "loss": 0.2668, + "num_input_tokens_seen": 56044032, + "step": 813 + }, + { + "epoch": 50.8125, + "loss": 0.27068424224853516, + "loss_ce": 0.0011529907351359725, + "loss_xval": 0.26953125, + "num_input_tokens_seen": 56044032, + "step": 813 + }, + { + "epoch": 50.875, + "grad_norm": 120.20554805484593, + "learning_rate": 5e-05, + "loss": 0.3158, + "num_input_tokens_seen": 56115776, + "step": 814 + }, + { + "epoch": 50.875, + "loss": 0.31771913170814514, + "loss_ce": 0.0013128790305927396, + "loss_xval": 0.31640625, + "num_input_tokens_seen": 56115776, + "step": 814 + }, + { + "epoch": 50.9375, + "grad_norm": 114.74657469509621, + "learning_rate": 5e-05, + "loss": 0.2944, + "num_input_tokens_seen": 56187456, + "step": 815 + }, + { + "epoch": 50.9375, + "loss": 0.2767355144023895, + "loss_ce": 0.0013448747340589762, + "loss_xval": 0.275390625, + "num_input_tokens_seen": 56187456, + "step": 815 + }, + { + "epoch": 51.0, + "grad_norm": 90.33489565175944, + "learning_rate": 5e-05, + "loss": 0.1812, + "num_input_tokens_seen": 56259072, + "step": 816 + }, + { + "epoch": 51.0, + "loss": 0.17537222802639008, + "loss_ce": 0.001544097438454628, + "loss_xval": 0.173828125, + "num_input_tokens_seen": 56259072, + "step": 816 + }, + { + "epoch": 51.0625, + "grad_norm": 44.040262786271, + "learning_rate": 5e-05, + "loss": 0.047, + "num_input_tokens_seen": 56330688, + "step": 817 + }, + { + "epoch": 51.0625, + "loss": 0.04732108861207962, + "loss_ce": 0.0019109330605715513, + "loss_xval": 0.04541015625, + "num_input_tokens_seen": 56330688, + "step": 817 + }, + 
{ + "epoch": 51.125, + "grad_norm": 8.197144710350221, + "learning_rate": 5e-05, + "loss": 0.0061, + "num_input_tokens_seen": 56402304, + "step": 818 + }, + { + "epoch": 51.125, + "loss": 0.00620028143748641, + "loss_ce": 0.00195833807811141, + "loss_xval": 0.004241943359375, + "num_input_tokens_seen": 56402304, + "step": 818 + }, + { + "epoch": 51.1875, + "grad_norm": 51.23137999157995, + "learning_rate": 5e-05, + "loss": 0.068, + "num_input_tokens_seen": 56473984, + "step": 819 + }, + { + "epoch": 51.1875, + "loss": 0.06873885542154312, + "loss_ce": 0.0018443216104060411, + "loss_xval": 0.06689453125, + "num_input_tokens_seen": 56473984, + "step": 819 + }, + { + "epoch": 51.25, + "grad_norm": 75.80088109055204, + "learning_rate": 5e-05, + "loss": 0.1361, + "num_input_tokens_seen": 56545600, + "step": 820 + }, + { + "epoch": 51.25, + "loss": 0.13631145656108856, + "loss_ce": 0.002522395458072424, + "loss_xval": 0.1337890625, + "num_input_tokens_seen": 56545600, + "step": 820 + }, + { + "epoch": 51.3125, + "grad_norm": 73.5638812770527, + "learning_rate": 5e-05, + "loss": 0.1285, + "num_input_tokens_seen": 56617216, + "step": 821 + }, + { + "epoch": 51.3125, + "loss": 0.13125012814998627, + "loss_ce": 0.002343877451494336, + "loss_xval": 0.12890625, + "num_input_tokens_seen": 56617216, + "step": 821 + }, + { + "epoch": 51.375, + "grad_norm": 50.491024677513764, + "learning_rate": 5e-05, + "loss": 0.0624, + "num_input_tokens_seen": 56688960, + "step": 822 + }, + { + "epoch": 51.375, + "loss": 0.06392044574022293, + "loss_ce": 0.0021528673823922873, + "loss_xval": 0.061767578125, + "num_input_tokens_seen": 56688960, + "step": 822 + }, + { + "epoch": 51.4375, + "grad_norm": 13.627391662624422, + "learning_rate": 5e-05, + "loss": 0.008, + "num_input_tokens_seen": 56760512, + "step": 823 + }, + { + "epoch": 51.4375, + "loss": 0.009278560988605022, + "loss_ce": 0.0019848598167300224, + "loss_xval": 0.007293701171875, + "num_input_tokens_seen": 56760512, + "step": 823 + }, + { + "epoch": 51.5, + "grad_norm": 21.93003165557537, + "learning_rate": 5e-05, + "loss": 0.0162, + "num_input_tokens_seen": 56832256, + "step": 824 + }, + { + "epoch": 51.5, + "loss": 0.018117789179086685, + "loss_ce": 0.0020045076962560415, + "loss_xval": 0.01611328125, + "num_input_tokens_seen": 56832256, + "step": 824 + }, + { + "epoch": 51.5625, + "grad_norm": 37.6954295759489, + "learning_rate": 5e-05, + "loss": 0.0405, + "num_input_tokens_seen": 56904000, + "step": 825 + }, + { + "epoch": 51.5625, + "loss": 0.04147803410887718, + "loss_ce": 0.006565925199538469, + "loss_xval": 0.034912109375, + "num_input_tokens_seen": 56904000, + "step": 825 + }, + { + "epoch": 51.625, + "grad_norm": 37.611391142785784, + "learning_rate": 5e-05, + "loss": 0.0376, + "num_input_tokens_seen": 56975616, + "step": 826 + }, + { + "epoch": 51.625, + "loss": 0.04057503119111061, + "loss_ce": 0.0020008108112961054, + "loss_xval": 0.03857421875, + "num_input_tokens_seen": 56975616, + "step": 826 + }, + { + "epoch": 51.6875, + "grad_norm": 29.196912703371584, + "learning_rate": 5e-05, + "loss": 0.0231, + "num_input_tokens_seen": 57047296, + "step": 827 + }, + { + "epoch": 51.6875, + "loss": 0.024808669462800026, + "loss_ce": 0.001981521723791957, + "loss_xval": 0.0228271484375, + "num_input_tokens_seen": 57047296, + "step": 827 + }, + { + "epoch": 51.75, + "grad_norm": 16.54743485919119, + "learning_rate": 5e-05, + "loss": 0.011, + "num_input_tokens_seen": 57106496, + "step": 828 + }, + { + "epoch": 51.75, + "loss": 0.011654208414256573, + 
"loss_ce": 0.0019496186869218946, + "loss_xval": 0.00970458984375, + "num_input_tokens_seen": 57106496, + "step": 828 + }, + { + "epoch": 51.8125, + "grad_norm": 3.7849532984052625, + "learning_rate": 5e-05, + "loss": 0.0054, + "num_input_tokens_seen": 57178176, + "step": 829 + }, + { + "epoch": 51.8125, + "loss": 0.0048974500969052315, + "loss_ce": 0.0018914688844233751, + "loss_xval": 0.0030059814453125, + "num_input_tokens_seen": 57178176, + "step": 829 + }, + { + "epoch": 51.875, + "grad_norm": 6.105116572368139, + "learning_rate": 5e-05, + "loss": 0.0068, + "num_input_tokens_seen": 57249792, + "step": 830 + }, + { + "epoch": 51.875, + "loss": 0.0064654904417693615, + "loss_ce": 0.0019183713011443615, + "loss_xval": 0.004547119140625, + "num_input_tokens_seen": 57249792, + "step": 830 + }, + { + "epoch": 51.9375, + "grad_norm": 13.59154846215083, + "learning_rate": 5e-05, + "loss": 0.0089, + "num_input_tokens_seen": 57321536, + "step": 831 + }, + { + "epoch": 51.9375, + "loss": 0.008973270654678345, + "loss_ce": 0.002167850499972701, + "loss_xval": 0.006805419921875, + "num_input_tokens_seen": 57321536, + "step": 831 + }, + { + "epoch": 52.0, + "grad_norm": 22.67058972281316, + "learning_rate": 5e-05, + "loss": 0.0185, + "num_input_tokens_seen": 57393216, + "step": 832 + }, + { + "epoch": 52.0, + "loss": 0.022312132641673088, + "loss_ce": 0.002536741318181157, + "loss_xval": 0.019775390625, + "num_input_tokens_seen": 57393216, + "step": 832 + }, + { + "epoch": 52.0625, + "grad_norm": 31.26119262513664, + "learning_rate": 5e-05, + "loss": 0.0276, + "num_input_tokens_seen": 57465024, + "step": 833 + }, + { + "epoch": 52.0625, + "loss": 0.027556775137782097, + "loss_ce": 0.001799939782358706, + "loss_xval": 0.0257568359375, + "num_input_tokens_seen": 57465024, + "step": 833 + }, + { + "epoch": 52.125, + "grad_norm": 39.97021875837148, + "learning_rate": 5e-05, + "loss": 0.0408, + "num_input_tokens_seen": 57536832, + "step": 834 + }, + { + "epoch": 52.125, + "loss": 0.03928814083337784, + "loss_ce": 0.0016904838848859072, + "loss_xval": 0.03759765625, + "num_input_tokens_seen": 57536832, + "step": 834 + }, + { + "epoch": 52.1875, + "grad_norm": 54.09472417184387, + "learning_rate": 5e-05, + "loss": 0.072, + "num_input_tokens_seen": 57608576, + "step": 835 + }, + { + "epoch": 52.1875, + "loss": 0.07196825742721558, + "loss_ce": 0.0016557550989091396, + "loss_xval": 0.0703125, + "num_input_tokens_seen": 57608576, + "step": 835 + }, + { + "epoch": 52.25, + "grad_norm": 71.16024527891, + "learning_rate": 5e-05, + "loss": 0.1223, + "num_input_tokens_seen": 57680320, + "step": 836 + }, + { + "epoch": 52.25, + "loss": 0.12591731548309326, + "loss_ce": 0.0014055914944037795, + "loss_xval": 0.12451171875, + "num_input_tokens_seen": 57680320, + "step": 836 + }, + { + "epoch": 52.3125, + "grad_norm": 86.84082121350787, + "learning_rate": 5e-05, + "loss": 0.1791, + "num_input_tokens_seen": 57751872, + "step": 837 + }, + { + "epoch": 52.3125, + "loss": 0.18296390771865845, + "loss_ce": 0.0013232819037511945, + "loss_xval": 0.181640625, + "num_input_tokens_seen": 57751872, + "step": 837 + }, + { + "epoch": 52.375, + "grad_norm": 96.18561268671853, + "learning_rate": 5e-05, + "loss": 0.2226, + "num_input_tokens_seen": 57823488, + "step": 838 + }, + { + "epoch": 52.375, + "loss": 0.22117449343204498, + "loss_ce": 0.001447926159016788, + "loss_xval": 0.2197265625, + "num_input_tokens_seen": 57823488, + "step": 838 + }, + { + "epoch": 52.4375, + "grad_norm": 91.7109355876937, + "learning_rate": 5e-05, + 
"loss": 0.2035, + "num_input_tokens_seen": 57895104, + "step": 839 + }, + { + "epoch": 52.4375, + "loss": 0.20843414962291718, + "loss_ce": 0.001402897178195417, + "loss_xval": 0.20703125, + "num_input_tokens_seen": 57895104, + "step": 839 + }, + { + "epoch": 52.5, + "grad_norm": 62.32489949280706, + "learning_rate": 5e-05, + "loss": 0.0974, + "num_input_tokens_seen": 57966784, + "step": 840 + }, + { + "epoch": 52.5, + "loss": 0.10197938978672028, + "loss_ce": 0.0013934536837041378, + "loss_xval": 0.1005859375, + "num_input_tokens_seen": 57966784, + "step": 840 + }, + { + "epoch": 52.5625, + "grad_norm": 16.85693768146133, + "learning_rate": 5e-05, + "loss": 0.0162, + "num_input_tokens_seen": 58038528, + "step": 841 + }, + { + "epoch": 52.5625, + "loss": 0.015338940545916557, + "loss_ce": 0.001422925153747201, + "loss_xval": 0.013916015625, + "num_input_tokens_seen": 58038528, + "step": 841 + }, + { + "epoch": 52.625, + "grad_norm": 26.557199399991575, + "learning_rate": 5e-05, + "loss": 0.0313, + "num_input_tokens_seen": 58110144, + "step": 842 + }, + { + "epoch": 52.625, + "loss": 0.032941192388534546, + "loss_ce": 0.001691192970611155, + "loss_xval": 0.03125, + "num_input_tokens_seen": 58110144, + "step": 842 + }, + { + "epoch": 52.6875, + "grad_norm": 61.834426831310644, + "learning_rate": 5e-05, + "loss": 0.1024, + "num_input_tokens_seen": 58181888, + "step": 843 + }, + { + "epoch": 52.6875, + "loss": 0.11410608887672424, + "loss_ce": 0.0013131167506799102, + "loss_xval": 0.11279296875, + "num_input_tokens_seen": 58181888, + "step": 843 + }, + { + "epoch": 52.75, + "grad_norm": 85.75039908293095, + "learning_rate": 5e-05, + "loss": 0.1846, + "num_input_tokens_seen": 58253568, + "step": 844 + }, + { + "epoch": 52.75, + "loss": 0.1811586171388626, + "loss_ce": 0.0014711236581206322, + "loss_xval": 0.1796875, + "num_input_tokens_seen": 58253568, + "step": 844 + }, + { + "epoch": 52.8125, + "grad_norm": 93.53596735754489, + "learning_rate": 5e-05, + "loss": 0.2221, + "num_input_tokens_seen": 58325184, + "step": 845 + }, + { + "epoch": 52.8125, + "loss": 0.22300904989242554, + "loss_ce": 0.001329358434304595, + "loss_xval": 0.2216796875, + "num_input_tokens_seen": 58325184, + "step": 845 + }, + { + "epoch": 52.875, + "grad_norm": 77.09685538155797, + "learning_rate": 5e-05, + "loss": 0.1562, + "num_input_tokens_seen": 58396928, + "step": 846 + }, + { + "epoch": 52.875, + "loss": 0.15464122593402863, + "loss_ce": 0.0013209060998633504, + "loss_xval": 0.1533203125, + "num_input_tokens_seen": 58396928, + "step": 846 + }, + { + "epoch": 52.9375, + "grad_norm": 40.01764226139686, + "learning_rate": 5e-05, + "loss": 0.0554, + "num_input_tokens_seen": 58468544, + "step": 847 + }, + { + "epoch": 52.9375, + "loss": 0.04871372878551483, + "loss_ce": 0.0013504456728696823, + "loss_xval": 0.04736328125, + "num_input_tokens_seen": 58468544, + "step": 847 + }, + { + "epoch": 53.0, + "grad_norm": 3.160508660622948, + "learning_rate": 5e-05, + "loss": 0.0141, + "num_input_tokens_seen": 58540224, + "step": 848 + }, + { + "epoch": 53.0, + "loss": 0.013072260655462742, + "loss_ce": 0.0014145454624667764, + "loss_xval": 0.01165771484375, + "num_input_tokens_seen": 58540224, + "step": 848 + }, + { + "epoch": 53.0625, + "grad_norm": 36.930479844854915, + "learning_rate": 5e-05, + "loss": 0.0432, + "num_input_tokens_seen": 58611840, + "step": 849 + }, + { + "epoch": 53.0625, + "loss": 0.04419334605336189, + "loss_ce": 0.0014687355142086744, + "loss_xval": 0.042724609375, + "num_input_tokens_seen": 58611840, + 
"step": 849 + }, + { + "epoch": 53.125, + "grad_norm": 49.85794519163775, + "learning_rate": 5e-05, + "loss": 0.0681, + "num_input_tokens_seen": 58670912, + "step": 850 + }, + { + "epoch": 53.125, + "loss": 0.06584130972623825, + "loss_ce": 0.001388181815855205, + "loss_xval": 0.064453125, + "num_input_tokens_seen": 58670912, + "step": 850 + }, + { + "epoch": 53.1875, + "grad_norm": 41.56149633060318, + "learning_rate": 5e-05, + "loss": 0.0478, + "num_input_tokens_seen": 58742592, + "step": 851 + }, + { + "epoch": 53.1875, + "loss": 0.04808543249964714, + "loss_ce": 0.0019428534433245659, + "loss_xval": 0.046142578125, + "num_input_tokens_seen": 58742592, + "step": 851 + }, + { + "epoch": 53.25, + "grad_norm": 20.379106913212443, + "learning_rate": 5e-05, + "loss": 0.015, + "num_input_tokens_seen": 58814208, + "step": 852 + }, + { + "epoch": 53.25, + "loss": 0.013091476634144783, + "loss_ce": 0.0019220428075641394, + "loss_xval": 0.01116943359375, + "num_input_tokens_seen": 58814208, + "step": 852 + }, + { + "epoch": 53.3125, + "grad_norm": 4.5415851008096935, + "learning_rate": 5e-05, + "loss": 0.0061, + "num_input_tokens_seen": 58885888, + "step": 853 + }, + { + "epoch": 53.3125, + "loss": 0.0059251245111227036, + "loss_ce": 0.0018968043150380254, + "loss_xval": 0.0040283203125, + "num_input_tokens_seen": 58885888, + "step": 853 + }, + { + "epoch": 53.375, + "grad_norm": 22.50266574558752, + "learning_rate": 5e-05, + "loss": 0.0207, + "num_input_tokens_seen": 58957504, + "step": 854 + }, + { + "epoch": 53.375, + "loss": 0.022456642240285873, + "loss_ce": 0.0015826192684471607, + "loss_xval": 0.0208740234375, + "num_input_tokens_seen": 58957504, + "step": 854 + }, + { + "epoch": 53.4375, + "grad_norm": 29.03587347070425, + "learning_rate": 5e-05, + "loss": 0.0281, + "num_input_tokens_seen": 59029248, + "step": 855 + }, + { + "epoch": 53.4375, + "loss": 0.029709484428167343, + "loss_ce": 0.0018774535274133086, + "loss_xval": 0.02783203125, + "num_input_tokens_seen": 59029248, + "step": 855 + }, + { + "epoch": 53.5, + "grad_norm": 28.401153106004635, + "learning_rate": 5e-05, + "loss": 0.0272, + "num_input_tokens_seen": 59088384, + "step": 856 + }, + { + "epoch": 53.5, + "loss": 0.026484563946723938, + "loss_ce": 0.0014601502334699035, + "loss_xval": 0.0250244140625, + "num_input_tokens_seen": 59088384, + "step": 856 + }, + { + "epoch": 53.5625, + "grad_norm": 27.042709640622366, + "learning_rate": 5e-05, + "loss": 0.0238, + "num_input_tokens_seen": 59160128, + "step": 857 + }, + { + "epoch": 53.5625, + "loss": 0.021871447563171387, + "loss_ce": 0.0016077766194939613, + "loss_xval": 0.020263671875, + "num_input_tokens_seen": 59160128, + "step": 857 + }, + { + "epoch": 53.625, + "grad_norm": 22.177055726651922, + "learning_rate": 5e-05, + "loss": 0.0167, + "num_input_tokens_seen": 59231744, + "step": 858 + }, + { + "epoch": 53.625, + "loss": 0.016048060730099678, + "loss_ce": 0.0016437633894383907, + "loss_xval": 0.014404296875, + "num_input_tokens_seen": 59231744, + "step": 858 + }, + { + "epoch": 53.6875, + "grad_norm": 13.365363544559107, + "learning_rate": 5e-05, + "loss": 0.0081, + "num_input_tokens_seen": 59303360, + "step": 859 + }, + { + "epoch": 53.6875, + "loss": 0.00661351066082716, + "loss_ce": 0.0013950045686215162, + "loss_xval": 0.005218505859375, + "num_input_tokens_seen": 59303360, + "step": 859 + }, + { + "epoch": 53.75, + "grad_norm": 2.290110524547106, + "learning_rate": 5e-05, + "loss": 0.0028, + "num_input_tokens_seen": 59375040, + "step": 860 + }, + { + "epoch": 53.75, 
+ "loss": 0.003171001560986042, + "loss_ce": 0.0015306818531826138, + "loss_xval": 0.00164031982421875, + "num_input_tokens_seen": 59375040, + "step": 860 + }, + { + "epoch": 53.8125, + "grad_norm": 11.050180430092226, + "learning_rate": 5e-05, + "loss": 0.0077, + "num_input_tokens_seen": 59434176, + "step": 861 + }, + { + "epoch": 53.8125, + "loss": 0.00867583230137825, + "loss_ce": 0.0013821311295032501, + "loss_xval": 0.007293701171875, + "num_input_tokens_seen": 59434176, + "step": 861 + }, + { + "epoch": 53.875, + "grad_norm": 23.083052407895174, + "learning_rate": 5e-05, + "loss": 0.0189, + "num_input_tokens_seen": 59505856, + "step": 862 + }, + { + "epoch": 53.875, + "loss": 0.01968074031174183, + "loss_ce": 0.0013701926218345761, + "loss_xval": 0.018310546875, + "num_input_tokens_seen": 59505856, + "step": 862 + }, + { + "epoch": 53.9375, + "grad_norm": 31.603491961859262, + "learning_rate": 5e-05, + "loss": 0.031, + "num_input_tokens_seen": 59577536, + "step": 863 + }, + { + "epoch": 53.9375, + "loss": 0.02824649214744568, + "loss_ce": 0.0013910232810303569, + "loss_xval": 0.02685546875, + "num_input_tokens_seen": 59577536, + "step": 863 + }, + { + "epoch": 54.0, + "grad_norm": 37.373668693249535, + "learning_rate": 5e-05, + "loss": 0.0404, + "num_input_tokens_seen": 59649216, + "step": 864 + }, + { + "epoch": 54.0, + "loss": 0.04348291456699371, + "loss_ce": 0.0012465871404856443, + "loss_xval": 0.042236328125, + "num_input_tokens_seen": 59649216, + "step": 864 + }, + { + "epoch": 54.0625, + "grad_norm": 42.12211569032457, + "learning_rate": 5e-05, + "loss": 0.0502, + "num_input_tokens_seen": 59720832, + "step": 865 + }, + { + "epoch": 54.0625, + "loss": 0.05193883180618286, + "loss_ce": 0.0011575802927836776, + "loss_xval": 0.05078125, + "num_input_tokens_seen": 59720832, + "step": 865 + }, + { + "epoch": 54.125, + "grad_norm": 47.381010910843735, + "learning_rate": 5e-05, + "loss": 0.0632, + "num_input_tokens_seen": 59792512, + "step": 866 + }, + { + "epoch": 54.125, + "loss": 0.06457292288541794, + "loss_ce": 0.0010963573586195707, + "loss_xval": 0.0634765625, + "num_input_tokens_seen": 59792512, + "step": 866 + }, + { + "epoch": 54.1875, + "grad_norm": 54.87808466607328, + "learning_rate": 5e-05, + "loss": 0.0844, + "num_input_tokens_seen": 59864128, + "step": 867 + }, + { + "epoch": 54.1875, + "loss": 0.08025375008583069, + "loss_ce": 0.0011521849082782865, + "loss_xval": 0.0791015625, + "num_input_tokens_seen": 59864128, + "step": 867 + }, + { + "epoch": 54.25, + "grad_norm": 61.93344752696612, + "learning_rate": 5e-05, + "loss": 0.1067, + "num_input_tokens_seen": 59923200, + "step": 868 + }, + { + "epoch": 54.25, + "loss": 0.10697054117918015, + "loss_ce": 0.0010135115589946508, + "loss_xval": 0.10595703125, + "num_input_tokens_seen": 59923200, + "step": 868 + }, + { + "epoch": 54.3125, + "grad_norm": 63.59808744718001, + "learning_rate": 5e-05, + "loss": 0.1134, + "num_input_tokens_seen": 59982336, + "step": 869 + }, + { + "epoch": 54.3125, + "loss": 0.11188820004463196, + "loss_ce": 0.001048356993123889, + "loss_xval": 0.11083984375, + "num_input_tokens_seen": 59982336, + "step": 869 + }, + { + "epoch": 54.375, + "grad_norm": 56.94998811770308, + "learning_rate": 5e-05, + "loss": 0.0915, + "num_input_tokens_seen": 60041472, + "step": 870 + }, + { + "epoch": 54.375, + "loss": 0.0917876660823822, + "loss_ce": 0.0009673540480434895, + "loss_xval": 0.0908203125, + "num_input_tokens_seen": 60041472, + "step": 870 + }, + { + "epoch": 54.4375, + "grad_norm": 41.91686845647309, 
+ "learning_rate": 5e-05, + "loss": 0.0506, + "num_input_tokens_seen": 60113216, + "step": 871 + }, + { + "epoch": 54.4375, + "loss": 0.045974716544151306, + "loss_ce": 0.0010528410784900188, + "loss_xval": 0.044921875, + "num_input_tokens_seen": 60113216, + "step": 871 + }, + { + "epoch": 54.5, + "grad_norm": 23.635588748282537, + "learning_rate": 5e-05, + "loss": 0.0175, + "num_input_tokens_seen": 60184832, + "step": 872 + }, + { + "epoch": 54.5, + "loss": 0.016869032755494118, + "loss_ce": 0.0009998929454013705, + "loss_xval": 0.015869140625, + "num_input_tokens_seen": 60184832, + "step": 872 + }, + { + "epoch": 54.5625, + "grad_norm": 5.082441886348326, + "learning_rate": 5e-05, + "loss": 0.0029, + "num_input_tokens_seen": 60256576, + "step": 873 + }, + { + "epoch": 54.5625, + "loss": 0.003460080362856388, + "loss_ce": 0.0010491915745660663, + "loss_xval": 0.002410888671875, + "num_input_tokens_seen": 60256576, + "step": 873 + }, + { + "epoch": 54.625, + "grad_norm": 13.605292859198338, + "learning_rate": 5e-05, + "loss": 0.0076, + "num_input_tokens_seen": 60328128, + "step": 874 + }, + { + "epoch": 54.625, + "loss": 0.00831228494644165, + "loss_ce": 0.0009270310983993113, + "loss_xval": 0.00738525390625, + "num_input_tokens_seen": 60328128, + "step": 874 + }, + { + "epoch": 54.6875, + "grad_norm": 30.014976579341187, + "learning_rate": 5e-05, + "loss": 0.0269, + "num_input_tokens_seen": 60387264, + "step": 875 + }, + { + "epoch": 54.6875, + "loss": 0.025903020054101944, + "loss_ce": 0.001000675605610013, + "loss_xval": 0.02490234375, + "num_input_tokens_seen": 60387264, + "step": 875 + }, + { + "epoch": 54.75, + "grad_norm": 41.671989374196215, + "learning_rate": 5e-05, + "loss": 0.0508, + "num_input_tokens_seen": 60458880, + "step": 876 + }, + { + "epoch": 54.75, + "loss": 0.04798507317900658, + "loss_ce": 0.001110074925236404, + "loss_xval": 0.046875, + "num_input_tokens_seen": 60458880, + "step": 876 + }, + { + "epoch": 54.8125, + "grad_norm": 48.6149903146668, + "learning_rate": 5e-05, + "loss": 0.0684, + "num_input_tokens_seen": 60530496, + "step": 877 + }, + { + "epoch": 54.8125, + "loss": 0.07246832549571991, + "loss_ce": 0.0011792667210102081, + "loss_xval": 0.0712890625, + "num_input_tokens_seen": 60530496, + "step": 877 + }, + { + "epoch": 54.875, + "grad_norm": 52.85622195242736, + "learning_rate": 5e-05, + "loss": 0.0813, + "num_input_tokens_seen": 60602112, + "step": 878 + }, + { + "epoch": 54.875, + "loss": 0.07911968231201172, + "loss_ce": 0.0009946819627657533, + "loss_xval": 0.078125, + "num_input_tokens_seen": 60602112, + "step": 878 + }, + { + "epoch": 54.9375, + "grad_norm": 55.263215932973885, + "learning_rate": 5e-05, + "loss": 0.0894, + "num_input_tokens_seen": 60673728, + "step": 879 + }, + { + "epoch": 54.9375, + "loss": 0.08692839741706848, + "loss_ce": 0.0009908954380080104, + "loss_xval": 0.0859375, + "num_input_tokens_seen": 60673728, + "step": 879 + }, + { + "epoch": 55.0, + "grad_norm": 54.75293670657225, + "learning_rate": 5e-05, + "loss": 0.0876, + "num_input_tokens_seen": 60745280, + "step": 880 + }, + { + "epoch": 55.0, + "loss": 0.085189089179039, + "loss_ce": 0.0012047119671478868, + "loss_xval": 0.083984375, + "num_input_tokens_seen": 60745280, + "step": 880 + }, + { + "epoch": 55.0625, + "grad_norm": 49.616545082817495, + "learning_rate": 5e-05, + "loss": 0.0727, + "num_input_tokens_seen": 60816896, + "step": 881 + }, + { + "epoch": 55.0625, + "loss": 0.07568308711051941, + "loss_ce": 0.0009760558605194092, + "loss_xval": 0.07470703125, + 
"num_input_tokens_seen": 60816896, + "step": 881 + }, + { + "epoch": 55.125, + "grad_norm": 41.843744921751586, + "learning_rate": 5e-05, + "loss": 0.0519, + "num_input_tokens_seen": 60876096, + "step": 882 + }, + { + "epoch": 55.125, + "loss": 0.04788345843553543, + "loss_ce": 0.0010084599489346147, + "loss_xval": 0.046875, + "num_input_tokens_seen": 60876096, + "step": 882 + }, + { + "epoch": 55.1875, + "grad_norm": 31.37590948848245, + "learning_rate": 5e-05, + "loss": 0.0304, + "num_input_tokens_seen": 60935104, + "step": 883 + }, + { + "epoch": 55.1875, + "loss": 0.033360954374074936, + "loss_ce": 0.0008902526460587978, + "loss_xval": 0.032470703125, + "num_input_tokens_seen": 60935104, + "step": 883 + }, + { + "epoch": 55.25, + "grad_norm": 15.947873383411899, + "learning_rate": 5e-05, + "loss": 0.0102, + "num_input_tokens_seen": 61006656, + "step": 884 + }, + { + "epoch": 55.25, + "loss": 0.009581946767866611, + "loss_ce": 0.000914954871404916, + "loss_xval": 0.0086669921875, + "num_input_tokens_seen": 61006656, + "step": 884 + }, + { + "epoch": 55.3125, + "grad_norm": 0.661683516786272, + "learning_rate": 5e-05, + "loss": 0.0037, + "num_input_tokens_seen": 61078272, + "step": 885 + }, + { + "epoch": 55.3125, + "loss": 0.0037136925384402275, + "loss_ce": 0.0009976280853152275, + "loss_xval": 0.002716064453125, + "num_input_tokens_seen": 61078272, + "step": 885 + }, + { + "epoch": 55.375, + "grad_norm": 11.31514067465619, + "learning_rate": 5e-05, + "loss": 0.0066, + "num_input_tokens_seen": 61137408, + "step": 886 + }, + { + "epoch": 55.375, + "loss": 0.006238732486963272, + "loss_ce": 0.0008371211588382721, + "loss_xval": 0.005401611328125, + "num_input_tokens_seen": 61137408, + "step": 886 + }, + { + "epoch": 55.4375, + "grad_norm": 18.242504586556592, + "learning_rate": 5e-05, + "loss": 0.0123, + "num_input_tokens_seen": 61209024, + "step": 887 + }, + { + "epoch": 55.4375, + "loss": 0.013944677077233791, + "loss_ce": 0.000822118716314435, + "loss_xval": 0.01312255859375, + "num_input_tokens_seen": 61209024, + "step": 887 + }, + { + "epoch": 55.5, + "grad_norm": 24.271649102582902, + "learning_rate": 5e-05, + "loss": 0.019, + "num_input_tokens_seen": 61280704, + "step": 888 + }, + { + "epoch": 55.5, + "loss": 0.01951012946665287, + "loss_ce": 0.0008333711884915829, + "loss_xval": 0.0186767578125, + "num_input_tokens_seen": 61280704, + "step": 888 + }, + { + "epoch": 55.5625, + "grad_norm": 32.47503220114088, + "learning_rate": 5e-05, + "loss": 0.0325, + "num_input_tokens_seen": 61339776, + "step": 889 + }, + { + "epoch": 55.5625, + "loss": 0.03378288447856903, + "loss_ce": 0.0008238990558311343, + "loss_xval": 0.032958984375, + "num_input_tokens_seen": 61339776, + "step": 889 + }, + { + "epoch": 55.625, + "grad_norm": 44.32927293806377, + "learning_rate": 5e-05, + "loss": 0.058, + "num_input_tokens_seen": 61399040, + "step": 890 + }, + { + "epoch": 55.625, + "loss": 0.053077153861522675, + "loss_ce": 0.0008310599368996918, + "loss_xval": 0.05224609375, + "num_input_tokens_seen": 61399040, + "step": 890 + }, + { + "epoch": 55.6875, + "grad_norm": 61.68685014028904, + "learning_rate": 5e-05, + "loss": 0.1114, + "num_input_tokens_seen": 61470656, + "step": 891 + }, + { + "epoch": 55.6875, + "loss": 0.11456962674856186, + "loss_ce": 0.0008000986417755485, + "loss_xval": 0.11376953125, + "num_input_tokens_seen": 61470656, + "step": 891 + }, + { + "epoch": 55.75, + "grad_norm": 83.67046241533913, + "learning_rate": 5e-05, + "loss": 0.204, + "num_input_tokens_seen": 61542208, + "step": 
892 + }, + { + "epoch": 55.75, + "loss": 0.20592321455478668, + "loss_ce": 0.0008450897876173258, + "loss_xval": 0.205078125, + "num_input_tokens_seen": 61542208, + "step": 892 + }, + { + "epoch": 55.8125, + "grad_norm": 105.657566184581, + "learning_rate": 5e-05, + "loss": 0.326, + "num_input_tokens_seen": 61601344, + "step": 893 + }, + { + "epoch": 55.8125, + "loss": 0.3270096182823181, + "loss_ce": 0.0008377549238502979, + "loss_xval": 0.326171875, + "num_input_tokens_seen": 61601344, + "step": 893 + }, + { + "epoch": 55.875, + "grad_norm": 116.78406432128128, + "learning_rate": 5e-05, + "loss": 0.4071, + "num_input_tokens_seen": 61672896, + "step": 894 + }, + { + "epoch": 55.875, + "loss": 0.4032019376754761, + "loss_ce": 0.0008581792353652418, + "loss_xval": 0.40234375, + "num_input_tokens_seen": 61672896, + "step": 894 + }, + { + "epoch": 55.9375, + "grad_norm": 101.11755702111572, + "learning_rate": 5e-05, + "loss": 0.3116, + "num_input_tokens_seen": 61744512, + "step": 895 + }, + { + "epoch": 55.9375, + "loss": 0.309630423784256, + "loss_ce": 0.001036668079905212, + "loss_xval": 0.30859375, + "num_input_tokens_seen": 61744512, + "step": 895 + }, + { + "epoch": 56.0, + "grad_norm": 52.970602494824114, + "learning_rate": 5e-05, + "loss": 0.089, + "num_input_tokens_seen": 61803648, + "step": 896 + }, + { + "epoch": 56.0, + "loss": 0.09438222646713257, + "loss_ce": 0.001120504573918879, + "loss_xval": 0.09326171875, + "num_input_tokens_seen": 61803648, + "step": 896 + }, + { + "epoch": 56.0625, + "grad_norm": 11.375753228798327, + "learning_rate": 5e-05, + "loss": 0.0088, + "num_input_tokens_seen": 61875328, + "step": 897 + }, + { + "epoch": 56.0625, + "loss": 0.007805442437529564, + "loss_ce": 0.0012441633734852076, + "loss_xval": 0.006561279296875, + "num_input_tokens_seen": 61875328, + "step": 897 + }, + { + "epoch": 56.125, + "grad_norm": 62.90691314652422, + "learning_rate": 5e-05, + "loss": 0.1301, + "num_input_tokens_seen": 61946880, + "step": 898 + }, + { + "epoch": 56.125, + "loss": 0.12718553841114044, + "loss_ce": 0.001208969741128385, + "loss_xval": 0.1259765625, + "num_input_tokens_seen": 61946880, + "step": 898 + }, + { + "epoch": 56.1875, + "grad_norm": 84.84208103644006, + "learning_rate": 5e-05, + "loss": 0.2331, + "num_input_tokens_seen": 62006016, + "step": 899 + }, + { + "epoch": 56.1875, + "loss": 0.22287721931934357, + "loss_ce": 0.0011975381057709455, + "loss_xval": 0.2216796875, + "num_input_tokens_seen": 62006016, + "step": 899 + }, + { + "epoch": 56.25, + "grad_norm": 67.01970442288534, + "learning_rate": 5e-05, + "loss": 0.1478, + "num_input_tokens_seen": 62065216, + "step": 900 + }, + { + "epoch": 56.25, + "loss": 0.14876626431941986, + "loss_ce": 0.001305332756601274, + "loss_xval": 0.1474609375, + "num_input_tokens_seen": 62065216, + "step": 900 + }, + { + "epoch": 56.3125, + "grad_norm": 20.31969946362518, + "learning_rate": 5e-05, + "loss": 0.0165, + "num_input_tokens_seen": 62136768, + "step": 901 + }, + { + "epoch": 56.3125, + "loss": 0.018054738640785217, + "loss_ce": 0.0013311064103618264, + "loss_xval": 0.0167236328125, + "num_input_tokens_seen": 62136768, + "step": 901 + }, + { + "epoch": 56.375, + "grad_norm": 26.062323010289205, + "learning_rate": 5e-05, + "loss": 0.0262, + "num_input_tokens_seen": 62208320, + "step": 902 + }, + { + "epoch": 56.375, + "loss": 0.025609955191612244, + "loss_ce": 0.001195893157273531, + "loss_xval": 0.0244140625, + "num_input_tokens_seen": 62208320, + "step": 902 + }, + { + "epoch": 56.4375, + "grad_norm": 
52.54992119672312, + "learning_rate": 5e-05, + "loss": 0.0925, + "num_input_tokens_seen": 62279872, + "step": 903 + }, + { + "epoch": 56.4375, + "loss": 0.09242052584886551, + "loss_ce": 0.001111934194341302, + "loss_xval": 0.09130859375, + "num_input_tokens_seen": 62279872, + "step": 903 + }, + { + "epoch": 56.5, + "grad_norm": 51.81828437755531, + "learning_rate": 5e-05, + "loss": 0.0896, + "num_input_tokens_seen": 62351488, + "step": 904 + }, + { + "epoch": 56.5, + "loss": 0.08236326277256012, + "loss_ce": 0.001308571663685143, + "loss_xval": 0.0810546875, + "num_input_tokens_seen": 62351488, + "step": 904 + }, + { + "epoch": 56.5625, + "grad_norm": 27.88396242758177, + "learning_rate": 5e-05, + "loss": 0.0283, + "num_input_tokens_seen": 62423168, + "step": 905 + }, + { + "epoch": 56.5625, + "loss": 0.02919887565076351, + "loss_ce": 0.0012447733897715807, + "loss_xval": 0.0279541015625, + "num_input_tokens_seen": 62423168, + "step": 905 + }, + { + "epoch": 56.625, + "grad_norm": 7.343225292992126, + "learning_rate": 5e-05, + "loss": 0.0042, + "num_input_tokens_seen": 62494912, + "step": 906 + }, + { + "epoch": 56.625, + "loss": 0.004097954370081425, + "loss_ce": 0.0011530080810189247, + "loss_xval": 0.0029449462890625, + "num_input_tokens_seen": 62494912, + "step": 906 + }, + { + "epoch": 56.6875, + "grad_norm": 35.796164062283395, + "learning_rate": 5e-05, + "loss": 0.0438, + "num_input_tokens_seen": 62566464, + "step": 907 + }, + { + "epoch": 56.6875, + "loss": 0.04529685154557228, + "loss_ce": 0.0011073980713263154, + "loss_xval": 0.044189453125, + "num_input_tokens_seen": 62566464, + "step": 907 + }, + { + "epoch": 56.75, + "grad_norm": 42.181166309964844, + "learning_rate": 5e-05, + "loss": 0.0593, + "num_input_tokens_seen": 62638080, + "step": 908 + }, + { + "epoch": 56.75, + "loss": 0.05803143233060837, + "loss_ce": 0.0011466656578704715, + "loss_xval": 0.056884765625, + "num_input_tokens_seen": 62638080, + "step": 908 + }, + { + "epoch": 56.8125, + "grad_norm": 28.04768382436155, + "learning_rate": 5e-05, + "loss": 0.0284, + "num_input_tokens_seen": 62697280, + "step": 909 + }, + { + "epoch": 56.8125, + "loss": 0.028996262699365616, + "loss_ce": 0.0010421618353575468, + "loss_xval": 0.0279541015625, + "num_input_tokens_seen": 62697280, + "step": 909 + }, + { + "epoch": 56.875, + "grad_norm": 4.604248343503638, + "learning_rate": 5e-05, + "loss": 0.0035, + "num_input_tokens_seen": 62768896, + "step": 910 + }, + { + "epoch": 56.875, + "loss": 0.0034466590732336044, + "loss_ce": 0.0011120643466711044, + "loss_xval": 0.0023345947265625, + "num_input_tokens_seen": 62768896, + "step": 910 + }, + { + "epoch": 56.9375, + "grad_norm": 16.717241202153325, + "learning_rate": 5e-05, + "loss": 0.0116, + "num_input_tokens_seen": 62840512, + "step": 911 + }, + { + "epoch": 56.9375, + "loss": 0.013039151206612587, + "loss_ce": 0.0010762609308585525, + "loss_xval": 0.011962890625, + "num_input_tokens_seen": 62840512, + "step": 911 + }, + { + "epoch": 57.0, + "grad_norm": 26.396870079955534, + "learning_rate": 5e-05, + "loss": 0.0255, + "num_input_tokens_seen": 62912128, + "step": 912 + }, + { + "epoch": 57.0, + "loss": 0.026932695880532265, + "loss_ce": 0.001053789397701621, + "loss_xval": 0.02587890625, + "num_input_tokens_seen": 62912128, + "step": 912 + }, + { + "epoch": 57.0625, + "grad_norm": 22.369721128961444, + "learning_rate": 5e-05, + "loss": 0.0187, + "num_input_tokens_seen": 62983808, + "step": 913 + }, + { + "epoch": 57.0625, + "loss": 0.017884692177176476, + "loss_ce": 
0.0010389896342530847, + "loss_xval": 0.016845703125, + "num_input_tokens_seen": 62983808, + "step": 913 + }, + { + "epoch": 57.125, + "grad_norm": 8.173019447239868, + "learning_rate": 5e-05, + "loss": 0.0044, + "num_input_tokens_seen": 63055552, + "step": 914 + }, + { + "epoch": 57.125, + "loss": 0.004705994389951229, + "loss_ce": 0.001059143920429051, + "loss_xval": 0.0036468505859375, + "num_input_tokens_seen": 63055552, + "step": 914 + }, + { + "epoch": 57.1875, + "grad_norm": 10.20139356248153, + "learning_rate": 5e-05, + "loss": 0.0059, + "num_input_tokens_seen": 63127296, + "step": 915 + }, + { + "epoch": 57.1875, + "loss": 0.005690255202353001, + "loss_ce": 0.0009905482875183225, + "loss_xval": 0.00469970703125, + "num_input_tokens_seen": 63127296, + "step": 915 + }, + { + "epoch": 57.25, + "grad_norm": 23.837598745509396, + "learning_rate": 5e-05, + "loss": 0.0218, + "num_input_tokens_seen": 63199040, + "step": 916 + }, + { + "epoch": 57.25, + "loss": 0.022357981652021408, + "loss_ce": 0.001117747975513339, + "loss_xval": 0.021240234375, + "num_input_tokens_seen": 63199040, + "step": 916 + }, + { + "epoch": 57.3125, + "grad_norm": 24.947858111336256, + "learning_rate": 5e-05, + "loss": 0.0237, + "num_input_tokens_seen": 63270720, + "step": 917 + }, + { + "epoch": 57.3125, + "loss": 0.023415779694914818, + "loss_ce": 0.0009548426023684442, + "loss_xval": 0.0224609375, + "num_input_tokens_seen": 63270720, + "step": 917 + }, + { + "epoch": 57.375, + "grad_norm": 16.266573389443806, + "learning_rate": 5e-05, + "loss": 0.0115, + "num_input_tokens_seen": 63342400, + "step": 918 + }, + { + "epoch": 57.375, + "loss": 0.010243535041809082, + "loss_ce": 0.0009051557281054556, + "loss_xval": 0.00933837890625, + "num_input_tokens_seen": 63342400, + "step": 918 + }, + { + "epoch": 57.4375, + "grad_norm": 4.543084565084577, + "learning_rate": 5e-05, + "loss": 0.0025, + "num_input_tokens_seen": 63401600, + "step": 919 + }, + { + "epoch": 57.4375, + "loss": 0.002306486712768674, + "loss_ce": 0.0009179369662888348, + "loss_xval": 0.0013885498046875, + "num_input_tokens_seen": 63401600, + "step": 919 + }, + { + "epoch": 57.5, + "grad_norm": 6.932013648934979, + "learning_rate": 5e-05, + "loss": 0.0034, + "num_input_tokens_seen": 63473344, + "step": 920 + }, + { + "epoch": 57.5, + "loss": 0.0029933510813862085, + "loss_ce": 0.0008266030927188694, + "loss_xval": 0.002166748046875, + "num_input_tokens_seen": 63473344, + "step": 920 + }, + { + "epoch": 57.5625, + "grad_norm": 15.423595588632633, + "learning_rate": 5e-05, + "loss": 0.0101, + "num_input_tokens_seen": 63544960, + "step": 921 + }, + { + "epoch": 57.5625, + "loss": 0.008886837400496006, + "loss_ce": 0.0010133021278306842, + "loss_xval": 0.00787353515625, + "num_input_tokens_seen": 63544960, + "step": 921 + }, + { + "epoch": 57.625, + "grad_norm": 17.914449695553976, + "learning_rate": 5e-05, + "loss": 0.0126, + "num_input_tokens_seen": 63616640, + "step": 922 + }, + { + "epoch": 57.625, + "loss": 0.013764435425400734, + "loss_ce": 0.0008249821839854121, + "loss_xval": 0.012939453125, + "num_input_tokens_seen": 63616640, + "step": 922 + }, + { + "epoch": 57.6875, + "grad_norm": 14.011915071038517, + "learning_rate": 5e-05, + "loss": 0.0087, + "num_input_tokens_seen": 63688192, + "step": 923 + }, + { + "epoch": 57.6875, + "loss": 0.0082526421174407, + "loss_ce": 0.0008063528803177178, + "loss_xval": 0.0074462890625, + "num_input_tokens_seen": 63688192, + "step": 923 + }, + { + "epoch": 57.75, + "grad_norm": 5.3692390458080865, + 
"learning_rate": 5e-05, + "loss": 0.0028, + "num_input_tokens_seen": 63759744, + "step": 924 + }, + { + "epoch": 57.75, + "loss": 0.002704136073589325, + "loss_ce": 0.0007891579298302531, + "loss_xval": 0.00191497802734375, + "num_input_tokens_seen": 63759744, + "step": 924 + }, + { + "epoch": 57.8125, + "grad_norm": 5.053317204803564, + "learning_rate": 5e-05, + "loss": 0.0028, + "num_input_tokens_seen": 63818752, + "step": 925 + }, + { + "epoch": 57.8125, + "loss": 0.002319210208952427, + "loss_ce": 0.0006865198374725878, + "loss_xval": 0.0016326904296875, + "num_input_tokens_seen": 63818752, + "step": 925 + }, + { + "epoch": 57.875, + "grad_norm": 13.882537585237738, + "learning_rate": 5e-05, + "loss": 0.0084, + "num_input_tokens_seen": 63890560, + "step": 926 + }, + { + "epoch": 57.875, + "loss": 0.0076802195981144905, + "loss_ce": 0.0008137643453665078, + "loss_xval": 0.006866455078125, + "num_input_tokens_seen": 63890560, + "step": 926 + }, + { + "epoch": 57.9375, + "grad_norm": 20.089664118763753, + "learning_rate": 5e-05, + "loss": 0.0156, + "num_input_tokens_seen": 63962240, + "step": 927 + }, + { + "epoch": 57.9375, + "loss": 0.014369064942002296, + "loss_ce": 0.0006971897091716528, + "loss_xval": 0.013671875, + "num_input_tokens_seen": 63962240, + "step": 927 + }, + { + "epoch": 58.0, + "grad_norm": 23.570347705929276, + "learning_rate": 5e-05, + "loss": 0.021, + "num_input_tokens_seen": 64033920, + "step": 928 + }, + { + "epoch": 58.0, + "loss": 0.019682860001921654, + "loss_ce": 0.0007619610987603664, + "loss_xval": 0.0189208984375, + "num_input_tokens_seen": 64033920, + "step": 928 + }, + { + "epoch": 58.0625, + "grad_norm": 24.207978511849383, + "learning_rate": 5e-05, + "loss": 0.0219, + "num_input_tokens_seen": 64105664, + "step": 929 + }, + { + "epoch": 58.0625, + "loss": 0.02138231135904789, + "loss_ce": 0.0006303590489551425, + "loss_xval": 0.020751953125, + "num_input_tokens_seen": 64105664, + "step": 929 + }, + { + "epoch": 58.125, + "grad_norm": 23.42852044875404, + "learning_rate": 5e-05, + "loss": 0.0207, + "num_input_tokens_seen": 64177216, + "step": 930 + }, + { + "epoch": 58.125, + "loss": 0.02136334218084812, + "loss_ce": 0.0006113895797170699, + "loss_xval": 0.020751953125, + "num_input_tokens_seen": 64177216, + "step": 930 + }, + { + "epoch": 58.1875, + "grad_norm": 22.312109718917235, + "learning_rate": 5e-05, + "loss": 0.0189, + "num_input_tokens_seen": 64236288, + "step": 931 + }, + { + "epoch": 58.1875, + "loss": 0.019073408097028732, + "loss_ce": 0.0006407916662283242, + "loss_xval": 0.0184326171875, + "num_input_tokens_seen": 64236288, + "step": 931 + }, + { + "epoch": 58.25, + "grad_norm": 21.508149968679223, + "learning_rate": 5e-05, + "loss": 0.0176, + "num_input_tokens_seen": 64307840, + "step": 932 + }, + { + "epoch": 58.25, + "loss": 0.01990123651921749, + "loss_ce": 0.0006141266785562038, + "loss_xval": 0.019287109375, + "num_input_tokens_seen": 64307840, + "step": 932 + }, + { + "epoch": 58.3125, + "grad_norm": 22.156431498551758, + "learning_rate": 5e-05, + "loss": 0.0185, + "num_input_tokens_seen": 64379392, + "step": 933 + }, + { + "epoch": 58.3125, + "loss": 0.018906189128756523, + "loss_ce": 0.0005956426030024886, + "loss_xval": 0.018310546875, + "num_input_tokens_seen": 64379392, + "step": 933 + }, + { + "epoch": 58.375, + "grad_norm": 23.568803424797704, + "learning_rate": 5e-05, + "loss": 0.0206, + "num_input_tokens_seen": 64450944, + "step": 934 + }, + { + "epoch": 58.375, + "loss": 0.020614925771951675, + "loss_ce": 
0.0005953953368589282, + "loss_xval": 0.02001953125, + "num_input_tokens_seen": 64450944, + "step": 934 + }, + { + "epoch": 58.4375, + "grad_norm": 24.20906636976166, + "learning_rate": 5e-05, + "loss": 0.0217, + "num_input_tokens_seen": 64522624, + "step": 935 + }, + { + "epoch": 58.4375, + "loss": 0.02329493872821331, + "loss_ce": 0.000589860079344362, + "loss_xval": 0.022705078125, + "num_input_tokens_seen": 64522624, + "step": 935 + }, + { + "epoch": 58.5, + "grad_norm": 25.28492068445306, + "learning_rate": 5e-05, + "loss": 0.024, + "num_input_tokens_seen": 64594304, + "step": 936 + }, + { + "epoch": 58.5, + "loss": 0.02387586608529091, + "loss_ce": 0.0005604361067526042, + "loss_xval": 0.0233154296875, + "num_input_tokens_seen": 64594304, + "step": 936 + }, + { + "epoch": 58.5625, + "grad_norm": 29.171874285488098, + "learning_rate": 5e-05, + "loss": 0.0312, + "num_input_tokens_seen": 64665984, + "step": 937 + }, + { + "epoch": 58.5625, + "loss": 0.03120674006640911, + "loss_ce": 0.0005670919199474156, + "loss_xval": 0.0306396484375, + "num_input_tokens_seen": 64665984, + "step": 937 + }, + { + "epoch": 58.625, + "grad_norm": 36.220269709923116, + "learning_rate": 5e-05, + "loss": 0.0474, + "num_input_tokens_seen": 64737600, + "step": 938 + }, + { + "epoch": 58.625, + "loss": 0.04941752925515175, + "loss_ce": 0.0005894028581678867, + "loss_xval": 0.048828125, + "num_input_tokens_seen": 64737600, + "step": 938 + }, + { + "epoch": 58.6875, + "grad_norm": 44.53325592874753, + "learning_rate": 5e-05, + "loss": 0.0716, + "num_input_tokens_seen": 64809152, + "step": 939 + }, + { + "epoch": 58.6875, + "loss": 0.0684363916516304, + "loss_ce": 0.0005652988911606371, + "loss_xval": 0.06787109375, + "num_input_tokens_seen": 64809152, + "step": 939 + }, + { + "epoch": 58.75, + "grad_norm": 53.332865645494145, + "learning_rate": 5e-05, + "loss": 0.1024, + "num_input_tokens_seen": 64880704, + "step": 940 + }, + { + "epoch": 58.75, + "loss": 0.10312040895223618, + "loss_ce": 0.0005813451134599745, + "loss_xval": 0.1025390625, + "num_input_tokens_seen": 64880704, + "step": 940 + }, + { + "epoch": 58.8125, + "grad_norm": 61.78611720533413, + "learning_rate": 5e-05, + "loss": 0.1383, + "num_input_tokens_seen": 64952448, + "step": 941 + }, + { + "epoch": 58.8125, + "loss": 0.14115650951862335, + "loss_ce": 0.0005315167945809662, + "loss_xval": 0.140625, + "num_input_tokens_seen": 64952448, + "step": 941 + }, + { + "epoch": 58.875, + "grad_norm": 67.57119829473461, + "learning_rate": 5e-05, + "loss": 0.1657, + "num_input_tokens_seen": 65024000, + "step": 942 + }, + { + "epoch": 58.875, + "loss": 0.16276301443576813, + "loss_ce": 0.0006536326254718006, + "loss_xval": 0.162109375, + "num_input_tokens_seen": 65024000, + "step": 942 + }, + { + "epoch": 58.9375, + "grad_norm": 67.3748450718513, + "learning_rate": 5e-05, + "loss": 0.1666, + "num_input_tokens_seen": 65095552, + "step": 943 + }, + { + "epoch": 58.9375, + "loss": 0.17051354050636292, + "loss_ce": 0.0005916593945585191, + "loss_xval": 0.169921875, + "num_input_tokens_seen": 65095552, + "step": 943 + }, + { + "epoch": 59.0, + "grad_norm": 58.048157118684784, + "learning_rate": 5e-05, + "loss": 0.1269, + "num_input_tokens_seen": 65167104, + "step": 944 + }, + { + "epoch": 59.0, + "loss": 0.1286008656024933, + "loss_ce": 0.0006711802561767399, + "loss_xval": 0.1279296875, + "num_input_tokens_seen": 65167104, + "step": 944 + }, + { + "epoch": 59.0625, + "grad_norm": 38.19997686993994, + "learning_rate": 5e-05, + "loss": 0.0568, + 
"num_input_tokens_seen": 65238720, + "step": 945 + }, + { + "epoch": 59.0625, + "loss": 0.05981079116463661, + "loss_ce": 0.0007287596818059683, + "loss_xval": 0.05908203125, + "num_input_tokens_seen": 65238720, + "step": 945 + }, + { + "epoch": 59.125, + "grad_norm": 9.525180721691116, + "learning_rate": 5e-05, + "loss": 0.0057, + "num_input_tokens_seen": 65310336, + "step": 946 + }, + { + "epoch": 59.125, + "loss": 0.006437521893531084, + "loss_ce": 0.0007612523622810841, + "loss_xval": 0.00567626953125, + "num_input_tokens_seen": 65310336, + "step": 946 + }, + { + "epoch": 59.1875, + "grad_norm": 20.86777318321889, + "learning_rate": 5e-05, + "loss": 0.0183, + "num_input_tokens_seen": 65382016, + "step": 947 + }, + { + "epoch": 59.1875, + "loss": 0.01804313436150551, + "loss_ce": 0.0008312196005135775, + "loss_xval": 0.0172119140625, + "num_input_tokens_seen": 65382016, + "step": 947 + }, + { + "epoch": 59.25, + "grad_norm": 44.605782594914494, + "learning_rate": 5e-05, + "loss": 0.0775, + "num_input_tokens_seen": 65441152, + "step": 948 + }, + { + "epoch": 59.25, + "loss": 0.07944431900978088, + "loss_ce": 0.0008310358389280736, + "loss_xval": 0.07861328125, + "num_input_tokens_seen": 65441152, + "step": 948 + }, + { + "epoch": 59.3125, + "grad_norm": 57.15983641908541, + "learning_rate": 5e-05, + "loss": 0.1251, + "num_input_tokens_seen": 65512832, + "step": 949 + }, + { + "epoch": 59.3125, + "loss": 0.12679226696491241, + "loss_ce": 0.0008157067350111902, + "loss_xval": 0.1259765625, + "num_input_tokens_seen": 65512832, + "step": 949 + }, + { + "epoch": 59.375, + "grad_norm": 52.388511589162, + "learning_rate": 5e-05, + "loss": 0.1072, + "num_input_tokens_seen": 65584384, + "step": 950 + }, + { + "epoch": 59.375, + "loss": 0.10724907368421555, + "loss_ce": 0.0008037587977014482, + "loss_xval": 0.1064453125, + "num_input_tokens_seen": 65584384, + "step": 950 + }, + { + "epoch": 59.4375, + "grad_norm": 29.62629590918632, + "learning_rate": 5e-05, + "loss": 0.036, + "num_input_tokens_seen": 65655936, + "step": 951 + }, + { + "epoch": 59.4375, + "loss": 0.03387866169214249, + "loss_ce": 0.0009196769678965211, + "loss_xval": 0.032958984375, + "num_input_tokens_seen": 65655936, + "step": 951 + }, + { + "epoch": 59.5, + "grad_norm": 3.4575314631603558, + "learning_rate": 5e-05, + "loss": 0.0026, + "num_input_tokens_seen": 65727552, + "step": 952 + }, + { + "epoch": 59.5, + "loss": 0.002521348185837269, + "loss_ce": 0.0008428815053775907, + "loss_xval": 0.001678466796875, + "num_input_tokens_seen": 65727552, + "step": 952 + }, + { + "epoch": 59.5625, + "grad_norm": 34.69990940993319, + "learning_rate": 5e-05, + "loss": 0.0484, + "num_input_tokens_seen": 65799168, + "step": 953 + }, + { + "epoch": 59.5625, + "loss": 0.05138184130191803, + "loss_ce": 0.0008447307045571506, + "loss_xval": 0.050537109375, + "num_input_tokens_seen": 65799168, + "step": 953 + }, + { + "epoch": 59.625, + "grad_norm": 54.85586285901504, + "learning_rate": 5e-05, + "loss": 0.1207, + "num_input_tokens_seen": 65870848, + "step": 954 + }, + { + "epoch": 59.625, + "loss": 0.12335727363824844, + "loss_ce": 0.000798677618149668, + "loss_xval": 0.12255859375, + "num_input_tokens_seen": 65870848, + "step": 954 + }, + { + "epoch": 59.6875, + "grad_norm": 58.16518006510348, + "learning_rate": 5e-05, + "loss": 0.1336, + "num_input_tokens_seen": 65942592, + "step": 955 + }, + { + "epoch": 59.6875, + "loss": 0.1276523470878601, + "loss_ce": 0.0006992240669205785, + "loss_xval": 0.126953125, + "num_input_tokens_seen": 65942592, + 
"step": 955 + }, + { + "epoch": 59.75, + "grad_norm": 41.478621513823285, + "learning_rate": 5e-05, + "loss": 0.0689, + "num_input_tokens_seen": 66001664, + "step": 956 + }, + { + "epoch": 59.75, + "loss": 0.06770110130310059, + "loss_ce": 0.0008065711590461433, + "loss_xval": 0.06689453125, + "num_input_tokens_seen": 66001664, + "step": 956 + }, + { + "epoch": 59.8125, + "grad_norm": 12.798023338883505, + "learning_rate": 5e-05, + "loss": 0.0084, + "num_input_tokens_seen": 66073216, + "step": 957 + }, + { + "epoch": 59.8125, + "loss": 0.010349994525313377, + "loss_ce": 0.0008895453647710383, + "loss_xval": 0.00946044921875, + "num_input_tokens_seen": 66073216, + "step": 957 + }, + { + "epoch": 59.875, + "grad_norm": 14.997701812579756, + "learning_rate": 5e-05, + "loss": 0.0113, + "num_input_tokens_seen": 66145024, + "step": 958 + }, + { + "epoch": 59.875, + "loss": 0.011212315410375595, + "loss_ce": 0.0007753035170026124, + "loss_xval": 0.01043701171875, + "num_input_tokens_seen": 66145024, + "step": 958 + }, + { + "epoch": 59.9375, + "grad_norm": 31.64754420739134, + "learning_rate": 5e-05, + "loss": 0.0409, + "num_input_tokens_seen": 66204032, + "step": 959 + }, + { + "epoch": 59.9375, + "loss": 0.046755120158195496, + "loss_ce": 0.0008566842298023403, + "loss_xval": 0.0458984375, + "num_input_tokens_seen": 66204032, + "step": 959 + }, + { + "epoch": 60.0, + "grad_norm": 33.04461809351946, + "learning_rate": 5e-05, + "loss": 0.0446, + "num_input_tokens_seen": 66275776, + "step": 960 + }, + { + "epoch": 60.0, + "loss": 0.041984446346759796, + "loss_ce": 0.0007246798486448824, + "loss_xval": 0.041259765625, + "num_input_tokens_seen": 66275776, + "step": 960 + }, + { + "epoch": 60.0625, + "grad_norm": 23.900029676188776, + "learning_rate": 5e-05, + "loss": 0.0245, + "num_input_tokens_seen": 66347456, + "step": 961 + }, + { + "epoch": 60.0625, + "loss": 0.026303108781576157, + "loss_ce": 0.0007904136436991394, + "loss_xval": 0.0255126953125, + "num_input_tokens_seen": 66347456, + "step": 961 + }, + { + "epoch": 60.125, + "grad_norm": 10.796280459241752, + "learning_rate": 5e-05, + "loss": 0.0065, + "num_input_tokens_seen": 66419008, + "step": 962 + }, + { + "epoch": 60.125, + "loss": 0.007391719613224268, + "loss_ce": 0.0007388876401819289, + "loss_xval": 0.00665283203125, + "num_input_tokens_seen": 66419008, + "step": 962 + }, + { + "epoch": 60.1875, + "grad_norm": 3.5390655467739354, + "learning_rate": 5e-05, + "loss": 0.0038, + "num_input_tokens_seen": 66490560, + "step": 963 + }, + { + "epoch": 60.1875, + "loss": 0.003932563588023186, + "loss_ce": 0.0006824417505413294, + "loss_xval": 0.0032501220703125, + "num_input_tokens_seen": 66490560, + "step": 963 + }, + { + "epoch": 60.25, + "grad_norm": 14.230928746177904, + "learning_rate": 5e-05, + "loss": 0.0109, + "num_input_tokens_seen": 66562304, + "step": 964 + }, + { + "epoch": 60.25, + "loss": 0.00826042890548706, + "loss_ce": 0.0007225867011584342, + "loss_xval": 0.007537841796875, + "num_input_tokens_seen": 66562304, + "step": 964 + }, + { + "epoch": 60.3125, + "grad_norm": 15.974909633110373, + "learning_rate": 5e-05, + "loss": 0.0126, + "num_input_tokens_seen": 66634112, + "step": 965 + }, + { + "epoch": 60.3125, + "loss": 0.016383003443479538, + "loss_ce": 0.0006359326071105897, + "loss_xval": 0.0157470703125, + "num_input_tokens_seen": 66634112, + "step": 965 + }, + { + "epoch": 60.375, + "grad_norm": 10.808098917481177, + "learning_rate": 5e-05, + "loss": 0.0061, + "num_input_tokens_seen": 66705728, + "step": 966 + }, + { + 
"epoch": 60.375, + "loss": 0.0064287637360394, + "loss_ce": 0.0005999064305797219, + "loss_xval": 0.005828857421875, + "num_input_tokens_seen": 66705728, + "step": 966 + }, + { + "epoch": 60.4375, + "grad_norm": 5.528738803033689, + "learning_rate": 5e-05, + "loss": 0.0042, + "num_input_tokens_seen": 66764800, + "step": 967 + }, + { + "epoch": 60.4375, + "loss": 0.0019519556080922484, + "loss_ce": 0.0006091821705922484, + "loss_xval": 0.0013427734375, + "num_input_tokens_seen": 66764800, + "step": 967 + }, + { + "epoch": 60.5, + "grad_norm": 0.7874106581706314, + "learning_rate": 5e-05, + "loss": 0.0021, + "num_input_tokens_seen": 66836352, + "step": 968 + }, + { + "epoch": 60.5, + "loss": 0.002332956064492464, + "loss_ce": 0.000601083564106375, + "loss_xval": 0.00173187255859375, + "num_input_tokens_seen": 66836352, + "step": 968 + }, + { + "epoch": 60.5625, + "grad_norm": 3.489877042850535, + "learning_rate": 5e-05, + "loss": 0.0031, + "num_input_tokens_seen": 66907968, + "step": 969 + }, + { + "epoch": 60.5625, + "loss": 0.0026276027783751488, + "loss_ce": 0.0005981839494779706, + "loss_xval": 0.0020294189453125, + "num_input_tokens_seen": 66907968, + "step": 969 + }, + { + "epoch": 60.625, + "grad_norm": 3.717508401531591, + "learning_rate": 5e-05, + "loss": 0.0021, + "num_input_tokens_seen": 66979584, + "step": 970 + }, + { + "epoch": 60.625, + "loss": 0.0015087537467479706, + "loss_ce": 0.0005398206412792206, + "loss_xval": 0.00096893310546875, + "num_input_tokens_seen": 66979584, + "step": 970 + }, + { + "epoch": 60.6875, + "grad_norm": 0.5407000248907325, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_input_tokens_seen": 67051200, + "step": 971 + }, + { + "epoch": 60.6875, + "loss": 0.0016631756443530321, + "loss_ce": 0.000610319257248193, + "loss_xval": 0.0010528564453125, + "num_input_tokens_seen": 67051200, + "step": 971 + }, + { + "epoch": 60.75, + "grad_norm": 1.6571346214836031, + "learning_rate": 5e-05, + "loss": 0.0021, + "num_input_tokens_seen": 67122816, + "step": 972 + }, + { + "epoch": 60.75, + "loss": 0.001810828223824501, + "loss_ce": 0.000544348731637001, + "loss_xval": 0.0012664794921875, + "num_input_tokens_seen": 67122816, + "step": 972 + }, + { + "epoch": 60.8125, + "grad_norm": 1.5450918420386779, + "learning_rate": 5e-05, + "loss": 0.0021, + "num_input_tokens_seen": 67194368, + "step": 973 + }, + { + "epoch": 60.8125, + "loss": 0.0022611215244978666, + "loss_ce": 0.0005368783604353666, + "loss_xval": 0.0017242431640625, + "num_input_tokens_seen": 67194368, + "step": 973 + }, + { + "epoch": 60.875, + "grad_norm": 0.7884652256068236, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 67265920, + "step": 974 + }, + { + "epoch": 60.875, + "loss": 0.0010354053229093552, + "loss_ce": 0.0005013477057218552, + "loss_xval": 0.0005340576171875, + "num_input_tokens_seen": 67265920, + "step": 974 + }, + { + "epoch": 60.9375, + "grad_norm": 0.5366298158606575, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 67337600, + "step": 975 + }, + { + "epoch": 60.9375, + "loss": 0.0011774263111874461, + "loss_ce": 0.0005022248951718211, + "loss_xval": 0.000675201416015625, + "num_input_tokens_seen": 67337600, + "step": 975 + }, + { + "epoch": 61.0, + "grad_norm": 2.310955169102163, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_input_tokens_seen": 67409280, + "step": 976 + }, + { + "epoch": 61.0, + "loss": 0.001500699669122696, + "loss_ce": 0.00048599025467410684, + "loss_xval": 0.00101470947265625, + "num_input_tokens_seen": 
67409280, + "step": 976 + }, + { + "epoch": 61.0625, + "grad_norm": 3.349124799438185, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_input_tokens_seen": 67480896, + "step": 977 + }, + { + "epoch": 61.0625, + "loss": 0.0016312211519107223, + "loss_ce": 0.00044103560503572226, + "loss_xval": 0.001190185546875, + "num_input_tokens_seen": 67480896, + "step": 977 + }, + { + "epoch": 61.125, + "grad_norm": 3.8676898786882665, + "learning_rate": 5e-05, + "loss": 0.0016, + "num_input_tokens_seen": 67552512, + "step": 978 + }, + { + "epoch": 61.125, + "loss": 0.0016393068945035338, + "loss_ce": 0.0004414919239934534, + "loss_xval": 0.00119781494140625, + "num_input_tokens_seen": 67552512, + "step": 978 + }, + { + "epoch": 61.1875, + "grad_norm": 4.27868966415631, + "learning_rate": 5e-05, + "loss": 0.0019, + "num_input_tokens_seen": 67624128, + "step": 979 + }, + { + "epoch": 61.1875, + "loss": 0.0018708731513470411, + "loss_ce": 0.00045180582674220204, + "loss_xval": 0.0014190673828125, + "num_input_tokens_seen": 67624128, + "step": 979 + }, + { + "epoch": 61.25, + "grad_norm": 5.103740631510975, + "learning_rate": 5e-05, + "loss": 0.0019, + "num_input_tokens_seen": 67695808, + "step": 980 + }, + { + "epoch": 61.25, + "loss": 0.001763427397236228, + "loss_ce": 0.0003901363234035671, + "loss_xval": 0.001373291015625, + "num_input_tokens_seen": 67695808, + "step": 980 + }, + { + "epoch": 61.3125, + "grad_norm": 7.877201524071439, + "learning_rate": 5e-05, + "loss": 0.0036, + "num_input_tokens_seen": 67754944, + "step": 981 + }, + { + "epoch": 61.3125, + "loss": 0.003038346767425537, + "loss_ce": 0.00042909375042654574, + "loss_xval": 0.0026092529296875, + "num_input_tokens_seen": 67754944, + "step": 981 + }, + { + "epoch": 61.375, + "grad_norm": 13.964321159085115, + "learning_rate": 5e-05, + "loss": 0.0089, + "num_input_tokens_seen": 67826624, + "step": 982 + }, + { + "epoch": 61.375, + "loss": 0.009481735527515411, + "loss_ce": 0.00038749678060412407, + "loss_xval": 0.00909423828125, + "num_input_tokens_seen": 67826624, + "step": 982 + }, + { + "epoch": 61.4375, + "grad_norm": 25.125654309287086, + "learning_rate": 5e-05, + "loss": 0.0268, + "num_input_tokens_seen": 67873344, + "step": 983 + }, + { + "epoch": 61.4375, + "loss": 0.02721680887043476, + "loss_ce": 0.000361339480150491, + "loss_xval": 0.02685546875, + "num_input_tokens_seen": 67873344, + "step": 983 + }, + { + "epoch": 61.5, + "grad_norm": 45.26377203729771, + "learning_rate": 5e-05, + "loss": 0.087, + "num_input_tokens_seen": 67944960, + "step": 984 + }, + { + "epoch": 61.5, + "loss": 0.08778023719787598, + "loss_ce": 0.0003778930695261806, + "loss_xval": 0.08740234375, + "num_input_tokens_seen": 67944960, + "step": 984 + }, + { + "epoch": 61.5625, + "grad_norm": 79.46728193534139, + "learning_rate": 5e-05, + "loss": 0.266, + "num_input_tokens_seen": 68016576, + "step": 985 + }, + { + "epoch": 61.5625, + "loss": 0.2660321593284607, + "loss_ce": 0.0004071468429174274, + "loss_xval": 0.265625, + "num_input_tokens_seen": 68016576, + "step": 985 + }, + { + "epoch": 61.625, + "grad_norm": 121.02400174402173, + "learning_rate": 5e-05, + "loss": 0.6274, + "num_input_tokens_seen": 68088256, + "step": 986 + }, + { + "epoch": 61.625, + "loss": 0.6254241466522217, + "loss_ce": 0.0004241722053848207, + "loss_xval": 0.625, + "num_input_tokens_seen": 68088256, + "step": 986 + }, + { + "epoch": 61.6875, + "grad_norm": 137.4454848089739, + "learning_rate": 5e-05, + "loss": 0.8307, + "num_input_tokens_seen": 68159808, + "step": 987 + }, + { + 
"epoch": 61.6875, + "loss": 0.8248781561851501, + "loss_ce": 0.0006593933212570846, + "loss_xval": 0.82421875, + "num_input_tokens_seen": 68159808, + "step": 987 + }, + { + "epoch": 61.75, + "grad_norm": 87.03113064555924, + "learning_rate": 5e-05, + "loss": 0.3425, + "num_input_tokens_seen": 68231360, + "step": 988 + }, + { + "epoch": 61.75, + "loss": 0.3444867432117462, + "loss_ce": 0.0007367533398792148, + "loss_xval": 0.34375, + "num_input_tokens_seen": 68231360, + "step": 988 + }, + { + "epoch": 61.8125, + "grad_norm": 10.68486256134069, + "learning_rate": 5e-05, + "loss": 0.0083, + "num_input_tokens_seen": 68303040, + "step": 989 + }, + { + "epoch": 61.8125, + "loss": 0.008067439310252666, + "loss_ce": 0.0006516677094623446, + "loss_xval": 0.007415771484375, + "num_input_tokens_seen": 68303040, + "step": 989 + }, + { + "epoch": 61.875, + "grad_norm": 71.27713716064294, + "learning_rate": 5e-05, + "loss": 0.2234, + "num_input_tokens_seen": 68362112, + "step": 990 + }, + { + "epoch": 61.875, + "loss": 0.2184855192899704, + "loss_ce": 0.0007120799855329096, + "loss_xval": 0.2177734375, + "num_input_tokens_seen": 68362112, + "step": 990 + }, + { + "epoch": 61.9375, + "grad_norm": 150.67348138910384, + "learning_rate": 5e-05, + "loss": 0.9576, + "num_input_tokens_seen": 68433728, + "step": 991 + }, + { + "epoch": 61.9375, + "loss": 0.9381079077720642, + "loss_ce": 0.0006078826263546944, + "loss_xval": 0.9375, + "num_input_tokens_seen": 68433728, + "step": 991 + }, + { + "epoch": 62.0, + "grad_norm": 160.0697716894221, + "learning_rate": 5e-05, + "loss": 0.9985, + "num_input_tokens_seen": 68505280, + "step": 992 + }, + { + "epoch": 62.0, + "loss": 1.0446293354034424, + "loss_ce": 0.005566796753555536, + "loss_xval": 1.0390625, + "num_input_tokens_seen": 68505280, + "step": 992 + }, + { + "epoch": 62.0625, + "grad_norm": 54.725170062410925, + "learning_rate": 5e-05, + "loss": 0.1661, + "num_input_tokens_seen": 68576896, + "step": 993 + }, + { + "epoch": 62.0625, + "loss": 0.14711056649684906, + "loss_ce": 0.001602747361175716, + "loss_xval": 0.1455078125, + "num_input_tokens_seen": 68576896, + "step": 993 + }, + { + "epoch": 62.125, + "grad_norm": 57.095522982761416, + "learning_rate": 5e-05, + "loss": 0.2123, + "num_input_tokens_seen": 68648448, + "step": 994 + }, + { + "epoch": 62.125, + "loss": 0.19358542561531067, + "loss_ce": 0.009991676546633244, + "loss_xval": 0.18359375, + "num_input_tokens_seen": 68648448, + "step": 994 + }, + { + "epoch": 62.1875, + "grad_norm": 129.51112432923358, + "learning_rate": 5e-05, + "loss": 0.9835, + "num_input_tokens_seen": 68720128, + "step": 995 + }, + { + "epoch": 62.1875, + "loss": 1.0289194583892822, + "loss_ce": 0.028919465839862823, + "loss_xval": 1.0, + "num_input_tokens_seen": 68720128, + "step": 995 + }, + { + "epoch": 62.25, + "grad_norm": 236.6345678485908, + "learning_rate": 5e-05, + "loss": 3.1, + "num_input_tokens_seen": 68791744, + "step": 996 + }, + { + "epoch": 62.25, + "loss": 3.211564302444458, + "loss_ce": 0.03968937322497368, + "loss_xval": 3.171875, + "num_input_tokens_seen": 68791744, + "step": 996 + }, + { + "epoch": 62.3125, + "grad_norm": 225.87529604424057, + "learning_rate": 5e-05, + "loss": 2.516, + "num_input_tokens_seen": 68851008, + "step": 997 + }, + { + "epoch": 62.3125, + "loss": 2.585651397705078, + "loss_ce": 0.00752646429464221, + "loss_xval": 2.578125, + "num_input_tokens_seen": 68851008, + "step": 997 + }, + { + "epoch": 62.375, + "grad_norm": 22.233360328809926, + "learning_rate": 5e-05, + "loss": 0.1002, + 
"num_input_tokens_seen": 68922624, + "step": 998 + }, + { + "epoch": 62.375, + "loss": 0.08905638754367828, + "loss_ce": 0.06305541098117828, + "loss_xval": 0.0260009765625, + "num_input_tokens_seen": 68922624, + "step": 998 + }, + { + "epoch": 62.4375, + "grad_norm": 330.7636326856827, + "learning_rate": 5e-05, + "loss": 3.8083, + "num_input_tokens_seen": 68994176, + "step": 999 + }, + { + "epoch": 62.4375, + "loss": 3.894900321960449, + "loss_ce": 0.03552544116973877, + "loss_xval": 3.859375, + "num_input_tokens_seen": 68994176, + "step": 999 + }, + { + "epoch": 62.5, + "grad_norm": 180.3650646320968, + "learning_rate": 5e-05, + "loss": 1.4232, + "num_input_tokens_seen": 69065728, + "step": 1000 + }, + { + "epoch": 62.5, + "eval_synth_IoU": 0.0, + "eval_synth_MAE_x": 1.335601806640625, + "eval_synth_MAE_y": 1.298309326171875, + "eval_synth_NUM_probability": 0.9881191998720169, + "eval_synth_inside_bbox": 0.0, + "eval_synth_loss": 1.8374046087265015, + "eval_synth_loss_ce": 0.015139079187065363, + "eval_synth_loss_xval": 1.822265625, + "eval_synth_runtime": 63.5264, + "eval_synth_samples_per_second": 2.015, + "eval_synth_steps_per_second": 0.063, + "num_input_tokens_seen": 69065728, + "step": 1000 + }, + { + "epoch": 62.5, + "loss": 1.84526789188385, + "loss_ce": 0.017142947763204575, + "loss_xval": 1.828125, + "num_input_tokens_seen": 69065728, + "step": 1000 + }, + { + "epoch": 62.5625, + "grad_norm": 200.80735424840003, + "learning_rate": 5e-05, + "loss": 1.8535, + "num_input_tokens_seen": 69124736, + "step": 1001 + }, + { + "epoch": 62.5625, + "loss": 1.8449586629867554, + "loss_ce": 0.016833681613206863, + "loss_xval": 1.828125, + "num_input_tokens_seen": 69124736, + "step": 1001 + }, + { + "epoch": 62.625, + "grad_norm": 114.3159442962799, + "learning_rate": 5e-05, + "loss": 0.565, + "num_input_tokens_seen": 69183872, + "step": 1002 + }, + { + "epoch": 62.625, + "loss": 0.5889109373092651, + "loss_ce": 0.018598422408103943, + "loss_xval": 0.5703125, + "num_input_tokens_seen": 69183872, + "step": 1002 + }, + { + "epoch": 62.6875, + "grad_norm": 186.98765624810193, + "learning_rate": 5e-05, + "loss": 1.5284, + "num_input_tokens_seen": 69255424, + "step": 1003 + }, + { + "epoch": 62.6875, + "loss": 1.472475528717041, + "loss_ce": 0.011538065969944, + "loss_xval": 1.4609375, + "num_input_tokens_seen": 69255424, + "step": 1003 + }, + { + "epoch": 62.75, + "grad_norm": 66.58049614141694, + "learning_rate": 5e-05, + "loss": 0.2263, + "num_input_tokens_seen": 69326976, + "step": 1004 + }, + { + "epoch": 62.75, + "loss": 0.23296989500522614, + "loss_ce": 0.012266767211258411, + "loss_xval": 0.220703125, + "num_input_tokens_seen": 69326976, + "step": 1004 + }, + { + "epoch": 62.8125, + "grad_norm": 159.02076786011705, + "learning_rate": 5e-05, + "loss": 1.1836, + "num_input_tokens_seen": 69398656, + "step": 1005 + }, + { + "epoch": 62.8125, + "loss": 1.1524460315704346, + "loss_ce": 0.00400858698412776, + "loss_xval": 1.1484375, + "num_input_tokens_seen": 69398656, + "step": 1005 + }, + { + "epoch": 62.875, + "grad_norm": 7.503047980086942, + "learning_rate": 5e-05, + "loss": 0.0494, + "num_input_tokens_seen": 69470336, + "step": 1006 + }, + { + "epoch": 62.875, + "loss": 0.04918452724814415, + "loss_ce": 0.0037743712309747934, + "loss_xval": 0.04541015625, + "num_input_tokens_seen": 69470336, + "step": 1006 + }, + { + "epoch": 62.9375, + "grad_norm": 149.27054672521433, + "learning_rate": 5e-05, + "loss": 1.0062, + "num_input_tokens_seen": 69542080, + "step": 1007 + }, + { + "epoch": 
62.9375, + "loss": 0.8986896872520447, + "loss_ce": 0.004158430732786655, + "loss_xval": 0.89453125, + "num_input_tokens_seen": 69542080, + "step": 1007 + }, + { + "epoch": 63.0, + "grad_norm": 47.09901096637737, + "learning_rate": 5e-05, + "loss": 0.1531, + "num_input_tokens_seen": 69613760, + "step": 1008 + }, + { + "epoch": 63.0, + "loss": 0.16589389741420746, + "loss_ce": 0.0067142159678041935, + "loss_xval": 0.1591796875, + "num_input_tokens_seen": 69613760, + "step": 1008 + }, + { + "epoch": 63.0625, + "grad_norm": 70.27234087538793, + "learning_rate": 5e-05, + "loss": 0.3167, + "num_input_tokens_seen": 69685376, + "step": 1009 + }, + { + "epoch": 63.0625, + "loss": 0.31789273023605347, + "loss_ce": 0.019064605236053467, + "loss_xval": 0.298828125, + "num_input_tokens_seen": 69685376, + "step": 1009 + }, + { + "epoch": 63.125, + "grad_norm": 72.23082569297506, + "learning_rate": 5e-05, + "loss": 0.349, + "num_input_tokens_seen": 69757056, + "step": 1010 + }, + { + "epoch": 63.125, + "loss": 0.352620929479599, + "loss_ce": 0.012777186930179596, + "loss_xval": 0.33984375, + "num_input_tokens_seen": 69757056, + "step": 1010 + }, + { + "epoch": 63.1875, + "grad_norm": 22.212031709182547, + "learning_rate": 5e-05, + "loss": 0.0616, + "num_input_tokens_seen": 69828672, + "step": 1011 + }, + { + "epoch": 63.1875, + "loss": 0.06873536854982376, + "loss_ce": 0.013803728856146336, + "loss_xval": 0.054931640625, + "num_input_tokens_seen": 69828672, + "step": 1011 + }, + { + "epoch": 63.25, + "grad_norm": 72.80873328440106, + "learning_rate": 5e-05, + "loss": 0.3807, + "num_input_tokens_seen": 69900288, + "step": 1012 + }, + { + "epoch": 63.25, + "loss": 0.4027869403362274, + "loss_ce": 0.012161929160356522, + "loss_xval": 0.390625, + "num_input_tokens_seen": 69900288, + "step": 1012 + }, + { + "epoch": 63.3125, + "grad_norm": 17.47620030362804, + "learning_rate": 5e-05, + "loss": 0.0398, + "num_input_tokens_seen": 69971904, + "step": 1013 + }, + { + "epoch": 63.3125, + "loss": 0.03510742262005806, + "loss_ce": 0.013623046688735485, + "loss_xval": 0.021484375, + "num_input_tokens_seen": 69971904, + "step": 1013 + }, + { + "epoch": 63.375, + "grad_norm": 55.426872710066, + "learning_rate": 5e-05, + "loss": 0.2349, + "num_input_tokens_seen": 70043520, + "step": 1014 + }, + { + "epoch": 63.375, + "loss": 0.22966395318508148, + "loss_ce": 0.012867072597146034, + "loss_xval": 0.216796875, + "num_input_tokens_seen": 70043520, + "step": 1014 + }, + { + "epoch": 63.4375, + "grad_norm": 44.943384288182784, + "learning_rate": 5e-05, + "loss": 0.1586, + "num_input_tokens_seen": 70115264, + "step": 1015 + }, + { + "epoch": 63.4375, + "loss": 0.15268297493457794, + "loss_ce": 0.012057974934577942, + "loss_xval": 0.140625, + "num_input_tokens_seen": 70115264, + "step": 1015 + }, + { + "epoch": 63.5, + "grad_norm": 20.501575432429476, + "learning_rate": 5e-05, + "loss": 0.0473, + "num_input_tokens_seen": 70186816, + "step": 1016 + }, + { + "epoch": 63.5, + "loss": 0.047375284135341644, + "loss_ce": 0.011242473497986794, + "loss_xval": 0.0361328125, + "num_input_tokens_seen": 70186816, + "step": 1016 + }, + { + "epoch": 63.5625, + "grad_norm": 50.11477187146609, + "learning_rate": 5e-05, + "loss": 0.1953, + "num_input_tokens_seen": 70258368, + "step": 1017 + }, + { + "epoch": 63.5625, + "loss": 0.2005818635225296, + "loss_ce": 0.009175607934594154, + "loss_xval": 0.19140625, + "num_input_tokens_seen": 70258368, + "step": 1017 + }, + { + "epoch": 63.625, + "grad_norm": 17.595502628261446, + "learning_rate": 
5e-05, + "loss": 0.0397, + "num_input_tokens_seen": 70329920, + "step": 1018 + }, + { + "epoch": 63.625, + "loss": 0.044486869126558304, + "loss_ce": 0.007133353501558304, + "loss_xval": 0.037353515625, + "num_input_tokens_seen": 70329920, + "step": 1018 + }, + { + "epoch": 63.6875, + "grad_norm": 34.4850294390446, + "learning_rate": 5e-05, + "loss": 0.0961, + "num_input_tokens_seen": 70401536, + "step": 1019 + }, + { + "epoch": 63.6875, + "loss": 0.11015944927930832, + "loss_ce": 0.005667262244969606, + "loss_xval": 0.1044921875, + "num_input_tokens_seen": 70401536, + "step": 1019 + }, + { + "epoch": 63.75, + "grad_norm": 41.12267515604947, + "learning_rate": 5e-05, + "loss": 0.1271, + "num_input_tokens_seen": 70473088, + "step": 1020 + }, + { + "epoch": 63.75, + "loss": 0.12851181626319885, + "loss_ce": 0.0054649352096021175, + "loss_xval": 0.123046875, + "num_input_tokens_seen": 70473088, + "step": 1020 + }, + { + "epoch": 63.8125, + "grad_norm": 1.6562020306690155, + "learning_rate": 5e-05, + "loss": 0.0132, + "num_input_tokens_seen": 70544768, + "step": 1021 + }, + { + "epoch": 63.8125, + "loss": 0.009423565119504929, + "loss_ce": 0.005029033403843641, + "loss_xval": 0.00439453125, + "num_input_tokens_seen": 70544768, + "step": 1021 + }, + { + "epoch": 63.875, + "grad_norm": 40.425729593839435, + "learning_rate": 5e-05, + "loss": 0.1248, + "num_input_tokens_seen": 70616320, + "step": 1022 + }, + { + "epoch": 63.875, + "loss": 0.12477166950702667, + "loss_ce": 0.005142764188349247, + "loss_xval": 0.11962890625, + "num_input_tokens_seen": 70616320, + "step": 1022 + }, + { + "epoch": 63.9375, + "grad_norm": 26.761025062310726, + "learning_rate": 5e-05, + "loss": 0.0593, + "num_input_tokens_seen": 70687936, + "step": 1023 + }, + { + "epoch": 63.9375, + "loss": 0.05864199995994568, + "loss_ce": 0.004686920437961817, + "loss_xval": 0.053955078125, + "num_input_tokens_seen": 70687936, + "step": 1023 + }, + { + "epoch": 64.0, + "grad_norm": 16.220804961953952, + "learning_rate": 5e-05, + "loss": 0.0266, + "num_input_tokens_seen": 70759744, + "step": 1024 + }, + { + "epoch": 64.0, + "loss": 0.03134244307875633, + "loss_ce": 0.0038766234647482634, + "loss_xval": 0.0274658203125, + "num_input_tokens_seen": 70759744, + "step": 1024 + }, + { + "epoch": 64.0625, + "grad_norm": 37.128469054500776, + "learning_rate": 5e-05, + "loss": 0.1006, + "num_input_tokens_seen": 70831424, + "step": 1025 + }, + { + "epoch": 64.0625, + "loss": 0.10453501343727112, + "loss_ce": 0.003949078731238842, + "loss_xval": 0.1005859375, + "num_input_tokens_seen": 70831424, + "step": 1025 + }, + { + "epoch": 64.125, + "grad_norm": 9.66025199247302, + "learning_rate": 5e-05, + "loss": 0.0183, + "num_input_tokens_seen": 70903168, + "step": 1026 + }, + { + "epoch": 64.125, + "loss": 0.017569195479154587, + "loss_ce": 0.003653179621323943, + "loss_xval": 0.013916015625, + "num_input_tokens_seen": 70903168, + "step": 1026 + }, + { + "epoch": 64.1875, + "grad_norm": 29.50918018984489, + "learning_rate": 5e-05, + "loss": 0.0682, + "num_input_tokens_seen": 70974848, + "step": 1027 + }, + { + "epoch": 64.1875, + "loss": 0.06166379898786545, + "loss_ce": 0.0035583314020186663, + "loss_xval": 0.05810546875, + "num_input_tokens_seen": 70974848, + "step": 1027 + }, + { + "epoch": 64.25, + "grad_norm": 24.911312204437753, + "learning_rate": 5e-05, + "loss": 0.0505, + "num_input_tokens_seen": 71046400, + "step": 1028 + }, + { + "epoch": 64.25, + "loss": 0.05654432252049446, + "loss_ce": 0.0035658059641718864, + "loss_xval": 
0.052978515625, + "num_input_tokens_seen": 71046400, + "step": 1028 + }, + { + "epoch": 64.3125, + "grad_norm": 9.45537493788124, + "learning_rate": 5e-05, + "loss": 0.0149, + "num_input_tokens_seen": 71105472, + "step": 1029 + }, + { + "epoch": 64.3125, + "loss": 0.015377216972410679, + "loss_ce": 0.0030481156427413225, + "loss_xval": 0.0123291015625, + "num_input_tokens_seen": 71105472, + "step": 1029 + }, + { + "epoch": 64.375, + "grad_norm": 26.640750365389675, + "learning_rate": 5e-05, + "loss": 0.0554, + "num_input_tokens_seen": 71177088, + "step": 1030 + }, + { + "epoch": 64.375, + "loss": 0.05298246070742607, + "loss_ce": 0.00293363188393414, + "loss_xval": 0.050048828125, + "num_input_tokens_seen": 71177088, + "step": 1030 + }, + { + "epoch": 64.4375, + "grad_norm": 11.291872650185505, + "learning_rate": 5e-05, + "loss": 0.0144, + "num_input_tokens_seen": 71248768, + "step": 1031 + }, + { + "epoch": 64.4375, + "loss": 0.014845769852399826, + "loss_ce": 0.002821844071149826, + "loss_xval": 0.01202392578125, + "num_input_tokens_seen": 71248768, + "step": 1031 + }, + { + "epoch": 64.5, + "grad_norm": 18.072573126276268, + "learning_rate": 5e-05, + "loss": 0.0278, + "num_input_tokens_seen": 71320384, + "step": 1032 + }, + { + "epoch": 64.5, + "loss": 0.027823323383927345, + "loss_ce": 0.0029209794010967016, + "loss_xval": 0.02490234375, + "num_input_tokens_seen": 71320384, + "step": 1032 + }, + { + "epoch": 64.5625, + "grad_norm": 21.662435019484295, + "learning_rate": 5e-05, + "loss": 0.0382, + "num_input_tokens_seen": 71392000, + "step": 1033 + }, + { + "epoch": 64.5625, + "loss": 0.03982299193739891, + "loss_ce": 0.002713618101552129, + "loss_xval": 0.037109375, + "num_input_tokens_seen": 71392000, + "step": 1033 + }, + { + "epoch": 64.625, + "grad_norm": 2.838038638389152, + "learning_rate": 5e-05, + "loss": 0.0068, + "num_input_tokens_seen": 71463680, + "step": 1034 + }, + { + "epoch": 64.625, + "loss": 0.006255846470594406, + "loss_ce": 0.002441149204969406, + "loss_xval": 0.003814697265625, + "num_input_tokens_seen": 71463680, + "step": 1034 + }, + { + "epoch": 64.6875, + "grad_norm": 20.100523420312953, + "learning_rate": 5e-05, + "loss": 0.0371, + "num_input_tokens_seen": 71535488, + "step": 1035 + }, + { + "epoch": 64.6875, + "loss": 0.03598330169916153, + "loss_ce": 0.0022918949835002422, + "loss_xval": 0.03369140625, + "num_input_tokens_seen": 71535488, + "step": 1035 + }, + { + "epoch": 64.75, + "grad_norm": 11.602017826304696, + "learning_rate": 5e-05, + "loss": 0.0152, + "num_input_tokens_seen": 71594624, + "step": 1036 + }, + { + "epoch": 64.75, + "loss": 0.016812482848763466, + "loss_ce": 0.0021640451159328222, + "loss_xval": 0.0146484375, + "num_input_tokens_seen": 71594624, + "step": 1036 + }, + { + "epoch": 64.8125, + "grad_norm": 8.661482672312363, + "learning_rate": 5e-05, + "loss": 0.0096, + "num_input_tokens_seen": 71666368, + "step": 1037 + }, + { + "epoch": 64.8125, + "loss": 0.010122916661202908, + "loss_ce": 0.002249381737783551, + "loss_xval": 0.00787353515625, + "num_input_tokens_seen": 71666368, + "step": 1037 + }, + { + "epoch": 64.875, + "grad_norm": 18.271595554722843, + "learning_rate": 5e-05, + "loss": 0.0279, + "num_input_tokens_seen": 71738048, + "step": 1038 + }, + { + "epoch": 64.875, + "loss": 0.03008742816746235, + "loss_ce": 0.0022553973831236362, + "loss_xval": 0.02783203125, + "num_input_tokens_seen": 71738048, + "step": 1038 + }, + { + "epoch": 64.9375, + "grad_norm": 3.060520876484566, + "learning_rate": 5e-05, + "loss": 0.005, + 
"num_input_tokens_seen": 71809600, + "step": 1039 + }, + { + "epoch": 64.9375, + "loss": 0.006218142807483673, + "loss_ce": 0.002128787338733673, + "loss_xval": 0.00408935546875, + "num_input_tokens_seen": 71809600, + "step": 1039 + }, + { + "epoch": 65.0, + "grad_norm": 13.986125332437513, + "learning_rate": 5e-05, + "loss": 0.0174, + "num_input_tokens_seen": 71868672, + "step": 1040 + }, + { + "epoch": 65.0, + "loss": 0.019594108685851097, + "loss_ce": 0.001893913489766419, + "loss_xval": 0.0177001953125, + "num_input_tokens_seen": 71868672, + "step": 1040 + }, + { + "epoch": 65.0625, + "grad_norm": 12.209464179139154, + "learning_rate": 5e-05, + "loss": 0.0145, + "num_input_tokens_seen": 71940480, + "step": 1041 + }, + { + "epoch": 65.0625, + "loss": 0.014442279934883118, + "loss_ce": 0.0018690372817218304, + "loss_xval": 0.0125732421875, + "num_input_tokens_seen": 71940480, + "step": 1041 + }, + { + "epoch": 65.125, + "grad_norm": 4.50926374607243, + "learning_rate": 5e-05, + "loss": 0.0058, + "num_input_tokens_seen": 72012096, + "step": 1042 + }, + { + "epoch": 65.125, + "loss": 0.0058934856206178665, + "loss_ce": 0.0019262002315372229, + "loss_xval": 0.00396728515625, + "num_input_tokens_seen": 72012096, + "step": 1042 + }, + { + "epoch": 65.1875, + "grad_norm": 15.853797766114015, + "learning_rate": 5e-05, + "loss": 0.0215, + "num_input_tokens_seen": 72083648, + "step": 1043 + }, + { + "epoch": 65.1875, + "loss": 0.021349458023905754, + "loss_ce": 0.0021844184957444668, + "loss_xval": 0.0191650390625, + "num_input_tokens_seen": 72083648, + "step": 1043 + }, + { + "epoch": 65.25, + "grad_norm": 3.198409504037858, + "learning_rate": 5e-05, + "loss": 0.0047, + "num_input_tokens_seen": 72155328, + "step": 1044 + }, + { + "epoch": 65.25, + "loss": 0.004548709373921156, + "loss_ce": 0.0017258335137739778, + "loss_xval": 0.0028228759765625, + "num_input_tokens_seen": 72155328, + "step": 1044 + }, + { + "epoch": 65.3125, + "grad_norm": 11.095617305917825, + "learning_rate": 5e-05, + "loss": 0.0121, + "num_input_tokens_seen": 72226944, + "step": 1045 + }, + { + "epoch": 65.3125, + "loss": 0.011855102144181728, + "loss_ce": 0.0016622307011857629, + "loss_xval": 0.01019287109375, + "num_input_tokens_seen": 72226944, + "step": 1045 + }, + { + "epoch": 65.375, + "grad_norm": 10.122140050011588, + "learning_rate": 5e-05, + "loss": 0.0107, + "num_input_tokens_seen": 72298496, + "step": 1046 + }, + { + "epoch": 65.375, + "loss": 0.011897360906004906, + "loss_ce": 0.001582419266924262, + "loss_xval": 0.01031494140625, + "num_input_tokens_seen": 72298496, + "step": 1046 + }, + { + "epoch": 65.4375, + "grad_norm": 7.1144618783809745, + "learning_rate": 5e-05, + "loss": 0.0066, + "num_input_tokens_seen": 72370112, + "step": 1047 + }, + { + "epoch": 65.4375, + "loss": 0.006526774261146784, + "loss_ce": 0.0016134442994371057, + "loss_xval": 0.004913330078125, + "num_input_tokens_seen": 72370112, + "step": 1047 + }, + { + "epoch": 65.5, + "grad_norm": 10.69983363857111, + "learning_rate": 5e-05, + "loss": 0.0109, + "num_input_tokens_seen": 72429184, + "step": 1048 + }, + { + "epoch": 65.5, + "loss": 0.010820340365171432, + "loss_ce": 0.0014209267683327198, + "loss_xval": 0.0093994140625, + "num_input_tokens_seen": 72429184, + "step": 1048 + }, + { + "epoch": 65.5625, + "grad_norm": 0.7117590553510648, + "learning_rate": 5e-05, + "loss": 0.0038, + "num_input_tokens_seen": 72500800, + "step": 1049 + }, + { + "epoch": 65.5625, + "loss": 0.0037945066578686237, + "loss_ce": 0.0013683591969311237, + 
"loss_xval": 0.0024261474609375, + "num_input_tokens_seen": 72500800, + "step": 1049 + }, + { + "epoch": 65.625, + "grad_norm": 10.867323388580141, + "learning_rate": 5e-05, + "loss": 0.0111, + "num_input_tokens_seen": 72559936, + "step": 1050 + }, + { + "epoch": 65.625, + "loss": 0.011647101491689682, + "loss_ce": 0.001271124929189682, + "loss_xval": 0.0103759765625, + "num_input_tokens_seen": 72559936, + "step": 1050 + }, + { + "epoch": 65.6875, + "grad_norm": 5.686620656543113, + "learning_rate": 5e-05, + "loss": 0.005, + "num_input_tokens_seen": 72631616, + "step": 1051 + }, + { + "epoch": 65.6875, + "loss": 0.005414774175733328, + "loss_ce": 0.0012949011288583279, + "loss_xval": 0.004119873046875, + "num_input_tokens_seen": 72631616, + "step": 1051 + }, + { + "epoch": 65.75, + "grad_norm": 7.442335265025644, + "learning_rate": 5e-05, + "loss": 0.0071, + "num_input_tokens_seen": 72703296, + "step": 1052 + }, + { + "epoch": 65.75, + "loss": 0.007290867157280445, + "loss_ce": 0.001217869110405445, + "loss_xval": 0.006072998046875, + "num_input_tokens_seen": 72703296, + "step": 1052 + }, + { + "epoch": 65.8125, + "grad_norm": 8.994490972618683, + "learning_rate": 5e-05, + "loss": 0.0095, + "num_input_tokens_seen": 72774912, + "step": 1053 + }, + { + "epoch": 65.8125, + "loss": 0.010024000890552998, + "loss_ce": 0.0012349386233836412, + "loss_xval": 0.0087890625, + "num_input_tokens_seen": 72774912, + "step": 1053 + }, + { + "epoch": 65.875, + "grad_norm": 4.616509642337401, + "learning_rate": 5e-05, + "loss": 0.004, + "num_input_tokens_seen": 72846528, + "step": 1054 + }, + { + "epoch": 65.875, + "loss": 0.0034966999664902687, + "loss_ce": 0.0011926227016374469, + "loss_xval": 0.0023040771484375, + "num_input_tokens_seen": 72846528, + "step": 1054 + }, + { + "epoch": 65.9375, + "grad_norm": 8.391185923447743, + "learning_rate": 5e-05, + "loss": 0.0065, + "num_input_tokens_seen": 72918080, + "step": 1055 + }, + { + "epoch": 65.9375, + "loss": 0.006700682453811169, + "loss_ce": 0.0011464833514764905, + "loss_xval": 0.00555419921875, + "num_input_tokens_seen": 72918080, + "step": 1055 + }, + { + "epoch": 66.0, + "grad_norm": 0.5200151132607369, + "learning_rate": 5e-05, + "loss": 0.0024, + "num_input_tokens_seen": 72989632, + "step": 1056 + }, + { + "epoch": 66.0, + "loss": 0.0027731633745133877, + "loss_ce": 0.0015372015768662095, + "loss_xval": 0.0012359619140625, + "num_input_tokens_seen": 72989632, + "step": 1056 + }, + { + "epoch": 66.0625, + "grad_norm": 7.8363630439888, + "learning_rate": 5e-05, + "loss": 0.0064, + "num_input_tokens_seen": 73061248, + "step": 1057 + }, + { + "epoch": 66.0625, + "loss": 0.005550441797822714, + "loss_ce": 0.0010948755079880357, + "loss_xval": 0.00445556640625, + "num_input_tokens_seen": 73061248, + "step": 1057 + }, + { + "epoch": 66.125, + "grad_norm": 2.6651868002357277, + "learning_rate": 5e-05, + "loss": 0.0029, + "num_input_tokens_seen": 73132800, + "step": 1058 + }, + { + "epoch": 66.125, + "loss": 0.0026142490096390247, + "loss_ce": 0.0011112582869827747, + "loss_xval": 0.00150299072265625, + "num_input_tokens_seen": 73132800, + "step": 1058 + }, + { + "epoch": 66.1875, + "grad_norm": 4.598939057870578, + "learning_rate": 5e-05, + "loss": 0.0036, + "num_input_tokens_seen": 73204352, + "step": 1059 + }, + { + "epoch": 66.1875, + "loss": 0.003604610450565815, + "loss_ce": 0.001025875099003315, + "loss_xval": 0.0025787353515625, + "num_input_tokens_seen": 73204352, + "step": 1059 + }, + { + "epoch": 66.25, + "grad_norm": 5.179290730834164, + 
"learning_rate": 5e-05, + "loss": 0.0038, + "num_input_tokens_seen": 73275968, + "step": 1060 + }, + { + "epoch": 66.25, + "loss": 0.0038217175751924515, + "loss_ce": 0.0009988414822146297, + "loss_xval": 0.0028228759765625, + "num_input_tokens_seen": 73275968, + "step": 1060 + }, + { + "epoch": 66.3125, + "grad_norm": 2.314749895515334, + "learning_rate": 5e-05, + "loss": 0.0023, + "num_input_tokens_seen": 73347648, + "step": 1061 + }, + { + "epoch": 66.3125, + "loss": 0.0025231381878256798, + "loss_ce": 0.0010277768597006798, + "loss_xval": 0.001495361328125, + "num_input_tokens_seen": 73347648, + "step": 1061 + }, + { + "epoch": 66.375, + "grad_norm": 7.116350353107652, + "learning_rate": 5e-05, + "loss": 0.005, + "num_input_tokens_seen": 73419392, + "step": 1062 + }, + { + "epoch": 66.375, + "loss": 0.005443810019642115, + "loss_ce": 0.0010187613079324365, + "loss_xval": 0.004425048828125, + "num_input_tokens_seen": 73419392, + "step": 1062 + }, + { + "epoch": 66.4375, + "grad_norm": 0.7000035546873251, + "learning_rate": 5e-05, + "loss": 0.0016, + "num_input_tokens_seen": 73490944, + "step": 1063 + }, + { + "epoch": 66.4375, + "loss": 0.001345249591395259, + "loss_ce": 0.0008722271886654198, + "loss_xval": 0.0004730224609375, + "num_input_tokens_seen": 73490944, + "step": 1063 + }, + { + "epoch": 66.5, + "grad_norm": 7.391093285153892, + "learning_rate": 5e-05, + "loss": 0.0054, + "num_input_tokens_seen": 73562624, + "step": 1064 + }, + { + "epoch": 66.5, + "loss": 0.005488572642207146, + "loss_ce": 0.0008499008254148066, + "loss_xval": 0.004638671875, + "num_input_tokens_seen": 73562624, + "step": 1064 + }, + { + "epoch": 66.5625, + "grad_norm": 1.4615064596624099, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_input_tokens_seen": 73634240, + "step": 1065 + }, + { + "epoch": 66.5625, + "loss": 0.0013411077670753002, + "loss_ce": 0.0008528265752829611, + "loss_xval": 0.00048828125, + "num_input_tokens_seen": 73634240, + "step": 1065 + }, + { + "epoch": 66.625, + "grad_norm": 7.578130916384667, + "learning_rate": 5e-05, + "loss": 0.0062, + "num_input_tokens_seen": 73705792, + "step": 1066 + }, + { + "epoch": 66.625, + "loss": 0.006180023308843374, + "loss_ce": 0.000778412155341357, + "loss_xval": 0.005401611328125, + "num_input_tokens_seen": 73705792, + "step": 1066 + }, + { + "epoch": 66.6875, + "grad_norm": 2.7945854003731605, + "learning_rate": 5e-05, + "loss": 0.0023, + "num_input_tokens_seen": 73777472, + "step": 1067 + }, + { + "epoch": 66.6875, + "loss": 0.002022083615884185, + "loss_ce": 0.0008166392799466848, + "loss_xval": 0.0012054443359375, + "num_input_tokens_seen": 73777472, + "step": 1067 + }, + { + "epoch": 66.75, + "grad_norm": 7.82940226419613, + "learning_rate": 5e-05, + "loss": 0.0054, + "num_input_tokens_seen": 73849088, + "step": 1068 + }, + { + "epoch": 66.75, + "loss": 0.005442717578262091, + "loss_ce": 0.0007735280669294298, + "loss_xval": 0.004669189453125, + "num_input_tokens_seen": 73849088, + "step": 1068 + }, + { + "epoch": 66.8125, + "grad_norm": 2.9098149809574982, + "learning_rate": 5e-05, + "loss": 0.0021, + "num_input_tokens_seen": 73920640, + "step": 1069 + }, + { + "epoch": 66.8125, + "loss": 0.0020586801692843437, + "loss_ce": 0.0007006479427218437, + "loss_xval": 0.0013580322265625, + "num_input_tokens_seen": 73920640, + "step": 1069 + }, + { + "epoch": 66.875, + "grad_norm": 5.130710698587081, + "learning_rate": 5e-05, + "loss": 0.0031, + "num_input_tokens_seen": 73979776, + "step": 1070 + }, + { + "epoch": 66.875, + "loss": 
0.003260532859712839, + "loss_ce": 0.000727573991753161, + "loss_xval": 0.002532958984375, + "num_input_tokens_seen": 73979776, + "step": 1070 + }, + { + "epoch": 66.9375, + "grad_norm": 5.263835846590998, + "learning_rate": 5e-05, + "loss": 0.0032, + "num_input_tokens_seen": 74051456, + "step": 1071 + }, + { + "epoch": 66.9375, + "loss": 0.0030756406486034393, + "loss_ce": 0.0006952694384381175, + "loss_xval": 0.00238037109375, + "num_input_tokens_seen": 74051456, + "step": 1071 + }, + { + "epoch": 67.0, + "grad_norm": 4.423793604769365, + "learning_rate": 5e-05, + "loss": 0.0027, + "num_input_tokens_seen": 74123136, + "step": 1072 + }, + { + "epoch": 67.0, + "loss": 0.003088552039116621, + "loss_ce": 0.0007234398508444428, + "loss_xval": 0.0023651123046875, + "num_input_tokens_seen": 74123136, + "step": 1072 + }, + { + "epoch": 67.0625, + "grad_norm": 5.4151559989195714, + "learning_rate": 5e-05, + "loss": 0.0035, + "num_input_tokens_seen": 74169728, + "step": 1073 + }, + { + "epoch": 67.0625, + "loss": 0.00415863236412406, + "loss_ce": 0.0006643697270192206, + "loss_xval": 0.0034942626953125, + "num_input_tokens_seen": 74169728, + "step": 1073 + }, + { + "epoch": 67.125, + "grad_norm": 2.6697138026778893, + "learning_rate": 5e-05, + "loss": 0.0021, + "num_input_tokens_seen": 74241280, + "step": 1074 + }, + { + "epoch": 67.125, + "loss": 0.0019654666539281607, + "loss_ce": 0.0006760990363545716, + "loss_xval": 0.00128936767578125, + "num_input_tokens_seen": 74241280, + "step": 1074 + }, + { + "epoch": 67.1875, + "grad_norm": 6.466851461930606, + "learning_rate": 5e-05, + "loss": 0.0041, + "num_input_tokens_seen": 74312832, + "step": 1075 + }, + { + "epoch": 67.1875, + "loss": 0.004360595252364874, + "loss_ce": 0.0006984857027418911, + "loss_xval": 0.003662109375, + "num_input_tokens_seen": 74312832, + "step": 1075 + }, + { + "epoch": 67.25, + "grad_norm": 2.748059317089271, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 74384448, + "step": 1076 + }, + { + "epoch": 67.25, + "loss": 0.0014462934341281652, + "loss_ce": 0.0006108746747486293, + "loss_xval": 0.000835418701171875, + "num_input_tokens_seen": 74384448, + "step": 1076 + }, + { + "epoch": 67.3125, + "grad_norm": 6.290131306911363, + "learning_rate": 5e-05, + "loss": 0.0036, + "num_input_tokens_seen": 74456000, + "step": 1077 + }, + { + "epoch": 67.3125, + "loss": 0.004040678031742573, + "loss_ce": 0.0006227090489119291, + "loss_xval": 0.00341796875, + "num_input_tokens_seen": 74456000, + "step": 1077 + }, + { + "epoch": 67.375, + "grad_norm": 3.3652699857297126, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_input_tokens_seen": 74527680, + "step": 1078 + }, + { + "epoch": 67.375, + "loss": 0.0017457085195928812, + "loss_ce": 0.0005936700035817921, + "loss_xval": 0.00115203857421875, + "num_input_tokens_seen": 74527680, + "step": 1078 + }, + { + "epoch": 67.4375, + "grad_norm": 4.970270177767509, + "learning_rate": 5e-05, + "loss": 0.0026, + "num_input_tokens_seen": 74599360, + "step": 1079 + }, + { + "epoch": 67.4375, + "loss": 0.002559410873800516, + "loss_ce": 0.0005910270265303552, + "loss_xval": 0.0019683837890625, + "num_input_tokens_seen": 74599360, + "step": 1079 + }, + { + "epoch": 67.5, + "grad_norm": 3.2857842297194555, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_input_tokens_seen": 74671040, + "step": 1080 + }, + { + "epoch": 67.5, + "loss": 0.001640387112274766, + "loss_ce": 0.0005493836360983551, + "loss_xval": 0.00109100341796875, + "num_input_tokens_seen": 74671040, + "step": 1080 
+ }, + { + "epoch": 67.5625, + "grad_norm": 3.3166996297328875, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 74730112, + "step": 1081 + }, + { + "epoch": 67.5625, + "loss": 0.0017551060300320387, + "loss_ce": 0.0005115147796459496, + "loss_xval": 0.00124359130859375, + "num_input_tokens_seen": 74730112, + "step": 1081 + }, + { + "epoch": 67.625, + "grad_norm": 2.5686066250891417, + "learning_rate": 5e-05, + "loss": 0.0016, + "num_input_tokens_seen": 74801792, + "step": 1082 + }, + { + "epoch": 67.625, + "loss": 0.0016136132180690765, + "loss_ce": 0.0005531273782253265, + "loss_xval": 0.00106048583984375, + "num_input_tokens_seen": 74801792, + "step": 1082 + }, + { + "epoch": 67.6875, + "grad_norm": 2.109128772753182, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_input_tokens_seen": 74860992, + "step": 1083 + }, + { + "epoch": 67.6875, + "loss": 0.001649228623136878, + "loss_ce": 0.0005505957524292171, + "loss_xval": 0.0010986328125, + "num_input_tokens_seen": 74860992, + "step": 1083 + }, + { + "epoch": 67.75, + "grad_norm": 2.3087502620833376, + "learning_rate": 5e-05, + "loss": 0.0016, + "num_input_tokens_seen": 74932672, + "step": 1084 + }, + { + "epoch": 67.75, + "loss": 0.0018915177788585424, + "loss_ce": 0.0005258561577647924, + "loss_xval": 0.00136566162109375, + "num_input_tokens_seen": 74932672, + "step": 1084 + }, + { + "epoch": 67.8125, + "grad_norm": 1.606598249894243, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 75004352, + "step": 1085 + }, + { + "epoch": 67.8125, + "loss": 0.001132636098191142, + "loss_ce": 0.000495581713039428, + "loss_xval": 0.000637054443359375, + "num_input_tokens_seen": 75004352, + "step": 1085 + }, + { + "epoch": 67.875, + "grad_norm": 2.3263075112685443, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 75075904, + "step": 1086 + }, + { + "epoch": 67.875, + "loss": 0.0011369904968887568, + "loss_ce": 0.0005418977816589177, + "loss_xval": 0.0005950927734375, + "num_input_tokens_seen": 75075904, + "step": 1086 + }, + { + "epoch": 67.9375, + "grad_norm": 1.9644424594312697, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 75147520, + "step": 1087 + }, + { + "epoch": 67.9375, + "loss": 0.001179728889837861, + "loss_ce": 0.00047400989569723606, + "loss_xval": 0.000705718994140625, + "num_input_tokens_seen": 75147520, + "step": 1087 + }, + { + "epoch": 68.0, + "grad_norm": 4.111860002263121, + "learning_rate": 5e-05, + "loss": 0.002, + "num_input_tokens_seen": 75219136, + "step": 1088 + }, + { + "epoch": 68.0, + "loss": 0.002233484061434865, + "loss_ce": 0.00047109383740462363, + "loss_xval": 0.00176239013671875, + "num_input_tokens_seen": 75219136, + "step": 1088 + }, + { + "epoch": 68.0625, + "grad_norm": 0.582760051076, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 75290816, + "step": 1089 + }, + { + "epoch": 68.0625, + "loss": 0.0007814638083800673, + "loss_ce": 0.0004781954048667103, + "loss_xval": 0.0003032684326171875, + "num_input_tokens_seen": 75290816, + "step": 1089 + }, + { + "epoch": 68.125, + "grad_norm": 3.3103458752612496, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 75362624, + "step": 1090 + }, + { + "epoch": 68.125, + "loss": 0.0014409287832677364, + "loss_ce": 0.00044910749420523643, + "loss_xval": 0.0009918212890625, + "num_input_tokens_seen": 75362624, + "step": 1090 + }, + { + "epoch": 68.1875, + "grad_norm": 2.086643414735248, + "learning_rate": 5e-05, + "loss": 0.0011, + 
"num_input_tokens_seen": 75434432, + "step": 1091 + }, + { + "epoch": 68.1875, + "loss": 0.0010076756589114666, + "loss_ce": 0.00042021225090138614, + "loss_xval": 0.00058746337890625, + "num_input_tokens_seen": 75434432, + "step": 1091 + }, + { + "epoch": 68.25, + "grad_norm": 2.022816660904633, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_input_tokens_seen": 75506048, + "step": 1092 + }, + { + "epoch": 68.25, + "loss": 0.0015586587833240628, + "loss_ce": 0.0004447671817615628, + "loss_xval": 0.0011138916015625, + "num_input_tokens_seen": 75506048, + "step": 1092 + }, + { + "epoch": 68.3125, + "grad_norm": 3.020208647060878, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_input_tokens_seen": 75577600, + "step": 1093 + }, + { + "epoch": 68.3125, + "loss": 0.0018930145306512713, + "loss_ce": 0.00043580017518252134, + "loss_xval": 0.00145721435546875, + "num_input_tokens_seen": 75577600, + "step": 1093 + }, + { + "epoch": 68.375, + "grad_norm": 0.3506489901589061, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 75649216, + "step": 1094 + }, + { + "epoch": 68.375, + "loss": 0.0009144733194261789, + "loss_ce": 0.0004109333094675094, + "loss_xval": 0.0005035400390625, + "num_input_tokens_seen": 75649216, + "step": 1094 + }, + { + "epoch": 68.4375, + "grad_norm": 2.0988202553666055, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 75720832, + "step": 1095 + }, + { + "epoch": 68.4375, + "loss": 0.0012659782078117132, + "loss_ce": 0.00041530077578499913, + "loss_xval": 0.000850677490234375, + "num_input_tokens_seen": 75720832, + "step": 1095 + }, + { + "epoch": 68.5, + "grad_norm": 1.105888327952167, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 75792448, + "step": 1096 + }, + { + "epoch": 68.5, + "loss": 0.0007757066050544381, + "loss_ce": 0.00043047647341154516, + "loss_xval": 0.0003452301025390625, + "num_input_tokens_seen": 75792448, + "step": 1096 + }, + { + "epoch": 68.5625, + "grad_norm": 1.9460696134458952, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_input_tokens_seen": 75864256, + "step": 1097 + }, + { + "epoch": 68.5625, + "loss": 0.0008472871268168092, + "loss_ce": 0.0004124116094317287, + "loss_xval": 0.00043487548828125, + "num_input_tokens_seen": 75864256, + "step": 1097 + }, + { + "epoch": 68.625, + "grad_norm": 2.96760282304176, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 75923392, + "step": 1098 + }, + { + "epoch": 68.625, + "loss": 0.0010627032024785876, + "loss_ce": 0.0003875018155667931, + "loss_xval": 0.000675201416015625, + "num_input_tokens_seen": 75923392, + "step": 1098 + }, + { + "epoch": 68.6875, + "grad_norm": 0.43332427108493793, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 75982464, + "step": 1099 + }, + { + "epoch": 68.6875, + "loss": 0.0010092169977724552, + "loss_ce": 0.00037216261262074113, + "loss_xval": 0.000637054443359375, + "num_input_tokens_seen": 75982464, + "step": 1099 + }, + { + "epoch": 68.75, + "grad_norm": 2.2587393753504292, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 76054080, + "step": 1100 + }, + { + "epoch": 68.75, + "loss": 0.0009440279682166874, + "loss_ce": 0.00037563807563856244, + "loss_xval": 0.000568389892578125, + "num_input_tokens_seen": 76054080, + "step": 1100 + }, + { + "epoch": 68.8125, + "grad_norm": 1.448401941251065, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 76125760, + "step": 1101 + }, + { + "epoch": 68.8125, + "loss": 
0.0006880094297230244, + "loss_ce": 0.0003752042830456048, + "loss_xval": 0.00031280517578125, + "num_input_tokens_seen": 76125760, + "step": 1101 + }, + { + "epoch": 68.875, + "grad_norm": 1.0144090906009593, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 76197376, + "step": 1102 + }, + { + "epoch": 68.875, + "loss": 0.0008232088293880224, + "loss_ce": 0.00038261126610450447, + "loss_xval": 0.0004405975341796875, + "num_input_tokens_seen": 76197376, + "step": 1102 + }, + { + "epoch": 68.9375, + "grad_norm": 1.502076433418958, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 76268992, + "step": 1103 + }, + { + "epoch": 68.9375, + "loss": 0.0007593849441036582, + "loss_ce": 0.00036647109664045274, + "loss_xval": 0.000392913818359375, + "num_input_tokens_seen": 76268992, + "step": 1103 + }, + { + "epoch": 69.0, + "grad_norm": 0.2226085749394477, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 76340608, + "step": 1104 + }, + { + "epoch": 69.0, + "loss": 0.0006789171602576971, + "loss_ce": 0.0003718340303748846, + "loss_xval": 0.0003070831298828125, + "num_input_tokens_seen": 76340608, + "step": 1104 + }, + { + "epoch": 69.0625, + "grad_norm": 1.1949850171733325, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 76412288, + "step": 1105 + }, + { + "epoch": 69.0625, + "loss": 0.0007067075930535793, + "loss_ce": 0.0003652921586763114, + "loss_xval": 0.0003414154052734375, + "num_input_tokens_seen": 76412288, + "step": 1105 + }, + { + "epoch": 69.125, + "grad_norm": 0.22225027753658388, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 76483840, + "step": 1106 + }, + { + "epoch": 69.125, + "loss": 0.000622923020273447, + "loss_ce": 0.00034826481714844704, + "loss_xval": 0.000274658203125, + "num_input_tokens_seen": 76483840, + "step": 1106 + }, + { + "epoch": 69.1875, + "grad_norm": 1.6311751923551645, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 76555456, + "step": 1107 + }, + { + "epoch": 69.1875, + "loss": 0.000752802356146276, + "loss_ce": 0.00034844447509385645, + "loss_xval": 0.00040435791015625, + "num_input_tokens_seen": 76555456, + "step": 1107 + }, + { + "epoch": 69.25, + "grad_norm": 0.90413859652661, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 76614656, + "step": 1108 + }, + { + "epoch": 69.25, + "loss": 0.0005864095874130726, + "loss_ce": 0.00032701014424674213, + "loss_xval": 0.0002593994140625, + "num_input_tokens_seen": 76614656, + "step": 1108 + }, + { + "epoch": 69.3125, + "grad_norm": 0.8893801851428039, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 76686208, + "step": 1109 + }, + { + "epoch": 69.3125, + "loss": 0.0005970131605863571, + "loss_ce": 0.0003452431410551071, + "loss_xval": 0.00025177001953125, + "num_input_tokens_seen": 76686208, + "step": 1109 + }, + { + "epoch": 69.375, + "grad_norm": 1.8984967276637597, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 76745280, + "step": 1110 + }, + { + "epoch": 69.375, + "loss": 0.0007707853801548481, + "loss_ce": 0.00030920698191039264, + "loss_xval": 0.000461578369140625, + "num_input_tokens_seen": 76745280, + "step": 1110 + }, + { + "epoch": 69.4375, + "grad_norm": 1.1870451841541478, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 76816832, + "step": 1111 + }, + { + "epoch": 69.4375, + "loss": 0.0005835613701492548, + "loss_ce": 0.00032034722971729934, + "loss_xval": 0.000263214111328125, + 
"num_input_tokens_seen": 76816832, + "step": 1111 + }, + { + "epoch": 69.5, + "grad_norm": 0.9232265195058813, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 76888384, + "step": 1112 + }, + { + "epoch": 69.5, + "loss": 0.0006464470061473548, + "loss_ce": 0.00030503160087391734, + "loss_xval": 0.0003414154052734375, + "num_input_tokens_seen": 76888384, + "step": 1112 + }, + { + "epoch": 69.5625, + "grad_norm": 2.1241217130665517, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 76960064, + "step": 1113 + }, + { + "epoch": 69.5625, + "loss": 0.0007790784584358335, + "loss_ce": 0.00030987069476395845, + "loss_xval": 0.000469207763671875, + "num_input_tokens_seen": 76960064, + "step": 1113 + }, + { + "epoch": 69.625, + "grad_norm": 0.5368007405442083, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 77031616, + "step": 1114 + }, + { + "epoch": 69.625, + "loss": 0.0010094753233715892, + "loss_ce": 0.0003037563001271337, + "loss_xval": 0.000705718994140625, + "num_input_tokens_seen": 77031616, + "step": 1114 + }, + { + "epoch": 69.6875, + "grad_norm": 2.8898095165154576, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 77090688, + "step": 1115 + }, + { + "epoch": 69.6875, + "loss": 0.0013308109482750297, + "loss_ce": 0.00029321329202502966, + "loss_xval": 0.00103759765625, + "num_input_tokens_seen": 77090688, + "step": 1115 + }, + { + "epoch": 69.75, + "grad_norm": 4.702732081058997, + "learning_rate": 5e-05, + "loss": 0.0019, + "num_input_tokens_seen": 77149888, + "step": 1116 + }, + { + "epoch": 69.75, + "loss": 0.0017137245740741491, + "loss_ce": 0.0002946572203654796, + "loss_xval": 0.0014190673828125, + "num_input_tokens_seen": 77149888, + "step": 1116 + }, + { + "epoch": 69.8125, + "grad_norm": 3.5802036236691444, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 77221440, + "step": 1117 + }, + { + "epoch": 69.8125, + "loss": 0.0014380008215084672, + "loss_ce": 0.0002859622472897172, + "loss_xval": 0.00115203857421875, + "num_input_tokens_seen": 77221440, + "step": 1117 + }, + { + "epoch": 69.875, + "grad_norm": 0.6476686940933929, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 77293120, + "step": 1118 + }, + { + "epoch": 69.875, + "loss": 0.0006478414870798588, + "loss_ce": 0.00029116732184775174, + "loss_xval": 0.0003566741943359375, + "num_input_tokens_seen": 77293120, + "step": 1118 + }, + { + "epoch": 69.9375, + "grad_norm": 2.1153515585154823, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 77364736, + "step": 1119 + }, + { + "epoch": 69.9375, + "loss": 0.000696200062520802, + "loss_ce": 0.00028612007736228406, + "loss_xval": 0.0004100799560546875, + "num_input_tokens_seen": 77364736, + "step": 1119 + }, + { + "epoch": 70.0, + "grad_norm": 2.784751995439613, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 77423936, + "step": 1120 + }, + { + "epoch": 70.0, + "loss": 0.000890074297785759, + "loss_ce": 0.00026446394622325897, + "loss_xval": 0.0006256103515625, + "num_input_tokens_seen": 77423936, + "step": 1120 + }, + { + "epoch": 70.0625, + "grad_norm": 1.5326245030717471, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 77495616, + "step": 1121 + }, + { + "epoch": 70.0625, + "loss": 0.0007288586348295212, + "loss_ce": 0.0002653728879522532, + "loss_xval": 0.0004634857177734375, + "num_input_tokens_seen": 77495616, + "step": 1121 + }, + { + "epoch": 70.125, + "grad_norm": 
0.09610609543339954, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 77567232, + "step": 1122 + }, + { + "epoch": 70.125, + "loss": 0.0005251829279586673, + "loss_ce": 0.0002562467707321048, + "loss_xval": 0.0002689361572265625, + "num_input_tokens_seen": 77567232, + "step": 1122 + }, + { + "epoch": 70.1875, + "grad_norm": 0.746399639745206, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 77638784, + "step": 1123 + }, + { + "epoch": 70.1875, + "loss": 0.00044554518535733223, + "loss_ce": 0.00025862501934170723, + "loss_xval": 0.000186920166015625, + "num_input_tokens_seen": 77638784, + "step": 1123 + }, + { + "epoch": 70.25, + "grad_norm": 0.5062431402486854, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 77710400, + "step": 1124 + }, + { + "epoch": 70.25, + "loss": 0.0004995564231649041, + "loss_ce": 0.0002477864036336541, + "loss_xval": 0.00025177001953125, + "num_input_tokens_seen": 77710400, + "step": 1124 + }, + { + "epoch": 70.3125, + "grad_norm": 0.7834653348132636, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 77781952, + "step": 1125 + }, + { + "epoch": 70.3125, + "loss": 0.0007250700728036463, + "loss_ce": 0.0002558623091317713, + "loss_xval": 0.000469207763671875, + "num_input_tokens_seen": 77781952, + "step": 1125 + }, + { + "epoch": 70.375, + "grad_norm": 2.6329741353793525, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 77841024, + "step": 1126 + }, + { + "epoch": 70.375, + "loss": 0.0008808316779322922, + "loss_ce": 0.00023996253730729222, + "loss_xval": 0.000640869140625, + "num_input_tokens_seen": 77841024, + "step": 1126 + }, + { + "epoch": 70.4375, + "grad_norm": 3.810101421457146, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 77912704, + "step": 1127 + }, + { + "epoch": 70.4375, + "loss": 0.001295228023082018, + "loss_ce": 0.00025763033772818744, + "loss_xval": 0.00103759765625, + "num_input_tokens_seen": 77912704, + "step": 1127 + }, + { + "epoch": 70.5, + "grad_norm": 3.3135496242953315, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 77984256, + "step": 1128 + }, + { + "epoch": 70.5, + "loss": 0.0012748048175126314, + "loss_ce": 0.0002524659794289619, + "loss_xval": 0.0010223388671875, + "num_input_tokens_seen": 77984256, + "step": 1128 + }, + { + "epoch": 70.5625, + "grad_norm": 1.2567102717320116, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 78043392, + "step": 1129 + }, + { + "epoch": 70.5625, + "loss": 0.0005960181588307023, + "loss_ce": 0.00024697332992218435, + "loss_xval": 0.0003490447998046875, + "num_input_tokens_seen": 78043392, + "step": 1129 + }, + { + "epoch": 70.625, + "grad_norm": 1.5373568630363612, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 78115200, + "step": 1130 + }, + { + "epoch": 70.625, + "loss": 0.0009142133640125394, + "loss_ce": 0.00024282665981445462, + "loss_xval": 0.00067138671875, + "num_input_tokens_seen": 78115200, + "step": 1130 + }, + { + "epoch": 70.6875, + "grad_norm": 3.3277541930648953, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 78161792, + "step": 1131 + }, + { + "epoch": 70.6875, + "loss": 0.001277487725019455, + "loss_ce": 0.0002170019142795354, + "loss_xval": 0.00106048583984375, + "num_input_tokens_seen": 78161792, + "step": 1131 + }, + { + "epoch": 70.75, + "grad_norm": 3.077346914693892, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 78220864, + "step": 
1132 + }, + { + "epoch": 70.75, + "loss": 0.0009054588153958321, + "loss_ce": 0.00022644268756266683, + "loss_xval": 0.00067901611328125, + "num_input_tokens_seen": 78220864, + "step": 1132 + }, + { + "epoch": 70.8125, + "grad_norm": 1.3757736730193164, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 78292672, + "step": 1133 + }, + { + "epoch": 70.8125, + "loss": 0.0008023668196983635, + "loss_ce": 0.00023016221530269831, + "loss_xval": 0.00057220458984375, + "num_input_tokens_seen": 78292672, + "step": 1133 + }, + { + "epoch": 70.875, + "grad_norm": 0.6192579612834873, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 78364352, + "step": 1134 + }, + { + "epoch": 70.875, + "loss": 0.0005293790600262582, + "loss_ce": 0.0002222959155915305, + "loss_xval": 0.0003070831298828125, + "num_input_tokens_seen": 78364352, + "step": 1134 + }, + { + "epoch": 70.9375, + "grad_norm": 1.7480415437279953, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 78435904, + "step": 1135 + }, + { + "epoch": 70.9375, + "loss": 0.0006328938179649413, + "loss_ce": 0.00021327711874619126, + "loss_xval": 0.00041961669921875, + "num_input_tokens_seen": 78435904, + "step": 1135 + }, + { + "epoch": 71.0, + "grad_norm": 2.4914839158123705, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 78507520, + "step": 1136 + }, + { + "epoch": 71.0, + "loss": 0.0007470841519534588, + "loss_ce": 0.00022828533838037401, + "loss_xval": 0.000518798828125, + "num_input_tokens_seen": 78507520, + "step": 1136 + }, + { + "epoch": 71.0625, + "grad_norm": 3.127032348632522, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 78579264, + "step": 1137 + }, + { + "epoch": 71.0625, + "loss": 0.0009275174816139042, + "loss_ce": 0.000217983775655739, + "loss_xval": 0.00070953369140625, + "num_input_tokens_seen": 78579264, + "step": 1137 + }, + { + "epoch": 71.125, + "grad_norm": 3.296218627743644, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 78650880, + "step": 1138 + }, + { + "epoch": 71.125, + "loss": 0.0013986143749207258, + "loss_ce": 0.00021605828078463674, + "loss_xval": 0.00118255615234375, + "num_input_tokens_seen": 78650880, + "step": 1138 + }, + { + "epoch": 71.1875, + "grad_norm": 3.2523286688234703, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 78722624, + "step": 1139 + }, + { + "epoch": 71.1875, + "loss": 0.0010947894770652056, + "loss_ce": 0.00020977971144020557, + "loss_xval": 0.000885009765625, + "num_input_tokens_seen": 78722624, + "step": 1139 + }, + { + "epoch": 71.25, + "grad_norm": 2.8340559225912716, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 78794240, + "step": 1140 + }, + { + "epoch": 71.25, + "loss": 0.0009315769420936704, + "loss_ce": 0.0001991550379898399, + "loss_xval": 0.000732421875, + "num_input_tokens_seen": 78794240, + "step": 1140 + }, + { + "epoch": 71.3125, + "grad_norm": 2.1018467475347307, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 78866048, + "step": 1141 + }, + { + "epoch": 71.3125, + "loss": 0.0006598670734092593, + "loss_ce": 0.00020210337243042886, + "loss_xval": 0.000457763671875, + "num_input_tokens_seen": 78866048, + "step": 1141 + }, + { + "epoch": 71.375, + "grad_norm": 1.585719690902742, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 78925056, + "step": 1142 + }, + { + "epoch": 71.375, + "loss": 0.0005244898493401706, + "loss_ce": 0.00018688915588427335, + "loss_xval": 
0.0003376007080078125, + "num_input_tokens_seen": 78925056, + "step": 1142 + }, + { + "epoch": 71.4375, + "grad_norm": 1.5981146071845063, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 78996608, + "step": 1143 + }, + { + "epoch": 71.4375, + "loss": 0.0005193843389861286, + "loss_ce": 0.00019704240548890084, + "loss_xval": 0.0003223419189453125, + "num_input_tokens_seen": 78996608, + "step": 1143 + }, + { + "epoch": 71.5, + "grad_norm": 2.743565029475743, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 79068288, + "step": 1144 + }, + { + "epoch": 71.5, + "loss": 0.0009858801495283842, + "loss_ce": 0.00020005246915388852, + "loss_xval": 0.00078582763671875, + "num_input_tokens_seen": 79068288, + "step": 1144 + }, + { + "epoch": 71.5625, + "grad_norm": 4.49463450679341, + "learning_rate": 5e-05, + "loss": 0.0016, + "num_input_tokens_seen": 79127488, + "step": 1145 + }, + { + "epoch": 71.5625, + "loss": 0.0016861867625266314, + "loss_ce": 0.00019845488714054227, + "loss_xval": 0.00148773193359375, + "num_input_tokens_seen": 79127488, + "step": 1145 + }, + { + "epoch": 71.625, + "grad_norm": 6.002361391410423, + "learning_rate": 5e-05, + "loss": 0.0028, + "num_input_tokens_seen": 79199104, + "step": 1146 + }, + { + "epoch": 71.625, + "loss": 0.0026474776677787304, + "loss_ce": 0.00019081255595665425, + "loss_xval": 0.0024566650390625, + "num_input_tokens_seen": 79199104, + "step": 1146 + }, + { + "epoch": 71.6875, + "grad_norm": 7.37613968419637, + "learning_rate": 5e-05, + "loss": 0.0039, + "num_input_tokens_seen": 79270784, + "step": 1147 + }, + { + "epoch": 71.6875, + "loss": 0.004187567625194788, + "loss_ce": 0.00018976477440446615, + "loss_xval": 0.003997802734375, + "num_input_tokens_seen": 79270784, + "step": 1147 + }, + { + "epoch": 71.75, + "grad_norm": 9.433800009175698, + "learning_rate": 5e-05, + "loss": 0.006, + "num_input_tokens_seen": 79329984, + "step": 1148 + }, + { + "epoch": 71.75, + "loss": 0.0059251803904771805, + "loss_ce": 0.00018787590670399368, + "loss_xval": 0.0057373046875, + "num_input_tokens_seen": 79329984, + "step": 1148 + }, + { + "epoch": 71.8125, + "grad_norm": 13.554320713358411, + "learning_rate": 5e-05, + "loss": 0.0121, + "num_input_tokens_seen": 79401664, + "step": 1149 + }, + { + "epoch": 71.8125, + "loss": 0.012463638558983803, + "loss_ce": 0.00019557222549337894, + "loss_xval": 0.01226806640625, + "num_input_tokens_seen": 79401664, + "step": 1149 + }, + { + "epoch": 71.875, + "grad_norm": 20.450152288934543, + "learning_rate": 5e-05, + "loss": 0.0273, + "num_input_tokens_seen": 79460672, + "step": 1150 + }, + { + "epoch": 71.875, + "loss": 0.02654273435473442, + "loss_ce": 0.00017554643272887915, + "loss_xval": 0.0263671875, + "num_input_tokens_seen": 79460672, + "step": 1150 + }, + { + "epoch": 71.9375, + "grad_norm": 30.27927271328012, + "learning_rate": 5e-05, + "loss": 0.0601, + "num_input_tokens_seen": 79532416, + "step": 1151 + }, + { + "epoch": 71.9375, + "loss": 0.05904359370470047, + "loss_ce": 0.0002057042729575187, + "loss_xval": 0.058837890625, + "num_input_tokens_seen": 79532416, + "step": 1151 + }, + { + "epoch": 72.0, + "grad_norm": 42.395333437272235, + "learning_rate": 5e-05, + "loss": 0.1191, + "num_input_tokens_seen": 79603968, + "step": 1152 + }, + { + "epoch": 72.0, + "loss": 0.11884753406047821, + "loss_ce": 0.00019518662884365767, + "loss_xval": 0.11865234375, + "num_input_tokens_seen": 79603968, + "step": 1152 + }, + { + "epoch": 72.0625, + "grad_norm": 51.66495567019675, + 
"learning_rate": 5e-05, + "loss": 0.1765, + "num_input_tokens_seen": 79675648, + "step": 1153 + }, + { + "epoch": 72.0625, + "loss": 0.17893804609775543, + "loss_ce": 0.0002271079138154164, + "loss_xval": 0.1787109375, + "num_input_tokens_seen": 79675648, + "step": 1153 + }, + { + "epoch": 72.125, + "grad_norm": 45.35518512350621, + "learning_rate": 5e-05, + "loss": 0.1428, + "num_input_tokens_seen": 79747328, + "step": 1154 + }, + { + "epoch": 72.125, + "loss": 0.1486750841140747, + "loss_ce": 0.00023758277529850602, + "loss_xval": 0.1484375, + "num_input_tokens_seen": 79747328, + "step": 1154 + }, + { + "epoch": 72.1875, + "grad_norm": 19.70199552558819, + "learning_rate": 5e-05, + "loss": 0.0293, + "num_input_tokens_seen": 79806400, + "step": 1155 + }, + { + "epoch": 72.1875, + "loss": 0.028222400695085526, + "loss_ce": 0.0002682995982468128, + "loss_xval": 0.0279541015625, + "num_input_tokens_seen": 79806400, + "step": 1155 + }, + { + "epoch": 72.25, + "grad_norm": 13.444270932502397, + "learning_rate": 5e-05, + "loss": 0.0167, + "num_input_tokens_seen": 79878016, + "step": 1156 + }, + { + "epoch": 72.25, + "loss": 0.015745338052511215, + "loss_ce": 0.0003034429391846061, + "loss_xval": 0.01544189453125, + "num_input_tokens_seen": 79878016, + "step": 1156 + }, + { + "epoch": 72.3125, + "grad_norm": 34.97534035039033, + "learning_rate": 5e-05, + "loss": 0.0904, + "num_input_tokens_seen": 79949760, + "step": 1157 + }, + { + "epoch": 72.3125, + "loss": 0.09801744669675827, + "loss_ce": 0.00036119777359999716, + "loss_xval": 0.09765625, + "num_input_tokens_seen": 79949760, + "step": 1157 + }, + { + "epoch": 72.375, + "grad_norm": 29.571068751807424, + "learning_rate": 5e-05, + "loss": 0.0668, + "num_input_tokens_seen": 80021312, + "step": 1158 + }, + { + "epoch": 72.375, + "loss": 0.06681597232818604, + "loss_ce": 0.0004097215423826128, + "loss_xval": 0.06640625, + "num_input_tokens_seen": 80021312, + "step": 1158 + }, + { + "epoch": 72.4375, + "grad_norm": 3.12352961523283, + "learning_rate": 5e-05, + "loss": 0.009, + "num_input_tokens_seen": 80092992, + "step": 1159 + }, + { + "epoch": 72.4375, + "loss": 0.00865169707685709, + "loss_ce": 0.00047298570279963315, + "loss_xval": 0.0081787109375, + "num_input_tokens_seen": 80092992, + "step": 1159 + }, + { + "epoch": 72.5, + "grad_norm": 22.360989695703942, + "learning_rate": 5e-05, + "loss": 0.0462, + "num_input_tokens_seen": 80152000, + "step": 1160 + }, + { + "epoch": 72.5, + "loss": 0.04107421636581421, + "loss_ce": 0.0005468718591146171, + "loss_xval": 0.04052734375, + "num_input_tokens_seen": 80152000, + "step": 1160 + }, + { + "epoch": 72.5625, + "grad_norm": 27.510295556022815, + "learning_rate": 5e-05, + "loss": 0.0605, + "num_input_tokens_seen": 80211200, + "step": 1161 + }, + { + "epoch": 72.5625, + "loss": 0.062126122415065765, + "loss_ce": 0.0006026857881806791, + "loss_xval": 0.0615234375, + "num_input_tokens_seen": 80211200, + "step": 1161 + }, + { + "epoch": 72.625, + "grad_norm": 10.877303396663827, + "learning_rate": 5e-05, + "loss": 0.013, + "num_input_tokens_seen": 80282816, + "step": 1162 + }, + { + "epoch": 72.625, + "loss": 0.012666767463088036, + "loss_ce": 0.0007649122853763402, + "loss_xval": 0.01190185546875, + "num_input_tokens_seen": 80282816, + "step": 1162 + }, + { + "epoch": 72.6875, + "grad_norm": 11.656950051709288, + "learning_rate": 5e-05, + "loss": 0.0151, + "num_input_tokens_seen": 80354560, + "step": 1163 + }, + { + "epoch": 72.6875, + "loss": 0.01623488776385784, + "loss_ce": 0.000609888113103807, + 
"loss_xval": 0.015625, + "num_input_tokens_seen": 80354560, + "step": 1163 + }, + { + "epoch": 72.75, + "grad_norm": 20.38091104817712, + "learning_rate": 5e-05, + "loss": 0.0376, + "num_input_tokens_seen": 80426112, + "step": 1164 + }, + { + "epoch": 72.75, + "loss": 0.041903380304574966, + "loss_ce": 0.0006436131079681218, + "loss_xval": 0.041259765625, + "num_input_tokens_seen": 80426112, + "step": 1164 + }, + { + "epoch": 72.8125, + "grad_norm": 11.945638795307774, + "learning_rate": 5e-05, + "loss": 0.0157, + "num_input_tokens_seen": 80485184, + "step": 1165 + }, + { + "epoch": 72.8125, + "loss": 0.01640426367521286, + "loss_ce": 0.0006571935955435038, + "loss_xval": 0.0157470703125, + "num_input_tokens_seen": 80485184, + "step": 1165 + }, + { + "epoch": 72.875, + "grad_norm": 3.9872152859369683, + "learning_rate": 5e-05, + "loss": 0.0044, + "num_input_tokens_seen": 80544256, + "step": 1166 + }, + { + "epoch": 72.875, + "loss": 0.004912215750664473, + "loss_ce": 0.0007007899112068117, + "loss_xval": 0.00421142578125, + "num_input_tokens_seen": 80544256, + "step": 1166 + }, + { + "epoch": 72.9375, + "grad_norm": 13.697934139156507, + "learning_rate": 5e-05, + "loss": 0.0184, + "num_input_tokens_seen": 80615936, + "step": 1167 + }, + { + "epoch": 72.9375, + "loss": 0.0189347043633461, + "loss_ce": 0.0007462279172614217, + "loss_xval": 0.0181884765625, + "num_input_tokens_seen": 80615936, + "step": 1167 + }, + { + "epoch": 73.0, + "grad_norm": 12.216265019337566, + "learning_rate": 5e-05, + "loss": 0.0165, + "num_input_tokens_seen": 80687616, + "step": 1168 + }, + { + "epoch": 73.0, + "loss": 0.01614932343363762, + "loss_ce": 0.0007684631855227053, + "loss_xval": 0.015380859375, + "num_input_tokens_seen": 80687616, + "step": 1168 + }, + { + "epoch": 73.0625, + "grad_norm": 2.0683251647835994, + "learning_rate": 5e-05, + "loss": 0.0037, + "num_input_tokens_seen": 80759296, + "step": 1169 + }, + { + "epoch": 73.0625, + "loss": 0.0035584524739533663, + "loss_ce": 0.0007966115954332054, + "loss_xval": 0.0027618408203125, + "num_input_tokens_seen": 80759296, + "step": 1169 + }, + { + "epoch": 73.125, + "grad_norm": 8.472099059482757, + "learning_rate": 5e-05, + "loss": 0.0093, + "num_input_tokens_seen": 80830912, + "step": 1170 + }, + { + "epoch": 73.125, + "loss": 0.00800013355910778, + "loss_ce": 0.000797984772361815, + "loss_xval": 0.0072021484375, + "num_input_tokens_seen": 80830912, + "step": 1170 + }, + { + "epoch": 73.1875, + "grad_norm": 10.998131460264046, + "learning_rate": 5e-05, + "loss": 0.0119, + "num_input_tokens_seen": 80902656, + "step": 1171 + }, + { + "epoch": 73.1875, + "loss": 0.0125557417050004, + "loss_ce": 0.0008369915885850787, + "loss_xval": 0.01171875, + "num_input_tokens_seen": 80902656, + "step": 1171 + }, + { + "epoch": 73.25, + "grad_norm": 4.3344978035814625, + "learning_rate": 5e-05, + "loss": 0.004, + "num_input_tokens_seen": 80961792, + "step": 1172 + }, + { + "epoch": 73.25, + "loss": 0.003933137748390436, + "loss_ce": 0.0008356033940799534, + "loss_xval": 0.0030975341796875, + "num_input_tokens_seen": 80961792, + "step": 1172 + }, + { + "epoch": 73.3125, + "grad_norm": 6.32022394282333, + "learning_rate": 5e-05, + "loss": 0.007, + "num_input_tokens_seen": 81033344, + "step": 1173 + }, + { + "epoch": 73.3125, + "loss": 0.006512059364467859, + "loss_ce": 0.0008052723133005202, + "loss_xval": 0.005706787109375, + "num_input_tokens_seen": 81033344, + "step": 1173 + }, + { + "epoch": 73.375, + "grad_norm": 13.37202980190419, + "learning_rate": 5e-05, + "loss": 
0.0169, + "num_input_tokens_seen": 81105024, + "step": 1174 + }, + { + "epoch": 73.375, + "loss": 0.018025677651166916, + "loss_ce": 0.0008137636468745768, + "loss_xval": 0.0172119140625, + "num_input_tokens_seen": 81105024, + "step": 1174 + }, + { + "epoch": 73.4375, + "grad_norm": 10.779752378641776, + "learning_rate": 5e-05, + "loss": 0.0118, + "num_input_tokens_seen": 81176640, + "step": 1175 + }, + { + "epoch": 73.4375, + "loss": 0.01216103881597519, + "loss_ce": 0.0008085001609288156, + "loss_xval": 0.0113525390625, + "num_input_tokens_seen": 81176640, + "step": 1175 + }, + { + "epoch": 73.5, + "grad_norm": 0.7641412974344073, + "learning_rate": 5e-05, + "loss": 0.0022, + "num_input_tokens_seen": 81235904, + "step": 1176 + }, + { + "epoch": 73.5, + "loss": 0.0018806563457474113, + "loss_ce": 0.0007896529277786613, + "loss_xval": 0.00109100341796875, + "num_input_tokens_seen": 81235904, + "step": 1176 + }, + { + "epoch": 73.5625, + "grad_norm": 9.594310910818406, + "learning_rate": 5e-05, + "loss": 0.0093, + "num_input_tokens_seen": 81307520, + "step": 1177 + }, + { + "epoch": 73.5625, + "loss": 0.009962813928723335, + "loss_ce": 0.0008075400837697089, + "loss_xval": 0.0091552734375, + "num_input_tokens_seen": 81307520, + "step": 1177 + }, + { + "epoch": 73.625, + "grad_norm": 13.113512591536526, + "learning_rate": 5e-05, + "loss": 0.0151, + "num_input_tokens_seen": 81379264, + "step": 1178 + }, + { + "epoch": 73.625, + "loss": 0.015093508176505566, + "loss_ce": 0.0008112816140055656, + "loss_xval": 0.0142822265625, + "num_input_tokens_seen": 81379264, + "step": 1178 + }, + { + "epoch": 73.6875, + "grad_norm": 8.21336511687051, + "learning_rate": 5e-05, + "loss": 0.007, + "num_input_tokens_seen": 81438464, + "step": 1179 + }, + { + "epoch": 73.6875, + "loss": 0.006963968276977539, + "loss_ce": 0.0007383823394775391, + "loss_xval": 0.0062255859375, + "num_input_tokens_seen": 81438464, + "step": 1179 + }, + { + "epoch": 73.75, + "grad_norm": 1.3305949124151968, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 81497600, + "step": 1180 + }, + { + "epoch": 73.75, + "loss": 0.0021093811374157667, + "loss_ce": 0.0006826843600720167, + "loss_xval": 0.00142669677734375, + "num_input_tokens_seen": 81497600, + "step": 1180 + }, + { + "epoch": 73.8125, + "grad_norm": 10.757753909600044, + "learning_rate": 5e-05, + "loss": 0.0106, + "num_input_tokens_seen": 81569280, + "step": 1181 + }, + { + "epoch": 73.8125, + "loss": 0.010827873833477497, + "loss_ce": 0.0006960381288081408, + "loss_xval": 0.0101318359375, + "num_input_tokens_seen": 81569280, + "step": 1181 + }, + { + "epoch": 73.875, + "grad_norm": 14.785007365776059, + "learning_rate": 5e-05, + "loss": 0.0185, + "num_input_tokens_seen": 81640832, + "step": 1182 + }, + { + "epoch": 73.875, + "loss": 0.01923806220293045, + "loss_ce": 0.0006833748193457723, + "loss_xval": 0.0185546875, + "num_input_tokens_seen": 81640832, + "step": 1182 + }, + { + "epoch": 73.9375, + "grad_norm": 12.13125384920187, + "learning_rate": 5e-05, + "loss": 0.0128, + "num_input_tokens_seen": 81712640, + "step": 1183 + }, + { + "epoch": 73.9375, + "loss": 0.012981466948986053, + "loss_ce": 0.0006523649790324271, + "loss_xval": 0.0123291015625, + "num_input_tokens_seen": 81712640, + "step": 1183 + }, + { + "epoch": 74.0, + "grad_norm": 5.163790877656547, + "learning_rate": 5e-05, + "loss": 0.0033, + "num_input_tokens_seen": 81784320, + "step": 1184 + }, + { + "epoch": 74.0, + "loss": 0.003778281854465604, + "loss_ce": 0.0006197125767357647, + 
"loss_xval": 0.0031585693359375, + "num_input_tokens_seen": 81784320, + "step": 1184 + }, + { + "epoch": 74.0625, + "grad_norm": 2.7271002962089423, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 81855872, + "step": 1185 + }, + { + "epoch": 74.0625, + "loss": 0.0015725651755928993, + "loss_ce": 0.0005960026173852384, + "loss_xval": 0.0009765625, + "num_input_tokens_seen": 81855872, + "step": 1185 + }, + { + "epoch": 74.125, + "grad_norm": 7.825771598203474, + "learning_rate": 5e-05, + "loss": 0.0063, + "num_input_tokens_seen": 81927488, + "step": 1186 + }, + { + "epoch": 74.125, + "loss": 0.006267584394663572, + "loss_ce": 0.0005913149216212332, + "loss_xval": 0.00567626953125, + "num_input_tokens_seen": 81927488, + "step": 1186 + }, + { + "epoch": 74.1875, + "grad_norm": 8.782250449852858, + "learning_rate": 5e-05, + "loss": 0.0072, + "num_input_tokens_seen": 81986496, + "step": 1187 + }, + { + "epoch": 74.1875, + "loss": 0.006883226800709963, + "loss_ce": 0.0005355706671252847, + "loss_xval": 0.00634765625, + "num_input_tokens_seen": 81986496, + "step": 1187 + }, + { + "epoch": 74.25, + "grad_norm": 6.2170072955900055, + "learning_rate": 5e-05, + "loss": 0.0041, + "num_input_tokens_seen": 82045760, + "step": 1188 + }, + { + "epoch": 74.25, + "loss": 0.004353879485279322, + "loss_ce": 0.0005239234305918217, + "loss_xval": 0.0038299560546875, + "num_input_tokens_seen": 82045760, + "step": 1188 + }, + { + "epoch": 74.3125, + "grad_norm": 1.7663809319742363, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 82117312, + "step": 1189 + }, + { + "epoch": 74.3125, + "loss": 0.0013176639331504703, + "loss_ce": 0.0005203922046348453, + "loss_xval": 0.000797271728515625, + "num_input_tokens_seen": 82117312, + "step": 1189 + }, + { + "epoch": 74.375, + "grad_norm": 2.6607146417355954, + "learning_rate": 5e-05, + "loss": 0.0016, + "num_input_tokens_seen": 82188992, + "step": 1190 + }, + { + "epoch": 74.375, + "loss": 0.0011832985328510404, + "loss_ce": 0.0004852089623454958, + "loss_xval": 0.000698089599609375, + "num_input_tokens_seen": 82188992, + "step": 1190 + }, + { + "epoch": 74.4375, + "grad_norm": 4.697353229448369, + "learning_rate": 5e-05, + "loss": 0.0031, + "num_input_tokens_seen": 82248128, + "step": 1191 + }, + { + "epoch": 74.4375, + "loss": 0.003033492248505354, + "loss_ce": 0.00045475686783902347, + "loss_xval": 0.0025787353515625, + "num_input_tokens_seen": 82248128, + "step": 1191 + }, + { + "epoch": 74.5, + "grad_norm": 3.393077012844604, + "learning_rate": 5e-05, + "loss": 0.0019, + "num_input_tokens_seen": 82319680, + "step": 1192 + }, + { + "epoch": 74.5, + "loss": 0.001984121510758996, + "loss_ce": 0.0004429837572388351, + "loss_xval": 0.0015411376953125, + "num_input_tokens_seen": 82319680, + "step": 1192 + }, + { + "epoch": 74.5625, + "grad_norm": 0.958210115408945, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 82378688, + "step": 1193 + }, + { + "epoch": 74.5625, + "loss": 0.001094202627427876, + "loss_ce": 0.00040374239324592054, + "loss_xval": 0.000690460205078125, + "num_input_tokens_seen": 82378688, + "step": 1193 + }, + { + "epoch": 74.625, + "grad_norm": 1.133347023469092, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 82450304, + "step": 1194 + }, + { + "epoch": 74.625, + "loss": 0.0007766564376652241, + "loss_ce": 0.0004180748655926436, + "loss_xval": 0.00035858154296875, + "num_input_tokens_seen": 82450304, + "step": 1194 + }, + { + "epoch": 74.6875, + "grad_norm": 
2.3412767010384474, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 82509312, + "step": 1195 + }, + { + "epoch": 74.6875, + "loss": 0.0011598513228818774, + "loss_ce": 0.0003625796234700829, + "loss_xval": 0.000797271728515625, + "num_input_tokens_seen": 82509312, + "step": 1195 + }, + { + "epoch": 74.75, + "grad_norm": 3.5261338234594457, + "learning_rate": 5e-05, + "loss": 0.0019, + "num_input_tokens_seen": 82568512, + "step": 1196 + }, + { + "epoch": 74.75, + "loss": 0.0019028933020308614, + "loss_ce": 0.00034649684675969183, + "loss_xval": 0.001556396484375, + "num_input_tokens_seen": 82568512, + "step": 1196 + }, + { + "epoch": 74.8125, + "grad_norm": 5.722321680343235, + "learning_rate": 5e-05, + "loss": 0.0032, + "num_input_tokens_seen": 82627584, + "step": 1197 + }, + { + "epoch": 74.8125, + "loss": 0.0031568289268761873, + "loss_ce": 0.00034921171027235687, + "loss_xval": 0.0028076171875, + "num_input_tokens_seen": 82627584, + "step": 1197 + }, + { + "epoch": 74.875, + "grad_norm": 8.312268126186806, + "learning_rate": 5e-05, + "loss": 0.0061, + "num_input_tokens_seen": 82699264, + "step": 1198 + }, + { + "epoch": 74.875, + "loss": 0.00578534509986639, + "loss_ce": 0.0003226986445952207, + "loss_xval": 0.005462646484375, + "num_input_tokens_seen": 82699264, + "step": 1198 + }, + { + "epoch": 74.9375, + "grad_norm": 11.608810372977517, + "learning_rate": 5e-05, + "loss": 0.0112, + "num_input_tokens_seen": 82770944, + "step": 1199 + }, + { + "epoch": 74.9375, + "loss": 0.011615313589572906, + "loss_ce": 0.00032380997436121106, + "loss_xval": 0.01129150390625, + "num_input_tokens_seen": 82770944, + "step": 1199 + }, + { + "epoch": 75.0, + "grad_norm": 15.768164554755671, + "learning_rate": 5e-05, + "loss": 0.0196, + "num_input_tokens_seen": 82817408, + "step": 1200 + }, + { + "epoch": 75.0, + "loss": 0.019836699590086937, + "loss_ce": 0.00030544938636012375, + "loss_xval": 0.01953125, + "num_input_tokens_seen": 82817408, + "step": 1200 + }, + { + "epoch": 75.0625, + "grad_norm": 20.13083247408963, + "learning_rate": 5e-05, + "loss": 0.0319, + "num_input_tokens_seen": 82889024, + "step": 1201 + }, + { + "epoch": 75.0625, + "loss": 0.03227750584483147, + "loss_ce": 0.0002950829511974007, + "loss_xval": 0.031982421875, + "num_input_tokens_seen": 82889024, + "step": 1201 + }, + { + "epoch": 75.125, + "grad_norm": 23.608974934533943, + "learning_rate": 5e-05, + "loss": 0.0441, + "num_input_tokens_seen": 82960640, + "step": 1202 + }, + { + "epoch": 75.125, + "loss": 0.04472793638706207, + "loss_ce": 0.0002943412109743804, + "loss_xval": 0.04443359375, + "num_input_tokens_seen": 82960640, + "step": 1202 + }, + { + "epoch": 75.1875, + "grad_norm": 25.185805677940856, + "learning_rate": 5e-05, + "loss": 0.0503, + "num_input_tokens_seen": 83032320, + "step": 1203 + }, + { + "epoch": 75.1875, + "loss": 0.050356145948171616, + "loss_ce": 0.0003073193074669689, + "loss_xval": 0.050048828125, + "num_input_tokens_seen": 83032320, + "step": 1203 + }, + { + "epoch": 75.25, + "grad_norm": 24.398977418417044, + "learning_rate": 5e-05, + "loss": 0.0472, + "num_input_tokens_seen": 83103936, + "step": 1204 + }, + { + "epoch": 75.25, + "loss": 0.04962233826518059, + "loss_ce": 0.0003059330047108233, + "loss_xval": 0.04931640625, + "num_input_tokens_seen": 83103936, + "step": 1204 + }, + { + "epoch": 75.3125, + "grad_norm": 19.919419001296145, + "learning_rate": 5e-05, + "loss": 0.0321, + "num_input_tokens_seen": 83175552, + "step": 1205 + }, + { + "epoch": 75.3125, + "loss": 
0.03275446221232414, + "loss_ce": 0.0002837601350620389, + "loss_xval": 0.032470703125, + "num_input_tokens_seen": 83175552, + "step": 1205 + }, + { + "epoch": 75.375, + "grad_norm": 11.768760718587416, + "learning_rate": 5e-05, + "loss": 0.0115, + "num_input_tokens_seen": 83247104, + "step": 1206 + }, + { + "epoch": 75.375, + "loss": 0.012369153089821339, + "loss_ce": 0.0002841917157638818, + "loss_xval": 0.0120849609375, + "num_input_tokens_seen": 83247104, + "step": 1206 + }, + { + "epoch": 75.4375, + "grad_norm": 1.8368035999056054, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_input_tokens_seen": 83318656, + "step": 1207 + }, + { + "epoch": 75.4375, + "loss": 0.001351103070192039, + "loss_ce": 0.00029061720124445856, + "loss_xval": 0.00106048583984375, + "num_input_tokens_seen": 83318656, + "step": 1207 + }, + { + "epoch": 75.5, + "grad_norm": 7.80666118483688, + "learning_rate": 5e-05, + "loss": 0.006, + "num_input_tokens_seen": 83390336, + "step": 1208 + }, + { + "epoch": 75.5, + "loss": 0.005994611419737339, + "loss_ce": 0.0002878241066355258, + "loss_xval": 0.005706787109375, + "num_input_tokens_seen": 83390336, + "step": 1208 + }, + { + "epoch": 75.5625, + "grad_norm": 15.776520230822296, + "learning_rate": 5e-05, + "loss": 0.0218, + "num_input_tokens_seen": 83461952, + "step": 1209 + }, + { + "epoch": 75.5625, + "loss": 0.01836695894598961, + "loss_ce": 0.0003005519974976778, + "loss_xval": 0.01806640625, + "num_input_tokens_seen": 83461952, + "step": 1209 + }, + { + "epoch": 75.625, + "grad_norm": 20.701223909897063, + "learning_rate": 5e-05, + "loss": 0.0364, + "num_input_tokens_seen": 83533504, + "step": 1210 + }, + { + "epoch": 75.625, + "loss": 0.03570494055747986, + "loss_ce": 0.00030455051455646753, + "loss_xval": 0.035400390625, + "num_input_tokens_seen": 83533504, + "step": 1210 + }, + { + "epoch": 75.6875, + "grad_norm": 21.527798508931795, + "learning_rate": 5e-05, + "loss": 0.0386, + "num_input_tokens_seen": 83605248, + "step": 1211 + }, + { + "epoch": 75.6875, + "loss": 0.03861980512738228, + "loss_ce": 0.0002897270314861089, + "loss_xval": 0.038330078125, + "num_input_tokens_seen": 83605248, + "step": 1211 + }, + { + "epoch": 75.75, + "grad_norm": 17.81662722607838, + "learning_rate": 5e-05, + "loss": 0.0267, + "num_input_tokens_seen": 83676800, + "step": 1212 + }, + { + "epoch": 75.75, + "loss": 0.0250775758177042, + "loss_ce": 0.0002973016817122698, + "loss_xval": 0.0247802734375, + "num_input_tokens_seen": 83676800, + "step": 1212 + }, + { + "epoch": 75.8125, + "grad_norm": 10.896037597453502, + "learning_rate": 5e-05, + "loss": 0.0107, + "num_input_tokens_seen": 83748544, + "step": 1213 + }, + { + "epoch": 75.8125, + "loss": 0.011996936053037643, + "loss_ce": 0.0002781857911031693, + "loss_xval": 0.01171875, + "num_input_tokens_seen": 83748544, + "step": 1213 + }, + { + "epoch": 75.875, + "grad_norm": 2.36863658926043, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_input_tokens_seen": 83820352, + "step": 1214 + }, + { + "epoch": 75.875, + "loss": 0.0016460572369396687, + "loss_ce": 0.0002803955867420882, + "loss_xval": 0.00136566162109375, + "num_input_tokens_seen": 83820352, + "step": 1214 + }, + { + "epoch": 75.9375, + "grad_norm": 5.758967969538332, + "learning_rate": 5e-05, + "loss": 0.0037, + "num_input_tokens_seen": 83879424, + "step": 1215 + }, + { + "epoch": 75.9375, + "loss": 0.0028607631102204323, + "loss_ce": 0.0002820278750732541, + "loss_xval": 0.0025787353515625, + "num_input_tokens_seen": 83879424, + "step": 1215 + }, + { + "epoch": 
76.0, + "grad_norm": 11.882970539678618, + "learning_rate": 5e-05, + "loss": 0.0123, + "num_input_tokens_seen": 83951040, + "step": 1216 + }, + { + "epoch": 76.0, + "loss": 0.012303365394473076, + "loss_ce": 0.000279439176665619, + "loss_xval": 0.01202392578125, + "num_input_tokens_seen": 83951040, + "step": 1216 + }, + { + "epoch": 76.0625, + "grad_norm": 15.547437781822687, + "learning_rate": 5e-05, + "loss": 0.0205, + "num_input_tokens_seen": 84022656, + "step": 1217 + }, + { + "epoch": 76.0625, + "loss": 0.0207960307598114, + "loss_ce": 0.0002882190456148237, + "loss_xval": 0.0205078125, + "num_input_tokens_seen": 84022656, + "step": 1217 + }, + { + "epoch": 76.125, + "grad_norm": 16.543148649304666, + "learning_rate": 5e-05, + "loss": 0.023, + "num_input_tokens_seen": 84094400, + "step": 1218 + }, + { + "epoch": 76.125, + "loss": 0.02309388294816017, + "loss_ce": 0.0002667343069333583, + "loss_xval": 0.0228271484375, + "num_input_tokens_seen": 84094400, + "step": 1218 + }, + { + "epoch": 76.1875, + "grad_norm": 14.45577440531413, + "learning_rate": 5e-05, + "loss": 0.0176, + "num_input_tokens_seen": 84165952, + "step": 1219 + }, + { + "epoch": 76.1875, + "loss": 0.019056590273976326, + "loss_ce": 0.00025776130496524274, + "loss_xval": 0.018798828125, + "num_input_tokens_seen": 84165952, + "step": 1219 + }, + { + "epoch": 76.25, + "grad_norm": 9.135506928731544, + "learning_rate": 5e-05, + "loss": 0.0081, + "num_input_tokens_seen": 84237568, + "step": 1220 + }, + { + "epoch": 76.25, + "loss": 0.008334951475262642, + "loss_ce": 0.00027831082115881145, + "loss_xval": 0.008056640625, + "num_input_tokens_seen": 84237568, + "step": 1220 + }, + { + "epoch": 76.3125, + "grad_norm": 1.4668907368775246, + "learning_rate": 5e-05, + "loss": 0.0016, + "num_input_tokens_seen": 84309120, + "step": 1221 + }, + { + "epoch": 76.3125, + "loss": 0.0018034669337794185, + "loss_ce": 0.0002699586621019989, + "loss_xval": 0.00153350830078125, + "num_input_tokens_seen": 84309120, + "step": 1221 + }, + { + "epoch": 76.375, + "grad_norm": 7.317495664094737, + "learning_rate": 5e-05, + "loss": 0.0056, + "num_input_tokens_seen": 84380864, + "step": 1222 + }, + { + "epoch": 76.375, + "loss": 0.003733080578967929, + "loss_ce": 0.0002845943090505898, + "loss_xval": 0.003448486328125, + "num_input_tokens_seen": 84380864, + "step": 1222 + }, + { + "epoch": 76.4375, + "grad_norm": 14.681200730949367, + "learning_rate": 5e-05, + "loss": 0.0187, + "num_input_tokens_seen": 84452416, + "step": 1223 + }, + { + "epoch": 76.4375, + "loss": 0.01689068041741848, + "loss_ce": 0.000289117539068684, + "loss_xval": 0.0166015625, + "num_input_tokens_seen": 84452416, + "step": 1223 + }, + { + "epoch": 76.5, + "grad_norm": 18.602396130334732, + "learning_rate": 5e-05, + "loss": 0.0293, + "num_input_tokens_seen": 84523968, + "step": 1224 + }, + { + "epoch": 76.5, + "loss": 0.02823912911117077, + "loss_ce": 0.0002850280434358865, + "loss_xval": 0.0279541015625, + "num_input_tokens_seen": 84523968, + "step": 1224 + }, + { + "epoch": 76.5625, + "grad_norm": 19.009596458273673, + "learning_rate": 5e-05, + "loss": 0.0308, + "num_input_tokens_seen": 84595648, + "step": 1225 + }, + { + "epoch": 76.5625, + "loss": 0.03032798133790493, + "loss_ce": 0.0002986853360198438, + "loss_xval": 0.030029296875, + "num_input_tokens_seen": 84595648, + "step": 1225 + }, + { + "epoch": 76.625, + "grad_norm": 17.267936941710204, + "learning_rate": 5e-05, + "loss": 0.0257, + "num_input_tokens_seen": 84667392, + "step": 1226 + }, + { + "epoch": 76.625, + 
"loss": 0.02483111433684826, + "loss_ce": 0.00029498201911337674, + "loss_xval": 0.0245361328125, + "num_input_tokens_seen": 84667392, + "step": 1226 + }, + { + "epoch": 76.6875, + "grad_norm": 14.417852806473686, + "learning_rate": 5e-05, + "loss": 0.0183, + "num_input_tokens_seen": 84739008, + "step": 1227 + }, + { + "epoch": 76.6875, + "loss": 0.016663750633597374, + "loss_ce": 0.0003063289914280176, + "loss_xval": 0.016357421875, + "num_input_tokens_seen": 84739008, + "step": 1227 + }, + { + "epoch": 76.75, + "grad_norm": 10.706297841004377, + "learning_rate": 5e-05, + "loss": 0.0105, + "num_input_tokens_seen": 84810688, + "step": 1228 + }, + { + "epoch": 76.75, + "loss": 0.010646898299455643, + "loss_ce": 0.0003319565439596772, + "loss_xval": 0.01031494140625, + "num_input_tokens_seen": 84810688, + "step": 1228 + }, + { + "epoch": 76.8125, + "grad_norm": 6.725381663583949, + "learning_rate": 5e-05, + "loss": 0.0047, + "num_input_tokens_seen": 84882368, + "step": 1229 + }, + { + "epoch": 76.8125, + "loss": 0.004713365808129311, + "loss_ce": 0.0003188344999216497, + "loss_xval": 0.00439453125, + "num_input_tokens_seen": 84882368, + "step": 1229 + }, + { + "epoch": 76.875, + "grad_norm": 3.2416371982539625, + "learning_rate": 5e-05, + "loss": 0.0016, + "num_input_tokens_seen": 84953920, + "step": 1230 + }, + { + "epoch": 76.875, + "loss": 0.0016462351195514202, + "loss_ce": 0.00034160862560383976, + "loss_xval": 0.00130462646484375, + "num_input_tokens_seen": 84953920, + "step": 1230 + }, + { + "epoch": 76.9375, + "grad_norm": 0.8319526045639284, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 85025536, + "step": 1231 + }, + { + "epoch": 76.9375, + "loss": 0.0012074424885213375, + "loss_ce": 0.0003300620592199266, + "loss_xval": 0.00087738037109375, + "num_input_tokens_seen": 85025536, + "step": 1231 + }, + { + "epoch": 77.0, + "grad_norm": 0.4998901065208127, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 85097088, + "step": 1232 + }, + { + "epoch": 77.0, + "loss": 0.0010170785244554281, + "loss_ce": 0.00031898898305371404, + "loss_xval": 0.000698089599609375, + "num_input_tokens_seen": 85097088, + "step": 1232 + }, + { + "epoch": 77.0625, + "grad_norm": 1.5322908679654579, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 85168640, + "step": 1233 + }, + { + "epoch": 77.0625, + "loss": 0.0006979235913604498, + "loss_ce": 0.00031454648706130683, + "loss_xval": 0.0003833770751953125, + "num_input_tokens_seen": 85168640, + "step": 1233 + }, + { + "epoch": 77.125, + "grad_norm": 2.774667522264961, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_input_tokens_seen": 85227712, + "step": 1234 + }, + { + "epoch": 77.125, + "loss": 0.0014237510040402412, + "loss_ce": 0.0003174888261128217, + "loss_xval": 0.00110626220703125, + "num_input_tokens_seen": 85227712, + "step": 1234 + }, + { + "epoch": 77.1875, + "grad_norm": 4.7024190141487265, + "learning_rate": 5e-05, + "loss": 0.0026, + "num_input_tokens_seen": 85299264, + "step": 1235 + }, + { + "epoch": 77.1875, + "loss": 0.002505771117284894, + "loss_ce": 0.00030850546318106353, + "loss_xval": 0.002197265625, + "num_input_tokens_seen": 85299264, + "step": 1235 + }, + { + "epoch": 77.25, + "grad_norm": 7.645943356947957, + "learning_rate": 5e-05, + "loss": 0.0056, + "num_input_tokens_seen": 85370944, + "step": 1236 + }, + { + "epoch": 77.25, + "loss": 0.005701807327568531, + "loss_ce": 0.0003001959703397006, + "loss_xval": 0.005401611328125, + "num_input_tokens_seen": 85370944, 
+ "step": 1236 + }, + { + "epoch": 77.3125, + "grad_norm": 11.666851267281478, + "learning_rate": 5e-05, + "loss": 0.0124, + "num_input_tokens_seen": 85442560, + "step": 1237 + }, + { + "epoch": 77.3125, + "loss": 0.012451760470867157, + "loss_ce": 0.0003057646390516311, + "loss_xval": 0.01214599609375, + "num_input_tokens_seen": 85442560, + "step": 1237 + }, + { + "epoch": 77.375, + "grad_norm": 17.347565722362265, + "learning_rate": 5e-05, + "loss": 0.0265, + "num_input_tokens_seen": 85501696, + "step": 1238 + }, + { + "epoch": 77.375, + "loss": 0.02691800892353058, + "loss_ce": 0.00030668015824630857, + "loss_xval": 0.026611328125, + "num_input_tokens_seen": 85501696, + "step": 1238 + }, + { + "epoch": 77.4375, + "grad_norm": 25.38116280556335, + "learning_rate": 5e-05, + "loss": 0.0555, + "num_input_tokens_seen": 85573376, + "step": 1239 + }, + { + "epoch": 77.4375, + "loss": 0.0549708791077137, + "loss_ce": 0.0002833788748830557, + "loss_xval": 0.0546875, + "num_input_tokens_seen": 85573376, + "step": 1239 + }, + { + "epoch": 77.5, + "grad_norm": 35.527800123958905, + "learning_rate": 5e-05, + "loss": 0.1089, + "num_input_tokens_seen": 85644928, + "step": 1240 + }, + { + "epoch": 77.5, + "loss": 0.10820645838975906, + "loss_ce": 0.0002962992584798485, + "loss_xval": 0.10791015625, + "num_input_tokens_seen": 85644928, + "step": 1240 + }, + { + "epoch": 77.5625, + "grad_norm": 43.15199387931728, + "learning_rate": 5e-05, + "loss": 0.1634, + "num_input_tokens_seen": 85716544, + "step": 1241 + }, + { + "epoch": 77.5625, + "loss": 0.16338858008384705, + "loss_ce": 0.00030264799715951085, + "loss_xval": 0.1630859375, + "num_input_tokens_seen": 85716544, + "step": 1241 + }, + { + "epoch": 77.625, + "grad_norm": 40.2416273483037, + "learning_rate": 5e-05, + "loss": 0.1444, + "num_input_tokens_seen": 85788160, + "step": 1242 + }, + { + "epoch": 77.625, + "loss": 0.14094926416873932, + "loss_ce": 0.0003242639359086752, + "loss_xval": 0.140625, + "num_input_tokens_seen": 85788160, + "step": 1242 + }, + { + "epoch": 77.6875, + "grad_norm": 20.986646115749792, + "learning_rate": 5e-05, + "loss": 0.042, + "num_input_tokens_seen": 85847232, + "step": 1243 + }, + { + "epoch": 77.6875, + "loss": 0.03987127169966698, + "loss_ce": 0.0003204921376891434, + "loss_xval": 0.03955078125, + "num_input_tokens_seen": 85847232, + "step": 1243 + }, + { + "epoch": 77.75, + "grad_norm": 7.662557391421135, + "learning_rate": 5e-05, + "loss": 0.0077, + "num_input_tokens_seen": 85918848, + "step": 1244 + }, + { + "epoch": 77.75, + "loss": 0.008253362961113453, + "loss_ce": 0.00037982812500558794, + "loss_xval": 0.00787353515625, + "num_input_tokens_seen": 85918848, + "step": 1244 + }, + { + "epoch": 77.8125, + "grad_norm": 28.73130524451353, + "learning_rate": 5e-05, + "loss": 0.0761, + "num_input_tokens_seen": 85990528, + "step": 1245 + }, + { + "epoch": 77.8125, + "loss": 0.07705868035554886, + "loss_ce": 0.0003985270159319043, + "loss_xval": 0.07666015625, + "num_input_tokens_seen": 85990528, + "step": 1245 + }, + { + "epoch": 77.875, + "grad_norm": 30.854670471636584, + "learning_rate": 5e-05, + "loss": 0.0883, + "num_input_tokens_seen": 86062272, + "step": 1246 + }, + { + "epoch": 77.875, + "loss": 0.08785340934991837, + "loss_ce": 0.0004510624276008457, + "loss_xval": 0.08740234375, + "num_input_tokens_seen": 86062272, + "step": 1246 + }, + { + "epoch": 77.9375, + "grad_norm": 14.028656578933234, + "learning_rate": 5e-05, + "loss": 0.0191, + "num_input_tokens_seen": 86121344, + "step": 1247 + }, + { + "epoch": 
77.9375, + "loss": 0.019599292427301407, + "loss_ce": 0.0004342528118286282, + "loss_xval": 0.0191650390625, + "num_input_tokens_seen": 86121344, + "step": 1247 + }, + { + "epoch": 78.0, + "grad_norm": 10.269793907601676, + "learning_rate": 5e-05, + "loss": 0.0109, + "num_input_tokens_seen": 86192960, + "step": 1248 + }, + { + "epoch": 78.0, + "loss": 0.01089974120259285, + "loss_ce": 0.0004627294256351888, + "loss_xval": 0.01043701171875, + "num_input_tokens_seen": 86192960, + "step": 1248 + }, + { + "epoch": 78.0625, + "grad_norm": 26.15659721116164, + "learning_rate": 5e-05, + "loss": 0.0637, + "num_input_tokens_seen": 86264512, + "step": 1249 + }, + { + "epoch": 78.0625, + "loss": 0.06197069212794304, + "loss_ce": 0.00044725643238052726, + "loss_xval": 0.0615234375, + "num_input_tokens_seen": 86264512, + "step": 1249 + }, + { + "epoch": 78.125, + "grad_norm": 23.947367489720282, + "learning_rate": 5e-05, + "loss": 0.0541, + "num_input_tokens_seen": 86323712, + "step": 1250 + }, + { + "epoch": 78.125, + "eval_synth_IoU": 0.15380492992699146, + "eval_synth_MAE_x": 0.037353515625, + "eval_synth_MAE_y": 0.0395660400390625, + "eval_synth_NUM_probability": 0.993680864572525, + "eval_synth_inside_bbox": 0.3125, + "eval_synth_loss": 0.003601398319005966, + "eval_synth_loss_ce": 0.000526752628502436, + "eval_synth_loss_xval": 0.00307464599609375, + "eval_synth_runtime": 58.4846, + "eval_synth_samples_per_second": 2.189, + "eval_synth_steps_per_second": 0.068, + "num_input_tokens_seen": 86323712, + "step": 1250 + }, + { + "epoch": 78.125, + "loss": 0.0037252199836075306, + "loss_ce": 0.0005208743968978524, + "loss_xval": 0.003204345703125, + "num_input_tokens_seen": 86323712, + "step": 1250 + }, + { + "epoch": 78.1875, + "grad_norm": 5.597444260463171, + "learning_rate": 5e-05, + "loss": 0.0041, + "num_input_tokens_seen": 86395328, + "step": 1251 + }, + { + "epoch": 78.1875, + "loss": 0.004415246658027172, + "loss_ce": 0.0005700316396541893, + "loss_xval": 0.00384521484375, + "num_input_tokens_seen": 86395328, + "step": 1251 + }, + { + "epoch": 78.25, + "grad_norm": 14.920542235110513, + "learning_rate": 5e-05, + "loss": 0.0223, + "num_input_tokens_seen": 86467072, + "step": 1252 + }, + { + "epoch": 78.25, + "loss": 0.022936955094337463, + "loss_ce": 0.0005980880814604461, + "loss_xval": 0.0223388671875, + "num_input_tokens_seen": 86467072, + "step": 1252 + }, + { + "epoch": 78.3125, + "grad_norm": 21.22270271731356, + "learning_rate": 5e-05, + "loss": 0.0454, + "num_input_tokens_seen": 86538688, + "step": 1253 + }, + { + "epoch": 78.3125, + "loss": 0.04360269755125046, + "loss_ce": 0.0006339486571960151, + "loss_xval": 0.04296875, + "num_input_tokens_seen": 86538688, + "step": 1253 + }, + { + "epoch": 78.375, + "grad_norm": 9.998335500552542, + "learning_rate": 5e-05, + "loss": 0.0112, + "num_input_tokens_seen": 86610432, + "step": 1254 + }, + { + "epoch": 78.375, + "loss": 0.010832501575350761, + "loss_ce": 0.0007006655796431005, + "loss_xval": 0.0101318359375, + "num_input_tokens_seen": 86610432, + "step": 1254 + }, + { + "epoch": 78.4375, + "grad_norm": 7.9508214231797805, + "learning_rate": 5e-05, + "loss": 0.0073, + "num_input_tokens_seen": 86669504, + "step": 1255 + }, + { + "epoch": 78.4375, + "loss": 0.00764887360855937, + "loss_ce": 0.0007213835488073528, + "loss_xval": 0.006927490234375, + "num_input_tokens_seen": 86669504, + "step": 1255 + }, + { + "epoch": 78.5, + "grad_norm": 17.42214602129036, + "learning_rate": 5e-05, + "loss": 0.0309, + "num_input_tokens_seen": 86741184, + 
"step": 1256 + }, + { + "epoch": 78.5, + "loss": 0.03059588000178337, + "loss_ce": 0.0006886525661684573, + "loss_xval": 0.0299072265625, + "num_input_tokens_seen": 86741184, + "step": 1256 + }, + { + "epoch": 78.5625, + "grad_norm": 12.183816197766062, + "learning_rate": 5e-05, + "loss": 0.0161, + "num_input_tokens_seen": 86812864, + "step": 1257 + }, + { + "epoch": 78.5625, + "loss": 0.015264628455042839, + "loss_ce": 0.0007382616749964654, + "loss_xval": 0.0145263671875, + "num_input_tokens_seen": 86812864, + "step": 1257 + }, + { + "epoch": 78.625, + "grad_norm": 1.3140939069078006, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_input_tokens_seen": 86884480, + "step": 1258 + }, + { + "epoch": 78.625, + "loss": 0.0012654373422265053, + "loss_ce": 0.0006741593242622912, + "loss_xval": 0.000591278076171875, + "num_input_tokens_seen": 86884480, + "step": 1258 + }, + { + "epoch": 78.6875, + "grad_norm": 12.321355731714885, + "learning_rate": 5e-05, + "loss": 0.0159, + "num_input_tokens_seen": 86956032, + "step": 1259 + }, + { + "epoch": 78.6875, + "loss": 0.017447177320718765, + "loss_ce": 0.000601475010626018, + "loss_xval": 0.016845703125, + "num_input_tokens_seen": 86956032, + "step": 1259 + }, + { + "epoch": 78.75, + "grad_norm": 12.148726058787897, + "learning_rate": 5e-05, + "loss": 0.0152, + "num_input_tokens_seen": 87027776, + "step": 1260 + }, + { + "epoch": 78.75, + "loss": 0.015706263482570648, + "loss_ce": 0.0005695446161553264, + "loss_xval": 0.01513671875, + "num_input_tokens_seen": 87027776, + "step": 1260 + }, + { + "epoch": 78.8125, + "grad_norm": 1.4145298228038716, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 87099520, + "step": 1261 + }, + { + "epoch": 78.8125, + "loss": 0.001163907814770937, + "loss_ce": 0.000515409279614687, + "loss_xval": 0.00064849853515625, + "num_input_tokens_seen": 87099520, + "step": 1261 + }, + { + "epoch": 78.875, + "grad_norm": 10.189589879843748, + "learning_rate": 5e-05, + "loss": 0.0109, + "num_input_tokens_seen": 87171264, + "step": 1262 + }, + { + "epoch": 78.875, + "loss": 0.01154272723942995, + "loss_ce": 0.0004953640745952725, + "loss_xval": 0.01104736328125, + "num_input_tokens_seen": 87171264, + "step": 1262 + }, + { + "epoch": 78.9375, + "grad_norm": 13.848987821502545, + "learning_rate": 5e-05, + "loss": 0.0193, + "num_input_tokens_seen": 87242944, + "step": 1263 + }, + { + "epoch": 78.9375, + "loss": 0.019630515947937965, + "loss_ce": 0.00046547639067284763, + "loss_xval": 0.0191650390625, + "num_input_tokens_seen": 87242944, + "step": 1263 + }, + { + "epoch": 79.0, + "grad_norm": 7.633121068960464, + "learning_rate": 5e-05, + "loss": 0.0067, + "num_input_tokens_seen": 87314560, + "step": 1264 + }, + { + "epoch": 79.0, + "loss": 0.006737522780895233, + "loss_ce": 0.0004509016580414027, + "loss_xval": 0.00628662109375, + "num_input_tokens_seen": 87314560, + "step": 1264 + }, + { + "epoch": 79.0625, + "grad_norm": 2.5095848968654546, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_input_tokens_seen": 87386240, + "step": 1265 + }, + { + "epoch": 79.0625, + "loss": 0.0015422420110553503, + "loss_ce": 0.0004664973239414394, + "loss_xval": 0.00107574462890625, + "num_input_tokens_seen": 87386240, + "step": 1265 + }, + { + "epoch": 79.125, + "grad_norm": 8.394106185074616, + "learning_rate": 5e-05, + "loss": 0.0081, + "num_input_tokens_seen": 87457856, + "step": 1266 + }, + { + "epoch": 79.125, + "loss": 0.008274378255009651, + "loss_ce": 0.000461878371424973, + "loss_xval": 0.0078125, + 
"num_input_tokens_seen": 87457856, + "step": 1266 + }, + { + "epoch": 79.1875, + "grad_norm": 6.9888938015426545, + "learning_rate": 5e-05, + "loss": 0.0056, + "num_input_tokens_seen": 87529472, + "step": 1267 + }, + { + "epoch": 79.1875, + "loss": 0.005406632088124752, + "loss_ce": 0.0004322669410612434, + "loss_xval": 0.004974365234375, + "num_input_tokens_seen": 87529472, + "step": 1267 + }, + { + "epoch": 79.25, + "grad_norm": 0.6307274121771996, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 87601152, + "step": 1268 + }, + { + "epoch": 79.25, + "loss": 0.0011359049240127206, + "loss_ce": 0.00043400059803389013, + "loss_xval": 0.000701904296875, + "num_input_tokens_seen": 87601152, + "step": 1268 + }, + { + "epoch": 79.3125, + "grad_norm": 6.587295085954337, + "learning_rate": 5e-05, + "loss": 0.0052, + "num_input_tokens_seen": 87672896, + "step": 1269 + }, + { + "epoch": 79.3125, + "loss": 0.005824776366353035, + "loss_ce": 0.000392647460103035, + "loss_xval": 0.00543212890625, + "num_input_tokens_seen": 87672896, + "step": 1269 + }, + { + "epoch": 79.375, + "grad_norm": 9.86644422051795, + "learning_rate": 5e-05, + "loss": 0.0105, + "num_input_tokens_seen": 87744576, + "step": 1270 + }, + { + "epoch": 79.375, + "loss": 0.010824608616530895, + "loss_ce": 0.0003875968395732343, + "loss_xval": 0.01043701171875, + "num_input_tokens_seen": 87744576, + "step": 1270 + }, + { + "epoch": 79.4375, + "grad_norm": 7.150141739852848, + "learning_rate": 5e-05, + "loss": 0.0058, + "num_input_tokens_seen": 87816256, + "step": 1271 + }, + { + "epoch": 79.4375, + "loss": 0.005546413827687502, + "loss_ce": 0.0003584257501643151, + "loss_xval": 0.00518798828125, + "num_input_tokens_seen": 87816256, + "step": 1271 + }, + { + "epoch": 79.5, + "grad_norm": 0.6032163165017892, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 87875392, + "step": 1272 + }, + { + "epoch": 79.5, + "loss": 0.0005711402627639472, + "loss_ce": 0.000364192936103791, + "loss_xval": 0.00020694732666015625, + "num_input_tokens_seen": 87875392, + "step": 1272 + }, + { + "epoch": 79.5625, + "grad_norm": 5.324726389319908, + "learning_rate": 5e-05, + "loss": 0.0036, + "num_input_tokens_seen": 87947200, + "step": 1273 + }, + { + "epoch": 79.5625, + "loss": 0.003648017067462206, + "loss_ce": 0.0003521185426507145, + "loss_xval": 0.0032958984375, + "num_input_tokens_seen": 87947200, + "step": 1273 + }, + { + "epoch": 79.625, + "grad_norm": 7.363803798232587, + "learning_rate": 5e-05, + "loss": 0.0062, + "num_input_tokens_seen": 88018944, + "step": 1274 + }, + { + "epoch": 79.625, + "loss": 0.005804733373224735, + "loss_ce": 0.0003420870052650571, + "loss_xval": 0.005462646484375, + "num_input_tokens_seen": 88018944, + "step": 1274 + }, + { + "epoch": 79.6875, + "grad_norm": 5.834627935141895, + "learning_rate": 5e-05, + "loss": 0.004, + "num_input_tokens_seen": 88090560, + "step": 1275 + }, + { + "epoch": 79.6875, + "loss": 0.0035827215760946274, + "loss_ce": 0.00033259938936680555, + "loss_xval": 0.0032501220703125, + "num_input_tokens_seen": 88090560, + "step": 1275 + }, + { + "epoch": 79.75, + "grad_norm": 2.5866018005038907, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_input_tokens_seen": 88162240, + "step": 1276 + }, + { + "epoch": 79.75, + "loss": 0.001492985524237156, + "loss_ce": 0.00033331758459098637, + "loss_xval": 0.00115966796875, + "num_input_tokens_seen": 88162240, + "step": 1276 + }, + { + "epoch": 79.8125, + "grad_norm": 1.1924939995655466, + "learning_rate": 5e-05, + 
"loss": 0.0008, + "num_input_tokens_seen": 88233984, + "step": 1277 + }, + { + "epoch": 79.8125, + "loss": 0.0008020081440918148, + "loss_ce": 0.00032135628862306476, + "loss_xval": 0.00048065185546875, + "num_input_tokens_seen": 88233984, + "step": 1277 + }, + { + "epoch": 79.875, + "grad_norm": 4.411322436249555, + "learning_rate": 5e-05, + "loss": 0.0025, + "num_input_tokens_seen": 88305536, + "step": 1278 + }, + { + "epoch": 79.875, + "loss": 0.00267414515838027, + "loss_ce": 0.0002937741228379309, + "loss_xval": 0.00238037109375, + "num_input_tokens_seen": 88305536, + "step": 1278 + }, + { + "epoch": 79.9375, + "grad_norm": 5.861055974831472, + "learning_rate": 5e-05, + "loss": 0.0038, + "num_input_tokens_seen": 88377088, + "step": 1279 + }, + { + "epoch": 79.9375, + "loss": 0.00425551924854517, + "loss_ce": 0.0002882343251258135, + "loss_xval": 0.00396728515625, + "num_input_tokens_seen": 88377088, + "step": 1279 + }, + { + "epoch": 80.0, + "grad_norm": 4.045180702609077, + "learning_rate": 5e-05, + "loss": 0.0023, + "num_input_tokens_seen": 88448704, + "step": 1280 + }, + { + "epoch": 80.0, + "loss": 0.002019032370299101, + "loss_ce": 0.00027190105174668133, + "loss_xval": 0.00174713134765625, + "num_input_tokens_seen": 88448704, + "step": 1280 + }, + { + "epoch": 80.0625, + "grad_norm": 0.6975818335879924, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 88520448, + "step": 1281 + }, + { + "epoch": 80.0625, + "loss": 0.0006273279432207346, + "loss_ce": 0.00026111697661690414, + "loss_xval": 0.0003662109375, + "num_input_tokens_seen": 88520448, + "step": 1281 + }, + { + "epoch": 80.125, + "grad_norm": 6.415093584282375, + "learning_rate": 5e-05, + "loss": 0.0046, + "num_input_tokens_seen": 88592064, + "step": 1282 + }, + { + "epoch": 80.125, + "loss": 0.004382018465548754, + "loss_ce": 0.0002621453022584319, + "loss_xval": 0.004119873046875, + "num_input_tokens_seen": 88592064, + "step": 1282 + }, + { + "epoch": 80.1875, + "grad_norm": 10.984504435288581, + "learning_rate": 5e-05, + "loss": 0.0123, + "num_input_tokens_seen": 88651200, + "step": 1283 + }, + { + "epoch": 80.1875, + "loss": 0.011899075470864773, + "loss_ce": 0.00024136043793987483, + "loss_xval": 0.01165771484375, + "num_input_tokens_seen": 88651200, + "step": 1283 + }, + { + "epoch": 80.25, + "grad_norm": 12.976863346590147, + "learning_rate": 5e-05, + "loss": 0.0171, + "num_input_tokens_seen": 88722816, + "step": 1284 + }, + { + "epoch": 80.25, + "loss": 0.017806636169552803, + "loss_ce": 0.00022851029643788934, + "loss_xval": 0.017578125, + "num_input_tokens_seen": 88722816, + "step": 1284 + }, + { + "epoch": 80.3125, + "grad_norm": 12.000667507600303, + "learning_rate": 5e-05, + "loss": 0.0145, + "num_input_tokens_seen": 88794560, + "step": 1285 + }, + { + "epoch": 80.3125, + "loss": 0.01462092436850071, + "loss_ce": 0.0002166273188777268, + "loss_xval": 0.014404296875, + "num_input_tokens_seen": 88794560, + "step": 1285 + }, + { + "epoch": 80.375, + "grad_norm": 8.338492384447054, + "learning_rate": 5e-05, + "loss": 0.0074, + "num_input_tokens_seen": 88866240, + "step": 1286 + }, + { + "epoch": 80.375, + "loss": 0.007325833663344383, + "loss_ce": 0.0002152378874598071, + "loss_xval": 0.007110595703125, + "num_input_tokens_seen": 88866240, + "step": 1286 + }, + { + "epoch": 80.4375, + "grad_norm": 3.1245423806255186, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_input_tokens_seen": 88937984, + "step": 1287 + }, + { + "epoch": 80.4375, + "loss": 0.0016321254661306739, + "loss_ce": 
0.00020542873244266957, + "loss_xval": 0.00142669677734375, + "num_input_tokens_seen": 88937984, + "step": 1287 + }, + { + "epoch": 80.5, + "grad_norm": 1.778896332245402, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 89009536, + "step": 1288 + }, + { + "epoch": 80.5, + "loss": 0.0011494061909615993, + "loss_ce": 0.00019573181634768844, + "loss_xval": 0.00095367431640625, + "num_input_tokens_seen": 89009536, + "step": 1288 + }, + { + "epoch": 80.5625, + "grad_norm": 5.718502417530253, + "learning_rate": 5e-05, + "loss": 0.0038, + "num_input_tokens_seen": 89081152, + "step": 1289 + }, + { + "epoch": 80.5625, + "loss": 0.0042812880128622055, + "loss_ce": 0.00019193251500837505, + "loss_xval": 0.00408935546875, + "num_input_tokens_seen": 89081152, + "step": 1289 + }, + { + "epoch": 80.625, + "grad_norm": 8.627519762670474, + "learning_rate": 5e-05, + "loss": 0.0077, + "num_input_tokens_seen": 89152768, + "step": 1290 + }, + { + "epoch": 80.625, + "loss": 0.0074982293881475925, + "loss_ce": 0.00020452811440918595, + "loss_xval": 0.007293701171875, + "num_input_tokens_seen": 89152768, + "step": 1290 + }, + { + "epoch": 80.6875, + "grad_norm": 10.07743867452318, + "learning_rate": 5e-05, + "loss": 0.0104, + "num_input_tokens_seen": 89211968, + "step": 1291 + }, + { + "epoch": 80.6875, + "loss": 0.009577931836247444, + "loss_ce": 0.00017851768643595278, + "loss_xval": 0.0093994140625, + "num_input_tokens_seen": 89211968, + "step": 1291 + }, + { + "epoch": 80.75, + "grad_norm": 10.289217040543756, + "learning_rate": 5e-05, + "loss": 0.0109, + "num_input_tokens_seen": 89283712, + "step": 1292 + }, + { + "epoch": 80.75, + "loss": 0.010684765875339508, + "loss_ce": 0.0001867193350335583, + "loss_xval": 0.010498046875, + "num_input_tokens_seen": 89283712, + "step": 1292 + }, + { + "epoch": 80.8125, + "grad_norm": 9.518871705471712, + "learning_rate": 5e-05, + "loss": 0.0094, + "num_input_tokens_seen": 89355328, + "step": 1293 + }, + { + "epoch": 80.8125, + "loss": 0.009457839652895927, + "loss_ce": 0.00018049543723464012, + "loss_xval": 0.00927734375, + "num_input_tokens_seen": 89355328, + "step": 1293 + }, + { + "epoch": 80.875, + "grad_norm": 8.006495123595153, + "learning_rate": 5e-05, + "loss": 0.0067, + "num_input_tokens_seen": 89426880, + "step": 1294 + }, + { + "epoch": 80.875, + "loss": 0.006589856464415789, + "loss_ce": 0.00018116526189260185, + "loss_xval": 0.00640869140625, + "num_input_tokens_seen": 89426880, + "step": 1294 + }, + { + "epoch": 80.9375, + "grad_norm": 5.763032519327169, + "learning_rate": 5e-05, + "loss": 0.0037, + "num_input_tokens_seen": 89498624, + "step": 1295 + }, + { + "epoch": 80.9375, + "loss": 0.003941795788705349, + "loss_ce": 0.00018813367933034897, + "loss_xval": 0.003753662109375, + "num_input_tokens_seen": 89498624, + "step": 1295 + }, + { + "epoch": 81.0, + "grad_norm": 2.828151506659076, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 89570176, + "step": 1296 + }, + { + "epoch": 81.0, + "loss": 0.001258036820217967, + "loss_ce": 0.00017466276767663658, + "loss_xval": 0.0010833740234375, + "num_input_tokens_seen": 89570176, + "step": 1296 + }, + { + "epoch": 81.0625, + "grad_norm": 0.09979255913719957, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 89641856, + "step": 1297 + }, + { + "epoch": 81.0625, + "loss": 0.0003645333054009825, + "loss_ce": 0.000183335185283795, + "loss_xval": 0.0001811981201171875, + "num_input_tokens_seen": 89641856, + "step": 1297 + }, + { + "epoch": 81.125, + 
"grad_norm": 2.4045827761146454, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 89713472, + "step": 1298 + }, + { + "epoch": 81.125, + "loss": 0.0010376586578786373, + "loss_ce": 0.00017172243678942323, + "loss_xval": 0.000865936279296875, + "num_input_tokens_seen": 89713472, + "step": 1298 + }, + { + "epoch": 81.1875, + "grad_norm": 4.5778954748282175, + "learning_rate": 5e-05, + "loss": 0.0024, + "num_input_tokens_seen": 89785152, + "step": 1299 + }, + { + "epoch": 81.1875, + "loss": 0.0022919767070561647, + "loss_ce": 0.00018626371456775814, + "loss_xval": 0.002105712890625, + "num_input_tokens_seen": 89785152, + "step": 1299 + }, + { + "epoch": 81.25, + "grad_norm": 6.917729798556076, + "learning_rate": 5e-05, + "loss": 0.0051, + "num_input_tokens_seen": 89856832, + "step": 1300 + }, + { + "epoch": 81.25, + "loss": 0.004717848729342222, + "loss_ce": 0.00017072970513254404, + "loss_xval": 0.004547119140625, + "num_input_tokens_seen": 89856832, + "step": 1300 + }, + { + "epoch": 81.3125, + "grad_norm": 9.803192834949183, + "learning_rate": 5e-05, + "loss": 0.0103, + "num_input_tokens_seen": 89928448, + "step": 1301 + }, + { + "epoch": 81.3125, + "loss": 0.009035426191985607, + "loss_ce": 0.00018532808462623507, + "loss_xval": 0.00885009765625, + "num_input_tokens_seen": 89928448, + "step": 1301 + }, + { + "epoch": 81.375, + "grad_norm": 13.892361525694142, + "learning_rate": 5e-05, + "loss": 0.02, + "num_input_tokens_seen": 89987584, + "step": 1302 + }, + { + "epoch": 81.375, + "loss": 0.019351232796907425, + "loss_ce": 0.00018619366164784878, + "loss_xval": 0.0191650390625, + "num_input_tokens_seen": 89987584, + "step": 1302 + }, + { + "epoch": 81.4375, + "grad_norm": 19.455423704004556, + "learning_rate": 5e-05, + "loss": 0.0386, + "num_input_tokens_seen": 90059264, + "step": 1303 + }, + { + "epoch": 81.4375, + "loss": 0.038518596440553665, + "loss_ce": 0.00018851927598007023, + "loss_xval": 0.038330078125, + "num_input_tokens_seen": 90059264, + "step": 1303 + }, + { + "epoch": 81.5, + "grad_norm": 26.25043747926682, + "learning_rate": 5e-05, + "loss": 0.0708, + "num_input_tokens_seen": 90130816, + "step": 1304 + }, + { + "epoch": 81.5, + "loss": 0.07098682224750519, + "loss_ce": 0.00018604137585498393, + "loss_xval": 0.07080078125, + "num_input_tokens_seen": 90130816, + "step": 1304 + }, + { + "epoch": 81.5625, + "grad_norm": 32.9316261955588, + "learning_rate": 5e-05, + "loss": 0.111, + "num_input_tokens_seen": 90202368, + "step": 1305 + }, + { + "epoch": 81.5625, + "loss": 0.11202113330364227, + "loss_ce": 0.0002047246671281755, + "loss_xval": 0.11181640625, + "num_input_tokens_seen": 90202368, + "step": 1305 + }, + { + "epoch": 81.625, + "grad_norm": 34.395804303815375, + "learning_rate": 5e-05, + "loss": 0.1213, + "num_input_tokens_seen": 90274048, + "step": 1306 + }, + { + "epoch": 81.625, + "loss": 0.12080544233322144, + "loss_ce": 0.00019997703202534467, + "loss_xval": 0.12060546875, + "num_input_tokens_seen": 90274048, + "step": 1306 + }, + { + "epoch": 81.6875, + "grad_norm": 23.92537655157102, + "learning_rate": 5e-05, + "loss": 0.0609, + "num_input_tokens_seen": 90345728, + "step": 1307 + }, + { + "epoch": 81.6875, + "loss": 0.06155260279774666, + "loss_ce": 0.0002733046712819487, + "loss_xval": 0.061279296875, + "num_input_tokens_seen": 90345728, + "step": 1307 + }, + { + "epoch": 81.75, + "grad_norm": 5.603796248217643, + "learning_rate": 5e-05, + "loss": 0.0052, + "num_input_tokens_seen": 90417280, + "step": 1308 + }, + { + "epoch": 81.75, + 
"loss": 0.005875443108379841, + "loss_ce": 0.0004127967986278236, + "loss_xval": 0.005462646484375, + "num_input_tokens_seen": 90417280, + "step": 1308 + }, + { + "epoch": 81.8125, + "grad_norm": 13.120661509399742, + "learning_rate": 5e-05, + "loss": 0.0215, + "num_input_tokens_seen": 90489024, + "step": 1309 + }, + { + "epoch": 81.8125, + "loss": 0.020262904465198517, + "loss_ce": 0.00048751308349892497, + "loss_xval": 0.019775390625, + "num_input_tokens_seen": 90489024, + "step": 1309 + }, + { + "epoch": 81.875, + "grad_norm": 27.54321692725346, + "learning_rate": 5e-05, + "loss": 0.0844, + "num_input_tokens_seen": 90560704, + "step": 1310 + }, + { + "epoch": 81.875, + "loss": 0.08634451031684875, + "loss_ce": 0.0004070091526955366, + "loss_xval": 0.0859375, + "num_input_tokens_seen": 90560704, + "step": 1310 + }, + { + "epoch": 81.9375, + "grad_norm": 27.0470695659577, + "learning_rate": 5e-05, + "loss": 0.081, + "num_input_tokens_seen": 90632512, + "step": 1311 + }, + { + "epoch": 81.9375, + "loss": 0.07950714975595474, + "loss_ce": 0.00040558731416240335, + "loss_xval": 0.0791015625, + "num_input_tokens_seen": 90632512, + "step": 1311 + }, + { + "epoch": 82.0, + "grad_norm": 8.319309467414499, + "learning_rate": 5e-05, + "loss": 0.0086, + "num_input_tokens_seen": 90691520, + "step": 1312 + }, + { + "epoch": 82.0, + "loss": 0.007567130029201508, + "loss_ce": 0.0003954994026571512, + "loss_xval": 0.007171630859375, + "num_input_tokens_seen": 90691520, + "step": 1312 + }, + { + "epoch": 82.0625, + "grad_norm": 13.83520003951101, + "learning_rate": 5e-05, + "loss": 0.0214, + "num_input_tokens_seen": 90763264, + "step": 1313 + }, + { + "epoch": 82.0625, + "loss": 0.020780092105269432, + "loss_ce": 0.00039435079088434577, + "loss_xval": 0.0203857421875, + "num_input_tokens_seen": 90763264, + "step": 1313 + }, + { + "epoch": 82.125, + "grad_norm": 26.233744470895388, + "learning_rate": 5e-05, + "loss": 0.0736, + "num_input_tokens_seen": 90835072, + "step": 1314 + }, + { + "epoch": 82.125, + "loss": 0.07211720943450928, + "loss_ce": 0.000339865597197786, + "loss_xval": 0.07177734375, + "num_input_tokens_seen": 90835072, + "step": 1314 + }, + { + "epoch": 82.1875, + "grad_norm": 21.966334899550386, + "learning_rate": 5e-05, + "loss": 0.0553, + "num_input_tokens_seen": 90906816, + "step": 1315 + }, + { + "epoch": 82.1875, + "loss": 0.05788750201463699, + "loss_ce": 0.0005144541501067579, + "loss_xval": 0.057373046875, + "num_input_tokens_seen": 90906816, + "step": 1315 + }, + { + "epoch": 82.25, + "grad_norm": 7.992168817180659, + "learning_rate": 5e-05, + "loss": 0.0092, + "num_input_tokens_seen": 90978368, + "step": 1316 + }, + { + "epoch": 82.25, + "loss": 0.0099477618932724, + "loss_ce": 0.0009755940409377217, + "loss_xval": 0.00897216796875, + "num_input_tokens_seen": 90978368, + "step": 1316 + }, + { + "epoch": 82.3125, + "grad_norm": 9.853948345579273, + "learning_rate": 5e-05, + "loss": 0.0131, + "num_input_tokens_seen": 91050048, + "step": 1317 + }, + { + "epoch": 82.3125, + "loss": 0.01295376755297184, + "loss_ce": 0.0011129477061331272, + "loss_xval": 0.0118408203125, + "num_input_tokens_seen": 91050048, + "step": 1317 + }, + { + "epoch": 82.375, + "grad_norm": 24.121509566174467, + "learning_rate": 5e-05, + "loss": 0.0712, + "num_input_tokens_seen": 91109312, + "step": 1318 + }, + { + "epoch": 82.375, + "loss": 0.07309217005968094, + "loss_ce": 0.0008265475044026971, + "loss_xval": 0.072265625, + "num_input_tokens_seen": 91109312, + "step": 1318 + }, + { + "epoch": 82.4375, + 
"grad_norm": 22.443026799692635, + "learning_rate": 5e-05, + "loss": 0.0631, + "num_input_tokens_seen": 91180992, + "step": 1319 + }, + { + "epoch": 82.4375, + "loss": 0.06288768351078033, + "loss_ce": 0.001120108994655311, + "loss_xval": 0.061767578125, + "num_input_tokens_seen": 91180992, + "step": 1319 + }, + { + "epoch": 82.5, + "grad_norm": 9.204746494814867, + "learning_rate": 5e-05, + "loss": 0.0156, + "num_input_tokens_seen": 91252736, + "step": 1320 + }, + { + "epoch": 82.5, + "loss": 0.013615299016237259, + "loss_ce": 0.0006758461822755635, + "loss_xval": 0.012939453125, + "num_input_tokens_seen": 91252736, + "step": 1320 + }, + { + "epoch": 82.5625, + "grad_norm": 7.041680380778854, + "learning_rate": 5e-05, + "loss": 0.0107, + "num_input_tokens_seen": 91324352, + "step": 1321 + }, + { + "epoch": 82.5625, + "loss": 0.010888535529375076, + "loss_ce": 0.0004515242180787027, + "loss_xval": 0.01043701171875, + "num_input_tokens_seen": 91324352, + "step": 1321 + }, + { + "epoch": 82.625, + "grad_norm": 20.68685335144807, + "learning_rate": 5e-05, + "loss": 0.0477, + "num_input_tokens_seen": 91395968, + "step": 1322 + }, + { + "epoch": 82.625, + "loss": 0.04815371334552765, + "loss_ce": 0.0003021496522706002, + "loss_xval": 0.0478515625, + "num_input_tokens_seen": 91395968, + "step": 1322 + }, + { + "epoch": 82.6875, + "grad_norm": 19.53341497193577, + "learning_rate": 5e-05, + "loss": 0.0443, + "num_input_tokens_seen": 91455040, + "step": 1323 + }, + { + "epoch": 82.6875, + "loss": 0.0440186932682991, + "loss_ce": 0.00031752054928801954, + "loss_xval": 0.043701171875, + "num_input_tokens_seen": 91455040, + "step": 1323 + }, + { + "epoch": 82.75, + "grad_norm": 5.313685312356574, + "learning_rate": 5e-05, + "loss": 0.0094, + "num_input_tokens_seen": 91526784, + "step": 1324 + }, + { + "epoch": 82.75, + "loss": 0.007937032729387283, + "loss_ce": 0.00027712108567357063, + "loss_xval": 0.007659912109375, + "num_input_tokens_seen": 91526784, + "step": 1324 + }, + { + "epoch": 82.8125, + "grad_norm": 10.570006105733498, + "learning_rate": 5e-05, + "loss": 0.0166, + "num_input_tokens_seen": 91598528, + "step": 1325 + }, + { + "epoch": 82.8125, + "loss": 0.015571070834994316, + "loss_ce": 0.00025124690728262067, + "loss_xval": 0.01531982421875, + "num_input_tokens_seen": 91598528, + "step": 1325 + }, + { + "epoch": 82.875, + "grad_norm": 15.549359921026063, + "learning_rate": 5e-05, + "loss": 0.0284, + "num_input_tokens_seen": 91670144, + "step": 1326 + }, + { + "epoch": 82.875, + "loss": 0.02837221696972847, + "loss_ce": 0.0002960452693514526, + "loss_xval": 0.028076171875, + "num_input_tokens_seen": 91670144, + "step": 1326 + }, + { + "epoch": 82.9375, + "grad_norm": 6.965176644189292, + "learning_rate": 5e-05, + "loss": 0.0062, + "num_input_tokens_seen": 91741824, + "step": 1327 + }, + { + "epoch": 82.9375, + "loss": 0.006542420946061611, + "loss_ce": 0.00025579993962310255, + "loss_xval": 0.00628662109375, + "num_input_tokens_seen": 91741824, + "step": 1327 + }, + { + "epoch": 83.0, + "grad_norm": 6.505690399306705, + "learning_rate": 5e-05, + "loss": 0.0068, + "num_input_tokens_seen": 91801024, + "step": 1328 + }, + { + "epoch": 83.0, + "loss": 0.007964624091982841, + "loss_ce": 0.00027419428806751966, + "loss_xval": 0.0076904296875, + "num_input_tokens_seen": 91801024, + "step": 1328 + }, + { + "epoch": 83.0625, + "grad_norm": 13.084367095717914, + "learning_rate": 5e-05, + "loss": 0.0218, + "num_input_tokens_seen": 91872640, + "step": 1329 + }, + { + "epoch": 83.0625, + "loss": 
0.0210590660572052, + "loss_ce": 0.0003071120008826256, + "loss_xval": 0.020751953125, + "num_input_tokens_seen": 91872640, + "step": 1329 + }, + { + "epoch": 83.125, + "grad_norm": 6.6777848034082945, + "learning_rate": 5e-05, + "loss": 0.0067, + "num_input_tokens_seen": 91944384, + "step": 1330 + }, + { + "epoch": 83.125, + "loss": 0.006794905290007591, + "loss_ce": 0.0003251784946769476, + "loss_xval": 0.0064697265625, + "num_input_tokens_seen": 91944384, + "step": 1330 + }, + { + "epoch": 83.1875, + "grad_norm": 6.060488788965237, + "learning_rate": 5e-05, + "loss": 0.0054, + "num_input_tokens_seen": 92016064, + "step": 1331 + }, + { + "epoch": 83.1875, + "loss": 0.004320111591368914, + "loss_ce": 0.0003223086823709309, + "loss_xval": 0.003997802734375, + "num_input_tokens_seen": 92016064, + "step": 1331 + }, + { + "epoch": 83.25, + "grad_norm": 11.275035340765044, + "learning_rate": 5e-05, + "loss": 0.0158, + "num_input_tokens_seen": 92087680, + "step": 1332 + }, + { + "epoch": 83.25, + "loss": 0.015535068698227406, + "loss_ce": 0.0003983501228503883, + "loss_xval": 0.01513671875, + "num_input_tokens_seen": 92087680, + "step": 1332 + }, + { + "epoch": 83.3125, + "grad_norm": 6.306319413443722, + "learning_rate": 5e-05, + "loss": 0.0065, + "num_input_tokens_seen": 92159232, + "step": 1333 + }, + { + "epoch": 83.3125, + "loss": 0.006737944670021534, + "loss_ce": 0.0004208058526273817, + "loss_xval": 0.006317138671875, + "num_input_tokens_seen": 92159232, + "step": 1333 + }, + { + "epoch": 83.375, + "grad_norm": 3.0254192286907, + "learning_rate": 5e-05, + "loss": 0.002, + "num_input_tokens_seen": 92230784, + "step": 1334 + }, + { + "epoch": 83.375, + "loss": 0.0021276480983942747, + "loss_ce": 0.00041866363608278334, + "loss_xval": 0.001708984375, + "num_input_tokens_seen": 92230784, + "step": 1334 + }, + { + "epoch": 83.4375, + "grad_norm": 8.935617589851892, + "learning_rate": 5e-05, + "loss": 0.0103, + "num_input_tokens_seen": 92302336, + "step": 1335 + }, + { + "epoch": 83.4375, + "loss": 0.010258061811327934, + "loss_ce": 0.00043140165507793427, + "loss_xval": 0.00982666015625, + "num_input_tokens_seen": 92302336, + "step": 1335 + }, + { + "epoch": 83.5, + "grad_norm": 6.017498085566332, + "learning_rate": 5e-05, + "loss": 0.0054, + "num_input_tokens_seen": 92373888, + "step": 1336 + }, + { + "epoch": 83.5, + "loss": 0.005582468118518591, + "loss_ce": 0.00045551499351859093, + "loss_xval": 0.005126953125, + "num_input_tokens_seen": 92373888, + "step": 1336 + }, + { + "epoch": 83.5625, + "grad_norm": 1.867077741626802, + "learning_rate": 5e-05, + "loss": 0.002, + "num_input_tokens_seen": 92445632, + "step": 1337 + }, + { + "epoch": 83.5625, + "loss": 0.0021143611520528793, + "loss_ce": 0.00045115326065570116, + "loss_xval": 0.0016632080078125, + "num_input_tokens_seen": 92445632, + "step": 1337 + }, + { + "epoch": 83.625, + "grad_norm": 6.6407270625012105, + "learning_rate": 5e-05, + "loss": 0.0067, + "num_input_tokens_seen": 92504768, + "step": 1338 + }, + { + "epoch": 83.625, + "loss": 0.006132456939667463, + "loss_ce": 0.0003951520484406501, + "loss_xval": 0.0057373046875, + "num_input_tokens_seen": 92504768, + "step": 1338 + }, + { + "epoch": 83.6875, + "grad_norm": 5.393994327897671, + "learning_rate": 5e-05, + "loss": 0.0044, + "num_input_tokens_seen": 92563840, + "step": 1339 + }, + { + "epoch": 83.6875, + "loss": 0.0039986069314181805, + "loss_ce": 0.0003670152509585023, + "loss_xval": 0.003631591796875, + "num_input_tokens_seen": 92563840, + "step": 1339 + }, + { + "epoch": 
83.75, + "grad_norm": 0.187030626807252, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 92635456, + "step": 1340 + }, + { + "epoch": 83.75, + "loss": 0.0009429383790120482, + "loss_ce": 0.0003821779100690037, + "loss_xval": 0.000560760498046875, + "num_input_tokens_seen": 92635456, + "step": 1340 + }, + { + "epoch": 83.8125, + "grad_norm": 5.5623352632301355, + "learning_rate": 5e-05, + "loss": 0.0049, + "num_input_tokens_seen": 92707072, + "step": 1341 + }, + { + "epoch": 83.8125, + "loss": 0.004972436930984259, + "loss_ce": 0.0003642825176939368, + "loss_xval": 0.004608154296875, + "num_input_tokens_seen": 92707072, + "step": 1341 + }, + { + "epoch": 83.875, + "grad_norm": 7.908228587710838, + "learning_rate": 5e-05, + "loss": 0.0089, + "num_input_tokens_seen": 92778752, + "step": 1342 + }, + { + "epoch": 83.875, + "loss": 0.0085558220744133, + "loss_ce": 0.00037711087497882545, + "loss_xval": 0.0081787109375, + "num_input_tokens_seen": 92778752, + "step": 1342 + }, + { + "epoch": 83.9375, + "grad_norm": 6.82307396320819, + "learning_rate": 5e-05, + "loss": 0.0063, + "num_input_tokens_seen": 92850304, + "step": 1343 + }, + { + "epoch": 83.9375, + "loss": 0.00607724254950881, + "loss_ce": 0.0003399380366317928, + "loss_xval": 0.0057373046875, + "num_input_tokens_seen": 92850304, + "step": 1343 + }, + { + "epoch": 84.0, + "grad_norm": 2.2725296771540027, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 92921856, + "step": 1344 + }, + { + "epoch": 84.0, + "loss": 0.0013987963320687413, + "loss_ce": 0.0003001635486725718, + "loss_xval": 0.0010986328125, + "num_input_tokens_seen": 92921856, + "step": 1344 + }, + { + "epoch": 84.0625, + "grad_norm": 2.9586250079478553, + "learning_rate": 5e-05, + "loss": 0.0019, + "num_input_tokens_seen": 92993536, + "step": 1345 + }, + { + "epoch": 84.0625, + "loss": 0.002096587559208274, + "loss_ce": 0.00030367981526069343, + "loss_xval": 0.00179290771484375, + "num_input_tokens_seen": 92993536, + "step": 1345 + }, + { + "epoch": 84.125, + "grad_norm": 5.5977867942047705, + "learning_rate": 5e-05, + "loss": 0.0047, + "num_input_tokens_seen": 93065152, + "step": 1346 + }, + { + "epoch": 84.125, + "loss": 0.004932393319904804, + "loss_ce": 0.00029372161952778697, + "loss_xval": 0.004638671875, + "num_input_tokens_seen": 93065152, + "step": 1346 + }, + { + "epoch": 84.1875, + "grad_norm": 4.7846782687239475, + "learning_rate": 5e-05, + "loss": 0.0034, + "num_input_tokens_seen": 93136768, + "step": 1347 + }, + { + "epoch": 84.1875, + "loss": 0.0036556622944772243, + "loss_ce": 0.00026821118080988526, + "loss_xval": 0.003387451171875, + "num_input_tokens_seen": 93136768, + "step": 1347 + }, + { + "epoch": 84.25, + "grad_norm": 1.0287294637288882, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 93208512, + "step": 1348 + }, + { + "epoch": 84.25, + "loss": 0.000610416114795953, + "loss_ce": 0.00025946396635845304, + "loss_xval": 0.0003509521484375, + "num_input_tokens_seen": 93208512, + "step": 1348 + }, + { + "epoch": 84.3125, + "grad_norm": 3.4560656948515134, + "learning_rate": 5e-05, + "loss": 0.002, + "num_input_tokens_seen": 93280128, + "step": 1349 + }, + { + "epoch": 84.3125, + "loss": 0.002060085302218795, + "loss_ce": 0.00025191876920871437, + "loss_xval": 0.00180816650390625, + "num_input_tokens_seen": 93280128, + "step": 1349 + }, + { + "epoch": 84.375, + "grad_norm": 5.959650224848103, + "learning_rate": 5e-05, + "loss": 0.0049, + "num_input_tokens_seen": 93351744, + "step": 1350 + }, 
+ { + "epoch": 84.375, + "loss": 0.004852754529565573, + "loss_ce": 0.0002446004073135555, + "loss_xval": 0.004608154296875, + "num_input_tokens_seen": 93351744, + "step": 1350 + }, + { + "epoch": 84.4375, + "grad_norm": 5.097639920887682, + "learning_rate": 5e-05, + "loss": 0.0037, + "num_input_tokens_seen": 93423360, + "step": 1351 + }, + { + "epoch": 84.4375, + "loss": 0.0037487023510038853, + "loss_ce": 0.00023918086662888527, + "loss_xval": 0.003509521484375, + "num_input_tokens_seen": 93423360, + "step": 1351 + }, + { + "epoch": 84.5, + "grad_norm": 0.9345982088961234, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 93494912, + "step": 1352 + }, + { + "epoch": 84.5, + "loss": 0.0008009417215362191, + "loss_ce": 0.0001905901444843039, + "loss_xval": 0.0006103515625, + "num_input_tokens_seen": 93494912, + "step": 1352 + }, + { + "epoch": 84.5625, + "grad_norm": 4.545730632277391, + "learning_rate": 5e-05, + "loss": 0.003, + "num_input_tokens_seen": 93566592, + "step": 1353 + }, + { + "epoch": 84.5625, + "loss": 0.0027575090061873198, + "loss_ce": 0.00019403238547965884, + "loss_xval": 0.0025634765625, + "num_input_tokens_seen": 93566592, + "step": 1353 + }, + { + "epoch": 84.625, + "grad_norm": 8.556585400390404, + "learning_rate": 5e-05, + "loss": 0.0091, + "num_input_tokens_seen": 93638272, + "step": 1354 + }, + { + "epoch": 84.625, + "loss": 0.009279740042984486, + "loss_ce": 0.00018550171807873994, + "loss_xval": 0.00909423828125, + "num_input_tokens_seen": 93638272, + "step": 1354 + }, + { + "epoch": 84.6875, + "grad_norm": 8.62215731743346, + "learning_rate": 5e-05, + "loss": 0.0094, + "num_input_tokens_seen": 93697408, + "step": 1355 + }, + { + "epoch": 84.6875, + "loss": 0.009744501672685146, + "loss_ce": 0.00016198224329855293, + "loss_xval": 0.00958251953125, + "num_input_tokens_seen": 93697408, + "step": 1355 + }, + { + "epoch": 84.75, + "grad_norm": 4.452801107041619, + "learning_rate": 5e-05, + "loss": 0.0031, + "num_input_tokens_seen": 93769024, + "step": 1356 + }, + { + "epoch": 84.75, + "loss": 0.0040631224401295185, + "loss_ce": 0.00015687257109675556, + "loss_xval": 0.00390625, + "num_input_tokens_seen": 93769024, + "step": 1356 + }, + { + "epoch": 84.8125, + "grad_norm": 1.6587461632003737, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 93840576, + "step": 1357 + }, + { + "epoch": 84.8125, + "loss": 0.000456514535471797, + "loss_ce": 0.00015896816330496222, + "loss_xval": 0.00029754638671875, + "num_input_tokens_seen": 93840576, + "step": 1357 + }, + { + "epoch": 84.875, + "grad_norm": 6.452086587448419, + "learning_rate": 5e-05, + "loss": 0.0056, + "num_input_tokens_seen": 93912192, + "step": 1358 + }, + { + "epoch": 84.875, + "loss": 0.005854357499629259, + "loss_ce": 0.00014757020107936114, + "loss_xval": 0.005706787109375, + "num_input_tokens_seen": 93912192, + "step": 1358 + }, + { + "epoch": 84.9375, + "grad_norm": 7.768455428908589, + "learning_rate": 5e-05, + "loss": 0.0075, + "num_input_tokens_seen": 93971392, + "step": 1359 + }, + { + "epoch": 84.9375, + "loss": 0.007658465765416622, + "loss_ce": 0.00015114153211470693, + "loss_xval": 0.00750732421875, + "num_input_tokens_seen": 93971392, + "step": 1359 + }, + { + "epoch": 85.0, + "grad_norm": 6.261992830665613, + "learning_rate": 5e-05, + "loss": 0.0054, + "num_input_tokens_seen": 94030528, + "step": 1360 + }, + { + "epoch": 85.0, + "loss": 0.005399479065090418, + "loss_ce": 0.00015045571490190923, + "loss_xval": 0.0052490234375, + "num_input_tokens_seen": 
94030528, + "step": 1360 + }, + { + "epoch": 85.0625, + "grad_norm": 3.5746365441520247, + "learning_rate": 5e-05, + "loss": 0.0022, + "num_input_tokens_seen": 94102144, + "step": 1361 + }, + { + "epoch": 85.0625, + "loss": 0.002178808907046914, + "loss_ce": 0.0001493899617344141, + "loss_xval": 0.0020294189453125, + "num_input_tokens_seen": 94102144, + "step": 1361 + }, + { + "epoch": 85.125, + "grad_norm": 0.4716400038577123, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 94161216, + "step": 1362 + }, + { + "epoch": 85.125, + "loss": 0.0006105655338615179, + "loss_ce": 0.00014898713561706245, + "loss_xval": 0.000461578369140625, + "num_input_tokens_seen": 94161216, + "step": 1362 + }, + { + "epoch": 85.1875, + "grad_norm": 2.4725061164222364, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 94232768, + "step": 1363 + }, + { + "epoch": 85.1875, + "loss": 0.0010596337961032987, + "loss_ce": 0.0001479211205150932, + "loss_xval": 0.000911712646484375, + "num_input_tokens_seen": 94232768, + "step": 1363 + }, + { + "epoch": 85.25, + "grad_norm": 4.09294530167, + "learning_rate": 5e-05, + "loss": 0.0024, + "num_input_tokens_seen": 94304512, + "step": 1364 + }, + { + "epoch": 85.25, + "loss": 0.0021935063414275646, + "loss_ce": 0.00014882863615639508, + "loss_xval": 0.002044677734375, + "num_input_tokens_seen": 94304512, + "step": 1364 + }, + { + "epoch": 85.3125, + "grad_norm": 4.489692593923753, + "learning_rate": 5e-05, + "loss": 0.0029, + "num_input_tokens_seen": 94376128, + "step": 1365 + }, + { + "epoch": 85.3125, + "loss": 0.003198940772563219, + "loss_ce": 0.00014718300371896476, + "loss_xval": 0.0030517578125, + "num_input_tokens_seen": 94376128, + "step": 1365 + }, + { + "epoch": 85.375, + "grad_norm": 4.618347858672993, + "learning_rate": 5e-05, + "loss": 0.003, + "num_input_tokens_seen": 94447744, + "step": 1366 + }, + { + "epoch": 85.375, + "loss": 0.0029601159039884806, + "loss_ce": 0.00013723997108172625, + "loss_xval": 0.0028228759765625, + "num_input_tokens_seen": 94447744, + "step": 1366 + }, + { + "epoch": 85.4375, + "grad_norm": 4.74594135568652, + "learning_rate": 5e-05, + "loss": 0.003, + "num_input_tokens_seen": 94519296, + "step": 1367 + }, + { + "epoch": 85.4375, + "loss": 0.002849689219146967, + "loss_ce": 0.0001336246496066451, + "loss_xval": 0.002716064453125, + "num_input_tokens_seen": 94519296, + "step": 1367 + }, + { + "epoch": 85.5, + "grad_norm": 4.213946298872178, + "learning_rate": 5e-05, + "loss": 0.0024, + "num_input_tokens_seen": 94591040, + "step": 1368 + }, + { + "epoch": 85.5, + "loss": 0.002137300791218877, + "loss_ce": 0.00013839948223903775, + "loss_xval": 0.0019989013671875, + "num_input_tokens_seen": 94591040, + "step": 1368 + }, + { + "epoch": 85.5625, + "grad_norm": 3.0831016186464044, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 94662656, + "step": 1369 + }, + { + "epoch": 85.5625, + "loss": 0.0013986109988763928, + "loss_ce": 0.00012450206850189716, + "loss_xval": 0.00127410888671875, + "num_input_tokens_seen": 94662656, + "step": 1369 + }, + { + "epoch": 85.625, + "grad_norm": 1.6788028144692124, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 94721664, + "step": 1370 + }, + { + "epoch": 85.625, + "loss": 0.0006253900937736034, + "loss_ce": 0.0001371088292216882, + "loss_xval": 0.00048828125, + "num_input_tokens_seen": 94721664, + "step": 1370 + }, + { + "epoch": 85.6875, + "grad_norm": 0.27781799054399664, + "learning_rate": 5e-05, + "loss": 0.0003, + 
"num_input_tokens_seen": 94793216, + "step": 1371 + }, + { + "epoch": 85.6875, + "loss": 0.00037374423118308187, + "loss_ce": 0.0001400940091116354, + "loss_xval": 0.00023365020751953125, + "num_input_tokens_seen": 94793216, + "step": 1371 + }, + { + "epoch": 85.75, + "grad_norm": 2.7779533251465924, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 94864768, + "step": 1372 + }, + { + "epoch": 85.75, + "loss": 0.0012504992773756385, + "loss_ce": 0.00012897825217805803, + "loss_xval": 0.00112152099609375, + "num_input_tokens_seen": 94864768, + "step": 1372 + }, + { + "epoch": 85.8125, + "grad_norm": 5.650012776195728, + "learning_rate": 5e-05, + "loss": 0.0042, + "num_input_tokens_seen": 94936512, + "step": 1373 + }, + { + "epoch": 85.8125, + "loss": 0.004096949473023415, + "loss_ce": 0.00012966437498107553, + "loss_xval": 0.00396728515625, + "num_input_tokens_seen": 94936512, + "step": 1373 + }, + { + "epoch": 85.875, + "grad_norm": 8.657061946127184, + "learning_rate": 5e-05, + "loss": 0.0093, + "num_input_tokens_seen": 95008128, + "step": 1374 + }, + { + "epoch": 85.875, + "loss": 0.010076146572828293, + "loss_ce": 0.00012741591490339488, + "loss_xval": 0.00994873046875, + "num_input_tokens_seen": 95008128, + "step": 1374 + }, + { + "epoch": 85.9375, + "grad_norm": 11.424991293675157, + "learning_rate": 5e-05, + "loss": 0.016, + "num_input_tokens_seen": 95067200, + "step": 1375 + }, + { + "epoch": 85.9375, + "loss": 0.015012609772384167, + "loss_ce": 0.00012003149458905682, + "loss_xval": 0.014892578125, + "num_input_tokens_seen": 95067200, + "step": 1375 + }, + { + "epoch": 86.0, + "grad_norm": 14.5645994997339, + "learning_rate": 5e-05, + "loss": 0.0256, + "num_input_tokens_seen": 95138944, + "step": 1376 + }, + { + "epoch": 86.0, + "loss": 0.02514888532459736, + "loss_ce": 0.00012447111657820642, + "loss_xval": 0.0250244140625, + "num_input_tokens_seen": 95138944, + "step": 1376 + }, + { + "epoch": 86.0625, + "grad_norm": 18.481510231126762, + "learning_rate": 5e-05, + "loss": 0.0419, + "num_input_tokens_seen": 95210624, + "step": 1377 + }, + { + "epoch": 86.0625, + "loss": 0.04261020943522453, + "loss_ce": 0.00012974253331776708, + "loss_xval": 0.04248046875, + "num_input_tokens_seen": 95210624, + "step": 1377 + }, + { + "epoch": 86.125, + "grad_norm": 21.992243575699817, + "learning_rate": 5e-05, + "loss": 0.06, + "num_input_tokens_seen": 95282176, + "step": 1378 + }, + { + "epoch": 86.125, + "loss": 0.06021275743842125, + "loss_ce": 0.00015416437236126512, + "loss_xval": 0.06005859375, + "num_input_tokens_seen": 95282176, + "step": 1378 + }, + { + "epoch": 86.1875, + "grad_norm": 22.56849610163038, + "learning_rate": 5e-05, + "loss": 0.0639, + "num_input_tokens_seen": 95353728, + "step": 1379 + }, + { + "epoch": 86.1875, + "loss": 0.06363598257303238, + "loss_ce": 0.00015941797755658627, + "loss_xval": 0.0634765625, + "num_input_tokens_seen": 95353728, + "step": 1379 + }, + { + "epoch": 86.25, + "grad_norm": 16.94868086603522, + "learning_rate": 5e-05, + "loss": 0.0378, + "num_input_tokens_seen": 95412864, + "step": 1380 + }, + { + "epoch": 86.25, + "loss": 0.03410815820097923, + "loss_ce": 0.00017261072935070843, + "loss_xval": 0.033935546875, + "num_input_tokens_seen": 95412864, + "step": 1380 + }, + { + "epoch": 86.3125, + "grad_norm": 4.895642035480853, + "learning_rate": 5e-05, + "loss": 0.0045, + "num_input_tokens_seen": 95484544, + "step": 1381 + }, + { + "epoch": 86.3125, + "loss": 0.004222327843308449, + "loss_ce": 0.00022452489065472037, + 
"loss_xval": 0.003997802734375, + "num_input_tokens_seen": 95484544, + "step": 1381 + }, + { + "epoch": 86.375, + "grad_norm": 8.953947922684396, + "learning_rate": 5e-05, + "loss": 0.0114, + "num_input_tokens_seen": 95556224, + "step": 1382 + }, + { + "epoch": 86.375, + "loss": 0.011928737163543701, + "loss_ce": 0.0002710219123400748, + "loss_xval": 0.01165771484375, + "num_input_tokens_seen": 95556224, + "step": 1382 + }, + { + "epoch": 86.4375, + "grad_norm": 18.1527503843166, + "learning_rate": 5e-05, + "loss": 0.0429, + "num_input_tokens_seen": 95627968, + "step": 1383 + }, + { + "epoch": 86.4375, + "loss": 0.04327051341533661, + "loss_ce": 0.0003017636190634221, + "loss_xval": 0.04296875, + "num_input_tokens_seen": 95627968, + "step": 1383 + }, + { + "epoch": 86.5, + "grad_norm": 18.33660375652762, + "learning_rate": 5e-05, + "loss": 0.0448, + "num_input_tokens_seen": 95687104, + "step": 1384 + }, + { + "epoch": 86.5, + "loss": 0.044277459383010864, + "loss_ce": 0.000332146737491712, + "loss_xval": 0.0439453125, + "num_input_tokens_seen": 95687104, + "step": 1384 + }, + { + "epoch": 86.5625, + "grad_norm": 9.742972210972088, + "learning_rate": 5e-05, + "loss": 0.0147, + "num_input_tokens_seen": 95758848, + "step": 1385 + }, + { + "epoch": 86.5625, + "loss": 0.014891230501234531, + "loss_ce": 0.0003648633719421923, + "loss_xval": 0.0145263671875, + "num_input_tokens_seen": 95758848, + "step": 1385 + }, + { + "epoch": 86.625, + "grad_norm": 2.0753446635970603, + "learning_rate": 5e-05, + "loss": 0.0022, + "num_input_tokens_seen": 95830464, + "step": 1386 + }, + { + "epoch": 86.625, + "loss": 0.002193968975916505, + "loss_ce": 0.00040106126107275486, + "loss_xval": 0.00179290771484375, + "num_input_tokens_seen": 95830464, + "step": 1386 + }, + { + "epoch": 86.6875, + "grad_norm": 10.612091415688838, + "learning_rate": 5e-05, + "loss": 0.0159, + "num_input_tokens_seen": 95902208, + "step": 1387 + }, + { + "epoch": 86.6875, + "loss": 0.01625114679336548, + "loss_ce": 0.00038200628478080034, + "loss_xval": 0.015869140625, + "num_input_tokens_seen": 95902208, + "step": 1387 + }, + { + "epoch": 86.75, + "grad_norm": 12.023490316206656, + "learning_rate": 5e-05, + "loss": 0.0197, + "num_input_tokens_seen": 95961216, + "step": 1388 + }, + { + "epoch": 86.75, + "loss": 0.020631328225135803, + "loss_ce": 0.0003676554188132286, + "loss_xval": 0.020263671875, + "num_input_tokens_seen": 95961216, + "step": 1388 + }, + { + "epoch": 86.8125, + "grad_norm": 6.192574013263385, + "learning_rate": 5e-05, + "loss": 0.0063, + "num_input_tokens_seen": 96032896, + "step": 1389 + }, + { + "epoch": 86.8125, + "loss": 0.0059670875780284405, + "loss_ce": 0.00041288844658993185, + "loss_xval": 0.00555419921875, + "num_input_tokens_seen": 96032896, + "step": 1389 + }, + { + "epoch": 86.875, + "grad_norm": 2.6492488243139736, + "learning_rate": 5e-05, + "loss": 0.003, + "num_input_tokens_seen": 96104640, + "step": 1390 + }, + { + "epoch": 86.875, + "loss": 0.0033788911532610655, + "loss_ce": 0.00041868616244755685, + "loss_xval": 0.002960205078125, + "num_input_tokens_seen": 96104640, + "step": 1390 + }, + { + "epoch": 86.9375, + "grad_norm": 8.729217015659232, + "learning_rate": 5e-05, + "loss": 0.0121, + "num_input_tokens_seen": 96176192, + "step": 1391 + }, + { + "epoch": 86.9375, + "loss": 0.01327319536358118, + "loss_ce": 0.0003947777731809765, + "loss_xval": 0.01287841796875, + "num_input_tokens_seen": 96176192, + "step": 1391 + }, + { + "epoch": 87.0, + "grad_norm": 8.728910170528094, + "learning_rate": 
5e-05, + "loss": 0.0119, + "num_input_tokens_seen": 96247808, + "step": 1392 + }, + { + "epoch": 87.0, + "loss": 0.011554991826415062, + "loss_ce": 0.00038555837818421423, + "loss_xval": 0.01116943359375, + "num_input_tokens_seen": 96247808, + "step": 1392 + }, + { + "epoch": 87.0625, + "grad_norm": 3.355145966560444, + "learning_rate": 5e-05, + "loss": 0.0029, + "num_input_tokens_seen": 96306816, + "step": 1393 + }, + { + "epoch": 87.0625, + "loss": 0.002884861547499895, + "loss_ce": 0.0003519024758134037, + "loss_xval": 0.002532958984375, + "num_input_tokens_seen": 96306816, + "step": 1393 + }, + { + "epoch": 87.125, + "grad_norm": 4.138859429284309, + "learning_rate": 5e-05, + "loss": 0.0031, + "num_input_tokens_seen": 96365952, + "step": 1394 + }, + { + "epoch": 87.125, + "loss": 0.0029549452010542154, + "loss_ce": 0.0003151747805532068, + "loss_xval": 0.0026397705078125, + "num_input_tokens_seen": 96365952, + "step": 1394 + }, + { + "epoch": 87.1875, + "grad_norm": 8.873096729500526, + "learning_rate": 5e-05, + "loss": 0.0114, + "num_input_tokens_seen": 96437568, + "step": 1395 + }, + { + "epoch": 87.1875, + "loss": 0.011341361328959465, + "loss_ce": 0.0002939981932286173, + "loss_xval": 0.01104736328125, + "num_input_tokens_seen": 96437568, + "step": 1395 + }, + { + "epoch": 87.25, + "grad_norm": 7.362971805252709, + "learning_rate": 5e-05, + "loss": 0.0079, + "num_input_tokens_seen": 96509184, + "step": 1396 + }, + { + "epoch": 87.25, + "loss": 0.007855205796658993, + "loss_ce": 0.0002563287562225014, + "loss_xval": 0.007598876953125, + "num_input_tokens_seen": 96509184, + "step": 1396 + }, + { + "epoch": 87.3125, + "grad_norm": 0.7905654077163219, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 96580800, + "step": 1397 + }, + { + "epoch": 87.3125, + "loss": 0.0012294021435081959, + "loss_ce": 0.0002566543407738209, + "loss_xval": 0.000972747802734375, + "num_input_tokens_seen": 96580800, + "step": 1397 + }, + { + "epoch": 87.375, + "grad_norm": 6.997986291690594, + "learning_rate": 5e-05, + "loss": 0.0069, + "num_input_tokens_seen": 96652416, + "step": 1398 + }, + { + "epoch": 87.375, + "loss": 0.007005076855421066, + "loss_ce": 0.0002301743079442531, + "loss_xval": 0.00677490234375, + "num_input_tokens_seen": 96652416, + "step": 1398 + }, + { + "epoch": 87.4375, + "grad_norm": 12.697177281449116, + "learning_rate": 5e-05, + "loss": 0.0217, + "num_input_tokens_seen": 96724032, + "step": 1399 + }, + { + "epoch": 87.4375, + "loss": 0.021074065938591957, + "loss_ce": 0.0002000429667532444, + "loss_xval": 0.0208740234375, + "num_input_tokens_seen": 96724032, + "step": 1399 + }, + { + "epoch": 87.5, + "grad_norm": 13.521911622196056, + "learning_rate": 5e-05, + "loss": 0.0247, + "num_input_tokens_seen": 96795584, + "step": 1400 + }, + { + "epoch": 87.5, + "loss": 0.026104973629117012, + "loss_ce": 0.00022606826678384095, + "loss_xval": 0.02587890625, + "num_input_tokens_seen": 96795584, + "step": 1400 + }, + { + "epoch": 87.5625, + "grad_norm": 7.620328150939222, + "learning_rate": 5e-05, + "loss": 0.0099, + "num_input_tokens_seen": 96867136, + "step": 1401 + }, + { + "epoch": 87.5625, + "loss": 0.009842384606599808, + "loss_ce": 0.0002598651044536382, + "loss_xval": 0.00958251953125, + "num_input_tokens_seen": 96867136, + "step": 1401 + }, + { + "epoch": 87.625, + "grad_norm": 2.257605245863537, + "learning_rate": 5e-05, + "loss": 0.0029, + "num_input_tokens_seen": 96938688, + "step": 1402 + }, + { + "epoch": 87.625, + "loss": 0.0028382607270032167, + "loss_ce": 
0.00027478416450321674, + "loss_xval": 0.0025634765625, + "num_input_tokens_seen": 96938688, + "step": 1402 + }, + { + "epoch": 87.6875, + "grad_norm": 9.331684847900517, + "learning_rate": 5e-05, + "loss": 0.013, + "num_input_tokens_seen": 96997824, + "step": 1403 + }, + { + "epoch": 87.6875, + "loss": 0.013780038803815842, + "loss_ce": 0.00029126947629265487, + "loss_xval": 0.01348876953125, + "num_input_tokens_seen": 96997824, + "step": 1403 + }, + { + "epoch": 87.75, + "grad_norm": 11.815568839124595, + "learning_rate": 5e-05, + "loss": 0.0203, + "num_input_tokens_seen": 97069376, + "step": 1404 + }, + { + "epoch": 87.75, + "loss": 0.02077043429017067, + "loss_ce": 0.0003846912004519254, + "loss_xval": 0.0203857421875, + "num_input_tokens_seen": 97069376, + "step": 1404 + }, + { + "epoch": 87.8125, + "grad_norm": 10.458438509888028, + "learning_rate": 5e-05, + "loss": 0.0157, + "num_input_tokens_seen": 97141120, + "step": 1405 + }, + { + "epoch": 87.8125, + "loss": 0.01642109826207161, + "loss_ce": 0.00030781730310991406, + "loss_xval": 0.01611328125, + "num_input_tokens_seen": 97141120, + "step": 1405 + }, + { + "epoch": 87.875, + "grad_norm": 4.899426763532995, + "learning_rate": 5e-05, + "loss": 0.004, + "num_input_tokens_seen": 97212800, + "step": 1406 + }, + { + "epoch": 87.875, + "loss": 0.004273131489753723, + "loss_ce": 0.0002753287262748927, + "loss_xval": 0.003997802734375, + "num_input_tokens_seen": 97212800, + "step": 1406 + }, + { + "epoch": 87.9375, + "grad_norm": 2.572351585425516, + "learning_rate": 5e-05, + "loss": 0.0016, + "num_input_tokens_seen": 97284352, + "step": 1407 + }, + { + "epoch": 87.9375, + "loss": 0.0018727717688307166, + "loss_ce": 0.0002553401282057166, + "loss_xval": 0.001617431640625, + "num_input_tokens_seen": 97284352, + "step": 1407 + }, + { + "epoch": 88.0, + "grad_norm": 7.9539379483395205, + "learning_rate": 5e-05, + "loss": 0.0095, + "num_input_tokens_seen": 97356032, + "step": 1408 + }, + { + "epoch": 88.0, + "loss": 0.009988140314817429, + "loss_ce": 0.00028355044196359813, + "loss_xval": 0.00970458984375, + "num_input_tokens_seen": 97356032, + "step": 1408 + }, + { + "epoch": 88.0625, + "grad_norm": 8.125767744555626, + "learning_rate": 5e-05, + "loss": 0.0105, + "num_input_tokens_seen": 97427584, + "step": 1409 + }, + { + "epoch": 88.0625, + "loss": 0.010983756743371487, + "loss_ce": 0.00024156902509275824, + "loss_xval": 0.0107421875, + "num_input_tokens_seen": 97427584, + "step": 1409 + }, + { + "epoch": 88.125, + "grad_norm": 2.498880075441956, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 97499392, + "step": 1410 + }, + { + "epoch": 88.125, + "loss": 0.0017704254714772105, + "loss_ce": 0.0002369171561440453, + "loss_xval": 0.00153350830078125, + "num_input_tokens_seen": 97499392, + "step": 1410 + }, + { + "epoch": 88.1875, + "grad_norm": 4.088092554901874, + "learning_rate": 5e-05, + "loss": 0.0032, + "num_input_tokens_seen": 97558464, + "step": 1411 + }, + { + "epoch": 88.1875, + "loss": 0.0027891267091035843, + "loss_ce": 0.00021039124112576246, + "loss_xval": 0.0025787353515625, + "num_input_tokens_seen": 97558464, + "step": 1411 + }, + { + "epoch": 88.25, + "grad_norm": 7.781016703274088, + "learning_rate": 5e-05, + "loss": 0.009, + "num_input_tokens_seen": 97617472, + "step": 1412 + }, + { + "epoch": 88.25, + "loss": 0.009119339287281036, + "loss_ce": 0.00020820641657337546, + "loss_xval": 0.0089111328125, + "num_input_tokens_seen": 97617472, + "step": 1412 + }, + { + "epoch": 88.3125, + "grad_norm": 
8.188874335876337, + "learning_rate": 5e-05, + "loss": 0.01, + "num_input_tokens_seen": 97689024, + "step": 1413 + }, + { + "epoch": 88.3125, + "loss": 0.010654650628566742, + "loss_ce": 0.00021763896802440286, + "loss_xval": 0.01043701171875, + "num_input_tokens_seen": 97689024, + "step": 1413 + }, + { + "epoch": 88.375, + "grad_norm": 5.9478870558346175, + "learning_rate": 5e-05, + "loss": 0.0055, + "num_input_tokens_seen": 97760768, + "step": 1414 + }, + { + "epoch": 88.375, + "loss": 0.005853202193975449, + "loss_ce": 0.00020745031361002475, + "loss_xval": 0.005645751953125, + "num_input_tokens_seen": 97760768, + "step": 1414 + }, + { + "epoch": 88.4375, + "grad_norm": 1.475954179206284, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 97820032, + "step": 1415 + }, + { + "epoch": 88.4375, + "loss": 0.00100472301710397, + "loss_ce": 0.0001960072258953005, + "loss_xval": 0.0008087158203125, + "num_input_tokens_seen": 97820032, + "step": 1415 + }, + { + "epoch": 88.5, + "grad_norm": 4.114678583328582, + "learning_rate": 5e-05, + "loss": 0.0029, + "num_input_tokens_seen": 97891584, + "step": 1416 + }, + { + "epoch": 88.5, + "loss": 0.0029814133886247873, + "loss_ce": 0.00018905493197962642, + "loss_xval": 0.0027923583984375, + "num_input_tokens_seen": 97891584, + "step": 1416 + }, + { + "epoch": 88.5625, + "grad_norm": 8.631794997393147, + "learning_rate": 5e-05, + "loss": 0.0109, + "num_input_tokens_seen": 97963392, + "step": 1417 + }, + { + "epoch": 88.5625, + "loss": 0.010813172906637192, + "loss_ce": 0.00019305601017549634, + "loss_xval": 0.0106201171875, + "num_input_tokens_seen": 97963392, + "step": 1417 + }, + { + "epoch": 88.625, + "grad_norm": 10.541015255541664, + "learning_rate": 5e-05, + "loss": 0.0158, + "num_input_tokens_seen": 98035008, + "step": 1418 + }, + { + "epoch": 88.625, + "loss": 0.015680886805057526, + "loss_ce": 0.00017795769963413477, + "loss_xval": 0.0155029296875, + "num_input_tokens_seen": 98035008, + "step": 1418 + }, + { + "epoch": 88.6875, + "grad_norm": 9.71623274897056, + "learning_rate": 5e-05, + "loss": 0.0138, + "num_input_tokens_seen": 98106752, + "step": 1419 + }, + { + "epoch": 88.6875, + "loss": 0.014093155972659588, + "loss_ce": 0.0001771400129655376, + "loss_xval": 0.013916015625, + "num_input_tokens_seen": 98106752, + "step": 1419 + }, + { + "epoch": 88.75, + "grad_norm": 6.3053819258526875, + "learning_rate": 5e-05, + "loss": 0.0063, + "num_input_tokens_seen": 98178432, + "step": 1420 + }, + { + "epoch": 88.75, + "loss": 0.006050786003470421, + "loss_ce": 0.00016089326527435333, + "loss_xval": 0.005889892578125, + "num_input_tokens_seen": 98178432, + "step": 1420 + }, + { + "epoch": 88.8125, + "grad_norm": 1.3613681611204922, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 98249984, + "step": 1421 + }, + { + "epoch": 88.8125, + "loss": 0.0017392839072272182, + "loss_ce": 0.00014474045019596815, + "loss_xval": 0.00159454345703125, + "num_input_tokens_seen": 98249984, + "step": 1421 + }, + { + "epoch": 88.875, + "grad_norm": 2.8573323171257665, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 98321536, + "step": 1422 + }, + { + "epoch": 88.875, + "loss": 0.0017971351044252515, + "loss_ce": 0.0001415565056959167, + "loss_xval": 0.00165557861328125, + "num_input_tokens_seen": 98321536, + "step": 1422 + }, + { + "epoch": 88.9375, + "grad_norm": 4.949223939521938, + "learning_rate": 5e-05, + "loss": 0.004, + "num_input_tokens_seen": 98393088, + "step": 1423 + }, + { + "epoch": 88.9375, 
+ "loss": 0.0042897784151136875, + "loss_ce": 0.0001393879938405007, + "loss_xval": 0.004150390625, + "num_input_tokens_seen": 98393088, + "step": 1423 + }, + { + "epoch": 89.0, + "grad_norm": 4.876705980586143, + "learning_rate": 5e-05, + "loss": 0.0048, + "num_input_tokens_seen": 98464640, + "step": 1424 + }, + { + "epoch": 89.0, + "loss": 0.005188289098441601, + "loss_ce": 0.00012237107148393989, + "loss_xval": 0.00506591796875, + "num_input_tokens_seen": 98464640, + "step": 1424 + }, + { + "epoch": 89.0625, + "grad_norm": 2.4669840278626824, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_input_tokens_seen": 98536256, + "step": 1425 + }, + { + "epoch": 89.0625, + "loss": 0.0016833170084282756, + "loss_ce": 0.0001269204803975299, + "loss_xval": 0.001556396484375, + "num_input_tokens_seen": 98536256, + "step": 1425 + }, + { + "epoch": 89.125, + "grad_norm": 1.6542139949837114, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 98607872, + "step": 1426 + }, + { + "epoch": 89.125, + "loss": 0.0010792359244078398, + "loss_ce": 0.00012174686708021909, + "loss_xval": 0.000957489013671875, + "num_input_tokens_seen": 98607872, + "step": 1426 + }, + { + "epoch": 89.1875, + "grad_norm": 5.219650660467849, + "learning_rate": 5e-05, + "loss": 0.0049, + "num_input_tokens_seen": 98666944, + "step": 1427 + }, + { + "epoch": 89.1875, + "loss": 0.005092137027531862, + "loss_ce": 0.00011777185864048079, + "loss_xval": 0.004974365234375, + "num_input_tokens_seen": 98666944, + "step": 1427 + }, + { + "epoch": 89.25, + "grad_norm": 7.222756845472301, + "learning_rate": 5e-05, + "loss": 0.0078, + "num_input_tokens_seen": 98738624, + "step": 1428 + }, + { + "epoch": 89.25, + "loss": 0.008125527761876583, + "loss_ce": 0.00012992256961297244, + "loss_xval": 0.00799560546875, + "num_input_tokens_seen": 98738624, + "step": 1428 + }, + { + "epoch": 89.3125, + "grad_norm": 8.234607449616483, + "learning_rate": 5e-05, + "loss": 0.0097, + "num_input_tokens_seen": 98810240, + "step": 1429 + }, + { + "epoch": 89.3125, + "loss": 0.010082121938467026, + "loss_ce": 0.0001333911350229755, + "loss_xval": 0.00994873046875, + "num_input_tokens_seen": 98810240, + "step": 1429 + }, + { + "epoch": 89.375, + "grad_norm": 8.465055746180632, + "learning_rate": 5e-05, + "loss": 0.0105, + "num_input_tokens_seen": 98881856, + "step": 1430 + }, + { + "epoch": 89.375, + "loss": 0.01027833390980959, + "loss_ce": 0.00014649789955001324, + "loss_xval": 0.0101318359375, + "num_input_tokens_seen": 98881856, + "step": 1430 + }, + { + "epoch": 89.4375, + "grad_norm": 7.673367194062242, + "learning_rate": 5e-05, + "loss": 0.0088, + "num_input_tokens_seen": 98953536, + "step": 1431 + }, + { + "epoch": 89.4375, + "loss": 0.009366064332425594, + "loss_ce": 0.00014975547674112022, + "loss_xval": 0.00921630859375, + "num_input_tokens_seen": 98953536, + "step": 1431 + }, + { + "epoch": 89.5, + "grad_norm": 5.9578760054636515, + "learning_rate": 5e-05, + "loss": 0.0053, + "num_input_tokens_seen": 99025088, + "step": 1432 + }, + { + "epoch": 89.5, + "loss": 0.005480596330016851, + "loss_ce": 0.00017053764895536005, + "loss_xval": 0.00531005859375, + "num_input_tokens_seen": 99025088, + "step": 1432 + }, + { + "epoch": 89.5625, + "grad_norm": 3.8362523636362607, + "learning_rate": 5e-05, + "loss": 0.0024, + "num_input_tokens_seen": 99084352, + "step": 1433 + }, + { + "epoch": 89.5625, + "loss": 0.002257879823446274, + "loss_ce": 0.00018268443818669766, + "loss_xval": 0.0020751953125, + "num_input_tokens_seen": 99084352, + "step": 
1433 + }, + { + "epoch": 89.625, + "grad_norm": 1.6603316778373378, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 99143360, + "step": 1434 + }, + { + "epoch": 89.625, + "loss": 0.0008298221509903669, + "loss_ce": 0.00017750888946466148, + "loss_xval": 0.000652313232421875, + "num_input_tokens_seen": 99143360, + "step": 1434 + }, + { + "epoch": 89.6875, + "grad_norm": 0.25305967288664716, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 99202432, + "step": 1435 + }, + { + "epoch": 89.6875, + "loss": 0.0005389668513089418, + "loss_ce": 0.0001765706401783973, + "loss_xval": 0.000362396240234375, + "num_input_tokens_seen": 99202432, + "step": 1435 + }, + { + "epoch": 89.75, + "grad_norm": 1.7938476568634658, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 99274176, + "step": 1436 + }, + { + "epoch": 89.75, + "loss": 0.001010698964819312, + "loss_ce": 0.00018672439910005778, + "loss_xval": 0.000823974609375, + "num_input_tokens_seen": 99274176, + "step": 1436 + }, + { + "epoch": 89.8125, + "grad_norm": 3.1259805404663594, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 99345856, + "step": 1437 + }, + { + "epoch": 89.8125, + "loss": 0.001759349019266665, + "loss_ce": 0.00018769371672533453, + "loss_xval": 0.0015716552734375, + "num_input_tokens_seen": 99345856, + "step": 1437 + }, + { + "epoch": 89.875, + "grad_norm": 4.692768165309171, + "learning_rate": 5e-05, + "loss": 0.0036, + "num_input_tokens_seen": 99417408, + "step": 1438 + }, + { + "epoch": 89.875, + "loss": 0.0036474072840064764, + "loss_ce": 0.00016840336320456117, + "loss_xval": 0.00347900390625, + "num_input_tokens_seen": 99417408, + "step": 1438 + }, + { + "epoch": 89.9375, + "grad_norm": 6.687045552794276, + "learning_rate": 5e-05, + "loss": 0.0067, + "num_input_tokens_seen": 99489152, + "step": 1439 + }, + { + "epoch": 89.9375, + "loss": 0.006873737554997206, + "loss_ce": 0.00015987036749720573, + "loss_xval": 0.0067138671875, + "num_input_tokens_seen": 99489152, + "step": 1439 + }, + { + "epoch": 90.0, + "grad_norm": 9.163106584929274, + "learning_rate": 5e-05, + "loss": 0.0123, + "num_input_tokens_seen": 99560832, + "step": 1440 + }, + { + "epoch": 90.0, + "loss": 0.011944948695600033, + "loss_ce": 0.00016516332107130438, + "loss_xval": 0.01177978515625, + "num_input_tokens_seen": 99560832, + "step": 1440 + }, + { + "epoch": 90.0625, + "grad_norm": 12.669250965673333, + "learning_rate": 5e-05, + "loss": 0.0229, + "num_input_tokens_seen": 99619968, + "step": 1441 + }, + { + "epoch": 90.0625, + "loss": 0.02285129763185978, + "loss_ce": 0.00014621867740061134, + "loss_xval": 0.022705078125, + "num_input_tokens_seen": 99619968, + "step": 1441 + }, + { + "epoch": 90.125, + "grad_norm": 16.549156532771427, + "learning_rate": 5e-05, + "loss": 0.0393, + "num_input_tokens_seen": 99691520, + "step": 1442 + }, + { + "epoch": 90.125, + "loss": 0.03847173973917961, + "loss_ce": 0.00014166282198857516, + "loss_xval": 0.038330078125, + "num_input_tokens_seen": 99691520, + "step": 1442 + }, + { + "epoch": 90.1875, + "grad_norm": 18.803814282510302, + "learning_rate": 5e-05, + "loss": 0.0506, + "num_input_tokens_seen": 99763264, + "step": 1443 + }, + { + "epoch": 90.1875, + "loss": 0.0509246364235878, + "loss_ce": 0.00014338496839627624, + "loss_xval": 0.05078125, + "num_input_tokens_seen": 99763264, + "step": 1443 + }, + { + "epoch": 90.25, + "grad_norm": 16.951817413186063, + "learning_rate": 5e-05, + "loss": 0.0417, + "num_input_tokens_seen": 
99822400, + "step": 1444 + }, + { + "epoch": 90.25, + "loss": 0.039222631603479385, + "loss_ce": 0.00016012991545721889, + "loss_xval": 0.0390625, + "num_input_tokens_seen": 99822400, + "step": 1444 + }, + { + "epoch": 90.3125, + "grad_norm": 10.18017997837843, + "learning_rate": 5e-05, + "loss": 0.0156, + "num_input_tokens_seen": 99893952, + "step": 1445 + }, + { + "epoch": 90.3125, + "loss": 0.016544576734304428, + "loss_ce": 0.00018715558690018952, + "loss_xval": 0.016357421875, + "num_input_tokens_seen": 99893952, + "step": 1445 + }, + { + "epoch": 90.375, + "grad_norm": 0.6092748133457243, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 99965632, + "step": 1446 + }, + { + "epoch": 90.375, + "loss": 0.0005741502973251045, + "loss_ce": 0.0002155687689082697, + "loss_xval": 0.00035858154296875, + "num_input_tokens_seen": 99965632, + "step": 1446 + }, + { + "epoch": 90.4375, + "grad_norm": 8.441956699548394, + "learning_rate": 5e-05, + "loss": 0.0112, + "num_input_tokens_seen": 100037248, + "step": 1447 + }, + { + "epoch": 90.4375, + "loss": 0.012076275423169136, + "loss_ce": 0.0002354555472265929, + "loss_xval": 0.0118408203125, + "num_input_tokens_seen": 100037248, + "step": 1447 + }, + { + "epoch": 90.5, + "grad_norm": 13.906676576538521, + "learning_rate": 5e-05, + "loss": 0.0294, + "num_input_tokens_seen": 100108928, + "step": 1448 + }, + { + "epoch": 90.5, + "loss": 0.02933783084154129, + "loss_ce": 0.00028509661206044257, + "loss_xval": 0.029052734375, + "num_input_tokens_seen": 100108928, + "step": 1448 + }, + { + "epoch": 90.5625, + "grad_norm": 13.606772895897334, + "learning_rate": 5e-05, + "loss": 0.0282, + "num_input_tokens_seen": 100180608, + "step": 1449 + }, + { + "epoch": 90.5625, + "loss": 0.027996975928544998, + "loss_ce": 0.0002870155731216073, + "loss_xval": 0.0277099609375, + "num_input_tokens_seen": 100180608, + "step": 1449 + }, + { + "epoch": 90.625, + "grad_norm": 7.634030291353978, + "learning_rate": 5e-05, + "loss": 0.0096, + "num_input_tokens_seen": 100252224, + "step": 1450 + }, + { + "epoch": 90.625, + "loss": 0.009694787673652172, + "loss_ce": 0.00029537358204834163, + "loss_xval": 0.0093994140625, + "num_input_tokens_seen": 100252224, + "step": 1450 + }, + { + "epoch": 90.6875, + "grad_norm": 1.3682169589201283, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 100311232, + "step": 1451 + }, + { + "epoch": 90.6875, + "loss": 0.0008320095366798341, + "loss_ce": 0.00029795191949233413, + "loss_xval": 0.0005340576171875, + "num_input_tokens_seen": 100311232, + "step": 1451 + }, + { + "epoch": 90.75, + "grad_norm": 9.032418326904276, + "learning_rate": 5e-05, + "loss": 0.0128, + "num_input_tokens_seen": 100370432, + "step": 1452 + }, + { + "epoch": 90.75, + "loss": 0.013181054033339024, + "loss_ce": 0.0003026359772775322, + "loss_xval": 0.01287841796875, + "num_input_tokens_seen": 100370432, + "step": 1452 + }, + { + "epoch": 90.8125, + "grad_norm": 11.59426157318603, + "learning_rate": 5e-05, + "loss": 0.0208, + "num_input_tokens_seen": 100442112, + "step": 1453 + }, + { + "epoch": 90.8125, + "loss": 0.022128943353891373, + "loss_ce": 0.0002783581439871341, + "loss_xval": 0.0218505859375, + "num_input_tokens_seen": 100442112, + "step": 1453 + }, + { + "epoch": 90.875, + "grad_norm": 8.386193910116718, + "learning_rate": 5e-05, + "loss": 0.0112, + "num_input_tokens_seen": 100501248, + "step": 1454 + }, + { + "epoch": 90.875, + "loss": 0.010896342806518078, + "loss_ce": 0.0002762254443950951, + "loss_xval": 
0.0106201171875, + "num_input_tokens_seen": 100501248, + "step": 1454 + }, + { + "epoch": 90.9375, + "grad_norm": 2.1259181598992742, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 100572800, + "step": 1455 + }, + { + "epoch": 90.9375, + "loss": 0.0011982765281572938, + "loss_ce": 0.00027130506350658834, + "loss_xval": 0.000926971435546875, + "num_input_tokens_seen": 100572800, + "step": 1455 + }, + { + "epoch": 91.0, + "grad_norm": 3.138463209234726, + "learning_rate": 5e-05, + "loss": 0.0021, + "num_input_tokens_seen": 100644480, + "step": 1456 + }, + { + "epoch": 91.0, + "loss": 0.0022985711693763733, + "loss_ce": 0.0002538934932090342, + "loss_xval": 0.002044677734375, + "num_input_tokens_seen": 100644480, + "step": 1456 + }, + { + "epoch": 91.0625, + "grad_norm": 4.981874046721102, + "learning_rate": 5e-05, + "loss": 0.0042, + "num_input_tokens_seen": 100716096, + "step": 1457 + }, + { + "epoch": 91.0625, + "loss": 0.004420939367264509, + "loss_ce": 0.00024003109137993306, + "loss_xval": 0.004180908203125, + "num_input_tokens_seen": 100716096, + "step": 1457 + }, + { + "epoch": 91.125, + "grad_norm": 3.6465351426806363, + "learning_rate": 5e-05, + "loss": 0.0024, + "num_input_tokens_seen": 100787712, + "step": 1458 + }, + { + "epoch": 91.125, + "loss": 0.002575915539637208, + "loss_ce": 0.00022606206766795367, + "loss_xval": 0.002349853515625, + "num_input_tokens_seen": 100787712, + "step": 1458 + }, + { + "epoch": 91.1875, + "grad_norm": 0.5411738473532007, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 100859264, + "step": 1459 + }, + { + "epoch": 91.1875, + "loss": 0.0013069550041109324, + "loss_ce": 0.0001930634316522628, + "loss_xval": 0.0011138916015625, + "num_input_tokens_seen": 100859264, + "step": 1459 + }, + { + "epoch": 91.25, + "grad_norm": 3.3943456146166944, + "learning_rate": 5e-05, + "loss": 0.0021, + "num_input_tokens_seen": 100930816, + "step": 1460 + }, + { + "epoch": 91.25, + "loss": 0.0021058768033981323, + "loss_ce": 0.00020615763787645847, + "loss_xval": 0.00189971923828125, + "num_input_tokens_seen": 100930816, + "step": 1460 + }, + { + "epoch": 91.3125, + "grad_norm": 6.824281318521774, + "learning_rate": 5e-05, + "loss": 0.0071, + "num_input_tokens_seen": 101002432, + "step": 1461 + }, + { + "epoch": 91.3125, + "loss": 0.006831796374171972, + "loss_ce": 0.0001789645612007007, + "loss_xval": 0.00665283203125, + "num_input_tokens_seen": 101002432, + "step": 1461 + }, + { + "epoch": 91.375, + "grad_norm": 8.703061439614308, + "learning_rate": 5e-05, + "loss": 0.0119, + "num_input_tokens_seen": 101061632, + "step": 1462 + }, + { + "epoch": 91.375, + "loss": 0.011036021634936333, + "loss_ce": 0.00017176421533804387, + "loss_xval": 0.0108642578125, + "num_input_tokens_seen": 101061632, + "step": 1462 + }, + { + "epoch": 91.4375, + "grad_norm": 8.61245755703454, + "learning_rate": 5e-05, + "loss": 0.0116, + "num_input_tokens_seen": 101120704, + "step": 1463 + }, + { + "epoch": 91.4375, + "loss": 0.011173286475241184, + "loss_ce": 0.0001869582774816081, + "loss_xval": 0.010986328125, + "num_input_tokens_seen": 101120704, + "step": 1463 + }, + { + "epoch": 91.5, + "grad_norm": 6.233009274737832, + "learning_rate": 5e-05, + "loss": 0.0061, + "num_input_tokens_seen": 101192256, + "step": 1464 + }, + { + "epoch": 91.5, + "loss": 0.005891819950193167, + "loss_ce": 0.000185032666195184, + "loss_xval": 0.005706787109375, + "num_input_tokens_seen": 101192256, + "step": 1464 + }, + { + "epoch": 91.5625, + "grad_norm": 
2.021738887516537, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 101263872, + "step": 1465 + }, + { + "epoch": 91.5625, + "loss": 0.0013423706404864788, + "loss_ce": 0.00020559091353788972, + "loss_xval": 0.00113677978515625, + "num_input_tokens_seen": 101263872, + "step": 1465 + }, + { + "epoch": 91.625, + "grad_norm": 2.568438390324364, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 101335424, + "step": 1466 + }, + { + "epoch": 91.625, + "loss": 0.0018030558712780476, + "loss_ce": 0.00021614175057038665, + "loss_xval": 0.0015869140625, + "num_input_tokens_seen": 101335424, + "step": 1466 + }, + { + "epoch": 91.6875, + "grad_norm": 6.616184071416912, + "learning_rate": 5e-05, + "loss": 0.0073, + "num_input_tokens_seen": 101407040, + "step": 1467 + }, + { + "epoch": 91.6875, + "loss": 0.007182391360402107, + "loss_ce": 0.00022438356245402247, + "loss_xval": 0.0069580078125, + "num_input_tokens_seen": 101407040, + "step": 1467 + }, + { + "epoch": 91.75, + "grad_norm": 9.052048701564082, + "learning_rate": 5e-05, + "loss": 0.0133, + "num_input_tokens_seen": 101478784, + "step": 1468 + }, + { + "epoch": 91.75, + "loss": 0.01281468290835619, + "loss_ce": 0.00024144031340256333, + "loss_xval": 0.0125732421875, + "num_input_tokens_seen": 101478784, + "step": 1468 + }, + { + "epoch": 91.8125, + "grad_norm": 8.216207441201835, + "learning_rate": 5e-05, + "loss": 0.0112, + "num_input_tokens_seen": 101550336, + "step": 1469 + }, + { + "epoch": 91.8125, + "loss": 0.012428391724824905, + "loss_ce": 0.00022136066399980336, + "loss_xval": 0.01220703125, + "num_input_tokens_seen": 101550336, + "step": 1469 + }, + { + "epoch": 91.875, + "grad_norm": 4.215063838216148, + "learning_rate": 5e-05, + "loss": 0.0032, + "num_input_tokens_seen": 101622080, + "step": 1470 + }, + { + "epoch": 91.875, + "loss": 0.0033252511639147997, + "loss_ce": 0.00022771699877921492, + "loss_xval": 0.0030975341796875, + "num_input_tokens_seen": 101622080, + "step": 1470 + }, + { + "epoch": 91.9375, + "grad_norm": 0.9309762576951055, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 101693760, + "step": 1471 + }, + { + "epoch": 91.9375, + "loss": 0.00047848786925897, + "loss_ce": 0.0002133663947461173, + "loss_xval": 0.0002651214599609375, + "num_input_tokens_seen": 101693760, + "step": 1471 + }, + { + "epoch": 92.0, + "grad_norm": 5.291556637101302, + "learning_rate": 5e-05, + "loss": 0.0048, + "num_input_tokens_seen": 101765312, + "step": 1472 + }, + { + "epoch": 92.0, + "loss": 0.005367064382880926, + "loss_ce": 0.0002095934614771977, + "loss_xval": 0.005157470703125, + "num_input_tokens_seen": 101765312, + "step": 1472 + }, + { + "epoch": 92.0625, + "grad_norm": 8.002296405575832, + "learning_rate": 5e-05, + "loss": 0.0104, + "num_input_tokens_seen": 101837056, + "step": 1473 + }, + { + "epoch": 92.0625, + "loss": 0.010928795672953129, + "loss_ce": 0.0001866081147454679, + "loss_xval": 0.0107421875, + "num_input_tokens_seen": 101837056, + "step": 1473 + }, + { + "epoch": 92.125, + "grad_norm": 8.903838830767665, + "learning_rate": 5e-05, + "loss": 0.0126, + "num_input_tokens_seen": 101908672, + "step": 1474 + }, + { + "epoch": 92.125, + "loss": 0.01206380594521761, + "loss_ce": 0.0001619504182599485, + "loss_xval": 0.01190185546875, + "num_input_tokens_seen": 101908672, + "step": 1474 + }, + { + "epoch": 92.1875, + "grad_norm": 7.604459502826674, + "learning_rate": 5e-05, + "loss": 0.0093, + "num_input_tokens_seen": 101980416, + "step": 1475 + }, + { + 
"epoch": 92.1875, + "loss": 0.009634369052946568, + "loss_ce": 0.00017392008157912642, + "loss_xval": 0.00946044921875, + "num_input_tokens_seen": 101980416, + "step": 1475 + }, + { + "epoch": 92.25, + "grad_norm": 4.488691635855987, + "learning_rate": 5e-05, + "loss": 0.0034, + "num_input_tokens_seen": 102052032, + "step": 1476 + }, + { + "epoch": 92.25, + "loss": 0.0033483817242085934, + "loss_ce": 0.0001592948829056695, + "loss_xval": 0.0031890869140625, + "num_input_tokens_seen": 102052032, + "step": 1476 + }, + { + "epoch": 92.3125, + "grad_norm": 1.0491006631465516, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 102123712, + "step": 1477 + }, + { + "epoch": 92.3125, + "loss": 0.00053489237325266, + "loss_ce": 0.0001591446780366823, + "loss_xval": 0.0003757476806640625, + "num_input_tokens_seen": 102123712, + "step": 1477 + }, + { + "epoch": 92.375, + "grad_norm": 1.5456909708775723, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 102170240, + "step": 1478 + }, + { + "epoch": 92.375, + "loss": 0.0006626353715546429, + "loss_ce": 0.00015528064977843314, + "loss_xval": 0.000507354736328125, + "num_input_tokens_seen": 102170240, + "step": 1478 + }, + { + "epoch": 92.4375, + "grad_norm": 3.0534220131793335, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_input_tokens_seen": 102229376, + "step": 1479 + }, + { + "epoch": 92.4375, + "loss": 0.0015860440907999873, + "loss_ce": 0.00015171787526924163, + "loss_xval": 0.001434326171875, + "num_input_tokens_seen": 102229376, + "step": 1479 + }, + { + "epoch": 92.5, + "grad_norm": 3.6563123643106765, + "learning_rate": 5e-05, + "loss": 0.0024, + "num_input_tokens_seen": 102288512, + "step": 1480 + }, + { + "epoch": 92.5, + "loss": 0.0022240960970520973, + "loss_ce": 0.00013364202459342778, + "loss_xval": 0.0020904541015625, + "num_input_tokens_seen": 102288512, + "step": 1480 + }, + { + "epoch": 92.5625, + "grad_norm": 3.5907319476119204, + "learning_rate": 5e-05, + "loss": 0.0023, + "num_input_tokens_seen": 102360064, + "step": 1481 + }, + { + "epoch": 92.5625, + "loss": 0.002281162654981017, + "loss_ce": 0.00014493215712718666, + "loss_xval": 0.00213623046875, + "num_input_tokens_seen": 102360064, + "step": 1481 + }, + { + "epoch": 92.625, + "grad_norm": 3.197964126736755, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 102419072, + "step": 1482 + }, + { + "epoch": 92.625, + "loss": 0.0018821260891854763, + "loss_ce": 0.00012736536154989153, + "loss_xval": 0.0017547607421875, + "num_input_tokens_seen": 102419072, + "step": 1482 + }, + { + "epoch": 92.6875, + "grad_norm": 2.5057744516558538, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_input_tokens_seen": 102490624, + "step": 1483 + }, + { + "epoch": 92.6875, + "loss": 0.0014125545276328921, + "loss_ce": 0.00013081621727906168, + "loss_xval": 0.00128173828125, + "num_input_tokens_seen": 102490624, + "step": 1483 + }, + { + "epoch": 92.75, + "grad_norm": 2.006927847620975, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 102562368, + "step": 1484 + }, + { + "epoch": 92.75, + "loss": 0.0013389154337346554, + "loss_ce": 0.00012584170326590538, + "loss_xval": 0.00121307373046875, + "num_input_tokens_seen": 102562368, + "step": 1484 + }, + { + "epoch": 92.8125, + "grad_norm": 2.350273933391494, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 102608960, + "step": 1485 + }, + { + "epoch": 92.8125, + "loss": 0.0011312231654301286, + "loss_ce": 0.00010888425458688289, + "loss_xval": 
0.0010223388671875, + "num_input_tokens_seen": 102608960, + "step": 1485 + }, + { + "epoch": 92.875, + "grad_norm": 3.297160408486596, + "learning_rate": 5e-05, + "loss": 0.002, + "num_input_tokens_seen": 102680576, + "step": 1486 + }, + { + "epoch": 92.875, + "loss": 0.0019285711459815502, + "loss_ce": 0.00011277526937192306, + "loss_xval": 0.0018157958984375, + "num_input_tokens_seen": 102680576, + "step": 1486 + }, + { + "epoch": 92.9375, + "grad_norm": 4.720119891390996, + "learning_rate": 5e-05, + "loss": 0.0037, + "num_input_tokens_seen": 102752192, + "step": 1487 + }, + { + "epoch": 92.9375, + "loss": 0.0036649664398282766, + "loss_ce": 0.00010966858826577663, + "loss_xval": 0.0035552978515625, + "num_input_tokens_seen": 102752192, + "step": 1487 + }, + { + "epoch": 93.0, + "grad_norm": 6.9058946446826495, + "learning_rate": 5e-05, + "loss": 0.0075, + "num_input_tokens_seen": 102823872, + "step": 1488 + }, + { + "epoch": 93.0, + "loss": 0.007736025843769312, + "loss_ce": 0.00010663115972420201, + "loss_xval": 0.00762939453125, + "num_input_tokens_seen": 102823872, + "step": 1488 + }, + { + "epoch": 93.0625, + "grad_norm": 9.544356321091449, + "learning_rate": 5e-05, + "loss": 0.0142, + "num_input_tokens_seen": 102895616, + "step": 1489 + }, + { + "epoch": 93.0625, + "loss": 0.014089690521359444, + "loss_ce": 0.0001126401184592396, + "loss_xval": 0.01397705078125, + "num_input_tokens_seen": 102895616, + "step": 1489 + }, + { + "epoch": 93.125, + "grad_norm": 12.144914124818996, + "learning_rate": 5e-05, + "loss": 0.0237, + "num_input_tokens_seen": 102967296, + "step": 1490 + }, + { + "epoch": 93.125, + "loss": 0.02307068556547165, + "loss_ce": 0.00012146684457547963, + "loss_xval": 0.02294921875, + "num_input_tokens_seen": 102967296, + "step": 1490 + }, + { + "epoch": 93.1875, + "grad_norm": 13.277750354610223, + "learning_rate": 5e-05, + "loss": 0.0293, + "num_input_tokens_seen": 103038912, + "step": 1491 + }, + { + "epoch": 93.1875, + "loss": 0.02955476939678192, + "loss_ce": 0.000135824506287463, + "loss_xval": 0.0294189453125, + "num_input_tokens_seen": 103038912, + "step": 1491 + }, + { + "epoch": 93.25, + "grad_norm": 11.655739983208555, + "learning_rate": 5e-05, + "loss": 0.0228, + "num_input_tokens_seen": 103110464, + "step": 1492 + }, + { + "epoch": 93.25, + "loss": 0.02409106306731701, + "loss_ce": 0.000165281118825078, + "loss_xval": 0.02392578125, + "num_input_tokens_seen": 103110464, + "step": 1492 + }, + { + "epoch": 93.3125, + "grad_norm": 8.403740244338909, + "learning_rate": 5e-05, + "loss": 0.012, + "num_input_tokens_seen": 103182080, + "step": 1493 + }, + { + "epoch": 93.3125, + "loss": 0.012799791991710663, + "loss_ce": 0.00022654981876257807, + "loss_xval": 0.0125732421875, + "num_input_tokens_seen": 103182080, + "step": 1493 + }, + { + "epoch": 93.375, + "grad_norm": 4.172359561436788, + "learning_rate": 5e-05, + "loss": 0.0038, + "num_input_tokens_seen": 103253760, + "step": 1494 + }, + { + "epoch": 93.375, + "loss": 0.004045870620757341, + "loss_ce": 0.00021591474069282413, + "loss_xval": 0.0038299560546875, + "num_input_tokens_seen": 103253760, + "step": 1494 + }, + { + "epoch": 93.4375, + "grad_norm": 0.447412754121555, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 103325440, + "step": 1495 + }, + { + "epoch": 93.4375, + "loss": 0.0008496865048073232, + "loss_ce": 0.00022407615324482322, + "loss_xval": 0.0006256103515625, + "num_input_tokens_seen": 103325440, + "step": 1495 + }, + { + "epoch": 93.5, + "grad_norm": 3.8982735632213625, 
+ "learning_rate": 5e-05, + "loss": 0.0029, + "num_input_tokens_seen": 103397120, + "step": 1496 + }, + { + "epoch": 93.5, + "loss": 0.002992264460772276, + "loss_ce": 0.00021516485139727592, + "loss_xval": 0.002777099609375, + "num_input_tokens_seen": 103397120, + "step": 1496 + }, + { + "epoch": 93.5625, + "grad_norm": 6.281101491606525, + "learning_rate": 5e-05, + "loss": 0.007, + "num_input_tokens_seen": 103456320, + "step": 1497 + }, + { + "epoch": 93.5625, + "loss": 0.0074848029762506485, + "loss_ce": 0.00019110173161607236, + "loss_xval": 0.007293701171875, + "num_input_tokens_seen": 103456320, + "step": 1497 + }, + { + "epoch": 93.625, + "grad_norm": 7.413049338313138, + "learning_rate": 5e-05, + "loss": 0.0096, + "num_input_tokens_seen": 103527936, + "step": 1498 + }, + { + "epoch": 93.625, + "loss": 0.00933548528701067, + "loss_ce": 0.00018021151481661946, + "loss_xval": 0.0091552734375, + "num_input_tokens_seen": 103527936, + "step": 1498 + }, + { + "epoch": 93.6875, + "grad_norm": 7.626232550227615, + "learning_rate": 5e-05, + "loss": 0.01, + "num_input_tokens_seen": 103599680, + "step": 1499 + }, + { + "epoch": 93.6875, + "loss": 0.010206174105405807, + "loss_ce": 0.00019640830578282475, + "loss_xval": 0.010009765625, + "num_input_tokens_seen": 103599680, + "step": 1499 + }, + { + "epoch": 93.75, + "grad_norm": 7.359830796258682, + "learning_rate": 5e-05, + "loss": 0.0094, + "num_input_tokens_seen": 103671424, + "step": 1500 + }, + { + "epoch": 93.75, + "eval_synth_IoU": 0.014503192156553268, + "eval_synth_MAE_x": 0.064971923828125, + "eval_synth_MAE_y": 0.065216064453125, + "eval_synth_NUM_probability": 0.9975821077823639, + "eval_synth_inside_bbox": 0.0625, + "eval_synth_loss": 0.006595642305910587, + "eval_synth_loss_ce": 0.00020220893929945305, + "eval_synth_loss_xval": 0.0063934326171875, + "eval_synth_runtime": 53.8671, + "eval_synth_samples_per_second": 2.376, + "eval_synth_steps_per_second": 0.074, + "num_input_tokens_seen": 103671424, + "step": 1500 + }, + { + "epoch": 93.75, + "loss": 0.0063739619217813015, + "loss_ce": 0.0002094111987389624, + "loss_xval": 0.00616455078125, + "num_input_tokens_seen": 103671424, + "step": 1500 + }, + { + "epoch": 93.8125, + "grad_norm": 5.815031205342289, + "learning_rate": 5e-05, + "loss": 0.0061, + "num_input_tokens_seen": 103743104, + "step": 1501 + }, + { + "epoch": 93.8125, + "loss": 0.005885062273591757, + "loss_ce": 0.00020879291696473956, + "loss_xval": 0.00567626953125, + "num_input_tokens_seen": 103743104, + "step": 1501 + }, + { + "epoch": 93.875, + "grad_norm": 2.802506755096174, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_input_tokens_seen": 103814720, + "step": 1502 + }, + { + "epoch": 93.875, + "loss": 0.0018814080394804478, + "loss_ce": 0.00021819998801220208, + "loss_xval": 0.0016632080078125, + "num_input_tokens_seen": 103814720, + "step": 1502 + }, + { + "epoch": 93.9375, + "grad_norm": 0.7991744215551432, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 103886400, + "step": 1503 + }, + { + "epoch": 93.9375, + "loss": 0.0005315585294738412, + "loss_ce": 0.0002130313077941537, + "loss_xval": 0.0003185272216796875, + "num_input_tokens_seen": 103886400, + "step": 1503 + }, + { + "epoch": 94.0, + "grad_norm": 4.320325374935247, + "learning_rate": 5e-05, + "loss": 0.0036, + "num_input_tokens_seen": 103957952, + "step": 1504 + }, + { + "epoch": 94.0, + "loss": 0.0035868578124791384, + "loss_ce": 0.00021466535690706223, + "loss_xval": 0.0033721923828125, + "num_input_tokens_seen": 103957952, + 
"step": 1504 + }, + { + "epoch": 94.0625, + "grad_norm": 7.803107358622946, + "learning_rate": 5e-05, + "loss": 0.0105, + "num_input_tokens_seen": 104017152, + "step": 1505 + }, + { + "epoch": 94.0625, + "loss": 0.010636304505169392, + "loss_ce": 0.00019929290283471346, + "loss_xval": 0.01043701171875, + "num_input_tokens_seen": 104017152, + "step": 1505 + }, + { + "epoch": 94.125, + "grad_norm": 11.06673981346744, + "learning_rate": 5e-05, + "loss": 0.0201, + "num_input_tokens_seen": 104088832, + "step": 1506 + }, + { + "epoch": 94.125, + "loss": 0.0195931363850832, + "loss_ce": 0.00018395722145214677, + "loss_xval": 0.0194091796875, + "num_input_tokens_seen": 104088832, + "step": 1506 + }, + { + "epoch": 94.1875, + "grad_norm": 12.747970197272299, + "learning_rate": 5e-05, + "loss": 0.0263, + "num_input_tokens_seen": 104160576, + "step": 1507 + }, + { + "epoch": 94.1875, + "loss": 0.02580292895436287, + "loss_ce": 0.00016816369316074997, + "loss_xval": 0.025634765625, + "num_input_tokens_seen": 104160576, + "step": 1507 + }, + { + "epoch": 94.25, + "grad_norm": 11.777468484766578, + "learning_rate": 5e-05, + "loss": 0.0234, + "num_input_tokens_seen": 104232192, + "step": 1508 + }, + { + "epoch": 94.25, + "loss": 0.02337353304028511, + "loss_ce": 0.0001801743928808719, + "loss_xval": 0.023193359375, + "num_input_tokens_seen": 104232192, + "step": 1508 + }, + { + "epoch": 94.3125, + "grad_norm": 8.40786241418314, + "learning_rate": 5e-05, + "loss": 0.0126, + "num_input_tokens_seen": 104291264, + "step": 1509 + }, + { + "epoch": 94.3125, + "loss": 0.014077544212341309, + "loss_ce": 0.00016152839816641062, + "loss_xval": 0.013916015625, + "num_input_tokens_seen": 104291264, + "step": 1509 + }, + { + "epoch": 94.375, + "grad_norm": 3.4373041650499183, + "learning_rate": 5e-05, + "loss": 0.0023, + "num_input_tokens_seen": 104350336, + "step": 1510 + }, + { + "epoch": 94.375, + "loss": 0.0023855778854340315, + "loss_ce": 0.0002035711077041924, + "loss_xval": 0.0021820068359375, + "num_input_tokens_seen": 104350336, + "step": 1510 + }, + { + "epoch": 94.4375, + "grad_norm": 1.8541259301263842, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 104421888, + "step": 1511 + }, + { + "epoch": 94.4375, + "loss": 0.0016115899197757244, + "loss_ce": 0.0002382989041507244, + "loss_xval": 0.001373291015625, + "num_input_tokens_seen": 104421888, + "step": 1511 + }, + { + "epoch": 94.5, + "grad_norm": 6.309518960354388, + "learning_rate": 5e-05, + "loss": 0.0083, + "num_input_tokens_seen": 104493440, + "step": 1512 + }, + { + "epoch": 94.5, + "loss": 0.00801610667258501, + "loss_ce": 0.0002646417706273496, + "loss_xval": 0.00775146484375, + "num_input_tokens_seen": 104493440, + "step": 1512 + }, + { + "epoch": 94.5625, + "grad_norm": 9.278758980180998, + "learning_rate": 5e-05, + "loss": 0.0152, + "num_input_tokens_seen": 104552576, + "step": 1513 + }, + { + "epoch": 94.5625, + "loss": 0.014277543872594833, + "loss_ce": 0.0002394582115812227, + "loss_xval": 0.0140380859375, + "num_input_tokens_seen": 104552576, + "step": 1513 + }, + { + "epoch": 94.625, + "grad_norm": 10.669631347212485, + "learning_rate": 5e-05, + "loss": 0.0201, + "num_input_tokens_seen": 104624256, + "step": 1514 + }, + { + "epoch": 94.625, + "loss": 0.01994253136217594, + "loss_ce": 0.00028921186458319426, + "loss_xval": 0.0196533203125, + "num_input_tokens_seen": 104624256, + "step": 1514 + }, + { + "epoch": 94.6875, + "grad_norm": 9.71471363982105, + "learning_rate": 5e-05, + "loss": 0.0167, + 
"num_input_tokens_seen": 104695808, + "step": 1515 + }, + { + "epoch": 94.6875, + "loss": 0.016933368518948555, + "loss_ce": 0.0002097350952681154, + "loss_xval": 0.0167236328125, + "num_input_tokens_seen": 104695808, + "step": 1515 + }, + { + "epoch": 94.75, + "grad_norm": 5.316889455778196, + "learning_rate": 5e-05, + "loss": 0.0056, + "num_input_tokens_seen": 104767488, + "step": 1516 + }, + { + "epoch": 94.75, + "loss": 0.00562123442068696, + "loss_ce": 0.00018910533981397748, + "loss_xval": 0.00543212890625, + "num_input_tokens_seen": 104767488, + "step": 1516 + }, + { + "epoch": 94.8125, + "grad_norm": 0.8871147780921804, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 104839168, + "step": 1517 + }, + { + "epoch": 94.8125, + "loss": 0.000770560756791383, + "loss_ce": 0.00017928268061950803, + "loss_xval": 0.000591278076171875, + "num_input_tokens_seen": 104839168, + "step": 1517 + }, + { + "epoch": 94.875, + "grad_norm": 6.706770298722494, + "learning_rate": 5e-05, + "loss": 0.0087, + "num_input_tokens_seen": 104910720, + "step": 1518 + }, + { + "epoch": 94.875, + "loss": 0.007364541292190552, + "loss_ce": 0.0001623926218599081, + "loss_xval": 0.0072021484375, + "num_input_tokens_seen": 104910720, + "step": 1518 + }, + { + "epoch": 94.9375, + "grad_norm": 11.540691922864541, + "learning_rate": 5e-05, + "loss": 0.0225, + "num_input_tokens_seen": 104982336, + "step": 1519 + }, + { + "epoch": 94.9375, + "loss": 0.022339239716529846, + "loss_ce": 0.0001224419247591868, + "loss_xval": 0.022216796875, + "num_input_tokens_seen": 104982336, + "step": 1519 + }, + { + "epoch": 95.0, + "grad_norm": 13.257087366622384, + "learning_rate": 5e-05, + "loss": 0.0303, + "num_input_tokens_seen": 105053888, + "step": 1520 + }, + { + "epoch": 95.0, + "loss": 0.030590539798140526, + "loss_ce": 0.00019503226212691516, + "loss_xval": 0.0303955078125, + "num_input_tokens_seen": 105053888, + "step": 1520 + }, + { + "epoch": 95.0625, + "grad_norm": 11.574854226916827, + "learning_rate": 5e-05, + "loss": 0.0233, + "num_input_tokens_seen": 105125504, + "step": 1521 + }, + { + "epoch": 95.0625, + "loss": 0.022616026923060417, + "loss_ce": 0.00027716063777916133, + "loss_xval": 0.0223388671875, + "num_input_tokens_seen": 105125504, + "step": 1521 + }, + { + "epoch": 95.125, + "grad_norm": 6.765160516068692, + "learning_rate": 5e-05, + "loss": 0.0081, + "num_input_tokens_seen": 105197056, + "step": 1522 + }, + { + "epoch": 95.125, + "loss": 0.008212330751121044, + "loss_ce": 0.00021672547154594213, + "loss_xval": 0.00799560546875, + "num_input_tokens_seen": 105197056, + "step": 1522 + }, + { + "epoch": 95.1875, + "grad_norm": 0.9655238738340344, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 105268672, + "step": 1523 + }, + { + "epoch": 95.1875, + "loss": 0.0006662615924142301, + "loss_ce": 0.00021231263235677034, + "loss_xval": 0.000453948974609375, + "num_input_tokens_seen": 105268672, + "step": 1523 + }, + { + "epoch": 95.25, + "grad_norm": 7.545374574995911, + "learning_rate": 5e-05, + "loss": 0.01, + "num_input_tokens_seen": 105340288, + "step": 1524 + }, + { + "epoch": 95.25, + "loss": 0.010226485319435596, + "loss_ce": 0.00021672008733730763, + "loss_xval": 0.010009765625, + "num_input_tokens_seen": 105340288, + "step": 1524 + }, + { + "epoch": 95.3125, + "grad_norm": 9.974132967357392, + "learning_rate": 5e-05, + "loss": 0.0166, + "num_input_tokens_seen": 105411904, + "step": 1525 + }, + { + "epoch": 95.3125, + "loss": 0.016069557517766953, + "loss_ce": 
0.00020041617972310632, + "loss_xval": 0.015869140625, + "num_input_tokens_seen": 105411904, + "step": 1525 + }, + { + "epoch": 95.375, + "grad_norm": 8.400691776915371, + "learning_rate": 5e-05, + "loss": 0.0124, + "num_input_tokens_seen": 105483520, + "step": 1526 + }, + { + "epoch": 95.375, + "loss": 0.012161885388195515, + "loss_ce": 0.00019899505423381925, + "loss_xval": 0.011962890625, + "num_input_tokens_seen": 105483520, + "step": 1526 + }, + { + "epoch": 95.4375, + "grad_norm": 3.1287308542841585, + "learning_rate": 5e-05, + "loss": 0.0024, + "num_input_tokens_seen": 105555136, + "step": 1527 + }, + { + "epoch": 95.4375, + "loss": 0.002447618404403329, + "loss_ce": 0.0002198353031417355, + "loss_xval": 0.002227783203125, + "num_input_tokens_seen": 105555136, + "step": 1527 + }, + { + "epoch": 95.5, + "grad_norm": 3.333597100843683, + "learning_rate": 5e-05, + "loss": 0.0026, + "num_input_tokens_seen": 105626752, + "step": 1528 + }, + { + "epoch": 95.5, + "loss": 0.0024574645794928074, + "loss_ce": 0.0002449401072226465, + "loss_xval": 0.0022125244140625, + "num_input_tokens_seen": 105626752, + "step": 1528 + }, + { + "epoch": 95.5625, + "grad_norm": 7.238995915388532, + "learning_rate": 5e-05, + "loss": 0.0095, + "num_input_tokens_seen": 105698304, + "step": 1529 + }, + { + "epoch": 95.5625, + "loss": 0.008781913667917252, + "loss_ce": 0.0002369913418078795, + "loss_xval": 0.008544921875, + "num_input_tokens_seen": 105698304, + "step": 1529 + }, + { + "epoch": 95.625, + "grad_norm": 7.5002230114016655, + "learning_rate": 5e-05, + "loss": 0.0105, + "num_input_tokens_seen": 105769856, + "step": 1530 + }, + { + "epoch": 95.625, + "loss": 0.01023135520517826, + "loss_ce": 0.00028262505657039583, + "loss_xval": 0.00994873046875, + "num_input_tokens_seen": 105769856, + "step": 1530 + }, + { + "epoch": 95.6875, + "grad_norm": 4.072986944781151, + "learning_rate": 5e-05, + "loss": 0.0035, + "num_input_tokens_seen": 105841472, + "step": 1531 + }, + { + "epoch": 95.6875, + "loss": 0.0039795455522835255, + "loss_ce": 0.0002869187155738473, + "loss_xval": 0.003692626953125, + "num_input_tokens_seen": 105841472, + "step": 1531 + }, + { + "epoch": 95.75, + "grad_norm": 1.6817132275661484, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 105900544, + "step": 1532 + }, + { + "epoch": 95.75, + "loss": 0.0010310329962521791, + "loss_ce": 0.00031386996852234006, + "loss_xval": 0.0007171630859375, + "num_input_tokens_seen": 105900544, + "step": 1532 + }, + { + "epoch": 95.8125, + "grad_norm": 6.534209892896805, + "learning_rate": 5e-05, + "loss": 0.0082, + "num_input_tokens_seen": 105959552, + "step": 1533 + }, + { + "epoch": 95.8125, + "loss": 0.008419894613325596, + "loss_ce": 0.0003022188611794263, + "loss_xval": 0.00811767578125, + "num_input_tokens_seen": 105959552, + "step": 1533 + }, + { + "epoch": 95.875, + "grad_norm": 7.322113681309952, + "learning_rate": 5e-05, + "loss": 0.0103, + "num_input_tokens_seen": 106018752, + "step": 1534 + }, + { + "epoch": 95.875, + "loss": 0.010474001057446003, + "loss_ce": 0.00028112975996918976, + "loss_xval": 0.01019287109375, + "num_input_tokens_seen": 106018752, + "step": 1534 + }, + { + "epoch": 95.9375, + "grad_norm": 3.3700594012413823, + "learning_rate": 5e-05, + "loss": 0.0028, + "num_input_tokens_seen": 106090432, + "step": 1535 + }, + { + "epoch": 95.9375, + "loss": 0.002817351371049881, + "loss_ce": 0.000269133597612381, + "loss_xval": 0.0025482177734375, + "num_input_tokens_seen": 106090432, + "step": 1535 + }, + { + "epoch": 
96.0, + "grad_norm": 2.0784202889921293, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_input_tokens_seen": 106149632, + "step": 1536 + }, + { + "epoch": 96.0, + "loss": 0.0012789510656148195, + "loss_ce": 0.00027950035291723907, + "loss_xval": 0.00099945068359375, + "num_input_tokens_seen": 106149632, + "step": 1536 + }, + { + "epoch": 96.0625, + "grad_norm": 5.73115808037762, + "learning_rate": 5e-05, + "loss": 0.0067, + "num_input_tokens_seen": 106221184, + "step": 1537 + }, + { + "epoch": 96.0625, + "loss": 0.007430717349052429, + "loss_ce": 0.00022856892610434443, + "loss_xval": 0.0072021484375, + "num_input_tokens_seen": 106221184, + "step": 1537 + }, + { + "epoch": 96.125, + "grad_norm": 5.778802439492285, + "learning_rate": 5e-05, + "loss": 0.0068, + "num_input_tokens_seen": 106292800, + "step": 1538 + }, + { + "epoch": 96.125, + "loss": 0.00656101806089282, + "loss_ce": 0.00018284417456015944, + "loss_xval": 0.006378173828125, + "num_input_tokens_seen": 106292800, + "step": 1538 + }, + { + "epoch": 96.1875, + "grad_norm": 2.748174730459637, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 106351936, + "step": 1539 + }, + { + "epoch": 96.1875, + "loss": 0.0019633835181593895, + "loss_ce": 0.00016284632147289813, + "loss_xval": 0.001800537109375, + "num_input_tokens_seen": 106351936, + "step": 1539 + }, + { + "epoch": 96.25, + "grad_norm": 1.075487575940013, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 106423552, + "step": 1540 + }, + { + "epoch": 96.25, + "loss": 0.0006161235505715013, + "loss_ce": 0.0001526378619018942, + "loss_xval": 0.0004634857177734375, + "num_input_tokens_seen": 106423552, + "step": 1540 + }, + { + "epoch": 96.3125, + "grad_norm": 4.298651762979521, + "learning_rate": 5e-05, + "loss": 0.0036, + "num_input_tokens_seen": 106495232, + "step": 1541 + }, + { + "epoch": 96.3125, + "loss": 0.0032136535737663507, + "loss_ce": 0.00013137809582985938, + "loss_xval": 0.003082275390625, + "num_input_tokens_seen": 106495232, + "step": 1541 + }, + { + "epoch": 96.375, + "grad_norm": 5.9612708922485815, + "learning_rate": 5e-05, + "loss": 0.0064, + "num_input_tokens_seen": 106566848, + "step": 1542 + }, + { + "epoch": 96.375, + "loss": 0.00659773126244545, + "loss_ce": 0.0001280044816667214, + "loss_xval": 0.0064697265625, + "num_input_tokens_seen": 106566848, + "step": 1542 + }, + { + "epoch": 96.4375, + "grad_norm": 5.412804621160427, + "learning_rate": 5e-05, + "loss": 0.0053, + "num_input_tokens_seen": 106625984, + "step": 1543 + }, + { + "epoch": 96.4375, + "loss": 0.005133685190230608, + "loss_ce": 9.828498150454834e-05, + "loss_xval": 0.005035400390625, + "num_input_tokens_seen": 106625984, + "step": 1543 + }, + { + "epoch": 96.5, + "grad_norm": 3.1861755295026826, + "learning_rate": 5e-05, + "loss": 0.0026, + "num_input_tokens_seen": 106685184, + "step": 1544 + }, + { + "epoch": 96.5, + "loss": 0.002085526008158922, + "loss_ce": 8.662457548780367e-05, + "loss_xval": 0.0019989013671875, + "num_input_tokens_seen": 106685184, + "step": 1544 + }, + { + "epoch": 96.5625, + "grad_norm": 0.28285281962099296, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 106756800, + "step": 1545 + }, + { + "epoch": 96.5625, + "loss": 0.0006361733539961278, + "loss_ce": 9.067163045983762e-05, + "loss_xval": 0.000545501708984375, + "num_input_tokens_seen": 106756800, + "step": 1545 + }, + { + "epoch": 96.625, + "grad_norm": 2.8705565420322605, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_input_tokens_seen": 
106828416, + "step": 1546 + }, + { + "epoch": 96.625, + "loss": 0.001644703559577465, + "loss_ce": 8.830706792650744e-05, + "loss_xval": 0.001556396484375, + "num_input_tokens_seen": 106828416, + "step": 1546 + }, + { + "epoch": 96.6875, + "grad_norm": 4.6414381546442725, + "learning_rate": 5e-05, + "loss": 0.0042, + "num_input_tokens_seen": 106900032, + "step": 1547 + }, + { + "epoch": 96.6875, + "loss": 0.004356001503765583, + "loss_ce": 8.354033343493938e-05, + "loss_xval": 0.0042724609375, + "num_input_tokens_seen": 106900032, + "step": 1547 + }, + { + "epoch": 96.75, + "grad_norm": 4.447232076049476, + "learning_rate": 5e-05, + "loss": 0.0038, + "num_input_tokens_seen": 106959168, + "step": 1548 + }, + { + "epoch": 96.75, + "loss": 0.003473155666142702, + "loss_ce": 8.57045961311087e-05, + "loss_xval": 0.003387451171875, + "num_input_tokens_seen": 106959168, + "step": 1548 + }, + { + "epoch": 96.8125, + "grad_norm": 2.9373942860318802, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 107030784, + "step": 1549 + }, + { + "epoch": 96.8125, + "loss": 0.001669206889346242, + "loss_ce": 8.229284139815718e-05, + "loss_xval": 0.0015869140625, + "num_input_tokens_seen": 107030784, + "step": 1549 + }, + { + "epoch": 96.875, + "grad_norm": 0.9303865075128541, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 107102464, + "step": 1550 + }, + { + "epoch": 96.875, + "loss": 0.00041434323065914214, + "loss_ce": 7.864987856009975e-05, + "loss_xval": 0.000335693359375, + "num_input_tokens_seen": 107102464, + "step": 1550 + }, + { + "epoch": 96.9375, + "grad_norm": 1.138452302146148, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 107174080, + "step": 1551 + }, + { + "epoch": 96.9375, + "loss": 0.0004917146870866418, + "loss_ce": 8.544942102162167e-05, + "loss_xval": 0.0004062652587890625, + "num_input_tokens_seen": 107174080, + "step": 1551 + }, + { + "epoch": 97.0, + "grad_norm": 2.6210738731498995, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_input_tokens_seen": 107245632, + "step": 1552 + }, + { + "epoch": 97.0, + "loss": 0.001443982939235866, + "loss_ce": 8.595068356953561e-05, + "loss_xval": 0.0013580322265625, + "num_input_tokens_seen": 107245632, + "step": 1552 + }, + { + "epoch": 97.0625, + "grad_norm": 3.311687916584373, + "learning_rate": 5e-05, + "loss": 0.0022, + "num_input_tokens_seen": 107317248, + "step": 1553 + }, + { + "epoch": 97.0625, + "loss": 0.0018252766458317637, + "loss_ce": 8.577468543080613e-05, + "loss_xval": 0.001739501953125, + "num_input_tokens_seen": 107317248, + "step": 1553 + }, + { + "epoch": 97.125, + "grad_norm": 3.4443077065766827, + "learning_rate": 5e-05, + "loss": 0.0023, + "num_input_tokens_seen": 107388800, + "step": 1554 + }, + { + "epoch": 97.125, + "loss": 0.002176129724830389, + "loss_ce": 8.567561599193141e-05, + "loss_xval": 0.0020904541015625, + "num_input_tokens_seen": 107388800, + "step": 1554 + }, + { + "epoch": 97.1875, + "grad_norm": 3.232124177935162, + "learning_rate": 5e-05, + "loss": 0.0022, + "num_input_tokens_seen": 107460480, + "step": 1555 + }, + { + "epoch": 97.1875, + "loss": 0.002326987450942397, + "loss_ce": 8.394538599532098e-05, + "loss_xval": 0.0022430419921875, + "num_input_tokens_seen": 107460480, + "step": 1555 + }, + { + "epoch": 97.25, + "grad_norm": 2.5180593657209966, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_input_tokens_seen": 107532160, + "step": 1556 + }, + { + "epoch": 97.25, + "loss": 0.0013235878432169557, + "loss_ce": 
8.762597281020135e-05, + "loss_xval": 0.0012359619140625, + "num_input_tokens_seen": 107532160, + "step": 1556 + }, + { + "epoch": 97.3125, + "grad_norm": 0.9263637828252639, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 107603776, + "step": 1557 + }, + { + "epoch": 97.3125, + "loss": 0.0005748713156208396, + "loss_ce": 8.659008744871244e-05, + "loss_xval": 0.00048828125, + "num_input_tokens_seen": 107603776, + "step": 1557 + }, + { + "epoch": 97.375, + "grad_norm": 0.9979947084252878, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 107662784, + "step": 1558 + }, + { + "epoch": 97.375, + "loss": 0.0005600899457931519, + "loss_ce": 8.706748485565186e-05, + "loss_xval": 0.0004730224609375, + "num_input_tokens_seen": 107662784, + "step": 1558 + }, + { + "epoch": 97.4375, + "grad_norm": 2.174623829666188, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_input_tokens_seen": 107734528, + "step": 1559 + }, + { + "epoch": 97.4375, + "loss": 0.0013285287423059344, + "loss_ce": 8.493742643622681e-05, + "loss_xval": 0.00124359130859375, + "num_input_tokens_seen": 107734528, + "step": 1559 + }, + { + "epoch": 97.5, + "grad_norm": 2.704914360205697, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_input_tokens_seen": 107806144, + "step": 1560 + }, + { + "epoch": 97.5, + "loss": 0.0014250859385356307, + "loss_ce": 8.231253013946116e-05, + "loss_xval": 0.0013427734375, + "num_input_tokens_seen": 107806144, + "step": 1560 + }, + { + "epoch": 97.5625, + "grad_norm": 3.0964133170468706, + "learning_rate": 5e-05, + "loss": 0.0019, + "num_input_tokens_seen": 107877824, + "step": 1561 + }, + { + "epoch": 97.5625, + "loss": 0.0015938394935801625, + "loss_ce": 8.321935456478968e-05, + "loss_xval": 0.0015106201171875, + "num_input_tokens_seen": 107877824, + "step": 1561 + }, + { + "epoch": 97.625, + "grad_norm": 3.339458327215024, + "learning_rate": 5e-05, + "loss": 0.0023, + "num_input_tokens_seen": 107949376, + "step": 1562 + }, + { + "epoch": 97.625, + "loss": 0.002547947457060218, + "loss_ce": 7.602353434776887e-05, + "loss_xval": 0.002471923828125, + "num_input_tokens_seen": 107949376, + "step": 1562 + }, + { + "epoch": 97.6875, + "grad_norm": 3.1582407893570537, + "learning_rate": 5e-05, + "loss": 0.0022, + "num_input_tokens_seen": 108020992, + "step": 1563 + }, + { + "epoch": 97.6875, + "loss": 0.002089640824124217, + "loss_ce": 7.548071152996272e-05, + "loss_xval": 0.00201416015625, + "num_input_tokens_seen": 108020992, + "step": 1563 + }, + { + "epoch": 97.75, + "grad_norm": 2.575145553500417, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_input_tokens_seen": 108092608, + "step": 1564 + }, + { + "epoch": 97.75, + "loss": 0.0014476650394499302, + "loss_ce": 7.437399472109973e-05, + "loss_xval": 0.001373291015625, + "num_input_tokens_seen": 108092608, + "step": 1564 + }, + { + "epoch": 97.8125, + "grad_norm": 1.9222053814262445, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 108164288, + "step": 1565 + }, + { + "epoch": 97.8125, + "loss": 0.0007965641561895609, + "loss_ce": 7.558635115856305e-05, + "loss_xval": 0.000720977783203125, + "num_input_tokens_seen": 108164288, + "step": 1565 + }, + { + "epoch": 97.875, + "grad_norm": 1.6299140847499538, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 108235840, + "step": 1566 + }, + { + "epoch": 97.875, + "loss": 0.0008366935653612018, + "loss_ce": 7.375411223620176e-05, + "loss_xval": 0.000762939453125, + "num_input_tokens_seen": 108235840, + "step": 1566 + }, 
+ { + "epoch": 97.9375, + "grad_norm": 2.2908526062343384, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 108307392, + "step": 1567 + }, + { + "epoch": 97.9375, + "loss": 0.0012096071150153875, + "loss_ce": 7.282734441105276e-05, + "loss_xval": 0.00113677978515625, + "num_input_tokens_seen": 108307392, + "step": 1567 + }, + { + "epoch": 98.0, + "grad_norm": 4.153602107714487, + "learning_rate": 5e-05, + "loss": 0.0032, + "num_input_tokens_seen": 108379008, + "step": 1568 + }, + { + "epoch": 98.0, + "loss": 0.0032553914934396744, + "loss_ce": 8.156333933584392e-05, + "loss_xval": 0.003173828125, + "num_input_tokens_seen": 108379008, + "step": 1568 + }, + { + "epoch": 98.0625, + "grad_norm": 7.1710467414549335, + "learning_rate": 5e-05, + "loss": 0.0092, + "num_input_tokens_seen": 108450688, + "step": 1569 + }, + { + "epoch": 98.0625, + "loss": 0.009295488707721233, + "loss_ce": 7.918039773358032e-05, + "loss_xval": 0.00921630859375, + "num_input_tokens_seen": 108450688, + "step": 1569 + }, + { + "epoch": 98.125, + "grad_norm": 11.662438031029577, + "learning_rate": 5e-05, + "loss": 0.0241, + "num_input_tokens_seen": 108522304, + "step": 1570 + }, + { + "epoch": 98.125, + "loss": 0.024007223546504974, + "loss_ce": 8.144209277816117e-05, + "loss_xval": 0.02392578125, + "num_input_tokens_seen": 108522304, + "step": 1570 + }, + { + "epoch": 98.1875, + "grad_norm": 17.407733059595365, + "learning_rate": 5e-05, + "loss": 0.0543, + "num_input_tokens_seen": 108593856, + "step": 1571 + }, + { + "epoch": 98.1875, + "loss": 0.055038176476955414, + "loss_ce": 0.00010653500794433057, + "loss_xval": 0.054931640625, + "num_input_tokens_seen": 108593856, + "step": 1571 + }, + { + "epoch": 98.25, + "grad_norm": 20.927165200757347, + "learning_rate": 5e-05, + "loss": 0.0795, + "num_input_tokens_seen": 108665536, + "step": 1572 + }, + { + "epoch": 98.25, + "loss": 0.08215904235839844, + "loss_ce": 0.00012779254757333547, + "loss_xval": 0.08203125, + "num_input_tokens_seen": 108665536, + "step": 1572 + }, + { + "epoch": 98.3125, + "grad_norm": 16.725522171740963, + "learning_rate": 5e-05, + "loss": 0.0514, + "num_input_tokens_seen": 108737152, + "step": 1573 + }, + { + "epoch": 98.3125, + "loss": 0.048741865903139114, + "loss_ce": 0.0001578811788931489, + "loss_xval": 0.048583984375, + "num_input_tokens_seen": 108737152, + "step": 1573 + }, + { + "epoch": 98.375, + "grad_norm": 5.127003707099649, + "learning_rate": 5e-05, + "loss": 0.0054, + "num_input_tokens_seen": 108808768, + "step": 1574 + }, + { + "epoch": 98.375, + "loss": 0.00516909547150135, + "loss_ce": 0.00019473010615911335, + "loss_xval": 0.004974365234375, + "num_input_tokens_seen": 108808768, + "step": 1574 + }, + { + "epoch": 98.4375, + "grad_norm": 7.22636403212046, + "learning_rate": 5e-05, + "loss": 0.0106, + "num_input_tokens_seen": 108880576, + "step": 1575 + }, + { + "epoch": 98.4375, + "loss": 0.010036150924861431, + "loss_ce": 0.000270525662926957, + "loss_xval": 0.009765625, + "num_input_tokens_seen": 108880576, + "step": 1575 + }, + { + "epoch": 98.5, + "grad_norm": 15.10241100748922, + "learning_rate": 5e-05, + "loss": 0.0419, + "num_input_tokens_seen": 108952192, + "step": 1576 + }, + { + "epoch": 98.5, + "loss": 0.04190199449658394, + "loss_ce": 0.00039808667497709394, + "loss_xval": 0.04150390625, + "num_input_tokens_seen": 108952192, + "step": 1576 + }, + { + "epoch": 98.5625, + "grad_norm": 14.14376880473154, + "learning_rate": 5e-05, + "loss": 0.0382, + "num_input_tokens_seen": 109023872, + "step": 1577 + 
}, + { + "epoch": 98.5625, + "loss": 0.03786517307162285, + "loss_ce": 0.0005116576212458313, + "loss_xval": 0.037353515625, + "num_input_tokens_seen": 109023872, + "step": 1577 + }, + { + "epoch": 98.625, + "grad_norm": 5.188315109714636, + "learning_rate": 5e-05, + "loss": 0.0064, + "num_input_tokens_seen": 109083072, + "step": 1578 + }, + { + "epoch": 98.625, + "loss": 0.005667213816195726, + "loss_ce": 0.0007233662181533873, + "loss_xval": 0.00494384765625, + "num_input_tokens_seen": 109083072, + "step": 1578 + }, + { + "epoch": 98.6875, + "grad_norm": 6.888484289652325, + "learning_rate": 5e-05, + "loss": 0.0114, + "num_input_tokens_seen": 109154816, + "step": 1579 + }, + { + "epoch": 98.6875, + "loss": 0.011372015811502934, + "loss_ce": 0.0008129340712912381, + "loss_xval": 0.01055908203125, + "num_input_tokens_seen": 109154816, + "step": 1579 + }, + { + "epoch": 98.75, + "grad_norm": 15.72317232842836, + "learning_rate": 5e-05, + "loss": 0.0508, + "num_input_tokens_seen": 109226368, + "step": 1580 + }, + { + "epoch": 98.75, + "loss": 0.049659743905067444, + "loss_ce": 0.0008316197781823575, + "loss_xval": 0.048828125, + "num_input_tokens_seen": 109226368, + "step": 1580 + }, + { + "epoch": 98.8125, + "grad_norm": 13.773559500694164, + "learning_rate": 5e-05, + "loss": 0.0367, + "num_input_tokens_seen": 109298048, + "step": 1581 + }, + { + "epoch": 98.8125, + "loss": 0.036477576941251755, + "loss_ce": 0.00034476383007131517, + "loss_xval": 0.0361328125, + "num_input_tokens_seen": 109298048, + "step": 1581 + }, + { + "epoch": 98.875, + "grad_norm": 3.4372123434645796, + "learning_rate": 5e-05, + "loss": 0.0043, + "num_input_tokens_seen": 109369728, + "step": 1582 + }, + { + "epoch": 98.875, + "loss": 0.003365415846928954, + "loss_ce": 0.00025262293638661504, + "loss_xval": 0.00311279296875, + "num_input_tokens_seen": 109369728, + "step": 1582 + }, + { + "epoch": 98.9375, + "grad_norm": 7.8152476972547005, + "learning_rate": 5e-05, + "loss": 0.0114, + "num_input_tokens_seen": 109428800, + "step": 1583 + }, + { + "epoch": 98.9375, + "loss": 0.01139904372394085, + "loss_ce": 0.00022961030481383204, + "loss_xval": 0.01116943359375, + "num_input_tokens_seen": 109428800, + "step": 1583 + }, + { + "epoch": 99.0, + "grad_norm": 18.65440543806313, + "learning_rate": 5e-05, + "loss": 0.0614, + "num_input_tokens_seen": 109475392, + "step": 1584 + }, + { + "epoch": 99.0, + "loss": 0.059953078627586365, + "loss_ce": 0.0003827674372587353, + "loss_xval": 0.0595703125, + "num_input_tokens_seen": 109475392, + "step": 1584 + }, + { + "epoch": 99.0625, + "grad_norm": 23.83843810097326, + "learning_rate": 5e-05, + "loss": 0.0944, + "num_input_tokens_seen": 109546944, + "step": 1585 + }, + { + "epoch": 99.0625, + "loss": 0.09535447508096695, + "loss_ce": 0.0001396313455188647, + "loss_xval": 0.09521484375, + "num_input_tokens_seen": 109546944, + "step": 1585 + }, + { + "epoch": 99.125, + "grad_norm": 17.427719670772035, + "learning_rate": 5e-05, + "loss": 0.0516, + "num_input_tokens_seen": 109618688, + "step": 1586 + }, + { + "epoch": 99.125, + "loss": 0.051441460847854614, + "loss_ce": 0.00017192953964695334, + "loss_xval": 0.05126953125, + "num_input_tokens_seen": 109618688, + "step": 1586 + }, + { + "epoch": 99.1875, + "grad_norm": 6.166135907550393, + "learning_rate": 5e-05, + "loss": 0.0097, + "num_input_tokens_seen": 109690496, + "step": 1587 + }, + { + "epoch": 99.1875, + "loss": 0.010387794114649296, + "loss_ce": 0.0007442396599799395, + "loss_xval": 0.0096435546875, + "num_input_tokens_seen": 
109690496, + "step": 1587 + }, + { + "epoch": 99.25, + "grad_norm": 7.035162997340315, + "learning_rate": 5e-05, + "loss": 0.0145, + "num_input_tokens_seen": 109762176, + "step": 1588 + }, + { + "epoch": 99.25, + "loss": 0.015704365447163582, + "loss_ce": 0.0018493849784135818, + "loss_xval": 0.01385498046875, + "num_input_tokens_seen": 109762176, + "step": 1588 + }, + { + "epoch": 99.3125, + "grad_norm": 22.907493652120387, + "learning_rate": 5e-05, + "loss": 0.1104, + "num_input_tokens_seen": 109821248, + "step": 1589 + }, + { + "epoch": 99.3125, + "loss": 0.1109476163983345, + "loss_ce": 0.0025491821579635143, + "loss_xval": 0.1083984375, + "num_input_tokens_seen": 109821248, + "step": 1589 + }, + { + "epoch": 99.375, + "grad_norm": 27.005672270421268, + "learning_rate": 5e-05, + "loss": 0.148, + "num_input_tokens_seen": 109893056, + "step": 1590 + }, + { + "epoch": 99.375, + "loss": 0.14455649256706238, + "loss_ce": 0.0010018125176429749, + "loss_xval": 0.1435546875, + "num_input_tokens_seen": 109893056, + "step": 1590 + }, + { + "epoch": 99.4375, + "grad_norm": 18.030635499664317, + "learning_rate": 5e-05, + "loss": 0.059, + "num_input_tokens_seen": 109952064, + "step": 1591 + }, + { + "epoch": 99.4375, + "loss": 0.05704944580793381, + "loss_ce": 0.00016467941168230027, + "loss_xval": 0.056884765625, + "num_input_tokens_seen": 109952064, + "step": 1591 + }, + { + "epoch": 99.5, + "grad_norm": 6.114988855419912, + "learning_rate": 5e-05, + "loss": 0.0084, + "num_input_tokens_seen": 110023680, + "step": 1592 + }, + { + "epoch": 99.5, + "loss": 0.009071971289813519, + "loss_ce": 0.00016083818627521396, + "loss_xval": 0.0089111328125, + "num_input_tokens_seen": 110023680, + "step": 1592 + }, + { + "epoch": 99.5625, + "grad_norm": 25.734581206903535, + "learning_rate": 5e-05, + "loss": 0.1091, + "num_input_tokens_seen": 110095424, + "step": 1593 + }, + { + "epoch": 99.5625, + "loss": 0.11003206670284271, + "loss_ce": 0.0001687873445916921, + "loss_xval": 0.10986328125, + "num_input_tokens_seen": 110095424, + "step": 1593 + }, + { + "epoch": 99.625, + "grad_norm": 16.73317359001743, + "learning_rate": 5e-05, + "loss": 0.0489, + "num_input_tokens_seen": 110167168, + "step": 1594 + }, + { + "epoch": 99.625, + "loss": 0.04463024437427521, + "loss_ce": 0.00019665084255393595, + "loss_xval": 0.04443359375, + "num_input_tokens_seen": 110167168, + "step": 1594 + }, + { + "epoch": 99.6875, + "grad_norm": 5.752505101398109, + "learning_rate": 5e-05, + "loss": 0.0102, + "num_input_tokens_seen": 110238848, + "step": 1595 + }, + { + "epoch": 99.6875, + "loss": 0.009299618192017078, + "loss_ce": 0.00026641503791324794, + "loss_xval": 0.009033203125, + "num_input_tokens_seen": 110238848, + "step": 1595 + }, + { + "epoch": 99.75, + "grad_norm": 16.170946032727255, + "learning_rate": 5e-05, + "loss": 0.0483, + "num_input_tokens_seen": 110310464, + "step": 1596 + }, + { + "epoch": 99.75, + "loss": 0.04650229960680008, + "loss_ce": 0.00035972270416095853, + "loss_xval": 0.046142578125, + "num_input_tokens_seen": 110310464, + "step": 1596 + }, + { + "epoch": 99.8125, + "grad_norm": 5.235616533671838, + "learning_rate": 5e-05, + "loss": 0.0061, + "num_input_tokens_seen": 110382016, + "step": 1597 + }, + { + "epoch": 99.8125, + "loss": 0.006151093170046806, + "loss_ce": 0.0006579289911314845, + "loss_xval": 0.0054931640625, + "num_input_tokens_seen": 110382016, + "step": 1597 + }, + { + "epoch": 99.875, + "grad_norm": 12.563475707020881, + "learning_rate": 5e-05, + "loss": 0.0318, + "num_input_tokens_seen": 
110441216, + "step": 1598 + }, + { + "epoch": 99.875, + "loss": 0.03234691545367241, + "loss_ce": 0.0015851972857490182, + "loss_xval": 0.03076171875, + "num_input_tokens_seen": 110441216, + "step": 1598 + }, + { + "epoch": 99.9375, + "grad_norm": 14.430502102016037, + "learning_rate": 5e-05, + "loss": 0.0399, + "num_input_tokens_seen": 110512832, + "step": 1599 + }, + { + "epoch": 99.9375, + "loss": 0.04030544310808182, + "loss_ce": 0.0007546603446826339, + "loss_xval": 0.03955078125, + "num_input_tokens_seen": 110512832, + "step": 1599 + }, + { + "epoch": 100.0, + "grad_norm": 2.025038232206116, + "learning_rate": 5e-05, + "loss": 0.002, + "num_input_tokens_seen": 110571904, + "step": 1600 + }, + { + "epoch": 100.0, + "loss": 0.0020840922370553017, + "loss_ce": 0.0009473123354837298, + "loss_xval": 0.00113677978515625, + "num_input_tokens_seen": 110571904, + "step": 1600 + }, + { + "epoch": 100.0625, + "grad_norm": 12.756881989338398, + "learning_rate": 5e-05, + "loss": 0.0334, + "num_input_tokens_seen": 110643584, + "step": 1601 + }, + { + "epoch": 100.0625, + "loss": 0.0334443598985672, + "loss_ce": 0.0009736571228131652, + "loss_xval": 0.032470703125, + "num_input_tokens_seen": 110643584, + "step": 1601 + }, + { + "epoch": 100.125, + "grad_norm": 1.3560502459791763, + "learning_rate": 5e-05, + "loss": 0.0016, + "num_input_tokens_seen": 110702720, + "step": 1602 + }, + { + "epoch": 100.125, + "loss": 0.0013581677339971066, + "loss_ce": 0.0006143018254078925, + "loss_xval": 0.000743865966796875, + "num_input_tokens_seen": 110702720, + "step": 1602 + }, + { + "epoch": 100.1875, + "grad_norm": 11.49146267669002, + "learning_rate": 5e-05, + "loss": 0.0282, + "num_input_tokens_seen": 110774336, + "step": 1603 + }, + { + "epoch": 100.1875, + "loss": 0.027602650225162506, + "loss_ce": 0.0005030405009165406, + "loss_xval": 0.027099609375, + "num_input_tokens_seen": 110774336, + "step": 1603 + }, + { + "epoch": 100.25, + "grad_norm": 2.8865605520789948, + "learning_rate": 5e-05, + "loss": 0.0032, + "num_input_tokens_seen": 110845888, + "step": 1604 + }, + { + "epoch": 100.25, + "loss": 0.0031700225081294775, + "loss_ce": 0.0004844755749218166, + "loss_xval": 0.002685546875, + "num_input_tokens_seen": 110845888, + "step": 1604 + }, + { + "epoch": 100.3125, + "grad_norm": 9.938249020872993, + "learning_rate": 5e-05, + "loss": 0.0205, + "num_input_tokens_seen": 110917632, + "step": 1605 + }, + { + "epoch": 100.3125, + "loss": 0.020480969920754433, + "loss_ce": 0.0004614384670276195, + "loss_xval": 0.02001953125, + "num_input_tokens_seen": 110917632, + "step": 1605 + }, + { + "epoch": 100.375, + "grad_norm": 535.267256561788, + "learning_rate": 5e-05, + "loss": 0.5603, + "num_input_tokens_seen": 110989312, + "step": 1606 + }, + { + "epoch": 100.375, + "loss": 0.0017519124085083604, + "loss_ce": 0.0004015095764771104, + "loss_xval": 0.00135040283203125, + "num_input_tokens_seen": 110989312, + "step": 1606 + }, + { + "epoch": 100.4375, + "grad_norm": 26.803573221116935, + "learning_rate": 5e-05, + "loss": 0.1485, + "num_input_tokens_seen": 111060928, + "step": 1607 + }, + { + "epoch": 100.4375, + "loss": 0.13052555918693542, + "loss_ce": 0.12610051035881042, + "loss_xval": 0.004425048828125, + "num_input_tokens_seen": 111060928, + "step": 1607 + }, + { + "epoch": 100.5, + "grad_norm": 11.96771523717426, + "learning_rate": 5e-05, + "loss": 0.7261, + "num_input_tokens_seen": 111132608, + "step": 1608 + }, + { + "epoch": 100.5, + "loss": 0.728778064250946, + "loss_ce": 0.7280418276786804, + "loss_xval": 
0.000736236572265625, + "num_input_tokens_seen": 111132608, + "step": 1608 + }, + { + "epoch": 100.5625, + "grad_norm": 11.58786077220693, + "learning_rate": 5e-05, + "loss": 0.7126, + "num_input_tokens_seen": 111204288, + "step": 1609 + }, + { + "epoch": 100.5625, + "loss": 0.7263501286506653, + "loss_ce": 0.7207348942756653, + "loss_xval": 0.005615234375, + "num_input_tokens_seen": 111204288, + "step": 1609 + }, + { + "epoch": 100.625, + "grad_norm": 7.627950979392913, + "learning_rate": 5e-05, + "loss": 0.5433, + "num_input_tokens_seen": 111275968, + "step": 1610 + }, + { + "epoch": 100.625, + "loss": 0.5421339273452759, + "loss_ce": 0.5413748025894165, + "loss_xval": 0.000759124755859375, + "num_input_tokens_seen": 111275968, + "step": 1610 + }, + { + "epoch": 100.6875, + "grad_norm": 8.926215867422837, + "learning_rate": 5e-05, + "loss": 0.5094, + "num_input_tokens_seen": 111335168, + "step": 1611 + }, + { + "epoch": 100.6875, + "loss": 0.508723258972168, + "loss_ce": 0.5062055587768555, + "loss_xval": 0.0025177001953125, + "num_input_tokens_seen": 111335168, + "step": 1611 + }, + { + "epoch": 100.75, + "grad_norm": 13.96864154608611, + "learning_rate": 5e-05, + "loss": 0.4793, + "num_input_tokens_seen": 111406848, + "step": 1612 + }, + { + "epoch": 100.75, + "loss": 0.4821075201034546, + "loss_ce": 0.4637969732284546, + "loss_xval": 0.018310546875, + "num_input_tokens_seen": 111406848, + "step": 1612 + }, + { + "epoch": 100.8125, + "grad_norm": 32.770614154641045, + "learning_rate": 5e-05, + "loss": 0.4225, + "num_input_tokens_seen": 111478528, + "step": 1613 + }, + { + "epoch": 100.8125, + "loss": 0.42495107650756836, + "loss_ce": 0.29604482650756836, + "loss_xval": 0.12890625, + "num_input_tokens_seen": 111478528, + "step": 1613 + }, + { + "epoch": 100.875, + "grad_norm": 96.20222280752, + "learning_rate": 5e-05, + "loss": 1.2346, + "num_input_tokens_seen": 111537600, + "step": 1614 + }, + { + "epoch": 100.875, + "loss": 1.218656063079834, + "loss_ce": 0.1795935183763504, + "loss_xval": 1.0390625, + "num_input_tokens_seen": 111537600, + "step": 1614 + }, + { + "epoch": 100.9375, + "grad_norm": 98.88955953502278, + "learning_rate": 5e-05, + "loss": 1.2119, + "num_input_tokens_seen": 111609280, + "step": 1615 + }, + { + "epoch": 100.9375, + "loss": 1.1993708610534668, + "loss_ce": 0.1681208610534668, + "loss_xval": 1.03125, + "num_input_tokens_seen": 111609280, + "step": 1615 + }, + { + "epoch": 101.0, + "grad_norm": 22.017518689198113, + "learning_rate": 5e-05, + "loss": 0.2187, + "num_input_tokens_seen": 111680896, + "step": 1616 + }, + { + "epoch": 101.0, + "loss": 0.2185109406709671, + "loss_ce": 0.1611378937959671, + "loss_xval": 0.057373046875, + "num_input_tokens_seen": 111680896, + "step": 1616 + }, + { + "epoch": 101.0625, + "grad_norm": 70.8242896529797, + "learning_rate": 5e-05, + "loss": 0.7584, + "num_input_tokens_seen": 111752576, + "step": 1617 + }, + { + "epoch": 101.0625, + "loss": 0.74258953332901, + "loss_ce": 0.1293082982301712, + "loss_xval": 0.61328125, + "num_input_tokens_seen": 111752576, + "step": 1617 + }, + { + "epoch": 101.125, + "grad_norm": 27.584765102526394, + "learning_rate": 5e-05, + "loss": 0.1862, + "num_input_tokens_seen": 111824128, + "step": 1618 + }, + { + "epoch": 101.125, + "loss": 0.18330301344394684, + "loss_ce": 0.07636941969394684, + "loss_xval": 0.10693359375, + "num_input_tokens_seen": 111824128, + "step": 1618 + }, + { + "epoch": 101.1875, + "grad_norm": 45.35551615786325, + "learning_rate": 5e-05, + "loss": 0.3777, + 
"num_input_tokens_seen": 111895872, + "step": 1619 + }, + { + "epoch": 101.1875, + "loss": 0.37971189618110657, + "loss_ce": 0.06135252118110657, + "loss_xval": 0.318359375, + "num_input_tokens_seen": 111895872, + "step": 1619 + }, + { + "epoch": 101.25, + "grad_norm": 43.72179545283442, + "learning_rate": 5e-05, + "loss": 0.3603, + "num_input_tokens_seen": 111967616, + "step": 1620 + }, + { + "epoch": 101.25, + "loss": 0.36618638038635254, + "loss_ce": 0.06345201283693314, + "loss_xval": 0.302734375, + "num_input_tokens_seen": 111967616, + "step": 1620 + }, + { + "epoch": 101.3125, + "grad_norm": 10.257659607571505, + "learning_rate": 5e-05, + "loss": 0.066, + "num_input_tokens_seen": 112039296, + "step": 1621 + }, + { + "epoch": 101.3125, + "loss": 0.06621536612510681, + "loss_ce": 0.05034622177481651, + "loss_xval": 0.015869140625, + "num_input_tokens_seen": 112039296, + "step": 1621 + }, + { + "epoch": 101.375, + "grad_norm": 47.05730933565881, + "learning_rate": 5e-05, + "loss": 0.4014, + "num_input_tokens_seen": 112098368, + "step": 1622 + }, + { + "epoch": 101.375, + "loss": 0.3984343707561493, + "loss_ce": 0.046871863305568695, + "loss_xval": 0.3515625, + "num_input_tokens_seen": 112098368, + "step": 1622 + }, + { + "epoch": 101.4375, + "grad_norm": 21.400823346508936, + "learning_rate": 5e-05, + "loss": 0.1173, + "num_input_tokens_seen": 112157568, + "step": 1623 + }, + { + "epoch": 101.4375, + "loss": 0.11608298122882843, + "loss_ce": 0.04088767245411873, + "loss_xval": 0.0751953125, + "num_input_tokens_seen": 112157568, + "step": 1623 + }, + { + "epoch": 101.5, + "grad_norm": 23.821125025940553, + "learning_rate": 5e-05, + "loss": 0.1299, + "num_input_tokens_seen": 112229248, + "step": 1624 + }, + { + "epoch": 101.5, + "loss": 0.13002723455429077, + "loss_ce": 0.03383582830429077, + "loss_xval": 0.09619140625, + "num_input_tokens_seen": 112229248, + "step": 1624 + }, + { + "epoch": 101.5625, + "grad_norm": 36.06573205221317, + "learning_rate": 5e-05, + "loss": 0.2493, + "num_input_tokens_seen": 112300864, + "step": 1625 + }, + { + "epoch": 101.5625, + "loss": 0.24369105696678162, + "loss_ce": 0.022011367604136467, + "loss_xval": 0.2216796875, + "num_input_tokens_seen": 112300864, + "step": 1625 + }, + { + "epoch": 101.625, + "grad_norm": 8.543920957277138, + "learning_rate": 5e-05, + "loss": 0.0293, + "num_input_tokens_seen": 112372544, + "step": 1626 + }, + { + "epoch": 101.625, + "loss": 0.027967257425189018, + "loss_ce": 0.014356417581439018, + "loss_xval": 0.01361083984375, + "num_input_tokens_seen": 112372544, + "step": 1626 + }, + { + "epoch": 101.6875, + "grad_norm": 24.866755936364093, + "learning_rate": 5e-05, + "loss": 0.1224, + "num_input_tokens_seen": 112444224, + "step": 1627 + }, + { + "epoch": 101.6875, + "loss": 0.12345461547374725, + "loss_ce": 0.013591330498456955, + "loss_xval": 0.10986328125, + "num_input_tokens_seen": 112444224, + "step": 1627 + }, + { + "epoch": 101.75, + "grad_norm": 29.743958865983373, + "learning_rate": 5e-05, + "loss": 0.163, + "num_input_tokens_seen": 112515840, + "step": 1628 + }, + { + "epoch": 101.75, + "loss": 0.1601444035768509, + "loss_ce": 0.006824086420238018, + "loss_xval": 0.1533203125, + "num_input_tokens_seen": 112515840, + "step": 1628 + }, + { + "epoch": 101.8125, + "grad_norm": 3.2598818341929277, + "learning_rate": 5e-05, + "loss": 0.0088, + "num_input_tokens_seen": 112587392, + "step": 1629 + }, + { + "epoch": 101.8125, + "loss": 0.009183799847960472, + "loss_ce": 0.005124961957335472, + "loss_xval": 0.004058837890625, 
+ "num_input_tokens_seen": 112587392, + "step": 1629 + }, + { + "epoch": 101.875, + "grad_norm": 23.646893975459562, + "learning_rate": 5e-05, + "loss": 0.1019, + "num_input_tokens_seen": 112659136, + "step": 1630 + }, + { + "epoch": 101.875, + "loss": 0.10114867240190506, + "loss_ce": 0.003980706911534071, + "loss_xval": 0.09716796875, + "num_input_tokens_seen": 112659136, + "step": 1630 + }, + { + "epoch": 101.9375, + "grad_norm": 22.947677597928312, + "learning_rate": 5e-05, + "loss": 0.0949, + "num_input_tokens_seen": 112730752, + "step": 1631 + }, + { + "epoch": 101.9375, + "loss": 0.0934811681509018, + "loss_ce": 0.0026608556509017944, + "loss_xval": 0.0908203125, + "num_input_tokens_seen": 112730752, + "step": 1631 + }, + { + "epoch": 102.0, + "grad_norm": 0.9599640145062192, + "learning_rate": 5e-05, + "loss": 0.0033, + "num_input_tokens_seen": 112802368, + "step": 1632 + }, + { + "epoch": 102.0, + "loss": 0.0026651208754628897, + "loss_ce": 0.0018411462660878897, + "loss_xval": 0.000823974609375, + "num_input_tokens_seen": 112802368, + "step": 1632 + }, + { + "epoch": 102.0625, + "grad_norm": 23.756242845065035, + "learning_rate": 5e-05, + "loss": 0.0953, + "num_input_tokens_seen": 112874112, + "step": 1633 + }, + { + "epoch": 102.0625, + "loss": 0.0883241519331932, + "loss_ce": 0.0018983713816851377, + "loss_xval": 0.08642578125, + "num_input_tokens_seen": 112874112, + "step": 1633 + }, + { + "epoch": 102.125, + "grad_norm": 16.368851868634717, + "learning_rate": 5e-05, + "loss": 0.0489, + "num_input_tokens_seen": 112945664, + "step": 1634 + }, + { + "epoch": 102.125, + "loss": 0.04897579923272133, + "loss_ce": 0.001368378521874547, + "loss_xval": 0.047607421875, + "num_input_tokens_seen": 112945664, + "step": 1634 + }, + { + "epoch": 102.1875, + "grad_norm": 4.815410162153499, + "learning_rate": 5e-05, + "loss": 0.0061, + "num_input_tokens_seen": 113017280, + "step": 1635 + }, + { + "epoch": 102.1875, + "loss": 0.005683029070496559, + "loss_ce": 0.0011053922353312373, + "loss_xval": 0.00457763671875, + "num_input_tokens_seen": 113017280, + "step": 1635 + }, + { + "epoch": 102.25, + "grad_norm": 18.835360111676962, + "learning_rate": 5e-05, + "loss": 0.0649, + "num_input_tokens_seen": 113088960, + "step": 1636 + }, + { + "epoch": 102.25, + "loss": 0.06543883681297302, + "loss_ce": 0.0009857096010819077, + "loss_xval": 0.064453125, + "num_input_tokens_seen": 113088960, + "step": 1636 + }, + { + "epoch": 102.3125, + "grad_norm": 11.221442126844604, + "learning_rate": 5e-05, + "loss": 0.025, + "num_input_tokens_seen": 113160640, + "step": 1637 + }, + { + "epoch": 102.3125, + "loss": 0.026167068630456924, + "loss_ce": 0.0010205849539488554, + "loss_xval": 0.025146484375, + "num_input_tokens_seen": 113160640, + "step": 1637 + }, + { + "epoch": 102.375, + "grad_norm": 6.333657334181233, + "learning_rate": 5e-05, + "loss": 0.0099, + "num_input_tokens_seen": 113219712, + "step": 1638 + }, + { + "epoch": 102.375, + "loss": 0.01122201420366764, + "loss_ce": 0.0008460377575829625, + "loss_xval": 0.0103759765625, + "num_input_tokens_seen": 113219712, + "step": 1638 + }, + { + "epoch": 102.4375, + "grad_norm": 16.67463922925151, + "learning_rate": 5e-05, + "loss": 0.053, + "num_input_tokens_seen": 113291328, + "step": 1639 + }, + { + "epoch": 102.4375, + "loss": 0.05274589732289314, + "loss_ce": 0.0007439448963850737, + "loss_xval": 0.052001953125, + "num_input_tokens_seen": 113291328, + "step": 1639 + }, + { + "epoch": 102.5, + "grad_norm": 7.620879504670172, + "learning_rate": 5e-05, + 
"loss": 0.0124, + "num_input_tokens_seen": 113362944, + "step": 1640 + }, + { + "epoch": 102.5, + "loss": 0.012734380550682545, + "loss_ce": 0.0007714895764365792, + "loss_xval": 0.011962890625, + "num_input_tokens_seen": 113362944, + "step": 1640 + }, + { + "epoch": 102.5625, + "grad_norm": 8.532472679636887, + "learning_rate": 5e-05, + "loss": 0.0149, + "num_input_tokens_seen": 113434624, + "step": 1641 + }, + { + "epoch": 102.5625, + "loss": 0.016723984852433205, + "loss_ce": 0.0007327737403102219, + "loss_xval": 0.0159912109375, + "num_input_tokens_seen": 113434624, + "step": 1641 + }, + { + "epoch": 102.625, + "grad_norm": 13.787586527116975, + "learning_rate": 5e-05, + "loss": 0.0371, + "num_input_tokens_seen": 113506240, + "step": 1642 + }, + { + "epoch": 102.625, + "loss": 0.03597182780504227, + "loss_ce": 0.0008155779214575887, + "loss_xval": 0.03515625, + "num_input_tokens_seen": 113506240, + "step": 1642 + }, + { + "epoch": 102.6875, + "grad_norm": 4.768758139645163, + "learning_rate": 5e-05, + "loss": 0.0059, + "num_input_tokens_seen": 113577792, + "step": 1643 + }, + { + "epoch": 102.6875, + "loss": 0.006672807969152927, + "loss_ce": 0.0007829151581972837, + "loss_xval": 0.005889892578125, + "num_input_tokens_seen": 113577792, + "step": 1643 + }, + { + "epoch": 102.75, + "grad_norm": 8.60501362437254, + "learning_rate": 5e-05, + "loss": 0.0153, + "num_input_tokens_seen": 113649472, + "step": 1644 + }, + { + "epoch": 102.75, + "loss": 0.01445347536355257, + "loss_ce": 0.0007816006545908749, + "loss_xval": 0.013671875, + "num_input_tokens_seen": 113649472, + "step": 1644 + }, + { + "epoch": 102.8125, + "grad_norm": 11.367159098499652, + "learning_rate": 5e-05, + "loss": 0.0266, + "num_input_tokens_seen": 113721152, + "step": 1645 + }, + { + "epoch": 102.8125, + "loss": 0.026272470131516457, + "loss_ce": 0.0007597756339237094, + "loss_xval": 0.0255126953125, + "num_input_tokens_seen": 113721152, + "step": 1645 + }, + { + "epoch": 102.875, + "grad_norm": 3.3770240355971866, + "learning_rate": 5e-05, + "loss": 0.0035, + "num_input_tokens_seen": 113780160, + "step": 1646 + }, + { + "epoch": 102.875, + "loss": 0.0036508911289274693, + "loss_ce": 0.0007974976324476302, + "loss_xval": 0.0028533935546875, + "num_input_tokens_seen": 113780160, + "step": 1646 + }, + { + "epoch": 102.9375, + "grad_norm": 7.827881100394053, + "learning_rate": 5e-05, + "loss": 0.0133, + "num_input_tokens_seen": 113851776, + "step": 1647 + }, + { + "epoch": 102.9375, + "loss": 0.012910965830087662, + "loss_ce": 0.0007039343472570181, + "loss_xval": 0.01220703125, + "num_input_tokens_seen": 113851776, + "step": 1647 + }, + { + "epoch": 103.0, + "grad_norm": 10.520670672072665, + "learning_rate": 5e-05, + "loss": 0.0232, + "num_input_tokens_seen": 113910784, + "step": 1648 + }, + { + "epoch": 103.0, + "loss": 0.01958302967250347, + "loss_ce": 0.0007842010818421841, + "loss_xval": 0.018798828125, + "num_input_tokens_seen": 113910784, + "step": 1648 + }, + { + "epoch": 103.0625, + "grad_norm": 1.5553846635183586, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 113969920, + "step": 1649 + }, + { + "epoch": 103.0625, + "loss": 0.0020108516328036785, + "loss_ce": 0.0007672603824175894, + "loss_xval": 0.00124359130859375, + "num_input_tokens_seen": 113969920, + "step": 1649 + }, + { + "epoch": 103.125, + "grad_norm": 7.754582670099876, + "learning_rate": 5e-05, + "loss": 0.013, + "num_input_tokens_seen": 114041600, + "step": 1650 + }, + { + "epoch": 103.125, + "loss": 0.013059111312031746, + 
"loss_ce": 0.0007300098077394068, + "loss_xval": 0.0123291015625, + "num_input_tokens_seen": 114041600, + "step": 1650 + }, + { + "epoch": 103.1875, + "grad_norm": 8.891149925560153, + "learning_rate": 5e-05, + "loss": 0.0167, + "num_input_tokens_seen": 114113216, + "step": 1651 + }, + { + "epoch": 103.1875, + "loss": 0.016599709168076515, + "loss_ce": 0.0008526380988769233, + "loss_xval": 0.0157470703125, + "num_input_tokens_seen": 114113216, + "step": 1651 + }, + { + "epoch": 103.25, + "grad_norm": 0.12653154608843153, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 114184960, + "step": 1652 + }, + { + "epoch": 103.25, + "loss": 0.0012427711626514792, + "loss_ce": 0.0008498573442921042, + "loss_xval": 0.000392913818359375, + "num_input_tokens_seen": 114184960, + "step": 1652 + }, + { + "epoch": 103.3125, + "grad_norm": 7.470563693408022, + "learning_rate": 5e-05, + "loss": 0.0121, + "num_input_tokens_seen": 114256640, + "step": 1653 + }, + { + "epoch": 103.3125, + "loss": 0.012393683195114136, + "loss_ce": 0.0007970034494064748, + "loss_xval": 0.0115966796875, + "num_input_tokens_seen": 114256640, + "step": 1653 + }, + { + "epoch": 103.375, + "grad_norm": 7.080284493962458, + "learning_rate": 5e-05, + "loss": 0.0112, + "num_input_tokens_seen": 114328192, + "step": 1654 + }, + { + "epoch": 103.375, + "loss": 0.011119197122752666, + "loss_ce": 0.0008042553090490401, + "loss_xval": 0.01031494140625, + "num_input_tokens_seen": 114328192, + "step": 1654 + }, + { + "epoch": 103.4375, + "grad_norm": 1.1158407425238983, + "learning_rate": 5e-05, + "loss": 0.0016, + "num_input_tokens_seen": 114399808, + "step": 1655 + }, + { + "epoch": 103.4375, + "loss": 0.001563415164127946, + "loss_ce": 0.0007966610719449818, + "loss_xval": 0.000766754150390625, + "num_input_tokens_seen": 114399808, + "step": 1655 + }, + { + "epoch": 103.5, + "grad_norm": 6.957097591177087, + "learning_rate": 5e-05, + "loss": 0.0112, + "num_input_tokens_seen": 114471424, + "step": 1656 + }, + { + "epoch": 103.5, + "loss": 0.012271175161004066, + "loss_ce": 0.0007965656695887446, + "loss_xval": 0.011474609375, + "num_input_tokens_seen": 114471424, + "step": 1656 + }, + { + "epoch": 103.5625, + "grad_norm": 5.014957813994443, + "learning_rate": 5e-05, + "loss": 0.0063, + "num_input_tokens_seen": 114530496, + "step": 1657 + }, + { + "epoch": 103.5625, + "loss": 0.006433679256588221, + "loss_ce": 0.0007574096089228988, + "loss_xval": 0.00567626953125, + "num_input_tokens_seen": 114530496, + "step": 1657 + }, + { + "epoch": 103.625, + "grad_norm": 1.9327560449349166, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 114589504, + "step": 1658 + }, + { + "epoch": 103.625, + "loss": 0.0017968413885682821, + "loss_ce": 0.0007363555487245321, + "loss_xval": 0.00106048583984375, + "num_input_tokens_seen": 114589504, + "step": 1658 + }, + { + "epoch": 103.6875, + "grad_norm": 6.253396113562765, + "learning_rate": 5e-05, + "loss": 0.009, + "num_input_tokens_seen": 114648640, + "step": 1659 + }, + { + "epoch": 103.6875, + "loss": 0.008760355412960052, + "loss_ce": 0.0007037151954136789, + "loss_xval": 0.008056640625, + "num_input_tokens_seen": 114648640, + "step": 1659 + }, + { + "epoch": 103.75, + "grad_norm": 3.665027561445157, + "learning_rate": 5e-05, + "loss": 0.0037, + "num_input_tokens_seen": 114707648, + "step": 1660 + }, + { + "epoch": 103.75, + "loss": 0.0037118252366781235, + "loss_ce": 0.0007211025804281235, + "loss_xval": 0.00299072265625, + "num_input_tokens_seen": 114707648, + 
"step": 1660 + }, + { + "epoch": 103.8125, + "grad_norm": 2.8591045359372713, + "learning_rate": 5e-05, + "loss": 0.0027, + "num_input_tokens_seen": 114779264, + "step": 1661 + }, + { + "epoch": 103.8125, + "loss": 0.0027316624764353037, + "loss_ce": 0.0006717260112054646, + "loss_xval": 0.0020599365234375, + "num_input_tokens_seen": 114779264, + "step": 1661 + }, + { + "epoch": 103.875, + "grad_norm": 5.895781742475335, + "learning_rate": 5e-05, + "loss": 0.0081, + "num_input_tokens_seen": 114851072, + "step": 1662 + }, + { + "epoch": 103.875, + "loss": 0.007647594437003136, + "loss_ce": 0.0006285517010837793, + "loss_xval": 0.00701904296875, + "num_input_tokens_seen": 114851072, + "step": 1662 + }, + { + "epoch": 103.9375, + "grad_norm": 1.5133022469671882, + "learning_rate": 5e-05, + "loss": 0.0019, + "num_input_tokens_seen": 114910080, + "step": 1663 + }, + { + "epoch": 103.9375, + "loss": 0.0016773771494626999, + "loss_ce": 0.0007656645029783249, + "loss_xval": 0.000911712646484375, + "num_input_tokens_seen": 114910080, + "step": 1663 + }, + { + "epoch": 104.0, + "grad_norm": 3.610526889212173, + "learning_rate": 5e-05, + "loss": 0.0038, + "num_input_tokens_seen": 114981632, + "step": 1664 + }, + { + "epoch": 104.0, + "loss": 0.003930710256099701, + "loss_ce": 0.0007263646693900228, + "loss_xval": 0.003204345703125, + "num_input_tokens_seen": 114981632, + "step": 1664 + }, + { + "epoch": 104.0625, + "grad_norm": 4.220355319451222, + "learning_rate": 5e-05, + "loss": 0.0047, + "num_input_tokens_seen": 115053184, + "step": 1665 + }, + { + "epoch": 104.0625, + "loss": 0.005380589049309492, + "loss_ce": 0.0006198466871865094, + "loss_xval": 0.0047607421875, + "num_input_tokens_seen": 115053184, + "step": 1665 + }, + { + "epoch": 104.125, + "grad_norm": 0.07905389863386036, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 115124800, + "step": 1666 + }, + { + "epoch": 104.125, + "loss": 0.0010264108423143625, + "loss_ce": 0.0006354043725878, + "loss_xval": 0.0003910064697265625, + "num_input_tokens_seen": 115124800, + "step": 1666 + }, + { + "epoch": 104.1875, + "grad_norm": 3.7039766082303043, + "learning_rate": 5e-05, + "loss": 0.0038, + "num_input_tokens_seen": 115196416, + "step": 1667 + }, + { + "epoch": 104.1875, + "loss": 0.0037187503185123205, + "loss_ce": 0.0007127689314074814, + "loss_xval": 0.0030059814453125, + "num_input_tokens_seen": 115196416, + "step": 1667 + }, + { + "epoch": 104.25, + "grad_norm": 3.05507735450659, + "learning_rate": 5e-05, + "loss": 0.0027, + "num_input_tokens_seen": 115268032, + "step": 1668 + }, + { + "epoch": 104.25, + "loss": 0.00257797259837389, + "loss_ce": 0.0005943299038335681, + "loss_xval": 0.001983642578125, + "num_input_tokens_seen": 115268032, + "step": 1668 + }, + { + "epoch": 104.3125, + "grad_norm": 1.0814883240904307, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 115339840, + "step": 1669 + }, + { + "epoch": 104.3125, + "loss": 0.0011010868474841118, + "loss_ce": 0.0005326969549059868, + "loss_xval": 0.000568389892578125, + "num_input_tokens_seen": 115339840, + "step": 1669 + }, + { + "epoch": 104.375, + "grad_norm": 3.690749536698239, + "learning_rate": 5e-05, + "loss": 0.0035, + "num_input_tokens_seen": 115411392, + "step": 1670 + }, + { + "epoch": 104.375, + "loss": 0.0035830074921250343, + "loss_ce": 0.0005770259886048734, + "loss_xval": 0.0030059814453125, + "num_input_tokens_seen": 115411392, + "step": 1670 + }, + { + "epoch": 104.4375, + "grad_norm": 1.8211360411562902, + 
"learning_rate": 5e-05, + "loss": 0.0015, + "num_input_tokens_seen": 115470464, + "step": 1671 + }, + { + "epoch": 104.4375, + "loss": 0.001722783432342112, + "loss_ce": 0.0006241506198421121, + "loss_xval": 0.0010986328125, + "num_input_tokens_seen": 115470464, + "step": 1671 + }, + { + "epoch": 104.5, + "grad_norm": 2.054217315006852, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_input_tokens_seen": 115542016, + "step": 1672 + }, + { + "epoch": 104.5, + "loss": 0.0015874525997787714, + "loss_ce": 0.00048119042185135186, + "loss_xval": 0.00110626220703125, + "num_input_tokens_seen": 115542016, + "step": 1672 + }, + { + "epoch": 104.5625, + "grad_norm": 3.3353779554239678, + "learning_rate": 5e-05, + "loss": 0.003, + "num_input_tokens_seen": 115613760, + "step": 1673 + }, + { + "epoch": 104.5625, + "loss": 0.0029215288814157248, + "loss_ce": 0.0005258990568108857, + "loss_xval": 0.0023956298828125, + "num_input_tokens_seen": 115613760, + "step": 1673 + }, + { + "epoch": 104.625, + "grad_norm": 0.39359180484763523, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 115685376, + "step": 1674 + }, + { + "epoch": 104.625, + "loss": 0.0007368091028183699, + "loss_ce": 0.0005298617761582136, + "loss_xval": 0.00020694732666015625, + "num_input_tokens_seen": 115685376, + "step": 1674 + }, + { + "epoch": 104.6875, + "grad_norm": 3.1102239317177895, + "learning_rate": 5e-05, + "loss": 0.0026, + "num_input_tokens_seen": 115744512, + "step": 1675 + }, + { + "epoch": 104.6875, + "loss": 0.0027107985224574804, + "loss_ce": 0.00048301531933248043, + "loss_xval": 0.002227783203125, + "num_input_tokens_seen": 115744512, + "step": 1675 + }, + { + "epoch": 104.75, + "grad_norm": 2.048859685459329, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_input_tokens_seen": 115816064, + "step": 1676 + }, + { + "epoch": 104.75, + "loss": 0.0014386232942342758, + "loss_ce": 0.00043154318700544536, + "loss_xval": 0.001007080078125, + "num_input_tokens_seen": 115816064, + "step": 1676 + }, + { + "epoch": 104.8125, + "grad_norm": 1.3415920474072838, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 115875136, + "step": 1677 + }, + { + "epoch": 104.8125, + "loss": 0.0009290423477068543, + "loss_ce": 0.0004293169768061489, + "loss_xval": 0.000499725341796875, + "num_input_tokens_seen": 115875136, + "step": 1677 + }, + { + "epoch": 104.875, + "grad_norm": 2.761071000900736, + "learning_rate": 5e-05, + "loss": 0.0021, + "num_input_tokens_seen": 115946816, + "step": 1678 + }, + { + "epoch": 104.875, + "loss": 0.002234385348856449, + "loss_ce": 0.00043384829768911004, + "loss_xval": 0.001800537109375, + "num_input_tokens_seen": 115946816, + "step": 1678 + }, + { + "epoch": 104.9375, + "grad_norm": 0.35031847383260256, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 116018368, + "step": 1679 + }, + { + "epoch": 104.9375, + "loss": 0.0006341906264424324, + "loss_ce": 0.0004215212247800082, + "loss_xval": 0.00021266937255859375, + "num_input_tokens_seen": 116018368, + "step": 1679 + }, + { + "epoch": 105.0, + "grad_norm": 2.156864692866578, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_input_tokens_seen": 116089984, + "step": 1680 + }, + { + "epoch": 105.0, + "loss": 0.0013213963247835636, + "loss_ce": 0.0003448338247835636, + "loss_xval": 0.0009765625, + "num_input_tokens_seen": 116089984, + "step": 1680 + }, + { + "epoch": 105.0625, + "grad_norm": 1.561713135829587, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 116161600, + 
"step": 1681 + }, + { + "epoch": 105.0625, + "loss": 0.0013053379952907562, + "loss_ce": 0.00038981062243692577, + "loss_xval": 0.00091552734375, + "num_input_tokens_seen": 116161600, + "step": 1681 + }, + { + "epoch": 105.125, + "grad_norm": 1.2670996351838228, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 116233280, + "step": 1682 + }, + { + "epoch": 105.125, + "loss": 0.0008212352404370904, + "loss_ce": 0.0003024364123120904, + "loss_xval": 0.000518798828125, + "num_input_tokens_seen": 116233280, + "step": 1682 + }, + { + "epoch": 105.1875, + "grad_norm": 1.765763884638601, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 116292352, + "step": 1683 + }, + { + "epoch": 105.1875, + "loss": 0.0009669559658505023, + "loss_ce": 0.00026886636624112725, + "loss_xval": 0.000698089599609375, + "num_input_tokens_seen": 116292352, + "step": 1683 + }, + { + "epoch": 105.25, + "grad_norm": 0.057805268793867895, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 116364032, + "step": 1684 + }, + { + "epoch": 105.25, + "loss": 0.0003976496518589556, + "loss_ce": 0.0003156336606480181, + "loss_xval": 8.20159912109375e-05, + "num_input_tokens_seen": 116364032, + "step": 1684 + }, + { + "epoch": 105.3125, + "grad_norm": 1.6781220119027798, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 116435648, + "step": 1685 + }, + { + "epoch": 105.3125, + "loss": 0.0009655409958213568, + "loss_ce": 0.0003055983397644013, + "loss_xval": 0.000659942626953125, + "num_input_tokens_seen": 116435648, + "step": 1685 + }, + { + "epoch": 105.375, + "grad_norm": 0.6101874326629925, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 116507328, + "step": 1686 + }, + { + "epoch": 105.375, + "loss": 0.00042850777390412986, + "loss_ce": 0.0002482633281033486, + "loss_xval": 0.00018024444580078125, + "num_input_tokens_seen": 116507328, + "step": 1686 + }, + { + "epoch": 105.4375, + "grad_norm": 0.9843981208119894, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 116578944, + "step": 1687 + }, + { + "epoch": 105.4375, + "loss": 0.0005340241477824748, + "loss_ce": 0.00027081003645434976, + "loss_xval": 0.000263214111328125, + "num_input_tokens_seen": 116578944, + "step": 1687 + }, + { + "epoch": 105.5, + "grad_norm": 1.1536326062306255, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 116638144, + "step": 1688 + }, + { + "epoch": 105.5, + "loss": 0.0008680825121700764, + "loss_ce": 0.0002806191623676568, + "loss_xval": 0.00058746337890625, + "num_input_tokens_seen": 116638144, + "step": 1688 + }, + { + "epoch": 105.5625, + "grad_norm": 0.5324269020281875, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 116709824, + "step": 1689 + }, + { + "epoch": 105.5625, + "loss": 0.0004137876385357231, + "loss_ce": 0.00029076365171931684, + "loss_xval": 0.00012302398681640625, + "num_input_tokens_seen": 116709824, + "step": 1689 + }, + { + "epoch": 105.625, + "grad_norm": 1.2980282215142769, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 116781504, + "step": 1690 + }, + { + "epoch": 105.625, + "loss": 0.0009148502722382545, + "loss_ce": 0.00024727825075387955, + "loss_xval": 0.000667572021484375, + "num_input_tokens_seen": 116781504, + "step": 1690 + }, + { + "epoch": 105.6875, + "grad_norm": 0.29562102696361137, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 116840640, + "step": 1691 + }, + { + "epoch": 105.6875, + "loss": 
0.0003725567366927862, + "loss_ce": 0.0002333202719455585, + "loss_xval": 0.0001392364501953125, + "num_input_tokens_seen": 116840640, + "step": 1691 + }, + { + "epoch": 105.75, + "grad_norm": 1.0232409981555282, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 116912256, + "step": 1692 + }, + { + "epoch": 105.75, + "loss": 0.0005665783537551761, + "loss_ce": 0.00021371884213294834, + "loss_xval": 0.0003528594970703125, + "num_input_tokens_seen": 116912256, + "step": 1692 + }, + { + "epoch": 105.8125, + "grad_norm": 0.34358201221822143, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 116983936, + "step": 1693 + }, + { + "epoch": 105.8125, + "loss": 0.0003722688998095691, + "loss_ce": 0.00023589345801156014, + "loss_xval": 0.00013637542724609375, + "num_input_tokens_seen": 116983936, + "step": 1693 + }, + { + "epoch": 105.875, + "grad_norm": 1.2389221421776981, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 117055616, + "step": 1694 + }, + { + "epoch": 105.875, + "loss": 0.0006702013779431581, + "loss_ce": 0.00023914058692753315, + "loss_xval": 0.000431060791015625, + "num_input_tokens_seen": 117055616, + "step": 1694 + }, + { + "epoch": 105.9375, + "grad_norm": 0.62070963216232, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 117114688, + "step": 1695 + }, + { + "epoch": 105.9375, + "loss": 0.0003860698197968304, + "loss_ce": 0.0001896128960652277, + "loss_xval": 0.0001964569091796875, + "num_input_tokens_seen": 117114688, + "step": 1695 + }, + { + "epoch": 106.0, + "grad_norm": 1.151688414326741, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 117186368, + "step": 1696 + }, + { + "epoch": 106.0, + "loss": 0.0006602061912417412, + "loss_ce": 0.00020625718752853572, + "loss_xval": 0.000453948974609375, + "num_input_tokens_seen": 117186368, + "step": 1696 + }, + { + "epoch": 106.0625, + "grad_norm": 0.9367465334806727, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 117258176, + "step": 1697 + }, + { + "epoch": 106.0625, + "loss": 0.00044638433610089123, + "loss_ce": 0.00018317022477276623, + "loss_xval": 0.000263214111328125, + "num_input_tokens_seen": 117258176, + "step": 1697 + }, + { + "epoch": 106.125, + "grad_norm": 1.0215678861138757, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 117329728, + "step": 1698 + }, + { + "epoch": 106.125, + "loss": 0.000491351296659559, + "loss_ce": 0.00017663878679741174, + "loss_xval": 0.0003147125244140625, + "num_input_tokens_seen": 117329728, + "step": 1698 + }, + { + "epoch": 106.1875, + "grad_norm": 1.0125715492817908, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 117401408, + "step": 1699 + }, + { + "epoch": 106.1875, + "loss": 0.0005752349970862269, + "loss_ce": 0.0001746918132994324, + "loss_xval": 0.000400543212890625, + "num_input_tokens_seen": 117401408, + "step": 1699 + }, + { + "epoch": 106.25, + "grad_norm": 1.066618829875019, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 117473216, + "step": 1700 + }, + { + "epoch": 106.25, + "loss": 0.00048783019883558154, + "loss_ce": 0.00015785889991093427, + "loss_xval": 0.0003299713134765625, + "num_input_tokens_seen": 117473216, + "step": 1700 + }, + { + "epoch": 106.3125, + "grad_norm": 0.6928800245749516, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 117544768, + "step": 1701 + }, + { + "epoch": 106.3125, + "loss": 0.00034172070445492864, + "loss_ce": 
0.00015289320435840636, + "loss_xval": 0.0001888275146484375, + "num_input_tokens_seen": 117544768, + "step": 1701 + }, + { + "epoch": 106.375, + "grad_norm": 0.6562982159952676, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 117616512, + "step": 1702 + }, + { + "epoch": 106.375, + "loss": 0.0002900107065215707, + "loss_ce": 0.00017795395979192108, + "loss_xval": 0.00011205673217773438, + "num_input_tokens_seen": 117616512, + "step": 1702 + }, + { + "epoch": 106.4375, + "grad_norm": 0.8476027009117076, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 117675712, + "step": 1703 + }, + { + "epoch": 106.4375, + "loss": 0.00037799362326040864, + "loss_ce": 0.00015960219025146216, + "loss_xval": 0.00021839141845703125, + "num_input_tokens_seen": 117675712, + "step": 1703 + }, + { + "epoch": 106.5, + "grad_norm": 0.30678107087599277, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 117734784, + "step": 1704 + }, + { + "epoch": 106.5, + "loss": 0.00021619546168949455, + "loss_ce": 0.00012798058742191643, + "loss_xval": 8.821487426757812e-05, + "num_input_tokens_seen": 117734784, + "step": 1704 + }, + { + "epoch": 106.5625, + "grad_norm": 0.9752061317620482, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 117806592, + "step": 1705 + }, + { + "epoch": 106.5625, + "loss": 0.000510626588948071, + "loss_ce": 0.00015204506053123623, + "loss_xval": 0.00035858154296875, + "num_input_tokens_seen": 117806592, + "step": 1705 + }, + { + "epoch": 106.625, + "grad_norm": 0.11033428610619243, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 117878336, + "step": 1706 + }, + { + "epoch": 106.625, + "loss": 0.00024724070681259036, + "loss_ce": 0.00014472071779891849, + "loss_xval": 0.00010251998901367188, + "num_input_tokens_seen": 117878336, + "step": 1706 + }, + { + "epoch": 106.6875, + "grad_norm": 0.8970902998903458, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 117949952, + "step": 1707 + }, + { + "epoch": 106.6875, + "loss": 0.000420129275880754, + "loss_ce": 0.00013593431503977627, + "loss_xval": 0.0002841949462890625, + "num_input_tokens_seen": 117949952, + "step": 1707 + }, + { + "epoch": 106.75, + "grad_norm": 0.0864938089783715, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 118021504, + "step": 1708 + }, + { + "epoch": 106.75, + "loss": 0.00023897687788121402, + "loss_ce": 0.00012930433149449527, + "loss_xval": 0.00010967254638671875, + "num_input_tokens_seen": 118021504, + "step": 1708 + }, + { + "epoch": 106.8125, + "grad_norm": 0.8691233585505428, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 118093120, + "step": 1709 + }, + { + "epoch": 106.8125, + "loss": 0.00035278405994176865, + "loss_ce": 0.00015060511941555887, + "loss_xval": 0.000202178955078125, + "num_input_tokens_seen": 118093120, + "step": 1709 + }, + { + "epoch": 106.875, + "grad_norm": 0.2920927641095561, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 118164864, + "step": 1710 + }, + { + "epoch": 106.875, + "loss": 0.0003268433501943946, + "loss_ce": 0.00013038642646279186, + "loss_xval": 0.0001964569091796875, + "num_input_tokens_seen": 118164864, + "step": 1710 + }, + { + "epoch": 106.9375, + "grad_norm": 0.6542078281721475, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 118236480, + "step": 1711 + }, + { + "epoch": 106.9375, + "loss": 0.00030111701926216483, + "loss_ce": 0.00014566810568794608, + 
"loss_xval": 0.00015544891357421875, + "num_input_tokens_seen": 118236480, + "step": 1711 + }, + { + "epoch": 107.0, + "grad_norm": 0.45484764465508853, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 118308032, + "step": 1712 + }, + { + "epoch": 107.0, + "loss": 0.00029993202770128846, + "loss_ce": 0.00013590004527941346, + "loss_xval": 0.000164031982421875, + "num_input_tokens_seen": 118308032, + "step": 1712 + }, + { + "epoch": 107.0625, + "grad_norm": 0.1475064477319675, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 118379712, + "step": 1713 + }, + { + "epoch": 107.0625, + "loss": 0.00025952374562621117, + "loss_ce": 0.00013268507609609514, + "loss_xval": 0.00012683868408203125, + "num_input_tokens_seen": 118379712, + "step": 1713 + }, + { + "epoch": 107.125, + "grad_norm": 0.25357679220631113, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 118438720, + "step": 1714 + }, + { + "epoch": 107.125, + "loss": 0.00023810943821445107, + "loss_ce": 0.00012557586887851357, + "loss_xval": 0.0001125335693359375, + "num_input_tokens_seen": 118438720, + "step": 1714 + }, + { + "epoch": 107.1875, + "grad_norm": 0.0193646792988071, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 118510336, + "step": 1715 + }, + { + "epoch": 107.1875, + "loss": 0.00020649436919484288, + "loss_ce": 0.00012018684356007725, + "loss_xval": 8.630752563476562e-05, + "num_input_tokens_seen": 118510336, + "step": 1715 + }, + { + "epoch": 107.25, + "grad_norm": 0.24234549892618124, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 118581888, + "step": 1716 + }, + { + "epoch": 107.25, + "loss": 0.00017738385940901935, + "loss_ce": 0.00011253401316935197, + "loss_xval": 6.4849853515625e-05, + "num_input_tokens_seen": 118581888, + "step": 1716 + }, + { + "epoch": 107.3125, + "grad_norm": 0.16171066044051577, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 118653504, + "step": 1717 + }, + { + "epoch": 107.3125, + "loss": 0.0002221949107479304, + "loss_ce": 0.00011490655015222728, + "loss_xval": 0.00010728836059570312, + "num_input_tokens_seen": 118653504, + "step": 1717 + }, + { + "epoch": 107.375, + "grad_norm": 0.23147773169592833, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 118725184, + "step": 1718 + }, + { + "epoch": 107.375, + "loss": 0.00021182381897233427, + "loss_ce": 9.976707951864228e-05, + "loss_xval": 0.00011205673217773438, + "num_input_tokens_seen": 118725184, + "step": 1718 + }, + { + "epoch": 107.4375, + "grad_norm": 0.44545970064903945, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 118796800, + "step": 1719 + }, + { + "epoch": 107.4375, + "loss": 0.0002488536119926721, + "loss_ce": 9.817306272452697e-05, + "loss_xval": 0.0001506805419921875, + "num_input_tokens_seen": 118796800, + "step": 1719 + }, + { + "epoch": 107.5, + "grad_norm": 0.04274000590970272, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 118868416, + "step": 1720 + }, + { + "epoch": 107.5, + "loss": 0.0001888109982246533, + "loss_ce": 0.0001010729611152783, + "loss_xval": 8.7738037109375e-05, + "num_input_tokens_seen": 118868416, + "step": 1720 + }, + { + "epoch": 107.5625, + "grad_norm": 0.348588123620522, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 118940032, + "step": 1721 + }, + { + "epoch": 107.5625, + "loss": 0.00023122888524085283, + "loss_ce": 0.00010915856546489522, + "loss_xval": 0.0001220703125, + 
"num_input_tokens_seen": 118940032, + "step": 1721 + }, + { + "epoch": 107.625, + "grad_norm": 0.13433877341461867, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 119011712, + "step": 1722 + }, + { + "epoch": 107.625, + "loss": 0.00021391075279098004, + "loss_ce": 0.00010662239219527692, + "loss_xval": 0.00010728836059570312, + "num_input_tokens_seen": 119011712, + "step": 1722 + }, + { + "epoch": 107.6875, + "grad_norm": 0.6809539631450718, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 119083264, + "step": 1723 + }, + { + "epoch": 107.6875, + "loss": 0.00023257522843778133, + "loss_ce": 9.715346823213622e-05, + "loss_xval": 0.0001354217529296875, + "num_input_tokens_seen": 119083264, + "step": 1723 + }, + { + "epoch": 107.75, + "grad_norm": 0.1889751093111315, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 119154880, + "step": 1724 + }, + { + "epoch": 107.75, + "loss": 0.00014927498705219477, + "loss_ce": 9.658448107074946e-05, + "loss_xval": 5.269050598144531e-05, + "num_input_tokens_seen": 119154880, + "step": 1724 + }, + { + "epoch": 107.8125, + "grad_norm": 0.49261580074161004, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 119226496, + "step": 1725 + }, + { + "epoch": 107.8125, + "loss": 0.00027312879683449864, + "loss_ce": 0.00010337477579014376, + "loss_xval": 0.0001697540283203125, + "num_input_tokens_seen": 119226496, + "step": 1725 + }, + { + "epoch": 107.875, + "grad_norm": 0.1872703937412782, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 119285440, + "step": 1726 + }, + { + "epoch": 107.875, + "loss": 0.00020403148664627224, + "loss_ce": 9.197475446853787e-05, + "loss_xval": 0.00011205673217773438, + "num_input_tokens_seen": 119285440, + "step": 1726 + }, + { + "epoch": 107.9375, + "grad_norm": 0.44040056668876976, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 119357120, + "step": 1727 + }, + { + "epoch": 107.9375, + "loss": 0.0002309640112798661, + "loss_ce": 9.554225107422099e-05, + "loss_xval": 0.0001354217529296875, + "num_input_tokens_seen": 119357120, + "step": 1727 + }, + { + "epoch": 108.0, + "grad_norm": 0.4350724614073691, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 119428800, + "step": 1728 + }, + { + "epoch": 108.0, + "loss": 0.000171815074281767, + "loss_ce": 8.503071148879826e-05, + "loss_xval": 8.678436279296875e-05, + "num_input_tokens_seen": 119428800, + "step": 1728 + }, + { + "epoch": 108.0625, + "grad_norm": 0.12261902416252166, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 119500480, + "step": 1729 + }, + { + "epoch": 108.0625, + "loss": 0.00015668789274059236, + "loss_ce": 0.00010542790550971404, + "loss_xval": 5.125999450683594e-05, + "num_input_tokens_seen": 119500480, + "step": 1729 + }, + { + "epoch": 108.125, + "grad_norm": 0.04068237379475871, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 119559680, + "step": 1730 + }, + { + "epoch": 108.125, + "loss": 0.00014538603136315942, + "loss_ce": 8.292036363855004e-05, + "loss_xval": 6.246566772460938e-05, + "num_input_tokens_seen": 119559680, + "step": 1730 + }, + { + "epoch": 108.1875, + "grad_norm": 0.21295300564952035, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 119631424, + "step": 1731 + }, + { + "epoch": 108.1875, + "loss": 0.00018561960314400494, + "loss_ce": 7.117867789929733e-05, + "loss_xval": 0.00011444091796875, + "num_input_tokens_seen": 
119631424, + "step": 1731 + }, + { + "epoch": 108.25, + "grad_norm": 0.6202458907209969, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 119702976, + "step": 1732 + }, + { + "epoch": 108.25, + "loss": 0.00030045691528357565, + "loss_ce": 8.015814091777429e-05, + "loss_xval": 0.00022029876708984375, + "num_input_tokens_seen": 119702976, + "step": 1732 + }, + { + "epoch": 108.3125, + "grad_norm": 0.3733458594831618, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 119774656, + "step": 1733 + }, + { + "epoch": 108.3125, + "loss": 0.00019437010632827878, + "loss_ce": 8.517439709976315e-05, + "loss_xval": 0.00010919570922851562, + "num_input_tokens_seen": 119774656, + "step": 1733 + }, + { + "epoch": 108.375, + "grad_norm": 0.788826552316628, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 119846336, + "step": 1734 + }, + { + "epoch": 108.375, + "loss": 0.0002434041816741228, + "loss_ce": 7.93721919762902e-05, + "loss_xval": 0.000164031982421875, + "num_input_tokens_seen": 119846336, + "step": 1734 + }, + { + "epoch": 108.4375, + "grad_norm": 1.4110098626578793, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 119918080, + "step": 1735 + }, + { + "epoch": 108.4375, + "loss": 0.0004943107487633824, + "loss_ce": 9.376756497658789e-05, + "loss_xval": 0.000400543212890625, + "num_input_tokens_seen": 119918080, + "step": 1735 + }, + { + "epoch": 108.5, + "grad_norm": 0.2677482367315443, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 119989632, + "step": 1736 + }, + { + "epoch": 108.5, + "loss": 0.0001648281468078494, + "loss_ce": 7.709010242251679e-05, + "loss_xval": 8.7738037109375e-05, + "num_input_tokens_seen": 119989632, + "step": 1736 + }, + { + "epoch": 108.5625, + "grad_norm": 1.4965836171714064, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 120061376, + "step": 1737 + }, + { + "epoch": 108.5625, + "loss": 0.0004614101198967546, + "loss_ce": 6.849630881333724e-05, + "loss_xval": 0.000392913818359375, + "num_input_tokens_seen": 120061376, + "step": 1737 + }, + { + "epoch": 108.625, + "grad_norm": 1.7362134972145584, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 120132928, + "step": 1738 + }, + { + "epoch": 108.625, + "loss": 0.000654931936878711, + "loss_ce": 8.65420515765436e-05, + "loss_xval": 0.000568389892578125, + "num_input_tokens_seen": 120132928, + "step": 1738 + }, + { + "epoch": 108.6875, + "grad_norm": 0.10497585603783582, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 120192192, + "step": 1739 + }, + { + "epoch": 108.6875, + "loss": 0.00011801804066635668, + "loss_ce": 7.1526417741552e-05, + "loss_xval": 4.649162292480469e-05, + "num_input_tokens_seen": 120192192, + "step": 1739 + }, + { + "epoch": 108.75, + "grad_norm": 1.9549083462496235, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 120264000, + "step": 1740 + }, + { + "epoch": 108.75, + "loss": 0.0008286124211736023, + "loss_ce": 7.330238440772519e-05, + "loss_xval": 0.00075531005859375, + "num_input_tokens_seen": 120264000, + "step": 1740 + }, + { + "epoch": 108.8125, + "grad_norm": 1.64600119762836, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 120323136, + "step": 1741 + }, + { + "epoch": 108.8125, + "loss": 0.0006632714066654444, + "loss_ce": 7.199330138973892e-05, + "loss_xval": 0.000591278076171875, + "num_input_tokens_seen": 120323136, + "step": 1741 + }, + { + "epoch": 108.875, + 
"grad_norm": 0.31792835465951985, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 120394688, + "step": 1742 + }, + { + "epoch": 108.875, + "loss": 0.00013815359852742404, + "loss_ce": 7.091955922078341e-05, + "loss_xval": 6.723403930664062e-05, + "num_input_tokens_seen": 120394688, + "step": 1742 + }, + { + "epoch": 108.9375, + "grad_norm": 1.8780950319980536, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 120466432, + "step": 1743 + }, + { + "epoch": 108.9375, + "loss": 0.0008036716608330607, + "loss_ce": 7.12497640051879e-05, + "loss_xval": 0.000732421875, + "num_input_tokens_seen": 120466432, + "step": 1743 + }, + { + "epoch": 109.0, + "grad_norm": 1.738570960504345, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 120525568, + "step": 1744 + }, + { + "epoch": 109.0, + "loss": 0.0007079385104589164, + "loss_ce": 7.088408892741427e-05, + "loss_xval": 0.000637054443359375, + "num_input_tokens_seen": 120525568, + "step": 1744 + }, + { + "epoch": 109.0625, + "grad_norm": 0.06762341183011107, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 120584704, + "step": 1745 + }, + { + "epoch": 109.0625, + "loss": 0.0001587302831467241, + "loss_ce": 6.527020741486922e-05, + "loss_xval": 9.34600830078125e-05, + "num_input_tokens_seen": 120584704, + "step": 1745 + }, + { + "epoch": 109.125, + "grad_norm": 1.6084002598386766, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 120656384, + "step": 1746 + }, + { + "epoch": 109.125, + "loss": 0.0006961078615859151, + "loss_ce": 7.049749547149986e-05, + "loss_xval": 0.0006256103515625, + "num_input_tokens_seen": 120656384, + "step": 1746 + }, + { + "epoch": 109.1875, + "grad_norm": 1.8576904874062472, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 120728000, + "step": 1747 + }, + { + "epoch": 109.1875, + "loss": 0.0008069484028965235, + "loss_ce": 8.597062696935609e-05, + "loss_xval": 0.000720977783203125, + "num_input_tokens_seen": 120728000, + "step": 1747 + }, + { + "epoch": 109.25, + "grad_norm": 0.4006157882739807, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 120799552, + "step": 1748 + }, + { + "epoch": 109.25, + "loss": 0.0001714739773888141, + "loss_ce": 6.561612826772034e-05, + "loss_xval": 0.00010585784912109375, + "num_input_tokens_seen": 120799552, + "step": 1748 + }, + { + "epoch": 109.3125, + "grad_norm": 1.7118247767867223, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 120871168, + "step": 1749 + }, + { + "epoch": 109.3125, + "loss": 0.0007678983965888619, + "loss_ce": 6.21793806203641e-05, + "loss_xval": 0.000705718994140625, + "num_input_tokens_seen": 120871168, + "step": 1749 + }, + { + "epoch": 109.375, + "grad_norm": 2.6824813308961724, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_input_tokens_seen": 120930240, + "step": 1750 + }, + { + "epoch": 109.375, + "eval_synth_IoU": 0.06953348591923714, + "eval_synth_MAE_x": 0.02272796630859375, + "eval_synth_MAE_y": 0.023560963571071625, + "eval_synth_NUM_probability": 0.9992043226957321, + "eval_synth_inside_bbox": 0.1875, + "eval_synth_loss": 0.0006334885256364942, + "eval_synth_loss_ce": 7.082072261255234e-05, + "eval_synth_loss_xval": 0.0005626678466796875, + "eval_synth_runtime": 58.1486, + "eval_synth_samples_per_second": 2.201, + "eval_synth_steps_per_second": 0.069, + "num_input_tokens_seen": 120930240, + "step": 1750 + }, + { + "epoch": 109.375, + "loss": 0.0005780013161711395, + "loss_ce": 
6.301718531176448e-05, + "loss_xval": 0.000514984130859375, + "num_input_tokens_seen": 120930240, + "step": 1750 + }, + { + "epoch": 109.4375, + "grad_norm": 1.579022788002766, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 121001920, + "step": 1751 + }, + { + "epoch": 109.4375, + "loss": 0.0007101305527612567, + "loss_ce": 7.307608029805124e-05, + "loss_xval": 0.000637054443359375, + "num_input_tokens_seen": 121001920, + "step": 1751 + }, + { + "epoch": 109.5, + "grad_norm": 0.7241471043715434, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 121073472, + "step": 1752 + }, + { + "epoch": 109.5, + "loss": 0.00023262601462192833, + "loss_ce": 6.76403651596047e-05, + "loss_xval": 0.00016498565673828125, + "num_input_tokens_seen": 121073472, + "step": 1752 + }, + { + "epoch": 109.5625, + "grad_norm": 2.1732530424914747, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 121145088, + "step": 1753 + }, + { + "epoch": 109.5625, + "loss": 0.0010409480892121792, + "loss_ce": 6.820028647780418e-05, + "loss_xval": 0.000972747802734375, + "num_input_tokens_seen": 121145088, + "step": 1753 + }, + { + "epoch": 109.625, + "grad_norm": 1.7185759244489052, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 121216640, + "step": 1754 + }, + { + "epoch": 109.625, + "loss": 0.0007494474994018674, + "loss_ce": 6.2801998865325e-05, + "loss_xval": 0.0006866455078125, + "num_input_tokens_seen": 121216640, + "step": 1754 + }, + { + "epoch": 109.6875, + "grad_norm": 0.25256629473919784, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 121288256, + "step": 1755 + }, + { + "epoch": 109.6875, + "loss": 0.00019676412921398878, + "loss_ce": 6.038870196789503e-05, + "loss_xval": 0.00013637542724609375, + "num_input_tokens_seen": 121288256, + "step": 1755 + }, + { + "epoch": 109.75, + "grad_norm": 2.321287818343787, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 121347520, + "step": 1756 + }, + { + "epoch": 109.75, + "loss": 0.0010661354754120111, + "loss_ce": 6.668480637017637e-05, + "loss_xval": 0.00099945068359375, + "num_input_tokens_seen": 121347520, + "step": 1756 + }, + { + "epoch": 109.8125, + "grad_norm": 3.325342099713657, + "learning_rate": 5e-05, + "loss": 0.0022, + "num_input_tokens_seen": 121419072, + "step": 1757 + }, + { + "epoch": 109.8125, + "loss": 0.0022488064132630825, + "loss_ce": 6.679958460154012e-05, + "loss_xval": 0.0021820068359375, + "num_input_tokens_seen": 121419072, + "step": 1757 + }, + { + "epoch": 109.875, + "grad_norm": 3.001496617326765, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 121490688, + "step": 1758 + }, + { + "epoch": 109.875, + "loss": 0.001875939778983593, + "loss_ce": 6.014389146002941e-05, + "loss_xval": 0.0018157958984375, + "num_input_tokens_seen": 121490688, + "step": 1758 + }, + { + "epoch": 109.9375, + "grad_norm": 1.6426331612052185, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 121562304, + "step": 1759 + }, + { + "epoch": 109.9375, + "loss": 0.0006351894699037075, + "loss_ce": 7.061428914312273e-05, + "loss_xval": 0.0005645751953125, + "num_input_tokens_seen": 121562304, + "step": 1759 + }, + { + "epoch": 110.0, + "grad_norm": 0.14074563650367997, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 121633856, + "step": 1760 + }, + { + "epoch": 110.0, + "loss": 0.00021442785509862006, + "loss_ce": 6.851567741250619e-05, + "loss_xval": 0.00014591217041015625, + 
"num_input_tokens_seen": 121633856, + "step": 1760 + }, + { + "epoch": 110.0625, + "grad_norm": 1.5574652891852028, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 121705536, + "step": 1761 + }, + { + "epoch": 110.0625, + "loss": 0.0006171943969093263, + "loss_ce": 6.406330066965893e-05, + "loss_xval": 0.000553131103515625, + "num_input_tokens_seen": 121705536, + "step": 1761 + }, + { + "epoch": 110.125, + "grad_norm": 2.288040547426898, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 121777280, + "step": 1762 + }, + { + "epoch": 110.125, + "loss": 0.0011686455691233277, + "loss_ce": 7.00127420714125e-05, + "loss_xval": 0.0010986328125, + "num_input_tokens_seen": 121777280, + "step": 1762 + }, + { + "epoch": 110.1875, + "grad_norm": 2.129444147921264, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 121836480, + "step": 1763 + }, + { + "epoch": 110.1875, + "loss": 0.0010745597537606955, + "loss_ce": 5.985027382848784e-05, + "loss_xval": 0.00101470947265625, + "num_input_tokens_seen": 121836480, + "step": 1763 + }, + { + "epoch": 110.25, + "grad_norm": 1.3842064988364935, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 121908032, + "step": 1764 + }, + { + "epoch": 110.25, + "loss": 0.000571783515624702, + "loss_ce": 6.824348383815959e-05, + "loss_xval": 0.0005035400390625, + "num_input_tokens_seen": 121908032, + "step": 1764 + }, + { + "epoch": 110.3125, + "grad_norm": 0.35625028169896966, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 121967168, + "step": 1765 + }, + { + "epoch": 110.3125, + "loss": 0.00019284302834421396, + "loss_ce": 6.505066266981885e-05, + "loss_xval": 0.0001277923583984375, + "num_input_tokens_seen": 121967168, + "step": 1765 + }, + { + "epoch": 110.375, + "grad_norm": 0.598471501459658, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 122038784, + "step": 1766 + }, + { + "epoch": 110.375, + "loss": 0.00024847392342053354, + "loss_ce": 7.490520511055365e-05, + "loss_xval": 0.0001735687255859375, + "num_input_tokens_seen": 122038784, + "step": 1766 + }, + { + "epoch": 110.4375, + "grad_norm": 1.177837228473711, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 122110528, + "step": 1767 + }, + { + "epoch": 110.4375, + "loss": 0.00036571957753039896, + "loss_ce": 6.435850082198158e-05, + "loss_xval": 0.000301361083984375, + "num_input_tokens_seen": 122110528, + "step": 1767 + }, + { + "epoch": 110.5, + "grad_norm": 1.4204740711254538, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 122182208, + "step": 1768 + }, + { + "epoch": 110.5, + "loss": 0.0005603337194770575, + "loss_ce": 6.823776493547484e-05, + "loss_xval": 0.000492095947265625, + "num_input_tokens_seen": 122182208, + "step": 1768 + }, + { + "epoch": 110.5625, + "grad_norm": 1.6478446466696302, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 122253952, + "step": 1769 + }, + { + "epoch": 110.5625, + "loss": 0.0006315922364592552, + "loss_ce": 7.08317311364226e-05, + "loss_xval": 0.000560760498046875, + "num_input_tokens_seen": 122253952, + "step": 1769 + }, + { + "epoch": 110.625, + "grad_norm": 1.9859732568458073, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 122325504, + "step": 1770 + }, + { + "epoch": 110.625, + "loss": 0.0007901331409811974, + "loss_ce": 6.534063868457451e-05, + "loss_xval": 0.00072479248046875, + "num_input_tokens_seen": 122325504, + "step": 1770 + }, + { + "epoch": 
110.6875, + "grad_norm": 2.3332948078117006, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 122384704, + "step": 1771 + }, + { + "epoch": 110.6875, + "loss": 0.001085848081856966, + "loss_ce": 7.113863102858886e-05, + "loss_xval": 0.00101470947265625, + "num_input_tokens_seen": 122384704, + "step": 1771 + }, + { + "epoch": 110.75, + "grad_norm": 2.6076680683930005, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 122456448, + "step": 1772 + }, + { + "epoch": 110.75, + "loss": 0.0012727116700261831, + "loss_ce": 6.72673195367679e-05, + "loss_xval": 0.0012054443359375, + "num_input_tokens_seen": 122456448, + "step": 1772 + }, + { + "epoch": 110.8125, + "grad_norm": 2.9984594669057536, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 122528000, + "step": 1773 + }, + { + "epoch": 110.8125, + "loss": 0.0018300075316801667, + "loss_ce": 5.998796768835746e-05, + "loss_xval": 0.00177001953125, + "num_input_tokens_seen": 122528000, + "step": 1773 + }, + { + "epoch": 110.875, + "grad_norm": 3.598605406045106, + "learning_rate": 5e-05, + "loss": 0.0025, + "num_input_tokens_seen": 122599616, + "step": 1774 + }, + { + "epoch": 110.875, + "loss": 0.0025792140513658524, + "loss_ce": 6.151380512164906e-05, + "loss_xval": 0.0025177001953125, + "num_input_tokens_seen": 122599616, + "step": 1774 + }, + { + "epoch": 110.9375, + "grad_norm": 4.1017815818888055, + "learning_rate": 5e-05, + "loss": 0.0033, + "num_input_tokens_seen": 122671232, + "step": 1775 + }, + { + "epoch": 110.9375, + "loss": 0.003606092417612672, + "loss_ce": 6.605328235309571e-05, + "loss_xval": 0.0035400390625, + "num_input_tokens_seen": 122671232, + "step": 1775 + }, + { + "epoch": 111.0, + "grad_norm": 4.37458727859117, + "learning_rate": 5e-05, + "loss": 0.0037, + "num_input_tokens_seen": 122742848, + "step": 1776 + }, + { + "epoch": 111.0, + "loss": 0.0038295018021017313, + "loss_ce": 7.583970727864653e-05, + "loss_xval": 0.003753662109375, + "num_input_tokens_seen": 122742848, + "step": 1776 + }, + { + "epoch": 111.0625, + "grad_norm": 4.258724618254019, + "learning_rate": 5e-05, + "loss": 0.0036, + "num_input_tokens_seen": 122814592, + "step": 1777 + }, + { + "epoch": 111.0625, + "loss": 0.0033998247236013412, + "loss_ce": 7.3408788011875e-05, + "loss_xval": 0.003326416015625, + "num_input_tokens_seen": 122814592, + "step": 1777 + }, + { + "epoch": 111.125, + "grad_norm": 3.6412286226566386, + "learning_rate": 5e-05, + "loss": 0.0028, + "num_input_tokens_seen": 122886208, + "step": 1778 + }, + { + "epoch": 111.125, + "loss": 0.0030623050406575203, + "loss_ce": 7.158235530368984e-05, + "loss_xval": 0.00299072265625, + "num_input_tokens_seen": 122886208, + "step": 1778 + }, + { + "epoch": 111.1875, + "grad_norm": 2.301903912547345, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 122957760, + "step": 1779 + }, + { + "epoch": 111.1875, + "loss": 0.001317744143307209, + "loss_ce": 7.415279105771333e-05, + "loss_xval": 0.00124359130859375, + "num_input_tokens_seen": 122957760, + "step": 1779 + }, + { + "epoch": 111.25, + "grad_norm": 0.1169441178714884, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 123029504, + "step": 1780 + }, + { + "epoch": 111.25, + "loss": 0.0003047786885872483, + "loss_ce": 7.875786104705185e-05, + "loss_xval": 0.00022602081298828125, + "num_input_tokens_seen": 123029504, + "step": 1780 + }, + { + "epoch": 111.3125, + "grad_norm": 2.676408616858053, + "learning_rate": 5e-05, + "loss": 0.0017, + 
"num_input_tokens_seen": 123101184, + "step": 1781 + }, + { + "epoch": 111.3125, + "loss": 0.001780191552825272, + "loss_ce": 7.883652142481878e-05, + "loss_xval": 0.00170135498046875, + "num_input_tokens_seen": 123101184, + "step": 1781 + }, + { + "epoch": 111.375, + "grad_norm": 5.026821121010569, + "learning_rate": 5e-05, + "loss": 0.0051, + "num_input_tokens_seen": 123172992, + "step": 1782 + }, + { + "epoch": 111.375, + "loss": 0.005271148402243853, + "loss_ce": 8.315989543916658e-05, + "loss_xval": 0.00518798828125, + "num_input_tokens_seen": 123172992, + "step": 1782 + }, + { + "epoch": 111.4375, + "grad_norm": 7.014898222716356, + "learning_rate": 5e-05, + "loss": 0.0097, + "num_input_tokens_seen": 123244608, + "step": 1783 + }, + { + "epoch": 111.4375, + "loss": 0.009538796730339527, + "loss_ce": 7.834726420696825e-05, + "loss_xval": 0.00946044921875, + "num_input_tokens_seen": 123244608, + "step": 1783 + }, + { + "epoch": 111.5, + "grad_norm": 8.57632780067167, + "learning_rate": 5e-05, + "loss": 0.0143, + "num_input_tokens_seen": 123316224, + "step": 1784 + }, + { + "epoch": 111.5, + "loss": 0.014065294526517391, + "loss_ce": 8.824347605695948e-05, + "loss_xval": 0.01397705078125, + "num_input_tokens_seen": 123316224, + "step": 1784 + }, + { + "epoch": 111.5625, + "grad_norm": 8.879834626558313, + "learning_rate": 5e-05, + "loss": 0.0154, + "num_input_tokens_seen": 123375360, + "step": 1785 + }, + { + "epoch": 111.5625, + "loss": 0.015540524385869503, + "loss_ce": 9.86294326139614e-05, + "loss_xval": 0.01544189453125, + "num_input_tokens_seen": 123375360, + "step": 1785 + }, + { + "epoch": 111.625, + "grad_norm": 7.156892270448019, + "learning_rate": 5e-05, + "loss": 0.0104, + "num_input_tokens_seen": 123446976, + "step": 1786 + }, + { + "epoch": 111.625, + "loss": 0.010456508956849575, + "loss_ce": 0.00014156715769786388, + "loss_xval": 0.01031494140625, + "num_input_tokens_seen": 123446976, + "step": 1786 + }, + { + "epoch": 111.6875, + "grad_norm": 3.4572267030304777, + "learning_rate": 5e-05, + "loss": 0.0026, + "num_input_tokens_seen": 123518528, + "step": 1787 + }, + { + "epoch": 111.6875, + "loss": 0.0025650986935943365, + "loss_ce": 0.00012369242904242128, + "loss_xval": 0.00244140625, + "num_input_tokens_seen": 123518528, + "step": 1787 + }, + { + "epoch": 111.75, + "grad_norm": 0.8988618961510938, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 123590208, + "step": 1788 + }, + { + "epoch": 111.75, + "loss": 0.00045180803863331676, + "loss_ce": 0.00015044694009702653, + "loss_xval": 0.000301361083984375, + "num_input_tokens_seen": 123590208, + "step": 1788 + }, + { + "epoch": 111.8125, + "grad_norm": 4.2547116814228225, + "learning_rate": 5e-05, + "loss": 0.0039, + "num_input_tokens_seen": 123649344, + "step": 1789 + }, + { + "epoch": 111.8125, + "loss": 0.0041702440939843655, + "loss_ce": 0.00017244124319404364, + "loss_xval": 0.003997802734375, + "num_input_tokens_seen": 123649344, + "step": 1789 + }, + { + "epoch": 111.875, + "grad_norm": 5.244946357466308, + "learning_rate": 5e-05, + "loss": 0.0059, + "num_input_tokens_seen": 123721024, + "step": 1790 + }, + { + "epoch": 111.875, + "loss": 0.006065647583454847, + "loss_ce": 0.00020627248159144074, + "loss_xval": 0.005859375, + "num_input_tokens_seen": 123721024, + "step": 1790 + }, + { + "epoch": 111.9375, + "grad_norm": 3.45428635969614, + "learning_rate": 5e-05, + "loss": 0.0028, + "num_input_tokens_seen": 123792576, + "step": 1791 + }, + { + "epoch": 111.9375, + "loss": 
0.0029510704334825277, + "loss_ce": 0.00023500603856518865, + "loss_xval": 0.002716064453125, + "num_input_tokens_seen": 123792576, + "step": 1791 + }, + { + "epoch": 112.0, + "grad_norm": 0.4139652085726806, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 123851712, + "step": 1792 + }, + { + "epoch": 112.0, + "loss": 0.0004514800093602389, + "loss_ce": 0.00020829305867664516, + "loss_xval": 0.00024318695068359375, + "num_input_tokens_seen": 123851712, + "step": 1792 + }, + { + "epoch": 112.0625, + "grad_norm": 4.381395109553456, + "learning_rate": 5e-05, + "loss": 0.0043, + "num_input_tokens_seen": 123923328, + "step": 1793 + }, + { + "epoch": 112.0625, + "loss": 0.004271045792847872, + "loss_ce": 0.00024272565497085452, + "loss_xval": 0.0040283203125, + "num_input_tokens_seen": 123923328, + "step": 1793 + }, + { + "epoch": 112.125, + "grad_norm": 5.976886761872509, + "learning_rate": 5e-05, + "loss": 0.0079, + "num_input_tokens_seen": 123995136, + "step": 1794 + }, + { + "epoch": 112.125, + "loss": 0.007744569797068834, + "loss_ce": 0.00026776306913234293, + "loss_xval": 0.007476806640625, + "num_input_tokens_seen": 123995136, + "step": 1794 + }, + { + "epoch": 112.1875, + "grad_norm": 4.295502004175747, + "learning_rate": 5e-05, + "loss": 0.0042, + "num_input_tokens_seen": 124066688, + "step": 1795 + }, + { + "epoch": 112.1875, + "loss": 0.0045149922370910645, + "loss_ce": 0.0002730487904045731, + "loss_xval": 0.004241943359375, + "num_input_tokens_seen": 124066688, + "step": 1795 + }, + { + "epoch": 112.25, + "grad_norm": 0.5322849004613961, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 124138368, + "step": 1796 + }, + { + "epoch": 112.25, + "loss": 0.0004541517118923366, + "loss_ce": 0.00029107340378686786, + "loss_xval": 0.00016307830810546875, + "num_input_tokens_seen": 124138368, + "step": 1796 + }, + { + "epoch": 112.3125, + "grad_norm": 3.1494265214095427, + "learning_rate": 5e-05, + "loss": 0.0025, + "num_input_tokens_seen": 124210176, + "step": 1797 + }, + { + "epoch": 112.3125, + "loss": 0.002794926520437002, + "loss_ce": 0.00030774399056099355, + "loss_xval": 0.0024871826171875, + "num_input_tokens_seen": 124210176, + "step": 1797 + }, + { + "epoch": 112.375, + "grad_norm": 5.114076833623887, + "learning_rate": 5e-05, + "loss": 0.006, + "num_input_tokens_seen": 124281856, + "step": 1798 + }, + { + "epoch": 112.375, + "loss": 0.006075653247535229, + "loss_ce": 0.000338348385412246, + "loss_xval": 0.0057373046875, + "num_input_tokens_seen": 124281856, + "step": 1798 + }, + { + "epoch": 112.4375, + "grad_norm": 4.469994657184072, + "learning_rate": 5e-05, + "loss": 0.0048, + "num_input_tokens_seen": 124353408, + "step": 1799 + }, + { + "epoch": 112.4375, + "loss": 0.004576649982482195, + "loss_ce": 0.0003957415756303817, + "loss_xval": 0.004180908203125, + "num_input_tokens_seen": 124353408, + "step": 1799 + }, + { + "epoch": 112.5, + "grad_norm": 1.534273645092906, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 124425088, + "step": 1800 + }, + { + "epoch": 112.5, + "loss": 0.0012304669944569468, + "loss_ce": 0.0003836042305920273, + "loss_xval": 0.00084686279296875, + "num_input_tokens_seen": 124425088, + "step": 1800 + }, + { + "epoch": 112.5625, + "grad_norm": 1.6878461682804873, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 124496768, + "step": 1801 + }, + { + "epoch": 112.5625, + "loss": 0.0012798135867342353, + "loss_ce": 0.00033758333302102983, + "loss_xval": 
0.000942230224609375, + "num_input_tokens_seen": 124496768, + "step": 1801 + }, + { + "epoch": 112.625, + "grad_norm": 3.394315017740658, + "learning_rate": 5e-05, + "loss": 0.003, + "num_input_tokens_seen": 124555840, + "step": 1802 + }, + { + "epoch": 112.625, + "loss": 0.0028806827031075954, + "loss_ce": 0.00034772377694025636, + "loss_xval": 0.002532958984375, + "num_input_tokens_seen": 124555840, + "step": 1802 + }, + { + "epoch": 112.6875, + "grad_norm": 3.0832818029007254, + "learning_rate": 5e-05, + "loss": 0.0025, + "num_input_tokens_seen": 124627456, + "step": 1803 + }, + { + "epoch": 112.6875, + "loss": 0.002533203223720193, + "loss_ce": 0.0003817139659076929, + "loss_xval": 0.0021514892578125, + "num_input_tokens_seen": 124627456, + "step": 1803 + }, + { + "epoch": 112.75, + "grad_norm": 1.014619500099443, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 124699072, + "step": 1804 + }, + { + "epoch": 112.75, + "loss": 0.0006874875398352742, + "loss_ce": 0.0003308133454993367, + "loss_xval": 0.0003566741943359375, + "num_input_tokens_seen": 124699072, + "step": 1804 + }, + { + "epoch": 112.8125, + "grad_norm": 1.7040306007143686, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 124770688, + "step": 1805 + }, + { + "epoch": 112.8125, + "loss": 0.0010238613467663527, + "loss_ce": 0.00030288362177088857, + "loss_xval": 0.000720977783203125, + "num_input_tokens_seen": 124770688, + "step": 1805 + }, + { + "epoch": 112.875, + "grad_norm": 3.357459877271798, + "learning_rate": 5e-05, + "loss": 0.0028, + "num_input_tokens_seen": 124842304, + "step": 1806 + }, + { + "epoch": 112.875, + "loss": 0.002878846600651741, + "loss_ce": 0.0003306289145257324, + "loss_xval": 0.0025482177734375, + "num_input_tokens_seen": 124842304, + "step": 1806 + }, + { + "epoch": 112.9375, + "grad_norm": 3.4384850104981792, + "learning_rate": 5e-05, + "loss": 0.0029, + "num_input_tokens_seen": 124913856, + "step": 1807 + }, + { + "epoch": 112.9375, + "loss": 0.00293713784776628, + "loss_ce": 0.0003126260999124497, + "loss_xval": 0.00262451171875, + "num_input_tokens_seen": 124913856, + "step": 1807 + }, + { + "epoch": 113.0, + "grad_norm": 2.036426365495195, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_input_tokens_seen": 124985408, + "step": 1808 + }, + { + "epoch": 113.0, + "loss": 0.001254330389201641, + "loss_ce": 0.0002548796765040606, + "loss_xval": 0.00099945068359375, + "num_input_tokens_seen": 124985408, + "step": 1808 + }, + { + "epoch": 113.0625, + "grad_norm": 0.357243663340432, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 125044544, + "step": 1809 + }, + { + "epoch": 113.0625, + "loss": 0.0005287157255224884, + "loss_ce": 0.0002931581693701446, + "loss_xval": 0.00023555755615234375, + "num_input_tokens_seen": 125044544, + "step": 1809 + }, + { + "epoch": 113.125, + "grad_norm": 2.53961593001829, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 125103616, + "step": 1810 + }, + { + "epoch": 113.125, + "loss": 0.0017481737304478884, + "loss_ce": 0.00026044173864647746, + "loss_xval": 0.00148773193359375, + "num_input_tokens_seen": 125103616, + "step": 1810 + }, + { + "epoch": 113.1875, + "grad_norm": 3.941602045394896, + "learning_rate": 5e-05, + "loss": 0.0036, + "num_input_tokens_seen": 125175296, + "step": 1811 + }, + { + "epoch": 113.1875, + "loss": 0.0037777761463075876, + "loss_ce": 0.0002377371274633333, + "loss_xval": 0.0035400390625, + "num_input_tokens_seen": 125175296, + "step": 1811 + }, + { + 
"epoch": 113.25, + "grad_norm": 4.1087198992751075, + "learning_rate": 5e-05, + "loss": 0.0039, + "num_input_tokens_seen": 125246976, + "step": 1812 + }, + { + "epoch": 113.25, + "loss": 0.004106709733605385, + "loss_ce": 0.00023097720986697823, + "loss_xval": 0.003875732421875, + "num_input_tokens_seen": 125246976, + "step": 1812 + }, + { + "epoch": 113.3125, + "grad_norm": 3.2760181391357883, + "learning_rate": 5e-05, + "loss": 0.0026, + "num_input_tokens_seen": 125318656, + "step": 1813 + }, + { + "epoch": 113.3125, + "loss": 0.002692433074116707, + "loss_ce": 0.00022050918778404593, + "loss_xval": 0.002471923828125, + "num_input_tokens_seen": 125318656, + "step": 1813 + }, + { + "epoch": 113.375, + "grad_norm": 1.9032552197242543, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 125390336, + "step": 1814 + }, + { + "epoch": 113.375, + "loss": 0.001287812483496964, + "loss_ce": 0.00021969727822579443, + "loss_xval": 0.001068115234375, + "num_input_tokens_seen": 125390336, + "step": 1814 + }, + { + "epoch": 113.4375, + "grad_norm": 0.43698459078532, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 125462016, + "step": 1815 + }, + { + "epoch": 113.4375, + "loss": 0.00039364141412079334, + "loss_ce": 0.00020672124810516834, + "loss_xval": 0.000186920166015625, + "num_input_tokens_seen": 125462016, + "step": 1815 + }, + { + "epoch": 113.5, + "grad_norm": 0.8694389485657945, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 125533632, + "step": 1816 + }, + { + "epoch": 113.5, + "loss": 0.0004984450060874224, + "loss_ce": 0.00017801046487875283, + "loss_xval": 0.0003204345703125, + "num_input_tokens_seen": 125533632, + "step": 1816 + }, + { + "epoch": 113.5625, + "grad_norm": 1.8447178924289218, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 125605184, + "step": 1817 + }, + { + "epoch": 113.5625, + "loss": 0.0012481531593948603, + "loss_ce": 0.0001800378959160298, + "loss_xval": 0.001068115234375, + "num_input_tokens_seen": 125605184, + "step": 1817 + }, + { + "epoch": 113.625, + "grad_norm": 2.4990876372368533, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 125676800, + "step": 1818 + }, + { + "epoch": 113.625, + "loss": 0.0014198764692991972, + "loss_ce": 0.00018391459889244288, + "loss_xval": 0.0012359619140625, + "num_input_tokens_seen": 125676800, + "step": 1818 + }, + { + "epoch": 113.6875, + "grad_norm": 3.34970916140925, + "learning_rate": 5e-05, + "loss": 0.0027, + "num_input_tokens_seen": 125748416, + "step": 1819 + }, + { + "epoch": 113.6875, + "loss": 0.0026685905177146196, + "loss_ce": 0.00015089042426552624, + "loss_xval": 0.0025177001953125, + "num_input_tokens_seen": 125748416, + "step": 1819 + }, + { + "epoch": 113.75, + "grad_norm": 4.698109000689552, + "learning_rate": 5e-05, + "loss": 0.0048, + "num_input_tokens_seen": 125807424, + "step": 1820 + }, + { + "epoch": 113.75, + "loss": 0.00491245137527585, + "loss_ce": 0.000151709042256698, + "loss_xval": 0.0047607421875, + "num_input_tokens_seen": 125807424, + "step": 1820 + }, + { + "epoch": 113.8125, + "grad_norm": 6.367293742130005, + "learning_rate": 5e-05, + "loss": 0.0086, + "num_input_tokens_seen": 125879104, + "step": 1821 + }, + { + "epoch": 113.8125, + "loss": 0.008711196482181549, + "loss_ce": 0.0001662747235968709, + "loss_xval": 0.008544921875, + "num_input_tokens_seen": 125879104, + "step": 1821 + }, + { + "epoch": 113.875, + "grad_norm": 7.886708899895091, + "learning_rate": 5e-05, + "loss": 0.0131, 
+ "num_input_tokens_seen": 125950784, + "step": 1822 + }, + { + "epoch": 113.875, + "loss": 0.01309604849666357, + "loss_ce": 0.00015659494965802878, + "loss_xval": 0.012939453125, + "num_input_tokens_seen": 125950784, + "step": 1822 + }, + { + "epoch": 113.9375, + "grad_norm": 8.846094764017112, + "learning_rate": 5e-05, + "loss": 0.0164, + "num_input_tokens_seen": 126022464, + "step": 1823 + }, + { + "epoch": 113.9375, + "loss": 0.016144748777151108, + "loss_ce": 0.00015353858179878443, + "loss_xval": 0.0159912109375, + "num_input_tokens_seen": 126022464, + "step": 1823 + }, + { + "epoch": 114.0, + "grad_norm": 8.424385504717284, + "learning_rate": 5e-05, + "loss": 0.015, + "num_input_tokens_seen": 126094016, + "step": 1824 + }, + { + "epoch": 114.0, + "loss": 0.015658153221011162, + "loss_ce": 0.00015522287867497653, + "loss_xval": 0.0155029296875, + "num_input_tokens_seen": 126094016, + "step": 1824 + }, + { + "epoch": 114.0625, + "grad_norm": 6.13634496945625, + "learning_rate": 5e-05, + "loss": 0.0082, + "num_input_tokens_seen": 126165696, + "step": 1825 + }, + { + "epoch": 114.0625, + "loss": 0.008386150002479553, + "loss_ce": 0.00020743937056977302, + "loss_xval": 0.0081787109375, + "num_input_tokens_seen": 126165696, + "step": 1825 + }, + { + "epoch": 114.125, + "grad_norm": 2.491914014905266, + "learning_rate": 5e-05, + "loss": 0.0016, + "num_input_tokens_seen": 126237248, + "step": 1826 + }, + { + "epoch": 114.125, + "loss": 0.0016902722418308258, + "loss_ce": 0.00020254029368516058, + "loss_xval": 0.00148773193359375, + "num_input_tokens_seen": 126237248, + "step": 1826 + }, + { + "epoch": 114.1875, + "grad_norm": 1.2848642236060528, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 126308928, + "step": 1827 + }, + { + "epoch": 114.1875, + "loss": 0.0008566059404984117, + "loss_ce": 0.0002424397098366171, + "loss_xval": 0.000614166259765625, + "num_input_tokens_seen": 126308928, + "step": 1827 + }, + { + "epoch": 114.25, + "grad_norm": 3.851711734139488, + "learning_rate": 5e-05, + "loss": 0.0036, + "num_input_tokens_seen": 126380480, + "step": 1828 + }, + { + "epoch": 114.25, + "loss": 0.003448725678026676, + "loss_ce": 0.0002901563420891762, + "loss_xval": 0.0031585693359375, + "num_input_tokens_seen": 126380480, + "step": 1828 + }, + { + "epoch": 114.3125, + "grad_norm": 4.443643290686806, + "learning_rate": 5e-05, + "loss": 0.0047, + "num_input_tokens_seen": 126452160, + "step": 1829 + }, + { + "epoch": 114.3125, + "loss": 0.004588674288243055, + "loss_ce": 0.00025517813628539443, + "loss_xval": 0.00433349609375, + "num_input_tokens_seen": 126452160, + "step": 1829 + }, + { + "epoch": 114.375, + "grad_norm": 3.386106327231258, + "learning_rate": 5e-05, + "loss": 0.0029, + "num_input_tokens_seen": 126523840, + "step": 1830 + }, + { + "epoch": 114.375, + "loss": 0.0029100049287080765, + "loss_ce": 0.00031601072987541556, + "loss_xval": 0.002593994140625, + "num_input_tokens_seen": 126523840, + "step": 1830 + }, + { + "epoch": 114.4375, + "grad_norm": 1.9251156894869557, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 126595392, + "step": 1831 + }, + { + "epoch": 114.4375, + "loss": 0.001533187460154295, + "loss_ce": 0.0003582606732379645, + "loss_xval": 0.0011749267578125, + "num_input_tokens_seen": 126595392, + "step": 1831 + }, + { + "epoch": 114.5, + "grad_norm": 0.5046940819061044, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 126667008, + "step": 1832 + }, + { + "epoch": 114.5, + "loss": 
0.0006944824708625674, + "loss_ce": 0.0003339935792610049, + "loss_xval": 0.0003604888916015625, + "num_input_tokens_seen": 126667008, + "step": 1832 + }, + { + "epoch": 114.5625, + "grad_norm": 0.9328194283619768, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 126738624, + "step": 1833 + }, + { + "epoch": 114.5625, + "loss": 0.0007143397815525532, + "loss_ce": 0.00035003622178919613, + "loss_xval": 0.0003643035888671875, + "num_input_tokens_seen": 126738624, + "step": 1833 + }, + { + "epoch": 114.625, + "grad_norm": 1.5869105539345048, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 126810304, + "step": 1834 + }, + { + "epoch": 114.625, + "loss": 0.0008443781407549977, + "loss_ce": 0.0003255793417338282, + "loss_xval": 0.000518798828125, + "num_input_tokens_seen": 126810304, + "step": 1834 + }, + { + "epoch": 114.6875, + "grad_norm": 1.1512002350734496, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 126881856, + "step": 1835 + }, + { + "epoch": 114.6875, + "loss": 0.0007516160840168595, + "loss_ce": 0.0003300920652691275, + "loss_xval": 0.0004215240478515625, + "num_input_tokens_seen": 126881856, + "step": 1835 + }, + { + "epoch": 114.75, + "grad_norm": 0.15802612896501098, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 126953536, + "step": 1836 + }, + { + "epoch": 114.75, + "loss": 0.0006052050739526749, + "loss_ce": 0.0003229174471925944, + "loss_xval": 0.00028228759765625, + "num_input_tokens_seen": 126953536, + "step": 1836 + }, + { + "epoch": 114.8125, + "grad_norm": 1.384495541854308, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 127025344, + "step": 1837 + }, + { + "epoch": 114.8125, + "loss": 0.0007899554912000895, + "loss_ce": 0.0003283771511632949, + "loss_xval": 0.000461578369140625, + "num_input_tokens_seen": 127025344, + "step": 1837 + }, + { + "epoch": 114.875, + "grad_norm": 2.5352190525658584, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 127097152, + "step": 1838 + }, + { + "epoch": 114.875, + "loss": 0.0019897250458598137, + "loss_ce": 0.0002883701235987246, + "loss_xval": 0.00170135498046875, + "num_input_tokens_seen": 127097152, + "step": 1838 + }, + { + "epoch": 114.9375, + "grad_norm": 3.571308713408931, + "learning_rate": 5e-05, + "loss": 0.0032, + "num_input_tokens_seen": 127168768, + "step": 1839 + }, + { + "epoch": 114.9375, + "loss": 0.003045588731765747, + "loss_ce": 0.0002532302460167557, + "loss_xval": 0.0027923583984375, + "num_input_tokens_seen": 127168768, + "step": 1839 + }, + { + "epoch": 115.0, + "grad_norm": 4.6709850727627344, + "learning_rate": 5e-05, + "loss": 0.0051, + "num_input_tokens_seen": 127227840, + "step": 1840 + }, + { + "epoch": 115.0, + "loss": 0.004910253453999758, + "loss_ce": 0.0002715814916882664, + "loss_xval": 0.004638671875, + "num_input_tokens_seen": 127227840, + "step": 1840 + }, + { + "epoch": 115.0625, + "grad_norm": 5.525343372867988, + "learning_rate": 5e-05, + "loss": 0.0069, + "num_input_tokens_seen": 127299520, + "step": 1841 + }, + { + "epoch": 115.0625, + "loss": 0.0067585185170173645, + "loss_ce": 0.00022775662364438176, + "loss_xval": 0.00653076171875, + "num_input_tokens_seen": 127299520, + "step": 1841 + }, + { + "epoch": 115.125, + "grad_norm": 5.74424661005289, + "learning_rate": 5e-05, + "loss": 0.0074, + "num_input_tokens_seen": 127371136, + "step": 1842 + }, + { + "epoch": 115.125, + "loss": 0.00769760413095355, + "loss_ce": 0.00022079766495153308, + "loss_xval": 
0.007476806640625, + "num_input_tokens_seen": 127371136, + "step": 1842 + }, + { + "epoch": 115.1875, + "grad_norm": 5.468632833948617, + "learning_rate": 5e-05, + "loss": 0.0068, + "num_input_tokens_seen": 127442880, + "step": 1843 + }, + { + "epoch": 115.1875, + "loss": 0.006827354431152344, + "loss_ce": 0.0002050399052677676, + "loss_xval": 0.006622314453125, + "num_input_tokens_seen": 127442880, + "step": 1843 + }, + { + "epoch": 115.25, + "grad_norm": 4.465560051973801, + "learning_rate": 5e-05, + "loss": 0.0047, + "num_input_tokens_seen": 127514560, + "step": 1844 + }, + { + "epoch": 115.25, + "loss": 0.004257781431078911, + "loss_ce": 0.0002294613077538088, + "loss_xval": 0.0040283203125, + "num_input_tokens_seen": 127514560, + "step": 1844 + }, + { + "epoch": 115.3125, + "grad_norm": 2.294471964577755, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 127586240, + "step": 1845 + }, + { + "epoch": 115.3125, + "loss": 0.0017640250734984875, + "loss_ce": 0.00023051683092489839, + "loss_xval": 0.00153350830078125, + "num_input_tokens_seen": 127586240, + "step": 1845 + }, + { + "epoch": 115.375, + "grad_norm": 0.5333315440861227, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 127657856, + "step": 1846 + }, + { + "epoch": 115.375, + "loss": 0.0005950859049335122, + "loss_ce": 0.00021552354155573994, + "loss_xval": 0.0003795623779296875, + "num_input_tokens_seen": 127657856, + "step": 1846 + }, + { + "epoch": 115.4375, + "grad_norm": 2.9353209276907544, + "learning_rate": 5e-05, + "loss": 0.0022, + "num_input_tokens_seen": 127729472, + "step": 1847 + }, + { + "epoch": 115.4375, + "loss": 0.002198727335780859, + "loss_ce": 0.00024560230667702854, + "loss_xval": 0.001953125, + "num_input_tokens_seen": 127729472, + "step": 1847 + }, + { + "epoch": 115.5, + "grad_norm": 4.950894094113668, + "learning_rate": 5e-05, + "loss": 0.0057, + "num_input_tokens_seen": 127801152, + "step": 1848 + }, + { + "epoch": 115.5, + "loss": 0.005807612556964159, + "loss_ce": 0.00022289558546617627, + "loss_xval": 0.005584716796875, + "num_input_tokens_seen": 127801152, + "step": 1848 + }, + { + "epoch": 115.5625, + "grad_norm": 6.535244946174206, + "learning_rate": 5e-05, + "loss": 0.0094, + "num_input_tokens_seen": 127872768, + "step": 1849 + }, + { + "epoch": 115.5625, + "loss": 0.009224435314536095, + "loss_ce": 0.00019123233505524695, + "loss_xval": 0.009033203125, + "num_input_tokens_seen": 127872768, + "step": 1849 + }, + { + "epoch": 115.625, + "grad_norm": 7.165273740342595, + "learning_rate": 5e-05, + "loss": 0.0113, + "num_input_tokens_seen": 127931968, + "step": 1850 + }, + { + "epoch": 115.625, + "loss": 0.01137017086148262, + "loss_ce": 0.00020073754421900958, + "loss_xval": 0.01116943359375, + "num_input_tokens_seen": 127931968, + "step": 1850 + }, + { + "epoch": 115.6875, + "grad_norm": 6.6107148447934145, + "learning_rate": 5e-05, + "loss": 0.0098, + "num_input_tokens_seen": 128003584, + "step": 1851 + }, + { + "epoch": 115.6875, + "loss": 0.01010982133448124, + "loss_ce": 0.00022212599287740886, + "loss_xval": 0.0098876953125, + "num_input_tokens_seen": 128003584, + "step": 1851 + }, + { + "epoch": 115.75, + "grad_norm": 5.118953223208179, + "learning_rate": 5e-05, + "loss": 0.0059, + "num_input_tokens_seen": 128075200, + "step": 1852 + }, + { + "epoch": 115.75, + "loss": 0.0058305393904447556, + "loss_ce": 0.0002153048844775185, + "loss_xval": 0.005615234375, + "num_input_tokens_seen": 128075200, + "step": 1852 + }, + { + "epoch": 115.8125, + 
"grad_norm": 2.865843839872992, + "learning_rate": 5e-05, + "loss": 0.0021, + "num_input_tokens_seen": 128146944, + "step": 1853 + }, + { + "epoch": 115.8125, + "loss": 0.0022164033725857735, + "loss_ce": 0.00020224327454343438, + "loss_xval": 0.00201416015625, + "num_input_tokens_seen": 128146944, + "step": 1853 + }, + { + "epoch": 115.875, + "grad_norm": 0.1575066950867638, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 128218624, + "step": 1854 + }, + { + "epoch": 115.875, + "loss": 0.0004789860104210675, + "loss_ce": 0.00022149394499137998, + "loss_xval": 0.0002574920654296875, + "num_input_tokens_seen": 128218624, + "step": 1854 + }, + { + "epoch": 115.9375, + "grad_norm": 2.6798771718241228, + "learning_rate": 5e-05, + "loss": 0.002, + "num_input_tokens_seen": 128290176, + "step": 1855 + }, + { + "epoch": 115.9375, + "loss": 0.002353307791054249, + "loss_ce": 0.0002475949004292488, + "loss_xval": 0.002105712890625, + "num_input_tokens_seen": 128290176, + "step": 1855 + }, + { + "epoch": 116.0, + "grad_norm": 4.719576085613883, + "learning_rate": 5e-05, + "loss": 0.0054, + "num_input_tokens_seen": 128361728, + "step": 1856 + }, + { + "epoch": 116.0, + "loss": 0.005584153346717358, + "loss_ce": 0.00027409486938267946, + "loss_xval": 0.00531005859375, + "num_input_tokens_seen": 128361728, + "step": 1856 + }, + { + "epoch": 116.0625, + "grad_norm": 5.490063572209329, + "learning_rate": 5e-05, + "loss": 0.0072, + "num_input_tokens_seen": 128433280, + "step": 1857 + }, + { + "epoch": 116.0625, + "loss": 0.0072921644896268845, + "loss_ce": 0.0002731215790845454, + "loss_xval": 0.00701904296875, + "num_input_tokens_seen": 128433280, + "step": 1857 + }, + { + "epoch": 116.125, + "grad_norm": 4.931355390541039, + "learning_rate": 5e-05, + "loss": 0.0061, + "num_input_tokens_seen": 128504832, + "step": 1858 + }, + { + "epoch": 116.125, + "loss": 0.005588597152382135, + "loss_ce": 0.00024802106781862676, + "loss_xval": 0.005340576171875, + "num_input_tokens_seen": 128504832, + "step": 1858 + }, + { + "epoch": 116.1875, + "grad_norm": 3.4001732010288155, + "learning_rate": 5e-05, + "loss": 0.0031, + "num_input_tokens_seen": 128576384, + "step": 1859 + }, + { + "epoch": 116.1875, + "loss": 0.003236029762774706, + "loss_ce": 0.0003216010518372059, + "loss_xval": 0.0029144287109375, + "num_input_tokens_seen": 128576384, + "step": 1859 + }, + { + "epoch": 116.25, + "grad_norm": 1.2787257238501162, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 128648064, + "step": 1860 + }, + { + "epoch": 116.25, + "loss": 0.0010641159024089575, + "loss_ce": 0.0002554001403041184, + "loss_xval": 0.0008087158203125, + "num_input_tokens_seen": 128648064, + "step": 1860 + }, + { + "epoch": 116.3125, + "grad_norm": 1.2800305157776937, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 128719808, + "step": 1861 + }, + { + "epoch": 116.3125, + "loss": 0.0008978757541626692, + "loss_ce": 0.0002646360080689192, + "loss_xval": 0.00063323974609375, + "num_input_tokens_seen": 128719808, + "step": 1861 + }, + { + "epoch": 116.375, + "grad_norm": 3.485110576462092, + "learning_rate": 5e-05, + "loss": 0.0032, + "num_input_tokens_seen": 128791424, + "step": 1862 + }, + { + "epoch": 116.375, + "loss": 0.0031012576073408127, + "loss_ce": 0.0002478641690686345, + "loss_xval": 0.0028533935546875, + "num_input_tokens_seen": 128791424, + "step": 1862 + }, + { + "epoch": 116.4375, + "grad_norm": 4.646144569167161, + "learning_rate": 5e-05, + "loss": 0.0052, + 
"num_input_tokens_seen": 128863104, + "step": 1863 + }, + { + "epoch": 116.4375, + "loss": 0.0047195046208798885, + "loss_ce": 0.00029445585096254945, + "loss_xval": 0.004425048828125, + "num_input_tokens_seen": 128863104, + "step": 1863 + }, + { + "epoch": 116.5, + "grad_norm": 4.549980043580312, + "learning_rate": 5e-05, + "loss": 0.0051, + "num_input_tokens_seen": 128934720, + "step": 1864 + }, + { + "epoch": 116.5, + "loss": 0.0052688755095005035, + "loss_ce": 0.000263992726104334, + "loss_xval": 0.0050048828125, + "num_input_tokens_seen": 128934720, + "step": 1864 + }, + { + "epoch": 116.5625, + "grad_norm": 3.0547045447153636, + "learning_rate": 5e-05, + "loss": 0.0026, + "num_input_tokens_seen": 129006336, + "step": 1865 + }, + { + "epoch": 116.5625, + "loss": 0.0026751989498734474, + "loss_ce": 0.00024905154714360833, + "loss_xval": 0.0024261474609375, + "num_input_tokens_seen": 129006336, + "step": 1865 + }, + { + "epoch": 116.625, + "grad_norm": 0.5058449032568803, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 129077888, + "step": 1866 + }, + { + "epoch": 116.625, + "loss": 0.0005971361533738673, + "loss_ce": 0.0002442766563035548, + "loss_xval": 0.0003528594970703125, + "num_input_tokens_seen": 129077888, + "step": 1866 + }, + { + "epoch": 116.6875, + "grad_norm": 2.4142117198655995, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_input_tokens_seen": 129149568, + "step": 1867 + }, + { + "epoch": 116.6875, + "loss": 0.0016070627607405186, + "loss_ce": 0.000271918746875599, + "loss_xval": 0.00133514404296875, + "num_input_tokens_seen": 129149568, + "step": 1867 + }, + { + "epoch": 116.75, + "grad_norm": 5.0569314143632935, + "learning_rate": 5e-05, + "loss": 0.006, + "num_input_tokens_seen": 129221184, + "step": 1868 + }, + { + "epoch": 116.75, + "loss": 0.006282792426645756, + "loss_ce": 0.0002097943506669253, + "loss_xval": 0.006072998046875, + "num_input_tokens_seen": 129221184, + "step": 1868 + }, + { + "epoch": 116.8125, + "grad_norm": 7.532738506332238, + "learning_rate": 5e-05, + "loss": 0.0129, + "num_input_tokens_seen": 129292800, + "step": 1869 + }, + { + "epoch": 116.8125, + "loss": 0.013102911412715912, + "loss_ce": 0.0002244935603812337, + "loss_xval": 0.01287841796875, + "num_input_tokens_seen": 129292800, + "step": 1869 + }, + { + "epoch": 116.875, + "grad_norm": 9.355417713150146, + "learning_rate": 5e-05, + "loss": 0.0197, + "num_input_tokens_seen": 129351872, + "step": 1870 + }, + { + "epoch": 116.875, + "loss": 0.019691236317157745, + "loss_ce": 0.00028205677517689764, + "loss_xval": 0.0194091796875, + "num_input_tokens_seen": 129351872, + "step": 1870 + }, + { + "epoch": 116.9375, + "grad_norm": 9.051156641686775, + "learning_rate": 5e-05, + "loss": 0.0189, + "num_input_tokens_seen": 129423488, + "step": 1871 + }, + { + "epoch": 116.9375, + "loss": 0.01793898269534111, + "loss_ce": 0.00023878809588495642, + "loss_xval": 0.0177001953125, + "num_input_tokens_seen": 129423488, + "step": 1871 + }, + { + "epoch": 117.0, + "grad_norm": 5.676332544955607, + "learning_rate": 5e-05, + "loss": 0.0082, + "num_input_tokens_seen": 129495168, + "step": 1872 + }, + { + "epoch": 117.0, + "loss": 0.006899159401655197, + "loss_ce": 0.0003378799301572144, + "loss_xval": 0.006561279296875, + "num_input_tokens_seen": 129495168, + "step": 1872 + }, + { + "epoch": 117.0625, + "grad_norm": 0.24712015306062302, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 129566848, + "step": 1873 + }, + { + "epoch": 117.0625, + "loss": 
0.000838251318782568, + "loss_ce": 0.000342340674251318, + "loss_xval": 0.00049591064453125, + "num_input_tokens_seen": 129566848, + "step": 1873 + }, + { + "epoch": 117.125, + "grad_norm": 5.538933794151056, + "learning_rate": 5e-05, + "loss": 0.0074, + "num_input_tokens_seen": 129638464, + "step": 1874 + }, + { + "epoch": 117.125, + "loss": 0.007606185507029295, + "loss_ce": 0.0004040371277369559, + "loss_xval": 0.0072021484375, + "num_input_tokens_seen": 129638464, + "step": 1874 + }, + { + "epoch": 117.1875, + "grad_norm": 7.734305494423269, + "learning_rate": 5e-05, + "loss": 0.0141, + "num_input_tokens_seen": 129710080, + "step": 1875 + }, + { + "epoch": 117.1875, + "loss": 0.013922973535954952, + "loss_ce": 0.00037316849920898676, + "loss_xval": 0.0135498046875, + "num_input_tokens_seen": 129710080, + "step": 1875 + }, + { + "epoch": 117.25, + "grad_norm": 6.0224839866175826, + "learning_rate": 5e-05, + "loss": 0.0091, + "num_input_tokens_seen": 129781760, + "step": 1876 + }, + { + "epoch": 117.25, + "loss": 0.010079562664031982, + "loss_ce": 0.00037497258745133877, + "loss_xval": 0.00970458984375, + "num_input_tokens_seen": 129781760, + "step": 1876 + }, + { + "epoch": 117.3125, + "grad_norm": 1.8034154244820024, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 129840832, + "step": 1877 + }, + { + "epoch": 117.3125, + "loss": 0.0012554076965898275, + "loss_ce": 0.000412359629990533, + "loss_xval": 0.000843048095703125, + "num_input_tokens_seen": 129840832, + "step": 1877 + }, + { + "epoch": 117.375, + "grad_norm": 2.803885585326666, + "learning_rate": 5e-05, + "loss": 0.0024, + "num_input_tokens_seen": 129912448, + "step": 1878 + }, + { + "epoch": 117.375, + "loss": 0.0023236768320202827, + "loss_ce": 0.0003934399283025414, + "loss_xval": 0.00193023681640625, + "num_input_tokens_seen": 129912448, + "step": 1878 + }, + { + "epoch": 117.4375, + "grad_norm": 5.350260126337846, + "learning_rate": 5e-05, + "loss": 0.0073, + "num_input_tokens_seen": 129984064, + "step": 1879 + }, + { + "epoch": 117.4375, + "loss": 0.007126620039343834, + "loss_ce": 0.0004737878334708512, + "loss_xval": 0.00665283203125, + "num_input_tokens_seen": 129984064, + "step": 1879 + }, + { + "epoch": 117.5, + "grad_norm": 4.234827947508276, + "learning_rate": 5e-05, + "loss": 0.0047, + "num_input_tokens_seen": 130055616, + "step": 1880 + }, + { + "epoch": 117.5, + "loss": 0.004482659976929426, + "loss_ce": 0.00045433969353325665, + "loss_xval": 0.0040283203125, + "num_input_tokens_seen": 130055616, + "step": 1880 + }, + { + "epoch": 117.5625, + "grad_norm": 0.40401382532919683, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 130127232, + "step": 1881 + }, + { + "epoch": 117.5625, + "loss": 0.0006351525080390275, + "loss_ce": 0.00046349113108590245, + "loss_xval": 0.000171661376953125, + "num_input_tokens_seen": 130127232, + "step": 1881 + }, + { + "epoch": 117.625, + "grad_norm": 3.515571860108049, + "learning_rate": 5e-05, + "loss": 0.0035, + "num_input_tokens_seen": 130198848, + "step": 1882 + }, + { + "epoch": 117.625, + "loss": 0.0034939174074679613, + "loss_ce": 0.00041164198773913085, + "loss_xval": 0.003082275390625, + "num_input_tokens_seen": 130198848, + "step": 1882 + }, + { + "epoch": 117.6875, + "grad_norm": 5.162499769376859, + "learning_rate": 5e-05, + "loss": 0.0067, + "num_input_tokens_seen": 130270528, + "step": 1883 + }, + { + "epoch": 117.6875, + "loss": 0.006677284371107817, + "loss_ce": 0.0003906633937731385, + "loss_xval": 0.00628662109375, + 
"num_input_tokens_seen": 130270528, + "step": 1883 + }, + { + "epoch": 117.75, + "grad_norm": 3.7342067107303967, + "learning_rate": 5e-05, + "loss": 0.0038, + "num_input_tokens_seen": 130329536, + "step": 1884 + }, + { + "epoch": 117.75, + "loss": 0.0037594609893858433, + "loss_ce": 0.00032623333390802145, + "loss_xval": 0.0034332275390625, + "num_input_tokens_seen": 130329536, + "step": 1884 + }, + { + "epoch": 117.8125, + "grad_norm": 0.4550271944054085, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 130388672, + "step": 1885 + }, + { + "epoch": 117.8125, + "loss": 0.0004483010561671108, + "loss_ce": 0.00033290646388195455, + "loss_xval": 0.00011539459228515625, + "num_input_tokens_seen": 130388672, + "step": 1885 + }, + { + "epoch": 117.875, + "grad_norm": 2.5099531673200572, + "learning_rate": 5e-05, + "loss": 0.0021, + "num_input_tokens_seen": 130460480, + "step": 1886 + }, + { + "epoch": 117.875, + "loss": 0.0020212167873978615, + "loss_ce": 0.0002969737397506833, + "loss_xval": 0.0017242431640625, + "num_input_tokens_seen": 130460480, + "step": 1886 + }, + { + "epoch": 117.9375, + "grad_norm": 3.7727950222198268, + "learning_rate": 5e-05, + "loss": 0.0037, + "num_input_tokens_seen": 130532032, + "step": 1887 + }, + { + "epoch": 117.9375, + "loss": 0.0038818458560854197, + "loss_ce": 0.0002502540301065892, + "loss_xval": 0.003631591796875, + "num_input_tokens_seen": 130532032, + "step": 1887 + }, + { + "epoch": 118.0, + "grad_norm": 3.3771414897134897, + "learning_rate": 5e-05, + "loss": 0.0029, + "num_input_tokens_seen": 130591104, + "step": 1888 + }, + { + "epoch": 118.0, + "loss": 0.0025949934497475624, + "loss_ce": 0.00024513984681107104, + "loss_xval": 0.002349853515625, + "num_input_tokens_seen": 130591104, + "step": 1888 + }, + { + "epoch": 118.0625, + "grad_norm": 1.8822174090023722, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_input_tokens_seen": 130662656, + "step": 1889 + }, + { + "epoch": 118.0625, + "loss": 0.0014085653237998486, + "loss_ce": 0.00023363855143543333, + "loss_xval": 0.0011749267578125, + "num_input_tokens_seen": 130662656, + "step": 1889 + }, + { + "epoch": 118.125, + "grad_norm": 0.22173542948007965, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 130734400, + "step": 1890 + }, + { + "epoch": 118.125, + "loss": 0.00043304392602294683, + "loss_ce": 0.0001889032864710316, + "loss_xval": 0.000244140625, + "num_input_tokens_seen": 130734400, + "step": 1890 + }, + { + "epoch": 118.1875, + "grad_norm": 2.3935184610564866, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_input_tokens_seen": 130793408, + "step": 1891 + }, + { + "epoch": 118.1875, + "loss": 0.0013886751839891076, + "loss_ce": 0.0001756014535203576, + "loss_xval": 0.00121307373046875, + "num_input_tokens_seen": 130793408, + "step": 1891 + }, + { + "epoch": 118.25, + "grad_norm": 4.620740343148861, + "learning_rate": 5e-05, + "loss": 0.005, + "num_input_tokens_seen": 130865088, + "step": 1892 + }, + { + "epoch": 118.25, + "loss": 0.005361003335565329, + "loss_ce": 0.00017301514162681997, + "loss_xval": 0.00518798828125, + "num_input_tokens_seen": 130865088, + "step": 1892 + }, + { + "epoch": 118.3125, + "grad_norm": 6.107267740441691, + "learning_rate": 5e-05, + "loss": 0.0084, + "num_input_tokens_seen": 130936704, + "step": 1893 + }, + { + "epoch": 118.3125, + "loss": 0.00820300355553627, + "loss_ce": 0.00014636323612648994, + "loss_xval": 0.008056640625, + "num_input_tokens_seen": 130936704, + "step": 1893 + }, + { + "epoch": 118.375, + 
"grad_norm": 6.598447170792395, + "learning_rate": 5e-05, + "loss": 0.0098, + "num_input_tokens_seen": 131008256, + "step": 1894 + }, + { + "epoch": 118.375, + "loss": 0.00978381372988224, + "loss_ce": 0.00014025870768819004, + "loss_xval": 0.0096435546875, + "num_input_tokens_seen": 131008256, + "step": 1894 + }, + { + "epoch": 118.4375, + "grad_norm": 5.97610984602622, + "learning_rate": 5e-05, + "loss": 0.0082, + "num_input_tokens_seen": 131067264, + "step": 1895 + }, + { + "epoch": 118.4375, + "loss": 0.0079050837084651, + "loss_ce": 0.00015361930127255619, + "loss_xval": 0.00775146484375, + "num_input_tokens_seen": 131067264, + "step": 1895 + }, + { + "epoch": 118.5, + "grad_norm": 3.8296539045437776, + "learning_rate": 5e-05, + "loss": 0.0036, + "num_input_tokens_seen": 131138944, + "step": 1896 + }, + { + "epoch": 118.5, + "loss": 0.003549471264705062, + "loss_ce": 0.0002077965618809685, + "loss_xval": 0.0033416748046875, + "num_input_tokens_seen": 131138944, + "step": 1896 + }, + { + "epoch": 118.5625, + "grad_norm": 0.5031231600520943, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 131210496, + "step": 1897 + }, + { + "epoch": 118.5625, + "loss": 0.00042222841875627637, + "loss_ce": 0.00021242006914690137, + "loss_xval": 0.000209808349609375, + "num_input_tokens_seen": 131210496, + "step": 1897 + }, + { + "epoch": 118.625, + "grad_norm": 2.7708119254564774, + "learning_rate": 5e-05, + "loss": 0.0021, + "num_input_tokens_seen": 131282176, + "step": 1898 + }, + { + "epoch": 118.625, + "loss": 0.0018496726406738162, + "loss_ce": 0.00027801733813248575, + "loss_xval": 0.0015716552734375, + "num_input_tokens_seen": 131282176, + "step": 1898 + }, + { + "epoch": 118.6875, + "grad_norm": 4.503105217437891, + "learning_rate": 5e-05, + "loss": 0.005, + "num_input_tokens_seen": 131353792, + "step": 1899 + }, + { + "epoch": 118.6875, + "loss": 0.005182057619094849, + "loss_ce": 0.00023820977366995066, + "loss_xval": 0.00494384765625, + "num_input_tokens_seen": 131353792, + "step": 1899 + }, + { + "epoch": 118.75, + "grad_norm": 4.32176320487952, + "learning_rate": 5e-05, + "loss": 0.0047, + "num_input_tokens_seen": 131425408, + "step": 1900 + }, + { + "epoch": 118.75, + "loss": 0.004484056495130062, + "loss_ce": 0.0002726308593992144, + "loss_xval": 0.00421142578125, + "num_input_tokens_seen": 131425408, + "step": 1900 + }, + { + "epoch": 118.8125, + "grad_norm": 2.625321999415006, + "learning_rate": 5e-05, + "loss": 0.002, + "num_input_tokens_seen": 131484544, + "step": 1901 + }, + { + "epoch": 118.8125, + "loss": 0.0019633802585303783, + "loss_ce": 0.00030017219251021743, + "loss_xval": 0.0016632080078125, + "num_input_tokens_seen": 131484544, + "step": 1901 + }, + { + "epoch": 118.875, + "grad_norm": 0.08851930350664257, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 131556160, + "step": 1902 + }, + { + "epoch": 118.875, + "loss": 0.0004228780453559011, + "loss_ce": 0.0002989003842230886, + "loss_xval": 0.0001239776611328125, + "num_input_tokens_seen": 131556160, + "step": 1902 + }, + { + "epoch": 118.9375, + "grad_norm": 2.4963959699155094, + "learning_rate": 5e-05, + "loss": 0.0019, + "num_input_tokens_seen": 131602752, + "step": 1903 + }, + { + "epoch": 118.9375, + "loss": 0.0017550225602462888, + "loss_ce": 0.00027492005028761923, + "loss_xval": 0.0014801025390625, + "num_input_tokens_seen": 131602752, + "step": 1903 + }, + { + "epoch": 119.0, + "grad_norm": 4.442726334947545, + "learning_rate": 5e-05, + "loss": 0.005, + 
"num_input_tokens_seen": 131674304, + "step": 1904 + }, + { + "epoch": 119.0, + "loss": 0.005132955964654684, + "loss_ce": 0.00025014366838149726, + "loss_xval": 0.0048828125, + "num_input_tokens_seen": 131674304, + "step": 1904 + }, + { + "epoch": 119.0625, + "grad_norm": 5.2266029914078125, + "learning_rate": 5e-05, + "loss": 0.0068, + "num_input_tokens_seen": 131745856, + "step": 1905 + }, + { + "epoch": 119.0625, + "loss": 0.006491591222584248, + "loss_ce": 0.0002660052268765867, + "loss_xval": 0.0062255859375, + "num_input_tokens_seen": 131745856, + "step": 1905 + }, + { + "epoch": 119.125, + "grad_norm": 4.301569943752984, + "learning_rate": 5e-05, + "loss": 0.0047, + "num_input_tokens_seen": 131817408, + "step": 1906 + }, + { + "epoch": 119.125, + "loss": 0.0044795856811106205, + "loss_ce": 0.0002681601035874337, + "loss_xval": 0.00421142578125, + "num_input_tokens_seen": 131817408, + "step": 1906 + }, + { + "epoch": 119.1875, + "grad_norm": 1.8553873164934465, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 131889024, + "step": 1907 + }, + { + "epoch": 119.1875, + "loss": 0.0012664373498409986, + "loss_ce": 0.0002517279062885791, + "loss_xval": 0.00101470947265625, + "num_input_tokens_seen": 131889024, + "step": 1907 + }, + { + "epoch": 119.25, + "grad_norm": 0.8385600978339574, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 131960704, + "step": 1908 + }, + { + "epoch": 119.25, + "loss": 0.0005585540202446282, + "loss_ce": 0.0002533782389946282, + "loss_xval": 0.00030517578125, + "num_input_tokens_seen": 131960704, + "step": 1908 + }, + { + "epoch": 119.3125, + "grad_norm": 2.8320554472079498, + "learning_rate": 5e-05, + "loss": 0.0023, + "num_input_tokens_seen": 132032384, + "step": 1909 + }, + { + "epoch": 119.3125, + "loss": 0.002158309333026409, + "loss_ce": 0.00023570179473608732, + "loss_xval": 0.001922607421875, + "num_input_tokens_seen": 132032384, + "step": 1909 + }, + { + "epoch": 119.375, + "grad_norm": 4.023752035863257, + "learning_rate": 5e-05, + "loss": 0.004, + "num_input_tokens_seen": 132104064, + "step": 1910 + }, + { + "epoch": 119.375, + "loss": 0.0039420705288648605, + "loss_ce": 0.00023418469936586916, + "loss_xval": 0.0037078857421875, + "num_input_tokens_seen": 132104064, + "step": 1910 + }, + { + "epoch": 119.4375, + "grad_norm": 4.339042522343855, + "learning_rate": 5e-05, + "loss": 0.0046, + "num_input_tokens_seen": 132175616, + "step": 1911 + }, + { + "epoch": 119.4375, + "loss": 0.004613110329955816, + "loss_ce": 0.00021857912361156195, + "loss_xval": 0.00439453125, + "num_input_tokens_seen": 132175616, + "step": 1911 + }, + { + "epoch": 119.5, + "grad_norm": 3.577390118173682, + "learning_rate": 5e-05, + "loss": 0.0032, + "num_input_tokens_seen": 132247296, + "step": 1912 + }, + { + "epoch": 119.5, + "loss": 0.0032142093405127525, + "loss_ce": 0.0001777103025233373, + "loss_xval": 0.0030364990234375, + "num_input_tokens_seen": 132247296, + "step": 1912 + }, + { + "epoch": 119.5625, + "grad_norm": 1.843513969482811, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 132318912, + "step": 1913 + }, + { + "epoch": 119.5625, + "loss": 0.0010311749065294862, + "loss_ce": 0.0001919415226439014, + "loss_xval": 0.0008392333984375, + "num_input_tokens_seen": 132318912, + "step": 1913 + }, + { + "epoch": 119.625, + "grad_norm": 0.3208426926764874, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 132390592, + "step": 1914 + }, + { + "epoch": 119.625, + "loss": 
0.0003046026104129851, + "loss_ce": 0.00020398998458404094, + "loss_xval": 0.00010061264038085938, + "num_input_tokens_seen": 132390592, + "step": 1914 + }, + { + "epoch": 119.6875, + "grad_norm": 2.265117670760533, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 132462336, + "step": 1915 + }, + { + "epoch": 119.6875, + "loss": 0.0014705532230436802, + "loss_ce": 0.00015829737822059542, + "loss_xval": 0.001312255859375, + "num_input_tokens_seen": 132462336, + "step": 1915 + }, + { + "epoch": 119.75, + "grad_norm": 3.5286223420609417, + "learning_rate": 5e-05, + "loss": 0.0031, + "num_input_tokens_seen": 132534016, + "step": 1916 + }, + { + "epoch": 119.75, + "loss": 0.003026542253792286, + "loss_ce": 0.00017314877186436206, + "loss_xval": 0.0028533935546875, + "num_input_tokens_seen": 132534016, + "step": 1916 + }, + { + "epoch": 119.8125, + "grad_norm": 3.8044836959593145, + "learning_rate": 5e-05, + "loss": 0.0035, + "num_input_tokens_seen": 132593216, + "step": 1917 + }, + { + "epoch": 119.8125, + "loss": 0.0033919373527169228, + "loss_ce": 0.00014181526785250753, + "loss_xval": 0.0032501220703125, + "num_input_tokens_seen": 132593216, + "step": 1917 + }, + { + "epoch": 119.875, + "grad_norm": 3.434972428474604, + "learning_rate": 5e-05, + "loss": 0.0029, + "num_input_tokens_seen": 132664832, + "step": 1918 + }, + { + "epoch": 119.875, + "loss": 0.0028979736380279064, + "loss_ce": 0.00013613273040391505, + "loss_xval": 0.0027618408203125, + "num_input_tokens_seen": 132664832, + "step": 1918 + }, + { + "epoch": 119.9375, + "grad_norm": 2.858343072749603, + "learning_rate": 5e-05, + "loss": 0.0021, + "num_input_tokens_seen": 132736512, + "step": 1919 + }, + { + "epoch": 119.9375, + "loss": 0.0021448051556944847, + "loss_ce": 0.00013064501399639994, + "loss_xval": 0.00201416015625, + "num_input_tokens_seen": 132736512, + "step": 1919 + }, + { + "epoch": 120.0, + "grad_norm": 2.0550669726698705, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 132808256, + "step": 1920 + }, + { + "epoch": 120.0, + "loss": 0.0010152951581403613, + "loss_ce": 0.00014554419612977654, + "loss_xval": 0.0008697509765625, + "num_input_tokens_seen": 132808256, + "step": 1920 + }, + { + "epoch": 120.0625, + "grad_norm": 0.9455822939077928, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 132880000, + "step": 1921 + }, + { + "epoch": 120.0625, + "loss": 0.0003893776738550514, + "loss_ce": 0.0001242562138941139, + "loss_xval": 0.0002651214599609375, + "num_input_tokens_seen": 132880000, + "step": 1921 + }, + { + "epoch": 120.125, + "grad_norm": 0.038142623378173694, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 132951616, + "step": 1922 + }, + { + "epoch": 120.125, + "loss": 0.00025403310428373516, + "loss_ce": 0.00013625432620756328, + "loss_xval": 0.00011777877807617188, + "num_input_tokens_seen": 132951616, + "step": 1922 + }, + { + "epoch": 120.1875, + "grad_norm": 0.517983666843393, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 133023296, + "step": 1923 + }, + { + "epoch": 120.1875, + "loss": 0.0002651893300935626, + "loss_ce": 0.00013072125148028135, + "loss_xval": 0.00013446807861328125, + "num_input_tokens_seen": 133023296, + "step": 1923 + }, + { + "epoch": 120.25, + "grad_norm": 0.5278042857056948, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 133094912, + "step": 1924 + }, + { + "epoch": 120.25, + "loss": 0.00028297933749854565, + "loss_ce": 0.00011322532373014838, + 
"loss_xval": 0.0001697540283203125, + "num_input_tokens_seen": 133094912, + "step": 1924 + }, + { + "epoch": 120.3125, + "grad_norm": 0.19533164384864551, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 133166592, + "step": 1925 + }, + { + "epoch": 120.3125, + "loss": 0.0001814560528146103, + "loss_ce": 0.00012256666377652436, + "loss_xval": 5.888938903808594e-05, + "num_input_tokens_seen": 133166592, + "step": 1925 + }, + { + "epoch": 120.375, + "grad_norm": 0.26742184823554127, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 133238208, + "step": 1926 + }, + { + "epoch": 120.375, + "loss": 0.00021792534971609712, + "loss_ce": 0.000109206470369827, + "loss_xval": 0.0001087188720703125, + "num_input_tokens_seen": 133238208, + "step": 1926 + }, + { + "epoch": 120.4375, + "grad_norm": 0.6238052263308073, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 133297344, + "step": 1927 + }, + { + "epoch": 120.4375, + "loss": 0.0002904341381508857, + "loss_ce": 9.874559327727184e-05, + "loss_xval": 0.00019168853759765625, + "num_input_tokens_seen": 133297344, + "step": 1927 + }, + { + "epoch": 120.5, + "grad_norm": 0.8868703067433092, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 133369024, + "step": 1928 + }, + { + "epoch": 120.5, + "loss": 0.00046274211490526795, + "loss_ce": 8.508710016030818e-05, + "loss_xval": 0.000377655029296875, + "num_input_tokens_seen": 133369024, + "step": 1928 + }, + { + "epoch": 120.5625, + "grad_norm": 1.3927712553743166, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 133440704, + "step": 1929 + }, + { + "epoch": 120.5625, + "loss": 0.0006976852891966701, + "loss_ce": 9.114840213442221e-05, + "loss_xval": 0.000606536865234375, + "num_input_tokens_seen": 133440704, + "step": 1929 + }, + { + "epoch": 120.625, + "grad_norm": 2.6743346780382424, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 133512256, + "step": 1930 + }, + { + "epoch": 120.625, + "loss": 0.00175421591848135, + "loss_ce": 8.337857434526086e-05, + "loss_xval": 0.00167083740234375, + "num_input_tokens_seen": 133512256, + "step": 1930 + }, + { + "epoch": 120.6875, + "grad_norm": 5.5814413905267255, + "learning_rate": 5e-05, + "loss": 0.0071, + "num_input_tokens_seen": 133571392, + "step": 1931 + }, + { + "epoch": 120.6875, + "loss": 0.007034452632069588, + "loss_ce": 7.644500874448568e-05, + "loss_xval": 0.0069580078125, + "num_input_tokens_seen": 133571392, + "step": 1931 + }, + { + "epoch": 120.75, + "grad_norm": 11.156357970593493, + "learning_rate": 5e-05, + "loss": 0.0282, + "num_input_tokens_seen": 133643008, + "step": 1932 + }, + { + "epoch": 120.75, + "loss": 0.028513383120298386, + "loss_ce": 7.099966751411557e-05, + "loss_xval": 0.0284423828125, + "num_input_tokens_seen": 133643008, + "step": 1932 + }, + { + "epoch": 120.8125, + "grad_norm": 19.248052553147836, + "learning_rate": 5e-05, + "loss": 0.086, + "num_input_tokens_seen": 133714624, + "step": 1933 + }, + { + "epoch": 120.8125, + "loss": 0.085076242685318, + "loss_ce": 0.00011530210758792236, + "loss_xval": 0.0849609375, + "num_input_tokens_seen": 133714624, + "step": 1933 + }, + { + "epoch": 120.875, + "grad_norm": 20.62678420915086, + "learning_rate": 5e-05, + "loss": 0.1021, + "num_input_tokens_seen": 133773696, + "step": 1934 + }, + { + "epoch": 120.875, + "loss": 0.10283531993627548, + "loss_ce": 0.00029625691240653396, + "loss_xval": 0.1025390625, + "num_input_tokens_seen": 133773696, + "step": 1934 
+ }, + { + "epoch": 120.9375, + "grad_norm": 7.228987067817792, + "learning_rate": 5e-05, + "loss": 0.0146, + "num_input_tokens_seen": 133845440, + "step": 1935 + }, + { + "epoch": 120.9375, + "loss": 0.015175651758909225, + "loss_ce": 0.001259636483155191, + "loss_xval": 0.013916015625, + "num_input_tokens_seen": 133845440, + "step": 1935 + }, + { + "epoch": 121.0, + "grad_norm": 9.84998372390247, + "learning_rate": 5e-05, + "loss": 0.0271, + "num_input_tokens_seen": 133917056, + "step": 1936 + }, + { + "epoch": 121.0, + "loss": 0.02591935358941555, + "loss_ce": 0.0019935716409236193, + "loss_xval": 0.02392578125, + "num_input_tokens_seen": 133917056, + "step": 1936 + }, + { + "epoch": 121.0625, + "grad_norm": 18.91736705034326, + "learning_rate": 5e-05, + "loss": 0.0873, + "num_input_tokens_seen": 133988736, + "step": 1937 + }, + { + "epoch": 121.0625, + "loss": 0.08870452642440796, + "loss_ce": 0.0017904606647789478, + "loss_xval": 0.0869140625, + "num_input_tokens_seen": 133988736, + "step": 1937 + }, + { + "epoch": 121.125, + "grad_norm": 11.215001835063116, + "learning_rate": 5e-05, + "loss": 0.0317, + "num_input_tokens_seen": 134060416, + "step": 1938 + }, + { + "epoch": 121.125, + "loss": 0.030903557315468788, + "loss_ce": 0.0012404713779687881, + "loss_xval": 0.0296630859375, + "num_input_tokens_seen": 134060416, + "step": 1938 + }, + { + "epoch": 121.1875, + "grad_norm": 3.4933702646752485, + "learning_rate": 5e-05, + "loss": 0.0058, + "num_input_tokens_seen": 134132160, + "step": 1939 + }, + { + "epoch": 121.1875, + "loss": 0.006178136914968491, + "loss_ce": 0.0012953245313838124, + "loss_xval": 0.0048828125, + "num_input_tokens_seen": 134132160, + "step": 1939 + }, + { + "epoch": 121.25, + "grad_norm": 25.058268735271895, + "learning_rate": 5e-05, + "loss": 0.1257, + "num_input_tokens_seen": 134203904, + "step": 1940 + }, + { + "epoch": 121.25, + "loss": 0.1271267533302307, + "loss_ce": 0.0011501964181661606, + "loss_xval": 0.1259765625, + "num_input_tokens_seen": 134203904, + "step": 1940 + }, + { + "epoch": 121.3125, + "grad_norm": 30.920576182316488, + "learning_rate": 5e-05, + "loss": 0.2169, + "num_input_tokens_seen": 134275456, + "step": 1941 + }, + { + "epoch": 121.3125, + "loss": 0.21448010206222534, + "loss_ce": 0.0006129105458967388, + "loss_xval": 0.2138671875, + "num_input_tokens_seen": 134275456, + "step": 1941 + }, + { + "epoch": 121.375, + "grad_norm": 4.388403747873867, + "learning_rate": 5e-05, + "loss": 0.0117, + "num_input_tokens_seen": 134347072, + "step": 1942 + }, + { + "epoch": 121.375, + "loss": 0.008787467144429684, + "loss_ce": 0.0010360024170950055, + "loss_xval": 0.00775146484375, + "num_input_tokens_seen": 134347072, + "step": 1942 + }, + { + "epoch": 121.4375, + "grad_norm": 28.057472943062788, + "learning_rate": 5e-05, + "loss": 0.1948, + "num_input_tokens_seen": 134406208, + "step": 1943 + }, + { + "epoch": 121.4375, + "loss": 0.1948278546333313, + "loss_ce": 0.00146848289296031, + "loss_xval": 0.193359375, + "num_input_tokens_seen": 134406208, + "step": 1943 + }, + { + "epoch": 121.5, + "grad_norm": 8.07372460241718, + "learning_rate": 5e-05, + "loss": 0.0311, + "num_input_tokens_seen": 134477824, + "step": 1944 + }, + { + "epoch": 121.5, + "loss": 0.03582604229450226, + "loss_ce": 0.008238151669502258, + "loss_xval": 0.027587890625, + "num_input_tokens_seen": 134477824, + "step": 1944 + }, + { + "epoch": 121.5625, + "grad_norm": 19.59842984931545, + "learning_rate": 5e-05, + "loss": 0.1091, + "num_input_tokens_seen": 134549440, + "step": 1945 + 
}, + { + "epoch": 121.5625, + "loss": 0.10925983637571335, + "loss_ce": 0.0164863970130682, + "loss_xval": 0.0927734375, + "num_input_tokens_seen": 134549440, + "step": 1945 + }, + { + "epoch": 121.625, + "grad_norm": 25.373277244650303, + "learning_rate": 5e-05, + "loss": 0.1628, + "num_input_tokens_seen": 134621120, + "step": 1946 + }, + { + "epoch": 121.625, + "loss": 0.15820538997650146, + "loss_ce": 0.003908507991582155, + "loss_xval": 0.154296875, + "num_input_tokens_seen": 134621120, + "step": 1946 + }, + { + "epoch": 121.6875, + "grad_norm": 5.837645263257035, + "learning_rate": 5e-05, + "loss": 0.0126, + "num_input_tokens_seen": 134692672, + "step": 1947 + }, + { + "epoch": 121.6875, + "loss": 0.01283172331750393, + "loss_ce": 0.0018453949596732855, + "loss_xval": 0.010986328125, + "num_input_tokens_seen": 134692672, + "step": 1947 + }, + { + "epoch": 121.75, + "grad_norm": 21.944172020399566, + "learning_rate": 5e-05, + "loss": 0.115, + "num_input_tokens_seen": 134764288, + "step": 1948 + }, + { + "epoch": 121.75, + "loss": 0.11614223569631577, + "loss_ce": 0.0009078634320758283, + "loss_xval": 0.115234375, + "num_input_tokens_seen": 134764288, + "step": 1948 + }, + { + "epoch": 121.8125, + "grad_norm": 4.671667038476155, + "learning_rate": 5e-05, + "loss": 0.0094, + "num_input_tokens_seen": 134835968, + "step": 1949 + }, + { + "epoch": 121.8125, + "loss": 0.007974156178534031, + "loss_ce": 0.0005278667085804045, + "loss_xval": 0.0074462890625, + "num_input_tokens_seen": 134835968, + "step": 1949 + }, + { + "epoch": 121.875, + "grad_norm": 17.329389706599958, + "learning_rate": 5e-05, + "loss": 0.075, + "num_input_tokens_seen": 134907520, + "step": 1950 + }, + { + "epoch": 121.875, + "loss": 0.07992716878652573, + "loss_ce": 0.0003373274812474847, + "loss_xval": 0.07958984375, + "num_input_tokens_seen": 134907520, + "step": 1950 + }, + { + "epoch": 121.9375, + "grad_norm": 4.712997049900013, + "learning_rate": 5e-05, + "loss": 0.007, + "num_input_tokens_seen": 134979072, + "step": 1951 + }, + { + "epoch": 121.9375, + "loss": 0.006857059895992279, + "loss_ce": 0.0002652631083037704, + "loss_xval": 0.006591796875, + "num_input_tokens_seen": 134979072, + "step": 1951 + }, + { + "epoch": 122.0, + "grad_norm": 15.58583942859578, + "learning_rate": 5e-05, + "loss": 0.0579, + "num_input_tokens_seen": 135050752, + "step": 1952 + }, + { + "epoch": 122.0, + "loss": 0.05805974081158638, + "loss_ce": 0.00019841421453747898, + "loss_xval": 0.057861328125, + "num_input_tokens_seen": 135050752, + "step": 1952 + }, + { + "epoch": 122.0625, + "grad_norm": 7.025742904997995, + "learning_rate": 5e-05, + "loss": 0.0174, + "num_input_tokens_seen": 135122496, + "step": 1953 + }, + { + "epoch": 122.0625, + "loss": 0.017606699839234352, + "loss_ce": 0.0001506455009803176, + "loss_xval": 0.0174560546875, + "num_input_tokens_seen": 135122496, + "step": 1953 + }, + { + "epoch": 122.125, + "grad_norm": 15.468862719989644, + "learning_rate": 5e-05, + "loss": 0.0571, + "num_input_tokens_seen": 135181632, + "step": 1954 + }, + { + "epoch": 122.125, + "loss": 0.056041352450847626, + "loss_ce": 0.00013315118849277496, + "loss_xval": 0.055908203125, + "num_input_tokens_seen": 135181632, + "step": 1954 + }, + { + "epoch": 122.1875, + "grad_norm": 5.551718809172979, + "learning_rate": 5e-05, + "loss": 0.0092, + "num_input_tokens_seen": 135253248, + "step": 1955 + }, + { + "epoch": 122.1875, + "loss": 0.008621604181826115, + "loss_ce": 0.00013771781232208014, + "loss_xval": 0.00848388671875, + 
"num_input_tokens_seen": 135253248, + "step": 1955 + }, + { + "epoch": 122.25, + "grad_norm": 12.265454105803553, + "learning_rate": 5e-05, + "loss": 0.0357, + "num_input_tokens_seen": 135324800, + "step": 1956 + }, + { + "epoch": 122.25, + "loss": 0.036517731845378876, + "loss_ce": 0.00014077810919843614, + "loss_xval": 0.036376953125, + "num_input_tokens_seen": 135324800, + "step": 1956 + }, + { + "epoch": 122.3125, + "grad_norm": 6.687800353400105, + "learning_rate": 5e-05, + "loss": 0.0118, + "num_input_tokens_seen": 135396416, + "step": 1957 + }, + { + "epoch": 122.3125, + "loss": 0.01237054355442524, + "loss_ce": 0.00016351198428310454, + "loss_xval": 0.01220703125, + "num_input_tokens_seen": 135396416, + "step": 1957 + }, + { + "epoch": 122.375, + "grad_norm": 10.946746665319191, + "learning_rate": 5e-05, + "loss": 0.0298, + "num_input_tokens_seen": 135467968, + "step": 1958 + }, + { + "epoch": 122.375, + "loss": 0.030561501160264015, + "loss_ce": 0.00016599331866018474, + "loss_xval": 0.0303955078125, + "num_input_tokens_seen": 135467968, + "step": 1958 + }, + { + "epoch": 122.4375, + "grad_norm": 6.228161151916856, + "learning_rate": 5e-05, + "loss": 0.0103, + "num_input_tokens_seen": 135539520, + "step": 1959 + }, + { + "epoch": 122.4375, + "loss": 0.010677915997803211, + "loss_ce": 0.00024090414808597416, + "loss_xval": 0.01043701171875, + "num_input_tokens_seen": 135539520, + "step": 1959 + }, + { + "epoch": 122.5, + "grad_norm": 5.822338375589962, + "learning_rate": 5e-05, + "loss": 0.0092, + "num_input_tokens_seen": 135611264, + "step": 1960 + }, + { + "epoch": 122.5, + "loss": 0.010117021389305592, + "loss_ce": 0.00029036152409389615, + "loss_xval": 0.00982666015625, + "num_input_tokens_seen": 135611264, + "step": 1960 + }, + { + "epoch": 122.5625, + "grad_norm": 8.336276382534857, + "learning_rate": 5e-05, + "loss": 0.0189, + "num_input_tokens_seen": 135682816, + "step": 1961 + }, + { + "epoch": 122.5625, + "loss": 0.02094731293618679, + "loss_ce": 0.00031743032741360366, + "loss_xval": 0.0206298828125, + "num_input_tokens_seen": 135682816, + "step": 1961 + }, + { + "epoch": 122.625, + "grad_norm": 1.6092316524284045, + "learning_rate": 5e-05, + "loss": 0.0048, + "num_input_tokens_seen": 135754432, + "step": 1962 + }, + { + "epoch": 122.625, + "loss": 0.005259785335510969, + "loss_ce": 0.000407490530051291, + "loss_xval": 0.004852294921875, + "num_input_tokens_seen": 135754432, + "step": 1962 + }, + { + "epoch": 122.6875, + "grad_norm": 8.359250061226241, + "learning_rate": 5e-05, + "loss": 0.0202, + "num_input_tokens_seen": 135825984, + "step": 1963 + }, + { + "epoch": 122.6875, + "loss": 0.019757412374019623, + "loss_ce": 0.00047030241694301367, + "loss_xval": 0.019287109375, + "num_input_tokens_seen": 135825984, + "step": 1963 + }, + { + "epoch": 122.75, + "grad_norm": 3.774240853149926, + "learning_rate": 5e-05, + "loss": 0.0062, + "num_input_tokens_seen": 135897728, + "step": 1964 + }, + { + "epoch": 122.75, + "loss": 0.005050990264862776, + "loss_ce": 0.0005649062804877758, + "loss_xval": 0.004486083984375, + "num_input_tokens_seen": 135897728, + "step": 1964 + }, + { + "epoch": 122.8125, + "grad_norm": 6.56217554022863, + "learning_rate": 5e-05, + "loss": 0.0123, + "num_input_tokens_seen": 135969408, + "step": 1965 + }, + { + "epoch": 122.8125, + "loss": 0.013142449781298637, + "loss_ce": 0.000630243041086942, + "loss_xval": 0.01251220703125, + "num_input_tokens_seen": 135969408, + "step": 1965 + }, + { + "epoch": 122.875, + "grad_norm": 6.069851309504797, + 
"learning_rate": 5e-05, + "loss": 0.0105, + "num_input_tokens_seen": 136041088, + "step": 1966 + }, + { + "epoch": 122.875, + "loss": 0.00996310357004404, + "loss_ce": 0.0007467948598787189, + "loss_xval": 0.00921630859375, + "num_input_tokens_seen": 136041088, + "step": 1966 + }, + { + "epoch": 122.9375, + "grad_norm": 2.64188694224837, + "learning_rate": 5e-05, + "loss": 0.004, + "num_input_tokens_seen": 136112704, + "step": 1967 + }, + { + "epoch": 122.9375, + "loss": 0.003580670803785324, + "loss_ce": 0.000742536096367985, + "loss_xval": 0.002838134765625, + "num_input_tokens_seen": 136112704, + "step": 1967 + }, + { + "epoch": 123.0, + "grad_norm": 6.8665627223274255, + "learning_rate": 5e-05, + "loss": 0.0147, + "num_input_tokens_seen": 136184256, + "step": 1968 + }, + { + "epoch": 123.0, + "loss": 0.012735115364193916, + "loss_ce": 0.0008942951099015772, + "loss_xval": 0.0118408203125, + "num_input_tokens_seen": 136184256, + "step": 1968 + }, + { + "epoch": 123.0625, + "grad_norm": 0.5807229440386033, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_input_tokens_seen": 136255872, + "step": 1969 + }, + { + "epoch": 123.0625, + "loss": 0.001586060388945043, + "loss_ce": 0.0008002327522262931, + "loss_xval": 0.00078582763671875, + "num_input_tokens_seen": 136255872, + "step": 1969 + }, + { + "epoch": 123.125, + "grad_norm": 5.465627938504707, + "learning_rate": 5e-05, + "loss": 0.0088, + "num_input_tokens_seen": 136327424, + "step": 1970 + }, + { + "epoch": 123.125, + "loss": 0.008768781088292599, + "loss_ce": 0.0008342110668309033, + "loss_xval": 0.0079345703125, + "num_input_tokens_seen": 136327424, + "step": 1970 + }, + { + "epoch": 123.1875, + "grad_norm": 3.2688936917904052, + "learning_rate": 5e-05, + "loss": 0.004, + "num_input_tokens_seen": 136399168, + "step": 1971 + }, + { + "epoch": 123.1875, + "loss": 0.003709967015311122, + "loss_ce": 0.000887090980540961, + "loss_xval": 0.0028228759765625, + "num_input_tokens_seen": 136399168, + "step": 1971 + }, + { + "epoch": 123.25, + "grad_norm": 2.859825800308269, + "learning_rate": 5e-05, + "loss": 0.0042, + "num_input_tokens_seen": 136470784, + "step": 1972 + }, + { + "epoch": 123.25, + "loss": 0.0040792059153318405, + "loss_ce": 0.0008901189430616796, + "loss_xval": 0.0031890869140625, + "num_input_tokens_seen": 136470784, + "step": 1972 + }, + { + "epoch": 123.3125, + "grad_norm": 4.779754176105408, + "learning_rate": 5e-05, + "loss": 0.008, + "num_input_tokens_seen": 136529984, + "step": 1973 + }, + { + "epoch": 123.3125, + "loss": 0.008246231824159622, + "loss_ce": 0.0008304607472382486, + "loss_xval": 0.007415771484375, + "num_input_tokens_seen": 136529984, + "step": 1973 + }, + { + "epoch": 123.375, + "grad_norm": 0.39980392059776326, + "learning_rate": 5e-05, + "loss": 0.0019, + "num_input_tokens_seen": 136601664, + "step": 1974 + }, + { + "epoch": 123.375, + "loss": 0.0014047473669052124, + "loss_ce": 0.0008020252571441233, + "loss_xval": 0.00060272216796875, + "num_input_tokens_seen": 136601664, + "step": 1974 + }, + { + "epoch": 123.4375, + "grad_norm": 4.187773290468926, + "learning_rate": 5e-05, + "loss": 0.0062, + "num_input_tokens_seen": 136673344, + "step": 1975 + }, + { + "epoch": 123.4375, + "loss": 0.006777295842766762, + "loss_ce": 0.0007348151411861181, + "loss_xval": 0.00604248046875, + "num_input_tokens_seen": 136673344, + "step": 1975 + }, + { + "epoch": 123.5, + "grad_norm": 1.7687218212830937, + "learning_rate": 5e-05, + "loss": 0.0021, + "num_input_tokens_seen": 136732416, + "step": 1976 + }, + { + 
"epoch": 123.5, + "loss": 0.002407355234026909, + "loss_ce": 0.0007212591008283198, + "loss_xval": 0.00168609619140625, + "num_input_tokens_seen": 136732416, + "step": 1976 + }, + { + "epoch": 123.5625, + "grad_norm": 2.6514844366807613, + "learning_rate": 5e-05, + "loss": 0.0033, + "num_input_tokens_seen": 136804160, + "step": 1977 + }, + { + "epoch": 123.5625, + "loss": 0.0029455279000103474, + "loss_ce": 0.0006109331152401865, + "loss_xval": 0.0023345947265625, + "num_input_tokens_seen": 136804160, + "step": 1977 + }, + { + "epoch": 123.625, + "grad_norm": 3.0990904073083367, + "learning_rate": 5e-05, + "loss": 0.004, + "num_input_tokens_seen": 136875840, + "step": 1978 + }, + { + "epoch": 123.625, + "loss": 0.003793952288106084, + "loss_ce": 0.0004980538506060839, + "loss_xval": 0.0032958984375, + "num_input_tokens_seen": 136875840, + "step": 1978 + }, + { + "epoch": 123.6875, + "grad_norm": 1.103623406702498, + "learning_rate": 5e-05, + "loss": 0.0019, + "num_input_tokens_seen": 136947584, + "step": 1979 + }, + { + "epoch": 123.6875, + "loss": 0.0019718646071851254, + "loss_ce": 0.0004917620099149644, + "loss_xval": 0.0014801025390625, + "num_input_tokens_seen": 136947584, + "step": 1979 + }, + { + "epoch": 123.75, + "grad_norm": 3.151175428198334, + "learning_rate": 5e-05, + "loss": 0.0034, + "num_input_tokens_seen": 137006656, + "step": 1980 + }, + { + "epoch": 123.75, + "loss": 0.0033435134682804346, + "loss_ce": 0.0004443435464054346, + "loss_xval": 0.002899169921875, + "num_input_tokens_seen": 137006656, + "step": 1980 + }, + { + "epoch": 123.8125, + "grad_norm": 0.6259623426254022, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 137065728, + "step": 1981 + }, + { + "epoch": 123.8125, + "loss": 0.0006881886511109769, + "loss_ce": 0.00039445696165785193, + "loss_xval": 0.000293731689453125, + "num_input_tokens_seen": 137065728, + "step": 1981 + }, + { + "epoch": 123.875, + "grad_norm": 3.066975260146319, + "learning_rate": 5e-05, + "loss": 0.0029, + "num_input_tokens_seen": 137137408, + "step": 1982 + }, + { + "epoch": 123.875, + "loss": 0.002845182316377759, + "loss_ce": 0.00037325851735658944, + "loss_xval": 0.002471923828125, + "num_input_tokens_seen": 137137408, + "step": 1982 + }, + { + "epoch": 123.9375, + "grad_norm": 1.3733118331002803, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 137209024, + "step": 1983 + }, + { + "epoch": 123.9375, + "loss": 0.0013497625477612019, + "loss_ce": 0.0003426824987400323, + "loss_xval": 0.001007080078125, + "num_input_tokens_seen": 137209024, + "step": 1983 + }, + { + "epoch": 124.0, + "grad_norm": 2.5286701579932305, + "learning_rate": 5e-05, + "loss": 0.0025, + "num_input_tokens_seen": 137280704, + "step": 1984 + }, + { + "epoch": 124.0, + "loss": 0.002422866877168417, + "loss_ce": 0.0003018952556885779, + "loss_xval": 0.0021209716796875, + "num_input_tokens_seen": 137280704, + "step": 1984 + }, + { + "epoch": 124.0625, + "grad_norm": 1.6838102534552042, + "learning_rate": 5e-05, + "loss": 0.0016, + "num_input_tokens_seen": 137339648, + "step": 1985 + }, + { + "epoch": 124.0625, + "loss": 0.0013198907254263759, + "loss_ce": 0.00027466367464512587, + "loss_xval": 0.00104522705078125, + "num_input_tokens_seen": 137339648, + "step": 1985 + }, + { + "epoch": 124.125, + "grad_norm": 1.870493706709494, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 137398912, + "step": 1986 + }, + { + "epoch": 124.125, + "loss": 0.0017201672308146954, + "loss_ce": 0.0002476940571796149, + 
"loss_xval": 0.00147247314453125, + "num_input_tokens_seen": 137398912, + "step": 1986 + }, + { + "epoch": 124.1875, + "grad_norm": 1.8316167682634479, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 137470464, + "step": 1987 + }, + { + "epoch": 124.1875, + "loss": 0.0013492691796272993, + "loss_ce": 0.0002201187307946384, + "loss_xval": 0.001129150390625, + "num_input_tokens_seen": 137470464, + "step": 1987 + }, + { + "epoch": 124.25, + "grad_norm": 1.3896300670799442, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 137542144, + "step": 1988 + }, + { + "epoch": 124.25, + "loss": 0.000932362221647054, + "loss_ce": 0.00021138445299584419, + "loss_xval": 0.000720977783203125, + "num_input_tokens_seen": 137542144, + "step": 1988 + }, + { + "epoch": 124.3125, + "grad_norm": 1.8757599594813787, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_input_tokens_seen": 137613760, + "step": 1989 + }, + { + "epoch": 124.3125, + "loss": 0.0013382441829890013, + "loss_ce": 0.0001938350615091622, + "loss_xval": 0.0011444091796875, + "num_input_tokens_seen": 137613760, + "step": 1989 + }, + { + "epoch": 124.375, + "grad_norm": 1.0058499642413008, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 137685504, + "step": 1990 + }, + { + "epoch": 124.375, + "loss": 0.000767315796110779, + "loss_ce": 0.00019129650900140405, + "loss_xval": 0.000576019287109375, + "num_input_tokens_seen": 137685504, + "step": 1990 + }, + { + "epoch": 124.4375, + "grad_norm": 1.7611596455705885, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 137757120, + "step": 1991 + }, + { + "epoch": 124.4375, + "loss": 0.0010359555017203093, + "loss_ce": 0.00015476047701667994, + "loss_xval": 0.000881195068359375, + "num_input_tokens_seen": 137757120, + "step": 1991 + }, + { + "epoch": 124.5, + "grad_norm": 0.7570485915024486, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 137816256, + "step": 1992 + }, + { + "epoch": 124.5, + "loss": 0.0004390325048007071, + "loss_ce": 0.00014911549806129187, + "loss_xval": 0.0002899169921875, + "num_input_tokens_seen": 137816256, + "step": 1992 + }, + { + "epoch": 124.5625, + "grad_norm": 1.321495989789071, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 137887872, + "step": 1993 + }, + { + "epoch": 124.5625, + "loss": 0.000663900631479919, + "loss_ce": 0.00014128712064120919, + "loss_xval": 0.000522613525390625, + "num_input_tokens_seen": 137887872, + "step": 1993 + }, + { + "epoch": 124.625, + "grad_norm": 0.2112346218673502, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 137946880, + "step": 1994 + }, + { + "epoch": 124.625, + "loss": 0.0002966892207041383, + "loss_ce": 0.00012502782919909805, + "loss_xval": 0.000171661376953125, + "num_input_tokens_seen": 137946880, + "step": 1994 + }, + { + "epoch": 124.6875, + "grad_norm": 1.5159169466326932, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 138018496, + "step": 1995 + }, + { + "epoch": 124.6875, + "loss": 0.0007277600234374404, + "loss_ce": 0.00011359379277564585, + "loss_xval": 0.000614166259765625, + "num_input_tokens_seen": 138018496, + "step": 1995 + }, + { + "epoch": 124.75, + "grad_norm": 0.32384576031497636, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 138090048, + "step": 1996 + }, + { + "epoch": 124.75, + "loss": 0.00020585546735674143, + "loss_ce": 0.00011621007433859631, + "loss_xval": 8.96453857421875e-05, + "num_input_tokens_seen": 
138090048, + "step": 1996 + }, + { + "epoch": 124.8125, + "grad_norm": 1.2856806695187504, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 138161664, + "step": 1997 + }, + { + "epoch": 124.8125, + "loss": 0.0009543233318254352, + "loss_ce": 0.00011127521429443732, + "loss_xval": 0.000843048095703125, + "num_input_tokens_seen": 138161664, + "step": 1997 + }, + { + "epoch": 124.875, + "grad_norm": 0.273387107407299, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 138233216, + "step": 1998 + }, + { + "epoch": 124.875, + "loss": 0.00030335847986862063, + "loss_ce": 0.00010690158524084836, + "loss_xval": 0.0001964569091796875, + "num_input_tokens_seen": 138233216, + "step": 1998 + }, + { + "epoch": 124.9375, + "grad_norm": 1.2811936909150359, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 138304832, + "step": 1999 + }, + { + "epoch": 124.9375, + "loss": 0.0005950164049863815, + "loss_ce": 9.529103408567607e-05, + "loss_xval": 0.000499725341796875, + "num_input_tokens_seen": 138304832, + "step": 1999 + }, + { + "epoch": 125.0, + "grad_norm": 0.6247624824023181, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 138363904, + "step": 2000 + }, + { + "epoch": 125.0, + "eval_synth_IoU": 0.16027227230370045, + "eval_synth_MAE_x": 0.00927734375, + "eval_synth_MAE_y": 0.016754150390625, + "eval_synth_NUM_probability": 0.9989532977342606, + "eval_synth_inside_bbox": 0.6875, + "eval_synth_loss": 0.00046615718747489154, + "eval_synth_loss_ce": 8.802525371720549e-05, + "eval_synth_loss_xval": 0.0003781318664550781, + "eval_synth_runtime": 63.8838, + "eval_synth_samples_per_second": 2.004, + "eval_synth_steps_per_second": 0.063, + "num_input_tokens_seen": 138363904, + "step": 2000 + }, + { + "epoch": 125.0, + "loss": 0.0005669583915732801, + "loss_ce": 9.20285820029676e-05, + "loss_xval": 0.0004749298095703125, + "num_input_tokens_seen": 138363904, + "step": 2000 + }, + { + "epoch": 125.0625, + "grad_norm": 1.1289912466788816, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 138435456, + "step": 2001 + }, + { + "epoch": 125.0625, + "loss": 0.0005536347161978483, + "loss_ce": 8.824166434351355e-05, + "loss_xval": 0.00046539306640625, + "num_input_tokens_seen": 138435456, + "step": 2001 + }, + { + "epoch": 125.125, + "grad_norm": 1.3321695183277404, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 138507264, + "step": 2002 + }, + { + "epoch": 125.125, + "loss": 0.0006004355382174253, + "loss_ce": 8.163671736838296e-05, + "loss_xval": 0.000518798828125, + "num_input_tokens_seen": 138507264, + "step": 2002 + }, + { + "epoch": 125.1875, + "grad_norm": 0.1435252998460747, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 138578880, + "step": 2003 + }, + { + "epoch": 125.1875, + "loss": 0.0001957011700142175, + "loss_ce": 8.078341488726437e-05, + "loss_xval": 0.00011491775512695312, + "num_input_tokens_seen": 138578880, + "step": 2003 + }, + { + "epoch": 125.25, + "grad_norm": 0.7789113344012044, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 138650496, + "step": 2004 + }, + { + "epoch": 125.25, + "loss": 0.00030843037529848516, + "loss_ce": 7.47801605029963e-05, + "loss_xval": 0.00023365020751953125, + "num_input_tokens_seen": 138650496, + "step": 2004 + }, + { + "epoch": 125.3125, + "grad_norm": 0.13352900111691626, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 138722112, + "step": 2005 + }, + { + "epoch": 
125.3125, + "loss": 0.0002294059086125344, + "loss_ce": 7.49106693547219e-05, + "loss_xval": 0.0001544952392578125, + "num_input_tokens_seen": 138722112, + "step": 2005 + }, + { + "epoch": 125.375, + "grad_norm": 0.6646610600448061, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 138793664, + "step": 2006 + }, + { + "epoch": 125.375, + "loss": 0.00029537681257352233, + "loss_ce": 7.221702253445983e-05, + "loss_xval": 0.0002231597900390625, + "num_input_tokens_seen": 138793664, + "step": 2006 + }, + { + "epoch": 125.4375, + "grad_norm": 0.2662452337215074, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 138865408, + "step": 2007 + }, + { + "epoch": 125.4375, + "loss": 0.00023515126667916775, + "loss_ce": 6.444357131840661e-05, + "loss_xval": 0.00017070770263671875, + "num_input_tokens_seen": 138865408, + "step": 2007 + }, + { + "epoch": 125.5, + "grad_norm": 0.2014708435449277, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 138937024, + "step": 2008 + }, + { + "epoch": 125.5, + "loss": 0.00019856503058690578, + "loss_ce": 6.600430060643703e-05, + "loss_xval": 0.00013256072998046875, + "num_input_tokens_seen": 138937024, + "step": 2008 + }, + { + "epoch": 125.5625, + "grad_norm": 0.37715980519558245, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 138996160, + "step": 2009 + }, + { + "epoch": 125.5625, + "loss": 0.00028794267564080656, + "loss_ce": 6.001452129567042e-05, + "loss_xval": 0.00022792816162109375, + "num_input_tokens_seen": 138996160, + "step": 2009 + }, + { + "epoch": 125.625, + "grad_norm": 0.15972412896600788, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 139067776, + "step": 2010 + }, + { + "epoch": 125.625, + "loss": 0.00024122698232531548, + "loss_ce": 6.479722651420161e-05, + "loss_xval": 0.00017642974853515625, + "num_input_tokens_seen": 139067776, + "step": 2010 + }, + { + "epoch": 125.6875, + "grad_norm": 0.21785802377503982, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 139139392, + "step": 2011 + }, + { + "epoch": 125.6875, + "loss": 0.0001753378746798262, + "loss_ce": 5.994327875669114e-05, + "loss_xval": 0.00011539459228515625, + "num_input_tokens_seen": 139139392, + "step": 2011 + }, + { + "epoch": 125.75, + "grad_norm": 0.3215632556672757, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 139211200, + "step": 2012 + }, + { + "epoch": 125.75, + "loss": 0.0001643118157517165, + "loss_ce": 5.893080742680468e-05, + "loss_xval": 0.00010538101196289062, + "num_input_tokens_seen": 139211200, + "step": 2012 + }, + { + "epoch": 125.8125, + "grad_norm": 0.0709285778957573, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 139282944, + "step": 2013 + }, + { + "epoch": 125.8125, + "loss": 9.688051795819774e-05, + "loss_ce": 5.7064615248236805e-05, + "loss_xval": 3.981590270996094e-05, + "num_input_tokens_seen": 139282944, + "step": 2013 + }, + { + "epoch": 125.875, + "grad_norm": 0.2572826224861247, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 139341952, + "step": 2014 + }, + { + "epoch": 125.875, + "loss": 0.00015328649897128344, + "loss_ce": 5.267385859042406e-05, + "loss_xval": 0.00010061264038085938, + "num_input_tokens_seen": 139341952, + "step": 2014 + }, + { + "epoch": 125.9375, + "grad_norm": 0.09740524760517949, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 139413760, + "step": 2015 + }, + { + "epoch": 125.9375, + "loss": 
0.00014573728549294174, + "loss_ce": 5.132352816872299e-05, + "loss_xval": 9.441375732421875e-05, + "num_input_tokens_seen": 139413760, + "step": 2015 + }, + { + "epoch": 126.0, + "grad_norm": 0.2981512293786988, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 139485376, + "step": 2016 + }, + { + "epoch": 126.0, + "loss": 0.00012508727377280593, + "loss_ce": 4.7362816985696554e-05, + "loss_xval": 7.772445678710938e-05, + "num_input_tokens_seen": 139485376, + "step": 2016 + }, + { + "epoch": 126.0625, + "grad_norm": 0.23777569105293894, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 139557120, + "step": 2017 + }, + { + "epoch": 126.0625, + "loss": 0.0001335121924057603, + "loss_ce": 4.7204663133015856e-05, + "loss_xval": 8.630752563476562e-05, + "num_input_tokens_seen": 139557120, + "step": 2017 + }, + { + "epoch": 126.125, + "grad_norm": 0.08930608778804024, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 139616256, + "step": 2018 + }, + { + "epoch": 126.125, + "loss": 0.00016115653852466494, + "loss_ce": 4.671562055591494e-05, + "loss_xval": 0.00011444091796875, + "num_input_tokens_seen": 139616256, + "step": 2018 + }, + { + "epoch": 126.1875, + "grad_norm": 0.27264871121824047, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 139687808, + "step": 2019 + }, + { + "epoch": 126.1875, + "loss": 0.00015747947327326983, + "loss_ce": 4.589958189171739e-05, + "loss_xval": 0.00011157989501953125, + "num_input_tokens_seen": 139687808, + "step": 2019 + }, + { + "epoch": 126.25, + "grad_norm": 0.260372602529021, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 139759424, + "step": 2020 + }, + { + "epoch": 126.25, + "loss": 9.420116839464754e-05, + "loss_ce": 4.3179588828934357e-05, + "loss_xval": 5.1021575927734375e-05, + "num_input_tokens_seen": 139759424, + "step": 2020 + }, + { + "epoch": 126.3125, + "grad_norm": 0.024071873095662125, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 139831104, + "step": 2021 + }, + { + "epoch": 126.3125, + "loss": 9.950671665137634e-05, + "loss_ce": 4.1094164771493524e-05, + "loss_xval": 5.841255187988281e-05, + "num_input_tokens_seen": 139831104, + "step": 2021 + }, + { + "epoch": 126.375, + "grad_norm": 0.10003631958517323, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 139902784, + "step": 2022 + }, + { + "epoch": 126.375, + "loss": 8.994160452857614e-05, + "loss_ce": 4.05889586545527e-05, + "loss_xval": 4.935264587402344e-05, + "num_input_tokens_seen": 139902784, + "step": 2022 + }, + { + "epoch": 126.4375, + "grad_norm": 0.5174751342213327, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 139961856, + "step": 2023 + }, + { + "epoch": 126.4375, + "loss": 0.0002707853855099529, + "loss_ce": 4.094986798008904e-05, + "loss_xval": 0.00022983551025390625, + "num_input_tokens_seen": 139961856, + "step": 2023 + }, + { + "epoch": 126.5, + "grad_norm": 1.1338815551773325, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 140033408, + "step": 2024 + }, + { + "epoch": 126.5, + "loss": 0.0003625124227255583, + "loss_ce": 4.20778633269947e-05, + "loss_xval": 0.0003204345703125, + "num_input_tokens_seen": 140033408, + "step": 2024 + }, + { + "epoch": 126.5625, + "grad_norm": 0.6000982973521324, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 140105088, + "step": 2025 + }, + { + "epoch": 126.5625, + "loss": 0.0002064650325337425, + "loss_ce": 
3.861834920826368e-05, + "loss_xval": 0.0001678466796875, + "num_input_tokens_seen": 140105088, + "step": 2025 + }, + { + "epoch": 126.625, + "grad_norm": 0.6923015797971452, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 140176640, + "step": 2026 + }, + { + "epoch": 126.625, + "loss": 0.0002409739390714094, + "loss_ce": 3.879498763126321e-05, + "loss_xval": 0.000202178955078125, + "num_input_tokens_seen": 140176640, + "step": 2026 + }, + { + "epoch": 126.6875, + "grad_norm": 1.3144685472037902, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 140248320, + "step": 2027 + }, + { + "epoch": 126.6875, + "loss": 0.0006072742398828268, + "loss_ce": 3.888433275278658e-05, + "loss_xval": 0.000568389892578125, + "num_input_tokens_seen": 140248320, + "step": 2027 + }, + { + "epoch": 126.75, + "grad_norm": 0.6489809024564193, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 140319936, + "step": 2028 + }, + { + "epoch": 126.75, + "loss": 0.00017327992827631533, + "loss_ce": 3.7858171708649024e-05, + "loss_xval": 0.0001354217529296875, + "num_input_tokens_seen": 140319936, + "step": 2028 + }, + { + "epoch": 126.8125, + "grad_norm": 0.6426814506642058, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 140391552, + "step": 2029 + }, + { + "epoch": 126.8125, + "loss": 0.00019704973965417594, + "loss_ce": 3.587877654354088e-05, + "loss_xval": 0.00016117095947265625, + "num_input_tokens_seen": 140391552, + "step": 2029 + }, + { + "epoch": 126.875, + "grad_norm": 1.5260237419116756, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 140463232, + "step": 2030 + }, + { + "epoch": 126.875, + "loss": 0.0006509263766929507, + "loss_ce": 3.676014603115618e-05, + "loss_xval": 0.000614166259765625, + "num_input_tokens_seen": 140463232, + "step": 2030 + }, + { + "epoch": 126.9375, + "grad_norm": 1.409406677975525, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 140534784, + "step": 2031 + }, + { + "epoch": 126.9375, + "loss": 0.0006154632428660989, + "loss_ce": 3.562926212907769e-05, + "loss_xval": 0.000579833984375, + "num_input_tokens_seen": 140534784, + "step": 2031 + }, + { + "epoch": 127.0, + "grad_norm": 0.5766126788840545, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 140606336, + "step": 2032 + }, + { + "epoch": 127.0, + "loss": 0.00019185608834959567, + "loss_ce": 3.831452704616822e-05, + "loss_xval": 0.00015354156494140625, + "num_input_tokens_seen": 140606336, + "step": 2032 + }, + { + "epoch": 127.0625, + "grad_norm": 0.2498634473100581, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 140665408, + "step": 2033 + }, + { + "epoch": 127.0625, + "loss": 9.204051457345486e-05, + "loss_ce": 3.6012152122566476e-05, + "loss_xval": 5.602836608886719e-05, + "num_input_tokens_seen": 140665408, + "step": 2033 + }, + { + "epoch": 127.125, + "grad_norm": 0.6422050609056498, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 140736960, + "step": 2034 + }, + { + "epoch": 127.125, + "loss": 0.00023144778970163316, + "loss_ce": 3.499088415992446e-05, + "loss_xval": 0.0001964569091796875, + "num_input_tokens_seen": 140736960, + "step": 2034 + }, + { + "epoch": 127.1875, + "grad_norm": 0.5595138398678299, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 140808576, + "step": 2035 + }, + { + "epoch": 127.1875, + "loss": 0.00017433673201594502, + "loss_ce": 3.3192936825798824e-05, + "loss_xval": 
0.000141143798828125, + "num_input_tokens_seen": 140808576, + "step": 2035 + }, + { + "epoch": 127.25, + "grad_norm": 0.22115814221362357, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 140880256, + "step": 2036 + }, + { + "epoch": 127.25, + "loss": 6.543470954056829e-05, + "loss_ce": 3.575160008040257e-05, + "loss_xval": 2.968311309814453e-05, + "num_input_tokens_seen": 140880256, + "step": 2036 + }, + { + "epoch": 127.3125, + "grad_norm": 0.20004183815073995, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 140939456, + "step": 2037 + }, + { + "epoch": 127.3125, + "loss": 0.0001192227064166218, + "loss_ce": 3.291518078185618e-05, + "loss_xval": 8.630752563476562e-05, + "num_input_tokens_seen": 140939456, + "step": 2037 + }, + { + "epoch": 127.375, + "grad_norm": 0.627095105597688, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 140985984, + "step": 2038 + }, + { + "epoch": 127.375, + "loss": 0.00016552691522520036, + "loss_ce": 3.487353751552291e-05, + "loss_xval": 0.00013065338134765625, + "num_input_tokens_seen": 140985984, + "step": 2038 + }, + { + "epoch": 127.4375, + "grad_norm": 0.8872909986246116, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 141057536, + "step": 2039 + }, + { + "epoch": 127.4375, + "loss": 0.00025953692966140807, + "loss_ce": 3.1608771678293124e-05, + "loss_xval": 0.00022792816162109375, + "num_input_tokens_seen": 141057536, + "step": 2039 + }, + { + "epoch": 127.5, + "grad_norm": 0.7824631232935765, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 141129216, + "step": 2040 + }, + { + "epoch": 127.5, + "loss": 0.00019397796131670475, + "loss_ce": 3.376067979843356e-05, + "loss_xval": 0.00016021728515625, + "num_input_tokens_seen": 141129216, + "step": 2040 + }, + { + "epoch": 127.5625, + "grad_norm": 0.3417990588828484, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 141200768, + "step": 2041 + }, + { + "epoch": 127.5625, + "loss": 9.184810187434778e-05, + "loss_ce": 3.12897827825509e-05, + "loss_xval": 6.0558319091796875e-05, + "num_input_tokens_seen": 141200768, + "step": 2041 + }, + { + "epoch": 127.625, + "grad_norm": 0.06222981770637875, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 141272384, + "step": 2042 + }, + { + "epoch": 127.625, + "loss": 9.499691077508032e-05, + "loss_ce": 3.205440953024663e-05, + "loss_xval": 6.29425048828125e-05, + "num_input_tokens_seen": 141272384, + "step": 2042 + }, + { + "epoch": 127.6875, + "grad_norm": 0.11726662055733453, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 141344192, + "step": 2043 + }, + { + "epoch": 127.6875, + "loss": 8.967838220996782e-05, + "loss_ce": 3.1742667488288134e-05, + "loss_xval": 5.793571472167969e-05, + "num_input_tokens_seen": 141344192, + "step": 2043 + }, + { + "epoch": 127.75, + "grad_norm": 0.15529934399109852, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 141415808, + "step": 2044 + }, + { + "epoch": 127.75, + "loss": 9.700076043372974e-05, + "loss_ce": 2.881304499169346e-05, + "loss_xval": 6.818771362304688e-05, + "num_input_tokens_seen": 141415808, + "step": 2044 + }, + { + "epoch": 127.8125, + "grad_norm": 0.10366291324035277, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 141487360, + "step": 2045 + }, + { + "epoch": 127.8125, + "loss": 7.866997475503013e-05, + "loss_ce": 3.0271003197412938e-05, + "loss_xval": 4.839897155761719e-05, + 
"num_input_tokens_seen": 141487360, + "step": 2045 + }, + { + "epoch": 127.875, + "grad_norm": 0.18112455961803986, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 141558912, + "step": 2046 + }, + { + "epoch": 127.875, + "loss": 0.00012426248576957732, + "loss_ce": 3.127923991996795e-05, + "loss_xval": 9.298324584960938e-05, + "num_input_tokens_seen": 141558912, + "step": 2046 + }, + { + "epoch": 127.9375, + "grad_norm": 0.45311283789773527, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 141630656, + "step": 2047 + }, + { + "epoch": 127.9375, + "loss": 0.00011793743760790676, + "loss_ce": 3.067623401875608e-05, + "loss_xval": 8.726119995117188e-05, + "num_input_tokens_seen": 141630656, + "step": 2047 + }, + { + "epoch": 128.0, + "grad_norm": 0.7398344509970638, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 141702400, + "step": 2048 + }, + { + "epoch": 128.0, + "loss": 0.00020481945830397308, + "loss_ce": 3.0297051125671715e-05, + "loss_xval": 0.00017452239990234375, + "num_input_tokens_seen": 141702400, + "step": 2048 + }, + { + "epoch": 128.0625, + "grad_norm": 0.9385140841705101, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 141773952, + "step": 2049 + }, + { + "epoch": 128.0625, + "loss": 0.00026439601788297296, + "loss_ce": 2.9792136047035456e-05, + "loss_xval": 0.0002346038818359375, + "num_input_tokens_seen": 141773952, + "step": 2049 + }, + { + "epoch": 128.125, + "grad_norm": 1.1834843798666885, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 141845632, + "step": 2050 + }, + { + "epoch": 128.125, + "loss": 0.0004072985320817679, + "loss_ce": 2.9643508241861127e-05, + "loss_xval": 0.000377655029296875, + "num_input_tokens_seen": 141845632, + "step": 2050 + }, + { + "epoch": 128.1875, + "grad_norm": 1.6238155697563705, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 141917376, + "step": 2051 + }, + { + "epoch": 128.1875, + "loss": 0.0006558331078849733, + "loss_ce": 3.0222730856621638e-05, + "loss_xval": 0.0006256103515625, + "num_input_tokens_seen": 141917376, + "step": 2051 + }, + { + "epoch": 128.25, + "grad_norm": 2.281162313109408, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_input_tokens_seen": 141989184, + "step": 2052 + }, + { + "epoch": 128.25, + "loss": 0.001286553917452693, + "loss_ce": 2.7703841624315828e-05, + "loss_xval": 0.00125885009765625, + "num_input_tokens_seen": 141989184, + "step": 2052 + }, + { + "epoch": 128.3125, + "grad_norm": 3.1514891072102236, + "learning_rate": 5e-05, + "loss": 0.0024, + "num_input_tokens_seen": 142060800, + "step": 2053 + }, + { + "epoch": 128.3125, + "loss": 0.002576857805252075, + "loss_ce": 2.864013185899239e-05, + "loss_xval": 0.0025482177734375, + "num_input_tokens_seen": 142060800, + "step": 2053 + }, + { + "epoch": 128.375, + "grad_norm": 4.284793735662272, + "learning_rate": 5e-05, + "loss": 0.0044, + "num_input_tokens_seen": 142132416, + "step": 2054 + }, + { + "epoch": 128.375, + "loss": 0.004516967572271824, + "loss_ce": 3.088352968916297e-05, + "loss_xval": 0.004486083984375, + "num_input_tokens_seen": 142132416, + "step": 2054 + }, + { + "epoch": 128.4375, + "grad_norm": 5.376791980859641, + "learning_rate": 5e-05, + "loss": 0.0069, + "num_input_tokens_seen": 142204224, + "step": 2055 + }, + { + "epoch": 128.4375, + "loss": 0.007232699543237686, + "loss_ce": 3.055127672269009e-05, + "loss_xval": 0.0072021484375, + "num_input_tokens_seen": 142204224, + "step": 2055 + }, + { + 
"epoch": 128.5, + "grad_norm": 6.1098489395843805, + "learning_rate": 5e-05, + "loss": 0.0091, + "num_input_tokens_seen": 142275904, + "step": 2056 + }, + { + "epoch": 128.5, + "loss": 0.00888771377503872, + "loss_ce": 3.7616420740960166e-05, + "loss_xval": 0.00885009765625, + "num_input_tokens_seen": 142275904, + "step": 2056 + }, + { + "epoch": 128.5625, + "grad_norm": 5.633904203121933, + "learning_rate": 5e-05, + "loss": 0.0078, + "num_input_tokens_seen": 142335040, + "step": 2057 + }, + { + "epoch": 128.5625, + "loss": 0.007855950854718685, + "loss_ce": 4.3451062083477154e-05, + "loss_xval": 0.0078125, + "num_input_tokens_seen": 142335040, + "step": 2057 + }, + { + "epoch": 128.625, + "grad_norm": 3.200725435628059, + "learning_rate": 5e-05, + "loss": 0.0027, + "num_input_tokens_seen": 142406656, + "step": 2058 + }, + { + "epoch": 128.625, + "loss": 0.0025145290419459343, + "loss_ce": 5.7864039263222367e-05, + "loss_xval": 0.0024566650390625, + "num_input_tokens_seen": 142406656, + "step": 2058 + }, + { + "epoch": 128.6875, + "grad_norm": 0.34371424886412993, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 142478208, + "step": 2059 + }, + { + "epoch": 128.6875, + "loss": 0.0002074497169815004, + "loss_ce": 7.393530540866777e-05, + "loss_xval": 0.000133514404296875, + "num_input_tokens_seen": 142478208, + "step": 2059 + }, + { + "epoch": 128.75, + "grad_norm": 3.282261970447706, + "learning_rate": 5e-05, + "loss": 0.003, + "num_input_tokens_seen": 142549760, + "step": 2060 + }, + { + "epoch": 128.75, + "loss": 0.002972968854010105, + "loss_ce": 8.905776485335082e-05, + "loss_xval": 0.0028839111328125, + "num_input_tokens_seen": 142549760, + "step": 2060 + }, + { + "epoch": 128.8125, + "grad_norm": 3.694021790210018, + "learning_rate": 5e-05, + "loss": 0.0037, + "num_input_tokens_seen": 142621376, + "step": 2061 + }, + { + "epoch": 128.8125, + "loss": 0.003910902887582779, + "loss_ce": 0.00011146435281261802, + "loss_xval": 0.0037994384765625, + "num_input_tokens_seen": 142621376, + "step": 2061 + }, + { + "epoch": 128.875, + "grad_norm": 1.7190069916683812, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 142680512, + "step": 2062 + }, + { + "epoch": 128.875, + "loss": 0.00120463315397501, + "loss_ce": 0.00013651789049617946, + "loss_xval": 0.001068115234375, + "num_input_tokens_seen": 142680512, + "step": 2062 + }, + { + "epoch": 128.9375, + "grad_norm": 0.9999859186357049, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 142752192, + "step": 2063 + }, + { + "epoch": 128.9375, + "loss": 0.0004924352397210896, + "loss_ce": 0.00016055656305979937, + "loss_xval": 0.000331878662109375, + "num_input_tokens_seen": 142752192, + "step": 2063 + }, + { + "epoch": 129.0, + "grad_norm": 2.440647575670156, + "learning_rate": 5e-05, + "loss": 0.0019, + "num_input_tokens_seen": 142811136, + "step": 2064 + }, + { + "epoch": 129.0, + "loss": 0.0016980627551674843, + "loss_ce": 0.00018744260887615383, + "loss_xval": 0.0015106201171875, + "num_input_tokens_seen": 142811136, + "step": 2064 + }, + { + "epoch": 129.0625, + "grad_norm": 1.5520947196070725, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 142882752, + "step": 2065 + }, + { + "epoch": 129.0625, + "loss": 0.0009497711434960365, + "loss_ce": 0.00021353457123041153, + "loss_xval": 0.000736236572265625, + "num_input_tokens_seen": 142882752, + "step": 2065 + }, + { + "epoch": 129.125, + "grad_norm": 0.6450613769787013, + "learning_rate": 5e-05, + "loss": 
0.0005, + "num_input_tokens_seen": 142954432, + "step": 2066 + }, + { + "epoch": 129.125, + "loss": 0.00037603528471663594, + "loss_ce": 0.00021677168842870742, + "loss_xval": 0.00015926361083984375, + "num_input_tokens_seen": 142954432, + "step": 2066 + }, + { + "epoch": 129.1875, + "grad_norm": 2.277810195811712, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_input_tokens_seen": 143026048, + "step": 2067 + }, + { + "epoch": 129.1875, + "loss": 0.001686818664893508, + "loss_ce": 0.00021434557856991887, + "loss_xval": 0.00147247314453125, + "num_input_tokens_seen": 143026048, + "step": 2067 + }, + { + "epoch": 129.25, + "grad_norm": 2.140687373653537, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_input_tokens_seen": 143085056, + "step": 2068 + }, + { + "epoch": 129.25, + "loss": 0.0012608192628249526, + "loss_ce": 0.0002308510447619483, + "loss_xval": 0.00102996826171875, + "num_input_tokens_seen": 143085056, + "step": 2068 + }, + { + "epoch": 129.3125, + "grad_norm": 0.5652085826944148, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 143156800, + "step": 2069 + }, + { + "epoch": 129.3125, + "loss": 0.0005040373071096838, + "loss_ce": 0.00022937910398468375, + "loss_xval": 0.000274658203125, + "num_input_tokens_seen": 143156800, + "step": 2069 + }, + { + "epoch": 129.375, + "grad_norm": 1.2559795587036462, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 143228352, + "step": 2070 + }, + { + "epoch": 129.375, + "loss": 0.000903485284652561, + "loss_ce": 0.00022065445955377072, + "loss_xval": 0.000682830810546875, + "num_input_tokens_seen": 143228352, + "step": 2070 + }, + { + "epoch": 129.4375, + "grad_norm": 2.088837487238198, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_input_tokens_seen": 143287360, + "step": 2071 + }, + { + "epoch": 129.4375, + "loss": 0.0016689603216946125, + "loss_ce": 0.00022700471163261682, + "loss_xval": 0.00144195556640625, + "num_input_tokens_seen": 143287360, + "step": 2071 + }, + { + "epoch": 129.5, + "grad_norm": 1.6954721292442427, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 143359168, + "step": 2072 + }, + { + "epoch": 129.5, + "loss": 0.0010234940564259887, + "loss_ce": 0.000226222284254618, + "loss_xval": 0.000797271728515625, + "num_input_tokens_seen": 143359168, + "step": 2072 + }, + { + "epoch": 129.5625, + "grad_norm": 0.39508711281801384, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 143430976, + "step": 2073 + }, + { + "epoch": 129.5625, + "loss": 0.0003998870379291475, + "loss_ce": 0.00019961541693191975, + "loss_xval": 0.0002002716064453125, + "num_input_tokens_seen": 143430976, + "step": 2073 + }, + { + "epoch": 129.625, + "grad_norm": 0.9832453768116473, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 143490112, + "step": 2074 + }, + { + "epoch": 129.625, + "loss": 0.000548589276149869, + "loss_ce": 0.00017284158093389124, + "loss_xval": 0.0003757476806640625, + "num_input_tokens_seen": 143490112, + "step": 2074 + }, + { + "epoch": 129.6875, + "grad_norm": 1.47372069184567, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 143549120, + "step": 2075 + }, + { + "epoch": 129.6875, + "loss": 0.0007509666029363871, + "loss_ce": 0.00017113260400947183, + "loss_xval": 0.000579833984375, + "num_input_tokens_seen": 143549120, + "step": 2075 + }, + { + "epoch": 129.75, + "grad_norm": 0.8547622412274952, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 143595712, + "step": 2076 + }, 
+ { + "epoch": 129.75, + "loss": 0.00036665465449914336, + "loss_ce": 0.00014254120469558984, + "loss_xval": 0.00022411346435546875, + "num_input_tokens_seen": 143595712, + "step": 2076 + }, + { + "epoch": 129.8125, + "grad_norm": 0.44202400008651155, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 143667328, + "step": 2077 + }, + { + "epoch": 129.8125, + "loss": 0.00024111934180837125, + "loss_ce": 0.00013478465552907437, + "loss_xval": 0.00010633468627929688, + "num_input_tokens_seen": 143667328, + "step": 2077 + }, + { + "epoch": 129.875, + "grad_norm": 1.6873056781973697, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 143726464, + "step": 2078 + }, + { + "epoch": 129.875, + "loss": 0.001029672333970666, + "loss_ce": 0.00012177436292404309, + "loss_xval": 0.00090789794921875, + "num_input_tokens_seen": 143726464, + "step": 2078 + }, + { + "epoch": 129.9375, + "grad_norm": 2.158001366610662, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 143798080, + "step": 2079 + }, + { + "epoch": 129.9375, + "loss": 0.0013578996295109391, + "loss_ce": 0.00011430833546910435, + "loss_xval": 0.00124359130859375, + "num_input_tokens_seen": 143798080, + "step": 2079 + }, + { + "epoch": 130.0, + "grad_norm": 1.5856936274924123, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 143869760, + "step": 2080 + }, + { + "epoch": 130.0, + "loss": 0.0008600328583270311, + "loss_ce": 0.00010090813157148659, + "loss_xval": 0.000759124755859375, + "num_input_tokens_seen": 143869760, + "step": 2080 + }, + { + "epoch": 130.0625, + "grad_norm": 0.4768186065708803, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 143941440, + "step": 2081 + }, + { + "epoch": 130.0625, + "loss": 0.00021693602320738137, + "loss_ce": 9.295836207456887e-05, + "loss_xval": 0.0001239776611328125, + "num_input_tokens_seen": 143941440, + "step": 2081 + }, + { + "epoch": 130.125, + "grad_norm": 0.7141073631281202, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 144013184, + "step": 2082 + }, + { + "epoch": 130.125, + "loss": 0.0002918837999459356, + "loss_ce": 8.684382191859186e-05, + "loss_xval": 0.00020503997802734375, + "num_input_tokens_seen": 144013184, + "step": 2082 + }, + { + "epoch": 130.1875, + "grad_norm": 1.6396061250062675, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 144084736, + "step": 2083 + }, + { + "epoch": 130.1875, + "loss": 0.0009414113592356443, + "loss_ce": 8.691917901160195e-05, + "loss_xval": 0.0008544921875, + "num_input_tokens_seen": 144084736, + "step": 2083 + }, + { + "epoch": 130.25, + "grad_norm": 2.4179746537506874, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_input_tokens_seen": 144156352, + "step": 2084 + }, + { + "epoch": 130.25, + "loss": 0.0014471329050138593, + "loss_ce": 7.384183845715597e-05, + "loss_xval": 0.001373291015625, + "num_input_tokens_seen": 144156352, + "step": 2084 + }, + { + "epoch": 130.3125, + "grad_norm": 3.316065081632679, + "learning_rate": 5e-05, + "loss": 0.0029, + "num_input_tokens_seen": 144227904, + "step": 2085 + }, + { + "epoch": 130.3125, + "loss": 0.002950825961306691, + "loss_ce": 6.691478483844548e-05, + "loss_xval": 0.0028839111328125, + "num_input_tokens_seen": 144227904, + "step": 2085 + }, + { + "epoch": 130.375, + "grad_norm": 4.422820340803834, + "learning_rate": 5e-05, + "loss": 0.0051, + "num_input_tokens_seen": 144299584, + "step": 2086 + }, + { + "epoch": 130.375, + "loss": 0.005191377829760313, + 
"loss_ce": 6.442474841605872e-05, + "loss_xval": 0.005126953125, + "num_input_tokens_seen": 144299584, + "step": 2086 + }, + { + "epoch": 130.4375, + "grad_norm": 5.654507138086931, + "learning_rate": 5e-05, + "loss": 0.0083, + "num_input_tokens_seen": 144371328, + "step": 2087 + }, + { + "epoch": 130.4375, + "loss": 0.008368929848074913, + "loss_ce": 6.814829976065084e-05, + "loss_xval": 0.00830078125, + "num_input_tokens_seen": 144371328, + "step": 2087 + }, + { + "epoch": 130.5, + "grad_norm": 6.5448532185426105, + "learning_rate": 5e-05, + "loss": 0.011, + "num_input_tokens_seen": 144430464, + "step": 2088 + }, + { + "epoch": 130.5, + "loss": 0.010691306553781033, + "loss_ce": 7.118981739040464e-05, + "loss_xval": 0.0106201171875, + "num_input_tokens_seen": 144430464, + "step": 2088 + }, + { + "epoch": 130.5625, + "grad_norm": 6.274424619511378, + "learning_rate": 5e-05, + "loss": 0.0103, + "num_input_tokens_seen": 144502080, + "step": 2089 + }, + { + "epoch": 130.5625, + "loss": 0.010163553059101105, + "loss_ce": 9.275234333472326e-05, + "loss_xval": 0.01007080078125, + "num_input_tokens_seen": 144502080, + "step": 2089 + }, + { + "epoch": 130.625, + "grad_norm": 4.177287100411339, + "learning_rate": 5e-05, + "loss": 0.0047, + "num_input_tokens_seen": 144561152, + "step": 2090 + }, + { + "epoch": 130.625, + "loss": 0.004846962168812752, + "loss_ce": 0.00011673758126562461, + "loss_xval": 0.004730224609375, + "num_input_tokens_seen": 144561152, + "step": 2090 + }, + { + "epoch": 130.6875, + "grad_norm": 0.49295514443893185, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 144620288, + "step": 2091 + }, + { + "epoch": 130.6875, + "loss": 0.00044583145063370466, + "loss_ce": 0.00014256300346460193, + "loss_xval": 0.0003032684326171875, + "num_input_tokens_seen": 144620288, + "step": 2091 + }, + { + "epoch": 130.75, + "grad_norm": 3.0607890432683558, + "learning_rate": 5e-05, + "loss": 0.0029, + "num_input_tokens_seen": 144691904, + "step": 2092 + }, + { + "epoch": 130.75, + "loss": 0.0030132883694022894, + "loss_ce": 0.00017515364743303508, + "loss_xval": 0.002838134765625, + "num_input_tokens_seen": 144691904, + "step": 2092 + }, + { + "epoch": 130.8125, + "grad_norm": 5.011675862355414, + "learning_rate": 5e-05, + "loss": 0.007, + "num_input_tokens_seen": 144763456, + "step": 2093 + }, + { + "epoch": 130.8125, + "loss": 0.007125661708414555, + "loss_ce": 0.0002286891103722155, + "loss_xval": 0.00689697265625, + "num_input_tokens_seen": 144763456, + "step": 2093 + }, + { + "epoch": 130.875, + "grad_norm": 4.665674019487356, + "learning_rate": 5e-05, + "loss": 0.0061, + "num_input_tokens_seen": 144835072, + "step": 2094 + }, + { + "epoch": 130.875, + "loss": 0.006237682420760393, + "loss_ce": 0.00022571970475837588, + "loss_xval": 0.006011962890625, + "num_input_tokens_seen": 144835072, + "step": 2094 + }, + { + "epoch": 130.9375, + "grad_norm": 2.327435721813015, + "learning_rate": 5e-05, + "loss": 0.0021, + "num_input_tokens_seen": 144906752, + "step": 2095 + }, + { + "epoch": 130.9375, + "loss": 0.001996598206460476, + "loss_ce": 0.00027235501329414546, + "loss_xval": 0.0017242431640625, + "num_input_tokens_seen": 144906752, + "step": 2095 + }, + { + "epoch": 131.0, + "grad_norm": 0.7726657536137062, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 144953280, + "step": 2096 + }, + { + "epoch": 131.0, + "loss": 0.0011610279325395823, + "loss_ce": 0.00027983286418020725, + "loss_xval": 0.000881195068359375, + "num_input_tokens_seen": 
144953280, + "step": 2096 + }, + { + "epoch": 131.0625, + "grad_norm": 3.181122525381395, + "learning_rate": 5e-05, + "loss": 0.0032, + "num_input_tokens_seen": 145024832, + "step": 2097 + }, + { + "epoch": 131.0625, + "loss": 0.0032270171213895082, + "loss_ce": 0.00028207077411934733, + "loss_xval": 0.0029449462890625, + "num_input_tokens_seen": 145024832, + "step": 2097 + }, + { + "epoch": 131.125, + "grad_norm": 3.6210500111995345, + "learning_rate": 5e-05, + "loss": 0.004, + "num_input_tokens_seen": 145096448, + "step": 2098 + }, + { + "epoch": 131.125, + "loss": 0.003765873610973358, + "loss_ce": 0.00028686958830803633, + "loss_xval": 0.00347900390625, + "num_input_tokens_seen": 145096448, + "step": 2098 + }, + { + "epoch": 131.1875, + "grad_norm": 2.1818267846684427, + "learning_rate": 5e-05, + "loss": 0.0019, + "num_input_tokens_seen": 145168128, + "step": 2099 + }, + { + "epoch": 131.1875, + "loss": 0.0021016704849898815, + "loss_ce": 0.0002782451338134706, + "loss_xval": 0.00182342529296875, + "num_input_tokens_seen": 145168128, + "step": 2099 + }, + { + "epoch": 131.25, + "grad_norm": 0.09386964420034875, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 145239744, + "step": 2100 + }, + { + "epoch": 131.25, + "loss": 0.0003963925701100379, + "loss_ce": 0.00025048039969988167, + "loss_xval": 0.00014591217041015625, + "num_input_tokens_seen": 145239744, + "step": 2100 + }, + { + "epoch": 131.3125, + "grad_norm": 2.1667057386452147, + "learning_rate": 5e-05, + "loss": 0.0016, + "num_input_tokens_seen": 145298816, + "step": 2101 + }, + { + "epoch": 131.3125, + "loss": 0.001640781294554472, + "loss_ce": 0.00023697275901213288, + "loss_xval": 0.00140380859375, + "num_input_tokens_seen": 145298816, + "step": 2101 + }, + { + "epoch": 131.375, + "grad_norm": 3.06446925765169, + "learning_rate": 5e-05, + "loss": 0.0029, + "num_input_tokens_seen": 145370560, + "step": 2102 + }, + { + "epoch": 131.375, + "loss": 0.002880053361877799, + "loss_ce": 0.00022502410865854472, + "loss_xval": 0.002655029296875, + "num_input_tokens_seen": 145370560, + "step": 2102 + }, + { + "epoch": 131.4375, + "grad_norm": 2.4814137009230643, + "learning_rate": 5e-05, + "loss": 0.002, + "num_input_tokens_seen": 145442240, + "step": 2103 + }, + { + "epoch": 131.4375, + "loss": 0.00198269821703434, + "loss_ce": 0.0001897904003271833, + "loss_xval": 0.00179290771484375, + "num_input_tokens_seen": 145442240, + "step": 2103 + }, + { + "epoch": 131.5, + "grad_norm": 0.8091662677058721, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 145513856, + "step": 2104 + }, + { + "epoch": 131.5, + "loss": 0.000518153072334826, + "loss_ce": 0.00016910828708205372, + "loss_xval": 0.0003490447998046875, + "num_input_tokens_seen": 145513856, + "step": 2104 + }, + { + "epoch": 131.5625, + "grad_norm": 0.9711896831709398, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 145585408, + "step": 2105 + }, + { + "epoch": 131.5625, + "loss": 0.000512247032020241, + "loss_ce": 0.0001593875203980133, + "loss_xval": 0.0003528594970703125, + "num_input_tokens_seen": 145585408, + "step": 2105 + }, + { + "epoch": 131.625, + "grad_norm": 2.0220649876272074, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 145657152, + "step": 2106 + }, + { + "epoch": 131.625, + "loss": 0.0014408582355827093, + "loss_ce": 0.00013623179984278977, + "loss_xval": 0.00130462646484375, + "num_input_tokens_seen": 145657152, + "step": 2106 + }, + { + "epoch": 131.6875, + "grad_norm": 
2.1693323682100005, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 145728704, + "step": 2107 + }, + { + "epoch": 131.6875, + "loss": 0.0013570053270086646, + "loss_ce": 0.00013630217290483415, + "loss_xval": 0.001220703125, + "num_input_tokens_seen": 145728704, + "step": 2107 + }, + { + "epoch": 131.75, + "grad_norm": 1.9971593096020999, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_input_tokens_seen": 145800384, + "step": 2108 + }, + { + "epoch": 131.75, + "loss": 0.0013030003756284714, + "loss_ce": 0.0001128148433053866, + "loss_xval": 0.001190185546875, + "num_input_tokens_seen": 145800384, + "step": 2108 + }, + { + "epoch": 131.8125, + "grad_norm": 1.9509462926255905, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 145872064, + "step": 2109 + }, + { + "epoch": 131.8125, + "loss": 0.0012032217346131802, + "loss_ce": 0.00011221828026464209, + "loss_xval": 0.00109100341796875, + "num_input_tokens_seen": 145872064, + "step": 2109 + }, + { + "epoch": 131.875, + "grad_norm": 2.164767534194291, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 145943808, + "step": 2110 + }, + { + "epoch": 131.875, + "loss": 0.0012747022556141019, + "loss_ce": 9.977544686989859e-05, + "loss_xval": 0.0011749267578125, + "num_input_tokens_seen": 145943808, + "step": 2110 + }, + { + "epoch": 131.9375, + "grad_norm": 2.577591041548048, + "learning_rate": 5e-05, + "loss": 0.0019, + "num_input_tokens_seen": 146015360, + "step": 2111 + }, + { + "epoch": 131.9375, + "loss": 0.0018587567610666156, + "loss_ce": 9.636659524403512e-05, + "loss_xval": 0.00176239013671875, + "num_input_tokens_seen": 146015360, + "step": 2111 + }, + { + "epoch": 132.0, + "grad_norm": 2.897437033309384, + "learning_rate": 5e-05, + "loss": 0.0023, + "num_input_tokens_seen": 146086976, + "step": 2112 + }, + { + "epoch": 132.0, + "loss": 0.002194196917116642, + "loss_ce": 8.848396828398108e-05, + "loss_xval": 0.002105712890625, + "num_input_tokens_seen": 146086976, + "step": 2112 + }, + { + "epoch": 132.0625, + "grad_norm": 3.1496740804235337, + "learning_rate": 5e-05, + "loss": 0.0028, + "num_input_tokens_seen": 146158592, + "step": 2113 + }, + { + "epoch": 132.0625, + "loss": 0.0026031185407191515, + "loss_ce": 0.00010067722178064287, + "loss_xval": 0.00250244140625, + "num_input_tokens_seen": 146158592, + "step": 2113 + }, + { + "epoch": 132.125, + "grad_norm": 3.213446574083063, + "learning_rate": 5e-05, + "loss": 0.0028, + "num_input_tokens_seen": 146230144, + "step": 2114 + }, + { + "epoch": 132.125, + "loss": 0.0028038991149514914, + "loss_ce": 8.783464727457613e-05, + "loss_xval": 0.002716064453125, + "num_input_tokens_seen": 146230144, + "step": 2114 + }, + { + "epoch": 132.1875, + "grad_norm": 3.317823485630597, + "learning_rate": 5e-05, + "loss": 0.0031, + "num_input_tokens_seen": 146301888, + "step": 2115 + }, + { + "epoch": 132.1875, + "loss": 0.002983665093779564, + "loss_ce": 9.97540118987672e-05, + "loss_xval": 0.0028839111328125, + "num_input_tokens_seen": 146301888, + "step": 2115 + }, + { + "epoch": 132.25, + "grad_norm": 3.2938515342402424, + "learning_rate": 5e-05, + "loss": 0.003, + "num_input_tokens_seen": 146373568, + "step": 2116 + }, + { + "epoch": 132.25, + "loss": 0.0033804993145167828, + "loss_ce": 9.985957149183378e-05, + "loss_xval": 0.0032806396484375, + "num_input_tokens_seen": 146373568, + "step": 2116 + }, + { + "epoch": 132.3125, + "grad_norm": 2.94541207651058, + "learning_rate": 5e-05, + "loss": 0.0025, + "num_input_tokens_seen": 
146445248, + "step": 2117 + }, + { + "epoch": 132.3125, + "loss": 0.0024823653511703014, + "loss_ce": 0.00011725301010301337, + "loss_xval": 0.0023651123046875, + "num_input_tokens_seen": 146445248, + "step": 2117 + }, + { + "epoch": 132.375, + "grad_norm": 2.4504378031553036, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 146516928, + "step": 2118 + }, + { + "epoch": 132.375, + "loss": 0.0016645942814648151, + "loss_ce": 0.00012345658615231514, + "loss_xval": 0.0015411376953125, + "num_input_tokens_seen": 146516928, + "step": 2118 + }, + { + "epoch": 132.4375, + "grad_norm": 1.5718108744843646, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 146576128, + "step": 2119 + }, + { + "epoch": 132.4375, + "loss": 0.0009304281556978822, + "loss_ce": 0.00013315639807842672, + "loss_xval": 0.000797271728515625, + "num_input_tokens_seen": 146576128, + "step": 2119 + }, + { + "epoch": 132.5, + "grad_norm": 0.4789646132744658, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 146647936, + "step": 2120 + }, + { + "epoch": 132.5, + "loss": 0.0004606024594977498, + "loss_ce": 0.00014970461779739708, + "loss_xval": 0.0003108978271484375, + "num_input_tokens_seen": 146647936, + "step": 2120 + }, + { + "epoch": 132.5625, + "grad_norm": 0.6269730419291779, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 146707136, + "step": 2121 + }, + { + "epoch": 132.5625, + "loss": 0.0003991887788288295, + "loss_ce": 0.0001493260933784768, + "loss_xval": 0.0002498626708984375, + "num_input_tokens_seen": 146707136, + "step": 2121 + }, + { + "epoch": 132.625, + "grad_norm": 1.7977778352649605, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 146778816, + "step": 2122 + }, + { + "epoch": 132.625, + "loss": 0.0011801546206697822, + "loss_ce": 0.0001425569789716974, + "loss_xval": 0.00103759765625, + "num_input_tokens_seen": 146778816, + "step": 2122 + }, + { + "epoch": 132.6875, + "grad_norm": 3.1212230939689753, + "learning_rate": 5e-05, + "loss": 0.0029, + "num_input_tokens_seen": 146850432, + "step": 2123 + }, + { + "epoch": 132.6875, + "loss": 0.0028741902206093073, + "loss_ce": 0.00015812575293239206, + "loss_xval": 0.002716064453125, + "num_input_tokens_seen": 146850432, + "step": 2123 + }, + { + "epoch": 132.75, + "grad_norm": 4.587900160641939, + "learning_rate": 5e-05, + "loss": 0.0059, + "num_input_tokens_seen": 146922048, + "step": 2124 + }, + { + "epoch": 132.75, + "loss": 0.005815291311591864, + "loss_ce": 0.00013902169303037226, + "loss_xval": 0.00567626953125, + "num_input_tokens_seen": 146922048, + "step": 2124 + }, + { + "epoch": 132.8125, + "grad_norm": 5.973389673402774, + "learning_rate": 5e-05, + "loss": 0.0098, + "num_input_tokens_seen": 146993664, + "step": 2125 + }, + { + "epoch": 132.8125, + "loss": 0.009735052473843098, + "loss_ce": 0.00015253292804118246, + "loss_xval": 0.00958251953125, + "num_input_tokens_seen": 146993664, + "step": 2125 + }, + { + "epoch": 132.875, + "grad_norm": 6.805012275520879, + "learning_rate": 5e-05, + "loss": 0.013, + "num_input_tokens_seen": 147065344, + "step": 2126 + }, + { + "epoch": 132.875, + "loss": 0.012403740547597408, + "loss_ce": 0.0001356737338937819, + "loss_xval": 0.01226806640625, + "num_input_tokens_seen": 147065344, + "step": 2126 + }, + { + "epoch": 132.9375, + "grad_norm": 6.354336216661166, + "learning_rate": 5e-05, + "loss": 0.0112, + "num_input_tokens_seen": 147136960, + "step": 2127 + }, + { + "epoch": 132.9375, + "loss": 
0.010768568143248558, + "loss_ce": 0.00014845086843706667, + "loss_xval": 0.0106201171875, + "num_input_tokens_seen": 147136960, + "step": 2127 + }, + { + "epoch": 133.0, + "grad_norm": 4.233638122891625, + "learning_rate": 5e-05, + "loss": 0.0051, + "num_input_tokens_seen": 147208576, + "step": 2128 + }, + { + "epoch": 133.0, + "loss": 0.005150423850864172, + "loss_ce": 0.00017605879111215472, + "loss_xval": 0.004974365234375, + "num_input_tokens_seen": 147208576, + "step": 2128 + }, + { + "epoch": 133.0625, + "grad_norm": 1.0086102085646704, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 147280192, + "step": 2129 + }, + { + "epoch": 133.0625, + "loss": 0.0004604866844601929, + "loss_ce": 0.0001972725585801527, + "loss_xval": 0.000263214111328125, + "num_input_tokens_seen": 147280192, + "step": 2129 + }, + { + "epoch": 133.125, + "grad_norm": 2.2402200838579125, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_input_tokens_seen": 147351808, + "step": 2130 + }, + { + "epoch": 133.125, + "loss": 0.0016773329116404057, + "loss_ce": 0.0002048597380053252, + "loss_xval": 0.00147247314453125, + "num_input_tokens_seen": 147351808, + "step": 2130 + }, + { + "epoch": 133.1875, + "grad_norm": 4.178564998809066, + "learning_rate": 5e-05, + "loss": 0.0051, + "num_input_tokens_seen": 147411008, + "step": 2131 + }, + { + "epoch": 133.1875, + "loss": 0.0049562654457986355, + "loss_ce": 0.00022604072000831366, + "loss_xval": 0.004730224609375, + "num_input_tokens_seen": 147411008, + "step": 2131 + }, + { + "epoch": 133.25, + "grad_norm": 4.097662993556067, + "learning_rate": 5e-05, + "loss": 0.0052, + "num_input_tokens_seen": 147482688, + "step": 2132 + }, + { + "epoch": 133.25, + "loss": 0.005465524271130562, + "loss_ce": 0.00021650105190929025, + "loss_xval": 0.0052490234375, + "num_input_tokens_seen": 147482688, + "step": 2132 + }, + { + "epoch": 133.3125, + "grad_norm": 2.351868917270304, + "learning_rate": 5e-05, + "loss": 0.002, + "num_input_tokens_seen": 147554368, + "step": 2133 + }, + { + "epoch": 133.3125, + "loss": 0.0020998907275497913, + "loss_ce": 0.00024594776914454997, + "loss_xval": 0.00185394287109375, + "num_input_tokens_seen": 147554368, + "step": 2133 + }, + { + "epoch": 133.375, + "grad_norm": 0.09088466157172155, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 147626048, + "step": 2134 + }, + { + "epoch": 133.375, + "loss": 0.000461443851236254, + "loss_ce": 0.0002697553136385977, + "loss_xval": 0.00019168853759765625, + "num_input_tokens_seen": 147626048, + "step": 2134 + }, + { + "epoch": 133.4375, + "grad_norm": 2.034847356599081, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_input_tokens_seen": 147697664, + "step": 2135 + }, + { + "epoch": 133.4375, + "loss": 0.0017401942750439048, + "loss_ce": 0.00028297994867898524, + "loss_xval": 0.00145721435546875, + "num_input_tokens_seen": 147697664, + "step": 2135 + }, + { + "epoch": 133.5, + "grad_norm": 2.841930809682247, + "learning_rate": 5e-05, + "loss": 0.0026, + "num_input_tokens_seen": 147769344, + "step": 2136 + }, + { + "epoch": 133.5, + "loss": 0.002519955625757575, + "loss_ce": 0.0002769135753624141, + "loss_xval": 0.0022430419921875, + "num_input_tokens_seen": 147769344, + "step": 2136 + }, + { + "epoch": 133.5625, + "grad_norm": 2.349514500940194, + "learning_rate": 5e-05, + "loss": 0.002, + "num_input_tokens_seen": 147841024, + "step": 2137 + }, + { + "epoch": 133.5625, + "loss": 0.002035383600741625, + "loss_ce": 0.0002577346167527139, + "loss_xval": 
0.00177764892578125, + "num_input_tokens_seen": 147841024, + "step": 2137 + }, + { + "epoch": 133.625, + "grad_norm": 0.7448406078444589, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 147912704, + "step": 2138 + }, + { + "epoch": 133.625, + "loss": 0.0006271991296671331, + "loss_ce": 0.0002381000085733831, + "loss_xval": 0.00038909912109375, + "num_input_tokens_seen": 147912704, + "step": 2138 + }, + { + "epoch": 133.6875, + "grad_norm": 1.4929476874380203, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 147971776, + "step": 2139 + }, + { + "epoch": 133.6875, + "loss": 0.0014256933936849236, + "loss_ce": 0.00020499022502917796, + "loss_xval": 0.001220703125, + "num_input_tokens_seen": 147971776, + "step": 2139 + }, + { + "epoch": 133.75, + "grad_norm": 3.5753967779590026, + "learning_rate": 5e-05, + "loss": 0.0038, + "num_input_tokens_seen": 148043520, + "step": 2140 + }, + { + "epoch": 133.75, + "loss": 0.0038607781752943993, + "loss_ce": 0.00018340993847232312, + "loss_xval": 0.0036773681640625, + "num_input_tokens_seen": 148043520, + "step": 2140 + }, + { + "epoch": 133.8125, + "grad_norm": 4.669850830222451, + "learning_rate": 5e-05, + "loss": 0.0062, + "num_input_tokens_seen": 148102592, + "step": 2141 + }, + { + "epoch": 133.8125, + "loss": 0.006373754236847162, + "loss_ce": 0.00017868586292024702, + "loss_xval": 0.006195068359375, + "num_input_tokens_seen": 148102592, + "step": 2141 + }, + { + "epoch": 133.875, + "grad_norm": 4.273147287943639, + "learning_rate": 5e-05, + "loss": 0.0052, + "num_input_tokens_seen": 148174208, + "step": 2142 + }, + { + "epoch": 133.875, + "loss": 0.005067807622253895, + "loss_ce": 0.0001544774859212339, + "loss_xval": 0.004913330078125, + "num_input_tokens_seen": 148174208, + "step": 2142 + }, + { + "epoch": 133.9375, + "grad_norm": 2.557900459431935, + "learning_rate": 5e-05, + "loss": 0.0023, + "num_input_tokens_seen": 148245824, + "step": 2143 + }, + { + "epoch": 133.9375, + "loss": 0.002185113960877061, + "loss_ce": 0.00015569495735689998, + "loss_xval": 0.0020294189453125, + "num_input_tokens_seen": 148245824, + "step": 2143 + }, + { + "epoch": 134.0, + "grad_norm": 0.1244516336618763, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 148317632, + "step": 2144 + }, + { + "epoch": 134.0, + "loss": 0.00032432418083772063, + "loss_ce": 0.0001583848352311179, + "loss_xval": 0.0001659393310546875, + "num_input_tokens_seen": 148317632, + "step": 2144 + }, + { + "epoch": 134.0625, + "grad_norm": 2.3715035560448148, + "learning_rate": 5e-05, + "loss": 0.002, + "num_input_tokens_seen": 148389312, + "step": 2145 + }, + { + "epoch": 134.0625, + "loss": 0.001994270598515868, + "loss_ce": 0.00015558644372504205, + "loss_xval": 0.00183868408203125, + "num_input_tokens_seen": 148389312, + "step": 2145 + }, + { + "epoch": 134.125, + "grad_norm": 4.106114568684724, + "learning_rate": 5e-05, + "loss": 0.0051, + "num_input_tokens_seen": 148461056, + "step": 2146 + }, + { + "epoch": 134.125, + "loss": 0.005479307379573584, + "loss_ce": 0.00013873119314666837, + "loss_xval": 0.005340576171875, + "num_input_tokens_seen": 148461056, + "step": 2146 + }, + { + "epoch": 134.1875, + "grad_norm": 4.820552638251836, + "learning_rate": 5e-05, + "loss": 0.0066, + "num_input_tokens_seen": 148532736, + "step": 2147 + }, + { + "epoch": 134.1875, + "loss": 0.006738653406500816, + "loss_ce": 0.00014685651694890112, + "loss_xval": 0.006591796875, + "num_input_tokens_seen": 148532736, + "step": 2147 + }, + { + 
"epoch": 134.25, + "grad_norm": 4.291935062772253, + "learning_rate": 5e-05, + "loss": 0.0052, + "num_input_tokens_seen": 148604480, + "step": 2148 + }, + { + "epoch": 134.25, + "loss": 0.005339866038411856, + "loss_ce": 0.00015187790268100798, + "loss_xval": 0.00518798828125, + "num_input_tokens_seen": 148604480, + "step": 2148 + }, + { + "epoch": 134.3125, + "grad_norm": 2.494981976416252, + "learning_rate": 5e-05, + "loss": 0.002, + "num_input_tokens_seen": 148676160, + "step": 2149 + }, + { + "epoch": 134.3125, + "loss": 0.0019246542360633612, + "loss_ce": 0.00016989343566820025, + "loss_xval": 0.0017547607421875, + "num_input_tokens_seen": 148676160, + "step": 2149 + }, + { + "epoch": 134.375, + "grad_norm": 0.22830845799667687, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 148747840, + "step": 2150 + }, + { + "epoch": 134.375, + "loss": 0.00036656681913882494, + "loss_ce": 0.0001806003274396062, + "loss_xval": 0.00018596649169921875, + "num_input_tokens_seen": 148747840, + "step": 2150 + }, + { + "epoch": 134.4375, + "grad_norm": 2.847110043325801, + "learning_rate": 5e-05, + "loss": 0.0025, + "num_input_tokens_seen": 148806976, + "step": 2151 + }, + { + "epoch": 134.4375, + "loss": 0.0023445640690624714, + "loss_ce": 0.0001625571894692257, + "loss_xval": 0.0021820068359375, + "num_input_tokens_seen": 148806976, + "step": 2151 + }, + { + "epoch": 134.5, + "grad_norm": 4.3579190454001875, + "learning_rate": 5e-05, + "loss": 0.0055, + "num_input_tokens_seen": 148878784, + "step": 2152 + }, + { + "epoch": 134.5, + "loss": 0.005705356132239103, + "loss_ce": 0.000181674535269849, + "loss_xval": 0.005523681640625, + "num_input_tokens_seen": 148878784, + "step": 2152 + }, + { + "epoch": 134.5625, + "grad_norm": 4.161126822103258, + "learning_rate": 5e-05, + "loss": 0.0051, + "num_input_tokens_seen": 148950400, + "step": 2153 + }, + { + "epoch": 134.5625, + "loss": 0.004884625319391489, + "loss_ce": 0.00015440078277606517, + "loss_xval": 0.004730224609375, + "num_input_tokens_seen": 148950400, + "step": 2153 + }, + { + "epoch": 134.625, + "grad_norm": 2.6864583102223, + "learning_rate": 5e-05, + "loss": 0.0023, + "num_input_tokens_seen": 149022080, + "step": 2154 + }, + { + "epoch": 134.625, + "loss": 0.002241534413769841, + "loss_ce": 0.00016633901395834982, + "loss_xval": 0.0020751953125, + "num_input_tokens_seen": 149022080, + "step": 2154 + }, + { + "epoch": 134.6875, + "grad_norm": 0.5679632082483581, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 149081216, + "step": 2155 + }, + { + "epoch": 134.6875, + "loss": 0.00033710565185174346, + "loss_ce": 0.00016639796376693994, + "loss_xval": 0.00017070770263671875, + "num_input_tokens_seen": 149081216, + "step": 2155 + }, + { + "epoch": 134.75, + "grad_norm": 1.6203994046512498, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 149140352, + "step": 2156 + }, + { + "epoch": 134.75, + "loss": 0.001092609716579318, + "loss_ce": 0.00015037944831419736, + "loss_xval": 0.000942230224609375, + "num_input_tokens_seen": 149140352, + "step": 2156 + }, + { + "epoch": 134.8125, + "grad_norm": 3.2646001056365086, + "learning_rate": 5e-05, + "loss": 0.0031, + "num_input_tokens_seen": 149199552, + "step": 2157 + }, + { + "epoch": 134.8125, + "loss": 0.003142598317936063, + "loss_ce": 0.0001366168726235628, + "loss_xval": 0.0030059814453125, + "num_input_tokens_seen": 149199552, + "step": 2157 + }, + { + "epoch": 134.875, + "grad_norm": 3.9872981802680254, + "learning_rate": 5e-05, + 
"loss": 0.0045, + "num_input_tokens_seen": 149271360, + "step": 2158 + }, + { + "epoch": 134.875, + "loss": 0.004656321369111538, + "loss_ce": 0.00013972001033835113, + "loss_xval": 0.0045166015625, + "num_input_tokens_seen": 149271360, + "step": 2158 + }, + { + "epoch": 134.9375, + "grad_norm": 3.9922012140639787, + "learning_rate": 5e-05, + "loss": 0.0045, + "num_input_tokens_seen": 149343168, + "step": 2159 + }, + { + "epoch": 134.9375, + "loss": 0.004001868888735771, + "loss_ce": 0.00012613640865311027, + "loss_xval": 0.003875732421875, + "num_input_tokens_seen": 149343168, + "step": 2159 + }, + { + "epoch": 135.0, + "grad_norm": 3.2265108874032564, + "learning_rate": 5e-05, + "loss": 0.0031, + "num_input_tokens_seen": 149402240, + "step": 2160 + }, + { + "epoch": 135.0, + "loss": 0.0029423029627650976, + "loss_ce": 0.00014994456432759762, + "loss_xval": 0.0027923583984375, + "num_input_tokens_seen": 149402240, + "step": 2160 + }, + { + "epoch": 135.0625, + "grad_norm": 1.7072210926221831, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_input_tokens_seen": 149473792, + "step": 2161 + }, + { + "epoch": 135.0625, + "loss": 0.0012776124058291316, + "loss_ce": 0.00013320324069354683, + "loss_xval": 0.0011444091796875, + "num_input_tokens_seen": 149473792, + "step": 2161 + }, + { + "epoch": 135.125, + "grad_norm": 0.1241373477669299, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 149545408, + "step": 2162 + }, + { + "epoch": 135.125, + "loss": 0.00028155322070233524, + "loss_ce": 0.00015185351367108524, + "loss_xval": 0.00012969970703125, + "num_input_tokens_seen": 149545408, + "step": 2162 + }, + { + "epoch": 135.1875, + "grad_norm": 1.903590505377728, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 149616960, + "step": 2163 + }, + { + "epoch": 135.1875, + "loss": 0.0014339193003252149, + "loss_ce": 0.0001521810336271301, + "loss_xval": 0.00128173828125, + "num_input_tokens_seen": 149616960, + "step": 2163 + }, + { + "epoch": 135.25, + "grad_norm": 3.573030706002393, + "learning_rate": 5e-05, + "loss": 0.0043, + "num_input_tokens_seen": 149688640, + "step": 2164 + }, + { + "epoch": 135.25, + "loss": 0.003992053214460611, + "loss_ce": 0.0001468385016778484, + "loss_xval": 0.00384521484375, + "num_input_tokens_seen": 149688640, + "step": 2164 + }, + { + "epoch": 135.3125, + "grad_norm": 4.742522445976156, + "learning_rate": 5e-05, + "loss": 0.0067, + "num_input_tokens_seen": 149760192, + "step": 2165 + }, + { + "epoch": 135.3125, + "loss": 0.006116765085607767, + "loss_ce": 0.00013531999138649553, + "loss_xval": 0.0059814453125, + "num_input_tokens_seen": 149760192, + "step": 2165 + }, + { + "epoch": 135.375, + "grad_norm": 4.418326558097359, + "learning_rate": 5e-05, + "loss": 0.0074, + "num_input_tokens_seen": 149819456, + "step": 2166 + }, + { + "epoch": 135.375, + "loss": 0.007084544748067856, + "loss_ce": 0.00012653697922360152, + "loss_xval": 0.0069580078125, + "num_input_tokens_seen": 149819456, + "step": 2166 + }, + { + "epoch": 135.4375, + "grad_norm": 2.5258406225652026, + "learning_rate": 5e-05, + "loss": 0.0027, + "num_input_tokens_seen": 149891072, + "step": 2167 + }, + { + "epoch": 135.4375, + "loss": 0.0028270725160837173, + "loss_ce": 0.0001567845029057935, + "loss_xval": 0.0026702880859375, + "num_input_tokens_seen": 149891072, + "step": 2167 + }, + { + "epoch": 135.5, + "grad_norm": 0.43194665823243344, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 149950336, + "step": 2168 + }, + { + "epoch": 135.5, 
+ "loss": 0.0009768784511834383, + "loss_ce": 0.00015671852452214807, + "loss_xval": 0.000820159912109375, + "num_input_tokens_seen": 149950336, + "step": 2168 + }, + { + "epoch": 135.5625, + "grad_norm": 4.093186764659218, + "learning_rate": 5e-05, + "loss": 0.0061, + "num_input_tokens_seen": 150022144, + "step": 2169 + }, + { + "epoch": 135.5625, + "loss": 0.00625575752928853, + "loss_ce": 0.00015224194794427603, + "loss_xval": 0.006103515625, + "num_input_tokens_seen": 150022144, + "step": 2169 + }, + { + "epoch": 135.625, + "grad_norm": 7.19927736392235, + "learning_rate": 5e-05, + "loss": 0.0146, + "num_input_tokens_seen": 150093824, + "step": 2170 + }, + { + "epoch": 135.625, + "loss": 0.014271882362663746, + "loss_ce": 0.00017276135622523725, + "loss_xval": 0.01409912109375, + "num_input_tokens_seen": 150093824, + "step": 2170 + }, + { + "epoch": 135.6875, + "grad_norm": 7.1791670497950335, + "learning_rate": 5e-05, + "loss": 0.0179, + "num_input_tokens_seen": 150165504, + "step": 2171 + }, + { + "epoch": 135.6875, + "loss": 0.016541821882128716, + "loss_ce": 0.00018440044368617237, + "loss_xval": 0.016357421875, + "num_input_tokens_seen": 150165504, + "step": 2171 + }, + { + "epoch": 135.75, + "grad_norm": 2.6245323142450956, + "learning_rate": 5e-05, + "loss": 0.004, + "num_input_tokens_seen": 150237120, + "step": 2172 + }, + { + "epoch": 135.75, + "loss": 0.0037419276777654886, + "loss_ce": 0.00024766495334915817, + "loss_xval": 0.0034942626953125, + "num_input_tokens_seen": 150237120, + "step": 2172 + }, + { + "epoch": 135.8125, + "grad_norm": 2.9451622408931923, + "learning_rate": 5e-05, + "loss": 0.003, + "num_input_tokens_seen": 150308736, + "step": 2173 + }, + { + "epoch": 135.8125, + "loss": 0.0027326305862516165, + "loss_ce": 0.00035225943429395556, + "loss_xval": 0.00238037109375, + "num_input_tokens_seen": 150308736, + "step": 2173 + }, + { + "epoch": 135.875, + "grad_norm": 7.313853082086013, + "learning_rate": 5e-05, + "loss": 0.017, + "num_input_tokens_seen": 150367808, + "step": 2174 + }, + { + "epoch": 135.875, + "loss": 0.01791013963520527, + "loss_ce": 0.00045408555888570845, + "loss_xval": 0.0174560546875, + "num_input_tokens_seen": 150367808, + "step": 2174 + }, + { + "epoch": 135.9375, + "grad_norm": 9.56986347418184, + "learning_rate": 5e-05, + "loss": 0.0256, + "num_input_tokens_seen": 150439360, + "step": 2175 + }, + { + "epoch": 135.9375, + "loss": 0.024953246116638184, + "loss_ce": 0.0002950435155071318, + "loss_xval": 0.024658203125, + "num_input_tokens_seen": 150439360, + "step": 2175 + }, + { + "epoch": 136.0, + "grad_norm": 6.600646933001922, + "learning_rate": 5e-05, + "loss": 0.013, + "num_input_tokens_seen": 150510912, + "step": 2176 + }, + { + "epoch": 136.0, + "loss": 0.013693580403923988, + "loss_ce": 0.0002048104361165315, + "loss_xval": 0.01348876953125, + "num_input_tokens_seen": 150510912, + "step": 2176 + }, + { + "epoch": 136.0625, + "grad_norm": 0.6427743574819383, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 150582592, + "step": 2177 + }, + { + "epoch": 136.0625, + "loss": 0.0012838091934099793, + "loss_ce": 0.00023095273354556412, + "loss_xval": 0.0010528564453125, + "num_input_tokens_seen": 150582592, + "step": 2177 + }, + { + "epoch": 136.125, + "grad_norm": 7.175273253244792, + "learning_rate": 5e-05, + "loss": 0.0143, + "num_input_tokens_seen": 150654272, + "step": 2178 + }, + { + "epoch": 136.125, + "loss": 0.01449158787727356, + "loss_ce": 0.00027039687847718596, + "loss_xval": 0.01422119140625, + 
"num_input_tokens_seen": 150654272, + "step": 2178 + }, + { + "epoch": 136.1875, + "grad_norm": 9.420888468605963, + "learning_rate": 5e-05, + "loss": 0.0252, + "num_input_tokens_seen": 150725824, + "step": 2179 + }, + { + "epoch": 136.1875, + "loss": 0.026520689949393272, + "loss_ce": 0.00027557314024306834, + "loss_xval": 0.0262451171875, + "num_input_tokens_seen": 150725824, + "step": 2179 + }, + { + "epoch": 136.25, + "grad_norm": 4.459816136929259, + "learning_rate": 5e-05, + "loss": 0.0068, + "num_input_tokens_seen": 150797376, + "step": 2180 + }, + { + "epoch": 136.25, + "loss": 0.005994915496557951, + "loss_ce": 0.0005322691868059337, + "loss_xval": 0.005462646484375, + "num_input_tokens_seen": 150797376, + "step": 2180 + }, + { + "epoch": 136.3125, + "grad_norm": 0.676377349039084, + "learning_rate": 5e-05, + "loss": 0.002, + "num_input_tokens_seen": 150856448, + "step": 2181 + }, + { + "epoch": 136.3125, + "loss": 0.0020002590026706457, + "loss_ce": 0.0012296901550143957, + "loss_xval": 0.00077056884765625, + "num_input_tokens_seen": 150856448, + "step": 2181 + }, + { + "epoch": 136.375, + "grad_norm": 7.099016476439983, + "learning_rate": 5e-05, + "loss": 0.0182, + "num_input_tokens_seen": 150928064, + "step": 2182 + }, + { + "epoch": 136.375, + "loss": 0.018470434471964836, + "loss_ce": 0.0022350833751261234, + "loss_xval": 0.0162353515625, + "num_input_tokens_seen": 150928064, + "step": 2182 + }, + { + "epoch": 136.4375, + "grad_norm": 16.96350016913768, + "learning_rate": 5e-05, + "loss": 0.0817, + "num_input_tokens_seen": 150999616, + "step": 2183 + }, + { + "epoch": 136.4375, + "loss": 0.08347728848457336, + "loss_ce": 0.0004694793897215277, + "loss_xval": 0.0830078125, + "num_input_tokens_seen": 150999616, + "step": 2183 + }, + { + "epoch": 136.5, + "grad_norm": 12.780990377411227, + "learning_rate": 5e-05, + "loss": 0.0439, + "num_input_tokens_seen": 151071296, + "step": 2184 + }, + { + "epoch": 136.5, + "loss": 0.04217681288719177, + "loss_ce": 0.00018462415027897805, + "loss_xval": 0.0419921875, + "num_input_tokens_seen": 151071296, + "step": 2184 + }, + { + "epoch": 136.5625, + "grad_norm": 4.135673579801127, + "learning_rate": 5e-05, + "loss": 0.0082, + "num_input_tokens_seen": 151142912, + "step": 2185 + }, + { + "epoch": 136.5625, + "loss": 0.008142843842506409, + "loss_ce": 0.0002693082788027823, + "loss_xval": 0.00787353515625, + "num_input_tokens_seen": 151142912, + "step": 2185 + }, + { + "epoch": 136.625, + "grad_norm": 16.228388083310122, + "learning_rate": 5e-05, + "loss": 0.0732, + "num_input_tokens_seen": 151202112, + "step": 2186 + }, + { + "epoch": 136.625, + "loss": 0.06874268501996994, + "loss_ce": 0.0003833128430414945, + "loss_xval": 0.068359375, + "num_input_tokens_seen": 151202112, + "step": 2186 + }, + { + "epoch": 136.6875, + "grad_norm": 16.39062976495596, + "learning_rate": 5e-05, + "loss": 0.0769, + "num_input_tokens_seen": 151261376, + "step": 2187 + }, + { + "epoch": 136.6875, + "loss": 0.07935876399278641, + "loss_ce": 0.0007454807055182755, + "loss_xval": 0.07861328125, + "num_input_tokens_seen": 151261376, + "step": 2187 + }, + { + "epoch": 136.75, + "grad_norm": 3.1005737558272437, + "learning_rate": 5e-05, + "loss": 0.0056, + "num_input_tokens_seen": 151333120, + "step": 2188 + }, + { + "epoch": 136.75, + "loss": 0.00870249792933464, + "loss_ce": 0.0007068926934152842, + "loss_xval": 0.00799560546875, + "num_input_tokens_seen": 151333120, + "step": 2188 + }, + { + "epoch": 136.8125, + "grad_norm": 13.490271293549755, + "learning_rate": 
5e-05, + "loss": 0.0461, + "num_input_tokens_seen": 151404736, + "step": 2189 + }, + { + "epoch": 136.8125, + "loss": 0.04683200642466545, + "loss_ce": 0.000689428299665451, + "loss_xval": 0.046142578125, + "num_input_tokens_seen": 151404736, + "step": 2189 + }, + { + "epoch": 136.875, + "grad_norm": 21.577246738555573, + "learning_rate": 5e-05, + "loss": 0.1245, + "num_input_tokens_seen": 151476352, + "step": 2190 + }, + { + "epoch": 136.875, + "loss": 0.12032702565193176, + "loss_ce": 0.0006981170736253262, + "loss_xval": 0.11962890625, + "num_input_tokens_seen": 151476352, + "step": 2190 + }, + { + "epoch": 136.9375, + "grad_norm": 6.149390085768126, + "learning_rate": 5e-05, + "loss": 0.0113, + "num_input_tokens_seen": 151548096, + "step": 2191 + }, + { + "epoch": 136.9375, + "loss": 0.010628869757056236, + "loss_ce": 0.0005580692086368799, + "loss_xval": 0.01007080078125, + "num_input_tokens_seen": 151548096, + "step": 2191 + }, + { + "epoch": 137.0, + "grad_norm": 13.650308173024213, + "learning_rate": 5e-05, + "loss": 0.052, + "num_input_tokens_seen": 151619648, + "step": 2192 + }, + { + "epoch": 137.0, + "loss": 0.05068320035934448, + "loss_ce": 0.00039023064891807735, + "loss_xval": 0.05029296875, + "num_input_tokens_seen": 151619648, + "step": 2192 + }, + { + "epoch": 137.0625, + "grad_norm": 9.617224665229953, + "learning_rate": 5e-05, + "loss": 0.0281, + "num_input_tokens_seen": 151678784, + "step": 2193 + }, + { + "epoch": 137.0625, + "loss": 0.03268560767173767, + "loss_ce": 0.0002149032079614699, + "loss_xval": 0.032470703125, + "num_input_tokens_seen": 151678784, + "step": 2193 + }, + { + "epoch": 137.125, + "grad_norm": 7.486163953767819, + "learning_rate": 5e-05, + "loss": 0.0159, + "num_input_tokens_seen": 151750336, + "step": 2194 + }, + { + "epoch": 137.125, + "loss": 0.016298949718475342, + "loss_ce": 0.00030773921753279865, + "loss_xval": 0.0159912109375, + "num_input_tokens_seen": 151750336, + "step": 2194 + }, + { + "epoch": 137.1875, + "grad_norm": 10.21288790843152, + "learning_rate": 5e-05, + "loss": 0.0293, + "num_input_tokens_seen": 151821952, + "step": 2195 + }, + { + "epoch": 137.1875, + "loss": 0.02932918071746826, + "loss_ce": 0.0005205870256759226, + "loss_xval": 0.02880859375, + "num_input_tokens_seen": 151821952, + "step": 2195 + }, + { + "epoch": 137.25, + "grad_norm": 3.702340244739552, + "learning_rate": 5e-05, + "loss": 0.0088, + "num_input_tokens_seen": 151893568, + "step": 2196 + }, + { + "epoch": 137.25, + "loss": 0.009085440076887608, + "loss_ce": 0.0007846589433029294, + "loss_xval": 0.00830078125, + "num_input_tokens_seen": 151893568, + "step": 2196 + }, + { + "epoch": 137.3125, + "grad_norm": 9.37196438277608, + "learning_rate": 5e-05, + "loss": 0.0257, + "num_input_tokens_seen": 151965184, + "step": 2197 + }, + { + "epoch": 137.3125, + "loss": 0.02497875690460205, + "loss_ce": 0.0008088357863016427, + "loss_xval": 0.024169921875, + "num_input_tokens_seen": 151965184, + "step": 2197 + }, + { + "epoch": 137.375, + "grad_norm": 0.5164767426303001, + "learning_rate": 5e-05, + "loss": 0.0025, + "num_input_tokens_seen": 152036800, + "step": 2198 + }, + { + "epoch": 137.375, + "loss": 0.0027048829942941666, + "loss_ce": 0.0008814577013254166, + "loss_xval": 0.00182342529296875, + "num_input_tokens_seen": 152036800, + "step": 2198 + }, + { + "epoch": 137.4375, + "grad_norm": 7.568909428786799, + "learning_rate": 5e-05, + "loss": 0.0209, + "num_input_tokens_seen": 152108416, + "step": 2199 + }, + { + "epoch": 137.4375, + "loss": 0.015644783154129982, 
+ "loss_ce": 0.0008132405928336084, + "loss_xval": 0.01483154296875, + "num_input_tokens_seen": 152108416, + "step": 2199 + }, + { + "epoch": 137.5, + "grad_norm": 0.9429202499800853, + "learning_rate": 5e-05, + "loss": 0.0027, + "num_input_tokens_seen": 152167488, + "step": 2200 + }, + { + "epoch": 137.5, + "loss": 0.002534096594899893, + "loss_ce": 0.0008937768288888037, + "loss_xval": 0.00164031982421875, + "num_input_tokens_seen": 152167488, + "step": 2200 + }, + { + "epoch": 137.5625, + "grad_norm": 6.852523911842224, + "learning_rate": 5e-05, + "loss": 0.0141, + "num_input_tokens_seen": 152239104, + "step": 2201 + }, + { + "epoch": 137.5625, + "loss": 0.013777682557702065, + "loss_ce": 0.00096029945416376, + "loss_xval": 0.0128173828125, + "num_input_tokens_seen": 152239104, + "step": 2201 + }, + { + "epoch": 137.625, + "grad_norm": 1.6369849402409489, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 152310784, + "step": 2202 + }, + { + "epoch": 137.625, + "loss": 0.0015844183508306742, + "loss_ce": 0.0007680730777792633, + "loss_xval": 0.00081634521484375, + "num_input_tokens_seen": 152310784, + "step": 2202 + }, + { + "epoch": 137.6875, + "grad_norm": 6.529793333660272, + "learning_rate": 5e-05, + "loss": 0.0126, + "num_input_tokens_seen": 152369856, + "step": 2203 + }, + { + "epoch": 137.6875, + "loss": 0.012915344908833504, + "loss_ce": 0.0005862429388798773, + "loss_xval": 0.0123291015625, + "num_input_tokens_seen": 152369856, + "step": 2203 + }, + { + "epoch": 137.75, + "grad_norm": 1.3590339013771888, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 152441664, + "step": 2204 + }, + { + "epoch": 137.75, + "loss": 0.001615358516573906, + "loss_ce": 0.0005777608603239059, + "loss_xval": 0.00103759765625, + "num_input_tokens_seen": 152441664, + "step": 2204 + }, + { + "epoch": 137.8125, + "grad_norm": 6.429611279367293, + "learning_rate": 5e-05, + "loss": 0.0121, + "num_input_tokens_seen": 152500800, + "step": 2205 + }, + { + "epoch": 137.8125, + "loss": 0.011699453927576542, + "loss_ce": 0.00046898474101908505, + "loss_xval": 0.01123046875, + "num_input_tokens_seen": 152500800, + "step": 2205 + }, + { + "epoch": 137.875, + "grad_norm": 1.5553170722890897, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_input_tokens_seen": 152572352, + "step": 2206 + }, + { + "epoch": 137.875, + "loss": 0.0012894327519461513, + "loss_ce": 0.00032049964647740126, + "loss_xval": 0.00096893310546875, + "num_input_tokens_seen": 152572352, + "step": 2206 + }, + { + "epoch": 137.9375, + "grad_norm": 6.308887726050781, + "learning_rate": 5e-05, + "loss": 0.0124, + "num_input_tokens_seen": 152619008, + "step": 2207 + }, + { + "epoch": 137.9375, + "loss": 0.012343852780759335, + "loss_ce": 0.00025889204698614776, + "loss_xval": 0.0120849609375, + "num_input_tokens_seen": 152619008, + "step": 2207 + }, + { + "epoch": 138.0, + "grad_norm": 1.514388520820637, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 152690624, + "step": 2208 + }, + { + "epoch": 138.0, + "loss": 0.0014354572631418705, + "loss_ce": 0.00023001297086011618, + "loss_xval": 0.0012054443359375, + "num_input_tokens_seen": 152690624, + "step": 2208 + }, + { + "epoch": 138.0625, + "grad_norm": 6.194249119347115, + "learning_rate": 5e-05, + "loss": 0.011, + "num_input_tokens_seen": 152762176, + "step": 2209 + }, + { + "epoch": 138.0625, + "loss": 0.010418097488582134, + "loss_ce": 0.00022522661311086267, + "loss_xval": 0.01019287109375, + "num_input_tokens_seen": 152762176, + 
"step": 2209 + }, + { + "epoch": 138.125, + "grad_norm": 1.1014088449740902, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 152833792, + "step": 2210 + }, + { + "epoch": 138.125, + "loss": 0.000695666647516191, + "loss_ce": 0.00016160900122486055, + "loss_xval": 0.0005340576171875, + "num_input_tokens_seen": 152833792, + "step": 2210 + }, + { + "epoch": 138.1875, + "grad_norm": 5.586581449559117, + "learning_rate": 5e-05, + "loss": 0.009, + "num_input_tokens_seen": 152905344, + "step": 2211 + }, + { + "epoch": 138.1875, + "loss": 0.009005383588373661, + "loss_ce": 0.00015528571384493262, + "loss_xval": 0.00885009765625, + "num_input_tokens_seen": 152905344, + "step": 2211 + }, + { + "epoch": 138.25, + "grad_norm": 0.6119646356804296, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 152977024, + "step": 2212 + }, + { + "epoch": 138.25, + "loss": 0.0005884505808353424, + "loss_ce": 0.00015166777302511036, + "loss_xval": 0.0004367828369140625, + "num_input_tokens_seen": 152977024, + "step": 2212 + }, + { + "epoch": 138.3125, + "grad_norm": 4.730973741739341, + "learning_rate": 5e-05, + "loss": 0.0065, + "num_input_tokens_seen": 153048640, + "step": 2213 + }, + { + "epoch": 138.3125, + "loss": 0.007002365775406361, + "loss_ce": 0.0001359108428005129, + "loss_xval": 0.006866455078125, + "num_input_tokens_seen": 153048640, + "step": 2213 + }, + { + "epoch": 138.375, + "grad_norm": 0.49462988250905304, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 153107776, + "step": 2214 + }, + { + "epoch": 138.375, + "loss": 0.0005698601016774774, + "loss_ce": 0.0001292625820497051, + "loss_xval": 0.0004405975341796875, + "num_input_tokens_seen": 153107776, + "step": 2214 + }, + { + "epoch": 138.4375, + "grad_norm": 4.070162378919028, + "learning_rate": 5e-05, + "loss": 0.0052, + "num_input_tokens_seen": 153179328, + "step": 2215 + }, + { + "epoch": 138.4375, + "loss": 0.005405760835856199, + "loss_ce": 0.00012622002395801246, + "loss_xval": 0.005279541015625, + "num_input_tokens_seen": 153179328, + "step": 2215 + }, + { + "epoch": 138.5, + "grad_norm": 1.2962029399375476, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 153238336, + "step": 2216 + }, + { + "epoch": 138.5, + "loss": 0.0007740218425169587, + "loss_ce": 0.00011789389100158587, + "loss_xval": 0.0006561279296875, + "num_input_tokens_seen": 153238336, + "step": 2216 + }, + { + "epoch": 138.5625, + "grad_norm": 3.585443961923812, + "learning_rate": 5e-05, + "loss": 0.004, + "num_input_tokens_seen": 153297472, + "step": 2217 + }, + { + "epoch": 138.5625, + "loss": 0.00407800730317831, + "loss_ce": 0.0001107221978600137, + "loss_xval": 0.00396728515625, + "num_input_tokens_seen": 153297472, + "step": 2217 + }, + { + "epoch": 138.625, + "grad_norm": 1.5923921681887396, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 153369152, + "step": 2218 + }, + { + "epoch": 138.625, + "loss": 0.0010032439604401588, + "loss_ce": 0.0001220489211846143, + "loss_xval": 0.000881195068359375, + "num_input_tokens_seen": 153369152, + "step": 2218 + }, + { + "epoch": 138.6875, + "grad_norm": 3.071232683319909, + "learning_rate": 5e-05, + "loss": 0.0031, + "num_input_tokens_seen": 153440832, + "step": 2219 + }, + { + "epoch": 138.6875, + "loss": 0.0028993317391723394, + "loss_ce": 0.0001222321152454242, + "loss_xval": 0.002777099609375, + "num_input_tokens_seen": 153440832, + "step": 2219 + }, + { + "epoch": 138.75, + "grad_norm": 1.7176997414034458, + 
"learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 153512576, + "step": 2220 + }, + { + "epoch": 138.75, + "loss": 0.0010029050754383206, + "loss_ce": 0.00011789533164119348, + "loss_xval": 0.000885009765625, + "num_input_tokens_seen": 153512576, + "step": 2220 + }, + { + "epoch": 138.8125, + "grad_norm": 2.804938035327702, + "learning_rate": 5e-05, + "loss": 0.0025, + "num_input_tokens_seen": 153571776, + "step": 2221 + }, + { + "epoch": 138.8125, + "loss": 0.003006437560543418, + "loss_ce": 0.00012252634041942656, + "loss_xval": 0.0028839111328125, + "num_input_tokens_seen": 153571776, + "step": 2221 + }, + { + "epoch": 138.875, + "grad_norm": 1.4055190533044555, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 153643584, + "step": 2222 + }, + { + "epoch": 138.875, + "loss": 0.0008755518938414752, + "loss_ce": 0.00012405651796143502, + "loss_xval": 0.000751495361328125, + "num_input_tokens_seen": 153643584, + "step": 2222 + }, + { + "epoch": 138.9375, + "grad_norm": 2.110967150498914, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 153702720, + "step": 2223 + }, + { + "epoch": 138.9375, + "loss": 0.0020835991017520428, + "loss_ce": 0.00011521533451741561, + "loss_xval": 0.0019683837890625, + "num_input_tokens_seen": 153702720, + "step": 2223 + }, + { + "epoch": 139.0, + "grad_norm": 1.4203094135983487, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 153761920, + "step": 2224 + }, + { + "epoch": 139.0, + "loss": 0.0012334496714174747, + "loss_ce": 0.00011955812806263566, + "loss_xval": 0.0011138916015625, + "num_input_tokens_seen": 153761920, + "step": 2224 + }, + { + "epoch": 139.0625, + "grad_norm": 1.5418461161142287, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_input_tokens_seen": 153833600, + "step": 2225 + }, + { + "epoch": 139.0625, + "loss": 0.0013548274291679263, + "loss_ce": 0.00011123609874630347, + "loss_xval": 0.00124359130859375, + "num_input_tokens_seen": 153833600, + "step": 2225 + }, + { + "epoch": 139.125, + "grad_norm": 1.3799738043250926, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 153905216, + "step": 2226 + }, + { + "epoch": 139.125, + "loss": 0.0008790802676230669, + "loss_ce": 0.0001161408144980669, + "loss_xval": 0.000762939453125, + "num_input_tokens_seen": 153905216, + "step": 2226 + }, + { + "epoch": 139.1875, + "grad_norm": 1.5584645779359327, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 153976768, + "step": 2227 + }, + { + "epoch": 139.1875, + "loss": 0.0011807398404926062, + "loss_ce": 0.00012025403702864423, + "loss_xval": 0.00106048583984375, + "num_input_tokens_seen": 153976768, + "step": 2227 + }, + { + "epoch": 139.25, + "grad_norm": 0.8157982009158394, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 154048448, + "step": 2228 + }, + { + "epoch": 139.25, + "loss": 0.0007961892988532782, + "loss_ce": 0.00011717317102011293, + "loss_xval": 0.00067901611328125, + "num_input_tokens_seen": 154048448, + "step": 2228 + }, + { + "epoch": 139.3125, + "grad_norm": 1.4322162772025198, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 154120000, + "step": 2229 + }, + { + "epoch": 139.3125, + "loss": 0.001034379587508738, + "loss_ce": 0.00011503753921715543, + "loss_xval": 0.000919342041015625, + "num_input_tokens_seen": 154120000, + "step": 2229 + }, + { + "epoch": 139.375, + "grad_norm": 0.47032266578243526, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 
154191616, + "step": 2230 + }, + { + "epoch": 139.375, + "loss": 0.00034551139106042683, + "loss_ce": 0.00011281486513325945, + "loss_xval": 0.000232696533203125, + "num_input_tokens_seen": 154191616, + "step": 2230 + }, + { + "epoch": 139.4375, + "grad_norm": 1.2886053491749265, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 154263232, + "step": 2231 + }, + { + "epoch": 139.4375, + "loss": 0.0007385425269603729, + "loss_ce": 0.00010148809087695554, + "loss_xval": 0.000637054443359375, + "num_input_tokens_seen": 154263232, + "step": 2231 + }, + { + "epoch": 139.5, + "grad_norm": 0.17049475117003512, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 154334848, + "step": 2232 + }, + { + "epoch": 139.5, + "loss": 0.0004746349295601249, + "loss_ce": 9.507254435447976e-05, + "loss_xval": 0.0003795623779296875, + "num_input_tokens_seen": 154334848, + "step": 2232 + }, + { + "epoch": 139.5625, + "grad_norm": 0.2630249658768459, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 154406400, + "step": 2233 + }, + { + "epoch": 139.5625, + "loss": 0.0004748214269056916, + "loss_ce": 9.14443371584639e-05, + "loss_xval": 0.0003833770751953125, + "num_input_tokens_seen": 154406400, + "step": 2233 + }, + { + "epoch": 139.625, + "grad_norm": 0.311810875046848, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 154478016, + "step": 2234 + }, + { + "epoch": 139.625, + "loss": 0.0003131224075332284, + "loss_ce": 8.805528341326863e-05, + "loss_xval": 0.000225067138671875, + "num_input_tokens_seen": 154478016, + "step": 2234 + }, + { + "epoch": 139.6875, + "grad_norm": 0.3015887855796754, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 154549696, + "step": 2235 + }, + { + "epoch": 139.6875, + "loss": 0.00023762644559610635, + "loss_ce": 8.503855497110635e-05, + "loss_xval": 0.000152587890625, + "num_input_tokens_seen": 154549696, + "step": 2235 + }, + { + "epoch": 139.75, + "grad_norm": 0.28683055185678186, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 154621312, + "step": 2236 + }, + { + "epoch": 139.75, + "loss": 0.0003172550059389323, + "loss_ce": 8.169744251063094e-05, + "loss_xval": 0.00023555755615234375, + "num_input_tokens_seen": 154621312, + "step": 2236 + }, + { + "epoch": 139.8125, + "grad_norm": 0.44450358647372484, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 154693056, + "step": 2237 + }, + { + "epoch": 139.8125, + "loss": 0.00025231545441783965, + "loss_ce": 7.588571315864101e-05, + "loss_xval": 0.00017642974853515625, + "num_input_tokens_seen": 154693056, + "step": 2237 + }, + { + "epoch": 139.875, + "grad_norm": 0.35186947388869055, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 154764672, + "step": 2238 + }, + { + "epoch": 139.875, + "loss": 0.00023929460439831018, + "loss_ce": 7.14479319867678e-05, + "loss_xval": 0.0001678466796875, + "num_input_tokens_seen": 154764672, + "step": 2238 + }, + { + "epoch": 139.9375, + "grad_norm": 0.845929231225681, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 154836288, + "step": 2239 + }, + { + "epoch": 139.9375, + "loss": 0.00032878026831895113, + "loss_ce": 6.365880835801363e-05, + "loss_xval": 0.0002651214599609375, + "num_input_tokens_seen": 154836288, + "step": 2239 + }, + { + "epoch": 140.0, + "grad_norm": 0.2749791957448786, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 154907968, + "step": 2240 + }, + { + "epoch": 140.0, 
+ "loss": 0.0003007893683388829, + "loss_ce": 6.332447082968429e-05, + "loss_xval": 0.00023746490478515625, + "num_input_tokens_seen": 154907968, + "step": 2240 + }, + { + "epoch": 140.0625, + "grad_norm": 0.6196150387867916, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 154967168, + "step": 2241 + }, + { + "epoch": 140.0625, + "loss": 0.0003579995536711067, + "loss_ce": 5.663846604875289e-05, + "loss_xval": 0.000301361083984375, + "num_input_tokens_seen": 154967168, + "step": 2241 + }, + { + "epoch": 140.125, + "grad_norm": 0.6804395287117125, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 155038784, + "step": 2242 + }, + { + "epoch": 140.125, + "loss": 0.00025112146977335215, + "loss_ce": 5.180353036848828e-05, + "loss_xval": 0.00019931793212890625, + "num_input_tokens_seen": 155038784, + "step": 2242 + }, + { + "epoch": 140.1875, + "grad_norm": 0.13514985352534017, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 155110336, + "step": 2243 + }, + { + "epoch": 140.1875, + "loss": 0.00020603234588634223, + "loss_ce": 4.772240572492592e-05, + "loss_xval": 0.0001583099365234375, + "num_input_tokens_seen": 155110336, + "step": 2243 + }, + { + "epoch": 140.25, + "grad_norm": 0.7497696258118558, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 155181888, + "step": 2244 + }, + { + "epoch": 140.25, + "loss": 0.00037656372296623886, + "loss_ce": 4.849975448451005e-05, + "loss_xval": 0.00032806396484375, + "num_input_tokens_seen": 155181888, + "step": 2244 + }, + { + "epoch": 140.3125, + "grad_norm": 0.41926920477007684, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 155253504, + "step": 2245 + }, + { + "epoch": 140.3125, + "loss": 0.0003734802012331784, + "loss_ce": 4.732359229819849e-05, + "loss_xval": 0.0003261566162109375, + "num_input_tokens_seen": 155253504, + "step": 2245 + }, + { + "epoch": 140.375, + "grad_norm": 0.07113424112628354, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 155325120, + "step": 2246 + }, + { + "epoch": 140.375, + "loss": 0.00014185221516527236, + "loss_ce": 4.3146916141267866e-05, + "loss_xval": 9.870529174804688e-05, + "num_input_tokens_seen": 155325120, + "step": 2246 + }, + { + "epoch": 140.4375, + "grad_norm": 0.25364112394243704, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 155396736, + "step": 2247 + }, + { + "epoch": 140.4375, + "loss": 0.00019017353770323098, + "loss_ce": 4.044667730340734e-05, + "loss_xval": 0.00014972686767578125, + "num_input_tokens_seen": 155396736, + "step": 2247 + }, + { + "epoch": 140.5, + "grad_norm": 0.8181657596738321, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 155468416, + "step": 2248 + }, + { + "epoch": 140.5, + "loss": 0.0003228874411433935, + "loss_ce": 3.869248030241579e-05, + "loss_xval": 0.0002841949462890625, + "num_input_tokens_seen": 155468416, + "step": 2248 + }, + { + "epoch": 140.5625, + "grad_norm": 0.8271386434304645, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 155540160, + "step": 2249 + }, + { + "epoch": 140.5625, + "loss": 0.0003159397456329316, + "loss_ce": 3.9374201151076704e-05, + "loss_xval": 0.0002765655517578125, + "num_input_tokens_seen": 155540160, + "step": 2249 + }, + { + "epoch": 140.625, + "grad_norm": 0.11107590119883078, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 155611968, + "step": 2250 + }, + { + "epoch": 140.625, + "eval_synth_IoU": 
0.29707459546625614, + "eval_synth_MAE_x": 0.014007568359375, + "eval_synth_MAE_y": 0.0155181884765625, + "eval_synth_NUM_probability": 0.9995821416378021, + "eval_synth_inside_bbox": 0.5, + "eval_synth_loss": 0.0002646126667968929, + "eval_synth_loss_ce": 3.5015559660678264e-05, + "eval_synth_loss_xval": 0.0002295970916748047, + "eval_synth_runtime": 62.9843, + "eval_synth_samples_per_second": 2.032, + "eval_synth_steps_per_second": 0.064, + "num_input_tokens_seen": 155611968, + "step": 2250 + }, + { + "epoch": 140.625, + "loss": 0.0002962798753287643, + "loss_ce": 3.49731017195154e-05, + "loss_xval": 0.0002613067626953125, + "num_input_tokens_seen": 155611968, + "step": 2250 + }, + { + "epoch": 140.6875, + "grad_norm": 0.6278575183312798, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 155683776, + "step": 2251 + }, + { + "epoch": 140.6875, + "loss": 0.0002068749745376408, + "loss_ce": 3.425991963013075e-05, + "loss_xval": 0.00017261505126953125, + "num_input_tokens_seen": 155683776, + "step": 2251 + }, + { + "epoch": 140.75, + "grad_norm": 0.6664386753012557, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 155755392, + "step": 2252 + }, + { + "epoch": 140.75, + "loss": 0.00036714281304739416, + "loss_ce": 3.335680958116427e-05, + "loss_xval": 0.0003337860107421875, + "num_input_tokens_seen": 155755392, + "step": 2252 + }, + { + "epoch": 140.8125, + "grad_norm": 0.06576501359392857, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 155826944, + "step": 2253 + }, + { + "epoch": 140.8125, + "loss": 0.00013523247616831213, + "loss_ce": 3.223565363441594e-05, + "loss_xval": 0.000102996826171875, + "num_input_tokens_seen": 155826944, + "step": 2253 + }, + { + "epoch": 140.875, + "grad_norm": 0.7849105738929436, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 155898560, + "step": 2254 + }, + { + "epoch": 140.875, + "loss": 0.0003013724344782531, + "loss_ce": 2.8621574529097416e-05, + "loss_xval": 0.0002727508544921875, + "num_input_tokens_seen": 155898560, + "step": 2254 + }, + { + "epoch": 140.9375, + "grad_norm": 0.9380444255904111, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 155970112, + "step": 2255 + }, + { + "epoch": 140.9375, + "loss": 0.0003526804503053427, + "loss_ce": 2.8431173632270657e-05, + "loss_xval": 0.000324249267578125, + "num_input_tokens_seen": 155970112, + "step": 2255 + }, + { + "epoch": 141.0, + "grad_norm": 0.44775255648791673, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 156041728, + "step": 2256 + }, + { + "epoch": 141.0, + "loss": 0.00012957978469785303, + "loss_ce": 2.7059801141149364e-05, + "loss_xval": 0.00010251998901367188, + "num_input_tokens_seen": 156041728, + "step": 2256 + }, + { + "epoch": 141.0625, + "grad_norm": 0.126081080342943, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 156100864, + "step": 2257 + }, + { + "epoch": 141.0625, + "loss": 8.322107169078663e-05, + "loss_ce": 2.647744804562535e-05, + "loss_xval": 5.6743621826171875e-05, + "num_input_tokens_seen": 156100864, + "step": 2257 + }, + { + "epoch": 141.125, + "grad_norm": 0.47681506835538257, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 156159872, + "step": 2258 + }, + { + "epoch": 141.125, + "loss": 0.00012125995272072032, + "loss_ce": 2.398517244728282e-05, + "loss_xval": 9.72747802734375e-05, + "num_input_tokens_seen": 156159872, + "step": 2258 + }, + { + "epoch": 141.1875, + "grad_norm": 
0.4726416575728405, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 156231552, + "step": 2259 + }, + { + "epoch": 141.1875, + "loss": 0.00021909164206590503, + "loss_ce": 2.644942833285313e-05, + "loss_xval": 0.0001926422119140625, + "num_input_tokens_seen": 156231552, + "step": 2259 + }, + { + "epoch": 141.25, + "grad_norm": 0.1718543214570315, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 156303296, + "step": 2260 + }, + { + "epoch": 141.25, + "loss": 0.00012607949611265212, + "loss_ce": 2.5943692889995873e-05, + "loss_xval": 0.00010013580322265625, + "num_input_tokens_seen": 156303296, + "step": 2260 + }, + { + "epoch": 141.3125, + "grad_norm": 0.191379048394649, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 156362432, + "step": 2261 + }, + { + "epoch": 141.3125, + "loss": 6.542243500007316e-05, + "loss_ce": 2.3460766897187568e-05, + "loss_xval": 4.1961669921875e-05, + "num_input_tokens_seen": 156362432, + "step": 2261 + }, + { + "epoch": 141.375, + "grad_norm": 0.5373399154129548, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 156434048, + "step": 2262 + }, + { + "epoch": 141.375, + "loss": 0.00020634174870792776, + "loss_ce": 2.2282611098489724e-05, + "loss_xval": 0.00018405914306640625, + "num_input_tokens_seen": 156434048, + "step": 2262 + }, + { + "epoch": 141.4375, + "grad_norm": 0.7771818980975708, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 156505600, + "step": 2263 + }, + { + "epoch": 141.4375, + "loss": 0.00023935161880217493, + "loss_ce": 2.1913880118518136e-05, + "loss_xval": 0.000217437744140625, + "num_input_tokens_seen": 156505600, + "step": 2263 + }, + { + "epoch": 141.5, + "grad_norm": 0.652937392279704, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 156577152, + "step": 2264 + }, + { + "epoch": 141.5, + "loss": 0.00015698786592110991, + "loss_ce": 2.347346344322432e-05, + "loss_xval": 0.000133514404296875, + "num_input_tokens_seen": 156577152, + "step": 2264 + }, + { + "epoch": 141.5625, + "grad_norm": 0.24034733399568411, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 156648832, + "step": 2265 + }, + { + "epoch": 141.5625, + "loss": 8.453571354039013e-05, + "loss_ce": 2.1116375137353316e-05, + "loss_xval": 6.341934204101562e-05, + "num_input_tokens_seen": 156648832, + "step": 2265 + }, + { + "epoch": 141.625, + "grad_norm": 0.05137380518404474, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 156707968, + "step": 2266 + }, + { + "epoch": 141.625, + "loss": 8.02580761956051e-05, + "loss_ce": 1.9938175682909787e-05, + "loss_xval": 6.031990051269531e-05, + "num_input_tokens_seen": 156707968, + "step": 2266 + }, + { + "epoch": 141.6875, + "grad_norm": 0.07237913080118438, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 156779584, + "step": 2267 + }, + { + "epoch": 141.6875, + "loss": 3.4921242331620306e-05, + "loss_ce": 2.0020079318783246e-05, + "loss_xval": 1.4901161193847656e-05, + "num_input_tokens_seen": 156779584, + "step": 2267 + }, + { + "epoch": 141.75, + "grad_norm": 0.02706912992405218, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 156838592, + "step": 2268 + }, + { + "epoch": 141.75, + "loss": 8.904261630959809e-05, + "loss_ce": 1.8470716895535588e-05, + "loss_xval": 7.05718994140625e-05, + "num_input_tokens_seen": 156838592, + "step": 2268 + }, + { + "epoch": 141.8125, + "grad_norm": 0.12553347339518686, + "learning_rate": 
5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 156910144, + "step": 2269 + }, + { + "epoch": 141.8125, + "loss": 6.425999163184315e-05, + "loss_ce": 1.8960461602546275e-05, + "loss_xval": 4.5299530029296875e-05, + "num_input_tokens_seen": 156910144, + "step": 2269 + }, + { + "epoch": 141.875, + "grad_norm": 0.4635155560761945, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 156969344, + "step": 2270 + }, + { + "epoch": 141.875, + "loss": 0.0001502758968854323, + "loss_ce": 1.6761494407546706e-05, + "loss_xval": 0.000133514404296875, + "num_input_tokens_seen": 156969344, + "step": 2270 + }, + { + "epoch": 141.9375, + "grad_norm": 0.8880865872714322, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 157041024, + "step": 2271 + }, + { + "epoch": 141.9375, + "loss": 0.00034242760739289224, + "loss_ce": 1.8178337995777838e-05, + "loss_xval": 0.000324249267578125, + "num_input_tokens_seen": 157041024, + "step": 2271 + }, + { + "epoch": 142.0, + "grad_norm": 1.02132370067152, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 157112768, + "step": 2272 + }, + { + "epoch": 142.0, + "loss": 0.00038059926009736955, + "loss_ce": 1.629567486816086e-05, + "loss_xval": 0.0003643035888671875, + "num_input_tokens_seen": 157112768, + "step": 2272 + }, + { + "epoch": 142.0625, + "grad_norm": 0.754057194678211, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 157184384, + "step": 2273 + }, + { + "epoch": 142.0625, + "loss": 0.0002003343979595229, + "loss_ce": 1.627525307412725e-05, + "loss_xval": 0.00018405914306640625, + "num_input_tokens_seen": 157184384, + "step": 2273 + }, + { + "epoch": 142.125, + "grad_norm": 0.270742358060926, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 157256192, + "step": 2274 + }, + { + "epoch": 142.125, + "loss": 0.00013197364751249552, + "loss_ce": 1.7532724086777307e-05, + "loss_xval": 0.00011444091796875, + "num_input_tokens_seen": 157256192, + "step": 2274 + }, + { + "epoch": 142.1875, + "grad_norm": 0.3032083858971255, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 157315328, + "step": 2275 + }, + { + "epoch": 142.1875, + "loss": 8.614677062723786e-05, + "loss_ce": 1.5574874851154163e-05, + "loss_xval": 7.05718994140625e-05, + "num_input_tokens_seen": 157315328, + "step": 2275 + }, + { + "epoch": 142.25, + "grad_norm": 0.8531261556843668, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 157386880, + "step": 2276 + }, + { + "epoch": 142.25, + "loss": 0.00036126747727394104, + "loss_ce": 1.6037383829825558e-05, + "loss_xval": 0.0003452301025390625, + "num_input_tokens_seen": 157386880, + "step": 2276 + }, + { + "epoch": 142.3125, + "grad_norm": 1.2143602578373374, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 157458560, + "step": 2277 + }, + { + "epoch": 142.3125, + "loss": 0.0003886088670697063, + "loss_ce": 1.6675890947226435e-05, + "loss_xval": 0.0003719329833984375, + "num_input_tokens_seen": 157458560, + "step": 2277 + }, + { + "epoch": 142.375, + "grad_norm": 1.4565740954306055, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 157530112, + "step": 2278 + }, + { + "epoch": 142.375, + "loss": 0.0005817589699290693, + "loss_ce": 1.7183778254548088e-05, + "loss_xval": 0.0005645751953125, + "num_input_tokens_seen": 157530112, + "step": 2278 + }, + { + "epoch": 142.4375, + "grad_norm": 1.7198112420019938, + "learning_rate": 5e-05, + "loss": 0.0009, + 
"num_input_tokens_seen": 157601920, + "step": 2279 + }, + { + "epoch": 142.4375, + "loss": 0.0009235304314643145, + "loss_ce": 1.5632504073437303e-05, + "loss_xval": 0.00090789794921875, + "num_input_tokens_seen": 157601920, + "step": 2279 + }, + { + "epoch": 142.5, + "grad_norm": 2.0291451474600204, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 157673664, + "step": 2280 + }, + { + "epoch": 142.5, + "loss": 0.001222676713950932, + "loss_ce": 1.7232428945135325e-05, + "loss_xval": 0.0012054443359375, + "num_input_tokens_seen": 157673664, + "step": 2280 + }, + { + "epoch": 142.5625, + "grad_norm": 2.4538085825998506, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_input_tokens_seen": 157745280, + "step": 2281 + }, + { + "epoch": 142.5625, + "loss": 0.0017871184973046184, + "loss_ce": 1.7099009710364044e-05, + "loss_xval": 0.00177001953125, + "num_input_tokens_seen": 157745280, + "step": 2281 + }, + { + "epoch": 142.625, + "grad_norm": 3.101466685210458, + "learning_rate": 5e-05, + "loss": 0.0027, + "num_input_tokens_seen": 157816960, + "step": 2282 + }, + { + "epoch": 142.625, + "loss": 0.0027955160476267338, + "loss_ce": 1.8416507373331115e-05, + "loss_xval": 0.002777099609375, + "num_input_tokens_seen": 157816960, + "step": 2282 + }, + { + "epoch": 142.6875, + "grad_norm": 3.860516054775248, + "learning_rate": 5e-05, + "loss": 0.0042, + "num_input_tokens_seen": 157875968, + "step": 2283 + }, + { + "epoch": 142.6875, + "loss": 0.004079585429280996, + "loss_ce": 2.0747500457218848e-05, + "loss_xval": 0.004058837890625, + "num_input_tokens_seen": 157875968, + "step": 2283 + }, + { + "epoch": 142.75, + "grad_norm": 4.391002128654902, + "learning_rate": 5e-05, + "loss": 0.0054, + "num_input_tokens_seen": 157947584, + "step": 2284 + }, + { + "epoch": 142.75, + "loss": 0.005272651091217995, + "loss_ce": 2.362753548368346e-05, + "loss_xval": 0.0052490234375, + "num_input_tokens_seen": 157947584, + "step": 2284 + }, + { + "epoch": 142.8125, + "grad_norm": 3.8481785567403946, + "learning_rate": 5e-05, + "loss": 0.0043, + "num_input_tokens_seen": 158019264, + "step": 2285 + }, + { + "epoch": 142.8125, + "loss": 0.00454552099108696, + "loss_ce": 2.8919605028931983e-05, + "loss_xval": 0.0045166015625, + "num_input_tokens_seen": 158019264, + "step": 2285 + }, + { + "epoch": 142.875, + "grad_norm": 1.639097733325779, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 158091008, + "step": 2286 + }, + { + "epoch": 142.875, + "loss": 0.0011131562059745193, + "loss_ce": 3.741156615433283e-05, + "loss_xval": 0.00107574462890625, + "num_input_tokens_seen": 158091008, + "step": 2286 + }, + { + "epoch": 142.9375, + "grad_norm": 1.261991233490499, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 158162624, + "step": 2287 + }, + { + "epoch": 142.9375, + "loss": 0.0009148393291980028, + "loss_ce": 4.8903075366979465e-05, + "loss_xval": 0.000865936279296875, + "num_input_tokens_seen": 158162624, + "step": 2287 + }, + { + "epoch": 143.0, + "grad_norm": 3.2012555194906236, + "learning_rate": 5e-05, + "loss": 0.0032, + "num_input_tokens_seen": 158234240, + "step": 2288 + }, + { + "epoch": 143.0, + "loss": 0.0035402164794504642, + "loss_ce": 6.121250044088811e-05, + "loss_xval": 0.00347900390625, + "num_input_tokens_seen": 158234240, + "step": 2288 + }, + { + "epoch": 143.0625, + "grad_norm": 3.072103026451075, + "learning_rate": 5e-05, + "loss": 0.0029, + "num_input_tokens_seen": 158305920, + "step": 2289 + }, + { + "epoch": 143.0625, + "loss": 
0.003025761106982827, + "loss_ce": 8.081478881649673e-05, + "loss_xval": 0.0029449462890625, + "num_input_tokens_seen": 158305920, + "step": 2289 + }, + { + "epoch": 143.125, + "grad_norm": 0.9733620436290504, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 158377472, + "step": 2290 + }, + { + "epoch": 143.125, + "loss": 0.0005783883389085531, + "loss_ce": 0.00010345852206228301, + "loss_xval": 0.0004749298095703125, + "num_input_tokens_seen": 158377472, + "step": 2290 + }, + { + "epoch": 143.1875, + "grad_norm": 1.6681225600357712, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 158449152, + "step": 2291 + }, + { + "epoch": 143.1875, + "loss": 0.0009542530169710517, + "loss_ce": 0.00012264902761671692, + "loss_xval": 0.00083160400390625, + "num_input_tokens_seen": 158449152, + "step": 2291 + }, + { + "epoch": 143.25, + "grad_norm": 3.0001199342153058, + "learning_rate": 5e-05, + "loss": 0.003, + "num_input_tokens_seen": 158520832, + "step": 2292 + }, + { + "epoch": 143.25, + "loss": 0.00304748909547925, + "loss_ce": 0.00014831911539658904, + "loss_xval": 0.002899169921875, + "num_input_tokens_seen": 158520832, + "step": 2292 + }, + { + "epoch": 143.3125, + "grad_norm": 2.267618572064086, + "learning_rate": 5e-05, + "loss": 0.0019, + "num_input_tokens_seen": 158592512, + "step": 2293 + }, + { + "epoch": 143.3125, + "loss": 0.002161857206374407, + "loss_ce": 0.00017821461369749159, + "loss_xval": 0.001983642578125, + "num_input_tokens_seen": 158592512, + "step": 2293 + }, + { + "epoch": 143.375, + "grad_norm": 0.04030482510571815, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 158651520, + "step": 2294 + }, + { + "epoch": 143.375, + "loss": 0.00026899820659309626, + "loss_ce": 0.00017506130097899586, + "loss_xval": 9.393692016601562e-05, + "num_input_tokens_seen": 158651520, + "step": 2294 + }, + { + "epoch": 143.4375, + "grad_norm": 2.254898691584632, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 158723200, + "step": 2295 + }, + { + "epoch": 143.4375, + "loss": 0.001831955392844975, + "loss_ce": 0.00019163561228197068, + "loss_xval": 0.00164031982421875, + "num_input_tokens_seen": 158723200, + "step": 2295 + }, + { + "epoch": 143.5, + "grad_norm": 2.7482044925183335, + "learning_rate": 5e-05, + "loss": 0.0025, + "num_input_tokens_seen": 158794816, + "step": 2296 + }, + { + "epoch": 143.5, + "loss": 0.0025814685504883528, + "loss_ce": 0.0001858387258835137, + "loss_xval": 0.0023956298828125, + "num_input_tokens_seen": 158794816, + "step": 2296 + }, + { + "epoch": 143.5625, + "grad_norm": 1.1240351697063617, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 158866432, + "step": 2297 + }, + { + "epoch": 143.5625, + "loss": 0.0007681635906919837, + "loss_ce": 0.00019977372721768916, + "loss_xval": 0.000568389892578125, + "num_input_tokens_seen": 158866432, + "step": 2297 + }, + { + "epoch": 143.625, + "grad_norm": 1.1891412391354403, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 158938112, + "step": 2298 + }, + { + "epoch": 143.625, + "loss": 0.0009352611377835274, + "loss_ce": 0.0001837657910073176, + "loss_xval": 0.000751495361328125, + "num_input_tokens_seen": 158938112, + "step": 2298 + }, + { + "epoch": 143.6875, + "grad_norm": 2.247141443842513, + "learning_rate": 5e-05, + "loss": 0.0019, + "num_input_tokens_seen": 159009792, + "step": 2299 + }, + { + "epoch": 143.6875, + "loss": 0.002149116713553667, + "loss_ce": 0.00016547413542866707, + 
"loss_xval": 0.001983642578125, + "num_input_tokens_seen": 159009792, + "step": 2299 + }, + { + "epoch": 143.75, + "grad_norm": 1.6416382389026558, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 159081472, + "step": 2300 + }, + { + "epoch": 143.75, + "loss": 0.001146484981290996, + "loss_ce": 0.00016229308675974607, + "loss_xval": 0.00098419189453125, + "num_input_tokens_seen": 159081472, + "step": 2300 + }, + { + "epoch": 143.8125, + "grad_norm": 0.349722522096258, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 159153088, + "step": 2301 + }, + { + "epoch": 143.8125, + "loss": 0.00040729466127231717, + "loss_ce": 0.00014217320131137967, + "loss_xval": 0.0002651214599609375, + "num_input_tokens_seen": 159153088, + "step": 2301 + }, + { + "epoch": 143.875, + "grad_norm": 0.6319018735463661, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 159224640, + "step": 2302 + }, + { + "epoch": 143.875, + "loss": 0.0005437061772681773, + "loss_ce": 0.00012790417531505227, + "loss_xval": 0.000415802001953125, + "num_input_tokens_seen": 159224640, + "step": 2302 + }, + { + "epoch": 143.9375, + "grad_norm": 0.8852029961655974, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 159296256, + "step": 2303 + }, + { + "epoch": 143.9375, + "loss": 0.000454133638413623, + "loss_ce": 0.00010890352859860286, + "loss_xval": 0.0003452301025390625, + "num_input_tokens_seen": 159296256, + "step": 2303 + }, + { + "epoch": 144.0, + "grad_norm": 0.7714757947019761, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 159368064, + "step": 2304 + }, + { + "epoch": 144.0, + "loss": 0.00044721277663484216, + "loss_ce": 0.00010579736408544704, + "loss_xval": 0.0003414154052734375, + "num_input_tokens_seen": 159368064, + "step": 2304 + }, + { + "epoch": 144.0625, + "grad_norm": 0.4846986541212903, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 159439616, + "step": 2305 + }, + { + "epoch": 144.0625, + "loss": 0.0004524993128143251, + "loss_ce": 9.391778439749032e-05, + "loss_xval": 0.00035858154296875, + "num_input_tokens_seen": 159439616, + "step": 2305 + }, + { + "epoch": 144.125, + "grad_norm": 0.220384214054421, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 159511296, + "step": 2306 + }, + { + "epoch": 144.125, + "loss": 0.00016809874796308577, + "loss_ce": 7.797653233865276e-05, + "loss_xval": 9.012222290039062e-05, + "num_input_tokens_seen": 159511296, + "step": 2306 + }, + { + "epoch": 144.1875, + "grad_norm": 1.0475055335972265, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 159583104, + "step": 2307 + }, + { + "epoch": 144.1875, + "loss": 0.00048047397285699844, + "loss_ce": 6.276463682297617e-05, + "loss_xval": 0.0004177093505859375, + "num_input_tokens_seen": 159583104, + "step": 2307 + }, + { + "epoch": 144.25, + "grad_norm": 1.7401840496492975, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 159654848, + "step": 2308 + }, + { + "epoch": 144.25, + "loss": 0.0011681739706546068, + "loss_ce": 5.428240183391608e-05, + "loss_xval": 0.0011138916015625, + "num_input_tokens_seen": 159654848, + "step": 2308 + }, + { + "epoch": 144.3125, + "grad_norm": 2.008997027680627, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_input_tokens_seen": 159726528, + "step": 2309 + }, + { + "epoch": 144.3125, + "loss": 0.0014396762708202004, + "loss_ce": 5.112646613270044e-05, + "loss_xval": 0.0013885498046875, + "num_input_tokens_seen": 
159726528, + "step": 2309 + }, + { + "epoch": 144.375, + "grad_norm": 1.7834433012025868, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 159798208, + "step": 2310 + }, + { + "epoch": 144.375, + "loss": 0.001046067220158875, + "loss_ce": 4.661651837523095e-05, + "loss_xval": 0.00099945068359375, + "num_input_tokens_seen": 159798208, + "step": 2310 + }, + { + "epoch": 144.4375, + "grad_norm": 1.4923854766570368, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 159857344, + "step": 2311 + }, + { + "epoch": 144.4375, + "loss": 0.0011433206964284182, + "loss_ce": 4.468788392841816e-05, + "loss_xval": 0.0010986328125, + "num_input_tokens_seen": 159857344, + "step": 2311 + }, + { + "epoch": 144.5, + "grad_norm": 1.4666471946527897, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 159928896, + "step": 2312 + }, + { + "epoch": 144.5, + "loss": 0.0006469779764302075, + "loss_ce": 4.044111119583249e-05, + "loss_xval": 0.000606536865234375, + "num_input_tokens_seen": 159928896, + "step": 2312 + }, + { + "epoch": 144.5625, + "grad_norm": 1.4521669594107383, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 160000576, + "step": 2313 + }, + { + "epoch": 144.5625, + "loss": 0.0008465639548376203, + "loss_ce": 3.784813452512026e-05, + "loss_xval": 0.0008087158203125, + "num_input_tokens_seen": 160000576, + "step": 2313 + }, + { + "epoch": 144.625, + "grad_norm": 1.3065765034926327, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 160072256, + "step": 2314 + }, + { + "epoch": 144.625, + "loss": 0.0010280825663357973, + "loss_ce": 3.6261219065636396e-05, + "loss_xval": 0.0009918212890625, + "num_input_tokens_seen": 160072256, + "step": 2314 + }, + { + "epoch": 144.6875, + "grad_norm": 1.2298249889341375, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 160143808, + "step": 2315 + }, + { + "epoch": 144.6875, + "loss": 0.0005464506102725863, + "loss_ce": 3.5281202144687995e-05, + "loss_xval": 0.00051116943359375, + "num_input_tokens_seen": 160143808, + "step": 2315 + }, + { + "epoch": 144.75, + "grad_norm": 1.1456463289722179, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 160215552, + "step": 2316 + }, + { + "epoch": 144.75, + "loss": 0.0007918947958387434, + "loss_ce": 3.277005453128368e-05, + "loss_xval": 0.000759124755859375, + "num_input_tokens_seen": 160215552, + "step": 2316 + }, + { + "epoch": 144.8125, + "grad_norm": 0.9536307015781831, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 160287232, + "step": 2317 + }, + { + "epoch": 144.8125, + "loss": 0.0003948156663682312, + "loss_ce": 3.0512072044075467e-05, + "loss_xval": 0.0003643035888671875, + "num_input_tokens_seen": 160287232, + "step": 2317 + }, + { + "epoch": 144.875, + "grad_norm": 0.8927383093885575, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 160358912, + "step": 2318 + }, + { + "epoch": 144.875, + "loss": 0.000286363298073411, + "loss_ce": 3.07785885524936e-05, + "loss_xval": 0.000255584716796875, + "num_input_tokens_seen": 160358912, + "step": 2318 + }, + { + "epoch": 144.9375, + "grad_norm": 1.0475096611391477, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 160430464, + "step": 2319 + }, + { + "epoch": 144.9375, + "loss": 0.00042939482955262065, + "loss_ce": 2.8851631213910878e-05, + "loss_xval": 0.000400543212890625, + "num_input_tokens_seen": 160430464, + "step": 2319 + }, + { + "epoch": 145.0, + "grad_norm": 
1.2569209874717198, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 160502144, + "step": 2320 + }, + { + "epoch": 145.0, + "loss": 0.0005893835914321244, + "loss_ce": 2.86231061181752e-05, + "loss_xval": 0.000560760498046875, + "num_input_tokens_seen": 160502144, + "step": 2320 + }, + { + "epoch": 145.0625, + "grad_norm": 1.433665622859712, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 160573824, + "step": 2321 + }, + { + "epoch": 145.0625, + "loss": 0.0007936473120935261, + "loss_ce": 2.6893150788964704e-05, + "loss_xval": 0.000766754150390625, + "num_input_tokens_seen": 160573824, + "step": 2321 + }, + { + "epoch": 145.125, + "grad_norm": 1.6109769417554887, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 160645504, + "step": 2322 + }, + { + "epoch": 145.125, + "loss": 0.0008781353244557977, + "loss_ce": 2.7457821488496847e-05, + "loss_xval": 0.000850677490234375, + "num_input_tokens_seen": 160645504, + "step": 2322 + }, + { + "epoch": 145.1875, + "grad_norm": 2.0578679521087397, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 160717056, + "step": 2323 + }, + { + "epoch": 145.1875, + "loss": 0.001340178889222443, + "loss_ce": 2.7923082598135807e-05, + "loss_xval": 0.001312255859375, + "num_input_tokens_seen": 160717056, + "step": 2323 + }, + { + "epoch": 145.25, + "grad_norm": 3.059981768578544, + "learning_rate": 5e-05, + "loss": 0.0028, + "num_input_tokens_seen": 160776128, + "step": 2324 + }, + { + "epoch": 145.25, + "loss": 0.0027596692088991404, + "loss_ce": 2.8345857572276145e-05, + "loss_xval": 0.0027313232421875, + "num_input_tokens_seen": 160776128, + "step": 2324 + }, + { + "epoch": 145.3125, + "grad_norm": 4.841719922986072, + "learning_rate": 5e-05, + "loss": 0.0067, + "num_input_tokens_seen": 160847808, + "step": 2325 + }, + { + "epoch": 145.3125, + "loss": 0.006712182424962521, + "loss_ce": 2.8832586394855753e-05, + "loss_xval": 0.006683349609375, + "num_input_tokens_seen": 160847808, + "step": 2325 + }, + { + "epoch": 145.375, + "grad_norm": 7.633571031981438, + "learning_rate": 5e-05, + "loss": 0.0166, + "num_input_tokens_seen": 160919552, + "step": 2326 + }, + { + "epoch": 145.375, + "loss": 0.01700500398874283, + "loss_ce": 3.7230420275591314e-05, + "loss_xval": 0.0169677734375, + "num_input_tokens_seen": 160919552, + "step": 2326 + }, + { + "epoch": 145.4375, + "grad_norm": 10.222211247378048, + "learning_rate": 5e-05, + "loss": 0.03, + "num_input_tokens_seen": 160991232, + "step": 2327 + }, + { + "epoch": 145.4375, + "loss": 0.029842397198081017, + "loss_ce": 5.724148650187999e-05, + "loss_xval": 0.02978515625, + "num_input_tokens_seen": 160991232, + "step": 2327 + }, + { + "epoch": 145.5, + "grad_norm": 8.580259887263983, + "learning_rate": 5e-05, + "loss": 0.0219, + "num_input_tokens_seen": 161063040, + "step": 2328 + }, + { + "epoch": 145.5, + "loss": 0.02100929617881775, + "loss_ce": 0.0001352726249024272, + "loss_xval": 0.0208740234375, + "num_input_tokens_seen": 161063040, + "step": 2328 + }, + { + "epoch": 145.5625, + "grad_norm": 1.5104478122402045, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_input_tokens_seen": 161134656, + "step": 2329 + }, + { + "epoch": 145.5625, + "loss": 0.0015624697552993894, + "loss_ce": 0.0003341372648719698, + "loss_xval": 0.00122833251953125, + "num_input_tokens_seen": 161134656, + "step": 2329 + }, + { + "epoch": 145.625, + "grad_norm": 5.965994091283684, + "learning_rate": 5e-05, + "loss": 0.0117, + "num_input_tokens_seen": 
161206400, + "step": 2330 + }, + { + "epoch": 145.625, + "loss": 0.011319183744490147, + "loss_ce": 0.000699066964443773, + "loss_xval": 0.0106201171875, + "num_input_tokens_seen": 161206400, + "step": 2330 + }, + { + "epoch": 145.6875, + "grad_norm": 7.511859289700143, + "learning_rate": 5e-05, + "loss": 0.0182, + "num_input_tokens_seen": 161278080, + "step": 2331 + }, + { + "epoch": 145.6875, + "loss": 0.018168123438954353, + "loss_ce": 0.0008341383654624224, + "loss_xval": 0.017333984375, + "num_input_tokens_seen": 161278080, + "step": 2331 + }, + { + "epoch": 145.75, + "grad_norm": 1.5647029181604932, + "learning_rate": 5e-05, + "loss": 0.0019, + "num_input_tokens_seen": 161349632, + "step": 2332 + }, + { + "epoch": 145.75, + "loss": 0.0016789359506219625, + "loss_ce": 0.0007977409404702485, + "loss_xval": 0.000881195068359375, + "num_input_tokens_seen": 161349632, + "step": 2332 + }, + { + "epoch": 145.8125, + "grad_norm": 5.257327290104701, + "learning_rate": 5e-05, + "loss": 0.0097, + "num_input_tokens_seen": 161421312, + "step": 2333 + }, + { + "epoch": 145.8125, + "loss": 0.009456822648644447, + "loss_ce": 0.0006067253416404128, + "loss_xval": 0.00885009765625, + "num_input_tokens_seen": 161421312, + "step": 2333 + }, + { + "epoch": 145.875, + "grad_norm": 4.94192237617853, + "learning_rate": 5e-05, + "loss": 0.008, + "num_input_tokens_seen": 161492928, + "step": 2334 + }, + { + "epoch": 145.875, + "loss": 0.007787576876580715, + "loss_ce": 0.0004023227375000715, + "loss_xval": 0.00738525390625, + "num_input_tokens_seen": 161492928, + "step": 2334 + }, + { + "epoch": 145.9375, + "grad_norm": 1.2994286275910956, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_input_tokens_seen": 161564544, + "step": 2335 + }, + { + "epoch": 145.9375, + "loss": 0.0014959691325202584, + "loss_ce": 0.00026763658388517797, + "loss_xval": 0.00122833251953125, + "num_input_tokens_seen": 161564544, + "step": 2335 + }, + { + "epoch": 146.0, + "grad_norm": 5.7923874654102, + "learning_rate": 5e-05, + "loss": 0.0101, + "num_input_tokens_seen": 161636160, + "step": 2336 + }, + { + "epoch": 146.0, + "loss": 0.010387586429715157, + "loss_ce": 0.00019471513223834336, + "loss_xval": 0.01019287109375, + "num_input_tokens_seen": 161636160, + "step": 2336 + }, + { + "epoch": 146.0625, + "grad_norm": 4.061666459172769, + "learning_rate": 5e-05, + "loss": 0.0052, + "num_input_tokens_seen": 161707776, + "step": 2337 + }, + { + "epoch": 146.0625, + "loss": 0.005342346150428057, + "loss_ce": 0.00015435769455507398, + "loss_xval": 0.00518798828125, + "num_input_tokens_seen": 161707776, + "step": 2337 + }, + { + "epoch": 146.125, + "grad_norm": 1.5401101359509008, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 161779392, + "step": 2338 + }, + { + "epoch": 146.125, + "loss": 0.0011722747003659606, + "loss_ce": 0.0001346770004602149, + "loss_xval": 0.00103759765625, + "num_input_tokens_seen": 161779392, + "step": 2338 + }, + { + "epoch": 146.1875, + "grad_norm": 5.173482264622772, + "learning_rate": 5e-05, + "loss": 0.008, + "num_input_tokens_seen": 161851008, + "step": 2339 + }, + { + "epoch": 146.1875, + "loss": 0.008483229205012321, + "loss_ce": 0.00012141237675677985, + "loss_xval": 0.00836181640625, + "num_input_tokens_seen": 161851008, + "step": 2339 + }, + { + "epoch": 146.25, + "grad_norm": 3.4134114365896546, + "learning_rate": 5e-05, + "loss": 0.0037, + "num_input_tokens_seen": 161922624, + "step": 2340 + }, + { + "epoch": 146.25, + "loss": 0.003694977844133973, + "loss_ce": 
0.00012442127626854926, + "loss_xval": 0.003570556640625, + "num_input_tokens_seen": 161922624, + "step": 2340 + }, + { + "epoch": 146.3125, + "grad_norm": 1.4282072315824867, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 161981760, + "step": 2341 + }, + { + "epoch": 146.3125, + "loss": 0.0013863763306289911, + "loss_ce": 0.0001351555692963302, + "loss_xval": 0.001251220703125, + "num_input_tokens_seen": 161981760, + "step": 2341 + }, + { + "epoch": 146.375, + "grad_norm": 4.02508908047131, + "learning_rate": 5e-05, + "loss": 0.0051, + "num_input_tokens_seen": 162053312, + "step": 2342 + }, + { + "epoch": 146.375, + "loss": 0.005358177702873945, + "loss_ce": 0.0001396719308104366, + "loss_xval": 0.005218505859375, + "num_input_tokens_seen": 162053312, + "step": 2342 + }, + { + "epoch": 146.4375, + "grad_norm": 2.0923616011747375, + "learning_rate": 5e-05, + "loss": 0.002, + "num_input_tokens_seen": 162124928, + "step": 2343 + }, + { + "epoch": 146.4375, + "loss": 0.0017234979895874858, + "loss_ce": 0.00015184275980573148, + "loss_xval": 0.0015716552734375, + "num_input_tokens_seen": 162124928, + "step": 2343 + }, + { + "epoch": 146.5, + "grad_norm": 1.746073179556703, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 162196736, + "step": 2344 + }, + { + "epoch": 146.5, + "loss": 0.0013283995212987065, + "loss_ce": 0.00016110220167320222, + "loss_xval": 0.00116729736328125, + "num_input_tokens_seen": 162196736, + "step": 2344 + }, + { + "epoch": 146.5625, + "grad_norm": 3.4177429260977448, + "learning_rate": 5e-05, + "loss": 0.004, + "num_input_tokens_seen": 162268352, + "step": 2345 + }, + { + "epoch": 146.5625, + "loss": 0.004274115432053804, + "loss_ce": 0.00018476003606338054, + "loss_xval": 0.00408935546875, + "num_input_tokens_seen": 162268352, + "step": 2345 + }, + { + "epoch": 146.625, + "grad_norm": 1.3166728482788537, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 162340032, + "step": 2346 + }, + { + "epoch": 146.625, + "loss": 0.0010671853087842464, + "loss_ce": 0.000178360816789791, + "loss_xval": 0.000888824462890625, + "num_input_tokens_seen": 162340032, + "step": 2346 + }, + { + "epoch": 146.6875, + "grad_norm": 2.075985271349164, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_input_tokens_seen": 162386560, + "step": 2347 + }, + { + "epoch": 146.6875, + "loss": 0.0016639826353639364, + "loss_ce": 0.00018388015450909734, + "loss_xval": 0.0014801025390625, + "num_input_tokens_seen": 162386560, + "step": 2347 + }, + { + "epoch": 146.75, + "grad_norm": 2.8229531028952852, + "learning_rate": 5e-05, + "loss": 0.0028, + "num_input_tokens_seen": 162458176, + "step": 2348 + }, + { + "epoch": 146.75, + "loss": 0.002624648157507181, + "loss_ce": 0.00018324196571484208, + "loss_xval": 0.00244140625, + "num_input_tokens_seen": 162458176, + "step": 2348 + }, + { + "epoch": 146.8125, + "grad_norm": 0.5651772904000949, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 162529856, + "step": 2349 + }, + { + "epoch": 146.8125, + "loss": 0.0006011626101098955, + "loss_ce": 0.00017391651635989547, + "loss_xval": 0.00042724609375, + "num_input_tokens_seen": 162529856, + "step": 2349 + }, + { + "epoch": 146.875, + "grad_norm": 2.122147012123423, + "learning_rate": 5e-05, + "loss": 0.0016, + "num_input_tokens_seen": 162589056, + "step": 2350 + }, + { + "epoch": 146.875, + "loss": 0.0016808974323794246, + "loss_ce": 0.0001550184824736789, + "loss_xval": 0.00152587890625, + "num_input_tokens_seen": 162589056, 
+ "step": 2350 + }, + { + "epoch": 146.9375, + "grad_norm": 2.6642079199279696, + "learning_rate": 5e-05, + "loss": 0.0024, + "num_input_tokens_seen": 162660736, + "step": 2351 + }, + { + "epoch": 146.9375, + "loss": 0.002070722868666053, + "loss_ce": 0.00014811537403147668, + "loss_xval": 0.001922607421875, + "num_input_tokens_seen": 162660736, + "step": 2351 + }, + { + "epoch": 147.0, + "grad_norm": 0.6804208855864794, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 162719872, + "step": 2352 + }, + { + "epoch": 147.0, + "loss": 0.00033290323335677385, + "loss_ce": 0.00012500221782829612, + "loss_xval": 0.0002079010009765625, + "num_input_tokens_seen": 162719872, + "step": 2352 + }, + { + "epoch": 147.0625, + "grad_norm": 2.030024025671497, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_input_tokens_seen": 162791488, + "step": 2353 + }, + { + "epoch": 147.0625, + "loss": 0.0017423724057152867, + "loss_ce": 0.00010968195419991389, + "loss_xval": 0.0016326904296875, + "num_input_tokens_seen": 162791488, + "step": 2353 + }, + { + "epoch": 147.125, + "grad_norm": 3.117284781056457, + "learning_rate": 5e-05, + "loss": 0.0031, + "num_input_tokens_seen": 162863296, + "step": 2354 + }, + { + "epoch": 147.125, + "loss": 0.003642292460426688, + "loss_ce": 0.00010225331061519682, + "loss_xval": 0.0035400390625, + "num_input_tokens_seen": 162863296, + "step": 2354 + }, + { + "epoch": 147.1875, + "grad_norm": 1.6553163910169943, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 162934848, + "step": 2355 + }, + { + "epoch": 147.1875, + "loss": 0.0010774503462016582, + "loss_ce": 9.325839346274734e-05, + "loss_xval": 0.00098419189453125, + "num_input_tokens_seen": 162934848, + "step": 2355 + }, + { + "epoch": 147.25, + "grad_norm": 0.9701196269738218, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 163006528, + "step": 2356 + }, + { + "epoch": 147.25, + "loss": 0.00039144710171967745, + "loss_ce": 8.817866182653233e-05, + "loss_xval": 0.0003032684326171875, + "num_input_tokens_seen": 163006528, + "step": 2356 + }, + { + "epoch": 147.3125, + "grad_norm": 2.569815599589962, + "learning_rate": 5e-05, + "loss": 0.0022, + "num_input_tokens_seen": 163078208, + "step": 2357 + }, + { + "epoch": 147.3125, + "loss": 0.0022654198110103607, + "loss_ce": 8.341299690073356e-05, + "loss_xval": 0.0021820068359375, + "num_input_tokens_seen": 163078208, + "step": 2357 + }, + { + "epoch": 147.375, + "grad_norm": 2.0197217575491, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 163149888, + "step": 2358 + }, + { + "epoch": 147.375, + "loss": 0.0012817709939554334, + "loss_ce": 7.632669439772144e-05, + "loss_xval": 0.0012054443359375, + "num_input_tokens_seen": 163149888, + "step": 2358 + }, + { + "epoch": 147.4375, + "grad_norm": 0.16068342052611778, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 163221632, + "step": 2359 + }, + { + "epoch": 147.4375, + "loss": 0.0002521135611459613, + "loss_ce": 7.28227969375439e-05, + "loss_xval": 0.000179290771484375, + "num_input_tokens_seen": 163221632, + "step": 2359 + }, + { + "epoch": 147.5, + "grad_norm": 2.114308270336181, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 163280768, + "step": 2360 + }, + { + "epoch": 147.5, + "loss": 0.0013699112460017204, + "loss_ce": 6.52848175377585e-05, + "loss_xval": 0.00130462646484375, + "num_input_tokens_seen": 163280768, + "step": 2360 + }, + { + "epoch": 147.5625, + "grad_norm": 2.6919587894761263, + 
"learning_rate": 5e-05, + "loss": 0.0024, + "num_input_tokens_seen": 163352384, + "step": 2361 + }, + { + "epoch": 147.5625, + "loss": 0.002398441778495908, + "loss_ce": 6.384702282957733e-05, + "loss_xval": 0.0023345947265625, + "num_input_tokens_seen": 163352384, + "step": 2361 + }, + { + "epoch": 147.625, + "grad_norm": 1.5939603759228747, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 163424064, + "step": 2362 + }, + { + "epoch": 147.625, + "loss": 0.001161682652309537, + "loss_ce": 6.304984708549455e-05, + "loss_xval": 0.0010986328125, + "num_input_tokens_seen": 163424064, + "step": 2362 + }, + { + "epoch": 147.6875, + "grad_norm": 0.46960435548578017, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 163495744, + "step": 2363 + }, + { + "epoch": 147.6875, + "loss": 0.0001773663389030844, + "loss_ce": 5.815705662826076e-05, + "loss_xval": 0.00011920928955078125, + "num_input_tokens_seen": 163495744, + "step": 2363 + }, + { + "epoch": 147.75, + "grad_norm": 1.94759574317629, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 163567360, + "step": 2364 + }, + { + "epoch": 147.75, + "loss": 0.0014991631032899022, + "loss_ce": 5.720752233173698e-05, + "loss_xval": 0.00144195556640625, + "num_input_tokens_seen": 163567360, + "step": 2364 + }, + { + "epoch": 147.8125, + "grad_norm": 1.7810217172520648, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 163638912, + "step": 2365 + }, + { + "epoch": 147.8125, + "loss": 0.0013713567750528455, + "loss_ce": 5.1471488404786214e-05, + "loss_xval": 0.00131988525390625, + "num_input_tokens_seen": 163638912, + "step": 2365 + }, + { + "epoch": 147.875, + "grad_norm": 0.2965164840091746, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 163698048, + "step": 2366 + }, + { + "epoch": 147.875, + "loss": 0.00019151023298036307, + "loss_ce": 5.4181127779884264e-05, + "loss_xval": 0.0001373291015625, + "num_input_tokens_seen": 163698048, + "step": 2366 + }, + { + "epoch": 147.9375, + "grad_norm": 1.2371936328233522, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 163744640, + "step": 2367 + }, + { + "epoch": 147.9375, + "loss": 0.0007852339185774326, + "loss_ce": 5.662675903295167e-05, + "loss_xval": 0.000728607177734375, + "num_input_tokens_seen": 163744640, + "step": 2367 + }, + { + "epoch": 148.0, + "grad_norm": 1.8258004590520762, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 163816256, + "step": 2368 + }, + { + "epoch": 148.0, + "loss": 0.001227924833074212, + "loss_ce": 5.299809708958492e-05, + "loss_xval": 0.0011749267578125, + "num_input_tokens_seen": 163816256, + "step": 2368 + }, + { + "epoch": 148.0625, + "grad_norm": 1.3028197191754545, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 163887872, + "step": 2369 + }, + { + "epoch": 148.0625, + "loss": 0.0006705644191242754, + "loss_ce": 5.2583487558877096e-05, + "loss_xval": 0.00061798095703125, + "num_input_tokens_seen": 163887872, + "step": 2369 + }, + { + "epoch": 148.125, + "grad_norm": 0.2217973624577362, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 163959424, + "step": 2370 + }, + { + "epoch": 148.125, + "loss": 0.0006748749292455614, + "loss_ce": 4.926458859699778e-05, + "loss_xval": 0.0006256103515625, + "num_input_tokens_seen": 163959424, + "step": 2370 + }, + { + "epoch": 148.1875, + "grad_norm": 1.423999264306517, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 
164031040, + "step": 2371 + }, + { + "epoch": 148.1875, + "loss": 0.0007270199712365866, + "loss_ce": 4.8003839765442535e-05, + "loss_xval": 0.00067901611328125, + "num_input_tokens_seen": 164031040, + "step": 2371 + }, + { + "epoch": 148.25, + "grad_norm": 1.8733486817133544, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_input_tokens_seen": 164090176, + "step": 2372 + }, + { + "epoch": 148.25, + "loss": 0.00138192274607718, + "loss_ce": 4.677875404013321e-05, + "loss_xval": 0.00133514404296875, + "num_input_tokens_seen": 164090176, + "step": 2372 + }, + { + "epoch": 148.3125, + "grad_norm": 1.1859644410491554, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 164161920, + "step": 2373 + }, + { + "epoch": 148.3125, + "loss": 0.0008659652667120099, + "loss_ce": 4.962003367836587e-05, + "loss_xval": 0.00081634521484375, + "num_input_tokens_seen": 164161920, + "step": 2373 + }, + { + "epoch": 148.375, + "grad_norm": 0.18254656740800682, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_input_tokens_seen": 164233664, + "step": 2374 + }, + { + "epoch": 148.375, + "loss": 8.343144145328552e-05, + "loss_ce": 4.2661868064897135e-05, + "loss_xval": 4.076957702636719e-05, + "num_input_tokens_seen": 164233664, + "step": 2374 + }, + { + "epoch": 148.4375, + "grad_norm": 1.4045512358573804, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 164292800, + "step": 2375 + }, + { + "epoch": 148.4375, + "loss": 0.0006769074825569987, + "loss_ce": 4.366773646324873e-05, + "loss_xval": 0.00063323974609375, + "num_input_tokens_seen": 164292800, + "step": 2375 + }, + { + "epoch": 148.5, + "grad_norm": 1.761208723522083, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_input_tokens_seen": 164364416, + "step": 2376 + }, + { + "epoch": 148.5, + "loss": 0.00126472651027143, + "loss_ce": 4.402335616759956e-05, + "loss_xval": 0.001220703125, + "num_input_tokens_seen": 164364416, + "step": 2376 + }, + { + "epoch": 148.5625, + "grad_norm": 1.2882790294587565, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 164436096, + "step": 2377 + }, + { + "epoch": 148.5625, + "loss": 0.0005924896104261279, + "loss_ce": 3.9358477806672454e-05, + "loss_xval": 0.000553131103515625, + "num_input_tokens_seen": 164436096, + "step": 2377 + }, + { + "epoch": 148.625, + "grad_norm": 0.666004216117137, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 164507712, + "step": 2378 + }, + { + "epoch": 148.625, + "loss": 0.0004235539527144283, + "loss_ce": 4.208423706586473e-05, + "loss_xval": 0.0003814697265625, + "num_input_tokens_seen": 164507712, + "step": 2378 + }, + { + "epoch": 148.6875, + "grad_norm": 0.2717273144022413, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 164566784, + "step": 2379 + }, + { + "epoch": 148.6875, + "loss": 0.0004639588587451726, + "loss_ce": 4.43421486124862e-05, + "loss_xval": 0.00041961669921875, + "num_input_tokens_seen": 164566784, + "step": 2379 + }, + { + "epoch": 148.75, + "grad_norm": 1.1169222811849129, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 164625856, + "step": 2380 + }, + { + "epoch": 148.75, + "loss": 0.0005799496429972351, + "loss_ce": 4.207733218208887e-05, + "loss_xval": 0.000537872314453125, + "num_input_tokens_seen": 164625856, + "step": 2380 + }, + { + "epoch": 148.8125, + "grad_norm": 1.9921527724459633, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_input_tokens_seen": 164697472, + "step": 2381 + }, + { + "epoch": 148.8125, + "loss": 
0.0016376186395063996, + "loss_ce": 4.307520794100128e-05, + "loss_xval": 0.00159454345703125, + "num_input_tokens_seen": 164697472, + "step": 2381 + }, + { + "epoch": 148.875, + "grad_norm": 2.5562549292946337, + "learning_rate": 5e-05, + "loss": 0.0022, + "num_input_tokens_seen": 164756480, + "step": 2382 + }, + { + "epoch": 148.875, + "loss": 0.002530452562496066, + "loss_ce": 4.326990165282041e-05, + "loss_xval": 0.0024871826171875, + "num_input_tokens_seen": 164756480, + "step": 2382 + }, + { + "epoch": 148.9375, + "grad_norm": 2.5740588770110158, + "learning_rate": 5e-05, + "loss": 0.0022, + "num_input_tokens_seen": 164828032, + "step": 2383 + }, + { + "epoch": 148.9375, + "loss": 0.0023500563111156225, + "loss_ce": 4.597923543769866e-05, + "loss_xval": 0.0023040771484375, + "num_input_tokens_seen": 164828032, + "step": 2383 + }, + { + "epoch": 149.0, + "grad_norm": 1.9237964044249054, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 164899648, + "step": 2384 + }, + { + "epoch": 149.0, + "loss": 0.001942421542480588, + "loss_ce": 5.033174966229126e-05, + "loss_xval": 0.00189208984375, + "num_input_tokens_seen": 164899648, + "step": 2384 + }, + { + "epoch": 149.0625, + "grad_norm": 0.9794175832805015, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 164946240, + "step": 2385 + }, + { + "epoch": 149.0625, + "loss": 0.00033468782203271985, + "loss_ce": 5.430756209534593e-05, + "loss_xval": 0.0002803802490234375, + "num_input_tokens_seen": 164946240, + "step": 2385 + }, + { + "epoch": 149.125, + "grad_norm": 0.4144373637048414, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 165017856, + "step": 2386 + }, + { + "epoch": 149.125, + "loss": 0.0006403362494893372, + "loss_ce": 5.6687582400627434e-05, + "loss_xval": 0.000583648681640625, + "num_input_tokens_seen": 165017856, + "step": 2386 + }, + { + "epoch": 149.1875, + "grad_norm": 0.23057807805512553, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 165077056, + "step": 2387 + }, + { + "epoch": 149.1875, + "loss": 0.00042012392077594995, + "loss_ce": 5.963501462247223e-05, + "loss_xval": 0.0003604888916015625, + "num_input_tokens_seen": 165077056, + "step": 2387 + }, + { + "epoch": 149.25, + "grad_norm": 0.11669737874547069, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 165148608, + "step": 2388 + }, + { + "epoch": 149.25, + "loss": 0.00020264298655092716, + "loss_ce": 5.959184272796847e-05, + "loss_xval": 0.0001430511474609375, + "num_input_tokens_seen": 165148608, + "step": 2388 + }, + { + "epoch": 149.3125, + "grad_norm": 0.4484633697241466, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 165220224, + "step": 2389 + }, + { + "epoch": 149.3125, + "loss": 0.0004950024886056781, + "loss_ce": 6.775640940759331e-05, + "loss_xval": 0.00042724609375, + "num_input_tokens_seen": 165220224, + "step": 2389 + }, + { + "epoch": 149.375, + "grad_norm": 1.0541395280626393, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 165291904, + "step": 2390 + }, + { + "epoch": 149.375, + "loss": 0.0005390362348407507, + "loss_ce": 6.0291731642792e-05, + "loss_xval": 0.0004787445068359375, + "num_input_tokens_seen": 165291904, + "step": 2390 + }, + { + "epoch": 149.4375, + "grad_norm": 2.2593428653272807, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 165363648, + "step": 2391 + }, + { + "epoch": 149.4375, + "loss": 0.0015220154309645295, + "loss_ce": 5.7171691878465936e-05, + 
"loss_xval": 0.00146484375, + "num_input_tokens_seen": 165363648, + "step": 2391 + }, + { + "epoch": 149.5, + "grad_norm": 3.7638655610361695, + "learning_rate": 5e-05, + "loss": 0.0044, + "num_input_tokens_seen": 165435264, + "step": 2392 + }, + { + "epoch": 149.5, + "loss": 0.004354399163275957, + "loss_ce": 5.142059671925381e-05, + "loss_xval": 0.004302978515625, + "num_input_tokens_seen": 165435264, + "step": 2392 + }, + { + "epoch": 149.5625, + "grad_norm": 5.0545420851926, + "learning_rate": 5e-05, + "loss": 0.0077, + "num_input_tokens_seen": 165506816, + "step": 2393 + }, + { + "epoch": 149.5625, + "loss": 0.00762578472495079, + "loss_ce": 5.742551002185792e-05, + "loss_xval": 0.007568359375, + "num_input_tokens_seen": 165506816, + "step": 2393 + }, + { + "epoch": 149.625, + "grad_norm": 5.500015390501117, + "learning_rate": 5e-05, + "loss": 0.009, + "num_input_tokens_seen": 165578496, + "step": 2394 + }, + { + "epoch": 149.625, + "loss": 0.008792513981461525, + "loss_ce": 6.448627391364425e-05, + "loss_xval": 0.00872802734375, + "num_input_tokens_seen": 165578496, + "step": 2394 + }, + { + "epoch": 149.6875, + "grad_norm": 4.239923001902639, + "learning_rate": 5e-05, + "loss": 0.0059, + "num_input_tokens_seen": 165637632, + "step": 2395 + }, + { + "epoch": 149.6875, + "loss": 0.006338251288980246, + "loss_ce": 8.214791887439787e-05, + "loss_xval": 0.006256103515625, + "num_input_tokens_seen": 165637632, + "step": 2395 + }, + { + "epoch": 149.75, + "grad_norm": 1.4885826961334672, + "learning_rate": 5e-05, + "loss": 0.0019, + "num_input_tokens_seen": 165709312, + "step": 2396 + }, + { + "epoch": 149.75, + "loss": 0.002050133654847741, + "loss_ce": 0.00011226750939385965, + "loss_xval": 0.0019378662109375, + "num_input_tokens_seen": 165709312, + "step": 2396 + }, + { + "epoch": 149.8125, + "grad_norm": 0.8805698261264313, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_input_tokens_seen": 165780864, + "step": 2397 + }, + { + "epoch": 149.8125, + "loss": 0.0015474815154448152, + "loss_ce": 0.00016656114894431084, + "loss_xval": 0.00138092041015625, + "num_input_tokens_seen": 165780864, + "step": 2397 + }, + { + "epoch": 149.875, + "grad_norm": 1.5547521412813465, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 165852672, + "step": 2398 + }, + { + "epoch": 149.875, + "loss": 0.0012131972471252084, + "loss_ce": 0.00021374659263528883, + "loss_xval": 0.00099945068359375, + "num_input_tokens_seen": 165852672, + "step": 2398 + }, + { + "epoch": 149.9375, + "grad_norm": 1.5844453429196825, + "learning_rate": 5e-05, + "loss": 0.0022, + "num_input_tokens_seen": 165924352, + "step": 2399 + }, + { + "epoch": 149.9375, + "loss": 0.0024127454962581396, + "loss_ce": 0.00026125620934180915, + "loss_xval": 0.0021514892578125, + "num_input_tokens_seen": 165924352, + "step": 2399 + }, + { + "epoch": 150.0, + "grad_norm": 1.3534716861597587, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 165996032, + "step": 2400 + }, + { + "epoch": 150.0, + "loss": 0.0009662243537604809, + "loss_ce": 0.0002681347250472754, + "loss_xval": 0.000698089599609375, + "num_input_tokens_seen": 165996032, + "step": 2400 + }, + { + "epoch": 150.0625, + "grad_norm": 1.0737007932266736, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 166067584, + "step": 2401 + }, + { + "epoch": 150.0625, + "loss": 0.0017657603602856398, + "loss_ce": 0.0002475108194630593, + "loss_xval": 0.00151824951171875, + "num_input_tokens_seen": 166067584, + "step": 2401 + }, + { + 
"epoch": 150.125, + "grad_norm": 0.6989581212698932, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 166126720, + "step": 2402 + }, + { + "epoch": 150.125, + "loss": 0.0008287295931950212, + "loss_ce": 0.00019930452981498092, + "loss_xval": 0.000629425048828125, + "num_input_tokens_seen": 166126720, + "step": 2402 + }, + { + "epoch": 150.1875, + "grad_norm": 0.5112130537543741, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 166198272, + "step": 2403 + }, + { + "epoch": 150.1875, + "loss": 0.0005174290854483843, + "loss_ce": 0.00014358872431330383, + "loss_xval": 0.00037384033203125, + "num_input_tokens_seen": 166198272, + "step": 2403 + }, + { + "epoch": 150.25, + "grad_norm": 0.3919207549294726, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 166270016, + "step": 2404 + }, + { + "epoch": 150.25, + "loss": 0.0008279854082502425, + "loss_ce": 0.00011082231503678486, + "loss_xval": 0.0007171630859375, + "num_input_tokens_seen": 166270016, + "step": 2404 + }, + { + "epoch": 150.3125, + "grad_norm": 0.10739100581299145, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_input_tokens_seen": 166341760, + "step": 2405 + }, + { + "epoch": 150.3125, + "loss": 0.00023223445168696344, + "loss_ce": 8.346126560354605e-05, + "loss_xval": 0.000148773193359375, + "num_input_tokens_seen": 166341760, + "step": 2405 + }, + { + "epoch": 150.375, + "grad_norm": 0.45975874155091206, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 166413376, + "step": 2406 + }, + { + "epoch": 150.375, + "loss": 0.0007082895026542246, + "loss_ce": 6.360567203955725e-05, + "loss_xval": 0.000644683837890625, + "num_input_tokens_seen": 166413376, + "step": 2406 + }, + { + "epoch": 150.4375, + "grad_norm": 0.9410291482374249, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 166485056, + "step": 2407 + }, + { + "epoch": 150.4375, + "loss": 0.0007459745393134654, + "loss_ce": 4.407024243846536e-05, + "loss_xval": 0.000701904296875, + "num_input_tokens_seen": 166485056, + "step": 2407 + }, + { + "epoch": 150.5, + "grad_norm": 1.6421583770726258, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 166556672, + "step": 2408 + }, + { + "epoch": 150.5, + "loss": 0.0009627325343899429, + "loss_ce": 3.576111339498311e-05, + "loss_xval": 0.000926971435546875, + "num_input_tokens_seen": 166556672, + "step": 2408 + }, + { + "epoch": 150.5625, + "grad_norm": 2.7839599851905446, + "learning_rate": 5e-05, + "loss": 0.0028, + "num_input_tokens_seen": 166628224, + "step": 2409 + }, + { + "epoch": 150.5625, + "loss": 0.0030045805033296347, + "loss_ce": 2.9116721634636633e-05, + "loss_xval": 0.0029754638671875, + "num_input_tokens_seen": 166628224, + "step": 2409 + }, + { + "epoch": 150.625, + "grad_norm": 5.54840185171238, + "learning_rate": 5e-05, + "loss": 0.0092, + "num_input_tokens_seen": 166699904, + "step": 2410 + }, + { + "epoch": 150.625, + "loss": 0.009306657128036022, + "loss_ce": 2.9313479899428785e-05, + "loss_xval": 0.00927734375, + "num_input_tokens_seen": 166699904, + "step": 2410 + }, + { + "epoch": 150.6875, + "grad_norm": 10.46276851301423, + "learning_rate": 5e-05, + "loss": 0.0329, + "num_input_tokens_seen": 166771456, + "step": 2411 + }, + { + "epoch": 150.6875, + "loss": 0.03250272199511528, + "loss_ce": 3.201870640623383e-05, + "loss_xval": 0.032470703125, + "num_input_tokens_seen": 166771456, + "step": 2411 + }, + { + "epoch": 150.75, + "grad_norm": 12.49379731434549, + "learning_rate": 5e-05, + 
"loss": 0.0504, + "num_input_tokens_seen": 166843136, + "step": 2412 + }, + { + "epoch": 150.75, + "loss": 0.048925530165433884, + "loss_ce": 9.740592940943316e-05, + "loss_xval": 0.048828125, + "num_input_tokens_seen": 166843136, + "step": 2412 + }, + { + "epoch": 150.8125, + "grad_norm": 8.792072173625062, + "learning_rate": 5e-05, + "loss": 0.0248, + "num_input_tokens_seen": 166914688, + "step": 2413 + }, + { + "epoch": 150.8125, + "loss": 0.024670902639627457, + "loss_ce": 0.0009892623638734221, + "loss_xval": 0.023681640625, + "num_input_tokens_seen": 166914688, + "step": 2413 + }, + { + "epoch": 150.875, + "grad_norm": 4.2342744701784, + "learning_rate": 5e-05, + "loss": 0.0065, + "num_input_tokens_seen": 166986304, + "step": 2414 + }, + { + "epoch": 150.875, + "loss": 0.00638669403269887, + "loss_ce": 0.000863012217450887, + "loss_xval": 0.005523681640625, + "num_input_tokens_seen": 166986304, + "step": 2414 + }, + { + "epoch": 150.9375, + "grad_norm": 0.4366523557031741, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 167057984, + "step": 2415 + }, + { + "epoch": 150.9375, + "loss": 0.0010362620232626796, + "loss_ce": 0.0005537028191611171, + "loss_xval": 0.0004825592041015625, + "num_input_tokens_seen": 167057984, + "step": 2415 + }, + { + "epoch": 151.0, + "grad_norm": 6.031583553086854, + "learning_rate": 5e-05, + "loss": 0.011, + "num_input_tokens_seen": 167129664, + "step": 2416 + }, + { + "epoch": 151.0, + "loss": 0.011357029899954796, + "loss_ce": 0.0003707021242007613, + "loss_xval": 0.010986328125, + "num_input_tokens_seen": 167129664, + "step": 2416 + }, + { + "epoch": 151.0625, + "grad_norm": 15.787760958608375, + "learning_rate": 5e-05, + "loss": 0.0699, + "num_input_tokens_seen": 167201280, + "step": 2417 + }, + { + "epoch": 151.0625, + "loss": 0.07035034149885178, + "loss_ce": 0.0005261220503598452, + "loss_xval": 0.06982421875, + "num_input_tokens_seen": 167201280, + "step": 2417 + }, + { + "epoch": 151.125, + "grad_norm": 17.264080440378386, + "learning_rate": 5e-05, + "loss": 0.0875, + "num_input_tokens_seen": 167272896, + "step": 2418 + }, + { + "epoch": 151.125, + "loss": 0.08509006351232529, + "loss_ce": 0.0001291264343308285, + "loss_xval": 0.0849609375, + "num_input_tokens_seen": 167272896, + "step": 2418 + }, + { + "epoch": 151.1875, + "grad_norm": 5.293257878290243, + "learning_rate": 5e-05, + "loss": 0.0098, + "num_input_tokens_seen": 167344640, + "step": 2419 + }, + { + "epoch": 151.1875, + "loss": 0.009484825655817986, + "loss_ce": 0.0001464468368794769, + "loss_xval": 0.00933837890625, + "num_input_tokens_seen": 167344640, + "step": 2419 + }, + { + "epoch": 151.25, + "grad_norm": 8.234025956692294, + "learning_rate": 5e-05, + "loss": 0.0196, + "num_input_tokens_seen": 167403840, + "step": 2420 + }, + { + "epoch": 151.25, + "loss": 0.020006585866212845, + "loss_ce": 0.00023119535762816668, + "loss_xval": 0.019775390625, + "num_input_tokens_seen": 167403840, + "step": 2420 + }, + { + "epoch": 151.3125, + "grad_norm": 26.46834646209185, + "learning_rate": 5e-05, + "loss": 0.1832, + "num_input_tokens_seen": 167475392, + "step": 2421 + }, + { + "epoch": 151.3125, + "loss": 0.1851268708705902, + "loss_ce": 0.0005565655883401632, + "loss_xval": 0.1845703125, + "num_input_tokens_seen": 167475392, + "step": 2421 + }, + { + "epoch": 151.375, + "grad_norm": 22.697539209910644, + "learning_rate": 5e-05, + "loss": 0.1425, + "num_input_tokens_seen": 167534400, + "step": 2422 + }, + { + "epoch": 151.375, + "loss": 0.1417573243379593, + "loss_ce": 
0.00015575841825921088, + "loss_xval": 0.1416015625, + "num_input_tokens_seen": 167534400, + "step": 2422 + }, + { + "epoch": 151.4375, + "grad_norm": 7.6623304718247995, + "learning_rate": 5e-05, + "loss": 0.0178, + "num_input_tokens_seen": 167593472, + "step": 2423 + }, + { + "epoch": 151.4375, + "loss": 0.017808010801672935, + "loss_ce": 0.00022988590353634208, + "loss_xval": 0.017578125, + "num_input_tokens_seen": 167593472, + "step": 2423 + }, + { + "epoch": 151.5, + "grad_norm": 414.1100107255893, + "learning_rate": 5e-05, + "loss": 0.9645, + "num_input_tokens_seen": 167652608, + "step": 2424 + }, + { + "epoch": 151.5, + "loss": 1.7413725852966309, + "loss_ce": 0.030435139313340187, + "loss_xval": 1.7109375, + "num_input_tokens_seen": 167652608, + "step": 2424 + }, + { + "epoch": 151.5625, + "grad_norm": 19.262266220946344, + "learning_rate": 5e-05, + "loss": 0.0932, + "num_input_tokens_seen": 167724288, + "step": 2425 + }, + { + "epoch": 151.5625, + "loss": 0.09090714901685715, + "loss_ce": 0.0005751141579821706, + "loss_xval": 0.09033203125, + "num_input_tokens_seen": 167724288, + "step": 2425 + }, + { + "epoch": 151.625, + "grad_norm": 24.36110882222433, + "learning_rate": 5e-05, + "loss": 0.1085, + "num_input_tokens_seen": 167795904, + "step": 2426 + }, + { + "epoch": 151.625, + "loss": 0.12360428273677826, + "loss_ce": 0.08698318898677826, + "loss_xval": 0.03662109375, + "num_input_tokens_seen": 167795904, + "step": 2426 + }, + { + "epoch": 151.6875, + "grad_norm": 36.983860252487354, + "learning_rate": 5e-05, + "loss": 0.5427, + "num_input_tokens_seen": 167867456, + "step": 2427 + }, + { + "epoch": 151.6875, + "loss": 0.5592447519302368, + "loss_ce": 0.4049479067325592, + "loss_xval": 0.154296875, + "num_input_tokens_seen": 167867456, + "step": 2427 + }, + { + "epoch": 151.75, + "grad_norm": 16.237935621266285, + "learning_rate": 5e-05, + "loss": 0.4207, + "num_input_tokens_seen": 167939136, + "step": 2428 + }, + { + "epoch": 151.75, + "loss": 0.4261643588542938, + "loss_ce": 0.4107834994792938, + "loss_xval": 0.015380859375, + "num_input_tokens_seen": 167939136, + "step": 2428 + }, + { + "epoch": 151.8125, + "grad_norm": 19.680429926189632, + "learning_rate": 5e-05, + "loss": 0.4199, + "num_input_tokens_seen": 168010752, + "step": 2429 + }, + { + "epoch": 151.8125, + "loss": 0.4241836667060852, + "loss_ce": 0.3348281979560852, + "loss_xval": 0.08935546875, + "num_input_tokens_seen": 168010752, + "step": 2429 + }, + { + "epoch": 151.875, + "grad_norm": 18.703801220606653, + "learning_rate": 5e-05, + "loss": 0.3537, + "num_input_tokens_seen": 168069824, + "step": 2430 + }, + { + "epoch": 151.875, + "loss": 0.35095417499542236, + "loss_ce": 0.26648151874542236, + "loss_xval": 0.08447265625, + "num_input_tokens_seen": 168069824, + "step": 2430 + }, + { + "epoch": 151.9375, + "grad_norm": 15.645070978296983, + "learning_rate": 5e-05, + "loss": 0.3064, + "num_input_tokens_seen": 168141440, + "step": 2431 + }, + { + "epoch": 151.9375, + "loss": 0.30103811621665955, + "loss_ce": 0.26881155371665955, + "loss_xval": 0.0322265625, + "num_input_tokens_seen": 168141440, + "step": 2431 + }, + { + "epoch": 152.0, + "grad_norm": 21.225905190040425, + "learning_rate": 5e-05, + "loss": 0.2675, + "num_input_tokens_seen": 168213056, + "step": 2432 + }, + { + "epoch": 152.0, + "loss": 0.2683914303779602, + "loss_ce": 0.1848953515291214, + "loss_xval": 0.08349609375, + "num_input_tokens_seen": 168213056, + "step": 2432 + }, + { + "epoch": 152.0625, + "grad_norm": 12.611375752594181, + 
"learning_rate": 5e-05, + "loss": 0.1306, + "num_input_tokens_seen": 168272192, + "step": 2433 + }, + { + "epoch": 152.0625, + "loss": 0.13391664624214172, + "loss_ce": 0.09827210754156113, + "loss_xval": 0.03564453125, + "num_input_tokens_seen": 168272192, + "step": 2433 + }, + { + "epoch": 152.125, + "grad_norm": 17.538312214800563, + "learning_rate": 5e-05, + "loss": 0.1296, + "num_input_tokens_seen": 168343808, + "step": 2434 + }, + { + "epoch": 152.125, + "loss": 0.1320444941520691, + "loss_ce": 0.07052105665206909, + "loss_xval": 0.0615234375, + "num_input_tokens_seen": 168343808, + "step": 2434 + }, + { + "epoch": 152.1875, + "grad_norm": 19.61837353575689, + "learning_rate": 5e-05, + "loss": 0.1222, + "num_input_tokens_seen": 168415360, + "step": 2435 + }, + { + "epoch": 152.1875, + "loss": 0.12624847888946533, + "loss_ce": 0.047146908938884735, + "loss_xval": 0.0791015625, + "num_input_tokens_seen": 168415360, + "step": 2435 + }, + { + "epoch": 152.25, + "grad_norm": 8.839200180151096, + "learning_rate": 5e-05, + "loss": 0.0516, + "num_input_tokens_seen": 168487104, + "step": 2436 + }, + { + "epoch": 152.25, + "loss": 0.050722721964120865, + "loss_ce": 0.033877018839120865, + "loss_xval": 0.016845703125, + "num_input_tokens_seen": 168487104, + "step": 2436 + }, + { + "epoch": 152.3125, + "grad_norm": 29.340307873660837, + "learning_rate": 5e-05, + "loss": 0.1746, + "num_input_tokens_seen": 168558720, + "step": 2437 + }, + { + "epoch": 152.3125, + "loss": 0.17294269800186157, + "loss_ce": 0.024505192413926125, + "loss_xval": 0.1484375, + "num_input_tokens_seen": 168558720, + "step": 2437 + }, + { + "epoch": 152.375, + "grad_norm": 19.262572996746773, + "learning_rate": 5e-05, + "loss": 0.0831, + "num_input_tokens_seen": 168630336, + "step": 2438 + }, + { + "epoch": 152.375, + "loss": 0.08717883378267288, + "loss_ce": 0.021260865032672882, + "loss_xval": 0.06591796875, + "num_input_tokens_seen": 168630336, + "step": 2438 + }, + { + "epoch": 152.4375, + "grad_norm": 9.476712746612705, + "learning_rate": 5e-05, + "loss": 0.0225, + "num_input_tokens_seen": 168702080, + "step": 2439 + }, + { + "epoch": 152.4375, + "loss": 0.02292168326675892, + "loss_ce": 0.0061980499885976315, + "loss_xval": 0.0167236328125, + "num_input_tokens_seen": 168702080, + "step": 2439 + }, + { + "epoch": 152.5, + "grad_norm": 27.06563178685452, + "learning_rate": 5e-05, + "loss": 0.1195, + "num_input_tokens_seen": 168773760, + "step": 2440 + }, + { + "epoch": 152.5, + "loss": 0.11924093216657639, + "loss_ce": 0.004006555303931236, + "loss_xval": 0.115234375, + "num_input_tokens_seen": 168773760, + "step": 2440 + }, + { + "epoch": 152.5625, + "grad_norm": 16.280688867399473, + "learning_rate": 5e-05, + "loss": 0.0452, + "num_input_tokens_seen": 168845376, + "step": 2441 + }, + { + "epoch": 152.5625, + "loss": 0.04708874225616455, + "loss_ce": 0.00265514780767262, + "loss_xval": 0.04443359375, + "num_input_tokens_seen": 168845376, + "step": 2441 + }, + { + "epoch": 152.625, + "grad_norm": 9.938469294554265, + "learning_rate": 5e-05, + "loss": 0.0191, + "num_input_tokens_seen": 168904512, + "step": 2442 + }, + { + "epoch": 152.625, + "loss": 0.018490314483642578, + "loss_ce": 0.003231526119634509, + "loss_xval": 0.0152587890625, + "num_input_tokens_seen": 168904512, + "step": 2442 + }, + { + "epoch": 152.6875, + "grad_norm": 23.010114349229884, + "learning_rate": 5e-05, + "loss": 0.0849, + "num_input_tokens_seen": 168963520, + "step": 2443 + }, + { + "epoch": 152.6875, + "loss": 0.08742774277925491, + "loss_ce": 
0.0029550848994404078, + "loss_xval": 0.08447265625, + "num_input_tokens_seen": 168963520, + "step": 2443 + }, + { + "epoch": 152.75, + "grad_norm": 9.040468310430352, + "learning_rate": 5e-05, + "loss": 0.0153, + "num_input_tokens_seen": 169035072, + "step": 2444 + }, + { + "epoch": 152.75, + "loss": 0.016200613230466843, + "loss_ce": 0.0022235626820474863, + "loss_xval": 0.01397705078125, + "num_input_tokens_seen": 169035072, + "step": 2444 + }, + { + "epoch": 152.8125, + "grad_norm": 15.46627308177994, + "learning_rate": 5e-05, + "loss": 0.0391, + "num_input_tokens_seen": 169106624, + "step": 2445 + }, + { + "epoch": 152.8125, + "loss": 0.037854935973882675, + "loss_ce": 0.001233842340297997, + "loss_xval": 0.03662109375, + "num_input_tokens_seen": 169106624, + "step": 2445 + }, + { + "epoch": 152.875, + "grad_norm": 17.74470932258292, + "learning_rate": 5e-05, + "loss": 0.0512, + "num_input_tokens_seen": 169178304, + "step": 2446 + }, + { + "epoch": 152.875, + "loss": 0.049263544380664825, + "loss_ce": 0.00043541795457713306, + "loss_xval": 0.048828125, + "num_input_tokens_seen": 169178304, + "step": 2446 + }, + { + "epoch": 152.9375, + "grad_norm": 3.983319638379511, + "learning_rate": 5e-05, + "loss": 0.0033, + "num_input_tokens_seen": 169250048, + "step": 2447 + }, + { + "epoch": 152.9375, + "loss": 0.00340116024017334, + "loss_ce": 0.0002883673587348312, + "loss_xval": 0.00311279296875, + "num_input_tokens_seen": 169250048, + "step": 2447 + }, + { + "epoch": 153.0, + "grad_norm": 17.052741587322256, + "learning_rate": 5e-05, + "loss": 0.0486, + "num_input_tokens_seen": 169296640, + "step": 2448 + }, + { + "epoch": 153.0, + "loss": 0.04775082692503929, + "loss_ce": 0.00014340641791932285, + "loss_xval": 0.047607421875, + "num_input_tokens_seen": 169296640, + "step": 2448 + }, + { + "epoch": 153.0625, + "grad_norm": 1.6488709856004677, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 169368192, + "step": 2449 + }, + { + "epoch": 153.0625, + "loss": 0.0007192580960690975, + "loss_ce": 7.457426545443013e-05, + "loss_xval": 0.000644683837890625, + "num_input_tokens_seen": 169368192, + "step": 2449 + }, + { + "epoch": 153.125, + "grad_norm": 14.82132478454114, + "learning_rate": 5e-05, + "loss": 0.038, + "num_input_tokens_seen": 169439744, + "step": 2450 + }, + { + "epoch": 153.125, + "loss": 0.0388912670314312, + "loss_ce": 7.290825305972248e-05, + "loss_xval": 0.038818359375, + "num_input_tokens_seen": 169439744, + "step": 2450 + }, + { + "epoch": 153.1875, + "grad_norm": 4.269698392907984, + "learning_rate": 5e-05, + "loss": 0.0039, + "num_input_tokens_seen": 169511424, + "step": 2451 + }, + { + "epoch": 153.1875, + "loss": 0.0037585701793432236, + "loss_ce": 3.5425644455244765e-05, + "loss_xval": 0.00372314453125, + "num_input_tokens_seen": 169511424, + "step": 2451 + }, + { + "epoch": 153.25, + "grad_norm": 11.674239325590117, + "learning_rate": 5e-05, + "loss": 0.0252, + "num_input_tokens_seen": 169583104, + "step": 2452 + }, + { + "epoch": 153.25, + "loss": 0.02480572834610939, + "loss_ce": 2.5454932256252505e-05, + "loss_xval": 0.0247802734375, + "num_input_tokens_seen": 169583104, + "step": 2452 + }, + { + "epoch": 153.3125, + "grad_norm": 3.212616261973195, + "learning_rate": 5e-05, + "loss": 0.0022, + "num_input_tokens_seen": 169654784, + "step": 2453 + }, + { + "epoch": 153.3125, + "loss": 0.0025852012913674116, + "loss_ce": 2.172468521166593e-05, + "loss_xval": 0.0025634765625, + "num_input_tokens_seen": 169654784, + "step": 2453 + }, + { + "epoch": 
153.375, + "grad_norm": 10.599579969728131, + "learning_rate": 5e-05, + "loss": 0.0206, + "num_input_tokens_seen": 169726464, + "step": 2454 + }, + { + "epoch": 153.375, + "loss": 0.020038805902004242, + "loss_ce": 1.9274504666100256e-05, + "loss_xval": 0.02001953125, + "num_input_tokens_seen": 169726464, + "step": 2454 + }, + { + "epoch": 153.4375, + "grad_norm": 2.329570429065563, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 169798144, + "step": 2455 + }, + { + "epoch": 153.4375, + "loss": 0.0016645665746182203, + "loss_ce": 1.6617326764389873e-05, + "loss_xval": 0.00164794921875, + "num_input_tokens_seen": 169798144, + "step": 2455 + }, + { + "epoch": 153.5, + "grad_norm": 9.745183346329203, + "learning_rate": 5e-05, + "loss": 0.0184, + "num_input_tokens_seen": 169869760, + "step": 2456 + }, + { + "epoch": 153.5, + "loss": 0.017713962122797966, + "loss_ce": 1.3767366908723488e-05, + "loss_xval": 0.0177001953125, + "num_input_tokens_seen": 169869760, + "step": 2456 + }, + { + "epoch": 153.5625, + "grad_norm": 1.2033565802466124, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 169941568, + "step": 2457 + }, + { + "epoch": 153.5625, + "loss": 0.0006536798318848014, + "loss_ce": 1.2810680345864967e-05, + "loss_xval": 0.000640869140625, + "num_input_tokens_seen": 169941568, + "step": 2457 + }, + { + "epoch": 153.625, + "grad_norm": 8.976723632694345, + "learning_rate": 5e-05, + "loss": 0.0157, + "num_input_tokens_seen": 170013248, + "step": 2458 + }, + { + "epoch": 153.625, + "loss": 0.016246860846877098, + "loss_ce": 1.150855405285256e-05, + "loss_xval": 0.0162353515625, + "num_input_tokens_seen": 170013248, + "step": 2458 + }, + { + "epoch": 153.6875, + "grad_norm": 0.5399703579578968, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 170085056, + "step": 2459 + }, + { + "epoch": 153.6875, + "loss": 0.00033668929245322943, + "loss_ce": 1.244001578015741e-05, + "loss_xval": 0.000324249267578125, + "num_input_tokens_seen": 170085056, + "step": 2459 + }, + { + "epoch": 153.75, + "grad_norm": 7.471261265286304, + "learning_rate": 5e-05, + "loss": 0.0113, + "num_input_tokens_seen": 170156736, + "step": 2460 + }, + { + "epoch": 153.75, + "loss": 0.011607798747718334, + "loss_ce": 1.1119185728603043e-05, + "loss_xval": 0.0115966796875, + "num_input_tokens_seen": 170156736, + "step": 2460 + }, + { + "epoch": 153.8125, + "grad_norm": 1.004765225632213, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_input_tokens_seen": 170215872, + "step": 2461 + }, + { + "epoch": 153.8125, + "loss": 0.0003412454097997397, + "loss_ce": 1.1274102689640131e-05, + "loss_xval": 0.0003299713134765625, + "num_input_tokens_seen": 170215872, + "step": 2461 + }, + { + "epoch": 153.875, + "grad_norm": 6.8509049121426315, + "learning_rate": 5e-05, + "loss": 0.0095, + "num_input_tokens_seen": 170287488, + "step": 2462 + }, + { + "epoch": 153.875, + "loss": 0.009349527768790722, + "loss_ce": 1.1149150850542355e-05, + "loss_xval": 0.00933837890625, + "num_input_tokens_seen": 170287488, + "step": 2462 + }, + { + "epoch": 153.9375, + "grad_norm": 1.4588749386358972, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 170359168, + "step": 2463 + }, + { + "epoch": 153.9375, + "loss": 0.0006519771413877606, + "loss_ce": 1.1107976206403691e-05, + "loss_xval": 0.000640869140625, + "num_input_tokens_seen": 170359168, + "step": 2463 + }, + { + "epoch": 154.0, + "grad_norm": 5.5031631339173215, + "learning_rate": 5e-05, + "loss": 0.0063, + 
"num_input_tokens_seen": 170430848, + "step": 2464 + }, + { + "epoch": 154.0, + "loss": 0.006387988105416298, + "loss_ce": 9.814341865421738e-06, + "loss_xval": 0.006378173828125, + "num_input_tokens_seen": 170430848, + "step": 2464 + }, + { + "epoch": 154.0625, + "grad_norm": 2.0060406491376064, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 170502400, + "step": 2465 + }, + { + "epoch": 154.0625, + "loss": 0.0010102284140884876, + "loss_ce": 1.077772230928531e-05, + "loss_xval": 0.00099945068359375, + "num_input_tokens_seen": 170502400, + "step": 2465 + }, + { + "epoch": 154.125, + "grad_norm": 4.1404109225822765, + "learning_rate": 5e-05, + "loss": 0.0037, + "num_input_tokens_seen": 170573952, + "step": 2466 + }, + { + "epoch": 154.125, + "loss": 0.003657120745629072, + "loss_ce": 1.0270115126331802e-05, + "loss_xval": 0.0036468505859375, + "num_input_tokens_seen": 170573952, + "step": 2466 + }, + { + "epoch": 154.1875, + "grad_norm": 2.731704866940803, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_input_tokens_seen": 170645632, + "step": 2467 + }, + { + "epoch": 154.1875, + "loss": 0.0018024840392172337, + "loss_ce": 9.576368938724045e-06, + "loss_xval": 0.00179290771484375, + "num_input_tokens_seen": 170645632, + "step": 2467 + }, + { + "epoch": 154.25, + "grad_norm": 3.563697178179098, + "learning_rate": 5e-05, + "loss": 0.0027, + "num_input_tokens_seen": 170717312, + "step": 2468 + }, + { + "epoch": 154.25, + "loss": 0.002802206901833415, + "loss_ce": 9.848467925621662e-06, + "loss_xval": 0.0027923583984375, + "num_input_tokens_seen": 170717312, + "step": 2468 + }, + { + "epoch": 154.3125, + "grad_norm": 2.8417179115709756, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 170788864, + "step": 2469 + }, + { + "epoch": 154.3125, + "loss": 0.001655792584642768, + "loss_ce": 7.84333678893745e-06, + "loss_xval": 0.00164794921875, + "num_input_tokens_seen": 170788864, + "step": 2469 + }, + { + "epoch": 154.375, + "grad_norm": 2.631168310040131, + "learning_rate": 5e-05, + "loss": 0.0016, + "num_input_tokens_seen": 170860608, + "step": 2470 + }, + { + "epoch": 154.375, + "loss": 0.0017291078111156821, + "loss_ce": 1.2494046131905634e-05, + "loss_xval": 0.00171661376953125, + "num_input_tokens_seen": 170860608, + "step": 2470 + }, + { + "epoch": 154.4375, + "grad_norm": 3.2617330357890206, + "learning_rate": 5e-05, + "loss": 0.0023, + "num_input_tokens_seen": 170932160, + "step": 2471 + }, + { + "epoch": 154.4375, + "loss": 0.002359155099838972, + "loss_ce": 9.301562386099249e-06, + "loss_xval": 0.002349853515625, + "num_input_tokens_seen": 170932160, + "step": 2471 + }, + { + "epoch": 154.5, + "grad_norm": 2.4545855076784964, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_input_tokens_seen": 171003840, + "step": 2472 + }, + { + "epoch": 154.5, + "loss": 0.0010087847476825118, + "loss_ce": 9.334085007139947e-06, + "loss_xval": 0.00099945068359375, + "num_input_tokens_seen": 171003840, + "step": 2472 + }, + { + "epoch": 154.5625, + "grad_norm": 3.054959949582806, + "learning_rate": 5e-05, + "loss": 0.0022, + "num_input_tokens_seen": 171075456, + "step": 2473 + }, + { + "epoch": 154.5625, + "loss": 0.002541671507060528, + "loss_ce": 8.712435374036431e-06, + "loss_xval": 0.002532958984375, + "num_input_tokens_seen": 171075456, + "step": 2473 + }, + { + "epoch": 154.625, + "grad_norm": 2.1776287269885204, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 171147072, + "step": 2474 + }, + { + "epoch": 154.625, + "loss": 
0.001070039696060121, + "loss_ce": 9.55386713030748e-06, + "loss_xval": 0.00106048583984375, + "num_input_tokens_seen": 171147072, + "step": 2474 + }, + { + "epoch": 154.6875, + "grad_norm": 2.901123518547072, + "learning_rate": 5e-05, + "loss": 0.0019, + "num_input_tokens_seen": 171218624, + "step": 2475 + }, + { + "epoch": 154.6875, + "loss": 0.001939892885275185, + "loss_ce": 9.656041584094055e-06, + "loss_xval": 0.00193023681640625, + "num_input_tokens_seen": 171218624, + "step": 2475 + }, + { + "epoch": 154.75, + "grad_norm": 1.8316559975125357, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 171290368, + "step": 2476 + }, + { + "epoch": 154.75, + "loss": 0.0008991528302431107, + "loss_ce": 1.0328377356927376e-05, + "loss_xval": 0.000888824462890625, + "num_input_tokens_seen": 171290368, + "step": 2476 + }, + { + "epoch": 154.8125, + "grad_norm": 2.8623043914660786, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_input_tokens_seen": 171361984, + "step": 2477 + }, + { + "epoch": 154.8125, + "loss": 0.0019163988763466477, + "loss_ce": 9.050200787896756e-06, + "loss_xval": 0.0019073486328125, + "num_input_tokens_seen": 171361984, + "step": 2477 + }, + { + "epoch": 154.875, + "grad_norm": 1.79429224420876, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 171433536, + "step": 2478 + }, + { + "epoch": 154.875, + "loss": 0.0011088310275226831, + "loss_ce": 1.019821866066195e-05, + "loss_xval": 0.0010986328125, + "num_input_tokens_seen": 171433536, + "step": 2478 + }, + { + "epoch": 154.9375, + "grad_norm": 2.4579784433113057, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_input_tokens_seen": 171505280, + "step": 2479 + }, + { + "epoch": 154.9375, + "loss": 0.0012454879470169544, + "loss_ce": 9.526052963337861e-06, + "loss_xval": 0.0012359619140625, + "num_input_tokens_seen": 171505280, + "step": 2479 + }, + { + "epoch": 155.0, + "grad_norm": 1.7145579880094552, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 171577024, + "step": 2480 + }, + { + "epoch": 155.0, + "loss": 0.0006987936212681234, + "loss_ce": 8.33344256534474e-06, + "loss_xval": 0.000690460205078125, + "num_input_tokens_seen": 171577024, + "step": 2480 + }, + { + "epoch": 155.0625, + "grad_norm": 2.2600772332493424, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 171636160, + "step": 2481 + }, + { + "epoch": 155.0625, + "loss": 0.0012454873649403453, + "loss_ce": 9.525448149361182e-06, + "loss_xval": 0.0012359619140625, + "num_input_tokens_seen": 171636160, + "step": 2481 + }, + { + "epoch": 155.125, + "grad_norm": 1.8448501855644037, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 171695360, + "step": 2482 + }, + { + "epoch": 155.125, + "loss": 0.0007585145649500191, + "loss_ce": 7.0192181738093495e-06, + "loss_xval": 0.000751495361328125, + "num_input_tokens_seen": 171695360, + "step": 2482 + }, + { + "epoch": 155.1875, + "grad_norm": 1.8702917832781996, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 171767040, + "step": 2483 + }, + { + "epoch": 155.1875, + "loss": 0.0008324088994413614, + "loss_ce": 8.434298251813743e-06, + "loss_xval": 0.000823974609375, + "num_input_tokens_seen": 171767040, + "step": 2483 + }, + { + "epoch": 155.25, + "grad_norm": 1.6708108711761513, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 171838656, + "step": 2484 + }, + { + "epoch": 155.25, + "loss": 0.0006913580000400543, + "loss_ce": 8.52719404065283e-06, + "loss_xval": 
0.000682830810546875, + "num_input_tokens_seen": 171838656, + "step": 2484 + }, + { + "epoch": 155.3125, + "grad_norm": 1.8771915125255452, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 171910272, + "step": 2485 + }, + { + "epoch": 155.3125, + "loss": 0.0007790317176841199, + "loss_ce": 8.462846381007694e-06, + "loss_xval": 0.00077056884765625, + "num_input_tokens_seen": 171910272, + "step": 2485 + }, + { + "epoch": 155.375, + "grad_norm": 1.7608488549611228, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_input_tokens_seen": 171981952, + "step": 2486 + }, + { + "epoch": 155.375, + "loss": 0.0008362592780031264, + "loss_ce": 8.469964086543769e-06, + "loss_xval": 0.000827789306640625, + "num_input_tokens_seen": 171981952, + "step": 2486 + }, + { + "epoch": 155.4375, + "grad_norm": 1.9982541590104177, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 172053568, + "step": 2487 + }, + { + "epoch": 155.4375, + "loss": 0.0011517609236761928, + "loss_ce": 7.351757176365936e-06, + "loss_xval": 0.0011444091796875, + "num_input_tokens_seen": 172053568, + "step": 2487 + }, + { + "epoch": 155.5, + "grad_norm": 2.2192158162225946, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_input_tokens_seen": 172125312, + "step": 2488 + }, + { + "epoch": 155.5, + "loss": 0.0011686511570587754, + "loss_ce": 8.983131920103915e-06, + "loss_xval": 0.00115966796875, + "num_input_tokens_seen": 172125312, + "step": 2488 + }, + { + "epoch": 155.5625, + "grad_norm": 1.4457901672409814, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_input_tokens_seen": 172184512, + "step": 2489 + }, + { + "epoch": 155.5625, + "loss": 0.0005124812014400959, + "loss_ce": 8.941165106080007e-06, + "loss_xval": 0.0005035400390625, + "num_input_tokens_seen": 172184512, + "step": 2489 + }, + { + "epoch": 155.625, + "grad_norm": 2.1011507510571357, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_input_tokens_seen": 172243648, + "step": 2490 + }, + { + "epoch": 155.625, + "loss": 0.0011147982440888882, + "loss_ce": 8.536085260857362e-06, + "loss_xval": 0.00110626220703125, + "num_input_tokens_seen": 172243648, + "step": 2490 + }, + { + "epoch": 155.6875, + "grad_norm": 1.056409217218583, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_input_tokens_seen": 172315264, + "step": 2491 + }, + { + "epoch": 155.6875, + "loss": 0.0003501183819025755, + "loss_ce": 8.702984814590309e-06, + "loss_xval": 0.0003414154052734375, + "num_input_tokens_seen": 172315264, + "step": 2491 + }, + { + "epoch": 155.75, + "grad_norm": 1.9594186214660128, + "learning_rate": 5e-05, + "loss": 0.001, + "num_input_tokens_seen": 172374400, + "step": 2492 + }, + { + "epoch": 155.75, + "loss": 0.0011073722271248698, + "loss_ce": 8.739401891943999e-06, + "loss_xval": 0.0010986328125, + "num_input_tokens_seen": 172374400, + "step": 2492 + }, + { + "epoch": 155.8125, + "grad_norm": 0.6302802377590286, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 172446016, + "step": 2493 + }, + { + "epoch": 155.8125, + "loss": 0.0002539145352784544, + "loss_ce": 7.866553460189607e-06, + "loss_xval": 0.0002460479736328125, + "num_input_tokens_seen": 172446016, + "step": 2493 + }, + { + "epoch": 155.875, + "grad_norm": 1.6048452374088327, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 172517632, + "step": 2494 + }, + { + "epoch": 155.875, + "loss": 0.000572717166505754, + "loss_ce": 8.141959369822871e-06, + "loss_xval": 0.0005645751953125, + "num_input_tokens_seen": 172517632, + "step": 2494 + }, 
+ { + "epoch": 155.9375, + "grad_norm": 0.49816692649252725, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 172589248, + "step": 2495 + }, + { + "epoch": 155.9375, + "loss": 0.00022121713845990598, + "loss_ce": 8.54776499181753e-06, + "loss_xval": 0.00021266937255859375, + "num_input_tokens_seen": 172589248, + "step": 2495 + }, + { + "epoch": 156.0, + "grad_norm": 1.689080638561562, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 172660800, + "step": 2496 + }, + { + "epoch": 156.0, + "loss": 0.0008247463265433908, + "loss_ce": 8.401118066103663e-06, + "loss_xval": 0.00081634521484375, + "num_input_tokens_seen": 172660800, + "step": 2496 + }, + { + "epoch": 156.0625, + "grad_norm": 0.23182185348303827, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 172719872, + "step": 2497 + }, + { + "epoch": 156.0625, + "loss": 0.0002188103972002864, + "loss_ce": 8.048378731473349e-06, + "loss_xval": 0.00021076202392578125, + "num_input_tokens_seen": 172719872, + "step": 2497 + }, + { + "epoch": 156.125, + "grad_norm": 1.880333994977742, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_input_tokens_seen": 172778880, + "step": 2498 + }, + { + "epoch": 156.125, + "loss": 0.0008566674077883363, + "loss_ce": 9.804623005038593e-06, + "loss_xval": 0.00084686279296875, + "num_input_tokens_seen": 172778880, + "step": 2498 + }, + { + "epoch": 156.1875, + "grad_norm": 0.2540775595755331, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_input_tokens_seen": 172850496, + "step": 2499 + }, + { + "epoch": 156.1875, + "loss": 0.0002453178749419749, + "loss_ce": 9.760318789631128e-06, + "loss_xval": 0.00023555755615234375, + "num_input_tokens_seen": 172850496, + "step": 2499 + }, + { + "epoch": 156.25, + "grad_norm": 1.6266601835135746, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_input_tokens_seen": 172922048, + "step": 2500 + }, + { + "epoch": 156.25, + "eval_synth_IoU": 0.28413987159729004, + "eval_synth_MAE_x": 0.01047515869140625, + "eval_synth_MAE_y": 0.0106201171875, + "eval_synth_NUM_probability": 0.9999175518751144, + "eval_synth_inside_bbox": 0.6875, + "eval_synth_loss": 0.00015532341785728931, + "eval_synth_loss_ce": 8.934383231462562e-06, + "eval_synth_loss_xval": 0.00014638900756835938, + "eval_synth_runtime": 60.3629, + "eval_synth_samples_per_second": 2.121, + "eval_synth_steps_per_second": 0.066, + "num_input_tokens_seen": 172922048, + "step": 2500 + } + ], + "logging_steps": 1.0, + "max_steps": 3000, + "num_input_tokens_seen": 172922048, + "num_train_epochs": 188, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1414729900425216.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}