{ "best_metric": 0.823376476764679, "best_model_checkpoint": "./gte-small-pairscore/checkpoint-38500", "epoch": 2.4142471938295604, "eval_steps": 100, "global_step": 38500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006270771932024832, "grad_norm": 8.922538757324219, "learning_rate": 3.135287662643047e-07, "loss": 6.3669, "step": 100 }, { "epoch": 0.006270771932024832, "eval_loss": 6.6512651443481445, "eval_runtime": 499.3604, "eval_samples_per_second": 255.471, "eval_steps_per_second": 7.984, "step": 100 }, { "epoch": 0.012541543864049664, "grad_norm": 8.918222427368164, "learning_rate": 6.270575325286094e-07, "loss": 6.1795, "step": 200 }, { "epoch": 0.012541543864049664, "eval_loss": 6.254122734069824, "eval_runtime": 501.3941, "eval_samples_per_second": 254.435, "eval_steps_per_second": 7.952, "step": 200 }, { "epoch": 0.018812315796074498, "grad_norm": 10.144318580627441, "learning_rate": 9.405862987929143e-07, "loss": 5.893, "step": 300 }, { "epoch": 0.018812315796074498, "eval_loss": 5.773278713226318, "eval_runtime": 495.3793, "eval_samples_per_second": 257.524, "eval_steps_per_second": 8.048, "step": 300 }, { "epoch": 0.025083087728099328, "grad_norm": 9.239725112915039, "learning_rate": 1.2541150650572189e-06, "loss": 5.5099, "step": 400 }, { "epoch": 0.025083087728099328, "eval_loss": 5.362614154815674, "eval_runtime": 502.6963, "eval_samples_per_second": 253.776, "eval_steps_per_second": 7.931, "step": 400 }, { "epoch": 0.03135385966012416, "grad_norm": 9.863190650939941, "learning_rate": 1.5676438313215239e-06, "loss": 5.1589, "step": 500 }, { "epoch": 0.03135385966012416, "eval_loss": 4.990196228027344, "eval_runtime": 497.9208, "eval_samples_per_second": 256.209, "eval_steps_per_second": 8.007, "step": 500 }, { "epoch": 0.037624631592148995, "grad_norm": 13.765459060668945, "learning_rate": 1.8811725975858286e-06, "loss": 4.8599, "step": 600 }, { "epoch": 0.037624631592148995, "eval_loss": 4.652267932891846, "eval_runtime": 502.9499, "eval_samples_per_second": 253.648, "eval_steps_per_second": 7.927, "step": 600 }, { "epoch": 0.04389540352417383, "grad_norm": 12.12948226928711, "learning_rate": 2.1915660761874904e-06, "loss": 4.6075, "step": 700 }, { "epoch": 0.04389540352417383, "eval_loss": 4.423278331756592, "eval_runtime": 500.0391, "eval_samples_per_second": 255.124, "eval_steps_per_second": 7.973, "step": 700 }, { "epoch": 0.050166175456198656, "grad_norm": 19.26013946533203, "learning_rate": 2.505094842451795e-06, "loss": 4.3831, "step": 800 }, { "epoch": 0.050166175456198656, "eval_loss": 4.243069171905518, "eval_runtime": 502.9493, "eval_samples_per_second": 253.648, "eval_steps_per_second": 7.927, "step": 800 }, { "epoch": 0.05643694738822349, "grad_norm": 15.950737953186035, "learning_rate": 2.8186236087161e-06, "loss": 4.1737, "step": 900 }, { "epoch": 0.05643694738822349, "eval_loss": 4.134955883026123, "eval_runtime": 500.339, "eval_samples_per_second": 254.971, "eval_steps_per_second": 7.969, "step": 900 }, { "epoch": 0.06270771932024832, "grad_norm": 22.265525817871094, "learning_rate": 3.1290170873177617e-06, "loss": 4.0266, "step": 1000 }, { "epoch": 0.06270771932024832, "eval_loss": 4.0326995849609375, "eval_runtime": 498.5852, "eval_samples_per_second": 255.868, "eval_steps_per_second": 7.997, "step": 1000 }, { "epoch": 0.06897849125227315, "grad_norm": 25.777463912963867, "learning_rate": 3.4425458535820665e-06, "loss": 3.9526, "step": 1100 }, { "epoch": 0.06897849125227315, "eval_loss": 3.928138256072998, "eval_runtime": 500.7885, "eval_samples_per_second": 254.742, "eval_steps_per_second": 7.961, "step": 1100 }, { "epoch": 0.07524926318429799, "grad_norm": 23.928476333618164, "learning_rate": 3.756074619846371e-06, "loss": 3.8773, "step": 1200 }, { "epoch": 0.07524926318429799, "eval_loss": 3.873471260070801, "eval_runtime": 500.5002, "eval_samples_per_second": 254.889, "eval_steps_per_second": 7.966, "step": 1200 }, { "epoch": 0.08152003511632282, "grad_norm": 33.25246810913086, "learning_rate": 4.069603386110676e-06, "loss": 3.7856, "step": 1300 }, { "epoch": 0.08152003511632282, "eval_loss": 3.777860164642334, "eval_runtime": 502.3139, "eval_samples_per_second": 253.969, "eval_steps_per_second": 7.937, "step": 1300 }, { "epoch": 0.08779080704834766, "grad_norm": 29.081541061401367, "learning_rate": 4.383132152374981e-06, "loss": 3.5994, "step": 1400 }, { "epoch": 0.08779080704834766, "eval_loss": 3.705378532409668, "eval_runtime": 500.945, "eval_samples_per_second": 254.663, "eval_steps_per_second": 7.959, "step": 1400 }, { "epoch": 0.09406157898037248, "grad_norm": 32.77998352050781, "learning_rate": 4.696660918639286e-06, "loss": 3.7067, "step": 1500 }, { "epoch": 0.09406157898037248, "eval_loss": 3.615516185760498, "eval_runtime": 495.8157, "eval_samples_per_second": 257.297, "eval_steps_per_second": 8.041, "step": 1500 }, { "epoch": 0.10033235091239731, "grad_norm": 80.04820251464844, "learning_rate": 5.01018968490359e-06, "loss": 3.5471, "step": 1600 }, { "epoch": 0.10033235091239731, "eval_loss": 3.57977032661438, "eval_runtime": 494.2912, "eval_samples_per_second": 258.091, "eval_steps_per_second": 8.066, "step": 1600 }, { "epoch": 0.10660312284442215, "grad_norm": 32.79664611816406, "learning_rate": 5.323718451167895e-06, "loss": 3.6679, "step": 1700 }, { "epoch": 0.10660312284442215, "eval_loss": 3.4653944969177246, "eval_runtime": 487.8092, "eval_samples_per_second": 261.52, "eval_steps_per_second": 8.173, "step": 1700 }, { "epoch": 0.11287389477644698, "grad_norm": 52.47648620605469, "learning_rate": 5.6372472174322e-06, "loss": 3.4484, "step": 1800 }, { "epoch": 0.11287389477644698, "eval_loss": 3.4174623489379883, "eval_runtime": 488.4427, "eval_samples_per_second": 261.181, "eval_steps_per_second": 8.163, "step": 1800 }, { "epoch": 0.11914466670847182, "grad_norm": 56.593727111816406, "learning_rate": 5.947640696033862e-06, "loss": 3.377, "step": 1900 }, { "epoch": 0.11914466670847182, "eval_loss": 3.412893056869507, "eval_runtime": 487.7362, "eval_samples_per_second": 261.559, "eval_steps_per_second": 8.175, "step": 1900 }, { "epoch": 0.12541543864049665, "grad_norm": 47.65892028808594, "learning_rate": 6.2611694622981665e-06, "loss": 3.4259, "step": 2000 }, { "epoch": 0.12541543864049665, "eval_loss": 3.3347389698028564, "eval_runtime": 485.6244, "eval_samples_per_second": 262.697, "eval_steps_per_second": 8.21, "step": 2000 }, { "epoch": 0.13168621057252147, "grad_norm": 46.80497741699219, "learning_rate": 6.574698228562471e-06, "loss": 3.4832, "step": 2100 }, { "epoch": 0.13168621057252147, "eval_loss": 3.2113163471221924, "eval_runtime": 485.3601, "eval_samples_per_second": 262.84, "eval_steps_per_second": 8.215, "step": 2100 }, { "epoch": 0.1379569825045463, "grad_norm": 61.08994674682617, "learning_rate": 6.888226994826775e-06, "loss": 3.3043, "step": 2200 }, { "epoch": 0.1379569825045463, "eval_loss": 3.164067268371582, "eval_runtime": 486.9272, "eval_samples_per_second": 261.994, "eval_steps_per_second": 8.188, "step": 2200 }, { "epoch": 0.14422775443657115, "grad_norm": 54.029170989990234, "learning_rate": 7.201755761091081e-06, "loss": 3.2344, "step": 2300 }, { "epoch": 0.14422775443657115, "eval_loss": 3.1528868675231934, "eval_runtime": 485.6647, "eval_samples_per_second": 262.675, "eval_steps_per_second": 8.209, "step": 2300 }, { "epoch": 0.15049852636859598, "grad_norm": 51.89152526855469, "learning_rate": 7.5152845273553855e-06, "loss": 3.1238, "step": 2400 }, { "epoch": 0.15049852636859598, "eval_loss": 3.2577104568481445, "eval_runtime": 483.4268, "eval_samples_per_second": 263.891, "eval_steps_per_second": 8.247, "step": 2400 }, { "epoch": 0.1567692983006208, "grad_norm": 42.53623962402344, "learning_rate": 7.828813293619691e-06, "loss": 3.1456, "step": 2500 }, { "epoch": 0.1567692983006208, "eval_loss": 3.067770481109619, "eval_runtime": 483.4427, "eval_samples_per_second": 263.882, "eval_steps_per_second": 8.247, "step": 2500 }, { "epoch": 0.16304007023264563, "grad_norm": 40.32865905761719, "learning_rate": 8.142342059883996e-06, "loss": 3.0223, "step": 2600 }, { "epoch": 0.16304007023264563, "eval_loss": 3.000631332397461, "eval_runtime": 490.1586, "eval_samples_per_second": 260.267, "eval_steps_per_second": 8.134, "step": 2600 }, { "epoch": 0.16931084216467046, "grad_norm": 52.34198760986328, "learning_rate": 8.4558708261483e-06, "loss": 3.2046, "step": 2700 }, { "epoch": 0.16931084216467046, "eval_loss": 2.9682161808013916, "eval_runtime": 501.7817, "eval_samples_per_second": 254.238, "eval_steps_per_second": 7.946, "step": 2700 }, { "epoch": 0.17558161409669532, "grad_norm": 54.570518493652344, "learning_rate": 8.766264304749962e-06, "loss": 3.0866, "step": 2800 }, { "epoch": 0.17558161409669532, "eval_loss": 3.0524070262908936, "eval_runtime": 492.0201, "eval_samples_per_second": 259.282, "eval_steps_per_second": 8.103, "step": 2800 }, { "epoch": 0.18185238602872014, "grad_norm": 91.97798156738281, "learning_rate": 9.079793071014266e-06, "loss": 2.9271, "step": 2900 }, { "epoch": 0.18185238602872014, "eval_loss": 3.057253122329712, "eval_runtime": 483.8055, "eval_samples_per_second": 263.684, "eval_steps_per_second": 8.241, "step": 2900 }, { "epoch": 0.18812315796074497, "grad_norm": 72.72432708740234, "learning_rate": 9.393321837278571e-06, "loss": 2.7692, "step": 3000 }, { "epoch": 0.18812315796074497, "eval_loss": 3.0557968616485596, "eval_runtime": 481.3118, "eval_samples_per_second": 265.051, "eval_steps_per_second": 8.284, "step": 3000 }, { "epoch": 0.1943939298927698, "grad_norm": 66.55966186523438, "learning_rate": 9.706850603542876e-06, "loss": 3.1498, "step": 3100 }, { "epoch": 0.1943939298927698, "eval_loss": 2.786630153656006, "eval_runtime": 470.7517, "eval_samples_per_second": 270.996, "eval_steps_per_second": 8.469, "step": 3100 }, { "epoch": 0.20066470182479462, "grad_norm": 60.73588180541992, "learning_rate": 1.002037936980718e-05, "loss": 3.0683, "step": 3200 }, { "epoch": 0.20066470182479462, "eval_loss": 2.847790002822876, "eval_runtime": 479.7323, "eval_samples_per_second": 265.923, "eval_steps_per_second": 8.311, "step": 3200 }, { "epoch": 0.20693547375681948, "grad_norm": 138.51925659179688, "learning_rate": 1.0333908136071484e-05, "loss": 2.5776, "step": 3300 }, { "epoch": 0.20693547375681948, "eval_loss": 2.9458932876586914, "eval_runtime": 476.173, "eval_samples_per_second": 267.911, "eval_steps_per_second": 8.373, "step": 3300 }, { "epoch": 0.2132062456888443, "grad_norm": 44.883033752441406, "learning_rate": 1.064743690233579e-05, "loss": 2.9394, "step": 3400 }, { "epoch": 0.2132062456888443, "eval_loss": 2.7133240699768066, "eval_runtime": 484.1605, "eval_samples_per_second": 263.491, "eval_steps_per_second": 8.235, "step": 3400 }, { "epoch": 0.21947701762086913, "grad_norm": 62.664493560791016, "learning_rate": 1.0960965668600095e-05, "loss": 2.6996, "step": 3500 }, { "epoch": 0.21947701762086913, "eval_loss": 2.8582112789154053, "eval_runtime": 486.4854, "eval_samples_per_second": 262.232, "eval_steps_per_second": 8.196, "step": 3500 }, { "epoch": 0.22574778955289396, "grad_norm": 121.68364715576172, "learning_rate": 1.12744944348644e-05, "loss": 2.569, "step": 3600 }, { "epoch": 0.22574778955289396, "eval_loss": 2.8092362880706787, "eval_runtime": 488.2917, "eval_samples_per_second": 261.262, "eval_steps_per_second": 8.165, "step": 3600 }, { "epoch": 0.23201856148491878, "grad_norm": 62.47746658325195, "learning_rate": 1.1588023201128705e-05, "loss": 2.6535, "step": 3700 }, { "epoch": 0.23201856148491878, "eval_loss": 2.7977445125579834, "eval_runtime": 489.6066, "eval_samples_per_second": 260.56, "eval_steps_per_second": 8.143, "step": 3700 }, { "epoch": 0.23828933341694364, "grad_norm": 61.19312286376953, "learning_rate": 1.1901551967393011e-05, "loss": 2.6679, "step": 3800 }, { "epoch": 0.23828933341694364, "eval_loss": 2.8578476905822754, "eval_runtime": 490.8833, "eval_samples_per_second": 259.883, "eval_steps_per_second": 8.122, "step": 3800 }, { "epoch": 0.24456010534896847, "grad_norm": 52.05066680908203, "learning_rate": 1.2215080733657314e-05, "loss": 2.592, "step": 3900 }, { "epoch": 0.24456010534896847, "eval_loss": 2.8251442909240723, "eval_runtime": 487.5816, "eval_samples_per_second": 261.642, "eval_steps_per_second": 8.177, "step": 3900 }, { "epoch": 0.2508308772809933, "grad_norm": 72.0737533569336, "learning_rate": 1.2525474212258977e-05, "loss": 2.4931, "step": 4000 }, { "epoch": 0.2508308772809933, "eval_loss": 2.5975987911224365, "eval_runtime": 483.614, "eval_samples_per_second": 263.789, "eval_steps_per_second": 8.244, "step": 4000 }, { "epoch": 0.25710164921301815, "grad_norm": 142.94813537597656, "learning_rate": 1.283900297852328e-05, "loss": 2.3012, "step": 4100 }, { "epoch": 0.25710164921301815, "eval_loss": 2.926022529602051, "eval_runtime": 482.5775, "eval_samples_per_second": 264.355, "eval_steps_per_second": 8.262, "step": 4100 }, { "epoch": 0.26337242114504295, "grad_norm": 188.10948181152344, "learning_rate": 1.3152531744787585e-05, "loss": 2.4728, "step": 4200 }, { "epoch": 0.26337242114504295, "eval_loss": 2.7869389057159424, "eval_runtime": 484.6597, "eval_samples_per_second": 263.22, "eval_steps_per_second": 8.226, "step": 4200 }, { "epoch": 0.2696431930770678, "grad_norm": 88.77122497558594, "learning_rate": 1.3466060511051891e-05, "loss": 2.4391, "step": 4300 }, { "epoch": 0.2696431930770678, "eval_loss": 2.898672580718994, "eval_runtime": 488.7638, "eval_samples_per_second": 261.009, "eval_steps_per_second": 8.157, "step": 4300 }, { "epoch": 0.2759139650090926, "grad_norm": 24.550174713134766, "learning_rate": 1.3779589277316194e-05, "loss": 2.3825, "step": 4400 }, { "epoch": 0.2759139650090926, "eval_loss": 2.7803783416748047, "eval_runtime": 479.1092, "eval_samples_per_second": 266.269, "eval_steps_per_second": 8.322, "step": 4400 }, { "epoch": 0.28218473694111745, "grad_norm": 262.1357727050781, "learning_rate": 1.40931180435805e-05, "loss": 2.6257, "step": 4500 }, { "epoch": 0.28218473694111745, "eval_loss": 2.8308775424957275, "eval_runtime": 481.1992, "eval_samples_per_second": 265.113, "eval_steps_per_second": 8.286, "step": 4500 }, { "epoch": 0.2884555088731423, "grad_norm": 106.33345794677734, "learning_rate": 1.4406646809844804e-05, "loss": 2.4304, "step": 4600 }, { "epoch": 0.2884555088731423, "eval_loss": 3.241865873336792, "eval_runtime": 480.8967, "eval_samples_per_second": 265.279, "eval_steps_per_second": 8.291, "step": 4600 }, { "epoch": 0.2947262808051671, "grad_norm": 24.162464141845703, "learning_rate": 1.472017557610911e-05, "loss": 3.0246, "step": 4700 }, { "epoch": 0.2947262808051671, "eval_loss": 2.5731775760650635, "eval_runtime": 478.865, "eval_samples_per_second": 266.405, "eval_steps_per_second": 8.326, "step": 4700 }, { "epoch": 0.30099705273719196, "grad_norm": 89.0951919555664, "learning_rate": 1.5033704342373415e-05, "loss": 2.6894, "step": 4800 }, { "epoch": 0.30099705273719196, "eval_loss": 2.8057522773742676, "eval_runtime": 469.6303, "eval_samples_per_second": 271.643, "eval_steps_per_second": 8.49, "step": 4800 }, { "epoch": 0.30726782466921676, "grad_norm": 10.197610855102539, "learning_rate": 1.5347233108637716e-05, "loss": 2.5333, "step": 4900 }, { "epoch": 0.30726782466921676, "eval_loss": 2.4581992626190186, "eval_runtime": 480.2298, "eval_samples_per_second": 265.648, "eval_steps_per_second": 8.302, "step": 4900 }, { "epoch": 0.3135385966012416, "grad_norm": 10.737910270690918, "learning_rate": 1.5660761874902023e-05, "loss": 2.3268, "step": 5000 }, { "epoch": 0.3135385966012416, "eval_loss": 2.8622071743011475, "eval_runtime": 477.9553, "eval_samples_per_second": 266.912, "eval_steps_per_second": 8.342, "step": 5000 }, { "epoch": 0.31980936853326647, "grad_norm": 107.68405151367188, "learning_rate": 1.597429064116633e-05, "loss": 2.6996, "step": 5100 }, { "epoch": 0.31980936853326647, "eval_loss": 2.751514196395874, "eval_runtime": 479.8929, "eval_samples_per_second": 265.834, "eval_steps_per_second": 8.308, "step": 5100 }, { "epoch": 0.32608014046529127, "grad_norm": 137.2300567626953, "learning_rate": 1.6287819407430632e-05, "loss": 2.8175, "step": 5200 }, { "epoch": 0.32608014046529127, "eval_loss": 2.5842323303222656, "eval_runtime": 473.9066, "eval_samples_per_second": 269.192, "eval_steps_per_second": 8.413, "step": 5200 }, { "epoch": 0.3323509123973161, "grad_norm": 30.23833465576172, "learning_rate": 1.660134817369494e-05, "loss": 2.1244, "step": 5300 }, { "epoch": 0.3323509123973161, "eval_loss": 2.725175380706787, "eval_runtime": 479.3619, "eval_samples_per_second": 266.129, "eval_steps_per_second": 8.317, "step": 5300 }, { "epoch": 0.3386216843293409, "grad_norm": 66.7165756225586, "learning_rate": 1.6914876939959242e-05, "loss": 2.7331, "step": 5400 }, { "epoch": 0.3386216843293409, "eval_loss": 2.5052876472473145, "eval_runtime": 482.2091, "eval_samples_per_second": 264.557, "eval_steps_per_second": 8.268, "step": 5400 }, { "epoch": 0.3448924562613658, "grad_norm": 20.616701126098633, "learning_rate": 1.722840570622355e-05, "loss": 2.3226, "step": 5500 }, { "epoch": 0.3448924562613658, "eval_loss": 2.2429914474487305, "eval_runtime": 483.8758, "eval_samples_per_second": 263.646, "eval_steps_per_second": 8.24, "step": 5500 }, { "epoch": 0.35116322819339063, "grad_norm": 10.704608917236328, "learning_rate": 1.754193447248785e-05, "loss": 2.0706, "step": 5600 }, { "epoch": 0.35116322819339063, "eval_loss": 2.6055426597595215, "eval_runtime": 481.5222, "eval_samples_per_second": 264.935, "eval_steps_per_second": 8.28, "step": 5600 }, { "epoch": 0.35743400012541543, "grad_norm": 116.87510681152344, "learning_rate": 1.7852327951089514e-05, "loss": 2.2461, "step": 5700 }, { "epoch": 0.35743400012541543, "eval_loss": 2.894943952560425, "eval_runtime": 482.8194, "eval_samples_per_second": 264.223, "eval_steps_per_second": 8.258, "step": 5700 }, { "epoch": 0.3637047720574403, "grad_norm": 75.6421890258789, "learning_rate": 1.8165856717353817e-05, "loss": 2.6365, "step": 5800 }, { "epoch": 0.3637047720574403, "eval_loss": 2.5271661281585693, "eval_runtime": 486.8779, "eval_samples_per_second": 262.021, "eval_steps_per_second": 8.189, "step": 5800 }, { "epoch": 0.3699755439894651, "grad_norm": 25.05718231201172, "learning_rate": 1.8479385483618124e-05, "loss": 2.7119, "step": 5900 }, { "epoch": 0.3699755439894651, "eval_loss": 2.433084011077881, "eval_runtime": 486.349, "eval_samples_per_second": 262.305, "eval_steps_per_second": 8.198, "step": 5900 }, { "epoch": 0.37624631592148994, "grad_norm": 88.68294525146484, "learning_rate": 1.8792914249882427e-05, "loss": 2.6146, "step": 6000 }, { "epoch": 0.37624631592148994, "eval_loss": 2.385845899581909, "eval_runtime": 485.6975, "eval_samples_per_second": 262.657, "eval_steps_per_second": 8.209, "step": 6000 }, { "epoch": 0.3825170878535148, "grad_norm": 185.446533203125, "learning_rate": 1.9106443016146733e-05, "loss": 2.1998, "step": 6100 }, { "epoch": 0.3825170878535148, "eval_loss": 2.6891462802886963, "eval_runtime": 488.7846, "eval_samples_per_second": 260.998, "eval_steps_per_second": 8.157, "step": 6100 }, { "epoch": 0.3887878597855396, "grad_norm": 105.49547576904297, "learning_rate": 1.9419971782411036e-05, "loss": 2.5076, "step": 6200 }, { "epoch": 0.3887878597855396, "eval_loss": 2.3827390670776367, "eval_runtime": 493.7693, "eval_samples_per_second": 258.364, "eval_steps_per_second": 8.075, "step": 6200 }, { "epoch": 0.39505863171756445, "grad_norm": 20.25705909729004, "learning_rate": 1.9733500548675343e-05, "loss": 2.5244, "step": 6300 }, { "epoch": 0.39505863171756445, "eval_loss": 2.6522157192230225, "eval_runtime": 500.1844, "eval_samples_per_second": 255.05, "eval_steps_per_second": 7.971, "step": 6300 }, { "epoch": 0.40132940364958924, "grad_norm": 1.7094597816467285, "learning_rate": 1.9994774338518353e-05, "loss": 2.0613, "step": 6400 }, { "epoch": 0.40132940364958924, "eval_loss": 2.4750421047210693, "eval_runtime": 501.9454, "eval_samples_per_second": 254.155, "eval_steps_per_second": 7.943, "step": 6400 }, { "epoch": 0.4076001755816141, "grad_norm": 2.9270060062408447, "learning_rate": 1.995993659530736e-05, "loss": 2.465, "step": 6500 }, { "epoch": 0.4076001755816141, "eval_loss": 2.525411367416382, "eval_runtime": 501.3359, "eval_samples_per_second": 254.464, "eval_steps_per_second": 7.953, "step": 6500 }, { "epoch": 0.41387094751363895, "grad_norm": 95.53108215332031, "learning_rate": 1.9925098852096362e-05, "loss": 2.3201, "step": 6600 }, { "epoch": 0.41387094751363895, "eval_loss": 2.2248587608337402, "eval_runtime": 495.8221, "eval_samples_per_second": 257.294, "eval_steps_per_second": 8.041, "step": 6600 }, { "epoch": 0.42014171944566375, "grad_norm": 19.441762924194336, "learning_rate": 1.9890261108885365e-05, "loss": 2.234, "step": 6700 }, { "epoch": 0.42014171944566375, "eval_loss": 2.5168297290802, "eval_runtime": 497.6361, "eval_samples_per_second": 256.356, "eval_steps_per_second": 8.012, "step": 6700 }, { "epoch": 0.4264124913776886, "grad_norm": 13.225996017456055, "learning_rate": 1.985542336567437e-05, "loss": 2.1277, "step": 6800 }, { "epoch": 0.4264124913776886, "eval_loss": 2.5358171463012695, "eval_runtime": 498.1476, "eval_samples_per_second": 256.093, "eval_steps_per_second": 8.004, "step": 6800 }, { "epoch": 0.4326832633097134, "grad_norm": 103.2215347290039, "learning_rate": 1.9820585622463378e-05, "loss": 2.3801, "step": 6900 }, { "epoch": 0.4326832633097134, "eval_loss": 2.4991824626922607, "eval_runtime": 500.8539, "eval_samples_per_second": 254.709, "eval_steps_per_second": 7.96, "step": 6900 }, { "epoch": 0.43895403524173826, "grad_norm": 97.55316925048828, "learning_rate": 1.9785747879252384e-05, "loss": 2.1443, "step": 7000 }, { "epoch": 0.43895403524173826, "eval_loss": 2.4043357372283936, "eval_runtime": 499.124, "eval_samples_per_second": 255.592, "eval_steps_per_second": 7.988, "step": 7000 }, { "epoch": 0.4452248071737631, "grad_norm": 233.4646759033203, "learning_rate": 1.9750910136041387e-05, "loss": 1.9136, "step": 7100 }, { "epoch": 0.4452248071737631, "eval_loss": 2.3874008655548096, "eval_runtime": 497.0926, "eval_samples_per_second": 256.636, "eval_steps_per_second": 8.021, "step": 7100 }, { "epoch": 0.4514955791057879, "grad_norm": 111.59117889404297, "learning_rate": 1.9716072392830394e-05, "loss": 2.3067, "step": 7200 }, { "epoch": 0.4514955791057879, "eval_loss": 2.647474765777588, "eval_runtime": 497.6919, "eval_samples_per_second": 256.327, "eval_steps_per_second": 8.011, "step": 7200 }, { "epoch": 0.45776635103781277, "grad_norm": 156.6864776611328, "learning_rate": 1.96812346496194e-05, "loss": 2.1464, "step": 7300 }, { "epoch": 0.45776635103781277, "eval_loss": 2.4704177379608154, "eval_runtime": 500.9739, "eval_samples_per_second": 254.648, "eval_steps_per_second": 7.958, "step": 7300 }, { "epoch": 0.46403712296983757, "grad_norm": 22.16613006591797, "learning_rate": 1.9646396906408406e-05, "loss": 2.2151, "step": 7400 }, { "epoch": 0.46403712296983757, "eval_loss": 2.519892692565918, "eval_runtime": 501.5459, "eval_samples_per_second": 254.358, "eval_steps_per_second": 7.949, "step": 7400 }, { "epoch": 0.4703078949018624, "grad_norm": 251.9285125732422, "learning_rate": 1.961155916319741e-05, "loss": 2.4653, "step": 7500 }, { "epoch": 0.4703078949018624, "eval_loss": 2.529334545135498, "eval_runtime": 497.5167, "eval_samples_per_second": 256.418, "eval_steps_per_second": 8.014, "step": 7500 }, { "epoch": 0.4765786668338873, "grad_norm": 222.22967529296875, "learning_rate": 1.9576721419986416e-05, "loss": 2.4425, "step": 7600 }, { "epoch": 0.4765786668338873, "eval_loss": 2.126385450363159, "eval_runtime": 501.9759, "eval_samples_per_second": 254.14, "eval_steps_per_second": 7.943, "step": 7600 }, { "epoch": 0.4828494387659121, "grad_norm": 135.4093780517578, "learning_rate": 1.954188367677542e-05, "loss": 2.3138, "step": 7700 }, { "epoch": 0.4828494387659121, "eval_loss": 2.18104887008667, "eval_runtime": 499.0397, "eval_samples_per_second": 255.635, "eval_steps_per_second": 7.989, "step": 7700 }, { "epoch": 0.48912021069793693, "grad_norm": 16.642980575561523, "learning_rate": 1.9507045933564425e-05, "loss": 2.247, "step": 7800 }, { "epoch": 0.48912021069793693, "eval_loss": 2.1403872966766357, "eval_runtime": 487.5913, "eval_samples_per_second": 261.637, "eval_steps_per_second": 8.177, "step": 7800 }, { "epoch": 0.49539098262996173, "grad_norm": 163.52439880371094, "learning_rate": 1.947220819035343e-05, "loss": 2.1621, "step": 7900 }, { "epoch": 0.49539098262996173, "eval_loss": 2.2122886180877686, "eval_runtime": 498.6473, "eval_samples_per_second": 255.836, "eval_steps_per_second": 7.996, "step": 7900 }, { "epoch": 0.5016617545619866, "grad_norm": 0.11034490168094635, "learning_rate": 1.9437370447142438e-05, "loss": 2.1338, "step": 8000 }, { "epoch": 0.5016617545619866, "eval_loss": 2.5108418464660645, "eval_runtime": 497.4577, "eval_samples_per_second": 256.448, "eval_steps_per_second": 8.015, "step": 8000 }, { "epoch": 0.5079325264940114, "grad_norm": 73.57258605957031, "learning_rate": 1.940253270393144e-05, "loss": 2.1846, "step": 8100 }, { "epoch": 0.5079325264940114, "eval_loss": 2.149299383163452, "eval_runtime": 500.0715, "eval_samples_per_second": 255.108, "eval_steps_per_second": 7.973, "step": 8100 }, { "epoch": 0.5142032984260363, "grad_norm": 71.24880981445312, "learning_rate": 1.9367694960720447e-05, "loss": 2.1167, "step": 8200 }, { "epoch": 0.5142032984260363, "eval_loss": 2.287858486175537, "eval_runtime": 500.8627, "eval_samples_per_second": 254.705, "eval_steps_per_second": 7.96, "step": 8200 }, { "epoch": 0.520474070358061, "grad_norm": 138.7628936767578, "learning_rate": 1.933285721750945e-05, "loss": 2.2143, "step": 8300 }, { "epoch": 0.520474070358061, "eval_loss": 2.1663804054260254, "eval_runtime": 501.8798, "eval_samples_per_second": 254.188, "eval_steps_per_second": 7.944, "step": 8300 }, { "epoch": 0.5267448422900859, "grad_norm": 17.088781356811523, "learning_rate": 1.9298019474298456e-05, "loss": 2.3152, "step": 8400 }, { "epoch": 0.5267448422900859, "eval_loss": 2.1071760654449463, "eval_runtime": 495.7183, "eval_samples_per_second": 257.348, "eval_steps_per_second": 8.043, "step": 8400 }, { "epoch": 0.5330156142221107, "grad_norm": 267.6972351074219, "learning_rate": 1.9263181731087462e-05, "loss": 1.7618, "step": 8500 }, { "epoch": 0.5330156142221107, "eval_loss": 2.032350540161133, "eval_runtime": 498.1752, "eval_samples_per_second": 256.079, "eval_steps_per_second": 8.003, "step": 8500 }, { "epoch": 0.5392863861541356, "grad_norm": 117.72229766845703, "learning_rate": 1.922834398787647e-05, "loss": 2.0777, "step": 8600 }, { "epoch": 0.5392863861541356, "eval_loss": 2.4468319416046143, "eval_runtime": 495.2364, "eval_samples_per_second": 257.598, "eval_steps_per_second": 8.051, "step": 8600 }, { "epoch": 0.5455571580861605, "grad_norm": 1.1375752687454224, "learning_rate": 1.9193506244665472e-05, "loss": 2.1573, "step": 8700 }, { "epoch": 0.5455571580861605, "eval_loss": 2.2053027153015137, "eval_runtime": 485.0279, "eval_samples_per_second": 263.02, "eval_steps_per_second": 8.22, "step": 8700 }, { "epoch": 0.5518279300181852, "grad_norm": 244.1565704345703, "learning_rate": 1.9158668501454478e-05, "loss": 1.9831, "step": 8800 }, { "epoch": 0.5518279300181852, "eval_loss": 2.3276798725128174, "eval_runtime": 499.4822, "eval_samples_per_second": 255.409, "eval_steps_per_second": 7.982, "step": 8800 }, { "epoch": 0.55809870195021, "grad_norm": 69.60086059570312, "learning_rate": 1.912383075824348e-05, "loss": 1.9083, "step": 8900 }, { "epoch": 0.55809870195021, "eval_loss": 1.9949347972869873, "eval_runtime": 495.1776, "eval_samples_per_second": 257.629, "eval_steps_per_second": 8.052, "step": 8900 }, { "epoch": 0.5643694738822349, "grad_norm": 1.5420753955841064, "learning_rate": 1.9088993015032487e-05, "loss": 1.932, "step": 9000 }, { "epoch": 0.5643694738822349, "eval_loss": 1.9848002195358276, "eval_runtime": 498.1194, "eval_samples_per_second": 256.107, "eval_steps_per_second": 8.004, "step": 9000 }, { "epoch": 0.5706402458142598, "grad_norm": 9.46451473236084, "learning_rate": 1.9054155271821494e-05, "loss": 2.3223, "step": 9100 }, { "epoch": 0.5706402458142598, "eval_loss": 1.9191622734069824, "eval_runtime": 495.6516, "eval_samples_per_second": 257.382, "eval_steps_per_second": 8.044, "step": 9100 }, { "epoch": 0.5769110177462846, "grad_norm": 22.84164047241211, "learning_rate": 1.90193175286105e-05, "loss": 1.7583, "step": 9200 }, { "epoch": 0.5769110177462846, "eval_loss": 2.0066075325012207, "eval_runtime": 497.8537, "eval_samples_per_second": 256.244, "eval_steps_per_second": 8.008, "step": 9200 }, { "epoch": 0.5831817896783094, "grad_norm": 3.343338966369629, "learning_rate": 1.8984479785399503e-05, "loss": 1.6394, "step": 9300 }, { "epoch": 0.5831817896783094, "eval_loss": 2.0322048664093018, "eval_runtime": 485.1706, "eval_samples_per_second": 262.943, "eval_steps_per_second": 8.218, "step": 9300 }, { "epoch": 0.5894525616103342, "grad_norm": 13.116720199584961, "learning_rate": 1.894964204218851e-05, "loss": 1.973, "step": 9400 }, { "epoch": 0.5894525616103342, "eval_loss": 2.100987195968628, "eval_runtime": 492.3424, "eval_samples_per_second": 259.112, "eval_steps_per_second": 8.098, "step": 9400 }, { "epoch": 0.5957233335423591, "grad_norm": 136.55160522460938, "learning_rate": 1.8914804298977512e-05, "loss": 2.2377, "step": 9500 }, { "epoch": 0.5957233335423591, "eval_loss": 2.11759090423584, "eval_runtime": 495.0098, "eval_samples_per_second": 257.716, "eval_steps_per_second": 8.054, "step": 9500 }, { "epoch": 0.6019941054743839, "grad_norm": 18.554906845092773, "learning_rate": 1.887996655576652e-05, "loss": 2.2269, "step": 9600 }, { "epoch": 0.6019941054743839, "eval_loss": 2.002722978591919, "eval_runtime": 498.3286, "eval_samples_per_second": 256.0, "eval_steps_per_second": 8.001, "step": 9600 }, { "epoch": 0.6082648774064088, "grad_norm": 118.95328521728516, "learning_rate": 1.8845477189987635e-05, "loss": 1.971, "step": 9700 }, { "epoch": 0.6082648774064088, "eval_loss": 1.9329177141189575, "eval_runtime": 493.8369, "eval_samples_per_second": 258.328, "eval_steps_per_second": 8.074, "step": 9700 }, { "epoch": 0.6145356493384335, "grad_norm": 0.6365923285484314, "learning_rate": 1.881063944677664e-05, "loss": 1.8982, "step": 9800 }, { "epoch": 0.6145356493384335, "eval_loss": 1.9797492027282715, "eval_runtime": 496.6699, "eval_samples_per_second": 256.855, "eval_steps_per_second": 8.027, "step": 9800 }, { "epoch": 0.6208064212704584, "grad_norm": 88.75743865966797, "learning_rate": 1.8775801703565644e-05, "loss": 2.2853, "step": 9900 }, { "epoch": 0.6208064212704584, "eval_loss": 1.8433477878570557, "eval_runtime": 502.8332, "eval_samples_per_second": 253.706, "eval_steps_per_second": 7.929, "step": 9900 }, { "epoch": 0.6270771932024832, "grad_norm": 2.4320499897003174, "learning_rate": 1.874096396035465e-05, "loss": 1.6657, "step": 10000 }, { "epoch": 0.6270771932024832, "eval_loss": 2.0090935230255127, "eval_runtime": 487.5525, "eval_samples_per_second": 261.658, "eval_steps_per_second": 8.178, "step": 10000 }, { "epoch": 0.6333479651345081, "grad_norm": 94.46017456054688, "learning_rate": 1.8706126217143653e-05, "loss": 2.0732, "step": 10100 }, { "epoch": 0.6333479651345081, "eval_loss": 1.7602357864379883, "eval_runtime": 476.6373, "eval_samples_per_second": 267.65, "eval_steps_per_second": 8.365, "step": 10100 }, { "epoch": 0.6396187370665329, "grad_norm": 186.80111694335938, "learning_rate": 1.867128847393266e-05, "loss": 1.6951, "step": 10200 }, { "epoch": 0.6396187370665329, "eval_loss": 1.8849464654922485, "eval_runtime": 494.2546, "eval_samples_per_second": 258.11, "eval_steps_per_second": 8.067, "step": 10200 }, { "epoch": 0.6458895089985577, "grad_norm": 175.21151733398438, "learning_rate": 1.8636450730721666e-05, "loss": 1.6548, "step": 10300 }, { "epoch": 0.6458895089985577, "eval_loss": 2.0065879821777344, "eval_runtime": 492.3572, "eval_samples_per_second": 259.105, "eval_steps_per_second": 8.098, "step": 10300 }, { "epoch": 0.6521602809305825, "grad_norm": 1.4941706657409668, "learning_rate": 1.8601612987510672e-05, "loss": 1.7187, "step": 10400 }, { "epoch": 0.6521602809305825, "eval_loss": 1.9644232988357544, "eval_runtime": 502.7154, "eval_samples_per_second": 253.766, "eval_steps_per_second": 7.931, "step": 10400 }, { "epoch": 0.6584310528626074, "grad_norm": 2.646362543106079, "learning_rate": 1.8566775244299675e-05, "loss": 2.1948, "step": 10500 }, { "epoch": 0.6584310528626074, "eval_loss": 1.8391690254211426, "eval_runtime": 478.8123, "eval_samples_per_second": 266.434, "eval_steps_per_second": 8.327, "step": 10500 }, { "epoch": 0.6647018247946322, "grad_norm": 47.12744140625, "learning_rate": 1.8531937501088682e-05, "loss": 1.9756, "step": 10600 }, { "epoch": 0.6647018247946322, "eval_loss": 1.8404371738433838, "eval_runtime": 487.2993, "eval_samples_per_second": 261.794, "eval_steps_per_second": 8.182, "step": 10600 }, { "epoch": 0.6709725967266571, "grad_norm": 0.8185029029846191, "learning_rate": 1.8497099757877685e-05, "loss": 1.7644, "step": 10700 }, { "epoch": 0.6709725967266571, "eval_loss": 1.910104751586914, "eval_runtime": 487.8167, "eval_samples_per_second": 261.516, "eval_steps_per_second": 8.173, "step": 10700 }, { "epoch": 0.6772433686586818, "grad_norm": 115.93226623535156, "learning_rate": 1.846226201466669e-05, "loss": 1.6295, "step": 10800 }, { "epoch": 0.6772433686586818, "eval_loss": 1.943991780281067, "eval_runtime": 489.7195, "eval_samples_per_second": 260.5, "eval_steps_per_second": 8.141, "step": 10800 }, { "epoch": 0.6835141405907067, "grad_norm": 112.30680084228516, "learning_rate": 1.8427772648887807e-05, "loss": 1.7687, "step": 10900 }, { "epoch": 0.6835141405907067, "eval_loss": 1.903131127357483, "eval_runtime": 492.5747, "eval_samples_per_second": 258.99, "eval_steps_per_second": 8.094, "step": 10900 }, { "epoch": 0.6897849125227316, "grad_norm": 105.42852783203125, "learning_rate": 1.839293490567681e-05, "loss": 1.8203, "step": 11000 }, { "epoch": 0.6897849125227316, "eval_loss": 1.9650237560272217, "eval_runtime": 495.7922, "eval_samples_per_second": 257.309, "eval_steps_per_second": 8.042, "step": 11000 }, { "epoch": 0.6960556844547564, "grad_norm": 143.69154357910156, "learning_rate": 1.8358097162465817e-05, "loss": 2.3055, "step": 11100 }, { "epoch": 0.6960556844547564, "eval_loss": 1.8432321548461914, "eval_runtime": 494.4248, "eval_samples_per_second": 258.021, "eval_steps_per_second": 8.064, "step": 11100 }, { "epoch": 0.7023264563867813, "grad_norm": 55.0260009765625, "learning_rate": 1.8323259419254823e-05, "loss": 1.8294, "step": 11200 }, { "epoch": 0.7023264563867813, "eval_loss": 1.7363530397415161, "eval_runtime": 498.3754, "eval_samples_per_second": 255.976, "eval_steps_per_second": 8.0, "step": 11200 }, { "epoch": 0.708597228318806, "grad_norm": 20.01657485961914, "learning_rate": 1.828842167604383e-05, "loss": 2.0026, "step": 11300 }, { "epoch": 0.708597228318806, "eval_loss": 1.789391279220581, "eval_runtime": 487.8486, "eval_samples_per_second": 261.499, "eval_steps_per_second": 8.173, "step": 11300 }, { "epoch": 0.7148680002508309, "grad_norm": 103.39813232421875, "learning_rate": 1.8253583932832832e-05, "loss": 1.9916, "step": 11400 }, { "epoch": 0.7148680002508309, "eval_loss": 1.8342993259429932, "eval_runtime": 493.6206, "eval_samples_per_second": 258.441, "eval_steps_per_second": 8.077, "step": 11400 }, { "epoch": 0.7211387721828557, "grad_norm": 24.854759216308594, "learning_rate": 1.821874618962184e-05, "loss": 1.8698, "step": 11500 }, { "epoch": 0.7211387721828557, "eval_loss": 1.807905673980713, "eval_runtime": 494.1372, "eval_samples_per_second": 258.171, "eval_steps_per_second": 8.069, "step": 11500 }, { "epoch": 0.7274095441148806, "grad_norm": 2.62512469291687, "learning_rate": 1.818390844641084e-05, "loss": 1.5213, "step": 11600 }, { "epoch": 0.7274095441148806, "eval_loss": 1.684904932975769, "eval_runtime": 492.1644, "eval_samples_per_second": 259.206, "eval_steps_per_second": 8.101, "step": 11600 }, { "epoch": 0.7336803160469054, "grad_norm": 68.22614288330078, "learning_rate": 1.8149070703199848e-05, "loss": 1.7462, "step": 11700 }, { "epoch": 0.7336803160469054, "eval_loss": 1.732839822769165, "eval_runtime": 502.4559, "eval_samples_per_second": 253.897, "eval_steps_per_second": 7.935, "step": 11700 }, { "epoch": 0.7399510879789302, "grad_norm": 54.836814880371094, "learning_rate": 1.8114232959988854e-05, "loss": 1.3519, "step": 11800 }, { "epoch": 0.7399510879789302, "eval_loss": 1.8369685411453247, "eval_runtime": 491.7357, "eval_samples_per_second": 259.432, "eval_steps_per_second": 8.108, "step": 11800 }, { "epoch": 0.746221859910955, "grad_norm": 189.6983184814453, "learning_rate": 1.807939521677786e-05, "loss": 1.4935, "step": 11900 }, { "epoch": 0.746221859910955, "eval_loss": 1.72471022605896, "eval_runtime": 497.1147, "eval_samples_per_second": 256.625, "eval_steps_per_second": 8.02, "step": 11900 }, { "epoch": 0.7524926318429799, "grad_norm": 103.63326263427734, "learning_rate": 1.8044557473566863e-05, "loss": 1.1721, "step": 12000 }, { "epoch": 0.7524926318429799, "eval_loss": 1.6529266834259033, "eval_runtime": 496.0569, "eval_samples_per_second": 257.172, "eval_steps_per_second": 8.037, "step": 12000 }, { "epoch": 0.7587634037750047, "grad_norm": 118.19406127929688, "learning_rate": 1.8009719730355866e-05, "loss": 2.2432, "step": 12100 }, { "epoch": 0.7587634037750047, "eval_loss": 1.6328880786895752, "eval_runtime": 502.1124, "eval_samples_per_second": 254.071, "eval_steps_per_second": 7.94, "step": 12100 }, { "epoch": 0.7650341757070296, "grad_norm": 135.55650329589844, "learning_rate": 1.7974881987144873e-05, "loss": 1.6931, "step": 12200 }, { "epoch": 0.7650341757070296, "eval_loss": 1.6563047170639038, "eval_runtime": 492.9421, "eval_samples_per_second": 258.797, "eval_steps_per_second": 8.088, "step": 12200 }, { "epoch": 0.7713049476390543, "grad_norm": 3.689490795135498, "learning_rate": 1.794004424393388e-05, "loss": 1.2736, "step": 12300 }, { "epoch": 0.7713049476390543, "eval_loss": 1.6984437704086304, "eval_runtime": 495.6061, "eval_samples_per_second": 257.406, "eval_steps_per_second": 8.045, "step": 12300 }, { "epoch": 0.7775757195710792, "grad_norm": 88.78681945800781, "learning_rate": 1.7905206500722885e-05, "loss": 1.7063, "step": 12400 }, { "epoch": 0.7775757195710792, "eval_loss": 1.6574100255966187, "eval_runtime": 504.1606, "eval_samples_per_second": 253.038, "eval_steps_per_second": 7.908, "step": 12400 }, { "epoch": 0.783846491503104, "grad_norm": 20.35865592956543, "learning_rate": 1.787036875751189e-05, "loss": 1.7921, "step": 12500 }, { "epoch": 0.783846491503104, "eval_loss": 1.7759722471237183, "eval_runtime": 497.8975, "eval_samples_per_second": 256.221, "eval_steps_per_second": 8.008, "step": 12500 }, { "epoch": 0.7901172634351289, "grad_norm": 2.046844720840454, "learning_rate": 1.7835531014300895e-05, "loss": 1.875, "step": 12600 }, { "epoch": 0.7901172634351289, "eval_loss": 1.7148810625076294, "eval_runtime": 492.3064, "eval_samples_per_second": 259.131, "eval_steps_per_second": 8.099, "step": 12600 }, { "epoch": 0.7963880353671537, "grad_norm": 0.9655187129974365, "learning_rate": 1.78006932710899e-05, "loss": 1.4435, "step": 12700 }, { "epoch": 0.7963880353671537, "eval_loss": 1.8084521293640137, "eval_runtime": 502.53, "eval_samples_per_second": 253.859, "eval_steps_per_second": 7.934, "step": 12700 }, { "epoch": 0.8026588072991785, "grad_norm": 135.72523498535156, "learning_rate": 1.7765855527878907e-05, "loss": 1.5271, "step": 12800 }, { "epoch": 0.8026588072991785, "eval_loss": 1.7246832847595215, "eval_runtime": 498.9684, "eval_samples_per_second": 255.672, "eval_steps_per_second": 7.99, "step": 12800 }, { "epoch": 0.8089295792312033, "grad_norm": 1.4582579135894775, "learning_rate": 1.773101778466791e-05, "loss": 1.618, "step": 12900 }, { "epoch": 0.8089295792312033, "eval_loss": 1.6542091369628906, "eval_runtime": 498.6777, "eval_samples_per_second": 255.821, "eval_steps_per_second": 7.995, "step": 12900 }, { "epoch": 0.8152003511632282, "grad_norm": 240.90525817871094, "learning_rate": 1.7696528418889027e-05, "loss": 1.9788, "step": 13000 }, { "epoch": 0.8152003511632282, "eval_loss": 1.5685710906982422, "eval_runtime": 501.55, "eval_samples_per_second": 254.355, "eval_steps_per_second": 7.949, "step": 13000 }, { "epoch": 0.821471123095253, "grad_norm": 0.4606687128543854, "learning_rate": 1.7661690675678033e-05, "loss": 1.8213, "step": 13100 }, { "epoch": 0.821471123095253, "eval_loss": 1.560313105583191, "eval_runtime": 495.8302, "eval_samples_per_second": 257.29, "eval_steps_per_second": 8.041, "step": 13100 }, { "epoch": 0.8277418950272779, "grad_norm": 33.05907440185547, "learning_rate": 1.7626852932467036e-05, "loss": 1.3661, "step": 13200 }, { "epoch": 0.8277418950272779, "eval_loss": 1.637640118598938, "eval_runtime": 499.6063, "eval_samples_per_second": 255.345, "eval_steps_per_second": 7.98, "step": 13200 }, { "epoch": 0.8340126669593027, "grad_norm": 58.993228912353516, "learning_rate": 1.7592015189256042e-05, "loss": 1.3852, "step": 13300 }, { "epoch": 0.8340126669593027, "eval_loss": 1.595252513885498, "eval_runtime": 497.3714, "eval_samples_per_second": 256.492, "eval_steps_per_second": 8.016, "step": 13300 }, { "epoch": 0.8402834388913275, "grad_norm": 52.913265228271484, "learning_rate": 1.7557177446045045e-05, "loss": 1.4673, "step": 13400 }, { "epoch": 0.8402834388913275, "eval_loss": 1.634629487991333, "eval_runtime": 497.4186, "eval_samples_per_second": 256.468, "eval_steps_per_second": 8.015, "step": 13400 }, { "epoch": 0.8465542108233524, "grad_norm": 126.8105697631836, "learning_rate": 1.752233970283405e-05, "loss": 1.6684, "step": 13500 }, { "epoch": 0.8465542108233524, "eval_loss": 1.5818397998809814, "eval_runtime": 499.5245, "eval_samples_per_second": 255.387, "eval_steps_per_second": 7.982, "step": 13500 }, { "epoch": 0.8528249827553772, "grad_norm": 130.67335510253906, "learning_rate": 1.7487501959623058e-05, "loss": 1.686, "step": 13600 }, { "epoch": 0.8528249827553772, "eval_loss": 1.5840120315551758, "eval_runtime": 500.7181, "eval_samples_per_second": 254.778, "eval_steps_per_second": 7.963, "step": 13600 }, { "epoch": 0.8590957546874021, "grad_norm": 3.3967671394348145, "learning_rate": 1.7452664216412064e-05, "loss": 1.4397, "step": 13700 }, { "epoch": 0.8590957546874021, "eval_loss": 1.5855337381362915, "eval_runtime": 502.231, "eval_samples_per_second": 254.011, "eval_steps_per_second": 7.939, "step": 13700 }, { "epoch": 0.8653665266194268, "grad_norm": 446.9328918457031, "learning_rate": 1.7417826473201067e-05, "loss": 1.5973, "step": 13800 }, { "epoch": 0.8653665266194268, "eval_loss": 1.720745325088501, "eval_runtime": 501.237, "eval_samples_per_second": 254.514, "eval_steps_per_second": 7.954, "step": 13800 }, { "epoch": 0.8716372985514517, "grad_norm": 0.6950648427009583, "learning_rate": 1.7382988729990073e-05, "loss": 1.221, "step": 13900 }, { "epoch": 0.8716372985514517, "eval_loss": 1.638085961341858, "eval_runtime": 499.3245, "eval_samples_per_second": 255.489, "eval_steps_per_second": 7.985, "step": 13900 }, { "epoch": 0.8779080704834765, "grad_norm": 24.994272232055664, "learning_rate": 1.7348150986779076e-05, "loss": 1.2082, "step": 14000 }, { "epoch": 0.8779080704834765, "eval_loss": 1.6335324048995972, "eval_runtime": 501.6663, "eval_samples_per_second": 254.297, "eval_steps_per_second": 7.948, "step": 14000 }, { "epoch": 0.8841788424155014, "grad_norm": 0.017005544155836105, "learning_rate": 1.7313313243568083e-05, "loss": 1.5399, "step": 14100 }, { "epoch": 0.8841788424155014, "eval_loss": 1.643354058265686, "eval_runtime": 500.7206, "eval_samples_per_second": 254.777, "eval_steps_per_second": 7.963, "step": 14100 }, { "epoch": 0.8904496143475262, "grad_norm": 48.26883316040039, "learning_rate": 1.727847550035709e-05, "loss": 1.5265, "step": 14200 }, { "epoch": 0.8904496143475262, "eval_loss": 1.7265760898590088, "eval_runtime": 503.0351, "eval_samples_per_second": 253.605, "eval_steps_per_second": 7.926, "step": 14200 }, { "epoch": 0.896720386279551, "grad_norm": 4.5458149909973145, "learning_rate": 1.7243637757146095e-05, "loss": 0.9321, "step": 14300 }, { "epoch": 0.896720386279551, "eval_loss": 1.5980534553527832, "eval_runtime": 500.0744, "eval_samples_per_second": 255.106, "eval_steps_per_second": 7.973, "step": 14300 }, { "epoch": 0.9029911582115758, "grad_norm": 1.9558783769607544, "learning_rate": 1.72088000139351e-05, "loss": 1.1133, "step": 14400 }, { "epoch": 0.9029911582115758, "eval_loss": 1.612575650215149, "eval_runtime": 502.1556, "eval_samples_per_second": 254.049, "eval_steps_per_second": 7.94, "step": 14400 }, { "epoch": 0.9092619301436007, "grad_norm": 71.19198608398438, "learning_rate": 1.7173962270724105e-05, "loss": 1.0754, "step": 14500 }, { "epoch": 0.9092619301436007, "eval_loss": 1.6227186918258667, "eval_runtime": 493.8123, "eval_samples_per_second": 258.341, "eval_steps_per_second": 8.074, "step": 14500 }, { "epoch": 0.9155327020756255, "grad_norm": 0.26305466890335083, "learning_rate": 1.7139124527513108e-05, "loss": 1.3486, "step": 14600 }, { "epoch": 0.9155327020756255, "eval_loss": 1.6142776012420654, "eval_runtime": 499.3615, "eval_samples_per_second": 255.47, "eval_steps_per_second": 7.984, "step": 14600 }, { "epoch": 0.9218034740076504, "grad_norm": 35.207157135009766, "learning_rate": 1.7104286784302114e-05, "loss": 1.6338, "step": 14700 }, { "epoch": 0.9218034740076504, "eval_loss": 1.5451936721801758, "eval_runtime": 501.5927, "eval_samples_per_second": 254.334, "eval_steps_per_second": 7.949, "step": 14700 }, { "epoch": 0.9280742459396751, "grad_norm": 213.60140991210938, "learning_rate": 1.706944904109112e-05, "loss": 1.389, "step": 14800 }, { "epoch": 0.9280742459396751, "eval_loss": 1.6098874807357788, "eval_runtime": 501.8582, "eval_samples_per_second": 254.199, "eval_steps_per_second": 7.944, "step": 14800 }, { "epoch": 0.9343450178717, "grad_norm": 111.08502960205078, "learning_rate": 1.7034611297880123e-05, "loss": 1.3776, "step": 14900 }, { "epoch": 0.9343450178717, "eval_loss": 1.6435140371322632, "eval_runtime": 502.6322, "eval_samples_per_second": 253.808, "eval_steps_per_second": 7.932, "step": 14900 }, { "epoch": 0.9406157898037248, "grad_norm": 18.123170852661133, "learning_rate": 1.699977355466913e-05, "loss": 1.8714, "step": 15000 }, { "epoch": 0.9406157898037248, "eval_loss": 1.537667989730835, "eval_runtime": 501.0111, "eval_samples_per_second": 254.629, "eval_steps_per_second": 7.958, "step": 15000 }, { "epoch": 0.9468865617357497, "grad_norm": 155.93455505371094, "learning_rate": 1.6964935811458133e-05, "loss": 1.1286, "step": 15100 }, { "epoch": 0.9468865617357497, "eval_loss": 1.6325874328613281, "eval_runtime": 502.9153, "eval_samples_per_second": 253.665, "eval_steps_per_second": 7.928, "step": 15100 }, { "epoch": 0.9531573336677746, "grad_norm": 172.1987762451172, "learning_rate": 1.693009806824714e-05, "loss": 1.4029, "step": 15200 }, { "epoch": 0.9531573336677746, "eval_loss": 1.6255732774734497, "eval_runtime": 508.4447, "eval_samples_per_second": 250.906, "eval_steps_per_second": 7.842, "step": 15200 }, { "epoch": 0.9594281055997993, "grad_norm": 6.499632835388184, "learning_rate": 1.6895260325036145e-05, "loss": 1.7772, "step": 15300 }, { "epoch": 0.9594281055997993, "eval_loss": 1.5221425294876099, "eval_runtime": 502.3511, "eval_samples_per_second": 253.95, "eval_steps_per_second": 7.937, "step": 15300 }, { "epoch": 0.9656988775318242, "grad_norm": 0.45312049984931946, "learning_rate": 1.686042258182515e-05, "loss": 1.3415, "step": 15400 }, { "epoch": 0.9656988775318242, "eval_loss": 1.5603629350662231, "eval_runtime": 502.2256, "eval_samples_per_second": 254.013, "eval_steps_per_second": 7.939, "step": 15400 }, { "epoch": 0.971969649463849, "grad_norm": 1.405121922492981, "learning_rate": 1.6825584838614155e-05, "loss": 1.1088, "step": 15500 }, { "epoch": 0.971969649463849, "eval_loss": 1.574865698814392, "eval_runtime": 517.5942, "eval_samples_per_second": 246.471, "eval_steps_per_second": 7.703, "step": 15500 }, { "epoch": 0.9782404213958739, "grad_norm": 6.808924198150635, "learning_rate": 1.679074709540316e-05, "loss": 1.4602, "step": 15600 }, { "epoch": 0.9782404213958739, "eval_loss": 1.494147777557373, "eval_runtime": 505.721, "eval_samples_per_second": 252.258, "eval_steps_per_second": 7.884, "step": 15600 }, { "epoch": 0.9845111933278987, "grad_norm": 88.74259185791016, "learning_rate": 1.6755909352192167e-05, "loss": 1.867, "step": 15700 }, { "epoch": 0.9845111933278987, "eval_loss": 1.3730698823928833, "eval_runtime": 515.7676, "eval_samples_per_second": 247.344, "eval_steps_per_second": 7.73, "step": 15700 }, { "epoch": 0.9907819652599235, "grad_norm": 3.1625919342041016, "learning_rate": 1.672141998641328e-05, "loss": 1.4541, "step": 15800 }, { "epoch": 0.9907819652599235, "eval_loss": 1.4205607175827026, "eval_runtime": 486.2625, "eval_samples_per_second": 262.352, "eval_steps_per_second": 8.199, "step": 15800 }, { "epoch": 0.9970527371919483, "grad_norm": 135.7765655517578, "learning_rate": 1.6686930620634396e-05, "loss": 1.1966, "step": 15900 }, { "epoch": 0.9970527371919483, "eval_loss": 1.4982208013534546, "eval_runtime": 494.4178, "eval_samples_per_second": 258.025, "eval_steps_per_second": 8.064, "step": 15900 }, { "epoch": 1.0033235091239732, "grad_norm": 97.84881591796875, "learning_rate": 1.6652092877423403e-05, "loss": 1.1447, "step": 16000 }, { "epoch": 1.0033235091239732, "eval_loss": 1.5120809078216553, "eval_runtime": 491.4115, "eval_samples_per_second": 259.603, "eval_steps_per_second": 8.113, "step": 16000 }, { "epoch": 1.009594281055998, "grad_norm": 152.9120635986328, "learning_rate": 1.6617255134212406e-05, "loss": 1.1266, "step": 16100 }, { "epoch": 1.009594281055998, "eval_loss": 1.4103273153305054, "eval_runtime": 501.3393, "eval_samples_per_second": 254.462, "eval_steps_per_second": 7.953, "step": 16100 }, { "epoch": 1.0158650529880229, "grad_norm": 10.062068939208984, "learning_rate": 1.6582417391001412e-05, "loss": 1.1971, "step": 16200 }, { "epoch": 1.0158650529880229, "eval_loss": 1.5044476985931396, "eval_runtime": 500.4234, "eval_samples_per_second": 254.928, "eval_steps_per_second": 7.967, "step": 16200 }, { "epoch": 1.0221358249200476, "grad_norm": 385.3752136230469, "learning_rate": 1.6547579647790418e-05, "loss": 1.3376, "step": 16300 }, { "epoch": 1.0221358249200476, "eval_loss": 1.5336840152740479, "eval_runtime": 494.8747, "eval_samples_per_second": 257.786, "eval_steps_per_second": 8.057, "step": 16300 }, { "epoch": 1.0284065968520726, "grad_norm": 0.33529093861579895, "learning_rate": 1.6512741904579425e-05, "loss": 1.7977, "step": 16400 }, { "epoch": 1.0284065968520726, "eval_loss": 1.5711828470230103, "eval_runtime": 502.7844, "eval_samples_per_second": 253.731, "eval_steps_per_second": 7.93, "step": 16400 }, { "epoch": 1.0346773687840973, "grad_norm": 228.05165100097656, "learning_rate": 1.6477904161368428e-05, "loss": 1.6946, "step": 16500 }, { "epoch": 1.0346773687840973, "eval_loss": 1.5322738885879517, "eval_runtime": 498.3968, "eval_samples_per_second": 255.965, "eval_steps_per_second": 8.0, "step": 16500 }, { "epoch": 1.040948140716122, "grad_norm": 1.2080790996551514, "learning_rate": 1.6443066418157434e-05, "loss": 0.8674, "step": 16600 }, { "epoch": 1.040948140716122, "eval_loss": 1.4461946487426758, "eval_runtime": 492.3648, "eval_samples_per_second": 259.101, "eval_steps_per_second": 8.098, "step": 16600 }, { "epoch": 1.047218912648147, "grad_norm": 68.50479888916016, "learning_rate": 1.6408228674946437e-05, "loss": 1.6447, "step": 16700 }, { "epoch": 1.047218912648147, "eval_loss": 1.483079433441162, "eval_runtime": 496.9095, "eval_samples_per_second": 256.731, "eval_steps_per_second": 8.024, "step": 16700 }, { "epoch": 1.0534896845801718, "grad_norm": 0.08792801946401596, "learning_rate": 1.6373390931735443e-05, "loss": 1.2709, "step": 16800 }, { "epoch": 1.0534896845801718, "eval_loss": 1.575551986694336, "eval_runtime": 503.9395, "eval_samples_per_second": 253.149, "eval_steps_per_second": 7.912, "step": 16800 }, { "epoch": 1.0597604565121967, "grad_norm": 1.81405770778656, "learning_rate": 1.633855318852445e-05, "loss": 1.5217, "step": 16900 }, { "epoch": 1.0597604565121967, "eval_loss": 1.5059562921524048, "eval_runtime": 506.8836, "eval_samples_per_second": 251.679, "eval_steps_per_second": 7.866, "step": 16900 }, { "epoch": 1.0660312284442215, "grad_norm": 38.73731231689453, "learning_rate": 1.6303715445313456e-05, "loss": 1.2986, "step": 17000 }, { "epoch": 1.0660312284442215, "eval_loss": 1.4834423065185547, "eval_runtime": 503.4795, "eval_samples_per_second": 253.381, "eval_steps_per_second": 7.919, "step": 17000 }, { "epoch": 1.0723020003762462, "grad_norm": 0.7970458269119263, "learning_rate": 1.626887770210246e-05, "loss": 0.9976, "step": 17100 }, { "epoch": 1.0723020003762462, "eval_loss": 1.4840906858444214, "eval_runtime": 504.3957, "eval_samples_per_second": 252.92, "eval_steps_per_second": 7.905, "step": 17100 }, { "epoch": 1.0785727723082712, "grad_norm": 0.04621260613203049, "learning_rate": 1.6234039958891465e-05, "loss": 1.3457, "step": 17200 }, { "epoch": 1.0785727723082712, "eval_loss": 1.4227601289749146, "eval_runtime": 493.9233, "eval_samples_per_second": 258.283, "eval_steps_per_second": 8.072, "step": 17200 }, { "epoch": 1.084843544240296, "grad_norm": 0.5272818803787231, "learning_rate": 1.6199202215680468e-05, "loss": 0.987, "step": 17300 }, { "epoch": 1.084843544240296, "eval_loss": 1.3806939125061035, "eval_runtime": 501.4303, "eval_samples_per_second": 254.416, "eval_steps_per_second": 7.951, "step": 17300 }, { "epoch": 1.091114316172321, "grad_norm": 0.4564209282398224, "learning_rate": 1.6164364472469474e-05, "loss": 1.2714, "step": 17400 }, { "epoch": 1.091114316172321, "eval_loss": 1.3470913171768188, "eval_runtime": 501.1914, "eval_samples_per_second": 254.538, "eval_steps_per_second": 7.955, "step": 17400 }, { "epoch": 1.0973850881043457, "grad_norm": 14.678479194641113, "learning_rate": 1.612952672925848e-05, "loss": 1.298, "step": 17500 }, { "epoch": 1.0973850881043457, "eval_loss": 1.4133707284927368, "eval_runtime": 499.6488, "eval_samples_per_second": 255.323, "eval_steps_per_second": 7.98, "step": 17500 }, { "epoch": 1.1036558600363704, "grad_norm": 1.6324628591537476, "learning_rate": 1.6094688986047484e-05, "loss": 0.9522, "step": 17600 }, { "epoch": 1.1036558600363704, "eval_loss": 1.4225292205810547, "eval_runtime": 501.7975, "eval_samples_per_second": 254.23, "eval_steps_per_second": 7.945, "step": 17600 }, { "epoch": 1.1099266319683954, "grad_norm": 1.6328845024108887, "learning_rate": 1.605985124283649e-05, "loss": 1.0634, "step": 17700 }, { "epoch": 1.1099266319683954, "eval_loss": 1.4474034309387207, "eval_runtime": 505.9682, "eval_samples_per_second": 252.134, "eval_steps_per_second": 7.88, "step": 17700 }, { "epoch": 1.11619740390042, "grad_norm": 0.9931433200836182, "learning_rate": 1.6025013499625493e-05, "loss": 1.2889, "step": 17800 }, { "epoch": 1.11619740390042, "eval_loss": 1.4678562879562378, "eval_runtime": 503.5781, "eval_samples_per_second": 253.331, "eval_steps_per_second": 7.917, "step": 17800 }, { "epoch": 1.122468175832445, "grad_norm": 59.28689956665039, "learning_rate": 1.5990175756414503e-05, "loss": 1.7532, "step": 17900 }, { "epoch": 1.122468175832445, "eval_loss": 1.3757271766662598, "eval_runtime": 499.9963, "eval_samples_per_second": 255.146, "eval_steps_per_second": 7.974, "step": 17900 }, { "epoch": 1.1287389477644698, "grad_norm": 72.52947998046875, "learning_rate": 1.5955338013203506e-05, "loss": 1.6613, "step": 18000 }, { "epoch": 1.1287389477644698, "eval_loss": 1.3807989358901978, "eval_runtime": 492.0199, "eval_samples_per_second": 259.282, "eval_steps_per_second": 8.103, "step": 18000 }, { "epoch": 1.1350097196964946, "grad_norm": 29.813941955566406, "learning_rate": 1.5920500269992512e-05, "loss": 1.1765, "step": 18100 }, { "epoch": 1.1350097196964946, "eval_loss": 1.3903069496154785, "eval_runtime": 501.0104, "eval_samples_per_second": 254.629, "eval_steps_per_second": 7.958, "step": 18100 }, { "epoch": 1.1412804916285195, "grad_norm": 1.3140065670013428, "learning_rate": 1.5885662526781515e-05, "loss": 1.2787, "step": 18200 }, { "epoch": 1.1412804916285195, "eval_loss": 1.3920559883117676, "eval_runtime": 500.2005, "eval_samples_per_second": 255.042, "eval_steps_per_second": 7.971, "step": 18200 }, { "epoch": 1.1475512635605443, "grad_norm": 0.21044209599494934, "learning_rate": 1.585082478357052e-05, "loss": 1.2532, "step": 18300 }, { "epoch": 1.1475512635605443, "eval_loss": 1.3519495725631714, "eval_runtime": 505.681, "eval_samples_per_second": 252.278, "eval_steps_per_second": 7.884, "step": 18300 }, { "epoch": 1.1538220354925692, "grad_norm": 56.845211029052734, "learning_rate": 1.5815987040359528e-05, "loss": 1.8056, "step": 18400 }, { "epoch": 1.1538220354925692, "eval_loss": 1.2984182834625244, "eval_runtime": 507.7377, "eval_samples_per_second": 251.256, "eval_steps_per_second": 7.852, "step": 18400 }, { "epoch": 1.160092807424594, "grad_norm": 99.5033950805664, "learning_rate": 1.5781149297148534e-05, "loss": 1.0985, "step": 18500 }, { "epoch": 1.160092807424594, "eval_loss": 1.3321679830551147, "eval_runtime": 504.9231, "eval_samples_per_second": 252.656, "eval_steps_per_second": 7.896, "step": 18500 }, { "epoch": 1.1663635793566187, "grad_norm": 75.43387603759766, "learning_rate": 1.5746311553937537e-05, "loss": 1.8665, "step": 18600 }, { "epoch": 1.1663635793566187, "eval_loss": 1.4059826135635376, "eval_runtime": 495.7712, "eval_samples_per_second": 257.32, "eval_steps_per_second": 8.042, "step": 18600 }, { "epoch": 1.1726343512886437, "grad_norm": 111.51386260986328, "learning_rate": 1.5711473810726543e-05, "loss": 1.2427, "step": 18700 }, { "epoch": 1.1726343512886437, "eval_loss": 1.3774936199188232, "eval_runtime": 502.6217, "eval_samples_per_second": 253.813, "eval_steps_per_second": 7.932, "step": 18700 }, { "epoch": 1.1789051232206684, "grad_norm": 1.3077305555343628, "learning_rate": 1.5676636067515546e-05, "loss": 1.1241, "step": 18800 }, { "epoch": 1.1789051232206684, "eval_loss": 1.3168435096740723, "eval_runtime": 498.3092, "eval_samples_per_second": 256.01, "eval_steps_per_second": 8.001, "step": 18800 }, { "epoch": 1.1851758951526934, "grad_norm": 29.557662963867188, "learning_rate": 1.5641798324304553e-05, "loss": 1.2348, "step": 18900 }, { "epoch": 1.1851758951526934, "eval_loss": 1.353879690170288, "eval_runtime": 503.07, "eval_samples_per_second": 253.587, "eval_steps_per_second": 7.925, "step": 18900 }, { "epoch": 1.1914466670847181, "grad_norm": 65.81330871582031, "learning_rate": 1.560696058109356e-05, "loss": 1.1709, "step": 19000 }, { "epoch": 1.1914466670847181, "eval_loss": 1.3540174961090088, "eval_runtime": 498.735, "eval_samples_per_second": 255.791, "eval_steps_per_second": 7.994, "step": 19000 }, { "epoch": 1.1977174390167429, "grad_norm": 48.844017028808594, "learning_rate": 1.5572122837882565e-05, "loss": 0.8844, "step": 19100 }, { "epoch": 1.1977174390167429, "eval_loss": 1.3141909837722778, "eval_runtime": 498.6069, "eval_samples_per_second": 255.857, "eval_steps_per_second": 7.996, "step": 19100 }, { "epoch": 1.2039882109487678, "grad_norm": 3.451929807662964, "learning_rate": 1.5537285094671568e-05, "loss": 1.0035, "step": 19200 }, { "epoch": 1.2039882109487678, "eval_loss": 1.3781260251998901, "eval_runtime": 506.2945, "eval_samples_per_second": 251.972, "eval_steps_per_second": 7.875, "step": 19200 }, { "epoch": 1.2102589828807926, "grad_norm": 77.69365692138672, "learning_rate": 1.5502447351460575e-05, "loss": 1.4279, "step": 19300 }, { "epoch": 1.2102589828807926, "eval_loss": 1.261493444442749, "eval_runtime": 498.5065, "eval_samples_per_second": 255.908, "eval_steps_per_second": 7.998, "step": 19300 }, { "epoch": 1.2165297548128176, "grad_norm": 21.791259765625, "learning_rate": 1.5467609608249577e-05, "loss": 1.3327, "step": 19400 }, { "epoch": 1.2165297548128176, "eval_loss": 1.2696096897125244, "eval_runtime": 498.1301, "eval_samples_per_second": 256.102, "eval_steps_per_second": 8.004, "step": 19400 }, { "epoch": 1.2228005267448423, "grad_norm": 2.250319242477417, "learning_rate": 1.5432771865038584e-05, "loss": 0.993, "step": 19500 }, { "epoch": 1.2228005267448423, "eval_loss": 1.3169900178909302, "eval_runtime": 495.9918, "eval_samples_per_second": 257.206, "eval_steps_per_second": 8.038, "step": 19500 }, { "epoch": 1.229071298676867, "grad_norm": 73.77873229980469, "learning_rate": 1.539793412182759e-05, "loss": 0.7869, "step": 19600 }, { "epoch": 1.229071298676867, "eval_loss": 1.2967498302459717, "eval_runtime": 497.8866, "eval_samples_per_second": 256.227, "eval_steps_per_second": 8.008, "step": 19600 }, { "epoch": 1.235342070608892, "grad_norm": 0.07626141607761383, "learning_rate": 1.5363096378616596e-05, "loss": 0.985, "step": 19700 }, { "epoch": 1.235342070608892, "eval_loss": 1.3056693077087402, "eval_runtime": 494.5073, "eval_samples_per_second": 257.978, "eval_steps_per_second": 8.063, "step": 19700 }, { "epoch": 1.2416128425409168, "grad_norm": 4.803875923156738, "learning_rate": 1.53282586354056e-05, "loss": 1.1603, "step": 19800 }, { "epoch": 1.2416128425409168, "eval_loss": 1.2796647548675537, "eval_runtime": 496.7276, "eval_samples_per_second": 256.825, "eval_steps_per_second": 8.027, "step": 19800 }, { "epoch": 1.2478836144729417, "grad_norm": 63.491329193115234, "learning_rate": 1.5293420892194606e-05, "loss": 1.2469, "step": 19900 }, { "epoch": 1.2478836144729417, "eval_loss": 1.2394485473632812, "eval_runtime": 504.5722, "eval_samples_per_second": 252.832, "eval_steps_per_second": 7.902, "step": 19900 }, { "epoch": 1.2541543864049665, "grad_norm": 155.53126525878906, "learning_rate": 1.525858314898361e-05, "loss": 1.521, "step": 20000 }, { "epoch": 1.2541543864049665, "eval_loss": 1.2309328317642212, "eval_runtime": 499.147, "eval_samples_per_second": 255.58, "eval_steps_per_second": 7.988, "step": 20000 }, { "epoch": 1.2604251583369912, "grad_norm": 0.10026417672634125, "learning_rate": 1.5223745405772617e-05, "loss": 1.2632, "step": 20100 }, { "epoch": 1.2604251583369912, "eval_loss": 1.2352900505065918, "eval_runtime": 498.6806, "eval_samples_per_second": 255.819, "eval_steps_per_second": 7.995, "step": 20100 }, { "epoch": 1.2666959302690162, "grad_norm": 20.156579971313477, "learning_rate": 1.518890766256162e-05, "loss": 1.3621, "step": 20200 }, { "epoch": 1.2666959302690162, "eval_loss": 1.2432923316955566, "eval_runtime": 488.6088, "eval_samples_per_second": 261.092, "eval_steps_per_second": 8.16, "step": 20200 }, { "epoch": 1.272966702201041, "grad_norm": 1.3594141006469727, "learning_rate": 1.5154069919350624e-05, "loss": 1.5145, "step": 20300 }, { "epoch": 1.272966702201041, "eval_loss": 1.3064727783203125, "eval_runtime": 501.5288, "eval_samples_per_second": 254.366, "eval_steps_per_second": 7.95, "step": 20300 }, { "epoch": 1.2792374741330659, "grad_norm": 26.742637634277344, "learning_rate": 1.511923217613963e-05, "loss": 1.3708, "step": 20400 }, { "epoch": 1.2792374741330659, "eval_loss": 1.2422964572906494, "eval_runtime": 504.9841, "eval_samples_per_second": 252.626, "eval_steps_per_second": 7.895, "step": 20400 }, { "epoch": 1.2855082460650906, "grad_norm": 666.2847290039062, "learning_rate": 1.5084394432928635e-05, "loss": 1.1716, "step": 20500 }, { "epoch": 1.2855082460650906, "eval_loss": 1.2922592163085938, "eval_runtime": 502.0283, "eval_samples_per_second": 254.113, "eval_steps_per_second": 7.942, "step": 20500 }, { "epoch": 1.2917790179971154, "grad_norm": 0.90843665599823, "learning_rate": 1.5049556689717642e-05, "loss": 1.419, "step": 20600 }, { "epoch": 1.2917790179971154, "eval_loss": 1.2193955183029175, "eval_runtime": 496.5986, "eval_samples_per_second": 256.892, "eval_steps_per_second": 8.029, "step": 20600 }, { "epoch": 1.2980497899291403, "grad_norm": 174.012451171875, "learning_rate": 1.5014718946506646e-05, "loss": 1.1644, "step": 20700 }, { "epoch": 1.2980497899291403, "eval_loss": 1.2368745803833008, "eval_runtime": 500.022, "eval_samples_per_second": 255.133, "eval_steps_per_second": 7.974, "step": 20700 }, { "epoch": 1.304320561861165, "grad_norm": 7.468738555908203, "learning_rate": 1.4979881203295653e-05, "loss": 1.6589, "step": 20800 }, { "epoch": 1.304320561861165, "eval_loss": 1.1971392631530762, "eval_runtime": 500.4989, "eval_samples_per_second": 254.89, "eval_steps_per_second": 7.966, "step": 20800 }, { "epoch": 1.31059133379319, "grad_norm": 120.70152282714844, "learning_rate": 1.4945043460084656e-05, "loss": 1.0299, "step": 20900 }, { "epoch": 1.31059133379319, "eval_loss": 1.2342555522918701, "eval_runtime": 499.8846, "eval_samples_per_second": 255.203, "eval_steps_per_second": 7.976, "step": 20900 }, { "epoch": 1.3168621057252148, "grad_norm": 90.38188934326172, "learning_rate": 1.4910205716873662e-05, "loss": 1.3452, "step": 21000 }, { "epoch": 1.3168621057252148, "eval_loss": 1.2725248336791992, "eval_runtime": 490.3505, "eval_samples_per_second": 260.165, "eval_steps_per_second": 8.131, "step": 21000 }, { "epoch": 1.3231328776572395, "grad_norm": 0.8048076033592224, "learning_rate": 1.4875367973662667e-05, "loss": 1.4234, "step": 21100 }, { "epoch": 1.3231328776572395, "eval_loss": 1.2416248321533203, "eval_runtime": 493.5915, "eval_samples_per_second": 258.457, "eval_steps_per_second": 8.078, "step": 21100 }, { "epoch": 1.3294036495892645, "grad_norm": 102.93982696533203, "learning_rate": 1.4840530230451673e-05, "loss": 1.2496, "step": 21200 }, { "epoch": 1.3294036495892645, "eval_loss": 1.3609205484390259, "eval_runtime": 501.4555, "eval_samples_per_second": 254.403, "eval_steps_per_second": 7.951, "step": 21200 }, { "epoch": 1.3356744215212892, "grad_norm": 0.0473560094833374, "learning_rate": 1.4805692487240678e-05, "loss": 1.2133, "step": 21300 }, { "epoch": 1.3356744215212892, "eval_loss": 1.2892857789993286, "eval_runtime": 486.7923, "eval_samples_per_second": 262.067, "eval_steps_per_second": 8.19, "step": 21300 }, { "epoch": 1.3419451934533142, "grad_norm": 0.2829754948616028, "learning_rate": 1.4770854744029684e-05, "loss": 0.8682, "step": 21400 }, { "epoch": 1.3419451934533142, "eval_loss": 1.2352983951568604, "eval_runtime": 501.6238, "eval_samples_per_second": 254.318, "eval_steps_per_second": 7.948, "step": 21400 }, { "epoch": 1.348215965385339, "grad_norm": 0.09349790215492249, "learning_rate": 1.4736017000818687e-05, "loss": 0.9499, "step": 21500 }, { "epoch": 1.348215965385339, "eval_loss": 1.2423368692398071, "eval_runtime": 503.2262, "eval_samples_per_second": 253.508, "eval_steps_per_second": 7.923, "step": 21500 }, { "epoch": 1.3544867373173637, "grad_norm": 0.7133996486663818, "learning_rate": 1.4701179257607693e-05, "loss": 1.2896, "step": 21600 }, { "epoch": 1.3544867373173637, "eval_loss": 1.1796832084655762, "eval_runtime": 504.2727, "eval_samples_per_second": 252.982, "eval_steps_per_second": 7.906, "step": 21600 }, { "epoch": 1.3607575092493887, "grad_norm": 44.3637580871582, "learning_rate": 1.4666341514396698e-05, "loss": 1.2392, "step": 21700 }, { "epoch": 1.3607575092493887, "eval_loss": 1.1962292194366455, "eval_runtime": 504.4317, "eval_samples_per_second": 252.902, "eval_steps_per_second": 7.904, "step": 21700 }, { "epoch": 1.3670282811814134, "grad_norm": 41.141788482666016, "learning_rate": 1.4631503771185704e-05, "loss": 0.9206, "step": 21800 }, { "epoch": 1.3670282811814134, "eval_loss": 1.2483233213424683, "eval_runtime": 502.7012, "eval_samples_per_second": 253.773, "eval_steps_per_second": 7.931, "step": 21800 }, { "epoch": 1.3732990531134384, "grad_norm": 0.8109003901481628, "learning_rate": 1.4596666027974709e-05, "loss": 1.174, "step": 21900 }, { "epoch": 1.3732990531134384, "eval_loss": 1.23282790184021, "eval_runtime": 505.2234, "eval_samples_per_second": 252.506, "eval_steps_per_second": 7.892, "step": 21900 }, { "epoch": 1.379569825045463, "grad_norm": 74.6466293334961, "learning_rate": 1.4562176662195823e-05, "loss": 1.6361, "step": 22000 }, { "epoch": 1.379569825045463, "eval_loss": 1.1558316946029663, "eval_runtime": 501.4654, "eval_samples_per_second": 254.398, "eval_steps_per_second": 7.951, "step": 22000 }, { "epoch": 1.3858405969774878, "grad_norm": 5.058000087738037, "learning_rate": 1.452733891898483e-05, "loss": 0.8284, "step": 22100 }, { "epoch": 1.3858405969774878, "eval_loss": 1.271115779876709, "eval_runtime": 493.6867, "eval_samples_per_second": 258.407, "eval_steps_per_second": 8.076, "step": 22100 }, { "epoch": 1.3921113689095128, "grad_norm": 1.647706389427185, "learning_rate": 1.4492849553205946e-05, "loss": 1.2814, "step": 22200 }, { "epoch": 1.3921113689095128, "eval_loss": 1.246185064315796, "eval_runtime": 492.0347, "eval_samples_per_second": 259.274, "eval_steps_per_second": 8.103, "step": 22200 }, { "epoch": 1.3983821408415376, "grad_norm": 0.4397072494029999, "learning_rate": 1.4458011809994949e-05, "loss": 1.1595, "step": 22300 }, { "epoch": 1.3983821408415376, "eval_loss": 1.2613025903701782, "eval_runtime": 498.6845, "eval_samples_per_second": 255.817, "eval_steps_per_second": 7.995, "step": 22300 }, { "epoch": 1.4046529127735625, "grad_norm": 16.611690521240234, "learning_rate": 1.4423174066783955e-05, "loss": 1.3129, "step": 22400 }, { "epoch": 1.4046529127735625, "eval_loss": 1.1816045045852661, "eval_runtime": 496.2598, "eval_samples_per_second": 257.067, "eval_steps_per_second": 8.034, "step": 22400 }, { "epoch": 1.4109236847055873, "grad_norm": 69.52592468261719, "learning_rate": 1.438833632357296e-05, "loss": 1.1353, "step": 22500 }, { "epoch": 1.4109236847055873, "eval_loss": 1.245389699935913, "eval_runtime": 494.4633, "eval_samples_per_second": 258.001, "eval_steps_per_second": 8.063, "step": 22500 }, { "epoch": 1.417194456637612, "grad_norm": 6.014486789703369, "learning_rate": 1.4353498580361966e-05, "loss": 1.3302, "step": 22600 }, { "epoch": 1.417194456637612, "eval_loss": 1.1397989988327026, "eval_runtime": 503.4853, "eval_samples_per_second": 253.378, "eval_steps_per_second": 7.919, "step": 22600 }, { "epoch": 1.423465228569637, "grad_norm": 2.0832605361938477, "learning_rate": 1.4318660837150971e-05, "loss": 1.1591, "step": 22700 }, { "epoch": 1.423465228569637, "eval_loss": 1.2935895919799805, "eval_runtime": 495.0142, "eval_samples_per_second": 257.714, "eval_steps_per_second": 8.054, "step": 22700 }, { "epoch": 1.4297360005016617, "grad_norm": 4.5407891273498535, "learning_rate": 1.4283823093939975e-05, "loss": 0.6551, "step": 22800 }, { "epoch": 1.4297360005016617, "eval_loss": 1.2345027923583984, "eval_runtime": 489.171, "eval_samples_per_second": 260.792, "eval_steps_per_second": 8.151, "step": 22800 }, { "epoch": 1.4360067724336867, "grad_norm": 78.76990509033203, "learning_rate": 1.4248985350728982e-05, "loss": 1.2884, "step": 22900 }, { "epoch": 1.4360067724336867, "eval_loss": 1.1629202365875244, "eval_runtime": 483.3185, "eval_samples_per_second": 263.95, "eval_steps_per_second": 8.249, "step": 22900 }, { "epoch": 1.4422775443657114, "grad_norm": 110.63036346435547, "learning_rate": 1.4214147607517985e-05, "loss": 1.1769, "step": 23000 }, { "epoch": 1.4422775443657114, "eval_loss": 1.2339965105056763, "eval_runtime": 486.5591, "eval_samples_per_second": 262.192, "eval_steps_per_second": 8.194, "step": 23000 }, { "epoch": 1.4485483162977362, "grad_norm": 126.27979278564453, "learning_rate": 1.4179309864306991e-05, "loss": 1.1331, "step": 23100 }, { "epoch": 1.4485483162977362, "eval_loss": 1.2035988569259644, "eval_runtime": 490.9417, "eval_samples_per_second": 259.852, "eval_steps_per_second": 8.121, "step": 23100 }, { "epoch": 1.4548190882297611, "grad_norm": 0.48294782638549805, "learning_rate": 1.4144472121095996e-05, "loss": 1.1008, "step": 23200 }, { "epoch": 1.4548190882297611, "eval_loss": 1.1685419082641602, "eval_runtime": 485.6527, "eval_samples_per_second": 262.682, "eval_steps_per_second": 8.21, "step": 23200 }, { "epoch": 1.4610898601617859, "grad_norm": 6.466658115386963, "learning_rate": 1.4109634377885002e-05, "loss": 1.1487, "step": 23300 }, { "epoch": 1.4610898601617859, "eval_loss": 1.1274471282958984, "eval_runtime": 496.8021, "eval_samples_per_second": 256.786, "eval_steps_per_second": 8.025, "step": 23300 }, { "epoch": 1.4673606320938108, "grad_norm": 18.893667221069336, "learning_rate": 1.4074796634674007e-05, "loss": 0.7753, "step": 23400 }, { "epoch": 1.4673606320938108, "eval_loss": 1.1737704277038574, "eval_runtime": 486.0352, "eval_samples_per_second": 262.475, "eval_steps_per_second": 8.203, "step": 23400 }, { "epoch": 1.4736314040258356, "grad_norm": 19.157712936401367, "learning_rate": 1.4039958891463013e-05, "loss": 1.3236, "step": 23500 }, { "epoch": 1.4736314040258356, "eval_loss": 1.2376619577407837, "eval_runtime": 488.4737, "eval_samples_per_second": 261.164, "eval_steps_per_second": 8.162, "step": 23500 }, { "epoch": 1.4799021759578603, "grad_norm": 9.691899299621582, "learning_rate": 1.4005121148252016e-05, "loss": 0.919, "step": 23600 }, { "epoch": 1.4799021759578603, "eval_loss": 1.2018409967422485, "eval_runtime": 493.2156, "eval_samples_per_second": 258.654, "eval_steps_per_second": 8.084, "step": 23600 }, { "epoch": 1.4861729478898853, "grad_norm": 98.8059310913086, "learning_rate": 1.3970283405041022e-05, "loss": 0.8516, "step": 23700 }, { "epoch": 1.4861729478898853, "eval_loss": 1.2296911478042603, "eval_runtime": 499.0547, "eval_samples_per_second": 255.627, "eval_steps_per_second": 7.989, "step": 23700 }, { "epoch": 1.49244371982191, "grad_norm": 22.1707706451416, "learning_rate": 1.3935445661830027e-05, "loss": 1.092, "step": 23800 }, { "epoch": 1.49244371982191, "eval_loss": 1.1629080772399902, "eval_runtime": 493.4212, "eval_samples_per_second": 258.546, "eval_steps_per_second": 8.08, "step": 23800 }, { "epoch": 1.498714491753935, "grad_norm": 0.31641775369644165, "learning_rate": 1.3900607918619033e-05, "loss": 0.673, "step": 23900 }, { "epoch": 1.498714491753935, "eval_loss": 1.2161920070648193, "eval_runtime": 495.1904, "eval_samples_per_second": 257.622, "eval_steps_per_second": 8.051, "step": 23900 }, { "epoch": 1.5049852636859598, "grad_norm": 0.4521692097187042, "learning_rate": 1.3865770175408038e-05, "loss": 0.994, "step": 24000 }, { "epoch": 1.5049852636859598, "eval_loss": 1.1778312921524048, "eval_runtime": 494.7958, "eval_samples_per_second": 257.828, "eval_steps_per_second": 8.058, "step": 24000 }, { "epoch": 1.5112560356179845, "grad_norm": 1.2718249559402466, "learning_rate": 1.3830932432197044e-05, "loss": 0.8766, "step": 24100 }, { "epoch": 1.5112560356179845, "eval_loss": 1.1902062892913818, "eval_runtime": 498.2478, "eval_samples_per_second": 256.041, "eval_steps_per_second": 8.002, "step": 24100 }, { "epoch": 1.5175268075500095, "grad_norm": 78.13153076171875, "learning_rate": 1.3796094688986047e-05, "loss": 1.3818, "step": 24200 }, { "epoch": 1.5175268075500095, "eval_loss": 1.1638315916061401, "eval_runtime": 475.6768, "eval_samples_per_second": 268.191, "eval_steps_per_second": 8.382, "step": 24200 }, { "epoch": 1.5237975794820342, "grad_norm": 11.799439430236816, "learning_rate": 1.3761256945775054e-05, "loss": 1.1215, "step": 24300 }, { "epoch": 1.5237975794820342, "eval_loss": 1.1665599346160889, "eval_runtime": 493.9156, "eval_samples_per_second": 258.287, "eval_steps_per_second": 8.072, "step": 24300 }, { "epoch": 1.5300683514140592, "grad_norm": 0.15210537612438202, "learning_rate": 1.3726419202564058e-05, "loss": 0.8485, "step": 24400 }, { "epoch": 1.5300683514140592, "eval_loss": 1.190748929977417, "eval_runtime": 489.6338, "eval_samples_per_second": 260.546, "eval_steps_per_second": 8.143, "step": 24400 }, { "epoch": 1.536339123346084, "grad_norm": 111.32445526123047, "learning_rate": 1.3691581459353065e-05, "loss": 1.1033, "step": 24500 }, { "epoch": 1.536339123346084, "eval_loss": 1.2317506074905396, "eval_runtime": 495.4364, "eval_samples_per_second": 257.494, "eval_steps_per_second": 8.047, "step": 24500 }, { "epoch": 1.5426098952781087, "grad_norm": 4.906432151794434, "learning_rate": 1.365674371614207e-05, "loss": 0.9001, "step": 24600 }, { "epoch": 1.5426098952781087, "eval_loss": 1.2112876176834106, "eval_runtime": 501.1743, "eval_samples_per_second": 254.546, "eval_steps_per_second": 7.955, "step": 24600 }, { "epoch": 1.5488806672101336, "grad_norm": 3.4020934104919434, "learning_rate": 1.3622254350363184e-05, "loss": 1.3256, "step": 24700 }, { "epoch": 1.5488806672101336, "eval_loss": 1.23091721534729, "eval_runtime": 488.223, "eval_samples_per_second": 261.299, "eval_steps_per_second": 8.166, "step": 24700 }, { "epoch": 1.5551514391421584, "grad_norm": 172.33592224121094, "learning_rate": 1.358741660715219e-05, "loss": 0.8162, "step": 24800 }, { "epoch": 1.5551514391421584, "eval_loss": 1.213860273361206, "eval_runtime": 492.7571, "eval_samples_per_second": 258.894, "eval_steps_per_second": 8.091, "step": 24800 }, { "epoch": 1.5614222110741833, "grad_norm": 1.1643731594085693, "learning_rate": 1.3552578863941195e-05, "loss": 0.5741, "step": 24900 }, { "epoch": 1.5614222110741833, "eval_loss": 1.237512469291687, "eval_runtime": 506.8194, "eval_samples_per_second": 251.711, "eval_steps_per_second": 7.867, "step": 24900 }, { "epoch": 1.567692983006208, "grad_norm": 106.2492446899414, "learning_rate": 1.3517741120730201e-05, "loss": 0.883, "step": 25000 }, { "epoch": 1.567692983006208, "eval_loss": 1.203902244567871, "eval_runtime": 495.5543, "eval_samples_per_second": 257.433, "eval_steps_per_second": 8.046, "step": 25000 }, { "epoch": 1.5739637549382328, "grad_norm": 24.915504455566406, "learning_rate": 1.3482903377519206e-05, "loss": 1.1212, "step": 25100 }, { "epoch": 1.5739637549382328, "eval_loss": 1.1428111791610718, "eval_runtime": 489.9866, "eval_samples_per_second": 260.358, "eval_steps_per_second": 8.137, "step": 25100 }, { "epoch": 1.5802345268702578, "grad_norm": 0.43622246384620667, "learning_rate": 1.3448065634308212e-05, "loss": 0.8229, "step": 25200 }, { "epoch": 1.5802345268702578, "eval_loss": 1.2338348627090454, "eval_runtime": 488.67, "eval_samples_per_second": 261.06, "eval_steps_per_second": 8.159, "step": 25200 }, { "epoch": 1.5865052988022825, "grad_norm": 76.21497344970703, "learning_rate": 1.3413227891097215e-05, "loss": 0.8856, "step": 25300 }, { "epoch": 1.5865052988022825, "eval_loss": 1.146145224571228, "eval_runtime": 504.1995, "eval_samples_per_second": 253.019, "eval_steps_per_second": 7.908, "step": 25300 }, { "epoch": 1.5927760707343075, "grad_norm": 114.51611328125, "learning_rate": 1.337839014788622e-05, "loss": 1.2323, "step": 25400 }, { "epoch": 1.5927760707343075, "eval_loss": 1.1568622589111328, "eval_runtime": 492.991, "eval_samples_per_second": 258.771, "eval_steps_per_second": 8.087, "step": 25400 }, { "epoch": 1.5990468426663322, "grad_norm": 3.8696110248565674, "learning_rate": 1.3343552404675226e-05, "loss": 0.9724, "step": 25500 }, { "epoch": 1.5990468426663322, "eval_loss": 1.1549348831176758, "eval_runtime": 499.5621, "eval_samples_per_second": 255.368, "eval_steps_per_second": 7.981, "step": 25500 }, { "epoch": 1.605317614598357, "grad_norm": 1.6167796850204468, "learning_rate": 1.330871466146423e-05, "loss": 1.0791, "step": 25600 }, { "epoch": 1.605317614598357, "eval_loss": 1.1160709857940674, "eval_runtime": 484.6001, "eval_samples_per_second": 263.252, "eval_steps_per_second": 8.227, "step": 25600 }, { "epoch": 1.611588386530382, "grad_norm": 138.8144073486328, "learning_rate": 1.3273876918253237e-05, "loss": 0.9845, "step": 25700 }, { "epoch": 1.611588386530382, "eval_loss": 1.1060998439788818, "eval_runtime": 496.5423, "eval_samples_per_second": 256.921, "eval_steps_per_second": 8.03, "step": 25700 }, { "epoch": 1.6178591584624067, "grad_norm": 4.400548934936523, "learning_rate": 1.3239039175042242e-05, "loss": 1.1591, "step": 25800 }, { "epoch": 1.6178591584624067, "eval_loss": 1.110283613204956, "eval_runtime": 486.9154, "eval_samples_per_second": 262.0, "eval_steps_per_second": 8.188, "step": 25800 }, { "epoch": 1.6241299303944317, "grad_norm": 239.38189697265625, "learning_rate": 1.3204201431831248e-05, "loss": 1.116, "step": 25900 }, { "epoch": 1.6241299303944317, "eval_loss": 1.1404825448989868, "eval_runtime": 492.7605, "eval_samples_per_second": 258.892, "eval_steps_per_second": 8.091, "step": 25900 }, { "epoch": 1.6304007023264564, "grad_norm": 232.2500457763672, "learning_rate": 1.3169363688620251e-05, "loss": 1.2221, "step": 26000 }, { "epoch": 1.6304007023264564, "eval_loss": 1.1528397798538208, "eval_runtime": 487.3414, "eval_samples_per_second": 261.771, "eval_steps_per_second": 8.181, "step": 26000 }, { "epoch": 1.6366714742584811, "grad_norm": 5.894351959228516, "learning_rate": 1.3134525945409257e-05, "loss": 0.9085, "step": 26100 }, { "epoch": 1.6366714742584811, "eval_loss": 1.139626145362854, "eval_runtime": 480.0477, "eval_samples_per_second": 265.749, "eval_steps_per_second": 8.305, "step": 26100 }, { "epoch": 1.642942246190506, "grad_norm": 0.19382409751415253, "learning_rate": 1.3099688202198262e-05, "loss": 0.9543, "step": 26200 }, { "epoch": 1.642942246190506, "eval_loss": 1.195331335067749, "eval_runtime": 487.008, "eval_samples_per_second": 261.951, "eval_steps_per_second": 8.187, "step": 26200 }, { "epoch": 1.6492130181225308, "grad_norm": 240.2974090576172, "learning_rate": 1.3064850458987268e-05, "loss": 1.1855, "step": 26300 }, { "epoch": 1.6492130181225308, "eval_loss": 1.1792023181915283, "eval_runtime": 487.2539, "eval_samples_per_second": 261.818, "eval_steps_per_second": 8.183, "step": 26300 }, { "epoch": 1.6554837900545558, "grad_norm": 5.021773338317871, "learning_rate": 1.3030012715776273e-05, "loss": 1.0583, "step": 26400 }, { "epoch": 1.6554837900545558, "eval_loss": 1.1666100025177002, "eval_runtime": 490.5958, "eval_samples_per_second": 260.035, "eval_steps_per_second": 8.127, "step": 26400 }, { "epoch": 1.6617545619865806, "grad_norm": 0.47061604261398315, "learning_rate": 1.299517497256528e-05, "loss": 0.6583, "step": 26500 }, { "epoch": 1.6617545619865806, "eval_loss": 1.1151552200317383, "eval_runtime": 489.33, "eval_samples_per_second": 260.708, "eval_steps_per_second": 8.148, "step": 26500 }, { "epoch": 1.6680253339186053, "grad_norm": 0.7339816689491272, "learning_rate": 1.2960337229354282e-05, "loss": 1.3067, "step": 26600 }, { "epoch": 1.6680253339186053, "eval_loss": 1.0397262573242188, "eval_runtime": 490.7479, "eval_samples_per_second": 259.954, "eval_steps_per_second": 8.124, "step": 26600 }, { "epoch": 1.6742961058506303, "grad_norm": 0.43579697608947754, "learning_rate": 1.2925499486143289e-05, "loss": 1.5336, "step": 26700 }, { "epoch": 1.6742961058506303, "eval_loss": 1.1244205236434937, "eval_runtime": 504.0991, "eval_samples_per_second": 253.069, "eval_steps_per_second": 7.909, "step": 26700 }, { "epoch": 1.680566877782655, "grad_norm": 0.877700686454773, "learning_rate": 1.2890661742932293e-05, "loss": 0.614, "step": 26800 }, { "epoch": 1.680566877782655, "eval_loss": 1.1273393630981445, "eval_runtime": 490.8071, "eval_samples_per_second": 259.923, "eval_steps_per_second": 8.123, "step": 26800 }, { "epoch": 1.68683764971468, "grad_norm": 2.61261248588562, "learning_rate": 1.28558239997213e-05, "loss": 1.0336, "step": 26900 }, { "epoch": 1.68683764971468, "eval_loss": 1.067978024482727, "eval_runtime": 488.4991, "eval_samples_per_second": 261.151, "eval_steps_per_second": 8.162, "step": 26900 }, { "epoch": 1.6931084216467047, "grad_norm": 1.7996759414672852, "learning_rate": 1.2821334633942416e-05, "loss": 1.462, "step": 27000 }, { "epoch": 1.6931084216467047, "eval_loss": 1.0983270406723022, "eval_runtime": 497.9625, "eval_samples_per_second": 256.188, "eval_steps_per_second": 8.007, "step": 27000 }, { "epoch": 1.6993791935787295, "grad_norm": 0.4661722183227539, "learning_rate": 1.2786496890731419e-05, "loss": 0.8858, "step": 27100 }, { "epoch": 1.6993791935787295, "eval_loss": 1.0672377347946167, "eval_runtime": 488.5627, "eval_samples_per_second": 261.117, "eval_steps_per_second": 8.161, "step": 27100 }, { "epoch": 1.7056499655107544, "grad_norm": 131.8981475830078, "learning_rate": 1.2751659147520425e-05, "loss": 0.7494, "step": 27200 }, { "epoch": 1.7056499655107544, "eval_loss": 1.1623871326446533, "eval_runtime": 489.2152, "eval_samples_per_second": 260.769, "eval_steps_per_second": 8.15, "step": 27200 }, { "epoch": 1.7119207374427792, "grad_norm": 1.5505995750427246, "learning_rate": 1.271682140430943e-05, "loss": 0.8152, "step": 27300 }, { "epoch": 1.7119207374427792, "eval_loss": 1.0928338766098022, "eval_runtime": 485.3945, "eval_samples_per_second": 262.821, "eval_steps_per_second": 8.214, "step": 27300 }, { "epoch": 1.7181915093748041, "grad_norm": 0.11606509238481522, "learning_rate": 1.2681983661098436e-05, "loss": 0.7785, "step": 27400 }, { "epoch": 1.7181915093748041, "eval_loss": 1.0952435731887817, "eval_runtime": 490.1873, "eval_samples_per_second": 260.252, "eval_steps_per_second": 8.134, "step": 27400 }, { "epoch": 1.7244622813068289, "grad_norm": 60.00815963745117, "learning_rate": 1.264714591788744e-05, "loss": 1.0471, "step": 27500 }, { "epoch": 1.7244622813068289, "eval_loss": 1.0999162197113037, "eval_runtime": 472.8514, "eval_samples_per_second": 269.793, "eval_steps_per_second": 8.432, "step": 27500 }, { "epoch": 1.7307330532388536, "grad_norm": 0.18325106799602509, "learning_rate": 1.2612308174676447e-05, "loss": 1.0994, "step": 27600 }, { "epoch": 1.7307330532388536, "eval_loss": 0.9880152344703674, "eval_runtime": 489.527, "eval_samples_per_second": 260.603, "eval_steps_per_second": 8.145, "step": 27600 }, { "epoch": 1.7370038251708786, "grad_norm": 33.887603759765625, "learning_rate": 1.257747043146545e-05, "loss": 1.0706, "step": 27700 }, { "epoch": 1.7370038251708786, "eval_loss": 1.0416243076324463, "eval_runtime": 486.6381, "eval_samples_per_second": 262.15, "eval_steps_per_second": 8.193, "step": 27700 }, { "epoch": 1.7432745971029033, "grad_norm": 122.05184936523438, "learning_rate": 1.2542632688254456e-05, "loss": 1.1158, "step": 27800 }, { "epoch": 1.7432745971029033, "eval_loss": 1.0675890445709229, "eval_runtime": 488.3694, "eval_samples_per_second": 261.22, "eval_steps_per_second": 8.164, "step": 27800 }, { "epoch": 1.7495453690349283, "grad_norm": 3.5680992603302, "learning_rate": 1.2507794945043461e-05, "loss": 0.9893, "step": 27900 }, { "epoch": 1.7495453690349283, "eval_loss": 1.0288848876953125, "eval_runtime": 487.5059, "eval_samples_per_second": 261.683, "eval_steps_per_second": 8.178, "step": 27900 }, { "epoch": 1.755816140966953, "grad_norm": 0.61468905210495, "learning_rate": 1.2472957201832467e-05, "loss": 1.2939, "step": 28000 }, { "epoch": 1.755816140966953, "eval_loss": 1.0149768590927124, "eval_runtime": 496.1264, "eval_samples_per_second": 257.136, "eval_steps_per_second": 8.036, "step": 28000 }, { "epoch": 1.7620869128989778, "grad_norm": 0.23548483848571777, "learning_rate": 1.2438119458621472e-05, "loss": 0.9543, "step": 28100 }, { "epoch": 1.7620869128989778, "eval_loss": 1.076741099357605, "eval_runtime": 494.571, "eval_samples_per_second": 257.945, "eval_steps_per_second": 8.062, "step": 28100 }, { "epoch": 1.7683576848310028, "grad_norm": 0.04505012556910515, "learning_rate": 1.2403281715410475e-05, "loss": 0.7907, "step": 28200 }, { "epoch": 1.7683576848310028, "eval_loss": 1.071725845336914, "eval_runtime": 498.1358, "eval_samples_per_second": 256.099, "eval_steps_per_second": 8.004, "step": 28200 }, { "epoch": 1.7746284567630275, "grad_norm": 0.3665514886379242, "learning_rate": 1.2368443972199481e-05, "loss": 0.92, "step": 28300 }, { "epoch": 1.7746284567630275, "eval_loss": 1.1132545471191406, "eval_runtime": 494.9621, "eval_samples_per_second": 257.741, "eval_steps_per_second": 8.055, "step": 28300 }, { "epoch": 1.7808992286950525, "grad_norm": 2.6903622150421143, "learning_rate": 1.2333606228988486e-05, "loss": 0.8636, "step": 28400 }, { "epoch": 1.7808992286950525, "eval_loss": 1.070193886756897, "eval_runtime": 487.101, "eval_samples_per_second": 261.901, "eval_steps_per_second": 8.185, "step": 28400 }, { "epoch": 1.7871700006270772, "grad_norm": 246.5596923828125, "learning_rate": 1.2298768485777492e-05, "loss": 0.9118, "step": 28500 }, { "epoch": 1.7871700006270772, "eval_loss": 1.0536377429962158, "eval_runtime": 500.9429, "eval_samples_per_second": 254.664, "eval_steps_per_second": 7.959, "step": 28500 }, { "epoch": 1.793440772559102, "grad_norm": 15.87330150604248, "learning_rate": 1.2263930742566497e-05, "loss": 1.2643, "step": 28600 }, { "epoch": 1.793440772559102, "eval_loss": 1.135445237159729, "eval_runtime": 491.8209, "eval_samples_per_second": 259.387, "eval_steps_per_second": 8.107, "step": 28600 }, { "epoch": 1.7997115444911267, "grad_norm": 0.04285774007439613, "learning_rate": 1.2229092999355503e-05, "loss": 0.8284, "step": 28700 }, { "epoch": 1.7997115444911267, "eval_loss": 1.0714679956436157, "eval_runtime": 491.3195, "eval_samples_per_second": 259.652, "eval_steps_per_second": 8.115, "step": 28700 }, { "epoch": 1.8059823164231517, "grad_norm": 50.862327575683594, "learning_rate": 1.2194255256144508e-05, "loss": 0.8447, "step": 28800 }, { "epoch": 1.8059823164231517, "eval_loss": 1.0457782745361328, "eval_runtime": 497.4392, "eval_samples_per_second": 256.457, "eval_steps_per_second": 8.015, "step": 28800 }, { "epoch": 1.8122530883551766, "grad_norm": 1.507433295249939, "learning_rate": 1.2159417512933514e-05, "loss": 1.2102, "step": 28900 }, { "epoch": 1.8122530883551766, "eval_loss": 1.1000713109970093, "eval_runtime": 492.3678, "eval_samples_per_second": 259.099, "eval_steps_per_second": 8.098, "step": 28900 }, { "epoch": 1.8185238602872014, "grad_norm": 182.16946411132812, "learning_rate": 1.2124579769722517e-05, "loss": 1.1042, "step": 29000 }, { "epoch": 1.8185238602872014, "eval_loss": 1.0364127159118652, "eval_runtime": 493.1395, "eval_samples_per_second": 258.694, "eval_steps_per_second": 8.085, "step": 29000 }, { "epoch": 1.824794632219226, "grad_norm": 84.75048065185547, "learning_rate": 1.2089742026511523e-05, "loss": 0.9638, "step": 29100 }, { "epoch": 1.824794632219226, "eval_loss": 1.0946918725967407, "eval_runtime": 494.4031, "eval_samples_per_second": 258.032, "eval_steps_per_second": 8.064, "step": 29100 }, { "epoch": 1.8310654041512509, "grad_norm": 0.5844135284423828, "learning_rate": 1.2054904283300528e-05, "loss": 0.6847, "step": 29200 }, { "epoch": 1.8310654041512509, "eval_loss": 1.0311741828918457, "eval_runtime": 481.6292, "eval_samples_per_second": 264.876, "eval_steps_per_second": 8.278, "step": 29200 }, { "epoch": 1.8373361760832758, "grad_norm": 21.12558364868164, "learning_rate": 1.2020066540089534e-05, "loss": 1.7671, "step": 29300 }, { "epoch": 1.8373361760832758, "eval_loss": 1.0470467805862427, "eval_runtime": 494.9594, "eval_samples_per_second": 257.742, "eval_steps_per_second": 8.055, "step": 29300 }, { "epoch": 1.8436069480153008, "grad_norm": 7.0535407066345215, "learning_rate": 1.1985228796878539e-05, "loss": 0.7525, "step": 29400 }, { "epoch": 1.8436069480153008, "eval_loss": 1.1158130168914795, "eval_runtime": 492.3408, "eval_samples_per_second": 259.113, "eval_steps_per_second": 8.098, "step": 29400 }, { "epoch": 1.8498777199473255, "grad_norm": 0.11249526590108871, "learning_rate": 1.1950391053667545e-05, "loss": 1.2843, "step": 29500 }, { "epoch": 1.8498777199473255, "eval_loss": 1.0139508247375488, "eval_runtime": 483.0492, "eval_samples_per_second": 264.097, "eval_steps_per_second": 8.254, "step": 29500 }, { "epoch": 1.8561484918793503, "grad_norm": 72.12831115722656, "learning_rate": 1.191590168788866e-05, "loss": 0.6844, "step": 29600 }, { "epoch": 1.8561484918793503, "eval_loss": 1.1603798866271973, "eval_runtime": 491.4897, "eval_samples_per_second": 259.562, "eval_steps_per_second": 8.112, "step": 29600 }, { "epoch": 1.862419263811375, "grad_norm": 21.705537796020508, "learning_rate": 1.1881063944677665e-05, "loss": 1.2824, "step": 29700 }, { "epoch": 1.862419263811375, "eval_loss": 1.0052319765090942, "eval_runtime": 487.7473, "eval_samples_per_second": 261.553, "eval_steps_per_second": 8.174, "step": 29700 }, { "epoch": 1.8686900357434, "grad_norm": 1.3453004360198975, "learning_rate": 1.1846226201466671e-05, "loss": 1.314, "step": 29800 }, { "epoch": 1.8686900357434, "eval_loss": 1.0322686433792114, "eval_runtime": 480.0979, "eval_samples_per_second": 265.721, "eval_steps_per_second": 8.305, "step": 29800 }, { "epoch": 1.874960807675425, "grad_norm": 5.6963677406311035, "learning_rate": 1.1811388458255676e-05, "loss": 1.0796, "step": 29900 }, { "epoch": 1.874960807675425, "eval_loss": 1.0885429382324219, "eval_runtime": 483.9557, "eval_samples_per_second": 263.603, "eval_steps_per_second": 8.238, "step": 29900 }, { "epoch": 1.8812315796074497, "grad_norm": 0.3642306923866272, "learning_rate": 1.1776550715044682e-05, "loss": 1.0012, "step": 30000 }, { "epoch": 1.8812315796074497, "eval_loss": 1.0266896486282349, "eval_runtime": 498.6153, "eval_samples_per_second": 255.853, "eval_steps_per_second": 7.996, "step": 30000 }, { "epoch": 1.8875023515394744, "grad_norm": 45.68118667602539, "learning_rate": 1.1741712971833685e-05, "loss": 1.4932, "step": 30100 }, { "epoch": 1.8875023515394744, "eval_loss": 1.0438352823257446, "eval_runtime": 479.3134, "eval_samples_per_second": 266.156, "eval_steps_per_second": 8.318, "step": 30100 }, { "epoch": 1.8937731234714992, "grad_norm": 159.10227966308594, "learning_rate": 1.1706875228622691e-05, "loss": 1.0404, "step": 30200 }, { "epoch": 1.8937731234714992, "eval_loss": 1.0162733793258667, "eval_runtime": 484.5198, "eval_samples_per_second": 263.296, "eval_steps_per_second": 8.229, "step": 30200 }, { "epoch": 1.9000438954035241, "grad_norm": 9.165184020996094, "learning_rate": 1.1672037485411696e-05, "loss": 0.614, "step": 30300 }, { "epoch": 1.9000438954035241, "eval_loss": 1.0366989374160767, "eval_runtime": 494.4949, "eval_samples_per_second": 257.984, "eval_steps_per_second": 8.063, "step": 30300 }, { "epoch": 1.906314667335549, "grad_norm": 93.2901840209961, "learning_rate": 1.1637199742200702e-05, "loss": 1.2676, "step": 30400 }, { "epoch": 1.906314667335549, "eval_loss": 1.080250859260559, "eval_runtime": 506.0169, "eval_samples_per_second": 252.11, "eval_steps_per_second": 7.879, "step": 30400 }, { "epoch": 1.9125854392675739, "grad_norm": 22.93528938293457, "learning_rate": 1.1602361998989707e-05, "loss": 1.2431, "step": 30500 }, { "epoch": 1.9125854392675739, "eval_loss": 1.042752742767334, "eval_runtime": 482.2307, "eval_samples_per_second": 264.546, "eval_steps_per_second": 8.268, "step": 30500 }, { "epoch": 1.9188562111995986, "grad_norm": 44.19611358642578, "learning_rate": 1.1567524255778713e-05, "loss": 1.4063, "step": 30600 }, { "epoch": 1.9188562111995986, "eval_loss": 1.0318702459335327, "eval_runtime": 482.0351, "eval_samples_per_second": 264.653, "eval_steps_per_second": 8.271, "step": 30600 }, { "epoch": 1.9251269831316233, "grad_norm": 0.21961411833763123, "learning_rate": 1.1532686512567716e-05, "loss": 0.7787, "step": 30700 }, { "epoch": 1.9251269831316233, "eval_loss": 0.9666246175765991, "eval_runtime": 497.4003, "eval_samples_per_second": 256.478, "eval_steps_per_second": 8.016, "step": 30700 }, { "epoch": 1.9313977550636483, "grad_norm": 5.579217910766602, "learning_rate": 1.1497848769356722e-05, "loss": 1.0311, "step": 30800 }, { "epoch": 1.9313977550636483, "eval_loss": 1.0375796556472778, "eval_runtime": 496.1027, "eval_samples_per_second": 257.148, "eval_steps_per_second": 8.037, "step": 30800 }, { "epoch": 1.9376685269956733, "grad_norm": 0.01572820357978344, "learning_rate": 1.1463011026145727e-05, "loss": 1.0353, "step": 30900 }, { "epoch": 1.9376685269956733, "eval_loss": 0.9868729114532471, "eval_runtime": 491.6277, "eval_samples_per_second": 259.489, "eval_steps_per_second": 8.11, "step": 30900 }, { "epoch": 1.943939298927698, "grad_norm": 1.0484445095062256, "learning_rate": 1.1428173282934732e-05, "loss": 1.2221, "step": 31000 }, { "epoch": 1.943939298927698, "eval_loss": 0.968561589717865, "eval_runtime": 499.708, "eval_samples_per_second": 255.293, "eval_steps_per_second": 7.979, "step": 31000 }, { "epoch": 1.9502100708597228, "grad_norm": 123.73536682128906, "learning_rate": 1.1393335539723738e-05, "loss": 0.5806, "step": 31100 }, { "epoch": 1.9502100708597228, "eval_loss": 0.9662685394287109, "eval_runtime": 496.0179, "eval_samples_per_second": 257.192, "eval_steps_per_second": 8.038, "step": 31100 }, { "epoch": 1.9564808427917475, "grad_norm": 265.9390869140625, "learning_rate": 1.1358497796512741e-05, "loss": 0.6919, "step": 31200 }, { "epoch": 1.9564808427917475, "eval_loss": 0.9837759733200073, "eval_runtime": 481.0273, "eval_samples_per_second": 265.207, "eval_steps_per_second": 8.289, "step": 31200 }, { "epoch": 1.9627516147237725, "grad_norm": 1.0015980005264282, "learning_rate": 1.1323660053301749e-05, "loss": 0.8028, "step": 31300 }, { "epoch": 1.9627516147237725, "eval_loss": 0.9759084582328796, "eval_runtime": 487.9887, "eval_samples_per_second": 261.424, "eval_steps_per_second": 8.17, "step": 31300 }, { "epoch": 1.9690223866557974, "grad_norm": 31.675607681274414, "learning_rate": 1.1288822310090752e-05, "loss": 0.8365, "step": 31400 }, { "epoch": 1.9690223866557974, "eval_loss": 0.9640862345695496, "eval_runtime": 496.5309, "eval_samples_per_second": 256.927, "eval_steps_per_second": 8.03, "step": 31400 }, { "epoch": 1.9752931585878222, "grad_norm": 1.1243913173675537, "learning_rate": 1.1253984566879758e-05, "loss": 0.7518, "step": 31500 }, { "epoch": 1.9752931585878222, "eval_loss": 1.008094310760498, "eval_runtime": 499.5695, "eval_samples_per_second": 255.364, "eval_steps_per_second": 7.981, "step": 31500 }, { "epoch": 1.981563930519847, "grad_norm": 216.04434204101562, "learning_rate": 1.1219495201100875e-05, "loss": 1.0654, "step": 31600 }, { "epoch": 1.981563930519847, "eval_loss": 0.9843435287475586, "eval_runtime": 480.8256, "eval_samples_per_second": 265.319, "eval_steps_per_second": 8.292, "step": 31600 }, { "epoch": 1.9878347024518717, "grad_norm": 0.3936084806919098, "learning_rate": 1.1184657457889878e-05, "loss": 0.8637, "step": 31700 }, { "epoch": 1.9878347024518717, "eval_loss": 0.963536262512207, "eval_runtime": 481.4757, "eval_samples_per_second": 264.96, "eval_steps_per_second": 8.281, "step": 31700 }, { "epoch": 1.9941054743838966, "grad_norm": 8.97900104522705, "learning_rate": 1.1149819714678884e-05, "loss": 0.8663, "step": 31800 }, { "epoch": 1.9941054743838966, "eval_loss": 0.9537881016731262, "eval_runtime": 488.2812, "eval_samples_per_second": 261.268, "eval_steps_per_second": 8.165, "step": 31800 }, { "epoch": 2.0003762463159216, "grad_norm": 0.23352281749248505, "learning_rate": 1.1114981971467889e-05, "loss": 0.8524, "step": 31900 }, { "epoch": 2.0003762463159216, "eval_loss": 0.9627546072006226, "eval_runtime": 476.3461, "eval_samples_per_second": 267.814, "eval_steps_per_second": 8.37, "step": 31900 }, { "epoch": 2.0066470182479463, "grad_norm": 10.038532257080078, "learning_rate": 1.1080144228256895e-05, "loss": 1.2748, "step": 32000 }, { "epoch": 2.0066470182479463, "eval_loss": 0.9381898641586304, "eval_runtime": 483.6522, "eval_samples_per_second": 263.768, "eval_steps_per_second": 8.244, "step": 32000 }, { "epoch": 2.012917790179971, "grad_norm": 3.102550745010376, "learning_rate": 1.10453064850459e-05, "loss": 0.8138, "step": 32100 }, { "epoch": 2.012917790179971, "eval_loss": 0.9460862874984741, "eval_runtime": 486.1122, "eval_samples_per_second": 262.433, "eval_steps_per_second": 8.202, "step": 32100 }, { "epoch": 2.019188562111996, "grad_norm": 5.872899532318115, "learning_rate": 1.1010468741834906e-05, "loss": 0.4484, "step": 32200 }, { "epoch": 2.019188562111996, "eval_loss": 0.9221316576004028, "eval_runtime": 489.5035, "eval_samples_per_second": 260.615, "eval_steps_per_second": 8.145, "step": 32200 }, { "epoch": 2.025459334044021, "grad_norm": 70.84674072265625, "learning_rate": 1.0975630998623909e-05, "loss": 0.8839, "step": 32300 }, { "epoch": 2.025459334044021, "eval_loss": 0.9566515684127808, "eval_runtime": 497.3551, "eval_samples_per_second": 256.501, "eval_steps_per_second": 8.016, "step": 32300 }, { "epoch": 2.0317301059760458, "grad_norm": 20.528474807739258, "learning_rate": 1.0940793255412915e-05, "loss": 0.7599, "step": 32400 }, { "epoch": 2.0317301059760458, "eval_loss": 0.9439575672149658, "eval_runtime": 475.9709, "eval_samples_per_second": 268.025, "eval_steps_per_second": 8.377, "step": 32400 }, { "epoch": 2.0380008779080705, "grad_norm": 0.2569330930709839, "learning_rate": 1.090595551220192e-05, "loss": 0.8665, "step": 32500 }, { "epoch": 2.0380008779080705, "eval_loss": 0.9651756882667542, "eval_runtime": 476.2761, "eval_samples_per_second": 267.853, "eval_steps_per_second": 8.371, "step": 32500 }, { "epoch": 2.0442716498400952, "grad_norm": 160.0611572265625, "learning_rate": 1.0871117768990926e-05, "loss": 0.5802, "step": 32600 }, { "epoch": 2.0442716498400952, "eval_loss": 0.9474946856498718, "eval_runtime": 498.1214, "eval_samples_per_second": 256.106, "eval_steps_per_second": 8.004, "step": 32600 }, { "epoch": 2.05054242177212, "grad_norm": 13.137542724609375, "learning_rate": 1.083628002577993e-05, "loss": 0.7731, "step": 32700 }, { "epoch": 2.05054242177212, "eval_loss": 0.9197245240211487, "eval_runtime": 471.6865, "eval_samples_per_second": 270.459, "eval_steps_per_second": 8.453, "step": 32700 }, { "epoch": 2.056813193704145, "grad_norm": 4.745016574859619, "learning_rate": 1.0801442282568937e-05, "loss": 0.7913, "step": 32800 }, { "epoch": 2.056813193704145, "eval_loss": 1.002418875694275, "eval_runtime": 444.3682, "eval_samples_per_second": 287.086, "eval_steps_per_second": 8.972, "step": 32800 }, { "epoch": 2.06308396563617, "grad_norm": 273.15252685546875, "learning_rate": 1.0766604539357942e-05, "loss": 0.7758, "step": 32900 }, { "epoch": 2.06308396563617, "eval_loss": 0.9257067441940308, "eval_runtime": 479.6839, "eval_samples_per_second": 265.95, "eval_steps_per_second": 8.312, "step": 32900 }, { "epoch": 2.0693547375681947, "grad_norm": 0.2749234437942505, "learning_rate": 1.0731766796146948e-05, "loss": 0.7468, "step": 33000 }, { "epoch": 2.0693547375681947, "eval_loss": 0.9662745594978333, "eval_runtime": 482.8123, "eval_samples_per_second": 264.227, "eval_steps_per_second": 8.258, "step": 33000 }, { "epoch": 2.0756255095002194, "grad_norm": 2.7121362686157227, "learning_rate": 1.0696929052935951e-05, "loss": 0.9947, "step": 33100 }, { "epoch": 2.0756255095002194, "eval_loss": 0.9788134098052979, "eval_runtime": 488.227, "eval_samples_per_second": 261.296, "eval_steps_per_second": 8.166, "step": 33100 }, { "epoch": 2.081896281432244, "grad_norm": 0.2543056905269623, "learning_rate": 1.0662091309724957e-05, "loss": 0.5618, "step": 33200 }, { "epoch": 2.081896281432244, "eval_loss": 0.948021650314331, "eval_runtime": 491.5864, "eval_samples_per_second": 259.511, "eval_steps_per_second": 8.11, "step": 33200 }, { "epoch": 2.0881670533642693, "grad_norm": 0.034537989646196365, "learning_rate": 1.0627253566513962e-05, "loss": 0.8805, "step": 33300 }, { "epoch": 2.0881670533642693, "eval_loss": 0.9520492553710938, "eval_runtime": 482.9571, "eval_samples_per_second": 264.148, "eval_steps_per_second": 8.255, "step": 33300 }, { "epoch": 2.094437825296294, "grad_norm": 4.662662982940674, "learning_rate": 1.0592415823302968e-05, "loss": 0.9755, "step": 33400 }, { "epoch": 2.094437825296294, "eval_loss": 0.9288346767425537, "eval_runtime": 495.4516, "eval_samples_per_second": 257.486, "eval_steps_per_second": 8.047, "step": 33400 }, { "epoch": 2.100708597228319, "grad_norm": 64.40668487548828, "learning_rate": 1.0557578080091973e-05, "loss": 0.8942, "step": 33500 }, { "epoch": 2.100708597228319, "eval_loss": 0.9233998656272888, "eval_runtime": 486.5273, "eval_samples_per_second": 262.209, "eval_steps_per_second": 8.195, "step": 33500 }, { "epoch": 2.1069793691603436, "grad_norm": 2.1412320137023926, "learning_rate": 1.0522740336880976e-05, "loss": 0.7242, "step": 33600 }, { "epoch": 2.1069793691603436, "eval_loss": 0.9412585496902466, "eval_runtime": 481.4435, "eval_samples_per_second": 264.978, "eval_steps_per_second": 8.281, "step": 33600 }, { "epoch": 2.1132501410923683, "grad_norm": 5.01767635345459, "learning_rate": 1.0487902593669982e-05, "loss": 0.6231, "step": 33700 }, { "epoch": 2.1132501410923683, "eval_loss": 0.9660213589668274, "eval_runtime": 480.6062, "eval_samples_per_second": 265.44, "eval_steps_per_second": 8.296, "step": 33700 }, { "epoch": 2.1195209130243935, "grad_norm": 0.02841496281325817, "learning_rate": 1.0453064850458987e-05, "loss": 0.7144, "step": 33800 }, { "epoch": 2.1195209130243935, "eval_loss": 0.8900822997093201, "eval_runtime": 506.9048, "eval_samples_per_second": 251.669, "eval_steps_per_second": 7.865, "step": 33800 }, { "epoch": 2.1257916849564182, "grad_norm": 14.184029579162598, "learning_rate": 1.0418227107247993e-05, "loss": 0.7139, "step": 33900 }, { "epoch": 2.1257916849564182, "eval_loss": 0.9535605907440186, "eval_runtime": 467.6722, "eval_samples_per_second": 272.781, "eval_steps_per_second": 8.525, "step": 33900 }, { "epoch": 2.132062456888443, "grad_norm": 247.51730346679688, "learning_rate": 1.0383389364036998e-05, "loss": 0.6378, "step": 34000 }, { "epoch": 2.132062456888443, "eval_loss": 0.9369811415672302, "eval_runtime": 467.3096, "eval_samples_per_second": 272.992, "eval_steps_per_second": 8.532, "step": 34000 }, { "epoch": 2.1383332288204677, "grad_norm": 0.265493243932724, "learning_rate": 1.0348551620826004e-05, "loss": 0.7607, "step": 34100 }, { "epoch": 2.1383332288204677, "eval_loss": 0.9209387898445129, "eval_runtime": 460.3387, "eval_samples_per_second": 277.126, "eval_steps_per_second": 8.661, "step": 34100 }, { "epoch": 2.1446040007524925, "grad_norm": 4.44495153427124, "learning_rate": 1.0313713877615009e-05, "loss": 0.8667, "step": 34200 }, { "epoch": 2.1446040007524925, "eval_loss": 0.9734475016593933, "eval_runtime": 472.3123, "eval_samples_per_second": 270.101, "eval_steps_per_second": 8.441, "step": 34200 }, { "epoch": 2.1508747726845177, "grad_norm": 1.1490778923034668, "learning_rate": 1.0278876134404015e-05, "loss": 0.8533, "step": 34300 }, { "epoch": 2.1508747726845177, "eval_loss": 0.9177405834197998, "eval_runtime": 481.8576, "eval_samples_per_second": 264.75, "eval_steps_per_second": 8.274, "step": 34300 }, { "epoch": 2.1571455446165424, "grad_norm": 6.377614498138428, "learning_rate": 1.0244038391193018e-05, "loss": 0.6395, "step": 34400 }, { "epoch": 2.1571455446165424, "eval_loss": 0.9285467863082886, "eval_runtime": 491.7764, "eval_samples_per_second": 259.411, "eval_steps_per_second": 8.107, "step": 34400 }, { "epoch": 2.163416316548567, "grad_norm": 63.10408401489258, "learning_rate": 1.0209200647982025e-05, "loss": 0.7377, "step": 34500 }, { "epoch": 2.163416316548567, "eval_loss": 0.9046958088874817, "eval_runtime": 472.1262, "eval_samples_per_second": 270.207, "eval_steps_per_second": 8.445, "step": 34500 }, { "epoch": 2.169687088480592, "grad_norm": 0.07853188365697861, "learning_rate": 1.017471128220314e-05, "loss": 0.7787, "step": 34600 }, { "epoch": 2.169687088480592, "eval_loss": 0.9967793822288513, "eval_runtime": 488.6457, "eval_samples_per_second": 261.073, "eval_steps_per_second": 8.159, "step": 34600 }, { "epoch": 2.1759578604126166, "grad_norm": 44.51852035522461, "learning_rate": 1.0139873538992144e-05, "loss": 0.6561, "step": 34700 }, { "epoch": 2.1759578604126166, "eval_loss": 0.9653065800666809, "eval_runtime": 480.7646, "eval_samples_per_second": 265.352, "eval_steps_per_second": 8.293, "step": 34700 }, { "epoch": 2.182228632344642, "grad_norm": 37.319366455078125, "learning_rate": 1.010503579578115e-05, "loss": 0.6169, "step": 34800 }, { "epoch": 2.182228632344642, "eval_loss": 0.9403988122940063, "eval_runtime": 467.6154, "eval_samples_per_second": 272.814, "eval_steps_per_second": 8.526, "step": 34800 }, { "epoch": 2.1884994042766666, "grad_norm": 0.24766607582569122, "learning_rate": 1.0070198052570155e-05, "loss": 0.7643, "step": 34900 }, { "epoch": 2.1884994042766666, "eval_loss": 0.9397174715995789, "eval_runtime": 492.9746, "eval_samples_per_second": 258.78, "eval_steps_per_second": 8.088, "step": 34900 }, { "epoch": 2.1947701762086913, "grad_norm": 1.6579983234405518, "learning_rate": 1.0035360309359161e-05, "loss": 0.998, "step": 35000 }, { "epoch": 2.1947701762086913, "eval_loss": 0.9152400493621826, "eval_runtime": 488.9266, "eval_samples_per_second": 260.923, "eval_steps_per_second": 8.155, "step": 35000 }, { "epoch": 2.201040948140716, "grad_norm": 14.633705139160156, "learning_rate": 1.0000522566148166e-05, "loss": 0.8246, "step": 35100 }, { "epoch": 2.201040948140716, "eval_loss": 0.9512937068939209, "eval_runtime": 497.3249, "eval_samples_per_second": 256.516, "eval_steps_per_second": 8.017, "step": 35100 }, { "epoch": 2.207311720072741, "grad_norm": 162.75132751464844, "learning_rate": 9.96568482293717e-06, "loss": 0.6655, "step": 35200 }, { "epoch": 2.207311720072741, "eval_loss": 0.9354454278945923, "eval_runtime": 479.342, "eval_samples_per_second": 266.14, "eval_steps_per_second": 8.318, "step": 35200 }, { "epoch": 2.213582492004766, "grad_norm": 0.0890607163310051, "learning_rate": 9.930847079726175e-06, "loss": 0.9279, "step": 35300 }, { "epoch": 2.213582492004766, "eval_loss": 0.9034134745597839, "eval_runtime": 495.8444, "eval_samples_per_second": 257.282, "eval_steps_per_second": 8.041, "step": 35300 }, { "epoch": 2.2198532639367907, "grad_norm": 12.482114791870117, "learning_rate": 9.896009336515181e-06, "loss": 0.4239, "step": 35400 }, { "epoch": 2.2198532639367907, "eval_loss": 0.9606735706329346, "eval_runtime": 472.1675, "eval_samples_per_second": 270.184, "eval_steps_per_second": 8.444, "step": 35400 }, { "epoch": 2.2261240358688155, "grad_norm": 193.45916748046875, "learning_rate": 9.861171593304186e-06, "loss": 1.0023, "step": 35500 }, { "epoch": 2.2261240358688155, "eval_loss": 0.8731982707977295, "eval_runtime": 502.841, "eval_samples_per_second": 253.702, "eval_steps_per_second": 7.929, "step": 35500 }, { "epoch": 2.23239480780084, "grad_norm": 25.368621826171875, "learning_rate": 9.826333850093192e-06, "loss": 0.7426, "step": 35600 }, { "epoch": 2.23239480780084, "eval_loss": 0.8882994651794434, "eval_runtime": 489.5037, "eval_samples_per_second": 260.615, "eval_steps_per_second": 8.145, "step": 35600 }, { "epoch": 2.238665579732865, "grad_norm": 6.321267127990723, "learning_rate": 9.791496106882197e-06, "loss": 0.8675, "step": 35700 }, { "epoch": 2.238665579732865, "eval_loss": 0.9296298027038574, "eval_runtime": 481.2287, "eval_samples_per_second": 265.096, "eval_steps_per_second": 8.285, "step": 35700 }, { "epoch": 2.24493635166489, "grad_norm": 0.16120706498622894, "learning_rate": 9.756658363671202e-06, "loss": 0.9226, "step": 35800 }, { "epoch": 2.24493635166489, "eval_loss": 0.9145704507827759, "eval_runtime": 507.1996, "eval_samples_per_second": 251.522, "eval_steps_per_second": 7.861, "step": 35800 }, { "epoch": 2.251207123596915, "grad_norm": 14.761024475097656, "learning_rate": 9.721820620460208e-06, "loss": 0.4944, "step": 35900 }, { "epoch": 2.251207123596915, "eval_loss": 0.9145201444625854, "eval_runtime": 480.7973, "eval_samples_per_second": 265.334, "eval_steps_per_second": 8.292, "step": 35900 }, { "epoch": 2.2574778955289396, "grad_norm": 15.988486289978027, "learning_rate": 9.686982877249213e-06, "loss": 0.9663, "step": 36000 }, { "epoch": 2.2574778955289396, "eval_loss": 0.8893073201179504, "eval_runtime": 466.9887, "eval_samples_per_second": 273.18, "eval_steps_per_second": 8.538, "step": 36000 }, { "epoch": 2.2637486674609644, "grad_norm": 8.232684135437012, "learning_rate": 9.652493511470327e-06, "loss": 0.6455, "step": 36100 }, { "epoch": 2.2637486674609644, "eval_loss": 0.9238069653511047, "eval_runtime": 471.8533, "eval_samples_per_second": 270.364, "eval_steps_per_second": 8.45, "step": 36100 }, { "epoch": 2.270019439392989, "grad_norm": 0.20196978747844696, "learning_rate": 9.617655768259333e-06, "loss": 0.9673, "step": 36200 }, { "epoch": 2.270019439392989, "eval_loss": 0.8942546248435974, "eval_runtime": 497.8096, "eval_samples_per_second": 256.267, "eval_steps_per_second": 8.009, "step": 36200 }, { "epoch": 2.2762902113250143, "grad_norm": 180.26956176757812, "learning_rate": 9.582818025048338e-06, "loss": 0.7974, "step": 36300 }, { "epoch": 2.2762902113250143, "eval_loss": 0.8620045185089111, "eval_runtime": 505.8787, "eval_samples_per_second": 252.179, "eval_steps_per_second": 7.881, "step": 36300 }, { "epoch": 2.282560983257039, "grad_norm": 118.34184265136719, "learning_rate": 9.547980281837343e-06, "loss": 0.9777, "step": 36400 }, { "epoch": 2.282560983257039, "eval_loss": 0.8812283873558044, "eval_runtime": 484.5263, "eval_samples_per_second": 263.292, "eval_steps_per_second": 8.229, "step": 36400 }, { "epoch": 2.288831755189064, "grad_norm": 1.6557927131652832, "learning_rate": 9.513142538626349e-06, "loss": 0.8741, "step": 36500 }, { "epoch": 2.288831755189064, "eval_loss": 0.8862267732620239, "eval_runtime": 499.5387, "eval_samples_per_second": 255.38, "eval_steps_per_second": 7.981, "step": 36500 }, { "epoch": 2.2951025271210885, "grad_norm": 0.09699351340532303, "learning_rate": 9.478304795415354e-06, "loss": 0.9642, "step": 36600 }, { "epoch": 2.2951025271210885, "eval_loss": 0.9157158732414246, "eval_runtime": 473.3855, "eval_samples_per_second": 269.489, "eval_steps_per_second": 8.422, "step": 36600 }, { "epoch": 2.3013732990531133, "grad_norm": 166.3496551513672, "learning_rate": 9.44346705220436e-06, "loss": 0.9225, "step": 36700 }, { "epoch": 2.3013732990531133, "eval_loss": 0.8784195780754089, "eval_runtime": 480.9986, "eval_samples_per_second": 265.223, "eval_steps_per_second": 8.289, "step": 36700 }, { "epoch": 2.3076440709851385, "grad_norm": 3.2308545112609863, "learning_rate": 9.408629308993365e-06, "loss": 0.6789, "step": 36800 }, { "epoch": 2.3076440709851385, "eval_loss": 0.9065931439399719, "eval_runtime": 486.2861, "eval_samples_per_second": 262.339, "eval_steps_per_second": 8.199, "step": 36800 }, { "epoch": 2.313914842917163, "grad_norm": 117.87212371826172, "learning_rate": 9.37379156578237e-06, "loss": 0.6726, "step": 36900 }, { "epoch": 2.313914842917163, "eval_loss": 0.9090869426727295, "eval_runtime": 479.1369, "eval_samples_per_second": 266.254, "eval_steps_per_second": 8.321, "step": 36900 }, { "epoch": 2.320185614849188, "grad_norm": 64.78949737548828, "learning_rate": 9.338953822571376e-06, "loss": 0.7326, "step": 37000 }, { "epoch": 2.320185614849188, "eval_loss": 0.9202622175216675, "eval_runtime": 484.0573, "eval_samples_per_second": 263.547, "eval_steps_per_second": 8.237, "step": 37000 }, { "epoch": 2.3264563867812127, "grad_norm": 259.2130126953125, "learning_rate": 9.30411607936038e-06, "loss": 1.007, "step": 37100 }, { "epoch": 2.3264563867812127, "eval_loss": 0.9124699234962463, "eval_runtime": 494.1141, "eval_samples_per_second": 258.183, "eval_steps_per_second": 8.069, "step": 37100 }, { "epoch": 2.3327271587132374, "grad_norm": 3.8969433307647705, "learning_rate": 9.269278336149385e-06, "loss": 0.6134, "step": 37200 }, { "epoch": 2.3327271587132374, "eval_loss": 0.8837085366249084, "eval_runtime": 473.1604, "eval_samples_per_second": 269.617, "eval_steps_per_second": 8.426, "step": 37200 }, { "epoch": 2.3389979306452626, "grad_norm": 0.8037031292915344, "learning_rate": 9.234440592938391e-06, "loss": 0.9051, "step": 37300 }, { "epoch": 2.3389979306452626, "eval_loss": 0.8945268392562866, "eval_runtime": 488.4927, "eval_samples_per_second": 261.154, "eval_steps_per_second": 8.162, "step": 37300 }, { "epoch": 2.3452687025772874, "grad_norm": 70.98564910888672, "learning_rate": 9.199602849727396e-06, "loss": 0.837, "step": 37400 }, { "epoch": 2.3452687025772874, "eval_loss": 0.8740183711051941, "eval_runtime": 492.9522, "eval_samples_per_second": 258.792, "eval_steps_per_second": 8.088, "step": 37400 }, { "epoch": 2.351539474509312, "grad_norm": 1.006698489189148, "learning_rate": 9.1647651065164e-06, "loss": 0.7615, "step": 37500 }, { "epoch": 2.351539474509312, "eval_loss": 0.916473388671875, "eval_runtime": 499.2162, "eval_samples_per_second": 255.545, "eval_steps_per_second": 7.987, "step": 37500 }, { "epoch": 2.357810246441337, "grad_norm": 0.15957336127758026, "learning_rate": 9.129927363305405e-06, "loss": 0.8304, "step": 37600 }, { "epoch": 2.357810246441337, "eval_loss": 0.9107189774513245, "eval_runtime": 494.2784, "eval_samples_per_second": 258.097, "eval_steps_per_second": 8.066, "step": 37600 }, { "epoch": 2.3640810183733616, "grad_norm": 0.21330799162387848, "learning_rate": 9.09508962009441e-06, "loss": 0.6255, "step": 37700 }, { "epoch": 2.3640810183733616, "eval_loss": 0.8891344666481018, "eval_runtime": 489.2061, "eval_samples_per_second": 260.774, "eval_steps_per_second": 8.15, "step": 37700 }, { "epoch": 2.370351790305387, "grad_norm": 1.2431716918945312, "learning_rate": 9.060251876883416e-06, "loss": 0.6775, "step": 37800 }, { "epoch": 2.370351790305387, "eval_loss": 0.8907997608184814, "eval_runtime": 497.9968, "eval_samples_per_second": 256.17, "eval_steps_per_second": 8.006, "step": 37800 }, { "epoch": 2.3766225622374115, "grad_norm": 0.6021884679794312, "learning_rate": 9.025414133672421e-06, "loss": 0.7159, "step": 37900 }, { "epoch": 2.3766225622374115, "eval_loss": 0.8589950203895569, "eval_runtime": 499.2581, "eval_samples_per_second": 255.523, "eval_steps_per_second": 7.986, "step": 37900 }, { "epoch": 2.3828933341694363, "grad_norm": 0.35575389862060547, "learning_rate": 8.990576390461425e-06, "loss": 0.6422, "step": 38000 }, { "epoch": 2.3828933341694363, "eval_loss": 0.8558962941169739, "eval_runtime": 493.9361, "eval_samples_per_second": 258.276, "eval_steps_per_second": 8.072, "step": 38000 }, { "epoch": 2.389164106101461, "grad_norm": 0.1820683479309082, "learning_rate": 8.955738647250432e-06, "loss": 0.7773, "step": 38100 }, { "epoch": 2.389164106101461, "eval_loss": 0.8600557446479797, "eval_runtime": 493.6945, "eval_samples_per_second": 258.403, "eval_steps_per_second": 8.076, "step": 38100 }, { "epoch": 2.3954348780334858, "grad_norm": 0.02349485270678997, "learning_rate": 8.920900904039436e-06, "loss": 0.5457, "step": 38200 }, { "epoch": 2.3954348780334858, "eval_loss": 0.8856033086776733, "eval_runtime": 499.9609, "eval_samples_per_second": 255.164, "eval_steps_per_second": 7.975, "step": 38200 }, { "epoch": 2.401705649965511, "grad_norm": 38.077266693115234, "learning_rate": 8.886063160828443e-06, "loss": 0.4997, "step": 38300 }, { "epoch": 2.401705649965511, "eval_loss": 0.8785237669944763, "eval_runtime": 503.5878, "eval_samples_per_second": 253.326, "eval_steps_per_second": 7.917, "step": 38300 }, { "epoch": 2.4079764218975357, "grad_norm": 100.00057983398438, "learning_rate": 8.851225417617447e-06, "loss": 0.6319, "step": 38400 }, { "epoch": 2.4079764218975357, "eval_loss": 0.885017454624176, "eval_runtime": 496.4876, "eval_samples_per_second": 256.949, "eval_steps_per_second": 8.03, "step": 38400 }, { "epoch": 2.4142471938295604, "grad_norm": 71.8719253540039, "learning_rate": 8.816387674406452e-06, "loss": 0.7096, "step": 38500 }, { "epoch": 2.4142471938295604, "eval_loss": 0.823376476764679, "eval_runtime": 500.2741, "eval_samples_per_second": 255.004, "eval_steps_per_second": 7.97, "step": 38500 } ], "logging_steps": 100, "max_steps": 63788, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }