|
{ |
|
"best_metric": 0.823376476764679, |
|
"best_model_checkpoint": "./gte-small-pairscore/checkpoint-38500", |
|
"epoch": 2.4142471938295604, |
|
"eval_steps": 100, |
|
"global_step": 38500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.006270771932024832, |
|
"grad_norm": 8.922538757324219, |
|
"learning_rate": 3.135287662643047e-07, |
|
"loss": 6.3669, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.006270771932024832, |
|
"eval_loss": 6.6512651443481445, |
|
"eval_runtime": 499.3604, |
|
"eval_samples_per_second": 255.471, |
|
"eval_steps_per_second": 7.984, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.012541543864049664, |
|
"grad_norm": 8.918222427368164, |
|
"learning_rate": 6.270575325286094e-07, |
|
"loss": 6.1795, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.012541543864049664, |
|
"eval_loss": 6.254122734069824, |
|
"eval_runtime": 501.3941, |
|
"eval_samples_per_second": 254.435, |
|
"eval_steps_per_second": 7.952, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.018812315796074498, |
|
"grad_norm": 10.144318580627441, |
|
"learning_rate": 9.405862987929143e-07, |
|
"loss": 5.893, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.018812315796074498, |
|
"eval_loss": 5.773278713226318, |
|
"eval_runtime": 495.3793, |
|
"eval_samples_per_second": 257.524, |
|
"eval_steps_per_second": 8.048, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.025083087728099328, |
|
"grad_norm": 9.239725112915039, |
|
"learning_rate": 1.2541150650572189e-06, |
|
"loss": 5.5099, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.025083087728099328, |
|
"eval_loss": 5.362614154815674, |
|
"eval_runtime": 502.6963, |
|
"eval_samples_per_second": 253.776, |
|
"eval_steps_per_second": 7.931, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.03135385966012416, |
|
"grad_norm": 9.863190650939941, |
|
"learning_rate": 1.5676438313215239e-06, |
|
"loss": 5.1589, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.03135385966012416, |
|
"eval_loss": 4.990196228027344, |
|
"eval_runtime": 497.9208, |
|
"eval_samples_per_second": 256.209, |
|
"eval_steps_per_second": 8.007, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.037624631592148995, |
|
"grad_norm": 13.765459060668945, |
|
"learning_rate": 1.8811725975858286e-06, |
|
"loss": 4.8599, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.037624631592148995, |
|
"eval_loss": 4.652267932891846, |
|
"eval_runtime": 502.9499, |
|
"eval_samples_per_second": 253.648, |
|
"eval_steps_per_second": 7.927, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.04389540352417383, |
|
"grad_norm": 12.12948226928711, |
|
"learning_rate": 2.1915660761874904e-06, |
|
"loss": 4.6075, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.04389540352417383, |
|
"eval_loss": 4.423278331756592, |
|
"eval_runtime": 500.0391, |
|
"eval_samples_per_second": 255.124, |
|
"eval_steps_per_second": 7.973, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.050166175456198656, |
|
"grad_norm": 19.26013946533203, |
|
"learning_rate": 2.505094842451795e-06, |
|
"loss": 4.3831, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.050166175456198656, |
|
"eval_loss": 4.243069171905518, |
|
"eval_runtime": 502.9493, |
|
"eval_samples_per_second": 253.648, |
|
"eval_steps_per_second": 7.927, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.05643694738822349, |
|
"grad_norm": 15.950737953186035, |
|
"learning_rate": 2.8186236087161e-06, |
|
"loss": 4.1737, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.05643694738822349, |
|
"eval_loss": 4.134955883026123, |
|
"eval_runtime": 500.339, |
|
"eval_samples_per_second": 254.971, |
|
"eval_steps_per_second": 7.969, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.06270771932024832, |
|
"grad_norm": 22.265525817871094, |
|
"learning_rate": 3.1290170873177617e-06, |
|
"loss": 4.0266, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.06270771932024832, |
|
"eval_loss": 4.0326995849609375, |
|
"eval_runtime": 498.5852, |
|
"eval_samples_per_second": 255.868, |
|
"eval_steps_per_second": 7.997, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.06897849125227315, |
|
"grad_norm": 25.777463912963867, |
|
"learning_rate": 3.4425458535820665e-06, |
|
"loss": 3.9526, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.06897849125227315, |
|
"eval_loss": 3.928138256072998, |
|
"eval_runtime": 500.7885, |
|
"eval_samples_per_second": 254.742, |
|
"eval_steps_per_second": 7.961, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.07524926318429799, |
|
"grad_norm": 23.928476333618164, |
|
"learning_rate": 3.756074619846371e-06, |
|
"loss": 3.8773, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.07524926318429799, |
|
"eval_loss": 3.873471260070801, |
|
"eval_runtime": 500.5002, |
|
"eval_samples_per_second": 254.889, |
|
"eval_steps_per_second": 7.966, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.08152003511632282, |
|
"grad_norm": 33.25246810913086, |
|
"learning_rate": 4.069603386110676e-06, |
|
"loss": 3.7856, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.08152003511632282, |
|
"eval_loss": 3.777860164642334, |
|
"eval_runtime": 502.3139, |
|
"eval_samples_per_second": 253.969, |
|
"eval_steps_per_second": 7.937, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.08779080704834766, |
|
"grad_norm": 29.081541061401367, |
|
"learning_rate": 4.383132152374981e-06, |
|
"loss": 3.5994, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.08779080704834766, |
|
"eval_loss": 3.705378532409668, |
|
"eval_runtime": 500.945, |
|
"eval_samples_per_second": 254.663, |
|
"eval_steps_per_second": 7.959, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.09406157898037248, |
|
"grad_norm": 32.77998352050781, |
|
"learning_rate": 4.696660918639286e-06, |
|
"loss": 3.7067, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.09406157898037248, |
|
"eval_loss": 3.615516185760498, |
|
"eval_runtime": 495.8157, |
|
"eval_samples_per_second": 257.297, |
|
"eval_steps_per_second": 8.041, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.10033235091239731, |
|
"grad_norm": 80.04820251464844, |
|
"learning_rate": 5.01018968490359e-06, |
|
"loss": 3.5471, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.10033235091239731, |
|
"eval_loss": 3.57977032661438, |
|
"eval_runtime": 494.2912, |
|
"eval_samples_per_second": 258.091, |
|
"eval_steps_per_second": 8.066, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.10660312284442215, |
|
"grad_norm": 32.79664611816406, |
|
"learning_rate": 5.323718451167895e-06, |
|
"loss": 3.6679, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.10660312284442215, |
|
"eval_loss": 3.4653944969177246, |
|
"eval_runtime": 487.8092, |
|
"eval_samples_per_second": 261.52, |
|
"eval_steps_per_second": 8.173, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.11287389477644698, |
|
"grad_norm": 52.47648620605469, |
|
"learning_rate": 5.6372472174322e-06, |
|
"loss": 3.4484, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.11287389477644698, |
|
"eval_loss": 3.4174623489379883, |
|
"eval_runtime": 488.4427, |
|
"eval_samples_per_second": 261.181, |
|
"eval_steps_per_second": 8.163, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.11914466670847182, |
|
"grad_norm": 56.593727111816406, |
|
"learning_rate": 5.947640696033862e-06, |
|
"loss": 3.377, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.11914466670847182, |
|
"eval_loss": 3.412893056869507, |
|
"eval_runtime": 487.7362, |
|
"eval_samples_per_second": 261.559, |
|
"eval_steps_per_second": 8.175, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.12541543864049665, |
|
"grad_norm": 47.65892028808594, |
|
"learning_rate": 6.2611694622981665e-06, |
|
"loss": 3.4259, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.12541543864049665, |
|
"eval_loss": 3.3347389698028564, |
|
"eval_runtime": 485.6244, |
|
"eval_samples_per_second": 262.697, |
|
"eval_steps_per_second": 8.21, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.13168621057252147, |
|
"grad_norm": 46.80497741699219, |
|
"learning_rate": 6.574698228562471e-06, |
|
"loss": 3.4832, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.13168621057252147, |
|
"eval_loss": 3.2113163471221924, |
|
"eval_runtime": 485.3601, |
|
"eval_samples_per_second": 262.84, |
|
"eval_steps_per_second": 8.215, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.1379569825045463, |
|
"grad_norm": 61.08994674682617, |
|
"learning_rate": 6.888226994826775e-06, |
|
"loss": 3.3043, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.1379569825045463, |
|
"eval_loss": 3.164067268371582, |
|
"eval_runtime": 486.9272, |
|
"eval_samples_per_second": 261.994, |
|
"eval_steps_per_second": 8.188, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.14422775443657115, |
|
"grad_norm": 54.029170989990234, |
|
"learning_rate": 7.201755761091081e-06, |
|
"loss": 3.2344, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.14422775443657115, |
|
"eval_loss": 3.1528868675231934, |
|
"eval_runtime": 485.6647, |
|
"eval_samples_per_second": 262.675, |
|
"eval_steps_per_second": 8.209, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.15049852636859598, |
|
"grad_norm": 51.89152526855469, |
|
"learning_rate": 7.5152845273553855e-06, |
|
"loss": 3.1238, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.15049852636859598, |
|
"eval_loss": 3.2577104568481445, |
|
"eval_runtime": 483.4268, |
|
"eval_samples_per_second": 263.891, |
|
"eval_steps_per_second": 8.247, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.1567692983006208, |
|
"grad_norm": 42.53623962402344, |
|
"learning_rate": 7.828813293619691e-06, |
|
"loss": 3.1456, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.1567692983006208, |
|
"eval_loss": 3.067770481109619, |
|
"eval_runtime": 483.4427, |
|
"eval_samples_per_second": 263.882, |
|
"eval_steps_per_second": 8.247, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.16304007023264563, |
|
"grad_norm": 40.32865905761719, |
|
"learning_rate": 8.142342059883996e-06, |
|
"loss": 3.0223, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.16304007023264563, |
|
"eval_loss": 3.000631332397461, |
|
"eval_runtime": 490.1586, |
|
"eval_samples_per_second": 260.267, |
|
"eval_steps_per_second": 8.134, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.16931084216467046, |
|
"grad_norm": 52.34198760986328, |
|
"learning_rate": 8.4558708261483e-06, |
|
"loss": 3.2046, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.16931084216467046, |
|
"eval_loss": 2.9682161808013916, |
|
"eval_runtime": 501.7817, |
|
"eval_samples_per_second": 254.238, |
|
"eval_steps_per_second": 7.946, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.17558161409669532, |
|
"grad_norm": 54.570518493652344, |
|
"learning_rate": 8.766264304749962e-06, |
|
"loss": 3.0866, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.17558161409669532, |
|
"eval_loss": 3.0524070262908936, |
|
"eval_runtime": 492.0201, |
|
"eval_samples_per_second": 259.282, |
|
"eval_steps_per_second": 8.103, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.18185238602872014, |
|
"grad_norm": 91.97798156738281, |
|
"learning_rate": 9.079793071014266e-06, |
|
"loss": 2.9271, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.18185238602872014, |
|
"eval_loss": 3.057253122329712, |
|
"eval_runtime": 483.8055, |
|
"eval_samples_per_second": 263.684, |
|
"eval_steps_per_second": 8.241, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.18812315796074497, |
|
"grad_norm": 72.72432708740234, |
|
"learning_rate": 9.393321837278571e-06, |
|
"loss": 2.7692, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.18812315796074497, |
|
"eval_loss": 3.0557968616485596, |
|
"eval_runtime": 481.3118, |
|
"eval_samples_per_second": 265.051, |
|
"eval_steps_per_second": 8.284, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.1943939298927698, |
|
"grad_norm": 66.55966186523438, |
|
"learning_rate": 9.706850603542876e-06, |
|
"loss": 3.1498, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.1943939298927698, |
|
"eval_loss": 2.786630153656006, |
|
"eval_runtime": 470.7517, |
|
"eval_samples_per_second": 270.996, |
|
"eval_steps_per_second": 8.469, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.20066470182479462, |
|
"grad_norm": 60.73588180541992, |
|
"learning_rate": 1.002037936980718e-05, |
|
"loss": 3.0683, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.20066470182479462, |
|
"eval_loss": 2.847790002822876, |
|
"eval_runtime": 479.7323, |
|
"eval_samples_per_second": 265.923, |
|
"eval_steps_per_second": 8.311, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.20693547375681948, |
|
"grad_norm": 138.51925659179688, |
|
"learning_rate": 1.0333908136071484e-05, |
|
"loss": 2.5776, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.20693547375681948, |
|
"eval_loss": 2.9458932876586914, |
|
"eval_runtime": 476.173, |
|
"eval_samples_per_second": 267.911, |
|
"eval_steps_per_second": 8.373, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.2132062456888443, |
|
"grad_norm": 44.883033752441406, |
|
"learning_rate": 1.064743690233579e-05, |
|
"loss": 2.9394, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.2132062456888443, |
|
"eval_loss": 2.7133240699768066, |
|
"eval_runtime": 484.1605, |
|
"eval_samples_per_second": 263.491, |
|
"eval_steps_per_second": 8.235, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.21947701762086913, |
|
"grad_norm": 62.664493560791016, |
|
"learning_rate": 1.0960965668600095e-05, |
|
"loss": 2.6996, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.21947701762086913, |
|
"eval_loss": 2.8582112789154053, |
|
"eval_runtime": 486.4854, |
|
"eval_samples_per_second": 262.232, |
|
"eval_steps_per_second": 8.196, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.22574778955289396, |
|
"grad_norm": 121.68364715576172, |
|
"learning_rate": 1.12744944348644e-05, |
|
"loss": 2.569, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.22574778955289396, |
|
"eval_loss": 2.8092362880706787, |
|
"eval_runtime": 488.2917, |
|
"eval_samples_per_second": 261.262, |
|
"eval_steps_per_second": 8.165, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.23201856148491878, |
|
"grad_norm": 62.47746658325195, |
|
"learning_rate": 1.1588023201128705e-05, |
|
"loss": 2.6535, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.23201856148491878, |
|
"eval_loss": 2.7977445125579834, |
|
"eval_runtime": 489.6066, |
|
"eval_samples_per_second": 260.56, |
|
"eval_steps_per_second": 8.143, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.23828933341694364, |
|
"grad_norm": 61.19312286376953, |
|
"learning_rate": 1.1901551967393011e-05, |
|
"loss": 2.6679, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.23828933341694364, |
|
"eval_loss": 2.8578476905822754, |
|
"eval_runtime": 490.8833, |
|
"eval_samples_per_second": 259.883, |
|
"eval_steps_per_second": 8.122, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.24456010534896847, |
|
"grad_norm": 52.05066680908203, |
|
"learning_rate": 1.2215080733657314e-05, |
|
"loss": 2.592, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.24456010534896847, |
|
"eval_loss": 2.8251442909240723, |
|
"eval_runtime": 487.5816, |
|
"eval_samples_per_second": 261.642, |
|
"eval_steps_per_second": 8.177, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.2508308772809933, |
|
"grad_norm": 72.0737533569336, |
|
"learning_rate": 1.2525474212258977e-05, |
|
"loss": 2.4931, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.2508308772809933, |
|
"eval_loss": 2.5975987911224365, |
|
"eval_runtime": 483.614, |
|
"eval_samples_per_second": 263.789, |
|
"eval_steps_per_second": 8.244, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.25710164921301815, |
|
"grad_norm": 142.94813537597656, |
|
"learning_rate": 1.283900297852328e-05, |
|
"loss": 2.3012, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.25710164921301815, |
|
"eval_loss": 2.926022529602051, |
|
"eval_runtime": 482.5775, |
|
"eval_samples_per_second": 264.355, |
|
"eval_steps_per_second": 8.262, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.26337242114504295, |
|
"grad_norm": 188.10948181152344, |
|
"learning_rate": 1.3152531744787585e-05, |
|
"loss": 2.4728, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.26337242114504295, |
|
"eval_loss": 2.7869389057159424, |
|
"eval_runtime": 484.6597, |
|
"eval_samples_per_second": 263.22, |
|
"eval_steps_per_second": 8.226, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.2696431930770678, |
|
"grad_norm": 88.77122497558594, |
|
"learning_rate": 1.3466060511051891e-05, |
|
"loss": 2.4391, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.2696431930770678, |
|
"eval_loss": 2.898672580718994, |
|
"eval_runtime": 488.7638, |
|
"eval_samples_per_second": 261.009, |
|
"eval_steps_per_second": 8.157, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.2759139650090926, |
|
"grad_norm": 24.550174713134766, |
|
"learning_rate": 1.3779589277316194e-05, |
|
"loss": 2.3825, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.2759139650090926, |
|
"eval_loss": 2.7803783416748047, |
|
"eval_runtime": 479.1092, |
|
"eval_samples_per_second": 266.269, |
|
"eval_steps_per_second": 8.322, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.28218473694111745, |
|
"grad_norm": 262.1357727050781, |
|
"learning_rate": 1.40931180435805e-05, |
|
"loss": 2.6257, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.28218473694111745, |
|
"eval_loss": 2.8308775424957275, |
|
"eval_runtime": 481.1992, |
|
"eval_samples_per_second": 265.113, |
|
"eval_steps_per_second": 8.286, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.2884555088731423, |
|
"grad_norm": 106.33345794677734, |
|
"learning_rate": 1.4406646809844804e-05, |
|
"loss": 2.4304, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.2884555088731423, |
|
"eval_loss": 3.241865873336792, |
|
"eval_runtime": 480.8967, |
|
"eval_samples_per_second": 265.279, |
|
"eval_steps_per_second": 8.291, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.2947262808051671, |
|
"grad_norm": 24.162464141845703, |
|
"learning_rate": 1.472017557610911e-05, |
|
"loss": 3.0246, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.2947262808051671, |
|
"eval_loss": 2.5731775760650635, |
|
"eval_runtime": 478.865, |
|
"eval_samples_per_second": 266.405, |
|
"eval_steps_per_second": 8.326, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.30099705273719196, |
|
"grad_norm": 89.0951919555664, |
|
"learning_rate": 1.5033704342373415e-05, |
|
"loss": 2.6894, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.30099705273719196, |
|
"eval_loss": 2.8057522773742676, |
|
"eval_runtime": 469.6303, |
|
"eval_samples_per_second": 271.643, |
|
"eval_steps_per_second": 8.49, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.30726782466921676, |
|
"grad_norm": 10.197610855102539, |
|
"learning_rate": 1.5347233108637716e-05, |
|
"loss": 2.5333, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.30726782466921676, |
|
"eval_loss": 2.4581992626190186, |
|
"eval_runtime": 480.2298, |
|
"eval_samples_per_second": 265.648, |
|
"eval_steps_per_second": 8.302, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.3135385966012416, |
|
"grad_norm": 10.737910270690918, |
|
"learning_rate": 1.5660761874902023e-05, |
|
"loss": 2.3268, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.3135385966012416, |
|
"eval_loss": 2.8622071743011475, |
|
"eval_runtime": 477.9553, |
|
"eval_samples_per_second": 266.912, |
|
"eval_steps_per_second": 8.342, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.31980936853326647, |
|
"grad_norm": 107.68405151367188, |
|
"learning_rate": 1.597429064116633e-05, |
|
"loss": 2.6996, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.31980936853326647, |
|
"eval_loss": 2.751514196395874, |
|
"eval_runtime": 479.8929, |
|
"eval_samples_per_second": 265.834, |
|
"eval_steps_per_second": 8.308, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.32608014046529127, |
|
"grad_norm": 137.2300567626953, |
|
"learning_rate": 1.6287819407430632e-05, |
|
"loss": 2.8175, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.32608014046529127, |
|
"eval_loss": 2.5842323303222656, |
|
"eval_runtime": 473.9066, |
|
"eval_samples_per_second": 269.192, |
|
"eval_steps_per_second": 8.413, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.3323509123973161, |
|
"grad_norm": 30.23833465576172, |
|
"learning_rate": 1.660134817369494e-05, |
|
"loss": 2.1244, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.3323509123973161, |
|
"eval_loss": 2.725175380706787, |
|
"eval_runtime": 479.3619, |
|
"eval_samples_per_second": 266.129, |
|
"eval_steps_per_second": 8.317, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.3386216843293409, |
|
"grad_norm": 66.7165756225586, |
|
"learning_rate": 1.6914876939959242e-05, |
|
"loss": 2.7331, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.3386216843293409, |
|
"eval_loss": 2.5052876472473145, |
|
"eval_runtime": 482.2091, |
|
"eval_samples_per_second": 264.557, |
|
"eval_steps_per_second": 8.268, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.3448924562613658, |
|
"grad_norm": 20.616701126098633, |
|
"learning_rate": 1.722840570622355e-05, |
|
"loss": 2.3226, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.3448924562613658, |
|
"eval_loss": 2.2429914474487305, |
|
"eval_runtime": 483.8758, |
|
"eval_samples_per_second": 263.646, |
|
"eval_steps_per_second": 8.24, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.35116322819339063, |
|
"grad_norm": 10.704608917236328, |
|
"learning_rate": 1.754193447248785e-05, |
|
"loss": 2.0706, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.35116322819339063, |
|
"eval_loss": 2.6055426597595215, |
|
"eval_runtime": 481.5222, |
|
"eval_samples_per_second": 264.935, |
|
"eval_steps_per_second": 8.28, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.35743400012541543, |
|
"grad_norm": 116.87510681152344, |
|
"learning_rate": 1.7852327951089514e-05, |
|
"loss": 2.2461, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.35743400012541543, |
|
"eval_loss": 2.894943952560425, |
|
"eval_runtime": 482.8194, |
|
"eval_samples_per_second": 264.223, |
|
"eval_steps_per_second": 8.258, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.3637047720574403, |
|
"grad_norm": 75.6421890258789, |
|
"learning_rate": 1.8165856717353817e-05, |
|
"loss": 2.6365, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.3637047720574403, |
|
"eval_loss": 2.5271661281585693, |
|
"eval_runtime": 486.8779, |
|
"eval_samples_per_second": 262.021, |
|
"eval_steps_per_second": 8.189, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.3699755439894651, |
|
"grad_norm": 25.05718231201172, |
|
"learning_rate": 1.8479385483618124e-05, |
|
"loss": 2.7119, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.3699755439894651, |
|
"eval_loss": 2.433084011077881, |
|
"eval_runtime": 486.349, |
|
"eval_samples_per_second": 262.305, |
|
"eval_steps_per_second": 8.198, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.37624631592148994, |
|
"grad_norm": 88.68294525146484, |
|
"learning_rate": 1.8792914249882427e-05, |
|
"loss": 2.6146, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.37624631592148994, |
|
"eval_loss": 2.385845899581909, |
|
"eval_runtime": 485.6975, |
|
"eval_samples_per_second": 262.657, |
|
"eval_steps_per_second": 8.209, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.3825170878535148, |
|
"grad_norm": 185.446533203125, |
|
"learning_rate": 1.9106443016146733e-05, |
|
"loss": 2.1998, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.3825170878535148, |
|
"eval_loss": 2.6891462802886963, |
|
"eval_runtime": 488.7846, |
|
"eval_samples_per_second": 260.998, |
|
"eval_steps_per_second": 8.157, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.3887878597855396, |
|
"grad_norm": 105.49547576904297, |
|
"learning_rate": 1.9419971782411036e-05, |
|
"loss": 2.5076, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.3887878597855396, |
|
"eval_loss": 2.3827390670776367, |
|
"eval_runtime": 493.7693, |
|
"eval_samples_per_second": 258.364, |
|
"eval_steps_per_second": 8.075, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.39505863171756445, |
|
"grad_norm": 20.25705909729004, |
|
"learning_rate": 1.9733500548675343e-05, |
|
"loss": 2.5244, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.39505863171756445, |
|
"eval_loss": 2.6522157192230225, |
|
"eval_runtime": 500.1844, |
|
"eval_samples_per_second": 255.05, |
|
"eval_steps_per_second": 7.971, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.40132940364958924, |
|
"grad_norm": 1.7094597816467285, |
|
"learning_rate": 1.9994774338518353e-05, |
|
"loss": 2.0613, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.40132940364958924, |
|
"eval_loss": 2.4750421047210693, |
|
"eval_runtime": 501.9454, |
|
"eval_samples_per_second": 254.155, |
|
"eval_steps_per_second": 7.943, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.4076001755816141, |
|
"grad_norm": 2.9270060062408447, |
|
"learning_rate": 1.995993659530736e-05, |
|
"loss": 2.465, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.4076001755816141, |
|
"eval_loss": 2.525411367416382, |
|
"eval_runtime": 501.3359, |
|
"eval_samples_per_second": 254.464, |
|
"eval_steps_per_second": 7.953, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.41387094751363895, |
|
"grad_norm": 95.53108215332031, |
|
"learning_rate": 1.9925098852096362e-05, |
|
"loss": 2.3201, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.41387094751363895, |
|
"eval_loss": 2.2248587608337402, |
|
"eval_runtime": 495.8221, |
|
"eval_samples_per_second": 257.294, |
|
"eval_steps_per_second": 8.041, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.42014171944566375, |
|
"grad_norm": 19.441762924194336, |
|
"learning_rate": 1.9890261108885365e-05, |
|
"loss": 2.234, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.42014171944566375, |
|
"eval_loss": 2.5168297290802, |
|
"eval_runtime": 497.6361, |
|
"eval_samples_per_second": 256.356, |
|
"eval_steps_per_second": 8.012, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.4264124913776886, |
|
"grad_norm": 13.225996017456055, |
|
"learning_rate": 1.985542336567437e-05, |
|
"loss": 2.1277, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.4264124913776886, |
|
"eval_loss": 2.5358171463012695, |
|
"eval_runtime": 498.1476, |
|
"eval_samples_per_second": 256.093, |
|
"eval_steps_per_second": 8.004, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.4326832633097134, |
|
"grad_norm": 103.2215347290039, |
|
"learning_rate": 1.9820585622463378e-05, |
|
"loss": 2.3801, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.4326832633097134, |
|
"eval_loss": 2.4991824626922607, |
|
"eval_runtime": 500.8539, |
|
"eval_samples_per_second": 254.709, |
|
"eval_steps_per_second": 7.96, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.43895403524173826, |
|
"grad_norm": 97.55316925048828, |
|
"learning_rate": 1.9785747879252384e-05, |
|
"loss": 2.1443, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.43895403524173826, |
|
"eval_loss": 2.4043357372283936, |
|
"eval_runtime": 499.124, |
|
"eval_samples_per_second": 255.592, |
|
"eval_steps_per_second": 7.988, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.4452248071737631, |
|
"grad_norm": 233.4646759033203, |
|
"learning_rate": 1.9750910136041387e-05, |
|
"loss": 1.9136, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.4452248071737631, |
|
"eval_loss": 2.3874008655548096, |
|
"eval_runtime": 497.0926, |
|
"eval_samples_per_second": 256.636, |
|
"eval_steps_per_second": 8.021, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.4514955791057879, |
|
"grad_norm": 111.59117889404297, |
|
"learning_rate": 1.9716072392830394e-05, |
|
"loss": 2.3067, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.4514955791057879, |
|
"eval_loss": 2.647474765777588, |
|
"eval_runtime": 497.6919, |
|
"eval_samples_per_second": 256.327, |
|
"eval_steps_per_second": 8.011, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.45776635103781277, |
|
"grad_norm": 156.6864776611328, |
|
"learning_rate": 1.96812346496194e-05, |
|
"loss": 2.1464, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.45776635103781277, |
|
"eval_loss": 2.4704177379608154, |
|
"eval_runtime": 500.9739, |
|
"eval_samples_per_second": 254.648, |
|
"eval_steps_per_second": 7.958, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.46403712296983757, |
|
"grad_norm": 22.16613006591797, |
|
"learning_rate": 1.9646396906408406e-05, |
|
"loss": 2.2151, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.46403712296983757, |
|
"eval_loss": 2.519892692565918, |
|
"eval_runtime": 501.5459, |
|
"eval_samples_per_second": 254.358, |
|
"eval_steps_per_second": 7.949, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.4703078949018624, |
|
"grad_norm": 251.9285125732422, |
|
"learning_rate": 1.961155916319741e-05, |
|
"loss": 2.4653, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.4703078949018624, |
|
"eval_loss": 2.529334545135498, |
|
"eval_runtime": 497.5167, |
|
"eval_samples_per_second": 256.418, |
|
"eval_steps_per_second": 8.014, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.4765786668338873, |
|
"grad_norm": 222.22967529296875, |
|
"learning_rate": 1.9576721419986416e-05, |
|
"loss": 2.4425, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.4765786668338873, |
|
"eval_loss": 2.126385450363159, |
|
"eval_runtime": 501.9759, |
|
"eval_samples_per_second": 254.14, |
|
"eval_steps_per_second": 7.943, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.4828494387659121, |
|
"grad_norm": 135.4093780517578, |
|
"learning_rate": 1.954188367677542e-05, |
|
"loss": 2.3138, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.4828494387659121, |
|
"eval_loss": 2.18104887008667, |
|
"eval_runtime": 499.0397, |
|
"eval_samples_per_second": 255.635, |
|
"eval_steps_per_second": 7.989, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.48912021069793693, |
|
"grad_norm": 16.642980575561523, |
|
"learning_rate": 1.9507045933564425e-05, |
|
"loss": 2.247, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.48912021069793693, |
|
"eval_loss": 2.1403872966766357, |
|
"eval_runtime": 487.5913, |
|
"eval_samples_per_second": 261.637, |
|
"eval_steps_per_second": 8.177, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.49539098262996173, |
|
"grad_norm": 163.52439880371094, |
|
"learning_rate": 1.947220819035343e-05, |
|
"loss": 2.1621, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.49539098262996173, |
|
"eval_loss": 2.2122886180877686, |
|
"eval_runtime": 498.6473, |
|
"eval_samples_per_second": 255.836, |
|
"eval_steps_per_second": 7.996, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.5016617545619866, |
|
"grad_norm": 0.11034490168094635, |
|
"learning_rate": 1.9437370447142438e-05, |
|
"loss": 2.1338, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.5016617545619866, |
|
"eval_loss": 2.5108418464660645, |
|
"eval_runtime": 497.4577, |
|
"eval_samples_per_second": 256.448, |
|
"eval_steps_per_second": 8.015, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.5079325264940114, |
|
"grad_norm": 73.57258605957031, |
|
"learning_rate": 1.940253270393144e-05, |
|
"loss": 2.1846, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.5079325264940114, |
|
"eval_loss": 2.149299383163452, |
|
"eval_runtime": 500.0715, |
|
"eval_samples_per_second": 255.108, |
|
"eval_steps_per_second": 7.973, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.5142032984260363, |
|
"grad_norm": 71.24880981445312, |
|
"learning_rate": 1.9367694960720447e-05, |
|
"loss": 2.1167, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.5142032984260363, |
|
"eval_loss": 2.287858486175537, |
|
"eval_runtime": 500.8627, |
|
"eval_samples_per_second": 254.705, |
|
"eval_steps_per_second": 7.96, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.520474070358061, |
|
"grad_norm": 138.7628936767578, |
|
"learning_rate": 1.933285721750945e-05, |
|
"loss": 2.2143, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.520474070358061, |
|
"eval_loss": 2.1663804054260254, |
|
"eval_runtime": 501.8798, |
|
"eval_samples_per_second": 254.188, |
|
"eval_steps_per_second": 7.944, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.5267448422900859, |
|
"grad_norm": 17.088781356811523, |
|
"learning_rate": 1.9298019474298456e-05, |
|
"loss": 2.3152, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.5267448422900859, |
|
"eval_loss": 2.1071760654449463, |
|
"eval_runtime": 495.7183, |
|
"eval_samples_per_second": 257.348, |
|
"eval_steps_per_second": 8.043, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.5330156142221107, |
|
"grad_norm": 267.6972351074219, |
|
"learning_rate": 1.9263181731087462e-05, |
|
"loss": 1.7618, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.5330156142221107, |
|
"eval_loss": 2.032350540161133, |
|
"eval_runtime": 498.1752, |
|
"eval_samples_per_second": 256.079, |
|
"eval_steps_per_second": 8.003, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.5392863861541356, |
|
"grad_norm": 117.72229766845703, |
|
"learning_rate": 1.922834398787647e-05, |
|
"loss": 2.0777, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.5392863861541356, |
|
"eval_loss": 2.4468319416046143, |
|
"eval_runtime": 495.2364, |
|
"eval_samples_per_second": 257.598, |
|
"eval_steps_per_second": 8.051, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.5455571580861605, |
|
"grad_norm": 1.1375752687454224, |
|
"learning_rate": 1.9193506244665472e-05, |
|
"loss": 2.1573, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.5455571580861605, |
|
"eval_loss": 2.2053027153015137, |
|
"eval_runtime": 485.0279, |
|
"eval_samples_per_second": 263.02, |
|
"eval_steps_per_second": 8.22, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.5518279300181852, |
|
"grad_norm": 244.1565704345703, |
|
"learning_rate": 1.9158668501454478e-05, |
|
"loss": 1.9831, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.5518279300181852, |
|
"eval_loss": 2.3276798725128174, |
|
"eval_runtime": 499.4822, |
|
"eval_samples_per_second": 255.409, |
|
"eval_steps_per_second": 7.982, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.55809870195021, |
|
"grad_norm": 69.60086059570312, |
|
"learning_rate": 1.912383075824348e-05, |
|
"loss": 1.9083, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.55809870195021, |
|
"eval_loss": 1.9949347972869873, |
|
"eval_runtime": 495.1776, |
|
"eval_samples_per_second": 257.629, |
|
"eval_steps_per_second": 8.052, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.5643694738822349, |
|
"grad_norm": 1.5420753955841064, |
|
"learning_rate": 1.9088993015032487e-05, |
|
"loss": 1.932, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.5643694738822349, |
|
"eval_loss": 1.9848002195358276, |
|
"eval_runtime": 498.1194, |
|
"eval_samples_per_second": 256.107, |
|
"eval_steps_per_second": 8.004, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.5706402458142598, |
|
"grad_norm": 9.46451473236084, |
|
"learning_rate": 1.9054155271821494e-05, |
|
"loss": 2.3223, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.5706402458142598, |
|
"eval_loss": 1.9191622734069824, |
|
"eval_runtime": 495.6516, |
|
"eval_samples_per_second": 257.382, |
|
"eval_steps_per_second": 8.044, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.5769110177462846, |
|
"grad_norm": 22.84164047241211, |
|
"learning_rate": 1.90193175286105e-05, |
|
"loss": 1.7583, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.5769110177462846, |
|
"eval_loss": 2.0066075325012207, |
|
"eval_runtime": 497.8537, |
|
"eval_samples_per_second": 256.244, |
|
"eval_steps_per_second": 8.008, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.5831817896783094, |
|
"grad_norm": 3.343338966369629, |
|
"learning_rate": 1.8984479785399503e-05, |
|
"loss": 1.6394, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.5831817896783094, |
|
"eval_loss": 2.0322048664093018, |
|
"eval_runtime": 485.1706, |
|
"eval_samples_per_second": 262.943, |
|
"eval_steps_per_second": 8.218, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.5894525616103342, |
|
"grad_norm": 13.116720199584961, |
|
"learning_rate": 1.894964204218851e-05, |
|
"loss": 1.973, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.5894525616103342, |
|
"eval_loss": 2.100987195968628, |
|
"eval_runtime": 492.3424, |
|
"eval_samples_per_second": 259.112, |
|
"eval_steps_per_second": 8.098, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.5957233335423591, |
|
"grad_norm": 136.55160522460938, |
|
"learning_rate": 1.8914804298977512e-05, |
|
"loss": 2.2377, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.5957233335423591, |
|
"eval_loss": 2.11759090423584, |
|
"eval_runtime": 495.0098, |
|
"eval_samples_per_second": 257.716, |
|
"eval_steps_per_second": 8.054, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.6019941054743839, |
|
"grad_norm": 18.554906845092773, |
|
"learning_rate": 1.887996655576652e-05, |
|
"loss": 2.2269, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.6019941054743839, |
|
"eval_loss": 2.002722978591919, |
|
"eval_runtime": 498.3286, |
|
"eval_samples_per_second": 256.0, |
|
"eval_steps_per_second": 8.001, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.6082648774064088, |
|
"grad_norm": 118.95328521728516, |
|
"learning_rate": 1.8845477189987635e-05, |
|
"loss": 1.971, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.6082648774064088, |
|
"eval_loss": 1.9329177141189575, |
|
"eval_runtime": 493.8369, |
|
"eval_samples_per_second": 258.328, |
|
"eval_steps_per_second": 8.074, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.6145356493384335, |
|
"grad_norm": 0.6365923285484314, |
|
"learning_rate": 1.881063944677664e-05, |
|
"loss": 1.8982, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.6145356493384335, |
|
"eval_loss": 1.9797492027282715, |
|
"eval_runtime": 496.6699, |
|
"eval_samples_per_second": 256.855, |
|
"eval_steps_per_second": 8.027, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.6208064212704584, |
|
"grad_norm": 88.75743865966797, |
|
"learning_rate": 1.8775801703565644e-05, |
|
"loss": 2.2853, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.6208064212704584, |
|
"eval_loss": 1.8433477878570557, |
|
"eval_runtime": 502.8332, |
|
"eval_samples_per_second": 253.706, |
|
"eval_steps_per_second": 7.929, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.6270771932024832, |
|
"grad_norm": 2.4320499897003174, |
|
"learning_rate": 1.874096396035465e-05, |
|
"loss": 1.6657, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.6270771932024832, |
|
"eval_loss": 2.0090935230255127, |
|
"eval_runtime": 487.5525, |
|
"eval_samples_per_second": 261.658, |
|
"eval_steps_per_second": 8.178, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.6333479651345081, |
|
"grad_norm": 94.46017456054688, |
|
"learning_rate": 1.8706126217143653e-05, |
|
"loss": 2.0732, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.6333479651345081, |
|
"eval_loss": 1.7602357864379883, |
|
"eval_runtime": 476.6373, |
|
"eval_samples_per_second": 267.65, |
|
"eval_steps_per_second": 8.365, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.6396187370665329, |
|
"grad_norm": 186.80111694335938, |
|
"learning_rate": 1.867128847393266e-05, |
|
"loss": 1.6951, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.6396187370665329, |
|
"eval_loss": 1.8849464654922485, |
|
"eval_runtime": 494.2546, |
|
"eval_samples_per_second": 258.11, |
|
"eval_steps_per_second": 8.067, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.6458895089985577, |
|
"grad_norm": 175.21151733398438, |
|
"learning_rate": 1.8636450730721666e-05, |
|
"loss": 1.6548, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.6458895089985577, |
|
"eval_loss": 2.0065879821777344, |
|
"eval_runtime": 492.3572, |
|
"eval_samples_per_second": 259.105, |
|
"eval_steps_per_second": 8.098, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.6521602809305825, |
|
"grad_norm": 1.4941706657409668, |
|
"learning_rate": 1.8601612987510672e-05, |
|
"loss": 1.7187, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.6521602809305825, |
|
"eval_loss": 1.9644232988357544, |
|
"eval_runtime": 502.7154, |
|
"eval_samples_per_second": 253.766, |
|
"eval_steps_per_second": 7.931, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.6584310528626074, |
|
"grad_norm": 2.646362543106079, |
|
"learning_rate": 1.8566775244299675e-05, |
|
"loss": 2.1948, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.6584310528626074, |
|
"eval_loss": 1.8391690254211426, |
|
"eval_runtime": 478.8123, |
|
"eval_samples_per_second": 266.434, |
|
"eval_steps_per_second": 8.327, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.6647018247946322, |
|
"grad_norm": 47.12744140625, |
|
"learning_rate": 1.8531937501088682e-05, |
|
"loss": 1.9756, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.6647018247946322, |
|
"eval_loss": 1.8404371738433838, |
|
"eval_runtime": 487.2993, |
|
"eval_samples_per_second": 261.794, |
|
"eval_steps_per_second": 8.182, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.6709725967266571, |
|
"grad_norm": 0.8185029029846191, |
|
"learning_rate": 1.8497099757877685e-05, |
|
"loss": 1.7644, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.6709725967266571, |
|
"eval_loss": 1.910104751586914, |
|
"eval_runtime": 487.8167, |
|
"eval_samples_per_second": 261.516, |
|
"eval_steps_per_second": 8.173, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.6772433686586818, |
|
"grad_norm": 115.93226623535156, |
|
"learning_rate": 1.846226201466669e-05, |
|
"loss": 1.6295, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.6772433686586818, |
|
"eval_loss": 1.943991780281067, |
|
"eval_runtime": 489.7195, |
|
"eval_samples_per_second": 260.5, |
|
"eval_steps_per_second": 8.141, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.6835141405907067, |
|
"grad_norm": 112.30680084228516, |
|
"learning_rate": 1.8427772648887807e-05, |
|
"loss": 1.7687, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.6835141405907067, |
|
"eval_loss": 1.903131127357483, |
|
"eval_runtime": 492.5747, |
|
"eval_samples_per_second": 258.99, |
|
"eval_steps_per_second": 8.094, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.6897849125227316, |
|
"grad_norm": 105.42852783203125, |
|
"learning_rate": 1.839293490567681e-05, |
|
"loss": 1.8203, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.6897849125227316, |
|
"eval_loss": 1.9650237560272217, |
|
"eval_runtime": 495.7922, |
|
"eval_samples_per_second": 257.309, |
|
"eval_steps_per_second": 8.042, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.6960556844547564, |
|
"grad_norm": 143.69154357910156, |
|
"learning_rate": 1.8358097162465817e-05, |
|
"loss": 2.3055, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.6960556844547564, |
|
"eval_loss": 1.8432321548461914, |
|
"eval_runtime": 494.4248, |
|
"eval_samples_per_second": 258.021, |
|
"eval_steps_per_second": 8.064, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.7023264563867813, |
|
"grad_norm": 55.0260009765625, |
|
"learning_rate": 1.8323259419254823e-05, |
|
"loss": 1.8294, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.7023264563867813, |
|
"eval_loss": 1.7363530397415161, |
|
"eval_runtime": 498.3754, |
|
"eval_samples_per_second": 255.976, |
|
"eval_steps_per_second": 8.0, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.708597228318806, |
|
"grad_norm": 20.01657485961914, |
|
"learning_rate": 1.828842167604383e-05, |
|
"loss": 2.0026, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.708597228318806, |
|
"eval_loss": 1.789391279220581, |
|
"eval_runtime": 487.8486, |
|
"eval_samples_per_second": 261.499, |
|
"eval_steps_per_second": 8.173, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.7148680002508309, |
|
"grad_norm": 103.39813232421875, |
|
"learning_rate": 1.8253583932832832e-05, |
|
"loss": 1.9916, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.7148680002508309, |
|
"eval_loss": 1.8342993259429932, |
|
"eval_runtime": 493.6206, |
|
"eval_samples_per_second": 258.441, |
|
"eval_steps_per_second": 8.077, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.7211387721828557, |
|
"grad_norm": 24.854759216308594, |
|
"learning_rate": 1.821874618962184e-05, |
|
"loss": 1.8698, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.7211387721828557, |
|
"eval_loss": 1.807905673980713, |
|
"eval_runtime": 494.1372, |
|
"eval_samples_per_second": 258.171, |
|
"eval_steps_per_second": 8.069, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.7274095441148806, |
|
"grad_norm": 2.62512469291687, |
|
"learning_rate": 1.818390844641084e-05, |
|
"loss": 1.5213, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.7274095441148806, |
|
"eval_loss": 1.684904932975769, |
|
"eval_runtime": 492.1644, |
|
"eval_samples_per_second": 259.206, |
|
"eval_steps_per_second": 8.101, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.7336803160469054, |
|
"grad_norm": 68.22614288330078, |
|
"learning_rate": 1.8149070703199848e-05, |
|
"loss": 1.7462, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.7336803160469054, |
|
"eval_loss": 1.732839822769165, |
|
"eval_runtime": 502.4559, |
|
"eval_samples_per_second": 253.897, |
|
"eval_steps_per_second": 7.935, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.7399510879789302, |
|
"grad_norm": 54.836814880371094, |
|
"learning_rate": 1.8114232959988854e-05, |
|
"loss": 1.3519, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.7399510879789302, |
|
"eval_loss": 1.8369685411453247, |
|
"eval_runtime": 491.7357, |
|
"eval_samples_per_second": 259.432, |
|
"eval_steps_per_second": 8.108, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.746221859910955, |
|
"grad_norm": 189.6983184814453, |
|
"learning_rate": 1.807939521677786e-05, |
|
"loss": 1.4935, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.746221859910955, |
|
"eval_loss": 1.72471022605896, |
|
"eval_runtime": 497.1147, |
|
"eval_samples_per_second": 256.625, |
|
"eval_steps_per_second": 8.02, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.7524926318429799, |
|
"grad_norm": 103.63326263427734, |
|
"learning_rate": 1.8044557473566863e-05, |
|
"loss": 1.1721, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.7524926318429799, |
|
"eval_loss": 1.6529266834259033, |
|
"eval_runtime": 496.0569, |
|
"eval_samples_per_second": 257.172, |
|
"eval_steps_per_second": 8.037, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.7587634037750047, |
|
"grad_norm": 118.19406127929688, |
|
"learning_rate": 1.8009719730355866e-05, |
|
"loss": 2.2432, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.7587634037750047, |
|
"eval_loss": 1.6328880786895752, |
|
"eval_runtime": 502.1124, |
|
"eval_samples_per_second": 254.071, |
|
"eval_steps_per_second": 7.94, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.7650341757070296, |
|
"grad_norm": 135.55650329589844, |
|
"learning_rate": 1.7974881987144873e-05, |
|
"loss": 1.6931, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.7650341757070296, |
|
"eval_loss": 1.6563047170639038, |
|
"eval_runtime": 492.9421, |
|
"eval_samples_per_second": 258.797, |
|
"eval_steps_per_second": 8.088, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.7713049476390543, |
|
"grad_norm": 3.689490795135498, |
|
"learning_rate": 1.794004424393388e-05, |
|
"loss": 1.2736, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 0.7713049476390543, |
|
"eval_loss": 1.6984437704086304, |
|
"eval_runtime": 495.6061, |
|
"eval_samples_per_second": 257.406, |
|
"eval_steps_per_second": 8.045, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 0.7775757195710792, |
|
"grad_norm": 88.78681945800781, |
|
"learning_rate": 1.7905206500722885e-05, |
|
"loss": 1.7063, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.7775757195710792, |
|
"eval_loss": 1.6574100255966187, |
|
"eval_runtime": 504.1606, |
|
"eval_samples_per_second": 253.038, |
|
"eval_steps_per_second": 7.908, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.783846491503104, |
|
"grad_norm": 20.35865592956543, |
|
"learning_rate": 1.787036875751189e-05, |
|
"loss": 1.7921, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.783846491503104, |
|
"eval_loss": 1.7759722471237183, |
|
"eval_runtime": 497.8975, |
|
"eval_samples_per_second": 256.221, |
|
"eval_steps_per_second": 8.008, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.7901172634351289, |
|
"grad_norm": 2.046844720840454, |
|
"learning_rate": 1.7835531014300895e-05, |
|
"loss": 1.875, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 0.7901172634351289, |
|
"eval_loss": 1.7148810625076294, |
|
"eval_runtime": 492.3064, |
|
"eval_samples_per_second": 259.131, |
|
"eval_steps_per_second": 8.099, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 0.7963880353671537, |
|
"grad_norm": 0.9655187129974365, |
|
"learning_rate": 1.78006932710899e-05, |
|
"loss": 1.4435, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 0.7963880353671537, |
|
"eval_loss": 1.8084521293640137, |
|
"eval_runtime": 502.53, |
|
"eval_samples_per_second": 253.859, |
|
"eval_steps_per_second": 7.934, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 0.8026588072991785, |
|
"grad_norm": 135.72523498535156, |
|
"learning_rate": 1.7765855527878907e-05, |
|
"loss": 1.5271, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.8026588072991785, |
|
"eval_loss": 1.7246832847595215, |
|
"eval_runtime": 498.9684, |
|
"eval_samples_per_second": 255.672, |
|
"eval_steps_per_second": 7.99, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.8089295792312033, |
|
"grad_norm": 1.4582579135894775, |
|
"learning_rate": 1.773101778466791e-05, |
|
"loss": 1.618, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 0.8089295792312033, |
|
"eval_loss": 1.6542091369628906, |
|
"eval_runtime": 498.6777, |
|
"eval_samples_per_second": 255.821, |
|
"eval_steps_per_second": 7.995, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 0.8152003511632282, |
|
"grad_norm": 240.90525817871094, |
|
"learning_rate": 1.7696528418889027e-05, |
|
"loss": 1.9788, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.8152003511632282, |
|
"eval_loss": 1.5685710906982422, |
|
"eval_runtime": 501.55, |
|
"eval_samples_per_second": 254.355, |
|
"eval_steps_per_second": 7.949, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.821471123095253, |
|
"grad_norm": 0.4606687128543854, |
|
"learning_rate": 1.7661690675678033e-05, |
|
"loss": 1.8213, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 0.821471123095253, |
|
"eval_loss": 1.560313105583191, |
|
"eval_runtime": 495.8302, |
|
"eval_samples_per_second": 257.29, |
|
"eval_steps_per_second": 8.041, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 0.8277418950272779, |
|
"grad_norm": 33.05907440185547, |
|
"learning_rate": 1.7626852932467036e-05, |
|
"loss": 1.3661, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.8277418950272779, |
|
"eval_loss": 1.637640118598938, |
|
"eval_runtime": 499.6063, |
|
"eval_samples_per_second": 255.345, |
|
"eval_steps_per_second": 7.98, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.8340126669593027, |
|
"grad_norm": 58.993228912353516, |
|
"learning_rate": 1.7592015189256042e-05, |
|
"loss": 1.3852, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 0.8340126669593027, |
|
"eval_loss": 1.595252513885498, |
|
"eval_runtime": 497.3714, |
|
"eval_samples_per_second": 256.492, |
|
"eval_steps_per_second": 8.016, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 0.8402834388913275, |
|
"grad_norm": 52.913265228271484, |
|
"learning_rate": 1.7557177446045045e-05, |
|
"loss": 1.4673, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 0.8402834388913275, |
|
"eval_loss": 1.634629487991333, |
|
"eval_runtime": 497.4186, |
|
"eval_samples_per_second": 256.468, |
|
"eval_steps_per_second": 8.015, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 0.8465542108233524, |
|
"grad_norm": 126.8105697631836, |
|
"learning_rate": 1.752233970283405e-05, |
|
"loss": 1.6684, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.8465542108233524, |
|
"eval_loss": 1.5818397998809814, |
|
"eval_runtime": 499.5245, |
|
"eval_samples_per_second": 255.387, |
|
"eval_steps_per_second": 7.982, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.8528249827553772, |
|
"grad_norm": 130.67335510253906, |
|
"learning_rate": 1.7487501959623058e-05, |
|
"loss": 1.686, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 0.8528249827553772, |
|
"eval_loss": 1.5840120315551758, |
|
"eval_runtime": 500.7181, |
|
"eval_samples_per_second": 254.778, |
|
"eval_steps_per_second": 7.963, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 0.8590957546874021, |
|
"grad_norm": 3.3967671394348145, |
|
"learning_rate": 1.7452664216412064e-05, |
|
"loss": 1.4397, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 0.8590957546874021, |
|
"eval_loss": 1.5855337381362915, |
|
"eval_runtime": 502.231, |
|
"eval_samples_per_second": 254.011, |
|
"eval_steps_per_second": 7.939, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 0.8653665266194268, |
|
"grad_norm": 446.9328918457031, |
|
"learning_rate": 1.7417826473201067e-05, |
|
"loss": 1.5973, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 0.8653665266194268, |
|
"eval_loss": 1.720745325088501, |
|
"eval_runtime": 501.237, |
|
"eval_samples_per_second": 254.514, |
|
"eval_steps_per_second": 7.954, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 0.8716372985514517, |
|
"grad_norm": 0.6950648427009583, |
|
"learning_rate": 1.7382988729990073e-05, |
|
"loss": 1.221, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 0.8716372985514517, |
|
"eval_loss": 1.638085961341858, |
|
"eval_runtime": 499.3245, |
|
"eval_samples_per_second": 255.489, |
|
"eval_steps_per_second": 7.985, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 0.8779080704834765, |
|
"grad_norm": 24.994272232055664, |
|
"learning_rate": 1.7348150986779076e-05, |
|
"loss": 1.2082, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.8779080704834765, |
|
"eval_loss": 1.6335324048995972, |
|
"eval_runtime": 501.6663, |
|
"eval_samples_per_second": 254.297, |
|
"eval_steps_per_second": 7.948, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.8841788424155014, |
|
"grad_norm": 0.017005544155836105, |
|
"learning_rate": 1.7313313243568083e-05, |
|
"loss": 1.5399, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 0.8841788424155014, |
|
"eval_loss": 1.643354058265686, |
|
"eval_runtime": 500.7206, |
|
"eval_samples_per_second": 254.777, |
|
"eval_steps_per_second": 7.963, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 0.8904496143475262, |
|
"grad_norm": 48.26883316040039, |
|
"learning_rate": 1.727847550035709e-05, |
|
"loss": 1.5265, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 0.8904496143475262, |
|
"eval_loss": 1.7265760898590088, |
|
"eval_runtime": 503.0351, |
|
"eval_samples_per_second": 253.605, |
|
"eval_steps_per_second": 7.926, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 0.896720386279551, |
|
"grad_norm": 4.5458149909973145, |
|
"learning_rate": 1.7243637757146095e-05, |
|
"loss": 0.9321, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 0.896720386279551, |
|
"eval_loss": 1.5980534553527832, |
|
"eval_runtime": 500.0744, |
|
"eval_samples_per_second": 255.106, |
|
"eval_steps_per_second": 7.973, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 0.9029911582115758, |
|
"grad_norm": 1.9558783769607544, |
|
"learning_rate": 1.72088000139351e-05, |
|
"loss": 1.1133, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 0.9029911582115758, |
|
"eval_loss": 1.612575650215149, |
|
"eval_runtime": 502.1556, |
|
"eval_samples_per_second": 254.049, |
|
"eval_steps_per_second": 7.94, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 0.9092619301436007, |
|
"grad_norm": 71.19198608398438, |
|
"learning_rate": 1.7173962270724105e-05, |
|
"loss": 1.0754, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.9092619301436007, |
|
"eval_loss": 1.6227186918258667, |
|
"eval_runtime": 493.8123, |
|
"eval_samples_per_second": 258.341, |
|
"eval_steps_per_second": 8.074, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.9155327020756255, |
|
"grad_norm": 0.26305466890335083, |
|
"learning_rate": 1.7139124527513108e-05, |
|
"loss": 1.3486, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 0.9155327020756255, |
|
"eval_loss": 1.6142776012420654, |
|
"eval_runtime": 499.3615, |
|
"eval_samples_per_second": 255.47, |
|
"eval_steps_per_second": 7.984, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 0.9218034740076504, |
|
"grad_norm": 35.207157135009766, |
|
"learning_rate": 1.7104286784302114e-05, |
|
"loss": 1.6338, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 0.9218034740076504, |
|
"eval_loss": 1.5451936721801758, |
|
"eval_runtime": 501.5927, |
|
"eval_samples_per_second": 254.334, |
|
"eval_steps_per_second": 7.949, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 0.9280742459396751, |
|
"grad_norm": 213.60140991210938, |
|
"learning_rate": 1.706944904109112e-05, |
|
"loss": 1.389, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 0.9280742459396751, |
|
"eval_loss": 1.6098874807357788, |
|
"eval_runtime": 501.8582, |
|
"eval_samples_per_second": 254.199, |
|
"eval_steps_per_second": 7.944, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 0.9343450178717, |
|
"grad_norm": 111.08502960205078, |
|
"learning_rate": 1.7034611297880123e-05, |
|
"loss": 1.3776, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 0.9343450178717, |
|
"eval_loss": 1.6435140371322632, |
|
"eval_runtime": 502.6322, |
|
"eval_samples_per_second": 253.808, |
|
"eval_steps_per_second": 7.932, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 0.9406157898037248, |
|
"grad_norm": 18.123170852661133, |
|
"learning_rate": 1.699977355466913e-05, |
|
"loss": 1.8714, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.9406157898037248, |
|
"eval_loss": 1.537667989730835, |
|
"eval_runtime": 501.0111, |
|
"eval_samples_per_second": 254.629, |
|
"eval_steps_per_second": 7.958, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.9468865617357497, |
|
"grad_norm": 155.93455505371094, |
|
"learning_rate": 1.6964935811458133e-05, |
|
"loss": 1.1286, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 0.9468865617357497, |
|
"eval_loss": 1.6325874328613281, |
|
"eval_runtime": 502.9153, |
|
"eval_samples_per_second": 253.665, |
|
"eval_steps_per_second": 7.928, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 0.9531573336677746, |
|
"grad_norm": 172.1987762451172, |
|
"learning_rate": 1.693009806824714e-05, |
|
"loss": 1.4029, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 0.9531573336677746, |
|
"eval_loss": 1.6255732774734497, |
|
"eval_runtime": 508.4447, |
|
"eval_samples_per_second": 250.906, |
|
"eval_steps_per_second": 7.842, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 0.9594281055997993, |
|
"grad_norm": 6.499632835388184, |
|
"learning_rate": 1.6895260325036145e-05, |
|
"loss": 1.7772, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 0.9594281055997993, |
|
"eval_loss": 1.5221425294876099, |
|
"eval_runtime": 502.3511, |
|
"eval_samples_per_second": 253.95, |
|
"eval_steps_per_second": 7.937, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 0.9656988775318242, |
|
"grad_norm": 0.45312049984931946, |
|
"learning_rate": 1.686042258182515e-05, |
|
"loss": 1.3415, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 0.9656988775318242, |
|
"eval_loss": 1.5603629350662231, |
|
"eval_runtime": 502.2256, |
|
"eval_samples_per_second": 254.013, |
|
"eval_steps_per_second": 7.939, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 0.971969649463849, |
|
"grad_norm": 1.405121922492981, |
|
"learning_rate": 1.6825584838614155e-05, |
|
"loss": 1.1088, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.971969649463849, |
|
"eval_loss": 1.574865698814392, |
|
"eval_runtime": 517.5942, |
|
"eval_samples_per_second": 246.471, |
|
"eval_steps_per_second": 7.703, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.9782404213958739, |
|
"grad_norm": 6.808924198150635, |
|
"learning_rate": 1.679074709540316e-05, |
|
"loss": 1.4602, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 0.9782404213958739, |
|
"eval_loss": 1.494147777557373, |
|
"eval_runtime": 505.721, |
|
"eval_samples_per_second": 252.258, |
|
"eval_steps_per_second": 7.884, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 0.9845111933278987, |
|
"grad_norm": 88.74259185791016, |
|
"learning_rate": 1.6755909352192167e-05, |
|
"loss": 1.867, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 0.9845111933278987, |
|
"eval_loss": 1.3730698823928833, |
|
"eval_runtime": 515.7676, |
|
"eval_samples_per_second": 247.344, |
|
"eval_steps_per_second": 7.73, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 0.9907819652599235, |
|
"grad_norm": 3.1625919342041016, |
|
"learning_rate": 1.672141998641328e-05, |
|
"loss": 1.4541, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 0.9907819652599235, |
|
"eval_loss": 1.4205607175827026, |
|
"eval_runtime": 486.2625, |
|
"eval_samples_per_second": 262.352, |
|
"eval_steps_per_second": 8.199, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 0.9970527371919483, |
|
"grad_norm": 135.7765655517578, |
|
"learning_rate": 1.6686930620634396e-05, |
|
"loss": 1.1966, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 0.9970527371919483, |
|
"eval_loss": 1.4982208013534546, |
|
"eval_runtime": 494.4178, |
|
"eval_samples_per_second": 258.025, |
|
"eval_steps_per_second": 8.064, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 1.0033235091239732, |
|
"grad_norm": 97.84881591796875, |
|
"learning_rate": 1.6652092877423403e-05, |
|
"loss": 1.1447, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.0033235091239732, |
|
"eval_loss": 1.5120809078216553, |
|
"eval_runtime": 491.4115, |
|
"eval_samples_per_second": 259.603, |
|
"eval_steps_per_second": 8.113, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.009594281055998, |
|
"grad_norm": 152.9120635986328, |
|
"learning_rate": 1.6617255134212406e-05, |
|
"loss": 1.1266, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 1.009594281055998, |
|
"eval_loss": 1.4103273153305054, |
|
"eval_runtime": 501.3393, |
|
"eval_samples_per_second": 254.462, |
|
"eval_steps_per_second": 7.953, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 1.0158650529880229, |
|
"grad_norm": 10.062068939208984, |
|
"learning_rate": 1.6582417391001412e-05, |
|
"loss": 1.1971, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 1.0158650529880229, |
|
"eval_loss": 1.5044476985931396, |
|
"eval_runtime": 500.4234, |
|
"eval_samples_per_second": 254.928, |
|
"eval_steps_per_second": 7.967, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 1.0221358249200476, |
|
"grad_norm": 385.3752136230469, |
|
"learning_rate": 1.6547579647790418e-05, |
|
"loss": 1.3376, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 1.0221358249200476, |
|
"eval_loss": 1.5336840152740479, |
|
"eval_runtime": 494.8747, |
|
"eval_samples_per_second": 257.786, |
|
"eval_steps_per_second": 8.057, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 1.0284065968520726, |
|
"grad_norm": 0.33529093861579895, |
|
"learning_rate": 1.6512741904579425e-05, |
|
"loss": 1.7977, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 1.0284065968520726, |
|
"eval_loss": 1.5711828470230103, |
|
"eval_runtime": 502.7844, |
|
"eval_samples_per_second": 253.731, |
|
"eval_steps_per_second": 7.93, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 1.0346773687840973, |
|
"grad_norm": 228.05165100097656, |
|
"learning_rate": 1.6477904161368428e-05, |
|
"loss": 1.6946, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.0346773687840973, |
|
"eval_loss": 1.5322738885879517, |
|
"eval_runtime": 498.3968, |
|
"eval_samples_per_second": 255.965, |
|
"eval_steps_per_second": 8.0, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.040948140716122, |
|
"grad_norm": 1.2080790996551514, |
|
"learning_rate": 1.6443066418157434e-05, |
|
"loss": 0.8674, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 1.040948140716122, |
|
"eval_loss": 1.4461946487426758, |
|
"eval_runtime": 492.3648, |
|
"eval_samples_per_second": 259.101, |
|
"eval_steps_per_second": 8.098, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 1.047218912648147, |
|
"grad_norm": 68.50479888916016, |
|
"learning_rate": 1.6408228674946437e-05, |
|
"loss": 1.6447, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 1.047218912648147, |
|
"eval_loss": 1.483079433441162, |
|
"eval_runtime": 496.9095, |
|
"eval_samples_per_second": 256.731, |
|
"eval_steps_per_second": 8.024, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 1.0534896845801718, |
|
"grad_norm": 0.08792801946401596, |
|
"learning_rate": 1.6373390931735443e-05, |
|
"loss": 1.2709, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 1.0534896845801718, |
|
"eval_loss": 1.575551986694336, |
|
"eval_runtime": 503.9395, |
|
"eval_samples_per_second": 253.149, |
|
"eval_steps_per_second": 7.912, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 1.0597604565121967, |
|
"grad_norm": 1.81405770778656, |
|
"learning_rate": 1.633855318852445e-05, |
|
"loss": 1.5217, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 1.0597604565121967, |
|
"eval_loss": 1.5059562921524048, |
|
"eval_runtime": 506.8836, |
|
"eval_samples_per_second": 251.679, |
|
"eval_steps_per_second": 7.866, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 1.0660312284442215, |
|
"grad_norm": 38.73731231689453, |
|
"learning_rate": 1.6303715445313456e-05, |
|
"loss": 1.2986, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.0660312284442215, |
|
"eval_loss": 1.4834423065185547, |
|
"eval_runtime": 503.4795, |
|
"eval_samples_per_second": 253.381, |
|
"eval_steps_per_second": 7.919, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.0723020003762462, |
|
"grad_norm": 0.7970458269119263, |
|
"learning_rate": 1.626887770210246e-05, |
|
"loss": 0.9976, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 1.0723020003762462, |
|
"eval_loss": 1.4840906858444214, |
|
"eval_runtime": 504.3957, |
|
"eval_samples_per_second": 252.92, |
|
"eval_steps_per_second": 7.905, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 1.0785727723082712, |
|
"grad_norm": 0.04621260613203049, |
|
"learning_rate": 1.6234039958891465e-05, |
|
"loss": 1.3457, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 1.0785727723082712, |
|
"eval_loss": 1.4227601289749146, |
|
"eval_runtime": 493.9233, |
|
"eval_samples_per_second": 258.283, |
|
"eval_steps_per_second": 8.072, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 1.084843544240296, |
|
"grad_norm": 0.5272818803787231, |
|
"learning_rate": 1.6199202215680468e-05, |
|
"loss": 0.987, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 1.084843544240296, |
|
"eval_loss": 1.3806939125061035, |
|
"eval_runtime": 501.4303, |
|
"eval_samples_per_second": 254.416, |
|
"eval_steps_per_second": 7.951, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 1.091114316172321, |
|
"grad_norm": 0.4564209282398224, |
|
"learning_rate": 1.6164364472469474e-05, |
|
"loss": 1.2714, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 1.091114316172321, |
|
"eval_loss": 1.3470913171768188, |
|
"eval_runtime": 501.1914, |
|
"eval_samples_per_second": 254.538, |
|
"eval_steps_per_second": 7.955, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 1.0973850881043457, |
|
"grad_norm": 14.678479194641113, |
|
"learning_rate": 1.612952672925848e-05, |
|
"loss": 1.298, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.0973850881043457, |
|
"eval_loss": 1.4133707284927368, |
|
"eval_runtime": 499.6488, |
|
"eval_samples_per_second": 255.323, |
|
"eval_steps_per_second": 7.98, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.1036558600363704, |
|
"grad_norm": 1.6324628591537476, |
|
"learning_rate": 1.6094688986047484e-05, |
|
"loss": 0.9522, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 1.1036558600363704, |
|
"eval_loss": 1.4225292205810547, |
|
"eval_runtime": 501.7975, |
|
"eval_samples_per_second": 254.23, |
|
"eval_steps_per_second": 7.945, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 1.1099266319683954, |
|
"grad_norm": 1.6328845024108887, |
|
"learning_rate": 1.605985124283649e-05, |
|
"loss": 1.0634, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 1.1099266319683954, |
|
"eval_loss": 1.4474034309387207, |
|
"eval_runtime": 505.9682, |
|
"eval_samples_per_second": 252.134, |
|
"eval_steps_per_second": 7.88, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 1.11619740390042, |
|
"grad_norm": 0.9931433200836182, |
|
"learning_rate": 1.6025013499625493e-05, |
|
"loss": 1.2889, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 1.11619740390042, |
|
"eval_loss": 1.4678562879562378, |
|
"eval_runtime": 503.5781, |
|
"eval_samples_per_second": 253.331, |
|
"eval_steps_per_second": 7.917, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 1.122468175832445, |
|
"grad_norm": 59.28689956665039, |
|
"learning_rate": 1.5990175756414503e-05, |
|
"loss": 1.7532, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 1.122468175832445, |
|
"eval_loss": 1.3757271766662598, |
|
"eval_runtime": 499.9963, |
|
"eval_samples_per_second": 255.146, |
|
"eval_steps_per_second": 7.974, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 1.1287389477644698, |
|
"grad_norm": 72.52947998046875, |
|
"learning_rate": 1.5955338013203506e-05, |
|
"loss": 1.6613, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.1287389477644698, |
|
"eval_loss": 1.3807989358901978, |
|
"eval_runtime": 492.0199, |
|
"eval_samples_per_second": 259.282, |
|
"eval_steps_per_second": 8.103, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.1350097196964946, |
|
"grad_norm": 29.813941955566406, |
|
"learning_rate": 1.5920500269992512e-05, |
|
"loss": 1.1765, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 1.1350097196964946, |
|
"eval_loss": 1.3903069496154785, |
|
"eval_runtime": 501.0104, |
|
"eval_samples_per_second": 254.629, |
|
"eval_steps_per_second": 7.958, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 1.1412804916285195, |
|
"grad_norm": 1.3140065670013428, |
|
"learning_rate": 1.5885662526781515e-05, |
|
"loss": 1.2787, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 1.1412804916285195, |
|
"eval_loss": 1.3920559883117676, |
|
"eval_runtime": 500.2005, |
|
"eval_samples_per_second": 255.042, |
|
"eval_steps_per_second": 7.971, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 1.1475512635605443, |
|
"grad_norm": 0.21044209599494934, |
|
"learning_rate": 1.585082478357052e-05, |
|
"loss": 1.2532, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 1.1475512635605443, |
|
"eval_loss": 1.3519495725631714, |
|
"eval_runtime": 505.681, |
|
"eval_samples_per_second": 252.278, |
|
"eval_steps_per_second": 7.884, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 1.1538220354925692, |
|
"grad_norm": 56.845211029052734, |
|
"learning_rate": 1.5815987040359528e-05, |
|
"loss": 1.8056, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 1.1538220354925692, |
|
"eval_loss": 1.2984182834625244, |
|
"eval_runtime": 507.7377, |
|
"eval_samples_per_second": 251.256, |
|
"eval_steps_per_second": 7.852, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 1.160092807424594, |
|
"grad_norm": 99.5033950805664, |
|
"learning_rate": 1.5781149297148534e-05, |
|
"loss": 1.0985, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.160092807424594, |
|
"eval_loss": 1.3321679830551147, |
|
"eval_runtime": 504.9231, |
|
"eval_samples_per_second": 252.656, |
|
"eval_steps_per_second": 7.896, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.1663635793566187, |
|
"grad_norm": 75.43387603759766, |
|
"learning_rate": 1.5746311553937537e-05, |
|
"loss": 1.8665, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 1.1663635793566187, |
|
"eval_loss": 1.4059826135635376, |
|
"eval_runtime": 495.7712, |
|
"eval_samples_per_second": 257.32, |
|
"eval_steps_per_second": 8.042, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 1.1726343512886437, |
|
"grad_norm": 111.51386260986328, |
|
"learning_rate": 1.5711473810726543e-05, |
|
"loss": 1.2427, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 1.1726343512886437, |
|
"eval_loss": 1.3774936199188232, |
|
"eval_runtime": 502.6217, |
|
"eval_samples_per_second": 253.813, |
|
"eval_steps_per_second": 7.932, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 1.1789051232206684, |
|
"grad_norm": 1.3077305555343628, |
|
"learning_rate": 1.5676636067515546e-05, |
|
"loss": 1.1241, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 1.1789051232206684, |
|
"eval_loss": 1.3168435096740723, |
|
"eval_runtime": 498.3092, |
|
"eval_samples_per_second": 256.01, |
|
"eval_steps_per_second": 8.001, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 1.1851758951526934, |
|
"grad_norm": 29.557662963867188, |
|
"learning_rate": 1.5641798324304553e-05, |
|
"loss": 1.2348, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 1.1851758951526934, |
|
"eval_loss": 1.353879690170288, |
|
"eval_runtime": 503.07, |
|
"eval_samples_per_second": 253.587, |
|
"eval_steps_per_second": 7.925, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 1.1914466670847181, |
|
"grad_norm": 65.81330871582031, |
|
"learning_rate": 1.560696058109356e-05, |
|
"loss": 1.1709, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.1914466670847181, |
|
"eval_loss": 1.3540174961090088, |
|
"eval_runtime": 498.735, |
|
"eval_samples_per_second": 255.791, |
|
"eval_steps_per_second": 7.994, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.1977174390167429, |
|
"grad_norm": 48.844017028808594, |
|
"learning_rate": 1.5572122837882565e-05, |
|
"loss": 0.8844, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 1.1977174390167429, |
|
"eval_loss": 1.3141909837722778, |
|
"eval_runtime": 498.6069, |
|
"eval_samples_per_second": 255.857, |
|
"eval_steps_per_second": 7.996, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 1.2039882109487678, |
|
"grad_norm": 3.451929807662964, |
|
"learning_rate": 1.5537285094671568e-05, |
|
"loss": 1.0035, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 1.2039882109487678, |
|
"eval_loss": 1.3781260251998901, |
|
"eval_runtime": 506.2945, |
|
"eval_samples_per_second": 251.972, |
|
"eval_steps_per_second": 7.875, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 1.2102589828807926, |
|
"grad_norm": 77.69365692138672, |
|
"learning_rate": 1.5502447351460575e-05, |
|
"loss": 1.4279, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 1.2102589828807926, |
|
"eval_loss": 1.261493444442749, |
|
"eval_runtime": 498.5065, |
|
"eval_samples_per_second": 255.908, |
|
"eval_steps_per_second": 7.998, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 1.2165297548128176, |
|
"grad_norm": 21.791259765625, |
|
"learning_rate": 1.5467609608249577e-05, |
|
"loss": 1.3327, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 1.2165297548128176, |
|
"eval_loss": 1.2696096897125244, |
|
"eval_runtime": 498.1301, |
|
"eval_samples_per_second": 256.102, |
|
"eval_steps_per_second": 8.004, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 1.2228005267448423, |
|
"grad_norm": 2.250319242477417, |
|
"learning_rate": 1.5432771865038584e-05, |
|
"loss": 0.993, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 1.2228005267448423, |
|
"eval_loss": 1.3169900178909302, |
|
"eval_runtime": 495.9918, |
|
"eval_samples_per_second": 257.206, |
|
"eval_steps_per_second": 8.038, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 1.229071298676867, |
|
"grad_norm": 73.77873229980469, |
|
"learning_rate": 1.539793412182759e-05, |
|
"loss": 0.7869, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 1.229071298676867, |
|
"eval_loss": 1.2967498302459717, |
|
"eval_runtime": 497.8866, |
|
"eval_samples_per_second": 256.227, |
|
"eval_steps_per_second": 8.008, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 1.235342070608892, |
|
"grad_norm": 0.07626141607761383, |
|
"learning_rate": 1.5363096378616596e-05, |
|
"loss": 0.985, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 1.235342070608892, |
|
"eval_loss": 1.3056693077087402, |
|
"eval_runtime": 494.5073, |
|
"eval_samples_per_second": 257.978, |
|
"eval_steps_per_second": 8.063, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 1.2416128425409168, |
|
"grad_norm": 4.803875923156738, |
|
"learning_rate": 1.53282586354056e-05, |
|
"loss": 1.1603, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 1.2416128425409168, |
|
"eval_loss": 1.2796647548675537, |
|
"eval_runtime": 496.7276, |
|
"eval_samples_per_second": 256.825, |
|
"eval_steps_per_second": 8.027, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 1.2478836144729417, |
|
"grad_norm": 63.491329193115234, |
|
"learning_rate": 1.5293420892194606e-05, |
|
"loss": 1.2469, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 1.2478836144729417, |
|
"eval_loss": 1.2394485473632812, |
|
"eval_runtime": 504.5722, |
|
"eval_samples_per_second": 252.832, |
|
"eval_steps_per_second": 7.902, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 1.2541543864049665, |
|
"grad_norm": 155.53126525878906, |
|
"learning_rate": 1.525858314898361e-05, |
|
"loss": 1.521, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.2541543864049665, |
|
"eval_loss": 1.2309328317642212, |
|
"eval_runtime": 499.147, |
|
"eval_samples_per_second": 255.58, |
|
"eval_steps_per_second": 7.988, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.2604251583369912, |
|
"grad_norm": 0.10026417672634125, |
|
"learning_rate": 1.5223745405772617e-05, |
|
"loss": 1.2632, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 1.2604251583369912, |
|
"eval_loss": 1.2352900505065918, |
|
"eval_runtime": 498.6806, |
|
"eval_samples_per_second": 255.819, |
|
"eval_steps_per_second": 7.995, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 1.2666959302690162, |
|
"grad_norm": 20.156579971313477, |
|
"learning_rate": 1.518890766256162e-05, |
|
"loss": 1.3621, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 1.2666959302690162, |
|
"eval_loss": 1.2432923316955566, |
|
"eval_runtime": 488.6088, |
|
"eval_samples_per_second": 261.092, |
|
"eval_steps_per_second": 8.16, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 1.272966702201041, |
|
"grad_norm": 1.3594141006469727, |
|
"learning_rate": 1.5154069919350624e-05, |
|
"loss": 1.5145, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 1.272966702201041, |
|
"eval_loss": 1.3064727783203125, |
|
"eval_runtime": 501.5288, |
|
"eval_samples_per_second": 254.366, |
|
"eval_steps_per_second": 7.95, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 1.2792374741330659, |
|
"grad_norm": 26.742637634277344, |
|
"learning_rate": 1.511923217613963e-05, |
|
"loss": 1.3708, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 1.2792374741330659, |
|
"eval_loss": 1.2422964572906494, |
|
"eval_runtime": 504.9841, |
|
"eval_samples_per_second": 252.626, |
|
"eval_steps_per_second": 7.895, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 1.2855082460650906, |
|
"grad_norm": 666.2847290039062, |
|
"learning_rate": 1.5084394432928635e-05, |
|
"loss": 1.1716, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 1.2855082460650906, |
|
"eval_loss": 1.2922592163085938, |
|
"eval_runtime": 502.0283, |
|
"eval_samples_per_second": 254.113, |
|
"eval_steps_per_second": 7.942, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 1.2917790179971154, |
|
"grad_norm": 0.90843665599823, |
|
"learning_rate": 1.5049556689717642e-05, |
|
"loss": 1.419, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 1.2917790179971154, |
|
"eval_loss": 1.2193955183029175, |
|
"eval_runtime": 496.5986, |
|
"eval_samples_per_second": 256.892, |
|
"eval_steps_per_second": 8.029, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 1.2980497899291403, |
|
"grad_norm": 174.012451171875, |
|
"learning_rate": 1.5014718946506646e-05, |
|
"loss": 1.1644, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 1.2980497899291403, |
|
"eval_loss": 1.2368745803833008, |
|
"eval_runtime": 500.022, |
|
"eval_samples_per_second": 255.133, |
|
"eval_steps_per_second": 7.974, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 1.304320561861165, |
|
"grad_norm": 7.468738555908203, |
|
"learning_rate": 1.4979881203295653e-05, |
|
"loss": 1.6589, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 1.304320561861165, |
|
"eval_loss": 1.1971392631530762, |
|
"eval_runtime": 500.4989, |
|
"eval_samples_per_second": 254.89, |
|
"eval_steps_per_second": 7.966, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 1.31059133379319, |
|
"grad_norm": 120.70152282714844, |
|
"learning_rate": 1.4945043460084656e-05, |
|
"loss": 1.0299, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 1.31059133379319, |
|
"eval_loss": 1.2342555522918701, |
|
"eval_runtime": 499.8846, |
|
"eval_samples_per_second": 255.203, |
|
"eval_steps_per_second": 7.976, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 1.3168621057252148, |
|
"grad_norm": 90.38188934326172, |
|
"learning_rate": 1.4910205716873662e-05, |
|
"loss": 1.3452, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.3168621057252148, |
|
"eval_loss": 1.2725248336791992, |
|
"eval_runtime": 490.3505, |
|
"eval_samples_per_second": 260.165, |
|
"eval_steps_per_second": 8.131, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.3231328776572395, |
|
"grad_norm": 0.8048076033592224, |
|
"learning_rate": 1.4875367973662667e-05, |
|
"loss": 1.4234, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 1.3231328776572395, |
|
"eval_loss": 1.2416248321533203, |
|
"eval_runtime": 493.5915, |
|
"eval_samples_per_second": 258.457, |
|
"eval_steps_per_second": 8.078, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 1.3294036495892645, |
|
"grad_norm": 102.93982696533203, |
|
"learning_rate": 1.4840530230451673e-05, |
|
"loss": 1.2496, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 1.3294036495892645, |
|
"eval_loss": 1.3609205484390259, |
|
"eval_runtime": 501.4555, |
|
"eval_samples_per_second": 254.403, |
|
"eval_steps_per_second": 7.951, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 1.3356744215212892, |
|
"grad_norm": 0.0473560094833374, |
|
"learning_rate": 1.4805692487240678e-05, |
|
"loss": 1.2133, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 1.3356744215212892, |
|
"eval_loss": 1.2892857789993286, |
|
"eval_runtime": 486.7923, |
|
"eval_samples_per_second": 262.067, |
|
"eval_steps_per_second": 8.19, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 1.3419451934533142, |
|
"grad_norm": 0.2829754948616028, |
|
"learning_rate": 1.4770854744029684e-05, |
|
"loss": 0.8682, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 1.3419451934533142, |
|
"eval_loss": 1.2352983951568604, |
|
"eval_runtime": 501.6238, |
|
"eval_samples_per_second": 254.318, |
|
"eval_steps_per_second": 7.948, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 1.348215965385339, |
|
"grad_norm": 0.09349790215492249, |
|
"learning_rate": 1.4736017000818687e-05, |
|
"loss": 0.9499, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 1.348215965385339, |
|
"eval_loss": 1.2423368692398071, |
|
"eval_runtime": 503.2262, |
|
"eval_samples_per_second": 253.508, |
|
"eval_steps_per_second": 7.923, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 1.3544867373173637, |
|
"grad_norm": 0.7133996486663818, |
|
"learning_rate": 1.4701179257607693e-05, |
|
"loss": 1.2896, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 1.3544867373173637, |
|
"eval_loss": 1.1796832084655762, |
|
"eval_runtime": 504.2727, |
|
"eval_samples_per_second": 252.982, |
|
"eval_steps_per_second": 7.906, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 1.3607575092493887, |
|
"grad_norm": 44.3637580871582, |
|
"learning_rate": 1.4666341514396698e-05, |
|
"loss": 1.2392, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 1.3607575092493887, |
|
"eval_loss": 1.1962292194366455, |
|
"eval_runtime": 504.4317, |
|
"eval_samples_per_second": 252.902, |
|
"eval_steps_per_second": 7.904, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 1.3670282811814134, |
|
"grad_norm": 41.141788482666016, |
|
"learning_rate": 1.4631503771185704e-05, |
|
"loss": 0.9206, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 1.3670282811814134, |
|
"eval_loss": 1.2483233213424683, |
|
"eval_runtime": 502.7012, |
|
"eval_samples_per_second": 253.773, |
|
"eval_steps_per_second": 7.931, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 1.3732990531134384, |
|
"grad_norm": 0.8109003901481628, |
|
"learning_rate": 1.4596666027974709e-05, |
|
"loss": 1.174, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 1.3732990531134384, |
|
"eval_loss": 1.23282790184021, |
|
"eval_runtime": 505.2234, |
|
"eval_samples_per_second": 252.506, |
|
"eval_steps_per_second": 7.892, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 1.379569825045463, |
|
"grad_norm": 74.6466293334961, |
|
"learning_rate": 1.4562176662195823e-05, |
|
"loss": 1.6361, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.379569825045463, |
|
"eval_loss": 1.1558316946029663, |
|
"eval_runtime": 501.4654, |
|
"eval_samples_per_second": 254.398, |
|
"eval_steps_per_second": 7.951, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.3858405969774878, |
|
"grad_norm": 5.058000087738037, |
|
"learning_rate": 1.452733891898483e-05, |
|
"loss": 0.8284, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 1.3858405969774878, |
|
"eval_loss": 1.271115779876709, |
|
"eval_runtime": 493.6867, |
|
"eval_samples_per_second": 258.407, |
|
"eval_steps_per_second": 8.076, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 1.3921113689095128, |
|
"grad_norm": 1.647706389427185, |
|
"learning_rate": 1.4492849553205946e-05, |
|
"loss": 1.2814, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 1.3921113689095128, |
|
"eval_loss": 1.246185064315796, |
|
"eval_runtime": 492.0347, |
|
"eval_samples_per_second": 259.274, |
|
"eval_steps_per_second": 8.103, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 1.3983821408415376, |
|
"grad_norm": 0.4397072494029999, |
|
"learning_rate": 1.4458011809994949e-05, |
|
"loss": 1.1595, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 1.3983821408415376, |
|
"eval_loss": 1.2613025903701782, |
|
"eval_runtime": 498.6845, |
|
"eval_samples_per_second": 255.817, |
|
"eval_steps_per_second": 7.995, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 1.4046529127735625, |
|
"grad_norm": 16.611690521240234, |
|
"learning_rate": 1.4423174066783955e-05, |
|
"loss": 1.3129, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 1.4046529127735625, |
|
"eval_loss": 1.1816045045852661, |
|
"eval_runtime": 496.2598, |
|
"eval_samples_per_second": 257.067, |
|
"eval_steps_per_second": 8.034, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 1.4109236847055873, |
|
"grad_norm": 69.52592468261719, |
|
"learning_rate": 1.438833632357296e-05, |
|
"loss": 1.1353, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 1.4109236847055873, |
|
"eval_loss": 1.245389699935913, |
|
"eval_runtime": 494.4633, |
|
"eval_samples_per_second": 258.001, |
|
"eval_steps_per_second": 8.063, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 1.417194456637612, |
|
"grad_norm": 6.014486789703369, |
|
"learning_rate": 1.4353498580361966e-05, |
|
"loss": 1.3302, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 1.417194456637612, |
|
"eval_loss": 1.1397989988327026, |
|
"eval_runtime": 503.4853, |
|
"eval_samples_per_second": 253.378, |
|
"eval_steps_per_second": 7.919, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 1.423465228569637, |
|
"grad_norm": 2.0832605361938477, |
|
"learning_rate": 1.4318660837150971e-05, |
|
"loss": 1.1591, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 1.423465228569637, |
|
"eval_loss": 1.2935895919799805, |
|
"eval_runtime": 495.0142, |
|
"eval_samples_per_second": 257.714, |
|
"eval_steps_per_second": 8.054, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 1.4297360005016617, |
|
"grad_norm": 4.5407891273498535, |
|
"learning_rate": 1.4283823093939975e-05, |
|
"loss": 0.6551, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 1.4297360005016617, |
|
"eval_loss": 1.2345027923583984, |
|
"eval_runtime": 489.171, |
|
"eval_samples_per_second": 260.792, |
|
"eval_steps_per_second": 8.151, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 1.4360067724336867, |
|
"grad_norm": 78.76990509033203, |
|
"learning_rate": 1.4248985350728982e-05, |
|
"loss": 1.2884, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 1.4360067724336867, |
|
"eval_loss": 1.1629202365875244, |
|
"eval_runtime": 483.3185, |
|
"eval_samples_per_second": 263.95, |
|
"eval_steps_per_second": 8.249, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 1.4422775443657114, |
|
"grad_norm": 110.63036346435547, |
|
"learning_rate": 1.4214147607517985e-05, |
|
"loss": 1.1769, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.4422775443657114, |
|
"eval_loss": 1.2339965105056763, |
|
"eval_runtime": 486.5591, |
|
"eval_samples_per_second": 262.192, |
|
"eval_steps_per_second": 8.194, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.4485483162977362, |
|
"grad_norm": 126.27979278564453, |
|
"learning_rate": 1.4179309864306991e-05, |
|
"loss": 1.1331, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 1.4485483162977362, |
|
"eval_loss": 1.2035988569259644, |
|
"eval_runtime": 490.9417, |
|
"eval_samples_per_second": 259.852, |
|
"eval_steps_per_second": 8.121, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 1.4548190882297611, |
|
"grad_norm": 0.48294782638549805, |
|
"learning_rate": 1.4144472121095996e-05, |
|
"loss": 1.1008, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 1.4548190882297611, |
|
"eval_loss": 1.1685419082641602, |
|
"eval_runtime": 485.6527, |
|
"eval_samples_per_second": 262.682, |
|
"eval_steps_per_second": 8.21, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 1.4610898601617859, |
|
"grad_norm": 6.466658115386963, |
|
"learning_rate": 1.4109634377885002e-05, |
|
"loss": 1.1487, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 1.4610898601617859, |
|
"eval_loss": 1.1274471282958984, |
|
"eval_runtime": 496.8021, |
|
"eval_samples_per_second": 256.786, |
|
"eval_steps_per_second": 8.025, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 1.4673606320938108, |
|
"grad_norm": 18.893667221069336, |
|
"learning_rate": 1.4074796634674007e-05, |
|
"loss": 0.7753, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 1.4673606320938108, |
|
"eval_loss": 1.1737704277038574, |
|
"eval_runtime": 486.0352, |
|
"eval_samples_per_second": 262.475, |
|
"eval_steps_per_second": 8.203, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 1.4736314040258356, |
|
"grad_norm": 19.157712936401367, |
|
"learning_rate": 1.4039958891463013e-05, |
|
"loss": 1.3236, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 1.4736314040258356, |
|
"eval_loss": 1.2376619577407837, |
|
"eval_runtime": 488.4737, |
|
"eval_samples_per_second": 261.164, |
|
"eval_steps_per_second": 8.162, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 1.4799021759578603, |
|
"grad_norm": 9.691899299621582, |
|
"learning_rate": 1.4005121148252016e-05, |
|
"loss": 0.919, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 1.4799021759578603, |
|
"eval_loss": 1.2018409967422485, |
|
"eval_runtime": 493.2156, |
|
"eval_samples_per_second": 258.654, |
|
"eval_steps_per_second": 8.084, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 1.4861729478898853, |
|
"grad_norm": 98.8059310913086, |
|
"learning_rate": 1.3970283405041022e-05, |
|
"loss": 0.8516, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 1.4861729478898853, |
|
"eval_loss": 1.2296911478042603, |
|
"eval_runtime": 499.0547, |
|
"eval_samples_per_second": 255.627, |
|
"eval_steps_per_second": 7.989, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 1.49244371982191, |
|
"grad_norm": 22.1707706451416, |
|
"learning_rate": 1.3935445661830027e-05, |
|
"loss": 1.092, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 1.49244371982191, |
|
"eval_loss": 1.1629080772399902, |
|
"eval_runtime": 493.4212, |
|
"eval_samples_per_second": 258.546, |
|
"eval_steps_per_second": 8.08, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 1.498714491753935, |
|
"grad_norm": 0.31641775369644165, |
|
"learning_rate": 1.3900607918619033e-05, |
|
"loss": 0.673, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 1.498714491753935, |
|
"eval_loss": 1.2161920070648193, |
|
"eval_runtime": 495.1904, |
|
"eval_samples_per_second": 257.622, |
|
"eval_steps_per_second": 8.051, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 1.5049852636859598, |
|
"grad_norm": 0.4521692097187042, |
|
"learning_rate": 1.3865770175408038e-05, |
|
"loss": 0.994, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.5049852636859598, |
|
"eval_loss": 1.1778312921524048, |
|
"eval_runtime": 494.7958, |
|
"eval_samples_per_second": 257.828, |
|
"eval_steps_per_second": 8.058, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.5112560356179845, |
|
"grad_norm": 1.2718249559402466, |
|
"learning_rate": 1.3830932432197044e-05, |
|
"loss": 0.8766, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 1.5112560356179845, |
|
"eval_loss": 1.1902062892913818, |
|
"eval_runtime": 498.2478, |
|
"eval_samples_per_second": 256.041, |
|
"eval_steps_per_second": 8.002, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 1.5175268075500095, |
|
"grad_norm": 78.13153076171875, |
|
"learning_rate": 1.3796094688986047e-05, |
|
"loss": 1.3818, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 1.5175268075500095, |
|
"eval_loss": 1.1638315916061401, |
|
"eval_runtime": 475.6768, |
|
"eval_samples_per_second": 268.191, |
|
"eval_steps_per_second": 8.382, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 1.5237975794820342, |
|
"grad_norm": 11.799439430236816, |
|
"learning_rate": 1.3761256945775054e-05, |
|
"loss": 1.1215, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 1.5237975794820342, |
|
"eval_loss": 1.1665599346160889, |
|
"eval_runtime": 493.9156, |
|
"eval_samples_per_second": 258.287, |
|
"eval_steps_per_second": 8.072, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 1.5300683514140592, |
|
"grad_norm": 0.15210537612438202, |
|
"learning_rate": 1.3726419202564058e-05, |
|
"loss": 0.8485, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 1.5300683514140592, |
|
"eval_loss": 1.190748929977417, |
|
"eval_runtime": 489.6338, |
|
"eval_samples_per_second": 260.546, |
|
"eval_steps_per_second": 8.143, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 1.536339123346084, |
|
"grad_norm": 111.32445526123047, |
|
"learning_rate": 1.3691581459353065e-05, |
|
"loss": 1.1033, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 1.536339123346084, |
|
"eval_loss": 1.2317506074905396, |
|
"eval_runtime": 495.4364, |
|
"eval_samples_per_second": 257.494, |
|
"eval_steps_per_second": 8.047, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 1.5426098952781087, |
|
"grad_norm": 4.906432151794434, |
|
"learning_rate": 1.365674371614207e-05, |
|
"loss": 0.9001, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 1.5426098952781087, |
|
"eval_loss": 1.2112876176834106, |
|
"eval_runtime": 501.1743, |
|
"eval_samples_per_second": 254.546, |
|
"eval_steps_per_second": 7.955, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 1.5488806672101336, |
|
"grad_norm": 3.4020934104919434, |
|
"learning_rate": 1.3622254350363184e-05, |
|
"loss": 1.3256, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 1.5488806672101336, |
|
"eval_loss": 1.23091721534729, |
|
"eval_runtime": 488.223, |
|
"eval_samples_per_second": 261.299, |
|
"eval_steps_per_second": 8.166, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 1.5551514391421584, |
|
"grad_norm": 172.33592224121094, |
|
"learning_rate": 1.358741660715219e-05, |
|
"loss": 0.8162, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 1.5551514391421584, |
|
"eval_loss": 1.213860273361206, |
|
"eval_runtime": 492.7571, |
|
"eval_samples_per_second": 258.894, |
|
"eval_steps_per_second": 8.091, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 1.5614222110741833, |
|
"grad_norm": 1.1643731594085693, |
|
"learning_rate": 1.3552578863941195e-05, |
|
"loss": 0.5741, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 1.5614222110741833, |
|
"eval_loss": 1.237512469291687, |
|
"eval_runtime": 506.8194, |
|
"eval_samples_per_second": 251.711, |
|
"eval_steps_per_second": 7.867, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 1.567692983006208, |
|
"grad_norm": 106.2492446899414, |
|
"learning_rate": 1.3517741120730201e-05, |
|
"loss": 0.883, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.567692983006208, |
|
"eval_loss": 1.203902244567871, |
|
"eval_runtime": 495.5543, |
|
"eval_samples_per_second": 257.433, |
|
"eval_steps_per_second": 8.046, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.5739637549382328, |
|
"grad_norm": 24.915504455566406, |
|
"learning_rate": 1.3482903377519206e-05, |
|
"loss": 1.1212, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 1.5739637549382328, |
|
"eval_loss": 1.1428111791610718, |
|
"eval_runtime": 489.9866, |
|
"eval_samples_per_second": 260.358, |
|
"eval_steps_per_second": 8.137, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 1.5802345268702578, |
|
"grad_norm": 0.43622246384620667, |
|
"learning_rate": 1.3448065634308212e-05, |
|
"loss": 0.8229, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 1.5802345268702578, |
|
"eval_loss": 1.2338348627090454, |
|
"eval_runtime": 488.67, |
|
"eval_samples_per_second": 261.06, |
|
"eval_steps_per_second": 8.159, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 1.5865052988022825, |
|
"grad_norm": 76.21497344970703, |
|
"learning_rate": 1.3413227891097215e-05, |
|
"loss": 0.8856, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 1.5865052988022825, |
|
"eval_loss": 1.146145224571228, |
|
"eval_runtime": 504.1995, |
|
"eval_samples_per_second": 253.019, |
|
"eval_steps_per_second": 7.908, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 1.5927760707343075, |
|
"grad_norm": 114.51611328125, |
|
"learning_rate": 1.337839014788622e-05, |
|
"loss": 1.2323, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 1.5927760707343075, |
|
"eval_loss": 1.1568622589111328, |
|
"eval_runtime": 492.991, |
|
"eval_samples_per_second": 258.771, |
|
"eval_steps_per_second": 8.087, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 1.5990468426663322, |
|
"grad_norm": 3.8696110248565674, |
|
"learning_rate": 1.3343552404675226e-05, |
|
"loss": 0.9724, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 1.5990468426663322, |
|
"eval_loss": 1.1549348831176758, |
|
"eval_runtime": 499.5621, |
|
"eval_samples_per_second": 255.368, |
|
"eval_steps_per_second": 7.981, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 1.605317614598357, |
|
"grad_norm": 1.6167796850204468, |
|
"learning_rate": 1.330871466146423e-05, |
|
"loss": 1.0791, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 1.605317614598357, |
|
"eval_loss": 1.1160709857940674, |
|
"eval_runtime": 484.6001, |
|
"eval_samples_per_second": 263.252, |
|
"eval_steps_per_second": 8.227, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 1.611588386530382, |
|
"grad_norm": 138.8144073486328, |
|
"learning_rate": 1.3273876918253237e-05, |
|
"loss": 0.9845, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 1.611588386530382, |
|
"eval_loss": 1.1060998439788818, |
|
"eval_runtime": 496.5423, |
|
"eval_samples_per_second": 256.921, |
|
"eval_steps_per_second": 8.03, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 1.6178591584624067, |
|
"grad_norm": 4.400548934936523, |
|
"learning_rate": 1.3239039175042242e-05, |
|
"loss": 1.1591, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 1.6178591584624067, |
|
"eval_loss": 1.110283613204956, |
|
"eval_runtime": 486.9154, |
|
"eval_samples_per_second": 262.0, |
|
"eval_steps_per_second": 8.188, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 1.6241299303944317, |
|
"grad_norm": 239.38189697265625, |
|
"learning_rate": 1.3204201431831248e-05, |
|
"loss": 1.116, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 1.6241299303944317, |
|
"eval_loss": 1.1404825448989868, |
|
"eval_runtime": 492.7605, |
|
"eval_samples_per_second": 258.892, |
|
"eval_steps_per_second": 8.091, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 1.6304007023264564, |
|
"grad_norm": 232.2500457763672, |
|
"learning_rate": 1.3169363688620251e-05, |
|
"loss": 1.2221, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.6304007023264564, |
|
"eval_loss": 1.1528397798538208, |
|
"eval_runtime": 487.3414, |
|
"eval_samples_per_second": 261.771, |
|
"eval_steps_per_second": 8.181, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.6366714742584811, |
|
"grad_norm": 5.894351959228516, |
|
"learning_rate": 1.3134525945409257e-05, |
|
"loss": 0.9085, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 1.6366714742584811, |
|
"eval_loss": 1.139626145362854, |
|
"eval_runtime": 480.0477, |
|
"eval_samples_per_second": 265.749, |
|
"eval_steps_per_second": 8.305, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 1.642942246190506, |
|
"grad_norm": 0.19382409751415253, |
|
"learning_rate": 1.3099688202198262e-05, |
|
"loss": 0.9543, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 1.642942246190506, |
|
"eval_loss": 1.195331335067749, |
|
"eval_runtime": 487.008, |
|
"eval_samples_per_second": 261.951, |
|
"eval_steps_per_second": 8.187, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 1.6492130181225308, |
|
"grad_norm": 240.2974090576172, |
|
"learning_rate": 1.3064850458987268e-05, |
|
"loss": 1.1855, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 1.6492130181225308, |
|
"eval_loss": 1.1792023181915283, |
|
"eval_runtime": 487.2539, |
|
"eval_samples_per_second": 261.818, |
|
"eval_steps_per_second": 8.183, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 1.6554837900545558, |
|
"grad_norm": 5.021773338317871, |
|
"learning_rate": 1.3030012715776273e-05, |
|
"loss": 1.0583, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 1.6554837900545558, |
|
"eval_loss": 1.1666100025177002, |
|
"eval_runtime": 490.5958, |
|
"eval_samples_per_second": 260.035, |
|
"eval_steps_per_second": 8.127, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 1.6617545619865806, |
|
"grad_norm": 0.47061604261398315, |
|
"learning_rate": 1.299517497256528e-05, |
|
"loss": 0.6583, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 1.6617545619865806, |
|
"eval_loss": 1.1151552200317383, |
|
"eval_runtime": 489.33, |
|
"eval_samples_per_second": 260.708, |
|
"eval_steps_per_second": 8.148, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 1.6680253339186053, |
|
"grad_norm": 0.7339816689491272, |
|
"learning_rate": 1.2960337229354282e-05, |
|
"loss": 1.3067, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 1.6680253339186053, |
|
"eval_loss": 1.0397262573242188, |
|
"eval_runtime": 490.7479, |
|
"eval_samples_per_second": 259.954, |
|
"eval_steps_per_second": 8.124, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 1.6742961058506303, |
|
"grad_norm": 0.43579697608947754, |
|
"learning_rate": 1.2925499486143289e-05, |
|
"loss": 1.5336, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 1.6742961058506303, |
|
"eval_loss": 1.1244205236434937, |
|
"eval_runtime": 504.0991, |
|
"eval_samples_per_second": 253.069, |
|
"eval_steps_per_second": 7.909, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 1.680566877782655, |
|
"grad_norm": 0.877700686454773, |
|
"learning_rate": 1.2890661742932293e-05, |
|
"loss": 0.614, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 1.680566877782655, |
|
"eval_loss": 1.1273393630981445, |
|
"eval_runtime": 490.8071, |
|
"eval_samples_per_second": 259.923, |
|
"eval_steps_per_second": 8.123, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 1.68683764971468, |
|
"grad_norm": 2.61261248588562, |
|
"learning_rate": 1.28558239997213e-05, |
|
"loss": 1.0336, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 1.68683764971468, |
|
"eval_loss": 1.067978024482727, |
|
"eval_runtime": 488.4991, |
|
"eval_samples_per_second": 261.151, |
|
"eval_steps_per_second": 8.162, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 1.6931084216467047, |
|
"grad_norm": 1.7996759414672852, |
|
"learning_rate": 1.2821334633942416e-05, |
|
"loss": 1.462, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.6931084216467047, |
|
"eval_loss": 1.0983270406723022, |
|
"eval_runtime": 497.9625, |
|
"eval_samples_per_second": 256.188, |
|
"eval_steps_per_second": 8.007, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.6993791935787295, |
|
"grad_norm": 0.4661722183227539, |
|
"learning_rate": 1.2786496890731419e-05, |
|
"loss": 0.8858, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 1.6993791935787295, |
|
"eval_loss": 1.0672377347946167, |
|
"eval_runtime": 488.5627, |
|
"eval_samples_per_second": 261.117, |
|
"eval_steps_per_second": 8.161, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 1.7056499655107544, |
|
"grad_norm": 131.8981475830078, |
|
"learning_rate": 1.2751659147520425e-05, |
|
"loss": 0.7494, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 1.7056499655107544, |
|
"eval_loss": 1.1623871326446533, |
|
"eval_runtime": 489.2152, |
|
"eval_samples_per_second": 260.769, |
|
"eval_steps_per_second": 8.15, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 1.7119207374427792, |
|
"grad_norm": 1.5505995750427246, |
|
"learning_rate": 1.271682140430943e-05, |
|
"loss": 0.8152, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 1.7119207374427792, |
|
"eval_loss": 1.0928338766098022, |
|
"eval_runtime": 485.3945, |
|
"eval_samples_per_second": 262.821, |
|
"eval_steps_per_second": 8.214, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 1.7181915093748041, |
|
"grad_norm": 0.11606509238481522, |
|
"learning_rate": 1.2681983661098436e-05, |
|
"loss": 0.7785, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 1.7181915093748041, |
|
"eval_loss": 1.0952435731887817, |
|
"eval_runtime": 490.1873, |
|
"eval_samples_per_second": 260.252, |
|
"eval_steps_per_second": 8.134, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 1.7244622813068289, |
|
"grad_norm": 60.00815963745117, |
|
"learning_rate": 1.264714591788744e-05, |
|
"loss": 1.0471, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 1.7244622813068289, |
|
"eval_loss": 1.0999162197113037, |
|
"eval_runtime": 472.8514, |
|
"eval_samples_per_second": 269.793, |
|
"eval_steps_per_second": 8.432, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 1.7307330532388536, |
|
"grad_norm": 0.18325106799602509, |
|
"learning_rate": 1.2612308174676447e-05, |
|
"loss": 1.0994, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 1.7307330532388536, |
|
"eval_loss": 0.9880152344703674, |
|
"eval_runtime": 489.527, |
|
"eval_samples_per_second": 260.603, |
|
"eval_steps_per_second": 8.145, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 1.7370038251708786, |
|
"grad_norm": 33.887603759765625, |
|
"learning_rate": 1.257747043146545e-05, |
|
"loss": 1.0706, |
|
"step": 27700 |
|
}, |
|
{ |
|
"epoch": 1.7370038251708786, |
|
"eval_loss": 1.0416243076324463, |
|
"eval_runtime": 486.6381, |
|
"eval_samples_per_second": 262.15, |
|
"eval_steps_per_second": 8.193, |
|
"step": 27700 |
|
}, |
|
{ |
|
"epoch": 1.7432745971029033, |
|
"grad_norm": 122.05184936523438, |
|
"learning_rate": 1.2542632688254456e-05, |
|
"loss": 1.1158, |
|
"step": 27800 |
|
}, |
|
{ |
|
"epoch": 1.7432745971029033, |
|
"eval_loss": 1.0675890445709229, |
|
"eval_runtime": 488.3694, |
|
"eval_samples_per_second": 261.22, |
|
"eval_steps_per_second": 8.164, |
|
"step": 27800 |
|
}, |
|
{ |
|
"epoch": 1.7495453690349283, |
|
"grad_norm": 3.5680992603302, |
|
"learning_rate": 1.2507794945043461e-05, |
|
"loss": 0.9893, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 1.7495453690349283, |
|
"eval_loss": 1.0288848876953125, |
|
"eval_runtime": 487.5059, |
|
"eval_samples_per_second": 261.683, |
|
"eval_steps_per_second": 8.178, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 1.755816140966953, |
|
"grad_norm": 0.61468905210495, |
|
"learning_rate": 1.2472957201832467e-05, |
|
"loss": 1.2939, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.755816140966953, |
|
"eval_loss": 1.0149768590927124, |
|
"eval_runtime": 496.1264, |
|
"eval_samples_per_second": 257.136, |
|
"eval_steps_per_second": 8.036, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.7620869128989778, |
|
"grad_norm": 0.23548483848571777, |
|
"learning_rate": 1.2438119458621472e-05, |
|
"loss": 0.9543, |
|
"step": 28100 |
|
}, |
|
{ |
|
"epoch": 1.7620869128989778, |
|
"eval_loss": 1.076741099357605, |
|
"eval_runtime": 494.571, |
|
"eval_samples_per_second": 257.945, |
|
"eval_steps_per_second": 8.062, |
|
"step": 28100 |
|
}, |
|
{ |
|
"epoch": 1.7683576848310028, |
|
"grad_norm": 0.04505012556910515, |
|
"learning_rate": 1.2403281715410475e-05, |
|
"loss": 0.7907, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 1.7683576848310028, |
|
"eval_loss": 1.071725845336914, |
|
"eval_runtime": 498.1358, |
|
"eval_samples_per_second": 256.099, |
|
"eval_steps_per_second": 8.004, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 1.7746284567630275, |
|
"grad_norm": 0.3665514886379242, |
|
"learning_rate": 1.2368443972199481e-05, |
|
"loss": 0.92, |
|
"step": 28300 |
|
}, |
|
{ |
|
"epoch": 1.7746284567630275, |
|
"eval_loss": 1.1132545471191406, |
|
"eval_runtime": 494.9621, |
|
"eval_samples_per_second": 257.741, |
|
"eval_steps_per_second": 8.055, |
|
"step": 28300 |
|
}, |
|
{ |
|
"epoch": 1.7808992286950525, |
|
"grad_norm": 2.6903622150421143, |
|
"learning_rate": 1.2333606228988486e-05, |
|
"loss": 0.8636, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 1.7808992286950525, |
|
"eval_loss": 1.070193886756897, |
|
"eval_runtime": 487.101, |
|
"eval_samples_per_second": 261.901, |
|
"eval_steps_per_second": 8.185, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 1.7871700006270772, |
|
"grad_norm": 246.5596923828125, |
|
"learning_rate": 1.2298768485777492e-05, |
|
"loss": 0.9118, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 1.7871700006270772, |
|
"eval_loss": 1.0536377429962158, |
|
"eval_runtime": 500.9429, |
|
"eval_samples_per_second": 254.664, |
|
"eval_steps_per_second": 7.959, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 1.793440772559102, |
|
"grad_norm": 15.87330150604248, |
|
"learning_rate": 1.2263930742566497e-05, |
|
"loss": 1.2643, |
|
"step": 28600 |
|
}, |
|
{ |
|
"epoch": 1.793440772559102, |
|
"eval_loss": 1.135445237159729, |
|
"eval_runtime": 491.8209, |
|
"eval_samples_per_second": 259.387, |
|
"eval_steps_per_second": 8.107, |
|
"step": 28600 |
|
}, |
|
{ |
|
"epoch": 1.7997115444911267, |
|
"grad_norm": 0.04285774007439613, |
|
"learning_rate": 1.2229092999355503e-05, |
|
"loss": 0.8284, |
|
"step": 28700 |
|
}, |
|
{ |
|
"epoch": 1.7997115444911267, |
|
"eval_loss": 1.0714679956436157, |
|
"eval_runtime": 491.3195, |
|
"eval_samples_per_second": 259.652, |
|
"eval_steps_per_second": 8.115, |
|
"step": 28700 |
|
}, |
|
{ |
|
"epoch": 1.8059823164231517, |
|
"grad_norm": 50.862327575683594, |
|
"learning_rate": 1.2194255256144508e-05, |
|
"loss": 0.8447, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 1.8059823164231517, |
|
"eval_loss": 1.0457782745361328, |
|
"eval_runtime": 497.4392, |
|
"eval_samples_per_second": 256.457, |
|
"eval_steps_per_second": 8.015, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 1.8122530883551766, |
|
"grad_norm": 1.507433295249939, |
|
"learning_rate": 1.2159417512933514e-05, |
|
"loss": 1.2102, |
|
"step": 28900 |
|
}, |
|
{ |
|
"epoch": 1.8122530883551766, |
|
"eval_loss": 1.1000713109970093, |
|
"eval_runtime": 492.3678, |
|
"eval_samples_per_second": 259.099, |
|
"eval_steps_per_second": 8.098, |
|
"step": 28900 |
|
}, |
|
{ |
|
"epoch": 1.8185238602872014, |
|
"grad_norm": 182.16946411132812, |
|
"learning_rate": 1.2124579769722517e-05, |
|
"loss": 1.1042, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 1.8185238602872014, |
|
"eval_loss": 1.0364127159118652, |
|
"eval_runtime": 493.1395, |
|
"eval_samples_per_second": 258.694, |
|
"eval_steps_per_second": 8.085, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 1.824794632219226, |
|
"grad_norm": 84.75048065185547, |
|
"learning_rate": 1.2089742026511523e-05, |
|
"loss": 0.9638, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 1.824794632219226, |
|
"eval_loss": 1.0946918725967407, |
|
"eval_runtime": 494.4031, |
|
"eval_samples_per_second": 258.032, |
|
"eval_steps_per_second": 8.064, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 1.8310654041512509, |
|
"grad_norm": 0.5844135284423828, |
|
"learning_rate": 1.2054904283300528e-05, |
|
"loss": 0.6847, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 1.8310654041512509, |
|
"eval_loss": 1.0311741828918457, |
|
"eval_runtime": 481.6292, |
|
"eval_samples_per_second": 264.876, |
|
"eval_steps_per_second": 8.278, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 1.8373361760832758, |
|
"grad_norm": 21.12558364868164, |
|
"learning_rate": 1.2020066540089534e-05, |
|
"loss": 1.7671, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 1.8373361760832758, |
|
"eval_loss": 1.0470467805862427, |
|
"eval_runtime": 494.9594, |
|
"eval_samples_per_second": 257.742, |
|
"eval_steps_per_second": 8.055, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 1.8436069480153008, |
|
"grad_norm": 7.0535407066345215, |
|
"learning_rate": 1.1985228796878539e-05, |
|
"loss": 0.7525, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 1.8436069480153008, |
|
"eval_loss": 1.1158130168914795, |
|
"eval_runtime": 492.3408, |
|
"eval_samples_per_second": 259.113, |
|
"eval_steps_per_second": 8.098, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 1.8498777199473255, |
|
"grad_norm": 0.11249526590108871, |
|
"learning_rate": 1.1950391053667545e-05, |
|
"loss": 1.2843, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 1.8498777199473255, |
|
"eval_loss": 1.0139508247375488, |
|
"eval_runtime": 483.0492, |
|
"eval_samples_per_second": 264.097, |
|
"eval_steps_per_second": 8.254, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 1.8561484918793503, |
|
"grad_norm": 72.12831115722656, |
|
"learning_rate": 1.191590168788866e-05, |
|
"loss": 0.6844, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 1.8561484918793503, |
|
"eval_loss": 1.1603798866271973, |
|
"eval_runtime": 491.4897, |
|
"eval_samples_per_second": 259.562, |
|
"eval_steps_per_second": 8.112, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 1.862419263811375, |
|
"grad_norm": 21.705537796020508, |
|
"learning_rate": 1.1881063944677665e-05, |
|
"loss": 1.2824, |
|
"step": 29700 |
|
}, |
|
{ |
|
"epoch": 1.862419263811375, |
|
"eval_loss": 1.0052319765090942, |
|
"eval_runtime": 487.7473, |
|
"eval_samples_per_second": 261.553, |
|
"eval_steps_per_second": 8.174, |
|
"step": 29700 |
|
}, |
|
{ |
|
"epoch": 1.8686900357434, |
|
"grad_norm": 1.3453004360198975, |
|
"learning_rate": 1.1846226201466671e-05, |
|
"loss": 1.314, |
|
"step": 29800 |
|
}, |
|
{ |
|
"epoch": 1.8686900357434, |
|
"eval_loss": 1.0322686433792114, |
|
"eval_runtime": 480.0979, |
|
"eval_samples_per_second": 265.721, |
|
"eval_steps_per_second": 8.305, |
|
"step": 29800 |
|
}, |
|
{ |
|
"epoch": 1.874960807675425, |
|
"grad_norm": 5.6963677406311035, |
|
"learning_rate": 1.1811388458255676e-05, |
|
"loss": 1.0796, |
|
"step": 29900 |
|
}, |
|
{ |
|
"epoch": 1.874960807675425, |
|
"eval_loss": 1.0885429382324219, |
|
"eval_runtime": 483.9557, |
|
"eval_samples_per_second": 263.603, |
|
"eval_steps_per_second": 8.238, |
|
"step": 29900 |
|
}, |
|
{ |
|
"epoch": 1.8812315796074497, |
|
"grad_norm": 0.3642306923866272, |
|
"learning_rate": 1.1776550715044682e-05, |
|
"loss": 1.0012, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.8812315796074497, |
|
"eval_loss": 1.0266896486282349, |
|
"eval_runtime": 498.6153, |
|
"eval_samples_per_second": 255.853, |
|
"eval_steps_per_second": 7.996, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.8875023515394744, |
|
"grad_norm": 45.68118667602539, |
|
"learning_rate": 1.1741712971833685e-05, |
|
"loss": 1.4932, |
|
"step": 30100 |
|
}, |
|
{ |
|
"epoch": 1.8875023515394744, |
|
"eval_loss": 1.0438352823257446, |
|
"eval_runtime": 479.3134, |
|
"eval_samples_per_second": 266.156, |
|
"eval_steps_per_second": 8.318, |
|
"step": 30100 |
|
}, |
|
{ |
|
"epoch": 1.8937731234714992, |
|
"grad_norm": 159.10227966308594, |
|
"learning_rate": 1.1706875228622691e-05, |
|
"loss": 1.0404, |
|
"step": 30200 |
|
}, |
|
{ |
|
"epoch": 1.8937731234714992, |
|
"eval_loss": 1.0162733793258667, |
|
"eval_runtime": 484.5198, |
|
"eval_samples_per_second": 263.296, |
|
"eval_steps_per_second": 8.229, |
|
"step": 30200 |
|
}, |
|
{ |
|
"epoch": 1.9000438954035241, |
|
"grad_norm": 9.165184020996094, |
|
"learning_rate": 1.1672037485411696e-05, |
|
"loss": 0.614, |
|
"step": 30300 |
|
}, |
|
{ |
|
"epoch": 1.9000438954035241, |
|
"eval_loss": 1.0366989374160767, |
|
"eval_runtime": 494.4949, |
|
"eval_samples_per_second": 257.984, |
|
"eval_steps_per_second": 8.063, |
|
"step": 30300 |
|
}, |
|
{ |
|
"epoch": 1.906314667335549, |
|
"grad_norm": 93.2901840209961, |
|
"learning_rate": 1.1637199742200702e-05, |
|
"loss": 1.2676, |
|
"step": 30400 |
|
}, |
|
{ |
|
"epoch": 1.906314667335549, |
|
"eval_loss": 1.080250859260559, |
|
"eval_runtime": 506.0169, |
|
"eval_samples_per_second": 252.11, |
|
"eval_steps_per_second": 7.879, |
|
"step": 30400 |
|
}, |
|
{ |
|
"epoch": 1.9125854392675739, |
|
"grad_norm": 22.93528938293457, |
|
"learning_rate": 1.1602361998989707e-05, |
|
"loss": 1.2431, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 1.9125854392675739, |
|
"eval_loss": 1.042752742767334, |
|
"eval_runtime": 482.2307, |
|
"eval_samples_per_second": 264.546, |
|
"eval_steps_per_second": 8.268, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 1.9188562111995986, |
|
"grad_norm": 44.19611358642578, |
|
"learning_rate": 1.1567524255778713e-05, |
|
"loss": 1.4063, |
|
"step": 30600 |
|
}, |
|
{ |
|
"epoch": 1.9188562111995986, |
|
"eval_loss": 1.0318702459335327, |
|
"eval_runtime": 482.0351, |
|
"eval_samples_per_second": 264.653, |
|
"eval_steps_per_second": 8.271, |
|
"step": 30600 |
|
}, |
|
{ |
|
"epoch": 1.9251269831316233, |
|
"grad_norm": 0.21961411833763123, |
|
"learning_rate": 1.1532686512567716e-05, |
|
"loss": 0.7787, |
|
"step": 30700 |
|
}, |
|
{ |
|
"epoch": 1.9251269831316233, |
|
"eval_loss": 0.9666246175765991, |
|
"eval_runtime": 497.4003, |
|
"eval_samples_per_second": 256.478, |
|
"eval_steps_per_second": 8.016, |
|
"step": 30700 |
|
}, |
|
{ |
|
"epoch": 1.9313977550636483, |
|
"grad_norm": 5.579217910766602, |
|
"learning_rate": 1.1497848769356722e-05, |
|
"loss": 1.0311, |
|
"step": 30800 |
|
}, |
|
{ |
|
"epoch": 1.9313977550636483, |
|
"eval_loss": 1.0375796556472778, |
|
"eval_runtime": 496.1027, |
|
"eval_samples_per_second": 257.148, |
|
"eval_steps_per_second": 8.037, |
|
"step": 30800 |
|
}, |
|
{ |
|
"epoch": 1.9376685269956733, |
|
"grad_norm": 0.01572820357978344, |
|
"learning_rate": 1.1463011026145727e-05, |
|
"loss": 1.0353, |
|
"step": 30900 |
|
}, |
|
{ |
|
"epoch": 1.9376685269956733, |
|
"eval_loss": 0.9868729114532471, |
|
"eval_runtime": 491.6277, |
|
"eval_samples_per_second": 259.489, |
|
"eval_steps_per_second": 8.11, |
|
"step": 30900 |
|
}, |
|
{ |
|
"epoch": 1.943939298927698, |
|
"grad_norm": 1.0484445095062256, |
|
"learning_rate": 1.1428173282934732e-05, |
|
"loss": 1.2221, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.943939298927698, |
|
"eval_loss": 0.968561589717865, |
|
"eval_runtime": 499.708, |
|
"eval_samples_per_second": 255.293, |
|
"eval_steps_per_second": 7.979, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.9502100708597228, |
|
"grad_norm": 123.73536682128906, |
|
"learning_rate": 1.1393335539723738e-05, |
|
"loss": 0.5806, |
|
"step": 31100 |
|
}, |
|
{ |
|
"epoch": 1.9502100708597228, |
|
"eval_loss": 0.9662685394287109, |
|
"eval_runtime": 496.0179, |
|
"eval_samples_per_second": 257.192, |
|
"eval_steps_per_second": 8.038, |
|
"step": 31100 |
|
}, |
|
{ |
|
"epoch": 1.9564808427917475, |
|
"grad_norm": 265.9390869140625, |
|
"learning_rate": 1.1358497796512741e-05, |
|
"loss": 0.6919, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 1.9564808427917475, |
|
"eval_loss": 0.9837759733200073, |
|
"eval_runtime": 481.0273, |
|
"eval_samples_per_second": 265.207, |
|
"eval_steps_per_second": 8.289, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 1.9627516147237725, |
|
"grad_norm": 1.0015980005264282, |
|
"learning_rate": 1.1323660053301749e-05, |
|
"loss": 0.8028, |
|
"step": 31300 |
|
}, |
|
{ |
|
"epoch": 1.9627516147237725, |
|
"eval_loss": 0.9759084582328796, |
|
"eval_runtime": 487.9887, |
|
"eval_samples_per_second": 261.424, |
|
"eval_steps_per_second": 8.17, |
|
"step": 31300 |
|
}, |
|
{ |
|
"epoch": 1.9690223866557974, |
|
"grad_norm": 31.675607681274414, |
|
"learning_rate": 1.1288822310090752e-05, |
|
"loss": 0.8365, |
|
"step": 31400 |
|
}, |
|
{ |
|
"epoch": 1.9690223866557974, |
|
"eval_loss": 0.9640862345695496, |
|
"eval_runtime": 496.5309, |
|
"eval_samples_per_second": 256.927, |
|
"eval_steps_per_second": 8.03, |
|
"step": 31400 |
|
}, |
|
{ |
|
"epoch": 1.9752931585878222, |
|
"grad_norm": 1.1243913173675537, |
|
"learning_rate": 1.1253984566879758e-05, |
|
"loss": 0.7518, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 1.9752931585878222, |
|
"eval_loss": 1.008094310760498, |
|
"eval_runtime": 499.5695, |
|
"eval_samples_per_second": 255.364, |
|
"eval_steps_per_second": 7.981, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 1.981563930519847, |
|
"grad_norm": 216.04434204101562, |
|
"learning_rate": 1.1219495201100875e-05, |
|
"loss": 1.0654, |
|
"step": 31600 |
|
}, |
|
{ |
|
"epoch": 1.981563930519847, |
|
"eval_loss": 0.9843435287475586, |
|
"eval_runtime": 480.8256, |
|
"eval_samples_per_second": 265.319, |
|
"eval_steps_per_second": 8.292, |
|
"step": 31600 |
|
}, |
|
{ |
|
"epoch": 1.9878347024518717, |
|
"grad_norm": 0.3936084806919098, |
|
"learning_rate": 1.1184657457889878e-05, |
|
"loss": 0.8637, |
|
"step": 31700 |
|
}, |
|
{ |
|
"epoch": 1.9878347024518717, |
|
"eval_loss": 0.963536262512207, |
|
"eval_runtime": 481.4757, |
|
"eval_samples_per_second": 264.96, |
|
"eval_steps_per_second": 8.281, |
|
"step": 31700 |
|
}, |
|
{ |
|
"epoch": 1.9941054743838966, |
|
"grad_norm": 8.97900104522705, |
|
"learning_rate": 1.1149819714678884e-05, |
|
"loss": 0.8663, |
|
"step": 31800 |
|
}, |
|
{ |
|
"epoch": 1.9941054743838966, |
|
"eval_loss": 0.9537881016731262, |
|
"eval_runtime": 488.2812, |
|
"eval_samples_per_second": 261.268, |
|
"eval_steps_per_second": 8.165, |
|
"step": 31800 |
|
}, |
|
{ |
|
"epoch": 2.0003762463159216, |
|
"grad_norm": 0.23352281749248505, |
|
"learning_rate": 1.1114981971467889e-05, |
|
"loss": 0.8524, |
|
"step": 31900 |
|
}, |
|
{ |
|
"epoch": 2.0003762463159216, |
|
"eval_loss": 0.9627546072006226, |
|
"eval_runtime": 476.3461, |
|
"eval_samples_per_second": 267.814, |
|
"eval_steps_per_second": 8.37, |
|
"step": 31900 |
|
}, |
|
{ |
|
"epoch": 2.0066470182479463, |
|
"grad_norm": 10.038532257080078, |
|
"learning_rate": 1.1080144228256895e-05, |
|
"loss": 1.2748, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 2.0066470182479463, |
|
"eval_loss": 0.9381898641586304, |
|
"eval_runtime": 483.6522, |
|
"eval_samples_per_second": 263.768, |
|
"eval_steps_per_second": 8.244, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 2.012917790179971, |
|
"grad_norm": 3.102550745010376, |
|
"learning_rate": 1.10453064850459e-05, |
|
"loss": 0.8138, |
|
"step": 32100 |
|
}, |
|
{ |
|
"epoch": 2.012917790179971, |
|
"eval_loss": 0.9460862874984741, |
|
"eval_runtime": 486.1122, |
|
"eval_samples_per_second": 262.433, |
|
"eval_steps_per_second": 8.202, |
|
"step": 32100 |
|
}, |
|
{ |
|
"epoch": 2.019188562111996, |
|
"grad_norm": 5.872899532318115, |
|
"learning_rate": 1.1010468741834906e-05, |
|
"loss": 0.4484, |
|
"step": 32200 |
|
}, |
|
{ |
|
"epoch": 2.019188562111996, |
|
"eval_loss": 0.9221316576004028, |
|
"eval_runtime": 489.5035, |
|
"eval_samples_per_second": 260.615, |
|
"eval_steps_per_second": 8.145, |
|
"step": 32200 |
|
}, |
|
{ |
|
"epoch": 2.025459334044021, |
|
"grad_norm": 70.84674072265625, |
|
"learning_rate": 1.0975630998623909e-05, |
|
"loss": 0.8839, |
|
"step": 32300 |
|
}, |
|
{ |
|
"epoch": 2.025459334044021, |
|
"eval_loss": 0.9566515684127808, |
|
"eval_runtime": 497.3551, |
|
"eval_samples_per_second": 256.501, |
|
"eval_steps_per_second": 8.016, |
|
"step": 32300 |
|
}, |
|
{ |
|
"epoch": 2.0317301059760458, |
|
"grad_norm": 20.528474807739258, |
|
"learning_rate": 1.0940793255412915e-05, |
|
"loss": 0.7599, |
|
"step": 32400 |
|
}, |
|
{ |
|
"epoch": 2.0317301059760458, |
|
"eval_loss": 0.9439575672149658, |
|
"eval_runtime": 475.9709, |
|
"eval_samples_per_second": 268.025, |
|
"eval_steps_per_second": 8.377, |
|
"step": 32400 |
|
}, |
|
{ |
|
"epoch": 2.0380008779080705, |
|
"grad_norm": 0.2569330930709839, |
|
"learning_rate": 1.090595551220192e-05, |
|
"loss": 0.8665, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 2.0380008779080705, |
|
"eval_loss": 0.9651756882667542, |
|
"eval_runtime": 476.2761, |
|
"eval_samples_per_second": 267.853, |
|
"eval_steps_per_second": 8.371, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 2.0442716498400952, |
|
"grad_norm": 160.0611572265625, |
|
"learning_rate": 1.0871117768990926e-05, |
|
"loss": 0.5802, |
|
"step": 32600 |
|
}, |
|
{ |
|
"epoch": 2.0442716498400952, |
|
"eval_loss": 0.9474946856498718, |
|
"eval_runtime": 498.1214, |
|
"eval_samples_per_second": 256.106, |
|
"eval_steps_per_second": 8.004, |
|
"step": 32600 |
|
}, |
|
{ |
|
"epoch": 2.05054242177212, |
|
"grad_norm": 13.137542724609375, |
|
"learning_rate": 1.083628002577993e-05, |
|
"loss": 0.7731, |
|
"step": 32700 |
|
}, |
|
{ |
|
"epoch": 2.05054242177212, |
|
"eval_loss": 0.9197245240211487, |
|
"eval_runtime": 471.6865, |
|
"eval_samples_per_second": 270.459, |
|
"eval_steps_per_second": 8.453, |
|
"step": 32700 |
|
}, |
|
{ |
|
"epoch": 2.056813193704145, |
|
"grad_norm": 4.745016574859619, |
|
"learning_rate": 1.0801442282568937e-05, |
|
"loss": 0.7913, |
|
"step": 32800 |
|
}, |
|
{ |
|
"epoch": 2.056813193704145, |
|
"eval_loss": 1.002418875694275, |
|
"eval_runtime": 444.3682, |
|
"eval_samples_per_second": 287.086, |
|
"eval_steps_per_second": 8.972, |
|
"step": 32800 |
|
}, |
|
{ |
|
"epoch": 2.06308396563617, |
|
"grad_norm": 273.15252685546875, |
|
"learning_rate": 1.0766604539357942e-05, |
|
"loss": 0.7758, |
|
"step": 32900 |
|
}, |
|
{ |
|
"epoch": 2.06308396563617, |
|
"eval_loss": 0.9257067441940308, |
|
"eval_runtime": 479.6839, |
|
"eval_samples_per_second": 265.95, |
|
"eval_steps_per_second": 8.312, |
|
"step": 32900 |
|
}, |
|
{ |
|
"epoch": 2.0693547375681947, |
|
"grad_norm": 0.2749234437942505, |
|
"learning_rate": 1.0731766796146948e-05, |
|
"loss": 0.7468, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 2.0693547375681947, |
|
"eval_loss": 0.9662745594978333, |
|
"eval_runtime": 482.8123, |
|
"eval_samples_per_second": 264.227, |
|
"eval_steps_per_second": 8.258, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 2.0756255095002194, |
|
"grad_norm": 2.7121362686157227, |
|
"learning_rate": 1.0696929052935951e-05, |
|
"loss": 0.9947, |
|
"step": 33100 |
|
}, |
|
{ |
|
"epoch": 2.0756255095002194, |
|
"eval_loss": 0.9788134098052979, |
|
"eval_runtime": 488.227, |
|
"eval_samples_per_second": 261.296, |
|
"eval_steps_per_second": 8.166, |
|
"step": 33100 |
|
}, |
|
{ |
|
"epoch": 2.081896281432244, |
|
"grad_norm": 0.2543056905269623, |
|
"learning_rate": 1.0662091309724957e-05, |
|
"loss": 0.5618, |
|
"step": 33200 |
|
}, |
|
{ |
|
"epoch": 2.081896281432244, |
|
"eval_loss": 0.948021650314331, |
|
"eval_runtime": 491.5864, |
|
"eval_samples_per_second": 259.511, |
|
"eval_steps_per_second": 8.11, |
|
"step": 33200 |
|
}, |
|
{ |
|
"epoch": 2.0881670533642693, |
|
"grad_norm": 0.034537989646196365, |
|
"learning_rate": 1.0627253566513962e-05, |
|
"loss": 0.8805, |
|
"step": 33300 |
|
}, |
|
{ |
|
"epoch": 2.0881670533642693, |
|
"eval_loss": 0.9520492553710938, |
|
"eval_runtime": 482.9571, |
|
"eval_samples_per_second": 264.148, |
|
"eval_steps_per_second": 8.255, |
|
"step": 33300 |
|
}, |
|
{ |
|
"epoch": 2.094437825296294, |
|
"grad_norm": 4.662662982940674, |
|
"learning_rate": 1.0592415823302968e-05, |
|
"loss": 0.9755, |
|
"step": 33400 |
|
}, |
|
{ |
|
"epoch": 2.094437825296294, |
|
"eval_loss": 0.9288346767425537, |
|
"eval_runtime": 495.4516, |
|
"eval_samples_per_second": 257.486, |
|
"eval_steps_per_second": 8.047, |
|
"step": 33400 |
|
}, |
|
{ |
|
"epoch": 2.100708597228319, |
|
"grad_norm": 64.40668487548828, |
|
"learning_rate": 1.0557578080091973e-05, |
|
"loss": 0.8942, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 2.100708597228319, |
|
"eval_loss": 0.9233998656272888, |
|
"eval_runtime": 486.5273, |
|
"eval_samples_per_second": 262.209, |
|
"eval_steps_per_second": 8.195, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 2.1069793691603436, |
|
"grad_norm": 2.1412320137023926, |
|
"learning_rate": 1.0522740336880976e-05, |
|
"loss": 0.7242, |
|
"step": 33600 |
|
}, |
|
{ |
|
"epoch": 2.1069793691603436, |
|
"eval_loss": 0.9412585496902466, |
|
"eval_runtime": 481.4435, |
|
"eval_samples_per_second": 264.978, |
|
"eval_steps_per_second": 8.281, |
|
"step": 33600 |
|
}, |
|
{ |
|
"epoch": 2.1132501410923683, |
|
"grad_norm": 5.01767635345459, |
|
"learning_rate": 1.0487902593669982e-05, |
|
"loss": 0.6231, |
|
"step": 33700 |
|
}, |
|
{ |
|
"epoch": 2.1132501410923683, |
|
"eval_loss": 0.9660213589668274, |
|
"eval_runtime": 480.6062, |
|
"eval_samples_per_second": 265.44, |
|
"eval_steps_per_second": 8.296, |
|
"step": 33700 |
|
}, |
|
{ |
|
"epoch": 2.1195209130243935, |
|
"grad_norm": 0.02841496281325817, |
|
"learning_rate": 1.0453064850458987e-05, |
|
"loss": 0.7144, |
|
"step": 33800 |
|
}, |
|
{ |
|
"epoch": 2.1195209130243935, |
|
"eval_loss": 0.8900822997093201, |
|
"eval_runtime": 506.9048, |
|
"eval_samples_per_second": 251.669, |
|
"eval_steps_per_second": 7.865, |
|
"step": 33800 |
|
}, |
|
{ |
|
"epoch": 2.1257916849564182, |
|
"grad_norm": 14.184029579162598, |
|
"learning_rate": 1.0418227107247993e-05, |
|
"loss": 0.7139, |
|
"step": 33900 |
|
}, |
|
{ |
|
"epoch": 2.1257916849564182, |
|
"eval_loss": 0.9535605907440186, |
|
"eval_runtime": 467.6722, |
|
"eval_samples_per_second": 272.781, |
|
"eval_steps_per_second": 8.525, |
|
"step": 33900 |
|
}, |
|
{ |
|
"epoch": 2.132062456888443, |
|
"grad_norm": 247.51730346679688, |
|
"learning_rate": 1.0383389364036998e-05, |
|
"loss": 0.6378, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 2.132062456888443, |
|
"eval_loss": 0.9369811415672302, |
|
"eval_runtime": 467.3096, |
|
"eval_samples_per_second": 272.992, |
|
"eval_steps_per_second": 8.532, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 2.1383332288204677, |
|
"grad_norm": 0.265493243932724, |
|
"learning_rate": 1.0348551620826004e-05, |
|
"loss": 0.7607, |
|
"step": 34100 |
|
}, |
|
{ |
|
"epoch": 2.1383332288204677, |
|
"eval_loss": 0.9209387898445129, |
|
"eval_runtime": 460.3387, |
|
"eval_samples_per_second": 277.126, |
|
"eval_steps_per_second": 8.661, |
|
"step": 34100 |
|
}, |
|
{ |
|
"epoch": 2.1446040007524925, |
|
"grad_norm": 4.44495153427124, |
|
"learning_rate": 1.0313713877615009e-05, |
|
"loss": 0.8667, |
|
"step": 34200 |
|
}, |
|
{ |
|
"epoch": 2.1446040007524925, |
|
"eval_loss": 0.9734475016593933, |
|
"eval_runtime": 472.3123, |
|
"eval_samples_per_second": 270.101, |
|
"eval_steps_per_second": 8.441, |
|
"step": 34200 |
|
}, |
|
{ |
|
"epoch": 2.1508747726845177, |
|
"grad_norm": 1.1490778923034668, |
|
"learning_rate": 1.0278876134404015e-05, |
|
"loss": 0.8533, |
|
"step": 34300 |
|
}, |
|
{ |
|
"epoch": 2.1508747726845177, |
|
"eval_loss": 0.9177405834197998, |
|
"eval_runtime": 481.8576, |
|
"eval_samples_per_second": 264.75, |
|
"eval_steps_per_second": 8.274, |
|
"step": 34300 |
|
}, |
|
{ |
|
"epoch": 2.1571455446165424, |
|
"grad_norm": 6.377614498138428, |
|
"learning_rate": 1.0244038391193018e-05, |
|
"loss": 0.6395, |
|
"step": 34400 |
|
}, |
|
{ |
|
"epoch": 2.1571455446165424, |
|
"eval_loss": 0.9285467863082886, |
|
"eval_runtime": 491.7764, |
|
"eval_samples_per_second": 259.411, |
|
"eval_steps_per_second": 8.107, |
|
"step": 34400 |
|
}, |
|
{ |
|
"epoch": 2.163416316548567, |
|
"grad_norm": 63.10408401489258, |
|
"learning_rate": 1.0209200647982025e-05, |
|
"loss": 0.7377, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 2.163416316548567, |
|
"eval_loss": 0.9046958088874817, |
|
"eval_runtime": 472.1262, |
|
"eval_samples_per_second": 270.207, |
|
"eval_steps_per_second": 8.445, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 2.169687088480592, |
|
"grad_norm": 0.07853188365697861, |
|
"learning_rate": 1.017471128220314e-05, |
|
"loss": 0.7787, |
|
"step": 34600 |
|
}, |
|
{ |
|
"epoch": 2.169687088480592, |
|
"eval_loss": 0.9967793822288513, |
|
"eval_runtime": 488.6457, |
|
"eval_samples_per_second": 261.073, |
|
"eval_steps_per_second": 8.159, |
|
"step": 34600 |
|
}, |
|
{ |
|
"epoch": 2.1759578604126166, |
|
"grad_norm": 44.51852035522461, |
|
"learning_rate": 1.0139873538992144e-05, |
|
"loss": 0.6561, |
|
"step": 34700 |
|
}, |
|
{ |
|
"epoch": 2.1759578604126166, |
|
"eval_loss": 0.9653065800666809, |
|
"eval_runtime": 480.7646, |
|
"eval_samples_per_second": 265.352, |
|
"eval_steps_per_second": 8.293, |
|
"step": 34700 |
|
}, |
|
{ |
|
"epoch": 2.182228632344642, |
|
"grad_norm": 37.319366455078125, |
|
"learning_rate": 1.010503579578115e-05, |
|
"loss": 0.6169, |
|
"step": 34800 |
|
}, |
|
{ |
|
"epoch": 2.182228632344642, |
|
"eval_loss": 0.9403988122940063, |
|
"eval_runtime": 467.6154, |
|
"eval_samples_per_second": 272.814, |
|
"eval_steps_per_second": 8.526, |
|
"step": 34800 |
|
}, |
|
{ |
|
"epoch": 2.1884994042766666, |
|
"grad_norm": 0.24766607582569122, |
|
"learning_rate": 1.0070198052570155e-05, |
|
"loss": 0.7643, |
|
"step": 34900 |
|
}, |
|
{ |
|
"epoch": 2.1884994042766666, |
|
"eval_loss": 0.9397174715995789, |
|
"eval_runtime": 492.9746, |
|
"eval_samples_per_second": 258.78, |
|
"eval_steps_per_second": 8.088, |
|
"step": 34900 |
|
}, |
|
{ |
|
"epoch": 2.1947701762086913, |
|
"grad_norm": 1.6579983234405518, |
|
"learning_rate": 1.0035360309359161e-05, |
|
"loss": 0.998, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 2.1947701762086913, |
|
"eval_loss": 0.9152400493621826, |
|
"eval_runtime": 488.9266, |
|
"eval_samples_per_second": 260.923, |
|
"eval_steps_per_second": 8.155, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 2.201040948140716, |
|
"grad_norm": 14.633705139160156, |
|
"learning_rate": 1.0000522566148166e-05, |
|
"loss": 0.8246, |
|
"step": 35100 |
|
}, |
|
{ |
|
"epoch": 2.201040948140716, |
|
"eval_loss": 0.9512937068939209, |
|
"eval_runtime": 497.3249, |
|
"eval_samples_per_second": 256.516, |
|
"eval_steps_per_second": 8.017, |
|
"step": 35100 |
|
}, |
|
{ |
|
"epoch": 2.207311720072741, |
|
"grad_norm": 162.75132751464844, |
|
"learning_rate": 9.96568482293717e-06, |
|
"loss": 0.6655, |
|
"step": 35200 |
|
}, |
|
{ |
|
"epoch": 2.207311720072741, |
|
"eval_loss": 0.9354454278945923, |
|
"eval_runtime": 479.342, |
|
"eval_samples_per_second": 266.14, |
|
"eval_steps_per_second": 8.318, |
|
"step": 35200 |
|
}, |
|
{ |
|
"epoch": 2.213582492004766, |
|
"grad_norm": 0.0890607163310051, |
|
"learning_rate": 9.930847079726175e-06, |
|
"loss": 0.9279, |
|
"step": 35300 |
|
}, |
|
{ |
|
"epoch": 2.213582492004766, |
|
"eval_loss": 0.9034134745597839, |
|
"eval_runtime": 495.8444, |
|
"eval_samples_per_second": 257.282, |
|
"eval_steps_per_second": 8.041, |
|
"step": 35300 |
|
}, |
|
{ |
|
"epoch": 2.2198532639367907, |
|
"grad_norm": 12.482114791870117, |
|
"learning_rate": 9.896009336515181e-06, |
|
"loss": 0.4239, |
|
"step": 35400 |
|
}, |
|
{ |
|
"epoch": 2.2198532639367907, |
|
"eval_loss": 0.9606735706329346, |
|
"eval_runtime": 472.1675, |
|
"eval_samples_per_second": 270.184, |
|
"eval_steps_per_second": 8.444, |
|
"step": 35400 |
|
}, |
|
{ |
|
"epoch": 2.2261240358688155, |
|
"grad_norm": 193.45916748046875, |
|
"learning_rate": 9.861171593304186e-06, |
|
"loss": 1.0023, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 2.2261240358688155, |
|
"eval_loss": 0.8731982707977295, |
|
"eval_runtime": 502.841, |
|
"eval_samples_per_second": 253.702, |
|
"eval_steps_per_second": 7.929, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 2.23239480780084, |
|
"grad_norm": 25.368621826171875, |
|
"learning_rate": 9.826333850093192e-06, |
|
"loss": 0.7426, |
|
"step": 35600 |
|
}, |
|
{ |
|
"epoch": 2.23239480780084, |
|
"eval_loss": 0.8882994651794434, |
|
"eval_runtime": 489.5037, |
|
"eval_samples_per_second": 260.615, |
|
"eval_steps_per_second": 8.145, |
|
"step": 35600 |
|
}, |
|
{ |
|
"epoch": 2.238665579732865, |
|
"grad_norm": 6.321267127990723, |
|
"learning_rate": 9.791496106882197e-06, |
|
"loss": 0.8675, |
|
"step": 35700 |
|
}, |
|
{ |
|
"epoch": 2.238665579732865, |
|
"eval_loss": 0.9296298027038574, |
|
"eval_runtime": 481.2287, |
|
"eval_samples_per_second": 265.096, |
|
"eval_steps_per_second": 8.285, |
|
"step": 35700 |
|
}, |
|
{ |
|
"epoch": 2.24493635166489, |
|
"grad_norm": 0.16120706498622894, |
|
"learning_rate": 9.756658363671202e-06, |
|
"loss": 0.9226, |
|
"step": 35800 |
|
}, |
|
{ |
|
"epoch": 2.24493635166489, |
|
"eval_loss": 0.9145704507827759, |
|
"eval_runtime": 507.1996, |
|
"eval_samples_per_second": 251.522, |
|
"eval_steps_per_second": 7.861, |
|
"step": 35800 |
|
}, |
|
{ |
|
"epoch": 2.251207123596915, |
|
"grad_norm": 14.761024475097656, |
|
"learning_rate": 9.721820620460208e-06, |
|
"loss": 0.4944, |
|
"step": 35900 |
|
}, |
|
{ |
|
"epoch": 2.251207123596915, |
|
"eval_loss": 0.9145201444625854, |
|
"eval_runtime": 480.7973, |
|
"eval_samples_per_second": 265.334, |
|
"eval_steps_per_second": 8.292, |
|
"step": 35900 |
|
}, |
|
{ |
|
"epoch": 2.2574778955289396, |
|
"grad_norm": 15.988486289978027, |
|
"learning_rate": 9.686982877249213e-06, |
|
"loss": 0.9663, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 2.2574778955289396, |
|
"eval_loss": 0.8893073201179504, |
|
"eval_runtime": 466.9887, |
|
"eval_samples_per_second": 273.18, |
|
"eval_steps_per_second": 8.538, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 2.2637486674609644, |
|
"grad_norm": 8.232684135437012, |
|
"learning_rate": 9.652493511470327e-06, |
|
"loss": 0.6455, |
|
"step": 36100 |
|
}, |
|
{ |
|
"epoch": 2.2637486674609644, |
|
"eval_loss": 0.9238069653511047, |
|
"eval_runtime": 471.8533, |
|
"eval_samples_per_second": 270.364, |
|
"eval_steps_per_second": 8.45, |
|
"step": 36100 |
|
}, |
|
{ |
|
"epoch": 2.270019439392989, |
|
"grad_norm": 0.20196978747844696, |
|
"learning_rate": 9.617655768259333e-06, |
|
"loss": 0.9673, |
|
"step": 36200 |
|
}, |
|
{ |
|
"epoch": 2.270019439392989, |
|
"eval_loss": 0.8942546248435974, |
|
"eval_runtime": 497.8096, |
|
"eval_samples_per_second": 256.267, |
|
"eval_steps_per_second": 8.009, |
|
"step": 36200 |
|
}, |
|
{ |
|
"epoch": 2.2762902113250143, |
|
"grad_norm": 180.26956176757812, |
|
"learning_rate": 9.582818025048338e-06, |
|
"loss": 0.7974, |
|
"step": 36300 |
|
}, |
|
{ |
|
"epoch": 2.2762902113250143, |
|
"eval_loss": 0.8620045185089111, |
|
"eval_runtime": 505.8787, |
|
"eval_samples_per_second": 252.179, |
|
"eval_steps_per_second": 7.881, |
|
"step": 36300 |
|
}, |
|
{ |
|
"epoch": 2.282560983257039, |
|
"grad_norm": 118.34184265136719, |
|
"learning_rate": 9.547980281837343e-06, |
|
"loss": 0.9777, |
|
"step": 36400 |
|
}, |
|
{ |
|
"epoch": 2.282560983257039, |
|
"eval_loss": 0.8812283873558044, |
|
"eval_runtime": 484.5263, |
|
"eval_samples_per_second": 263.292, |
|
"eval_steps_per_second": 8.229, |
|
"step": 36400 |
|
}, |
|
{ |
|
"epoch": 2.288831755189064, |
|
"grad_norm": 1.6557927131652832, |
|
"learning_rate": 9.513142538626349e-06, |
|
"loss": 0.8741, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 2.288831755189064, |
|
"eval_loss": 0.8862267732620239, |
|
"eval_runtime": 499.5387, |
|
"eval_samples_per_second": 255.38, |
|
"eval_steps_per_second": 7.981, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 2.2951025271210885, |
|
"grad_norm": 0.09699351340532303, |
|
"learning_rate": 9.478304795415354e-06, |
|
"loss": 0.9642, |
|
"step": 36600 |
|
}, |
|
{ |
|
"epoch": 2.2951025271210885, |
|
"eval_loss": 0.9157158732414246, |
|
"eval_runtime": 473.3855, |
|
"eval_samples_per_second": 269.489, |
|
"eval_steps_per_second": 8.422, |
|
"step": 36600 |
|
}, |
|
{ |
|
"epoch": 2.3013732990531133, |
|
"grad_norm": 166.3496551513672, |
|
"learning_rate": 9.44346705220436e-06, |
|
"loss": 0.9225, |
|
"step": 36700 |
|
}, |
|
{ |
|
"epoch": 2.3013732990531133, |
|
"eval_loss": 0.8784195780754089, |
|
"eval_runtime": 480.9986, |
|
"eval_samples_per_second": 265.223, |
|
"eval_steps_per_second": 8.289, |
|
"step": 36700 |
|
}, |
|
{ |
|
"epoch": 2.3076440709851385, |
|
"grad_norm": 3.2308545112609863, |
|
"learning_rate": 9.408629308993365e-06, |
|
"loss": 0.6789, |
|
"step": 36800 |
|
}, |
|
{ |
|
"epoch": 2.3076440709851385, |
|
"eval_loss": 0.9065931439399719, |
|
"eval_runtime": 486.2861, |
|
"eval_samples_per_second": 262.339, |
|
"eval_steps_per_second": 8.199, |
|
"step": 36800 |
|
}, |
|
{ |
|
"epoch": 2.313914842917163, |
|
"grad_norm": 117.87212371826172, |
|
"learning_rate": 9.37379156578237e-06, |
|
"loss": 0.6726, |
|
"step": 36900 |
|
}, |
|
{ |
|
"epoch": 2.313914842917163, |
|
"eval_loss": 0.9090869426727295, |
|
"eval_runtime": 479.1369, |
|
"eval_samples_per_second": 266.254, |
|
"eval_steps_per_second": 8.321, |
|
"step": 36900 |
|
}, |
|
{ |
|
"epoch": 2.320185614849188, |
|
"grad_norm": 64.78949737548828, |
|
"learning_rate": 9.338953822571376e-06, |
|
"loss": 0.7326, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 2.320185614849188, |
|
"eval_loss": 0.9202622175216675, |
|
"eval_runtime": 484.0573, |
|
"eval_samples_per_second": 263.547, |
|
"eval_steps_per_second": 8.237, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 2.3264563867812127, |
|
"grad_norm": 259.2130126953125, |
|
"learning_rate": 9.30411607936038e-06, |
|
"loss": 1.007, |
|
"step": 37100 |
|
}, |
|
{ |
|
"epoch": 2.3264563867812127, |
|
"eval_loss": 0.9124699234962463, |
|
"eval_runtime": 494.1141, |
|
"eval_samples_per_second": 258.183, |
|
"eval_steps_per_second": 8.069, |
|
"step": 37100 |
|
}, |
|
{ |
|
"epoch": 2.3327271587132374, |
|
"grad_norm": 3.8969433307647705, |
|
"learning_rate": 9.269278336149385e-06, |
|
"loss": 0.6134, |
|
"step": 37200 |
|
}, |
|
{ |
|
"epoch": 2.3327271587132374, |
|
"eval_loss": 0.8837085366249084, |
|
"eval_runtime": 473.1604, |
|
"eval_samples_per_second": 269.617, |
|
"eval_steps_per_second": 8.426, |
|
"step": 37200 |
|
}, |
|
{ |
|
"epoch": 2.3389979306452626, |
|
"grad_norm": 0.8037031292915344, |
|
"learning_rate": 9.234440592938391e-06, |
|
"loss": 0.9051, |
|
"step": 37300 |
|
}, |
|
{ |
|
"epoch": 2.3389979306452626, |
|
"eval_loss": 0.8945268392562866, |
|
"eval_runtime": 488.4927, |
|
"eval_samples_per_second": 261.154, |
|
"eval_steps_per_second": 8.162, |
|
"step": 37300 |
|
}, |
|
{ |
|
"epoch": 2.3452687025772874, |
|
"grad_norm": 70.98564910888672, |
|
"learning_rate": 9.199602849727396e-06, |
|
"loss": 0.837, |
|
"step": 37400 |
|
}, |
|
{ |
|
"epoch": 2.3452687025772874, |
|
"eval_loss": 0.8740183711051941, |
|
"eval_runtime": 492.9522, |
|
"eval_samples_per_second": 258.792, |
|
"eval_steps_per_second": 8.088, |
|
"step": 37400 |
|
}, |
|
{ |
|
"epoch": 2.351539474509312, |
|
"grad_norm": 1.006698489189148, |
|
"learning_rate": 9.1647651065164e-06, |
|
"loss": 0.7615, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 2.351539474509312, |
|
"eval_loss": 0.916473388671875, |
|
"eval_runtime": 499.2162, |
|
"eval_samples_per_second": 255.545, |
|
"eval_steps_per_second": 7.987, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 2.357810246441337, |
|
"grad_norm": 0.15957336127758026, |
|
"learning_rate": 9.129927363305405e-06, |
|
"loss": 0.8304, |
|
"step": 37600 |
|
}, |
|
{ |
|
"epoch": 2.357810246441337, |
|
"eval_loss": 0.9107189774513245, |
|
"eval_runtime": 494.2784, |
|
"eval_samples_per_second": 258.097, |
|
"eval_steps_per_second": 8.066, |
|
"step": 37600 |
|
}, |
|
{ |
|
"epoch": 2.3640810183733616, |
|
"grad_norm": 0.21330799162387848, |
|
"learning_rate": 9.09508962009441e-06, |
|
"loss": 0.6255, |
|
"step": 37700 |
|
}, |
|
{ |
|
"epoch": 2.3640810183733616, |
|
"eval_loss": 0.8891344666481018, |
|
"eval_runtime": 489.2061, |
|
"eval_samples_per_second": 260.774, |
|
"eval_steps_per_second": 8.15, |
|
"step": 37700 |
|
}, |
|
{ |
|
"epoch": 2.370351790305387, |
|
"grad_norm": 1.2431716918945312, |
|
"learning_rate": 9.060251876883416e-06, |
|
"loss": 0.6775, |
|
"step": 37800 |
|
}, |
|
{ |
|
"epoch": 2.370351790305387, |
|
"eval_loss": 0.8907997608184814, |
|
"eval_runtime": 497.9968, |
|
"eval_samples_per_second": 256.17, |
|
"eval_steps_per_second": 8.006, |
|
"step": 37800 |
|
}, |
|
{ |
|
"epoch": 2.3766225622374115, |
|
"grad_norm": 0.6021884679794312, |
|
"learning_rate": 9.025414133672421e-06, |
|
"loss": 0.7159, |
|
"step": 37900 |
|
}, |
|
{ |
|
"epoch": 2.3766225622374115, |
|
"eval_loss": 0.8589950203895569, |
|
"eval_runtime": 499.2581, |
|
"eval_samples_per_second": 255.523, |
|
"eval_steps_per_second": 7.986, |
|
"step": 37900 |
|
}, |
|
{ |
|
"epoch": 2.3828933341694363, |
|
"grad_norm": 0.35575389862060547, |
|
"learning_rate": 8.990576390461425e-06, |
|
"loss": 0.6422, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 2.3828933341694363, |
|
"eval_loss": 0.8558962941169739, |
|
"eval_runtime": 493.9361, |
|
"eval_samples_per_second": 258.276, |
|
"eval_steps_per_second": 8.072, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 2.389164106101461, |
|
"grad_norm": 0.1820683479309082, |
|
"learning_rate": 8.955738647250432e-06, |
|
"loss": 0.7773, |
|
"step": 38100 |
|
}, |
|
{ |
|
"epoch": 2.389164106101461, |
|
"eval_loss": 0.8600557446479797, |
|
"eval_runtime": 493.6945, |
|
"eval_samples_per_second": 258.403, |
|
"eval_steps_per_second": 8.076, |
|
"step": 38100 |
|
}, |
|
{ |
|
"epoch": 2.3954348780334858, |
|
"grad_norm": 0.02349485270678997, |
|
"learning_rate": 8.920900904039436e-06, |
|
"loss": 0.5457, |
|
"step": 38200 |
|
}, |
|
{ |
|
"epoch": 2.3954348780334858, |
|
"eval_loss": 0.8856033086776733, |
|
"eval_runtime": 499.9609, |
|
"eval_samples_per_second": 255.164, |
|
"eval_steps_per_second": 7.975, |
|
"step": 38200 |
|
}, |
|
{ |
|
"epoch": 2.401705649965511, |
|
"grad_norm": 38.077266693115234, |
|
"learning_rate": 8.886063160828443e-06, |
|
"loss": 0.4997, |
|
"step": 38300 |
|
}, |
|
{ |
|
"epoch": 2.401705649965511, |
|
"eval_loss": 0.8785237669944763, |
|
"eval_runtime": 503.5878, |
|
"eval_samples_per_second": 253.326, |
|
"eval_steps_per_second": 7.917, |
|
"step": 38300 |
|
}, |
|
{ |
|
"epoch": 2.4079764218975357, |
|
"grad_norm": 100.00057983398438, |
|
"learning_rate": 8.851225417617447e-06, |
|
"loss": 0.6319, |
|
"step": 38400 |
|
}, |
|
{ |
|
"epoch": 2.4079764218975357, |
|
"eval_loss": 0.885017454624176, |
|
"eval_runtime": 496.4876, |
|
"eval_samples_per_second": 256.949, |
|
"eval_steps_per_second": 8.03, |
|
"step": 38400 |
|
}, |
|
{ |
|
"epoch": 2.4142471938295604, |
|
"grad_norm": 71.8719253540039, |
|
"learning_rate": 8.816387674406452e-06, |
|
"loss": 0.7096, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 2.4142471938295604, |
|
"eval_loss": 0.823376476764679, |
|
"eval_runtime": 500.2741, |
|
"eval_samples_per_second": 255.004, |
|
"eval_steps_per_second": 7.97, |
|
"step": 38500 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 63788, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|