|
{ |
|
"best_metric": 1.7627220153808594, |
|
"best_model_checkpoint": "/home/sr2464/scratch/C2S_Files/C2S_training_runs/multicell_v2_pretraining_runs/finetuning-EleutherAI/pythia-410m-multicell_v2_pretraining-2024-07-28_13-55-51/checkpoint-7600", |
|
"epoch": 0.02637866105155987, |
|
"eval_steps": 100, |
|
"global_step": 7600, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00034708764541526143, |
|
"grad_norm": 11.82771110534668, |
|
"learning_rate": 3.470776065528252e-07, |
|
"loss": 4.0944, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.00034708764541526143, |
|
"eval_loss": 3.7496891021728516, |
|
"eval_runtime": 24.9132, |
|
"eval_samples_per_second": 17.621, |
|
"eval_steps_per_second": 0.401, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0006941752908305229, |
|
"grad_norm": 6.635104656219482, |
|
"learning_rate": 6.941552131056504e-07, |
|
"loss": 3.4557, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0006941752908305229, |
|
"eval_loss": 3.1241986751556396, |
|
"eval_runtime": 24.3825, |
|
"eval_samples_per_second": 18.005, |
|
"eval_steps_per_second": 0.41, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0010412629362457843, |
|
"grad_norm": 5.158722400665283, |
|
"learning_rate": 1.0412328196584756e-06, |
|
"loss": 2.9713, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0010412629362457843, |
|
"eval_loss": 2.807060956954956, |
|
"eval_runtime": 24.5493, |
|
"eval_samples_per_second": 17.882, |
|
"eval_steps_per_second": 0.407, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0013883505816610457, |
|
"grad_norm": 4.5594987869262695, |
|
"learning_rate": 1.3883104262113009e-06, |
|
"loss": 2.7438, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.0013883505816610457, |
|
"eval_loss": 2.6474108695983887, |
|
"eval_runtime": 24.8652, |
|
"eval_samples_per_second": 17.655, |
|
"eval_steps_per_second": 0.402, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.0017354382270763071, |
|
"grad_norm": 4.926589488983154, |
|
"learning_rate": 1.7353880327641261e-06, |
|
"loss": 2.6108, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.0017354382270763071, |
|
"eval_loss": 2.5452585220336914, |
|
"eval_runtime": 24.5027, |
|
"eval_samples_per_second": 17.916, |
|
"eval_steps_per_second": 0.408, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.0020825258724915686, |
|
"grad_norm": 4.601032733917236, |
|
"learning_rate": 2.082465639316951e-06, |
|
"loss": 2.5207, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.0020825258724915686, |
|
"eval_loss": 2.4688005447387695, |
|
"eval_runtime": 24.9309, |
|
"eval_samples_per_second": 17.609, |
|
"eval_steps_per_second": 0.401, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.00242961351790683, |
|
"grad_norm": 7.157230377197266, |
|
"learning_rate": 2.429543245869777e-06, |
|
"loss": 2.4464, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.00242961351790683, |
|
"eval_loss": 2.404836416244507, |
|
"eval_runtime": 24.5241, |
|
"eval_samples_per_second": 17.901, |
|
"eval_steps_per_second": 0.408, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.0027767011633220914, |
|
"grad_norm": 5.524389266967773, |
|
"learning_rate": 2.7766208524226017e-06, |
|
"loss": 2.3814, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.0027767011633220914, |
|
"eval_loss": 2.342756748199463, |
|
"eval_runtime": 24.889, |
|
"eval_samples_per_second": 17.638, |
|
"eval_steps_per_second": 0.402, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.003123788808737353, |
|
"grad_norm": 9.947836875915527, |
|
"learning_rate": 3.123698458975427e-06, |
|
"loss": 2.3277, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.003123788808737353, |
|
"eval_loss": 2.2919700145721436, |
|
"eval_runtime": 24.3758, |
|
"eval_samples_per_second": 18.01, |
|
"eval_steps_per_second": 0.41, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.0034708764541526143, |
|
"grad_norm": 3.7194149494171143, |
|
"learning_rate": 3.4707760655282523e-06, |
|
"loss": 2.2745, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.0034708764541526143, |
|
"eval_loss": 2.2262797355651855, |
|
"eval_runtime": 24.8001, |
|
"eval_samples_per_second": 17.702, |
|
"eval_steps_per_second": 0.403, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.0038179640995678757, |
|
"grad_norm": 3.318101167678833, |
|
"learning_rate": 3.817853672081077e-06, |
|
"loss": 2.1989, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.0038179640995678757, |
|
"eval_loss": 2.1580822467803955, |
|
"eval_runtime": 24.9395, |
|
"eval_samples_per_second": 17.603, |
|
"eval_steps_per_second": 0.401, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.004165051744983137, |
|
"grad_norm": 3.5991110801696777, |
|
"learning_rate": 4.164931278633902e-06, |
|
"loss": 2.1452, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.004165051744983137, |
|
"eval_loss": 2.103646755218506, |
|
"eval_runtime": 24.7561, |
|
"eval_samples_per_second": 17.733, |
|
"eval_steps_per_second": 0.404, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.004512139390398399, |
|
"grad_norm": 4.609652519226074, |
|
"learning_rate": 4.5120088851867285e-06, |
|
"loss": 2.0965, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.004512139390398399, |
|
"eval_loss": 2.0649664402008057, |
|
"eval_runtime": 24.3571, |
|
"eval_samples_per_second": 18.023, |
|
"eval_steps_per_second": 0.411, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.00485922703581366, |
|
"grad_norm": 2.850724697113037, |
|
"learning_rate": 4.859086491739554e-06, |
|
"loss": 2.0605, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.00485922703581366, |
|
"eval_loss": 2.035384178161621, |
|
"eval_runtime": 24.7396, |
|
"eval_samples_per_second": 17.745, |
|
"eval_steps_per_second": 0.404, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.0052063146812289214, |
|
"grad_norm": 2.408285140991211, |
|
"learning_rate": 5.206164098292378e-06, |
|
"loss": 2.0304, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.0052063146812289214, |
|
"eval_loss": 2.007747173309326, |
|
"eval_runtime": 24.3422, |
|
"eval_samples_per_second": 18.035, |
|
"eval_steps_per_second": 0.411, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.005553402326644183, |
|
"grad_norm": 2.4324307441711426, |
|
"learning_rate": 5.5532417048452035e-06, |
|
"loss": 2.0107, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.005553402326644183, |
|
"eval_loss": 1.9913328886032104, |
|
"eval_runtime": 24.7711, |
|
"eval_samples_per_second": 17.722, |
|
"eval_steps_per_second": 0.404, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.005900489972059444, |
|
"grad_norm": 2.5259335041046143, |
|
"learning_rate": 5.900319311398029e-06, |
|
"loss": 1.9932, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.005900489972059444, |
|
"eval_loss": 1.9759337902069092, |
|
"eval_runtime": 24.3525, |
|
"eval_samples_per_second": 18.027, |
|
"eval_steps_per_second": 0.411, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.006247577617474706, |
|
"grad_norm": 2.625298500061035, |
|
"learning_rate": 6.247396917950854e-06, |
|
"loss": 1.9784, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.006247577617474706, |
|
"eval_loss": 1.9598480463027954, |
|
"eval_runtime": 24.7491, |
|
"eval_samples_per_second": 17.738, |
|
"eval_steps_per_second": 0.404, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.006594665262889967, |
|
"grad_norm": 3.4950461387634277, |
|
"learning_rate": 6.59447452450368e-06, |
|
"loss": 1.962, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.006594665262889967, |
|
"eval_loss": 1.9490702152252197, |
|
"eval_runtime": 24.3735, |
|
"eval_samples_per_second": 18.011, |
|
"eval_steps_per_second": 0.41, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.006941752908305229, |
|
"grad_norm": 2.0300960540771484, |
|
"learning_rate": 6.9415521310565046e-06, |
|
"loss": 1.9562, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.006941752908305229, |
|
"eval_loss": 1.9372752904891968, |
|
"eval_runtime": 24.82, |
|
"eval_samples_per_second": 17.687, |
|
"eval_steps_per_second": 0.403, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.00728884055372049, |
|
"grad_norm": 2.2911906242370605, |
|
"learning_rate": 7.28862973760933e-06, |
|
"loss": 1.9433, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.00728884055372049, |
|
"eval_loss": 1.9285736083984375, |
|
"eval_runtime": 24.3516, |
|
"eval_samples_per_second": 18.028, |
|
"eval_steps_per_second": 0.411, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.0076359281991357515, |
|
"grad_norm": 2.4398937225341797, |
|
"learning_rate": 7.635707344162154e-06, |
|
"loss": 1.9336, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.0076359281991357515, |
|
"eval_loss": 1.917083978652954, |
|
"eval_runtime": 24.7637, |
|
"eval_samples_per_second": 17.728, |
|
"eval_steps_per_second": 0.404, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.007983015844551014, |
|
"grad_norm": 2.05511474609375, |
|
"learning_rate": 7.982784950714981e-06, |
|
"loss": 1.9206, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.007983015844551014, |
|
"eval_loss": 1.9095990657806396, |
|
"eval_runtime": 24.3373, |
|
"eval_samples_per_second": 18.038, |
|
"eval_steps_per_second": 0.411, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.008330103489966274, |
|
"grad_norm": 2.751636266708374, |
|
"learning_rate": 8.329862557267805e-06, |
|
"loss": 1.9171, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.008330103489966274, |
|
"eval_loss": 1.9016169309616089, |
|
"eval_runtime": 24.8006, |
|
"eval_samples_per_second": 17.701, |
|
"eval_steps_per_second": 0.403, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.008677191135381537, |
|
"grad_norm": 2.362074375152588, |
|
"learning_rate": 8.67694016382063e-06, |
|
"loss": 1.9019, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.008677191135381537, |
|
"eval_loss": 1.8958009481430054, |
|
"eval_runtime": 24.3568, |
|
"eval_samples_per_second": 18.024, |
|
"eval_steps_per_second": 0.411, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.009024278780796797, |
|
"grad_norm": 2.2177438735961914, |
|
"learning_rate": 9.024017770373457e-06, |
|
"loss": 1.9017, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.009024278780796797, |
|
"eval_loss": 1.887508749961853, |
|
"eval_runtime": 24.7322, |
|
"eval_samples_per_second": 17.75, |
|
"eval_steps_per_second": 0.404, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.00937136642621206, |
|
"grad_norm": 1.9819880723953247, |
|
"learning_rate": 9.37109537692628e-06, |
|
"loss": 1.8884, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.00937136642621206, |
|
"eval_loss": 1.883723258972168, |
|
"eval_runtime": 24.3493, |
|
"eval_samples_per_second": 18.029, |
|
"eval_steps_per_second": 0.411, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.00971845407162732, |
|
"grad_norm": 1.854220986366272, |
|
"learning_rate": 9.718172983479108e-06, |
|
"loss": 1.8849, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.00971845407162732, |
|
"eval_loss": 1.8750510215759277, |
|
"eval_runtime": 24.761, |
|
"eval_samples_per_second": 17.729, |
|
"eval_steps_per_second": 0.404, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.010065541717042582, |
|
"grad_norm": 1.6808480024337769, |
|
"learning_rate": 1.0065250590031931e-05, |
|
"loss": 1.8835, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.010065541717042582, |
|
"eval_loss": 1.8682337999343872, |
|
"eval_runtime": 24.3525, |
|
"eval_samples_per_second": 18.027, |
|
"eval_steps_per_second": 0.411, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.010412629362457843, |
|
"grad_norm": 2.1851375102996826, |
|
"learning_rate": 1.0412328196584756e-05, |
|
"loss": 1.8679, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.010412629362457843, |
|
"eval_loss": 1.8666459321975708, |
|
"eval_runtime": 24.7696, |
|
"eval_samples_per_second": 17.723, |
|
"eval_steps_per_second": 0.404, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.010759717007873105, |
|
"grad_norm": 2.1571567058563232, |
|
"learning_rate": 1.0759405803137582e-05, |
|
"loss": 1.8684, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.010759717007873105, |
|
"eval_loss": 1.8589407205581665, |
|
"eval_runtime": 24.3459, |
|
"eval_samples_per_second": 18.032, |
|
"eval_steps_per_second": 0.411, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.011106804653288366, |
|
"grad_norm": 1.878134846687317, |
|
"learning_rate": 1.1106483409690407e-05, |
|
"loss": 1.8671, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.011106804653288366, |
|
"eval_loss": 1.8569821119308472, |
|
"eval_runtime": 24.8145, |
|
"eval_samples_per_second": 17.691, |
|
"eval_steps_per_second": 0.403, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.011453892298703628, |
|
"grad_norm": 1.3475431203842163, |
|
"learning_rate": 1.1453561016243232e-05, |
|
"loss": 1.8574, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.011453892298703628, |
|
"eval_loss": 1.8536343574523926, |
|
"eval_runtime": 24.3402, |
|
"eval_samples_per_second": 18.036, |
|
"eval_steps_per_second": 0.411, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.011800979944118889, |
|
"grad_norm": 1.7545586824417114, |
|
"learning_rate": 1.1800638622796058e-05, |
|
"loss": 1.8544, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.011800979944118889, |
|
"eval_loss": 1.848755955696106, |
|
"eval_runtime": 24.8444, |
|
"eval_samples_per_second": 17.67, |
|
"eval_steps_per_second": 0.403, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.012148067589534151, |
|
"grad_norm": 1.378000020980835, |
|
"learning_rate": 1.2147716229348883e-05, |
|
"loss": 1.8531, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.012148067589534151, |
|
"eval_loss": 1.8428057432174683, |
|
"eval_runtime": 24.3454, |
|
"eval_samples_per_second": 18.032, |
|
"eval_steps_per_second": 0.411, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.012495155234949411, |
|
"grad_norm": 1.6506402492523193, |
|
"learning_rate": 1.2494793835901708e-05, |
|
"loss": 1.8481, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.012495155234949411, |
|
"eval_loss": 1.8448010683059692, |
|
"eval_runtime": 24.8196, |
|
"eval_samples_per_second": 17.688, |
|
"eval_steps_per_second": 0.403, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.012842242880364674, |
|
"grad_norm": 1.8429005146026611, |
|
"learning_rate": 1.2841871442454533e-05, |
|
"loss": 1.8502, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.012842242880364674, |
|
"eval_loss": 1.8411270380020142, |
|
"eval_runtime": 24.3415, |
|
"eval_samples_per_second": 18.035, |
|
"eval_steps_per_second": 0.411, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.013189330525779934, |
|
"grad_norm": 1.3569915294647217, |
|
"learning_rate": 1.318894904900736e-05, |
|
"loss": 1.8439, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.013189330525779934, |
|
"eval_loss": 1.8359909057617188, |
|
"eval_runtime": 24.808, |
|
"eval_samples_per_second": 17.696, |
|
"eval_steps_per_second": 0.403, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.013536418171195197, |
|
"grad_norm": 1.4549897909164429, |
|
"learning_rate": 1.3536026655560182e-05, |
|
"loss": 1.8377, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.013536418171195197, |
|
"eval_loss": 1.8364492654800415, |
|
"eval_runtime": 24.3193, |
|
"eval_samples_per_second": 18.051, |
|
"eval_steps_per_second": 0.411, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.013883505816610457, |
|
"grad_norm": 1.493501901626587, |
|
"learning_rate": 1.3883104262113009e-05, |
|
"loss": 1.8449, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.013883505816610457, |
|
"eval_loss": 1.8301156759262085, |
|
"eval_runtime": 24.8527, |
|
"eval_samples_per_second": 17.664, |
|
"eval_steps_per_second": 0.402, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.01423059346202572, |
|
"grad_norm": 1.4112632274627686, |
|
"learning_rate": 1.4230181868665834e-05, |
|
"loss": 1.8362, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.01423059346202572, |
|
"eval_loss": 1.827904462814331, |
|
"eval_runtime": 24.3918, |
|
"eval_samples_per_second": 17.998, |
|
"eval_steps_per_second": 0.41, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.01457768110744098, |
|
"grad_norm": 1.425305962562561, |
|
"learning_rate": 1.457725947521866e-05, |
|
"loss": 1.834, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.01457768110744098, |
|
"eval_loss": 1.8238531351089478, |
|
"eval_runtime": 25.0327, |
|
"eval_samples_per_second": 17.537, |
|
"eval_steps_per_second": 0.399, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.014924768752856242, |
|
"grad_norm": 1.3402836322784424, |
|
"learning_rate": 1.4924337081771487e-05, |
|
"loss": 1.8272, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.014924768752856242, |
|
"eval_loss": 1.8218436241149902, |
|
"eval_runtime": 24.3405, |
|
"eval_samples_per_second": 18.036, |
|
"eval_steps_per_second": 0.411, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.015271856398271503, |
|
"grad_norm": 1.3900611400604248, |
|
"learning_rate": 1.527141468832431e-05, |
|
"loss": 1.8267, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.015271856398271503, |
|
"eval_loss": 1.8179056644439697, |
|
"eval_runtime": 24.8888, |
|
"eval_samples_per_second": 17.638, |
|
"eval_steps_per_second": 0.402, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.015618944043686765, |
|
"grad_norm": 1.8727518320083618, |
|
"learning_rate": 1.5618492294877134e-05, |
|
"loss": 1.8164, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.015618944043686765, |
|
"eval_loss": 1.8159282207489014, |
|
"eval_runtime": 24.4165, |
|
"eval_samples_per_second": 17.98, |
|
"eval_steps_per_second": 0.41, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.015966031689102027, |
|
"grad_norm": 1.3085731267929077, |
|
"learning_rate": 1.5965569901429962e-05, |
|
"loss": 1.8236, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.015966031689102027, |
|
"eval_loss": 1.8137013912200928, |
|
"eval_runtime": 24.8538, |
|
"eval_samples_per_second": 17.663, |
|
"eval_steps_per_second": 0.402, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.016313119334517286, |
|
"grad_norm": 1.1558576822280884, |
|
"learning_rate": 1.6312647507982788e-05, |
|
"loss": 1.8186, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.016313119334517286, |
|
"eval_loss": 1.8110047578811646, |
|
"eval_runtime": 24.775, |
|
"eval_samples_per_second": 17.719, |
|
"eval_steps_per_second": 0.404, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.01666020697993255, |
|
"grad_norm": 1.2725753784179688, |
|
"learning_rate": 1.665972511453561e-05, |
|
"loss": 1.8142, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.01666020697993255, |
|
"eval_loss": 1.8090318441390991, |
|
"eval_runtime": 24.7529, |
|
"eval_samples_per_second": 17.735, |
|
"eval_steps_per_second": 0.404, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.01700729462534781, |
|
"grad_norm": 1.2328158617019653, |
|
"learning_rate": 1.7006802721088435e-05, |
|
"loss": 1.814, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.01700729462534781, |
|
"eval_loss": 1.8059718608856201, |
|
"eval_runtime": 24.4992, |
|
"eval_samples_per_second": 17.919, |
|
"eval_steps_per_second": 0.408, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.017354382270763073, |
|
"grad_norm": 1.2142012119293213, |
|
"learning_rate": 1.735388032764126e-05, |
|
"loss": 1.8106, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.017354382270763073, |
|
"eval_loss": 1.8046983480453491, |
|
"eval_runtime": 24.8108, |
|
"eval_samples_per_second": 17.694, |
|
"eval_steps_per_second": 0.403, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.017701469916178332, |
|
"grad_norm": 1.129823923110962, |
|
"learning_rate": 1.770095793419409e-05, |
|
"loss": 1.8103, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.017701469916178332, |
|
"eval_loss": 1.8018277883529663, |
|
"eval_runtime": 24.402, |
|
"eval_samples_per_second": 17.99, |
|
"eval_steps_per_second": 0.41, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.018048557561593594, |
|
"grad_norm": 1.4417102336883545, |
|
"learning_rate": 1.8048035540746914e-05, |
|
"loss": 1.809, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.018048557561593594, |
|
"eval_loss": 1.7983735799789429, |
|
"eval_runtime": 24.8186, |
|
"eval_samples_per_second": 17.688, |
|
"eval_steps_per_second": 0.403, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.018395645207008857, |
|
"grad_norm": 1.1981695890426636, |
|
"learning_rate": 1.8395113147299736e-05, |
|
"loss": 1.8025, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.018395645207008857, |
|
"eval_loss": 1.7986949682235718, |
|
"eval_runtime": 24.3774, |
|
"eval_samples_per_second": 18.009, |
|
"eval_steps_per_second": 0.41, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.01874273285242412, |
|
"grad_norm": 1.0044975280761719, |
|
"learning_rate": 1.874219075385256e-05, |
|
"loss": 1.8037, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.01874273285242412, |
|
"eval_loss": 1.795265793800354, |
|
"eval_runtime": 24.8161, |
|
"eval_samples_per_second": 17.69, |
|
"eval_steps_per_second": 0.403, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.019089820497839378, |
|
"grad_norm": 1.0582355260849, |
|
"learning_rate": 1.9089268360405386e-05, |
|
"loss": 1.8043, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.019089820497839378, |
|
"eval_loss": 1.7948534488677979, |
|
"eval_runtime": 24.3738, |
|
"eval_samples_per_second": 18.011, |
|
"eval_steps_per_second": 0.41, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.01943690814325464, |
|
"grad_norm": 0.9827861189842224, |
|
"learning_rate": 1.9436345966958215e-05, |
|
"loss": 1.7944, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.01943690814325464, |
|
"eval_loss": 1.7950631380081177, |
|
"eval_runtime": 24.8182, |
|
"eval_samples_per_second": 17.689, |
|
"eval_steps_per_second": 0.403, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.019783995788669902, |
|
"grad_norm": 0.9715519547462463, |
|
"learning_rate": 1.9783423573511037e-05, |
|
"loss": 1.8002, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.019783995788669902, |
|
"eval_loss": 1.7902957201004028, |
|
"eval_runtime": 24.6536, |
|
"eval_samples_per_second": 17.807, |
|
"eval_steps_per_second": 0.406, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.020131083434085165, |
|
"grad_norm": 1.1996351480484009, |
|
"learning_rate": 2.0130501180063862e-05, |
|
"loss": 1.7982, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.020131083434085165, |
|
"eval_loss": 1.7865217924118042, |
|
"eval_runtime": 24.7792, |
|
"eval_samples_per_second": 17.716, |
|
"eval_steps_per_second": 0.404, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.020478171079500423, |
|
"grad_norm": 0.9313274621963501, |
|
"learning_rate": 2.0477578786616688e-05, |
|
"loss": 1.7917, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.020478171079500423, |
|
"eval_loss": 1.7852766513824463, |
|
"eval_runtime": 24.3762, |
|
"eval_samples_per_second": 18.009, |
|
"eval_steps_per_second": 0.41, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.020825258724915686, |
|
"grad_norm": 1.2815057039260864, |
|
"learning_rate": 2.0824656393169513e-05, |
|
"loss": 1.7926, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.020825258724915686, |
|
"eval_loss": 1.7834093570709229, |
|
"eval_runtime": 24.8731, |
|
"eval_samples_per_second": 17.65, |
|
"eval_steps_per_second": 0.402, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.021172346370330948, |
|
"grad_norm": 1.3323718309402466, |
|
"learning_rate": 2.117173399972234e-05, |
|
"loss": 1.7909, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.021172346370330948, |
|
"eval_loss": 1.7828959226608276, |
|
"eval_runtime": 24.4144, |
|
"eval_samples_per_second": 17.981, |
|
"eval_steps_per_second": 0.41, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.02151943401574621, |
|
"grad_norm": 1.2279607057571411, |
|
"learning_rate": 2.1518811606275163e-05, |
|
"loss": 1.7903, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.02151943401574621, |
|
"eval_loss": 1.7831159830093384, |
|
"eval_runtime": 24.8211, |
|
"eval_samples_per_second": 17.687, |
|
"eval_steps_per_second": 0.403, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.021866521661161473, |
|
"grad_norm": 0.9246248602867126, |
|
"learning_rate": 2.186588921282799e-05, |
|
"loss": 1.7923, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.021866521661161473, |
|
"eval_loss": 1.7781379222869873, |
|
"eval_runtime": 24.4126, |
|
"eval_samples_per_second": 17.983, |
|
"eval_steps_per_second": 0.41, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.02221360930657673, |
|
"grad_norm": 1.3288153409957886, |
|
"learning_rate": 2.2212966819380814e-05, |
|
"loss": 1.7866, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.02221360930657673, |
|
"eval_loss": 1.781212568283081, |
|
"eval_runtime": 24.7848, |
|
"eval_samples_per_second": 17.712, |
|
"eval_steps_per_second": 0.403, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.022560696951991994, |
|
"grad_norm": 0.9484734535217285, |
|
"learning_rate": 2.256004442593364e-05, |
|
"loss": 1.7852, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.022560696951991994, |
|
"eval_loss": 1.7785717248916626, |
|
"eval_runtime": 24.4116, |
|
"eval_samples_per_second": 17.983, |
|
"eval_steps_per_second": 0.41, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.022907784597407256, |
|
"grad_norm": 1.7850075960159302, |
|
"learning_rate": 2.2907122032486464e-05, |
|
"loss": 1.7838, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.022907784597407256, |
|
"eval_loss": 1.7806742191314697, |
|
"eval_runtime": 24.8097, |
|
"eval_samples_per_second": 17.695, |
|
"eval_steps_per_second": 0.403, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.02325487224282252, |
|
"grad_norm": 0.8073210120201111, |
|
"learning_rate": 2.325419963903929e-05, |
|
"loss": 1.7817, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.02325487224282252, |
|
"eval_loss": 1.7764739990234375, |
|
"eval_runtime": 24.3455, |
|
"eval_samples_per_second": 18.032, |
|
"eval_steps_per_second": 0.411, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.023601959888237777, |
|
"grad_norm": 1.089603304862976, |
|
"learning_rate": 2.3601277245592115e-05, |
|
"loss": 1.78, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.023601959888237777, |
|
"eval_loss": 1.7742102146148682, |
|
"eval_runtime": 24.8252, |
|
"eval_samples_per_second": 17.684, |
|
"eval_steps_per_second": 0.403, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.02394904753365304, |
|
"grad_norm": 0.9081795811653137, |
|
"learning_rate": 2.394835485214494e-05, |
|
"loss": 1.7801, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.02394904753365304, |
|
"eval_loss": 1.77180814743042, |
|
"eval_runtime": 24.5243, |
|
"eval_samples_per_second": 17.901, |
|
"eval_steps_per_second": 0.408, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.024296135179068302, |
|
"grad_norm": 0.760989248752594, |
|
"learning_rate": 2.4295432458697766e-05, |
|
"loss": 1.782, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.024296135179068302, |
|
"eval_loss": 1.7700749635696411, |
|
"eval_runtime": 24.8764, |
|
"eval_samples_per_second": 17.647, |
|
"eval_steps_per_second": 0.402, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.024643222824483564, |
|
"grad_norm": 1.12177312374115, |
|
"learning_rate": 2.464251006525059e-05, |
|
"loss": 1.7819, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.024643222824483564, |
|
"eval_loss": 1.7722524404525757, |
|
"eval_runtime": 24.4905, |
|
"eval_samples_per_second": 17.925, |
|
"eval_steps_per_second": 0.408, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.024990310469898823, |
|
"grad_norm": 39.85576629638672, |
|
"learning_rate": 2.4989587671803416e-05, |
|
"loss": 1.7812, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.024990310469898823, |
|
"eval_loss": 1.7866556644439697, |
|
"eval_runtime": 24.8928, |
|
"eval_samples_per_second": 17.636, |
|
"eval_steps_per_second": 0.402, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.025337398115314085, |
|
"grad_norm": 1.0117896795272827, |
|
"learning_rate": 2.533666527835624e-05, |
|
"loss": 1.7824, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.025337398115314085, |
|
"eval_loss": 1.7690812349319458, |
|
"eval_runtime": 17.2093, |
|
"eval_samples_per_second": 25.509, |
|
"eval_steps_per_second": 0.581, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.025684485760729348, |
|
"grad_norm": 1.1257307529449463, |
|
"learning_rate": 2.5683742884909067e-05, |
|
"loss": 1.7754, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.025684485760729348, |
|
"eval_loss": 1.7678226232528687, |
|
"eval_runtime": 17.576, |
|
"eval_samples_per_second": 24.977, |
|
"eval_steps_per_second": 0.569, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.02603157340614461, |
|
"grad_norm": 0.8239357471466064, |
|
"learning_rate": 2.6030820491461895e-05, |
|
"loss": 1.7755, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.02603157340614461, |
|
"eval_loss": 1.7637029886245728, |
|
"eval_runtime": 16.9836, |
|
"eval_samples_per_second": 25.848, |
|
"eval_steps_per_second": 0.589, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.02637866105155987, |
|
"grad_norm": 0.7548935413360596, |
|
"learning_rate": 2.637789809801472e-05, |
|
"loss": 1.7684, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.02637866105155987, |
|
"eval_loss": 1.7627220153808594, |
|
"eval_runtime": 17.2269, |
|
"eval_samples_per_second": 25.483, |
|
"eval_steps_per_second": 0.58, |
|
"step": 7600 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 288111, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.586067803209728e+18, |
|
"train_batch_size": 48, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|