|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.8930602957906713, |
|
"eval_steps": 100000, |
|
"global_step": 5200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 8.332918167114258, |
|
"learning_rate": 9.999e-07, |
|
"loss": 0.256, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 6.65649938583374, |
|
"learning_rate": 9.998e-07, |
|
"loss": 0.174, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 5.654027462005615, |
|
"learning_rate": 9.997e-07, |
|
"loss": 0.1674, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 6.793820858001709, |
|
"learning_rate": 9.996e-07, |
|
"loss": 0.1538, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.857839107513428, |
|
"learning_rate": 9.995e-07, |
|
"loss": 0.1507, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 6.979122638702393, |
|
"learning_rate": 9.994e-07, |
|
"loss": 0.1413, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 3.4787421226501465, |
|
"learning_rate": 9.993e-07, |
|
"loss": 0.1426, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 5.989820957183838, |
|
"learning_rate": 9.992e-07, |
|
"loss": 0.139, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 4.455531597137451, |
|
"learning_rate": 9.990999999999999e-07, |
|
"loss": 0.145, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 3.2097902297973633, |
|
"learning_rate": 9.989999999999999e-07, |
|
"loss": 0.1349, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 4.64670467376709, |
|
"learning_rate": 9.988999999999999e-07, |
|
"loss": 0.1394, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 6.975039958953857, |
|
"learning_rate": 9.988e-07, |
|
"loss": 0.1346, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 7.464960098266602, |
|
"learning_rate": 9.987e-07, |
|
"loss": 0.1314, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 4.806921482086182, |
|
"learning_rate": 9.986e-07, |
|
"loss": 0.131, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 6.7189531326293945, |
|
"learning_rate": 9.985e-07, |
|
"loss": 0.1262, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 5.656557559967041, |
|
"learning_rate": 9.983999999999998e-07, |
|
"loss": 0.1223, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 4.686679363250732, |
|
"learning_rate": 9.982999999999998e-07, |
|
"loss": 0.1247, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 5.541558265686035, |
|
"learning_rate": 9.982e-07, |
|
"loss": 0.1269, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 6.699037551879883, |
|
"learning_rate": 9.981e-07, |
|
"loss": 0.1212, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 7.294734001159668, |
|
"learning_rate": 9.98e-07, |
|
"loss": 0.1288, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 6.782406806945801, |
|
"learning_rate": 9.979e-07, |
|
"loss": 0.1282, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 7.686770439147949, |
|
"learning_rate": 9.978e-07, |
|
"loss": 0.1252, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 5.651573181152344, |
|
"learning_rate": 9.977e-07, |
|
"loss": 0.128, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 3.8114349842071533, |
|
"learning_rate": 9.976e-07, |
|
"loss": 0.1189, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 4.4628777503967285, |
|
"learning_rate": 9.975e-07, |
|
"loss": 0.1227, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 3.973808526992798, |
|
"learning_rate": 9.974e-07, |
|
"loss": 0.1225, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 7.216485977172852, |
|
"learning_rate": 9.973e-07, |
|
"loss": 0.1171, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 4.588983058929443, |
|
"learning_rate": 9.972e-07, |
|
"loss": 0.1152, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 2.4740259647369385, |
|
"learning_rate": 9.971e-07, |
|
"loss": 0.096, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 4.7505903244018555, |
|
"learning_rate": 9.97e-07, |
|
"loss": 0.0986, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 5.158182621002197, |
|
"learning_rate": 9.969e-07, |
|
"loss": 0.1006, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 6.2976202964782715, |
|
"learning_rate": 9.968e-07, |
|
"loss": 0.0979, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 5.02843713760376, |
|
"learning_rate": 9.967e-07, |
|
"loss": 0.0923, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 5.816647052764893, |
|
"learning_rate": 9.966e-07, |
|
"loss": 0.1005, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 5.387178421020508, |
|
"learning_rate": 9.965e-07, |
|
"loss": 0.0992, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 5.27618932723999, |
|
"learning_rate": 9.964e-07, |
|
"loss": 0.0945, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 2.8429291248321533, |
|
"learning_rate": 9.962999999999999e-07, |
|
"loss": 0.0999, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 3.8993031978607178, |
|
"learning_rate": 9.961999999999999e-07, |
|
"loss": 0.0946, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 4.275607585906982, |
|
"learning_rate": 9.960999999999999e-07, |
|
"loss": 0.0943, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 4.941762447357178, |
|
"learning_rate": 9.959999999999999e-07, |
|
"loss": 0.1011, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 3.800781726837158, |
|
"learning_rate": 9.958999999999999e-07, |
|
"loss": 0.096, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 3.8520452976226807, |
|
"learning_rate": 9.958e-07, |
|
"loss": 0.0986, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 8.225783348083496, |
|
"learning_rate": 9.957e-07, |
|
"loss": 0.0935, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 3.4622890949249268, |
|
"learning_rate": 9.956e-07, |
|
"loss": 0.0964, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 4.632036209106445, |
|
"learning_rate": 9.955e-07, |
|
"loss": 0.0899, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 4.176944732666016, |
|
"learning_rate": 9.953999999999998e-07, |
|
"loss": 0.0947, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 4.445220947265625, |
|
"learning_rate": 9.952999999999998e-07, |
|
"loss": 0.0882, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 4.21484375, |
|
"learning_rate": 9.952e-07, |
|
"loss": 0.0987, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 7.656230449676514, |
|
"learning_rate": 9.951e-07, |
|
"loss": 0.0995, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 4.51482629776001, |
|
"learning_rate": 9.95e-07, |
|
"loss": 0.092, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 4.313094139099121, |
|
"learning_rate": 9.949e-07, |
|
"loss": 0.0935, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 5.037979602813721, |
|
"learning_rate": 9.948e-07, |
|
"loss": 0.0998, |
|
"step": 5200 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 1000000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 365, |
|
"save_steps": 200, |
|
"total_flos": 3.542617035836621e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|