{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.6149993409779886,
  "eval_steps": 100,
  "global_step": 3100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08,
      "grad_norm": 6.90625,
      "learning_rate": 3e-06,
      "loss": 3.4225,
      "step": 100
    },
    {
      "epoch": 0.08,
      "eval_loss": 3.0902585983276367,
      "eval_runtime": 125.4074,
      "eval_samples_per_second": 67.221,
      "eval_steps_per_second": 33.61,
      "step": 100
    },
    {
      "epoch": 0.17,
      "grad_norm": 3.3125,
      "learning_rate": 6e-06,
      "loss": 2.6313,
      "step": 200
    },
    {
      "epoch": 0.17,
      "eval_loss": 1.4370466470718384,
      "eval_runtime": 124.5179,
      "eval_samples_per_second": 67.701,
      "eval_steps_per_second": 33.851,
      "step": 200
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.4453125,
      "learning_rate": 9e-06,
      "loss": 1.2926,
      "step": 300
    },
    {
      "epoch": 0.25,
      "eval_loss": 1.0395174026489258,
      "eval_runtime": 125.1005,
      "eval_samples_per_second": 67.386,
      "eval_steps_per_second": 33.693,
      "step": 300
    },
    {
      "epoch": 0.34,
      "grad_norm": 1.6015625,
      "learning_rate": 1.2e-05,
      "loss": 1.1492,
      "step": 400
    },
    {
      "epoch": 0.34,
      "eval_loss": 1.0025979280471802,
      "eval_runtime": 124.2401,
      "eval_samples_per_second": 67.852,
      "eval_steps_per_second": 33.926,
      "step": 400
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.2734375,
      "learning_rate": 1.5e-05,
      "loss": 1.1062,
      "step": 500
    },
    {
      "epoch": 0.42,
      "eval_loss": 0.9820207357406616,
      "eval_runtime": 124.4083,
      "eval_samples_per_second": 67.761,
      "eval_steps_per_second": 33.88,
      "step": 500
    },
    {
      "epoch": 0.51,
      "grad_norm": 1.3984375,
      "learning_rate": 1.4960378963658215e-05,
      "loss": 1.0761,
      "step": 600
    },
    {
      "epoch": 0.51,
      "eval_loss": 0.9677473902702332,
      "eval_runtime": 124.3968,
      "eval_samples_per_second": 67.767,
      "eval_steps_per_second": 33.884,
      "step": 600
    },
    {
      "epoch": 0.59,
      "grad_norm": 1.328125,
      "learning_rate": 1.484193447503841e-05,
      "loss": 1.056,
      "step": 700
    },
    {
      "epoch": 0.59,
      "eval_loss": 0.9569339156150818,
      "eval_runtime": 124.228,
      "eval_samples_per_second": 67.859,
      "eval_steps_per_second": 33.93,
      "step": 700
    },
    {
      "epoch": 0.67,
      "grad_norm": 1.359375,
      "learning_rate": 1.4645917972377404e-05,
      "loss": 1.0446,
      "step": 800
    },
    {
      "epoch": 0.67,
      "eval_loss": 0.9495565891265869,
      "eval_runtime": 124.2985,
      "eval_samples_per_second": 67.821,
      "eval_steps_per_second": 33.91,
      "step": 800
    },
    {
      "epoch": 0.76,
      "grad_norm": 1.3125,
      "learning_rate": 1.4374400489535342e-05,
      "loss": 1.0399,
      "step": 900
    },
    {
      "epoch": 0.76,
      "eval_loss": 0.9445509910583496,
      "eval_runtime": 124.152,
      "eval_samples_per_second": 67.901,
      "eval_steps_per_second": 33.95,
      "step": 900
    },
    {
      "epoch": 0.84,
      "grad_norm": 1.2578125,
      "learning_rate": 1.403025077426025e-05,
      "loss": 1.0252,
      "step": 1000
    },
    {
      "epoch": 0.84,
      "eval_loss": 0.9403882026672363,
      "eval_runtime": 123.8904,
      "eval_samples_per_second": 68.044,
      "eval_steps_per_second": 34.022,
      "step": 1000
    },
    {
      "epoch": 0.93,
      "grad_norm": 1.296875,
      "learning_rate": 1.3617104978119044e-05,
      "loss": 1.0284,
      "step": 1100
    },
    {
      "epoch": 0.93,
      "eval_loss": 0.9368470907211304,
      "eval_runtime": 123.9167,
      "eval_samples_per_second": 68.03,
      "eval_steps_per_second": 34.015,
      "step": 1100
    },
    {
      "epoch": 1.01,
      "grad_norm": 1.375,
      "learning_rate": 1.3139328238339287e-05,
      "loss": 1.0171,
      "step": 1200
    },
    {
      "epoch": 1.01,
      "eval_loss": 0.9342640042304993,
      "eval_runtime": 124.6542,
      "eval_samples_per_second": 67.627,
      "eval_steps_per_second": 33.814,
      "step": 1200
    },
    {
      "epoch": 1.1,
      "grad_norm": 1.21875,
      "learning_rate": 1.2601968557473e-05,
      "loss": 1.0086,
      "step": 1300
    },
    {
      "epoch": 1.1,
      "eval_loss": 0.9315484166145325,
      "eval_runtime": 124.4874,
      "eval_samples_per_second": 67.718,
      "eval_steps_per_second": 33.859,
      "step": 1300
    },
    {
      "epoch": 1.18,
      "grad_norm": 1.2265625,
      "learning_rate": 1.2010703468171973e-05,
      "loss": 1.0056,
      "step": 1400
    },
    {
      "epoch": 1.18,
      "eval_loss": 0.9296947717666626,
      "eval_runtime": 124.175,
      "eval_samples_per_second": 67.888,
      "eval_steps_per_second": 33.944,
      "step": 1400
    },
    {
      "epoch": 1.27,
      "grad_norm": 1.25,
      "learning_rate": 1.1371780046593758e-05,
      "loss": 1.0083,
      "step": 1500
    },
    {
      "epoch": 1.27,
      "eval_loss": 0.9280151128768921,
      "eval_runtime": 123.9014,
      "eval_samples_per_second": 68.038,
      "eval_steps_per_second": 34.019,
      "step": 1500
    },
    {
      "epoch": 1.35,
      "grad_norm": 1.2890625,
      "learning_rate": 1.069194890823328e-05,
      "loss": 1.0037,
      "step": 1600
    },
    {
      "epoch": 1.35,
      "eval_loss": 0.926633358001709,
      "eval_runtime": 123.7914,
      "eval_samples_per_second": 68.098,
      "eval_steps_per_second": 34.049,
      "step": 1600
    },
    {
      "epoch": 1.43,
      "grad_norm": 1.1796875,
      "learning_rate": 9.978392883554342e-06,
      "loss": 0.9951,
      "step": 1700
    },
    {
      "epoch": 1.43,
      "eval_loss": 0.9259628653526306,
      "eval_runtime": 124.5775,
      "eval_samples_per_second": 67.669,
      "eval_steps_per_second": 33.834,
      "step": 1700
    },
    {
      "epoch": 1.52,
      "grad_norm": 1.234375,
      "learning_rate": 9.238651127006462e-06,
      "loss": 0.9962,
      "step": 1800
    },
    {
      "epoch": 1.52,
      "eval_loss": 0.9251705408096313,
      "eval_runtime": 124.4301,
      "eval_samples_per_second": 67.749,
      "eval_steps_per_second": 33.874,
      "step": 1800
    },
    {
      "epoch": 1.6,
      "grad_norm": 1.2109375,
      "learning_rate": 8.48053946126157e-06,
      "loss": 0.9907,
      "step": 1900
    },
    {
      "epoch": 1.6,
      "eval_loss": 0.924351155757904,
      "eval_runtime": 124.2821,
      "eval_samples_per_second": 67.83,
      "eval_steps_per_second": 33.915,
      "step": 1900
    },
    {
      "epoch": 1.69,
      "grad_norm": 1.3046875,
      "learning_rate": 7.712067798282222e-06,
      "loss": 1.0003,
      "step": 2000
    },
    {
      "epoch": 1.69,
      "eval_loss": 0.9239010810852051,
      "eval_runtime": 123.3108,
      "eval_samples_per_second": 68.364,
      "eval_steps_per_second": 34.182,
      "step": 2000
    },
    {
      "epoch": 1.77,
      "grad_norm": 1.3046875,
      "learning_rate": 6.941355509718164e-06,
      "loss": 0.9976,
      "step": 2100
    },
    {
      "epoch": 1.77,
      "eval_loss": 0.9232047200202942,
      "eval_runtime": 124.3131,
      "eval_samples_per_second": 67.813,
      "eval_steps_per_second": 33.906,
      "step": 2100
    },
    {
      "epoch": 1.86,
      "grad_norm": 1.21875,
      "learning_rate": 6.176545640794535e-06,
      "loss": 0.9896,
      "step": 2200
    },
    {
      "epoch": 1.86,
      "eval_loss": 0.923151969909668,
      "eval_runtime": 123.8739,
      "eval_samples_per_second": 68.053,
      "eval_steps_per_second": 34.027,
      "step": 2200
    },
    {
      "epoch": 1.94,
      "grad_norm": 1.203125,
      "learning_rate": 5.4257188740743086e-06,
      "loss": 0.9954,
      "step": 2300
    },
    {
      "epoch": 1.94,
      "eval_loss": 0.9228904247283936,
      "eval_runtime": 124.4202,
      "eval_samples_per_second": 67.754,
      "eval_steps_per_second": 33.877,
      "step": 2300
    },
    {
      "epoch": 2.02,
      "grad_norm": 1.2421875,
      "learning_rate": 4.696808152120318e-06,
      "loss": 0.9982,
      "step": 2400
    },
    {
      "epoch": 2.02,
      "eval_loss": 0.9227039813995361,
      "eval_runtime": 124.3531,
      "eval_samples_per_second": 67.791,
      "eval_steps_per_second": 33.895,
      "step": 2400
    },
    {
      "epoch": 2.11,
      "grad_norm": 1.2421875,
      "learning_rate": 3.997514861120414e-06,
      "loss": 0.9957,
      "step": 2500
    },
    {
      "epoch": 2.11,
      "eval_loss": 0.9224779605865479,
      "eval_runtime": 123.9488,
      "eval_samples_per_second": 68.012,
      "eval_steps_per_second": 34.006,
      "step": 2500
    },
    {
      "epoch": 2.19,
      "grad_norm": 1.1875,
      "learning_rate": 3.335227461046941e-06,
      "loss": 0.9883,
      "step": 2600
    },
    {
      "epoch": 2.19,
      "eval_loss": 0.922317624092102,
      "eval_runtime": 124.5434,
      "eval_samples_per_second": 67.687,
      "eval_steps_per_second": 33.844,
      "step": 2600
    },
    {
      "epoch": 2.28,
      "grad_norm": 1.2890625,
      "learning_rate": 2.7169434220724335e-06,
      "loss": 0.9849,
      "step": 2700
    },
    {
      "epoch": 2.28,
      "eval_loss": 0.9223732352256775,
      "eval_runtime": 124.8267,
      "eval_samples_per_second": 67.534,
      "eval_steps_per_second": 33.767,
      "step": 2700
    },
    {
      "epoch": 2.36,
      "grad_norm": 1.2109375,
      "learning_rate": 2.14919529203096e-06,
      "loss": 0.9974,
      "step": 2800
    },
    {
      "epoch": 2.36,
      "eval_loss": 0.9222919940948486,
      "eval_runtime": 124.9102,
      "eval_samples_per_second": 67.489,
      "eval_steps_per_second": 33.744,
      "step": 2800
    },
    {
      "epoch": 2.45,
      "grad_norm": 1.28125,
      "learning_rate": 1.6379816760674141e-06,
      "loss": 0.9854,
      "step": 2900
    },
    {
      "epoch": 2.45,
      "eval_loss": 0.9222748279571533,
      "eval_runtime": 124.029,
      "eval_samples_per_second": 67.968,
      "eval_steps_per_second": 33.984,
      "step": 2900
    },
    {
      "epoch": 2.53,
      "grad_norm": 1.234375,
      "learning_rate": 1.1887038577168646e-06,
      "loss": 0.9831,
      "step": 3000
    },
    {
      "epoch": 2.53,
      "eval_loss": 0.922382652759552,
      "eval_runtime": 123.9646,
      "eval_samples_per_second": 68.003,
      "eval_steps_per_second": 34.002,
      "step": 3000
    },
    {
      "epoch": 2.61,
      "grad_norm": 1.2265625,
      "learning_rate": 8.061087310508917e-07,
      "loss": 0.9958,
      "step": 3100
    },
    {
      "epoch": 2.61,
      "eval_loss": 0.9223530292510986,
      "eval_runtime": 124.2043,
      "eval_samples_per_second": 67.872,
      "eval_steps_per_second": 33.936,
      "step": 3100
    }
  ],
  "logging_steps": 100,
  "max_steps": 3555,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "total_flos": 2.106483852705915e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}