ununtrium's picture
Model save
51f5b14 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9887640449438202,
"eval_steps": 100,
"global_step": 66,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 133.09732789993285,
"epoch": 0.0749063670411985,
"grad_norm": 1.5293161678134963,
"kl": 0.023732519149780272,
"learning_rate": 1.4285714285714287e-05,
"loss": 0.001,
"reward": 1.653250901401043,
"reward_std": 0.6752304386347532,
"rewards/accuracy_reward": 0.5250000260770321,
"rewards/cosine_scaled_reward": 0.34640559698455037,
"rewards/format_reward": 0.7250000283122062,
"rewards/reasoning_steps_reward": 0.056845242623239756,
"step": 5
},
{
"completion_length": 102.5696473121643,
"epoch": 0.149812734082397,
"grad_norm": 1.3144666712389457,
"kl": 0.15343170166015624,
"learning_rate": 1.9872683547213446e-05,
"loss": 0.0061,
"reward": 1.7395714558660984,
"reward_std": 0.6690045401453972,
"rewards/accuracy_reward": 0.5000000222586095,
"rewards/cosine_scaled_reward": 0.40504757558394433,
"rewards/format_reward": 0.8053571842610836,
"rewards/reasoning_steps_reward": 0.02916666897945106,
"step": 10
},
{
"completion_length": 68.31875326633454,
"epoch": 0.2247191011235955,
"grad_norm": 1.690644929593771,
"kl": 0.4529541015625,
"learning_rate": 1.9106347728549134e-05,
"loss": 0.0181,
"reward": 1.5209994725883007,
"reward_std": 0.4919801800744608,
"rewards/accuracy_reward": 0.3375000160187483,
"rewards/cosine_scaled_reward": 0.30939229315263217,
"rewards/format_reward": 0.8714286126196384,
"rewards/reasoning_steps_reward": 0.0026785717345774174,
"step": 15
},
{
"completion_length": 27.072322607040405,
"epoch": 0.299625468164794,
"grad_norm": 5.038040476887466,
"kl": 0.86884765625,
"learning_rate": 1.7698339834299064e-05,
"loss": 0.0348,
"reward": 1.3508709453046321,
"reward_std": 0.2549172870512848,
"rewards/accuracy_reward": 0.19553572423756121,
"rewards/cosine_scaled_reward": 0.1919423281069612,
"rewards/format_reward": 0.9633928693830967,
"rewards/reasoning_steps_reward": 0.0,
"step": 20
},
{
"completion_length": 19.96160795688629,
"epoch": 0.37453183520599254,
"grad_norm": 0.7209085060337556,
"kl": 1.62548828125,
"learning_rate": 1.5747874102144073e-05,
"loss": 0.065,
"reward": 1.1604069240391255,
"reward_std": 0.1543787806871933,
"rewards/accuracy_reward": 0.09196429010480642,
"rewards/cosine_scaled_reward": 0.08987116043572314,
"rewards/format_reward": 0.9785714387893677,
"rewards/reasoning_steps_reward": 0.0,
"step": 25
},
{
"completion_length": 14.525000703334808,
"epoch": 0.449438202247191,
"grad_norm": 1.0679577003741414,
"kl": 2.248828125,
"learning_rate": 1.3392388661180303e-05,
"loss": 0.0899,
"reward": 1.0422519214451313,
"reward_std": 0.07961238992461403,
"rewards/accuracy_reward": 0.028571429941803218,
"rewards/cosine_scaled_reward": 0.02707329906115774,
"rewards/format_reward": 0.9866071492433548,
"rewards/reasoning_steps_reward": 0.0,
"step": 30
},
{
"completion_length": 16.882143712043764,
"epoch": 0.5243445692883895,
"grad_norm": 1.9915464242130758,
"kl": 2.50048828125,
"learning_rate": 1.0797861055530832e-05,
"loss": 0.1,
"reward": 1.0598101012408734,
"reward_std": 0.08466785513780906,
"rewards/accuracy_reward": 0.03660714449360967,
"rewards/cosine_scaled_reward": 0.03391720272193197,
"rewards/format_reward": 0.9892857186496258,
"rewards/reasoning_steps_reward": 0.0,
"step": 35
},
{
"completion_length": 17.50357232093811,
"epoch": 0.599250936329588,
"grad_norm": 0.6015571750593173,
"kl": 2.02138671875,
"learning_rate": 8.147112759128859e-06,
"loss": 0.0808,
"reward": 1.0429342821240426,
"reward_std": 0.08854535985910843,
"rewards/accuracy_reward": 0.027678572665899993,
"rewards/cosine_scaled_reward": 0.025969939603237437,
"rewards/format_reward": 0.9892857193946838,
"rewards/reasoning_steps_reward": 0.0,
"step": 40
},
{
"completion_length": 17.032143676280974,
"epoch": 0.6741573033707865,
"grad_norm": 1.5617783259086722,
"kl": 9.1677734375,
"learning_rate": 5.626926795411447e-06,
"loss": 0.3657,
"reward": 1.0205947622656821,
"reward_std": 0.04966565851066207,
"rewards/accuracy_reward": 0.016071429289877415,
"rewards/cosine_scaled_reward": 0.013451844768133015,
"rewards/format_reward": 0.9910714328289032,
"rewards/reasoning_steps_reward": 0.0,
"step": 45
},
{
"completion_length": 24.61875115633011,
"epoch": 0.7490636704119851,
"grad_norm": 12.921102363021035,
"kl": 2.09052734375,
"learning_rate": 3.414886209349615e-06,
"loss": 0.0836,
"reward": 1.0334184400737285,
"reward_std": 0.07184715992339079,
"rewards/accuracy_reward": 0.026785715483129025,
"rewards/cosine_scaled_reward": 0.02270410436904058,
"rewards/format_reward": 0.9839285790920258,
"rewards/reasoning_steps_reward": 0.0,
"step": 50
},
{
"completion_length": 37.971430158615114,
"epoch": 0.8239700374531835,
"grad_norm": 0.7983120982012815,
"kl": 2.83486328125,
"learning_rate": 1.6668608091748495e-06,
"loss": 0.1137,
"reward": 1.026642444729805,
"reward_std": 0.09744821136546307,
"rewards/accuracy_reward": 0.028571429941803218,
"rewards/cosine_scaled_reward": 0.024261438589019236,
"rewards/format_reward": 0.9732142984867096,
"rewards/reasoning_steps_reward": 0.0005952381528913975,
"step": 55
},
{
"completion_length": 47.03750244379044,
"epoch": 0.898876404494382,
"grad_norm": 0.413637391163457,
"kl": 1.8755859375,
"learning_rate": 5.060239153161872e-07,
"loss": 0.075,
"reward": 1.005486535280943,
"reward_std": 0.12288255607795691,
"rewards/accuracy_reward": 0.026785715576261283,
"rewards/cosine_scaled_reward": 0.02066505177790532,
"rewards/format_reward": 0.9571428760886193,
"rewards/reasoning_steps_reward": 0.0008928572293370962,
"step": 60
},
{
"completion_length": 41.104466354846956,
"epoch": 0.9737827715355806,
"grad_norm": 0.8917372735425451,
"kl": 1.94501953125,
"learning_rate": 1.4173043232380557e-08,
"loss": 0.0778,
"reward": 1.0319705478847028,
"reward_std": 0.12593294799758042,
"rewards/accuracy_reward": 0.03571428749710322,
"rewards/cosine_scaled_reward": 0.030184772261418403,
"rewards/format_reward": 0.9660714447498322,
"rewards/reasoning_steps_reward": 0.0,
"step": 65
},
{
"completion_length": 23.01785808801651,
"epoch": 0.9887640449438202,
"kl": 1.99560546875,
"reward": 1.0267603546380997,
"reward_std": 0.08640075298319516,
"rewards/accuracy_reward": 0.022321429569274187,
"rewards/cosine_scaled_reward": 0.01783171975694131,
"rewards/format_reward": 0.9866071492433548,
"rewards/reasoning_steps_reward": 0.0,
"step": 66,
"total_flos": 0.0,
"train_loss": 0.0854370246613116,
"train_runtime": 2996.315,
"train_samples_per_second": 2.494,
"train_steps_per_second": 0.022
}
],
"logging_steps": 5,
"max_steps": 66,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}