{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9887640449438202,
  "eval_steps": 100,
  "global_step": 66,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "completion_length": 133.09732789993285,
      "epoch": 0.0749063670411985,
      "grad_norm": 1.5293161678134963,
      "kl": 0.023732519149780272,
      "learning_rate": 1.4285714285714287e-05,
      "loss": 0.001,
      "reward": 1.653250901401043,
      "reward_std": 0.6752304386347532,
      "rewards/accuracy_reward": 0.5250000260770321,
      "rewards/cosine_scaled_reward": 0.34640559698455037,
      "rewards/format_reward": 0.7250000283122062,
      "rewards/reasoning_steps_reward": 0.056845242623239756,
      "step": 5
    },
    {
      "completion_length": 102.5696473121643,
      "epoch": 0.149812734082397,
      "grad_norm": 1.3144666712389457,
      "kl": 0.15343170166015624,
      "learning_rate": 1.9872683547213446e-05,
      "loss": 0.0061,
      "reward": 1.7395714558660984,
      "reward_std": 0.6690045401453972,
      "rewards/accuracy_reward": 0.5000000222586095,
      "rewards/cosine_scaled_reward": 0.40504757558394433,
      "rewards/format_reward": 0.8053571842610836,
      "rewards/reasoning_steps_reward": 0.02916666897945106,
      "step": 10
    },
    {
      "completion_length": 68.31875326633454,
      "epoch": 0.2247191011235955,
      "grad_norm": 1.690644929593771,
      "kl": 0.4529541015625,
      "learning_rate": 1.9106347728549134e-05,
      "loss": 0.0181,
      "reward": 1.5209994725883007,
      "reward_std": 0.4919801800744608,
      "rewards/accuracy_reward": 0.3375000160187483,
      "rewards/cosine_scaled_reward": 0.30939229315263217,
      "rewards/format_reward": 0.8714286126196384,
      "rewards/reasoning_steps_reward": 0.0026785717345774174,
      "step": 15
    },
    {
      "completion_length": 27.072322607040405,
      "epoch": 0.299625468164794,
      "grad_norm": 5.038040476887466,
      "kl": 0.86884765625,
      "learning_rate": 1.7698339834299064e-05,
      "loss": 0.0348,
      "reward": 1.3508709453046321,
      "reward_std": 0.2549172870512848,
      "rewards/accuracy_reward": 0.19553572423756121,
      "rewards/cosine_scaled_reward": 0.1919423281069612,
      "rewards/format_reward": 0.9633928693830967,
      "rewards/reasoning_steps_reward": 0.0,
      "step": 20
    },
    {
      "completion_length": 19.96160795688629,
      "epoch": 0.37453183520599254,
      "grad_norm": 0.7209085060337556,
      "kl": 1.62548828125,
      "learning_rate": 1.5747874102144073e-05,
      "loss": 0.065,
      "reward": 1.1604069240391255,
      "reward_std": 0.1543787806871933,
      "rewards/accuracy_reward": 0.09196429010480642,
      "rewards/cosine_scaled_reward": 0.08987116043572314,
      "rewards/format_reward": 0.9785714387893677,
      "rewards/reasoning_steps_reward": 0.0,
      "step": 25
    },
    {
      "completion_length": 14.525000703334808,
      "epoch": 0.449438202247191,
      "grad_norm": 1.0679577003741414,
      "kl": 2.248828125,
      "learning_rate": 1.3392388661180303e-05,
      "loss": 0.0899,
      "reward": 1.0422519214451313,
      "reward_std": 0.07961238992461403,
      "rewards/accuracy_reward": 0.028571429941803218,
      "rewards/cosine_scaled_reward": 0.02707329906115774,
      "rewards/format_reward": 0.9866071492433548,
      "rewards/reasoning_steps_reward": 0.0,
      "step": 30
    },
    {
      "completion_length": 16.882143712043764,
      "epoch": 0.5243445692883895,
      "grad_norm": 1.9915464242130758,
      "kl": 2.50048828125,
      "learning_rate": 1.0797861055530832e-05,
      "loss": 0.1,
      "reward": 1.0598101012408734,
      "reward_std": 0.08466785513780906,
      "rewards/accuracy_reward": 0.03660714449360967,
      "rewards/cosine_scaled_reward": 0.03391720272193197,
      "rewards/format_reward": 0.9892857186496258,
      "rewards/reasoning_steps_reward": 0.0,
      "step": 35
    },
    {
      "completion_length": 17.50357232093811,
      "epoch": 0.599250936329588,
      "grad_norm": 0.6015571750593173,
      "kl": 2.02138671875,
      "learning_rate": 8.147112759128859e-06,
      "loss": 0.0808,
      "reward": 1.0429342821240426,
      "reward_std": 0.08854535985910843,
      "rewards/accuracy_reward": 0.027678572665899993,
      "rewards/cosine_scaled_reward": 0.025969939603237437,
      "rewards/format_reward": 0.9892857193946838,
      "rewards/reasoning_steps_reward": 0.0,
      "step": 40
    },
    {
      "completion_length": 17.032143676280974,
      "epoch": 0.6741573033707865,
      "grad_norm": 1.5617783259086722,
      "kl": 9.1677734375,
      "learning_rate": 5.626926795411447e-06,
      "loss": 0.3657,
      "reward": 1.0205947622656821,
      "reward_std": 0.04966565851066207,
      "rewards/accuracy_reward": 0.016071429289877415,
      "rewards/cosine_scaled_reward": 0.013451844768133015,
      "rewards/format_reward": 0.9910714328289032,
      "rewards/reasoning_steps_reward": 0.0,
      "step": 45
    },
    {
      "completion_length": 24.61875115633011,
      "epoch": 0.7490636704119851,
      "grad_norm": 12.921102363021035,
      "kl": 2.09052734375,
      "learning_rate": 3.414886209349615e-06,
      "loss": 0.0836,
      "reward": 1.0334184400737285,
      "reward_std": 0.07184715992339079,
      "rewards/accuracy_reward": 0.026785715483129025,
      "rewards/cosine_scaled_reward": 0.02270410436904058,
      "rewards/format_reward": 0.9839285790920258,
      "rewards/reasoning_steps_reward": 0.0,
      "step": 50
    },
    {
      "completion_length": 37.971430158615114,
      "epoch": 0.8239700374531835,
      "grad_norm": 0.7983120982012815,
      "kl": 2.83486328125,
      "learning_rate": 1.6668608091748495e-06,
      "loss": 0.1137,
      "reward": 1.026642444729805,
      "reward_std": 0.09744821136546307,
      "rewards/accuracy_reward": 0.028571429941803218,
      "rewards/cosine_scaled_reward": 0.024261438589019236,
      "rewards/format_reward": 0.9732142984867096,
      "rewards/reasoning_steps_reward": 0.0005952381528913975,
      "step": 55
    },
    {
      "completion_length": 47.03750244379044,
      "epoch": 0.898876404494382,
      "grad_norm": 0.413637391163457,
      "kl": 1.8755859375,
      "learning_rate": 5.060239153161872e-07,
      "loss": 0.075,
      "reward": 1.005486535280943,
      "reward_std": 0.12288255607795691,
      "rewards/accuracy_reward": 0.026785715576261283,
      "rewards/cosine_scaled_reward": 0.02066505177790532,
      "rewards/format_reward": 0.9571428760886193,
      "rewards/reasoning_steps_reward": 0.0008928572293370962,
      "step": 60
    },
    {
      "completion_length": 41.104466354846956,
      "epoch": 0.9737827715355806,
      "grad_norm": 0.8917372735425451,
      "kl": 1.94501953125,
      "learning_rate": 1.4173043232380557e-08,
      "loss": 0.0778,
      "reward": 1.0319705478847028,
      "reward_std": 0.12593294799758042,
      "rewards/accuracy_reward": 0.03571428749710322,
      "rewards/cosine_scaled_reward": 0.030184772261418403,
      "rewards/format_reward": 0.9660714447498322,
      "rewards/reasoning_steps_reward": 0.0,
      "step": 65
    },
    {
      "completion_length": 23.01785808801651,
      "epoch": 0.9887640449438202,
      "kl": 1.99560546875,
      "reward": 1.0267603546380997,
      "reward_std": 0.08640075298319516,
      "rewards/accuracy_reward": 0.022321429569274187,
      "rewards/cosine_scaled_reward": 0.01783171975694131,
      "rewards/format_reward": 0.9866071492433548,
      "rewards/reasoning_steps_reward": 0.0,
      "step": 66,
      "total_flos": 0.0,
      "train_loss": 0.0854370246613116,
      "train_runtime": 2996.315,
      "train_samples_per_second": 2.494,
      "train_steps_per_second": 0.022
    }
  ],
  "logging_steps": 5,
  "max_steps": 66,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}