{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9887640449438202, "eval_steps": 100, "global_step": 66, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 133.09732789993285, "epoch": 0.0749063670411985, "grad_norm": 1.5293161678134963, "kl": 0.023732519149780272, "learning_rate": 1.4285714285714287e-05, "loss": 0.001, "reward": 1.653250901401043, "reward_std": 0.6752304386347532, "rewards/accuracy_reward": 0.5250000260770321, "rewards/cosine_scaled_reward": 0.34640559698455037, "rewards/format_reward": 0.7250000283122062, "rewards/reasoning_steps_reward": 0.056845242623239756, "step": 5 }, { "completion_length": 102.5696473121643, "epoch": 0.149812734082397, "grad_norm": 1.3144666712389457, "kl": 0.15343170166015624, "learning_rate": 1.9872683547213446e-05, "loss": 0.0061, "reward": 1.7395714558660984, "reward_std": 0.6690045401453972, "rewards/accuracy_reward": 0.5000000222586095, "rewards/cosine_scaled_reward": 0.40504757558394433, "rewards/format_reward": 0.8053571842610836, "rewards/reasoning_steps_reward": 0.02916666897945106, "step": 10 }, { "completion_length": 68.31875326633454, "epoch": 0.2247191011235955, "grad_norm": 1.690644929593771, "kl": 0.4529541015625, "learning_rate": 1.9106347728549134e-05, "loss": 0.0181, "reward": 1.5209994725883007, "reward_std": 0.4919801800744608, "rewards/accuracy_reward": 0.3375000160187483, "rewards/cosine_scaled_reward": 0.30939229315263217, "rewards/format_reward": 0.8714286126196384, "rewards/reasoning_steps_reward": 0.0026785717345774174, "step": 15 }, { "completion_length": 27.072322607040405, "epoch": 0.299625468164794, "grad_norm": 5.038040476887466, "kl": 0.86884765625, "learning_rate": 1.7698339834299064e-05, "loss": 0.0348, "reward": 1.3508709453046321, "reward_std": 0.2549172870512848, "rewards/accuracy_reward": 0.19553572423756121, "rewards/cosine_scaled_reward": 0.1919423281069612, "rewards/format_reward": 0.9633928693830967, "rewards/reasoning_steps_reward": 0.0, "step": 20 }, { "completion_length": 19.96160795688629, "epoch": 0.37453183520599254, "grad_norm": 0.7209085060337556, "kl": 1.62548828125, "learning_rate": 1.5747874102144073e-05, "loss": 0.065, "reward": 1.1604069240391255, "reward_std": 0.1543787806871933, "rewards/accuracy_reward": 0.09196429010480642, "rewards/cosine_scaled_reward": 0.08987116043572314, "rewards/format_reward": 0.9785714387893677, "rewards/reasoning_steps_reward": 0.0, "step": 25 }, { "completion_length": 14.525000703334808, "epoch": 0.449438202247191, "grad_norm": 1.0679577003741414, "kl": 2.248828125, "learning_rate": 1.3392388661180303e-05, "loss": 0.0899, "reward": 1.0422519214451313, "reward_std": 0.07961238992461403, "rewards/accuracy_reward": 0.028571429941803218, "rewards/cosine_scaled_reward": 0.02707329906115774, "rewards/format_reward": 0.9866071492433548, "rewards/reasoning_steps_reward": 0.0, "step": 30 }, { "completion_length": 16.882143712043764, "epoch": 0.5243445692883895, "grad_norm": 1.9915464242130758, "kl": 2.50048828125, "learning_rate": 1.0797861055530832e-05, "loss": 0.1, "reward": 1.0598101012408734, "reward_std": 0.08466785513780906, "rewards/accuracy_reward": 0.03660714449360967, "rewards/cosine_scaled_reward": 0.03391720272193197, "rewards/format_reward": 0.9892857186496258, "rewards/reasoning_steps_reward": 0.0, "step": 35 }, { "completion_length": 17.50357232093811, "epoch": 0.599250936329588, "grad_norm": 0.6015571750593173, "kl": 2.02138671875, "learning_rate": 8.147112759128859e-06, "loss": 0.0808, "reward": 1.0429342821240426, "reward_std": 0.08854535985910843, "rewards/accuracy_reward": 0.027678572665899993, "rewards/cosine_scaled_reward": 0.025969939603237437, "rewards/format_reward": 0.9892857193946838, "rewards/reasoning_steps_reward": 0.0, "step": 40 }, { "completion_length": 17.032143676280974, "epoch": 0.6741573033707865, "grad_norm": 1.5617783259086722, "kl": 9.1677734375, "learning_rate": 5.626926795411447e-06, "loss": 0.3657, "reward": 1.0205947622656821, "reward_std": 0.04966565851066207, "rewards/accuracy_reward": 0.016071429289877415, "rewards/cosine_scaled_reward": 0.013451844768133015, "rewards/format_reward": 0.9910714328289032, "rewards/reasoning_steps_reward": 0.0, "step": 45 }, { "completion_length": 24.61875115633011, "epoch": 0.7490636704119851, "grad_norm": 12.921102363021035, "kl": 2.09052734375, "learning_rate": 3.414886209349615e-06, "loss": 0.0836, "reward": 1.0334184400737285, "reward_std": 0.07184715992339079, "rewards/accuracy_reward": 0.026785715483129025, "rewards/cosine_scaled_reward": 0.02270410436904058, "rewards/format_reward": 0.9839285790920258, "rewards/reasoning_steps_reward": 0.0, "step": 50 }, { "completion_length": 37.971430158615114, "epoch": 0.8239700374531835, "grad_norm": 0.7983120982012815, "kl": 2.83486328125, "learning_rate": 1.6668608091748495e-06, "loss": 0.1137, "reward": 1.026642444729805, "reward_std": 0.09744821136546307, "rewards/accuracy_reward": 0.028571429941803218, "rewards/cosine_scaled_reward": 0.024261438589019236, "rewards/format_reward": 0.9732142984867096, "rewards/reasoning_steps_reward": 0.0005952381528913975, "step": 55 }, { "completion_length": 47.03750244379044, "epoch": 0.898876404494382, "grad_norm": 0.413637391163457, "kl": 1.8755859375, "learning_rate": 5.060239153161872e-07, "loss": 0.075, "reward": 1.005486535280943, "reward_std": 0.12288255607795691, "rewards/accuracy_reward": 0.026785715576261283, "rewards/cosine_scaled_reward": 0.02066505177790532, "rewards/format_reward": 0.9571428760886193, "rewards/reasoning_steps_reward": 0.0008928572293370962, "step": 60 }, { "completion_length": 41.104466354846956, "epoch": 0.9737827715355806, "grad_norm": 0.8917372735425451, "kl": 1.94501953125, "learning_rate": 1.4173043232380557e-08, "loss": 0.0778, "reward": 1.0319705478847028, "reward_std": 0.12593294799758042, "rewards/accuracy_reward": 0.03571428749710322, "rewards/cosine_scaled_reward": 0.030184772261418403, "rewards/format_reward": 0.9660714447498322, "rewards/reasoning_steps_reward": 0.0, "step": 65 }, { "completion_length": 23.01785808801651, "epoch": 0.9887640449438202, "kl": 1.99560546875, "reward": 1.0267603546380997, "reward_std": 0.08640075298319516, "rewards/accuracy_reward": 0.022321429569274187, "rewards/cosine_scaled_reward": 0.01783171975694131, "rewards/format_reward": 0.9866071492433548, "rewards/reasoning_steps_reward": 0.0, "step": 66, "total_flos": 0.0, "train_loss": 0.0854370246613116, "train_runtime": 2996.315, "train_samples_per_second": 2.494, "train_steps_per_second": 0.022 } ], "logging_steps": 5, "max_steps": 66, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }