iaminju's picture
Model save
872344b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9993002099370188,
"eval_steps": 500,
"global_step": 357,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 750.0178985595703,
"epoch": 0.0027991602519244225,
"grad_norm": 0.30192625522613525,
"kl": 0.0,
"learning_rate": 5.555555555555555e-07,
"loss": 0.0293,
"reward": 0.3750000186264515,
"reward_std": 0.3078143782913685,
"rewards/accuracy_reward": 0.3750000186264515,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 731.9107496473524,
"epoch": 0.02799160251924423,
"grad_norm": 0.2259158194065094,
"kl": 0.00010485781563652886,
"learning_rate": 5.555555555555557e-06,
"loss": 0.0128,
"reward": 0.3670635099212329,
"reward_std": 0.26350560991300476,
"rewards/accuracy_reward": 0.3670635099212329,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 716.9598579406738,
"epoch": 0.05598320503848846,
"grad_norm": 0.3309926688671112,
"kl": 0.0035821676254272463,
"learning_rate": 1.1111111111111113e-05,
"loss": 0.0246,
"reward": 0.40535716228187085,
"reward_std": 0.2991223815828562,
"rewards/accuracy_reward": 0.40535716228187085,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 743.5920013427734,
"epoch": 0.08397480755773268,
"grad_norm": 0.30614855885505676,
"kl": 0.01827545166015625,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.015,
"reward": 0.48482145443558694,
"reward_std": 0.2562454042956233,
"rewards/accuracy_reward": 0.48482145443558694,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 675.5830673217773,
"epoch": 0.11196641007697691,
"grad_norm": 0.2699500024318695,
"kl": 0.03763885498046875,
"learning_rate": 1.999310448492752e-05,
"loss": 0.0166,
"reward": 0.4187500230967999,
"reward_std": 0.28400791343301535,
"rewards/accuracy_reward": 0.4187500230967999,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 724.2911056518554,
"epoch": 0.13995801259622112,
"grad_norm": 0.28290238976478577,
"kl": 0.054229736328125,
"learning_rate": 1.9915651236017307e-05,
"loss": 0.0087,
"reward": 0.41517859101295473,
"reward_std": 0.2900457665324211,
"rewards/accuracy_reward": 0.41517859101295473,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 648.35092086792,
"epoch": 0.16794961511546536,
"grad_norm": 0.26611465215682983,
"kl": 0.063665771484375,
"learning_rate": 1.975286910165463e-05,
"loss": 0.016,
"reward": 0.46607145220041274,
"reward_std": 0.23811200838536023,
"rewards/accuracy_reward": 0.46607145220041274,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 653.6018226623535,
"epoch": 0.1959412176347096,
"grad_norm": 0.2749662697315216,
"kl": 0.077093505859375,
"learning_rate": 1.95063160182963e-05,
"loss": 0.0167,
"reward": 0.46250002086162567,
"reward_std": 0.26145450249314306,
"rewards/accuracy_reward": 0.46250002086162567,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 849.4866439819336,
"epoch": 0.22393282015395383,
"grad_norm": 0.19597865641117096,
"kl": 0.08182373046875,
"learning_rate": 1.917835166772562e-05,
"loss": 0.0064,
"reward": 0.5196428813040257,
"reward_std": 0.26366451028734444,
"rewards/accuracy_reward": 0.5196428813040257,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 893.0500381469726,
"epoch": 0.25192442267319803,
"grad_norm": 0.2700682282447815,
"kl": 0.07144775390625,
"learning_rate": 1.877211489328239e-05,
"loss": 0.0087,
"reward": 0.45446430817246436,
"reward_std": 0.2592643300071359,
"rewards/accuracy_reward": 0.45446430817246436,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 749.3607467651367,
"epoch": 0.27991602519244224,
"grad_norm": 0.2792201042175293,
"kl": 0.079168701171875,
"learning_rate": 1.829149365898355e-05,
"loss": 0.0121,
"reward": 0.4687500227242708,
"reward_std": 0.2029303913936019,
"rewards/accuracy_reward": 0.4687500227242708,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 666.171460723877,
"epoch": 0.3079076277116865,
"grad_norm": 0.24913422763347626,
"kl": 0.0840087890625,
"learning_rate": 1.7741087839045992e-05,
"loss": -0.0013,
"reward": 0.47857145592570305,
"reward_std": 0.2354975413531065,
"rewards/accuracy_reward": 0.47857145592570305,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 733.9053924560546,
"epoch": 0.3358992302309307,
"grad_norm": 0.23493996262550354,
"kl": 0.078680419921875,
"learning_rate": 1.712616519394157e-05,
"loss": 0.0031,
"reward": 0.4696428798139095,
"reward_std": 0.246359870582819,
"rewards/accuracy_reward": 0.4696428798139095,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 966.488444519043,
"epoch": 0.363890832750175,
"grad_norm": 0.24625414609909058,
"kl": 0.087646484375,
"learning_rate": 1.6452610954323337e-05,
"loss": 0.0079,
"reward": 0.42410715818405154,
"reward_std": 0.2372832555323839,
"rewards/accuracy_reward": 0.42410715818405154,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 932.5196838378906,
"epoch": 0.3918824352694192,
"grad_norm": 0.3386368751525879,
"kl": 0.0993896484375,
"learning_rate": 1.5726871495339563e-05,
"loss": 0.0242,
"reward": 0.43839287767186763,
"reward_std": 0.23617825247347354,
"rewards/accuracy_reward": 0.43839287767186763,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 888.8741516113281,
"epoch": 0.4198740377886634,
"grad_norm": 0.4019307792186737,
"kl": 0.1232421875,
"learning_rate": 1.4955892640410717e-05,
"loss": 0.0233,
"reward": 0.4660714529454708,
"reward_std": 0.2454029094427824,
"rewards/accuracy_reward": 0.4660714529454708,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 801.5536071777344,
"epoch": 0.44786564030790765,
"grad_norm": 0.3658553957939148,
"kl": 0.1235595703125,
"learning_rate": 1.4147053184944674e-05,
"loss": 0.016,
"reward": 0.4410714492201805,
"reward_std": 0.25928416270762683,
"rewards/accuracy_reward": 0.4410714492201805,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 689.7009254455567,
"epoch": 0.47585724282715186,
"grad_norm": 0.5175524950027466,
"kl": 0.17607421875,
"learning_rate": 1.3308094276213557e-05,
"loss": 0.0262,
"reward": 0.4401785898953676,
"reward_std": 0.23054485712200404,
"rewards/accuracy_reward": 0.4401785898953676,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 613.9518119812012,
"epoch": 0.5038488453463961,
"grad_norm": 0.38585546612739563,
"kl": 0.1792724609375,
"learning_rate": 1.2447045325275215e-05,
"loss": 0.0113,
"reward": 0.4428571630269289,
"reward_std": 0.2558211121708155,
"rewards/accuracy_reward": 0.4428571630269289,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 767.7786056518555,
"epoch": 0.5318404478656403,
"grad_norm": 0.22975799441337585,
"kl": 0.10023193359375,
"learning_rate": 1.1572147160012956e-05,
"loss": 0.0294,
"reward": 0.45357144996523857,
"reward_std": 0.2570741597563028,
"rewards/accuracy_reward": 0.45357144996523857,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 1036.6339736938476,
"epoch": 0.5598320503848845,
"grad_norm": 0.3325261175632477,
"kl": 0.1287353515625,
"learning_rate": 1.0691773154771508e-05,
"loss": 0.0119,
"reward": 0.4821428835391998,
"reward_std": 0.2324786176905036,
"rewards/accuracy_reward": 0.4821428835391998,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 868.197361755371,
"epoch": 0.5878236529041287,
"grad_norm": 0.6180446743965149,
"kl": 0.1508056640625,
"learning_rate": 9.814349091432634e-06,
"loss": 0.0277,
"reward": 0.4821428798139095,
"reward_std": 0.2132600512355566,
"rewards/accuracy_reward": 0.4821428798139095,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 818.4768188476562,
"epoch": 0.615815255423373,
"grad_norm": 0.2782590687274933,
"kl": 0.127099609375,
"learning_rate": 8.948272518914737e-06,
"loss": 0.0071,
"reward": 0.4758928822353482,
"reward_std": 0.22520754840224982,
"rewards/accuracy_reward": 0.4758928822353482,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 785.9794929504394,
"epoch": 0.6438068579426172,
"grad_norm": 0.2999161183834076,
"kl": 0.10723876953125,
"learning_rate": 8.101832382881249e-06,
"loss": 0.0243,
"reward": 0.4767857313156128,
"reward_std": 0.2523977212607861,
"rewards/accuracy_reward": 0.4767857313156128,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 833.9795013427735,
"epoch": 0.6717984604618614,
"grad_norm": 0.28707244992256165,
"kl": 0.1548828125,
"learning_rate": 7.283129694856508e-06,
"loss": 0.0181,
"reward": 0.420535734295845,
"reward_std": 0.24855004157871008,
"rewards/accuracy_reward": 0.420535734295845,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 825.6973556518554,
"epoch": 0.6997900629811057,
"grad_norm": 0.2718600630760193,
"kl": 0.2059814453125,
"learning_rate": 6.500000000000003e-06,
"loss": 0.0192,
"reward": 0.46696431189775467,
"reward_std": 0.2507600516080856,
"rewards/accuracy_reward": 0.46696431189775467,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 729.3732452392578,
"epoch": 0.72778166550035,
"grad_norm": 0.327871173620224,
"kl": 0.1481689453125,
"learning_rate": 5.759938385575454e-06,
"loss": 0.0284,
"reward": 0.48750002318993213,
"reward_std": 0.25541665144264697,
"rewards/accuracy_reward": 0.48750002318993213,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 769.4178924560547,
"epoch": 0.7557732680195941,
"grad_norm": 0.3249359428882599,
"kl": 0.23892822265625,
"learning_rate": 5.070027747835002e-06,
"loss": 0.024,
"reward": 0.48839287757873534,
"reward_std": 0.2081593234091997,
"rewards/accuracy_reward": 0.48839287757873534,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 738.6937805175781,
"epoch": 0.7837648705388384,
"grad_norm": 0.2741276025772095,
"kl": 0.20296630859375,
"learning_rate": 4.436871003853553e-06,
"loss": 0.0376,
"reward": 0.47500001937150954,
"reward_std": 0.26311200819909575,
"rewards/accuracy_reward": 0.47500001937150954,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 706.5491378784179,
"epoch": 0.8117564730580826,
"grad_norm": 0.29115381836891174,
"kl": 0.1580322265625,
"learning_rate": 3.866527897092401e-06,
"loss": 0.0139,
"reward": 0.4812500238418579,
"reward_std": 0.2547359408810735,
"rewards/accuracy_reward": 0.4812500238418579,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 751.8768203735351,
"epoch": 0.8397480755773268,
"grad_norm": 0.26477473974227905,
"kl": 0.1802001953125,
"learning_rate": 3.364457001506166e-06,
"loss": 0.0253,
"reward": 0.4580357393249869,
"reward_std": 0.2625793442130089,
"rewards/accuracy_reward": 0.4580357393249869,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 732.6241462707519,
"epoch": 0.867739678096571,
"grad_norm": 0.3064480125904083,
"kl": 0.19808349609375,
"learning_rate": 2.935463479253442e-06,
"loss": 0.02,
"reward": 0.40446430407464506,
"reward_std": 0.2393452214077115,
"rewards/accuracy_reward": 0.40446430407464506,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 715.6580657958984,
"epoch": 0.8957312806158153,
"grad_norm": 0.37576210498809814,
"kl": 0.19476318359375,
"learning_rate": 2.5836530920055976e-06,
"loss": 0.0327,
"reward": 0.40625001918524506,
"reward_std": 0.23536933306604624,
"rewards/accuracy_reward": 0.40625001918524506,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 731.3232460021973,
"epoch": 0.9237228831350595,
"grad_norm": 0.3001156151294708,
"kl": 0.3147216796875,
"learning_rate": 2.3123929059970286e-06,
"loss": 0.0557,
"reward": 0.4071428745985031,
"reward_std": 0.22367824967950584,
"rewards/accuracy_reward": 0.4071428745985031,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 669.9786071777344,
"epoch": 0.9517144856543037,
"grad_norm": 0.504320502281189,
"kl": 0.3324951171875,
"learning_rate": 2.1242790668964046e-06,
"loss": 0.0244,
"reward": 0.42232145071029664,
"reward_std": 0.2736980877816677,
"rewards/accuracy_reward": 0.42232145071029664,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 650.3339591979981,
"epoch": 0.979706088173548,
"grad_norm": 0.5142662525177002,
"kl": 0.21497802734375,
"learning_rate": 2.021111952915447e-06,
"loss": 0.0222,
"reward": 0.43214288018643854,
"reward_std": 0.2758882647380233,
"rewards/accuracy_reward": 0.43214288018643854,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 621.5370178222656,
"epoch": 0.9993002099370188,
"kl": 0.20172991071428573,
"reward": 0.45025512549493996,
"reward_std": 0.22983166708477906,
"rewards/accuracy_reward": 0.45025512549493996,
"step": 357,
"total_flos": 0.0,
"train_loss": 0.01889443341694793,
"train_runtime": 24438.0983,
"train_samples_per_second": 0.409,
"train_steps_per_second": 0.015
}
],
"logging_steps": 10,
"max_steps": 357,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}