Qwen2.5-3B-Open-R1-Code-GRPO-r2 / trainer_state.json
alexshengzhili's picture
Model save
5c8106f verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.3189326556543837,
"eval_steps": 500,
"global_step": 502,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 630.0,
"epoch": 0.0006353240152477764,
"grad_norm": 3.4243216514587402,
"kl": 0.0,
"learning_rate": 3.3333333333333335e-07,
"loss": 0.0,
"reward": 0.2857142984867096,
"reward_std": 0.1951800137758255,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7142857313156128,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 337.14288330078125,
"epoch": 0.0012706480304955528,
"grad_norm": 5.606422424316406,
"kl": 0.0,
"learning_rate": 6.666666666666667e-07,
"loss": 0.0,
"reward": 0.3428571820259094,
"reward_std": 0.09759000688791275,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 517.2857666015625,
"epoch": 0.0019059720457433292,
"grad_norm": 3.686295986175537,
"kl": 0.0004425048828125,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0,
"reward": 0.26031747460365295,
"reward_std": 0.19519509375095367,
"rewards/code_format_reward": 0.5714285969734192,
"rewards/code_reward": 0.0317460335791111,
"rewards/format_reward": 0.5714285969734192,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 638.4285888671875,
"epoch": 0.0025412960609911056,
"grad_norm": 3.40989089012146,
"kl": 0.0004863739013671875,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.0,
"reward": 0.1714285910129547,
"reward_std": 0.21380899846553802,
"rewards/code_format_reward": 0.4285714626312256,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.4285714626312256,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 376.4285888671875,
"epoch": 0.0031766200762388818,
"grad_norm": 4.474236011505127,
"kl": 0.0008697509765625,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0,
"reward": 0.40000003576278687,
"reward_std": 0.40000003576278687,
"rewards/code_format_reward": 0.5714285969734192,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 0.7142857313156128,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 484.71429443359375,
"epoch": 0.0038119440914866584,
"grad_norm": 4.529554843902588,
"kl": 0.0179443359375,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0002,
"reward": 0.2857142984867096,
"reward_std": 0.1951800137758255,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7142857313156128,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 446.0000305175781,
"epoch": 0.0044472681067344345,
"grad_norm": 3.3631019592285156,
"kl": 0.00128936767578125,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.0,
"reward": 0.8025974631309509,
"reward_std": 0.4480050206184387,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.40259745717048645,
"rewards/format_reward": 1.0,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 765.0000610351562,
"epoch": 0.005082592121982211,
"grad_norm": 3.205476999282837,
"kl": 0.000675201416015625,
"learning_rate": 2.666666666666667e-06,
"loss": 0.0,
"reward": 0.1714285910129547,
"reward_std": 0.21380899846553802,
"rewards/code_format_reward": 0.4285714626312256,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.4285714626312256,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 628.5714721679688,
"epoch": 0.005717916137229987,
"grad_norm": 0.008113108575344086,
"kl": 0.001678466796875,
"learning_rate": 3e-06,
"loss": 0.0003,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 934.1428833007812,
"epoch": 0.0063532401524777635,
"grad_norm": 2.805612087249756,
"kl": 0.00390625,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0,
"reward": 0.2857142984867096,
"reward_std": 0.1951800137758255,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7142857313156128,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 733.4285888671875,
"epoch": 0.00698856416772554,
"grad_norm": 0.014334071427583694,
"kl": 0.0016326904296875,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.0003,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 545.5714721679688,
"epoch": 0.007623888182973317,
"grad_norm": 3.0349507331848145,
"kl": 0.003936767578125,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 539.7142944335938,
"epoch": 0.008259212198221092,
"grad_norm": 2.7579803466796875,
"kl": 0.00787353515625,
"learning_rate": 4.333333333333334e-06,
"loss": 0.0001,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 502.5714416503906,
"epoch": 0.008894536213468869,
"grad_norm": 0.06331096589565277,
"kl": 0.0101318359375,
"learning_rate": 4.666666666666667e-06,
"loss": 0.0004,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 583.5714721679688,
"epoch": 0.009529860228716646,
"grad_norm": 1.8358005285263062,
"kl": 0.0576171875,
"learning_rate": 5e-06,
"loss": 0.0009,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 528.4285888671875,
"epoch": 0.010165184243964422,
"grad_norm": 5.376899242401123,
"kl": 0.173828125,
"learning_rate": 4.999952797253148e-06,
"loss": 0.0017,
"reward": 0.971428632736206,
"reward_std": 0.534522533416748,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.5714285969734192,
"rewards/format_reward": 1.0,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 390.5714416503906,
"epoch": 0.010800508259212199,
"grad_norm": 5.1895365715026855,
"kl": 0.1767578125,
"learning_rate": 4.9998111909931225e-06,
"loss": 0.0018,
"reward": 0.5571428537368774,
"reward_std": 0.12051477283239365,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.2142857313156128,
"rewards/format_reward": 0.8571429252624512,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 449.5714416503906,
"epoch": 0.011435832274459974,
"grad_norm": 3.5275230407714844,
"kl": 0.05126953125,
"learning_rate": 4.999575187161439e-06,
"loss": 0.0005,
"reward": 0.6000000238418579,
"reward_std": 0.432049423456192,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 0.8571429252624512,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 523.1428833007812,
"epoch": 0.01207115628970775,
"grad_norm": 3.147331953048706,
"kl": 0.04541015625,
"learning_rate": 4.9992447956603455e-06,
"loss": 0.0005,
"reward": 0.6857143640518188,
"reward_std": 0.4879501163959503,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 1.0,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 330.14288330078125,
"epoch": 0.012706480304955527,
"grad_norm": 108.5281753540039,
"kl": 3.046875,
"learning_rate": 4.998820030352409e-06,
"loss": 0.0305,
"reward": 0.6884711980819702,
"reward_std": 0.4107622504234314,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.31704264879226685,
"rewards/format_reward": 1.0,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 359.0000305175781,
"epoch": 0.013341804320203304,
"grad_norm": 5.423805236816406,
"kl": 0.1591796875,
"learning_rate": 4.998300909059929e-06,
"loss": 0.0016,
"reward": 0.5269841551780701,
"reward_std": 0.1777612417936325,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1269841343164444,
"rewards/format_reward": 1.0,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 491.2857360839844,
"epoch": 0.01397712833545108,
"grad_norm": 1545.3717041015625,
"kl": 23.0,
"learning_rate": 4.997687453564198e-06,
"loss": 0.2309,
"reward": 0.5428571701049805,
"reward_std": 0.6078847646713257,
"rewards/code_format_reward": 0.5714285969734192,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 0.7142857313156128,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 503.5714416503906,
"epoch": 0.014612452350698857,
"grad_norm": 8.096309661865234,
"kl": 0.69140625,
"learning_rate": 4.9969796896045775e-06,
"loss": 0.0069,
"reward": 0.3142857253551483,
"reward_std": 0.10690449923276901,
"rewards/code_format_reward": 0.5714285969734192,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 604.1428833007812,
"epoch": 0.015247776365946633,
"grad_norm": 45758.625,
"kl": 608.0,
"learning_rate": 4.996177646877426e-06,
"loss": 6.0704,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 605.1428833007812,
"epoch": 0.01588310038119441,
"grad_norm": 3.5074126720428467,
"kl": 0.1083984375,
"learning_rate": 4.995281359034851e-06,
"loss": 0.0011,
"reward": 0.05714286118745804,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.1428571492433548,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.1428571492433548,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 476.2857360839844,
"epoch": 0.016518424396442185,
"grad_norm": 9.928363800048828,
"kl": 0.421875,
"learning_rate": 4.994290863683296e-06,
"loss": 0.0042,
"reward": 0.11428572237491608,
"reward_std": 0.1951800137758255,
"rewards/code_format_reward": 0.2857142984867096,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.2857142984867096,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 420.8571472167969,
"epoch": 0.017153748411689963,
"grad_norm": 30.34365463256836,
"kl": 1.2890625,
"learning_rate": 4.99320620238196e-06,
"loss": 0.0129,
"reward": 0.2857142984867096,
"reward_std": 0.1951800137758255,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7142857313156128,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 618.7142944335938,
"epoch": 0.017789072426937738,
"grad_norm": 3.0826454162597656,
"kl": 0.051513671875,
"learning_rate": 4.99202742064106e-06,
"loss": 0.0005,
"reward": 0.22857144474983215,
"reward_std": 0.21380899846553802,
"rewards/code_format_reward": 0.5714285969734192,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.5714285969734192,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 582.5714721679688,
"epoch": 0.018424396442185513,
"grad_norm": 3.330746650695801,
"kl": 0.07666015625,
"learning_rate": 4.990754567919917e-06,
"loss": 0.0008,
"reward": 0.4571428894996643,
"reward_std": 0.35989415645599365,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.1714285910129547,
"rewards/format_reward": 0.7142857313156128,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 440.2857360839844,
"epoch": 0.01905972045743329,
"grad_norm": 3.658856153488159,
"kl": 0.1318359375,
"learning_rate": 4.989387697624881e-06,
"loss": 0.0013,
"reward": 0.3086913228034973,
"reward_std": 0.20198491215705872,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.02297702245414257,
"rewards/format_reward": 0.7142857313156128,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 764.0000610351562,
"epoch": 0.019695044472681066,
"grad_norm": 2.7840328216552734,
"kl": 0.078125,
"learning_rate": 4.987926867107095e-06,
"loss": 0.0008,
"reward": 0.31462588906288147,
"reward_std": 0.19902175664901733,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.028911566361784935,
"rewards/format_reward": 0.7142857313156128,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 573.4285888671875,
"epoch": 0.020330368487928845,
"grad_norm": 3.031372308731079,
"kl": 0.078125,
"learning_rate": 4.986372137660078e-06,
"loss": 0.0008,
"reward": 0.485714316368103,
"reward_std": 0.22677868604660034,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 0.8571429252624512,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 460.14288330078125,
"epoch": 0.02096569250317662,
"grad_norm": 3.5374386310577393,
"kl": 0.107421875,
"learning_rate": 4.984723574517165e-06,
"loss": 0.0011,
"reward": 0.3362637758255005,
"reward_std": 0.17483238875865936,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.021978024393320084,
"rewards/format_reward": 0.8571429252624512,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 510.71429443359375,
"epoch": 0.021601016518424398,
"grad_norm": 3.355870246887207,
"kl": 0.20703125,
"learning_rate": 4.9829812468487655e-06,
"loss": 0.0021,
"reward": 0.2857142984867096,
"reward_std": 0.1951800137758255,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7142857313156128,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 617.7142944335938,
"epoch": 0.022236340533672173,
"grad_norm": 2.325124979019165,
"kl": 0.0849609375,
"learning_rate": 4.981145227759457e-06,
"loss": 0.0008,
"reward": 0.5142857432365417,
"reward_std": 0.39761197566986084,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 329.4285888671875,
"epoch": 0.022871664548919948,
"grad_norm": 4.9638495445251465,
"kl": 0.224609375,
"learning_rate": 4.979215594284924e-06,
"loss": 0.0022,
"reward": 0.37142860889434814,
"reward_std": 0.07559289783239365,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 734.5714721679688,
"epoch": 0.023506988564167726,
"grad_norm": 0.06015148386359215,
"kl": 0.08154296875,
"learning_rate": 4.977192427388722e-06,
"loss": 0.0011,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 481.5714416503906,
"epoch": 0.0241423125794155,
"grad_norm": 0.20115888118743896,
"kl": 0.12109375,
"learning_rate": 4.9750758119588824e-06,
"loss": 0.0015,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 817.0000610351562,
"epoch": 0.02477763659466328,
"grad_norm": 2.3490183353424072,
"kl": 0.0888671875,
"learning_rate": 4.972865836804349e-06,
"loss": 0.0009,
"reward": 0.5428571701049805,
"reward_std": 0.3779645562171936,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 622.1428833007812,
"epoch": 0.025412960609911054,
"grad_norm": 40.009159088134766,
"kl": 2.34375,
"learning_rate": 4.970562594651254e-06,
"loss": 0.0234,
"reward": 0.37142860889434814,
"reward_std": 0.07559289783239365,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 498.71429443359375,
"epoch": 0.026048284625158832,
"grad_norm": 3.2044177055358887,
"kl": 0.12109375,
"learning_rate": 4.968166182139026e-06,
"loss": 0.0012,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 480.4285888671875,
"epoch": 0.026683608640406607,
"grad_norm": 0.08217895030975342,
"kl": 0.1484375,
"learning_rate": 4.9656766998163306e-06,
"loss": 0.0018,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 699.5714721679688,
"epoch": 0.027318932655654382,
"grad_norm": 3.2091763019561768,
"kl": 0.0966796875,
"learning_rate": 4.963094252136865e-06,
"loss": 0.001,
"reward": 0.5428571701049805,
"reward_std": 0.3779645562171936,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 595.857177734375,
"epoch": 0.02795425667090216,
"grad_norm": 0.07719128578901291,
"kl": 0.115234375,
"learning_rate": 4.960418947454958e-06,
"loss": 0.0014,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 598.0,
"epoch": 0.028589580686149935,
"grad_norm": 0.10212292522192001,
"kl": 0.1298828125,
"learning_rate": 4.957650898021038e-06,
"loss": 0.0016,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 517.857177734375,
"epoch": 0.029224904701397714,
"grad_norm": 2.6977856159210205,
"kl": 0.1337890625,
"learning_rate": 4.954790219976915e-06,
"loss": 0.0013,
"reward": 0.6857143640518188,
"reward_std": 0.4879501163959503,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 1.0,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 699.5714721679688,
"epoch": 0.02986022871664549,
"grad_norm": 0.09081219881772995,
"kl": 0.10888671875,
"learning_rate": 4.95183703335091e-06,
"loss": 0.0014,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 375.5714416503906,
"epoch": 0.030495552731893267,
"grad_norm": 3.1593081951141357,
"kl": 0.162109375,
"learning_rate": 4.948791462052819e-06,
"loss": 0.0016,
"reward": 0.9428572654724121,
"reward_std": 0.5740416646003723,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.5714285969734192,
"rewards/format_reward": 1.0,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 605.5714721679688,
"epoch": 0.031130876747141042,
"grad_norm": 0.2238318771123886,
"kl": 0.1396484375,
"learning_rate": 4.945653633868716e-06,
"loss": 0.0017,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 519.0,
"epoch": 0.03176620076238882,
"grad_norm": 3.9522361755371094,
"kl": 0.1796875,
"learning_rate": 4.942423680455584e-06,
"loss": 0.0018,
"reward": 0.5428571701049805,
"reward_std": 0.3779645562171936,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 577.0,
"epoch": 0.032401524777636595,
"grad_norm": 3.095000743865967,
"kl": 0.15234375,
"learning_rate": 4.939101737335802e-06,
"loss": 0.0015,
"reward": 0.11428572237491608,
"reward_std": 0.1951800137758255,
"rewards/code_format_reward": 0.2857142984867096,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.2857142984867096,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 651.2857666015625,
"epoch": 0.03303684879288437,
"grad_norm": 2.880178451538086,
"kl": 0.14453125,
"learning_rate": 4.935687943891447e-06,
"loss": 0.0015,
"reward": 0.22857144474983215,
"reward_std": 0.21380899846553802,
"rewards/code_format_reward": 0.5714285969734192,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.5714285969734192,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 570.4285888671875,
"epoch": 0.033672172808132145,
"grad_norm": 6.103281021118164,
"kl": 0.421875,
"learning_rate": 4.932182443358458e-06,
"loss": 0.0042,
"reward": 0.22857144474983215,
"reward_std": 0.21380899846553802,
"rewards/code_format_reward": 0.5714285969734192,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.5714285969734192,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 461.857177734375,
"epoch": 0.03430749682337993,
"grad_norm": 5.561570644378662,
"kl": 0.283203125,
"learning_rate": 4.928585382820616e-06,
"loss": 0.0028,
"reward": 0.3142857253551483,
"reward_std": 0.15735916793346405,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 710.0000610351562,
"epoch": 0.0349428208386277,
"grad_norm": 0.06716505438089371,
"kl": 0.1875,
"learning_rate": 4.924896913203376e-06,
"loss": 0.0022,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 725.5714721679688,
"epoch": 0.035578144853875476,
"grad_norm": 4.731691837310791,
"kl": 0.130859375,
"learning_rate": 4.921117189267535e-06,
"loss": 0.0013,
"reward": 0.3588235676288605,
"reward_std": 0.43374723196029663,
"rewards/code_format_reward": 0.5714285969734192,
"rewards/code_reward": 0.13025210797786713,
"rewards/format_reward": 0.5714285969734192,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 665.2857666015625,
"epoch": 0.03621346886912325,
"grad_norm": 2.7444961071014404,
"kl": 0.18359375,
"learning_rate": 4.917246369602742e-06,
"loss": 0.0018,
"reward": 0.4285714626312256,
"reward_std": 0.4680252969264984,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 0.7142857313156128,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 648.0,
"epoch": 0.036848792884371026,
"grad_norm": 3.6189987659454346,
"kl": 0.20703125,
"learning_rate": 4.9132846166208355e-06,
"loss": 0.0021,
"reward": 0.5142857432365417,
"reward_std": 0.6309479475021362,
"rewards/code_format_reward": 0.5714285969734192,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 0.5714285969734192,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 483.14288330078125,
"epoch": 0.03748411689961881,
"grad_norm": 0.15579625964164734,
"kl": 0.26953125,
"learning_rate": 4.9092320965490365e-06,
"loss": 0.003,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 599.857177734375,
"epoch": 0.03811944091486658,
"grad_norm": 3.2243692874908447,
"kl": 0.154296875,
"learning_rate": 4.905088979422971e-06,
"loss": 0.0015,
"reward": 0.2857142984867096,
"reward_std": 0.1951800137758255,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7142857313156128,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 446.0000305175781,
"epoch": 0.03875476493011436,
"grad_norm": 3.187040090560913,
"kl": 0.2392578125,
"learning_rate": 4.900855439079536e-06,
"loss": 0.0024,
"reward": 0.6857143640518188,
"reward_std": 0.4879501163959503,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 1.0,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 619.7142944335938,
"epoch": 0.03939008894536213,
"grad_norm": 3.501086473464966,
"kl": 0.205078125,
"learning_rate": 4.8965316531496055e-06,
"loss": 0.0021,
"reward": 0.3428571820259094,
"reward_std": 0.09759001433849335,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 598.0,
"epoch": 0.040025412960609914,
"grad_norm": 4.5344557762146,
"kl": 0.1484375,
"learning_rate": 4.892117803050578e-06,
"loss": 0.0015,
"reward": 0.22857144474983215,
"reward_std": 0.21380899846553802,
"rewards/code_format_reward": 0.5714285969734192,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.5714285969734192,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 507.0000305175781,
"epoch": 0.04066073697585769,
"grad_norm": 6.242559432983398,
"kl": 0.392578125,
"learning_rate": 4.887614073978761e-06,
"loss": 0.0039,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 757.1428833007812,
"epoch": 0.041296060991105464,
"grad_norm": 10.64748764038086,
"kl": 0.53515625,
"learning_rate": 4.883020654901609e-06,
"loss": 0.0054,
"reward": 0.22857144474983215,
"reward_std": 0.21380899846553802,
"rewards/code_format_reward": 0.5714285969734192,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.5714285969734192,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 585.4285888671875,
"epoch": 0.04193138500635324,
"grad_norm": 3.1377880573272705,
"kl": 0.2119140625,
"learning_rate": 4.878337738549785e-06,
"loss": 0.0021,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 604.2857666015625,
"epoch": 0.042566709021601014,
"grad_norm": 0.17088104784488678,
"kl": 0.2490234375,
"learning_rate": 4.873565521409082e-06,
"loss": 0.0028,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 842.5714721679688,
"epoch": 0.043202033036848796,
"grad_norm": 3.577969551086426,
"kl": 0.384765625,
"learning_rate": 4.868704203712173e-06,
"loss": 0.0038,
"reward": 0.2857142984867096,
"reward_std": 0.1951800137758255,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7142857313156128,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 841.0000610351562,
"epoch": 0.04383735705209657,
"grad_norm": 3.18711256980896,
"kl": 0.2080078125,
"learning_rate": 4.86375398943021e-06,
"loss": 0.0021,
"reward": 0.3142857253551483,
"reward_std": 0.15735916793346405,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 525.4285888671875,
"epoch": 0.044472681067344345,
"grad_norm": 2.3753271102905273,
"kl": 0.177734375,
"learning_rate": 4.858715086264274e-06,
"loss": 0.0018,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 406.8571472167969,
"epoch": 0.04510800508259212,
"grad_norm": 3.5183515548706055,
"kl": 0.2138671875,
"learning_rate": 4.853587705636646e-06,
"loss": 0.0021,
"reward": 0.8285714983940125,
"reward_std": 0.534522533416748,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.4285714626312256,
"rewards/format_reward": 1.0,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 630.5714721679688,
"epoch": 0.045743329097839895,
"grad_norm": 2.6844465732574463,
"kl": 0.103515625,
"learning_rate": 4.84837206268195e-06,
"loss": 0.001,
"reward": 0.3142857253551483,
"reward_std": 0.15735916793346405,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 464.14288330078125,
"epoch": 0.04637865311308768,
"grad_norm": 3.63313627243042,
"kl": 0.1591796875,
"learning_rate": 4.8430683762381195e-06,
"loss": 0.0016,
"reward": 0.2571428716182709,
"reward_std": 0.19023796916007996,
"rewards/code_format_reward": 0.5714285969734192,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7142857313156128,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 507.2857360839844,
"epoch": 0.04701397712833545,
"grad_norm": 2.707540988922119,
"kl": 0.111328125,
"learning_rate": 4.837676868837213e-06,
"loss": 0.0011,
"reward": 0.4023166298866272,
"reward_std": 0.004252949263900518,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0023166025057435036,
"rewards/format_reward": 1.0,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 558.857177734375,
"epoch": 0.04764930114358323,
"grad_norm": 3.0911412239074707,
"kl": 0.1474609375,
"learning_rate": 4.832197766696085e-06,
"loss": 0.0015,
"reward": 0.3142857253551483,
"reward_std": 0.15735916793346405,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 460.4285888671875,
"epoch": 0.048284625158831,
"grad_norm": 3.5635647773742676,
"kl": 0.1357421875,
"learning_rate": 4.826631299706887e-06,
"loss": 0.0014,
"reward": 0.5032258033752441,
"reward_std": 0.2984292209148407,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.18894009292125702,
"rewards/format_reward": 0.8571429252624512,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 382.2857360839844,
"epoch": 0.04891994917407878,
"grad_norm": 0.05187558755278587,
"kl": 0.1611328125,
"learning_rate": 4.820977701427424e-06,
"loss": 0.0019,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 466.14288330078125,
"epoch": 0.04955527318932656,
"grad_norm": 0.04543861746788025,
"kl": 0.130859375,
"learning_rate": 4.81523720907136e-06,
"loss": 0.0016,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 551.1428833007812,
"epoch": 0.05019059720457433,
"grad_norm": 0.45131492614746094,
"kl": 0.28515625,
"learning_rate": 4.809410063498254e-06,
"loss": 0.0032,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 379.5714416503906,
"epoch": 0.05082592121982211,
"grad_norm": 3.854992151260376,
"kl": 0.1591796875,
"learning_rate": 4.8034965092034656e-06,
"loss": 0.0016,
"reward": 1.2000001668930054,
"reward_std": 0.3829708993434906,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.8571429252624512,
"rewards/format_reward": 0.8571429252624512,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 566.857177734375,
"epoch": 0.05146124523506988,
"grad_norm": 41.30952453613281,
"kl": 1.9921875,
"learning_rate": 4.797496794307889e-06,
"loss": 0.0199,
"reward": 0.2489795982837677,
"reward_std": 0.23831285536289215,
"rewards/code_format_reward": 0.5714285969734192,
"rewards/code_reward": 0.020408164709806442,
"rewards/format_reward": 0.5714285969734192,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 448.71429443359375,
"epoch": 0.052096569250317665,
"grad_norm": 4.601639270782471,
"kl": 0.2412109375,
"learning_rate": 4.791411170547545e-06,
"loss": 0.0024,
"reward": 0.2857142984867096,
"reward_std": 0.1951800137758255,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7142857313156128,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 614.5714721679688,
"epoch": 0.05273189326556544,
"grad_norm": 4.209871768951416,
"kl": 0.396484375,
"learning_rate": 4.785239893263017e-06,
"loss": 0.004,
"reward": 0.3142857253551483,
"reward_std": 0.15735916793346405,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 410.4285888671875,
"epoch": 0.053367217280813214,
"grad_norm": 4.049778938293457,
"kl": 0.171875,
"learning_rate": 4.778983221388742e-06,
"loss": 0.0017,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 421.71429443359375,
"epoch": 0.05400254129606099,
"grad_norm": 5.517159461975098,
"kl": 0.318359375,
"learning_rate": 4.77264141744214e-06,
"loss": 0.0032,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 405.8571472167969,
"epoch": 0.054637865311308764,
"grad_norm": 1.0127395391464233,
"kl": 0.37109375,
"learning_rate": 4.766214747512603e-06,
"loss": 0.004,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 502.2857360839844,
"epoch": 0.055273189326556546,
"grad_norm": 5.1016316413879395,
"kl": 0.625,
"learning_rate": 4.759703481250331e-06,
"loss": 0.0062,
"reward": 0.2857142984867096,
"reward_std": 0.1951800137758255,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7142857313156128,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 415.5714416503906,
"epoch": 0.05590851334180432,
"grad_norm": 3205.96337890625,
"kl": 94.0,
"learning_rate": 4.753107891855015e-06,
"loss": 0.9386,
"reward": 0.2857142984867096,
"reward_std": 0.1951800137758255,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7142857313156128,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 162.0,
"epoch": 0.056543837357052096,
"grad_norm": 5.175131797790527,
"kl": 0.349609375,
"learning_rate": 4.746428256064375e-06,
"loss": 0.0035,
"reward": 0.971428632736206,
"reward_std": 0.534522533416748,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.5714285969734192,
"rewards/format_reward": 1.0,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 255.2857208251953,
"epoch": 0.05717916137229987,
"grad_norm": 4.350471496582031,
"kl": 0.251953125,
"learning_rate": 4.7396648541425534e-06,
"loss": 0.0025,
"reward": 0.9142858386039734,
"reward_std": 0.6202918887138367,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.5714285969734192,
"rewards/format_reward": 0.8571429252624512,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 515.7142944335938,
"epoch": 0.05781448538754765,
"grad_norm": 3.159823417663574,
"kl": 0.1904296875,
"learning_rate": 4.732817969868348e-06,
"loss": 0.0019,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 504.5714416503906,
"epoch": 0.05844980940279543,
"grad_norm": 3.7896342277526855,
"kl": 0.2353515625,
"learning_rate": 4.7258878905233095e-06,
"loss": 0.0023,
"reward": 0.5428571701049805,
"reward_std": 0.37796446681022644,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 355.5714416503906,
"epoch": 0.0590851334180432,
"grad_norm": 2.87099289894104,
"kl": 0.1962890625,
"learning_rate": 4.718874906879688e-06,
"loss": 0.002,
"reward": 0.5428571701049805,
"reward_std": 0.3779645264148712,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 561.0,
"epoch": 0.05972045743329098,
"grad_norm": 3.082097053527832,
"kl": 0.220703125,
"learning_rate": 4.711779313188231e-06,
"loss": 0.0022,
"reward": 0.37142860889434814,
"reward_std": 0.07559289783239365,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 430.0000305175781,
"epoch": 0.06035578144853875,
"grad_norm": 0.06256424635648727,
"kl": 0.2138671875,
"learning_rate": 4.70460140716584e-06,
"loss": 0.0024,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 695.1428833007812,
"epoch": 0.060991105463786534,
"grad_norm": 3.0291473865509033,
"kl": 0.412109375,
"learning_rate": 4.697341489983076e-06,
"loss": 0.0041,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 459.4285888671875,
"epoch": 0.06162642947903431,
"grad_norm": 3.461120843887329,
"kl": 0.291015625,
"learning_rate": 4.6899998662515215e-06,
"loss": 0.0029,
"reward": 0.4714285731315613,
"reward_std": 0.06681530177593231,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0714285746216774,
"rewards/format_reward": 1.0,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 626.4285888671875,
"epoch": 0.062261753494282084,
"grad_norm": 0.07474807649850845,
"kl": 0.3203125,
"learning_rate": 4.682576844011007e-06,
"loss": 0.0035,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 594.2857666015625,
"epoch": 0.06289707750952986,
"grad_norm": 0.3679976463317871,
"kl": 0.28515625,
"learning_rate": 4.675072734716678e-06,
"loss": 0.0031,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 627.1428833007812,
"epoch": 0.06353240152477764,
"grad_norm": 39.379425048828125,
"kl": 1.5390625,
"learning_rate": 4.667487853225931e-06,
"loss": 0.0153,
"reward": 0.485714316368103,
"reward_std": 0.42983949184417725,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 0.8571429252624512,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 636.0,
"epoch": 0.06416772554002541,
"grad_norm": 0.5123635530471802,
"kl": 0.1962890625,
"learning_rate": 4.659822517785203e-06,
"loss": 0.0023,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 623.7142944335938,
"epoch": 0.06480304955527319,
"grad_norm": 2.7022430896759033,
"kl": 0.2314453125,
"learning_rate": 4.6520770500166165e-06,
"loss": 0.0023,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 338.4285888671875,
"epoch": 0.06543837357052097,
"grad_norm": 3.6665234565734863,
"kl": 0.294921875,
"learning_rate": 4.644251774904487e-06,
"loss": 0.0029,
"reward": 1.057142972946167,
"reward_std": 0.5968170166015625,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.7142857313156128,
"rewards/format_reward": 0.8571429252624512,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 563.5714721679688,
"epoch": 0.06607369758576874,
"grad_norm": 0.09322147816419601,
"kl": 0.23828125,
"learning_rate": 4.636347020781684e-06,
"loss": 0.0027,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 734.1428833007812,
"epoch": 0.06670902160101652,
"grad_norm": 2.2183244228363037,
"kl": 0.177734375,
"learning_rate": 4.6283631193158605e-06,
"loss": 0.0018,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 495.4285888671875,
"epoch": 0.06734434561626429,
"grad_norm": 3.992103338241577,
"kl": 0.345703125,
"learning_rate": 4.620300405495532e-06,
"loss": 0.0035,
"reward": 0.2571428716182709,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.4285714626312256,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 449.4285888671875,
"epoch": 0.06797966963151207,
"grad_norm": 4.281430244445801,
"kl": 0.26953125,
"learning_rate": 4.612159217616022e-06,
"loss": 0.0027,
"reward": 0.2571428716182709,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.4285714626312256,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 463.2857360839844,
"epoch": 0.06861499364675985,
"grad_norm": 3.0798134803771973,
"kl": 0.2158203125,
"learning_rate": 4.603939897265268e-06,
"loss": 0.0022,
"reward": 0.5067504644393921,
"reward_std": 0.28243502974510193,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.10675039887428284,
"rewards/format_reward": 1.0,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 534.7142944335938,
"epoch": 0.06925031766200762,
"grad_norm": 3.179516077041626,
"kl": 0.236328125,
"learning_rate": 4.595642789309492e-06,
"loss": 0.0024,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 673.1428833007812,
"epoch": 0.0698856416772554,
"grad_norm": 2.751520872116089,
"kl": 0.228515625,
"learning_rate": 4.587268241878724e-06,
"loss": 0.0023,
"reward": 0.3142857253551483,
"reward_std": 0.15735916793346405,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 506.71429443359375,
"epoch": 0.07052096569250317,
"grad_norm": 0.06256023049354553,
"kl": 0.2490234375,
"learning_rate": 4.578816606352205e-06,
"loss": 0.0028,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 705.5714721679688,
"epoch": 0.07115628970775095,
"grad_norm": 0.08651240170001984,
"kl": 0.1826171875,
"learning_rate": 4.570288237343632e-06,
"loss": 0.0021,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 667.2857666015625,
"epoch": 0.07179161372299873,
"grad_norm": 1.87228262424469,
"kl": 0.1552734375,
"learning_rate": 4.561683492686289e-06,
"loss": 0.0016,
"reward": 0.37142857909202576,
"reward_std": 0.335232675075531,
"rewards/code_format_reward": 0.5714285969734192,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 0.5714285969734192,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 511.857177734375,
"epoch": 0.0724269377382465,
"grad_norm": 0.06886231899261475,
"kl": 0.21875,
"learning_rate": 4.5530027334180285e-06,
"loss": 0.0025,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 746.857177734375,
"epoch": 0.07306226175349428,
"grad_norm": 2.801109552383423,
"kl": 0.2060546875,
"learning_rate": 4.544246323766122e-06,
"loss": 0.0021,
"reward": 0.22857144474983215,
"reward_std": 0.21380899846553802,
"rewards/code_format_reward": 0.5714285969734192,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.5714285969734192,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 639.5714721679688,
"epoch": 0.07369758576874205,
"grad_norm": 2.9071028232574463,
"kl": 0.2373046875,
"learning_rate": 4.535414631131983e-06,
"loss": 0.0024,
"reward": 0.37142860889434814,
"reward_std": 0.07559289783239365,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 760.2857666015625,
"epoch": 0.07433290978398983,
"grad_norm": 2.4393320083618164,
"kl": 0.201171875,
"learning_rate": 4.526508026075746e-06,
"loss": 0.002,
"reward": 0.2571428716182709,
"reward_std": 0.19023796916007996,
"rewards/code_format_reward": 0.5714285969734192,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7142857313156128,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 729.7142944335938,
"epoch": 0.07496823379923762,
"grad_norm": 7.918629169464111,
"kl": 0.671875,
"learning_rate": 4.517526882300721e-06,
"loss": 0.0067,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 745.1428833007812,
"epoch": 0.07560355781448538,
"grad_norm": 2.229508876800537,
"kl": 0.185546875,
"learning_rate": 4.508471576637713e-06,
"loss": 0.0019,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 508.4285888671875,
"epoch": 0.07623888182973317,
"grad_norm": 2.4816155433654785,
"kl": 0.1845703125,
"learning_rate": 4.499342489029211e-06,
"loss": 0.0018,
"reward": 0.6857143640518188,
"reward_std": 0.4879501163959503,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 1.0,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 535.857177734375,
"epoch": 0.07687420584498093,
"grad_norm": 3.3153207302093506,
"kl": 0.2490234375,
"learning_rate": 4.490140002513449e-06,
"loss": 0.0025,
"reward": 0.43708792328834534,
"reward_std": 0.0981253907084465,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.03708791360259056,
"rewards/format_reward": 1.0,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 776.1428833007812,
"epoch": 0.07750952986022872,
"grad_norm": 2.2401111125946045,
"kl": 0.1484375,
"learning_rate": 4.48086450320833e-06,
"loss": 0.0015,
"reward": 0.22857144474983215,
"reward_std": 0.21380899846553802,
"rewards/code_format_reward": 0.5714285969734192,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.5714285969734192,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 764.7142944335938,
"epoch": 0.0781448538754765,
"grad_norm": 2.0424013137817383,
"kl": 0.1806640625,
"learning_rate": 4.4715163802952266e-06,
"loss": 0.0018,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 543.7142944335938,
"epoch": 0.07878017789072427,
"grad_norm": 0.1234215497970581,
"kl": 0.25390625,
"learning_rate": 4.462096026002655e-06,
"loss": 0.0028,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 741.5714721679688,
"epoch": 0.07941550190597205,
"grad_norm": 2.020581007003784,
"kl": 0.1884765625,
"learning_rate": 4.4526038355898144e-06,
"loss": 0.0019,
"reward": 0.36326533555984497,
"reward_std": 0.16554531455039978,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.020408164709806442,
"rewards/format_reward": 0.8571429252624512,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 593.857177734375,
"epoch": 0.08005082592121983,
"grad_norm": 2.3378539085388184,
"kl": 0.2373046875,
"learning_rate": 4.4430402073300035e-06,
"loss": 0.0024,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 599.5714721679688,
"epoch": 0.0806861499364676,
"grad_norm": 0.06773251295089722,
"kl": 0.234375,
"learning_rate": 4.433405542493909e-06,
"loss": 0.0026,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 703.1428833007812,
"epoch": 0.08132147395171538,
"grad_norm": 3.0085108280181885,
"kl": 0.1630859375,
"learning_rate": 4.4237002453327734e-06,
"loss": 0.0016,
"reward": 0.41020408272743225,
"reward_std": 0.026997461915016174,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.010204082354903221,
"rewards/format_reward": 1.0,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 363.2857360839844,
"epoch": 0.08195679796696315,
"grad_norm": 3.266486644744873,
"kl": 0.171875,
"learning_rate": 4.4139247230614245e-06,
"loss": 0.0017,
"reward": 0.4520833492279053,
"reward_std": 0.12870661914348602,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0520833320915699,
"rewards/format_reward": 1.0,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 395.2857360839844,
"epoch": 0.08259212198221093,
"grad_norm": 3.562208414077759,
"kl": 0.322265625,
"learning_rate": 4.404079385841201e-06,
"loss": 0.0032,
"reward": 0.4571428894996643,
"reward_std": 0.2507132887840271,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 0.8571429252624512,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 367.0000305175781,
"epoch": 0.08322744599745871,
"grad_norm": 0.06771758943796158,
"kl": 0.244140625,
"learning_rate": 4.394164646762734e-06,
"loss": 0.0028,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 523.857177734375,
"epoch": 0.08386277001270648,
"grad_norm": 2.9015965461730957,
"kl": 0.220703125,
"learning_rate": 4.384180921828618e-06,
"loss": 0.0022,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 519.2857666015625,
"epoch": 0.08449809402795426,
"grad_norm": 3.0210537910461426,
"kl": 0.1689453125,
"learning_rate": 4.374128629935955e-06,
"loss": 0.0017,
"reward": 0.9833123087882996,
"reward_std": 0.4177902638912201,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.5833122730255127,
"rewards/format_reward": 1.0,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 358.5714416503906,
"epoch": 0.08513341804320203,
"grad_norm": 2.964980363845825,
"kl": 0.1962890625,
"learning_rate": 4.364008192858781e-06,
"loss": 0.002,
"reward": 0.4098522365093231,
"reward_std": 0.016825860366225243,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.009852217510342598,
"rewards/format_reward": 1.0,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 473.5714416503906,
"epoch": 0.08576874205844981,
"grad_norm": 2.856916666030884,
"kl": 0.2119140625,
"learning_rate": 4.353820035230366e-06,
"loss": 0.0021,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 507.2857360839844,
"epoch": 0.08640406607369759,
"grad_norm": 3.613862991333008,
"kl": 0.169921875,
"learning_rate": 4.3435645845254e-06,
"loss": 0.0017,
"reward": 0.6571429371833801,
"reward_std": 0.5126960277557373,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 1.0,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 451.4285888671875,
"epoch": 0.08703939008894536,
"grad_norm": 0.05980609729886055,
"kl": 0.177734375,
"learning_rate": 4.333242271042054e-06,
"loss": 0.0021,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 515.7142944335938,
"epoch": 0.08767471410419314,
"grad_norm": 3.2973406314849854,
"kl": 0.248046875,
"learning_rate": 4.32285352788393e-06,
"loss": 0.0025,
"reward": 0.5261905193328857,
"reward_std": 0.28607451915740967,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.1547619104385376,
"rewards/format_reward": 1.0,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 384.14288330078125,
"epoch": 0.08831003811944091,
"grad_norm": 2.6424343585968018,
"kl": 0.31640625,
"learning_rate": 4.312398790941882e-06,
"loss": 0.0032,
"reward": 0.3142857253551483,
"reward_std": 0.15735916793346405,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 326.8571472167969,
"epoch": 0.08894536213468869,
"grad_norm": 3.2031986713409424,
"kl": 0.1884765625,
"learning_rate": 4.301878498875735e-06,
"loss": 0.0019,
"reward": 0.971428632736206,
"reward_std": 0.534522533416748,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.5714285969734192,
"rewards/format_reward": 1.0,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 366.71429443359375,
"epoch": 0.08958068614993647,
"grad_norm": 0.29011043906211853,
"kl": 0.232421875,
"learning_rate": 4.291293093095873e-06,
"loss": 0.0026,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 353.5714416503906,
"epoch": 0.09021601016518424,
"grad_norm": 0.05228522792458534,
"kl": 0.1728515625,
"learning_rate": 4.280643017744723e-06,
"loss": 0.002,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 571.857177734375,
"epoch": 0.09085133418043202,
"grad_norm": 4.668937683105469,
"kl": 0.255859375,
"learning_rate": 4.269928719678117e-06,
"loss": 0.0026,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 441.0000305175781,
"epoch": 0.09148665819567979,
"grad_norm": 3.7214486598968506,
"kl": 0.42578125,
"learning_rate": 4.2591506484465426e-06,
"loss": 0.0042,
"reward": 0.3428571820259094,
"reward_std": 0.09759001433849335,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 493.14288330078125,
"epoch": 0.09212198221092757,
"grad_norm": 0.11755923926830292,
"kl": 0.28515625,
"learning_rate": 4.248309256276283e-06,
"loss": 0.0031,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 365.4285888671875,
"epoch": 0.09275730622617535,
"grad_norm": 2.491922378540039,
"kl": 0.2099609375,
"learning_rate": 4.23740499805044e-06,
"loss": 0.0021,
"reward": 0.38035714626312256,
"reward_std": 0.0817723423242569,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.008928571827709675,
"rewards/format_reward": 1.0,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 363.4285888671875,
"epoch": 0.09339263024142312,
"grad_norm": 3.340977191925049,
"kl": 0.185546875,
"learning_rate": 4.22643833128985e-06,
"loss": 0.0018,
"reward": 0.5428571701049805,
"reward_std": 0.37796446681022644,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 244.57144165039062,
"epoch": 0.0940279542566709,
"grad_norm": 0.05019477382302284,
"kl": 0.2255859375,
"learning_rate": 4.215409716133885e-06,
"loss": 0.0035,
"reward": 1.4000002145767212,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 372.2857360839844,
"epoch": 0.09466327827191867,
"grad_norm": 3.1343963146209717,
"kl": 0.34375,
"learning_rate": 4.204319615321151e-06,
"loss": 0.0034,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 355.5714416503906,
"epoch": 0.09529860228716645,
"grad_norm": 3.144193172454834,
"kl": 0.35546875,
"learning_rate": 4.193168494170065e-06,
"loss": 0.0035,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 438.857177734375,
"epoch": 0.09593392630241424,
"grad_norm": 3.4582438468933105,
"kl": 0.27734375,
"learning_rate": 4.181956820559339e-06,
"loss": 0.0028,
"reward": 0.5428571701049805,
"reward_std": 0.37796446681022644,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 556.857177734375,
"epoch": 0.096569250317662,
"grad_norm": 2.9669277667999268,
"kl": 0.259765625,
"learning_rate": 4.170685064908342e-06,
"loss": 0.0026,
"reward": 0.4870130121707916,
"reward_std": 0.23021474480628967,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.08701299130916595,
"rewards/format_reward": 1.0,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 561.1428833007812,
"epoch": 0.09720457433290978,
"grad_norm": 0.06397932022809982,
"kl": 0.265625,
"learning_rate": 4.159353700157365e-06,
"loss": 0.0029,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 382.8571472167969,
"epoch": 0.09783989834815757,
"grad_norm": 0.12080562859773636,
"kl": 0.369140625,
"learning_rate": 4.14796320174778e-06,
"loss": 0.004,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 583.7142944335938,
"epoch": 0.09847522236340533,
"grad_norm": 0.06819948554039001,
"kl": 0.298828125,
"learning_rate": 4.136514047602087e-06,
"loss": 0.0033,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 347.14288330078125,
"epoch": 0.09911054637865312,
"grad_norm": 0.07698381692171097,
"kl": 0.3984375,
"learning_rate": 4.1250067181038635e-06,
"loss": 0.0043,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 692.1428833007812,
"epoch": 0.09974587039390088,
"grad_norm": 0.17737269401550293,
"kl": 0.28125,
"learning_rate": 4.113441696077608e-06,
"loss": 0.0031,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 341.5714416503906,
"epoch": 0.10038119440914867,
"grad_norm": 4.598702907562256,
"kl": 0.4296875,
"learning_rate": 4.101819466768484e-06,
"loss": 0.0043,
"reward": 0.8134453892707825,
"reward_std": 0.18902148306369781,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.4134454131126404,
"rewards/format_reward": 1.0,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 455.2857360839844,
"epoch": 0.10101651842439645,
"grad_norm": 0.07675225287675858,
"kl": 0.302734375,
"learning_rate": 4.0901405178219535e-06,
"loss": 0.0033,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 752.2857666015625,
"epoch": 0.10165184243964422,
"grad_norm": 1.9616703987121582,
"kl": 0.2080078125,
"learning_rate": 4.078405339263326e-06,
"loss": 0.0021,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 349.8571472167969,
"epoch": 0.102287166454892,
"grad_norm": 3.1100926399230957,
"kl": 0.2451171875,
"learning_rate": 4.06661442347719e-06,
"loss": 0.0025,
"reward": 0.8285714983940125,
"reward_std": 0.534522533416748,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.4285714626312256,
"rewards/format_reward": 1.0,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 616.1428833007812,
"epoch": 0.10292249047013977,
"grad_norm": 2.4743812084198,
"kl": 0.296875,
"learning_rate": 4.054768265186758e-06,
"loss": 0.003,
"reward": 0.4129870533943176,
"reward_std": 0.034360405057668686,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.012987013906240463,
"rewards/format_reward": 1.0,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 359.4285888671875,
"epoch": 0.10355781448538755,
"grad_norm": 3.092137336730957,
"kl": 0.3046875,
"learning_rate": 4.0428673614331036e-06,
"loss": 0.0031,
"reward": 0.971428632736206,
"reward_std": 0.534522533416748,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.5714285969734192,
"rewards/format_reward": 1.0,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 439.5714416503906,
"epoch": 0.10419313850063533,
"grad_norm": 1.3710603713989258,
"kl": 0.302734375,
"learning_rate": 4.030912211554316e-06,
"loss": 0.0033,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 477.5714416503906,
"epoch": 0.1048284625158831,
"grad_norm": 2.647684097290039,
"kl": 0.1826171875,
"learning_rate": 4.018903317164539e-06,
"loss": 0.0018,
"reward": 0.5428571701049805,
"reward_std": 0.3779645562171936,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 545.857177734375,
"epoch": 0.10546378653113088,
"grad_norm": 0.07831098884344101,
"kl": 0.1650390625,
"learning_rate": 4.006841182132932e-06,
"loss": 0.0019,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 517.857177734375,
"epoch": 0.10609911054637865,
"grad_norm": 0.6324580311775208,
"kl": 0.296875,
"learning_rate": 3.9947263125625195e-06,
"loss": 0.0033,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 451.5714416503906,
"epoch": 0.10673443456162643,
"grad_norm": 2.877990961074829,
"kl": 0.166015625,
"learning_rate": 3.982559216768967e-06,
"loss": 0.0017,
"reward": 0.6857143640518188,
"reward_std": 0.4879501163959503,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 1.0,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 527.2857666015625,
"epoch": 0.10736975857687421,
"grad_norm": 0.31927597522735596,
"kl": 0.310546875,
"learning_rate": 3.970340405259245e-06,
"loss": 0.0034,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 545.7142944335938,
"epoch": 0.10800508259212198,
"grad_norm": 2.99507474899292,
"kl": 0.28125,
"learning_rate": 3.958070390710214e-06,
"loss": 0.0028,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 430.71429443359375,
"epoch": 0.10864040660736976,
"grad_norm": 3.1845176219940186,
"kl": 0.302734375,
"learning_rate": 3.945749687947109e-06,
"loss": 0.003,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 496.4285888671875,
"epoch": 0.10927573062261753,
"grad_norm": 2.674358606338501,
"kl": 0.375,
"learning_rate": 3.933378813921942e-06,
"loss": 0.0037,
"reward": 0.46666669845581055,
"reward_std": 0.11547007411718369,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.06666667759418488,
"rewards/format_reward": 1.0,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 722.1428833007812,
"epoch": 0.10991105463786531,
"grad_norm": 1.7992055416107178,
"kl": 0.205078125,
"learning_rate": 3.920958287691811e-06,
"loss": 0.002,
"reward": 0.3142857253551483,
"reward_std": 0.15735916793346405,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 436.71429443359375,
"epoch": 0.11054637865311309,
"grad_norm": 0.03782209753990173,
"kl": 0.1591796875,
"learning_rate": 3.908488630397121e-06,
"loss": 0.0019,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 446.14288330078125,
"epoch": 0.11118170266836086,
"grad_norm": 2.619063138961792,
"kl": 0.1796875,
"learning_rate": 3.8959703652397175e-06,
"loss": 0.0018,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 705.5714721679688,
"epoch": 0.11181702668360864,
"grad_norm": 2.71985125541687,
"kl": 0.267578125,
"learning_rate": 3.883404017460935e-06,
"loss": 0.0027,
"reward": 0.2857142984867096,
"reward_std": 0.15735916793346405,
"rewards/code_format_reward": 0.5714285969734192,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 460.71429443359375,
"epoch": 0.11245235069885642,
"grad_norm": 3.090186595916748,
"kl": 0.2109375,
"learning_rate": 3.870790114319559e-06,
"loss": 0.0021,
"reward": 0.6571429371833801,
"reward_std": 0.5126960277557373,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 1.0,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 482.71429443359375,
"epoch": 0.11308767471410419,
"grad_norm": 0.13239029049873352,
"kl": 0.287109375,
"learning_rate": 3.858129185069701e-06,
"loss": 0.0032,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 671.5714721679688,
"epoch": 0.11372299872935197,
"grad_norm": 2.1125059127807617,
"kl": 0.27734375,
"learning_rate": 3.845421760938597e-06,
"loss": 0.0028,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 500.4285888671875,
"epoch": 0.11435832274459974,
"grad_norm": 8.883991241455078,
"kl": 0.9453125,
"learning_rate": 3.832668375104312e-06,
"loss": 0.0095,
"reward": 0.37142860889434814,
"reward_std": 0.07559289783239365,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 519.0,
"epoch": 0.11499364675984752,
"grad_norm": 12.942805290222168,
"kl": 1.375,
"learning_rate": 3.8198695626733725e-06,
"loss": 0.0141,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 667.2857666015625,
"epoch": 0.1156289707750953,
"grad_norm": 2.5961642265319824,
"kl": 0.23828125,
"learning_rate": 3.8070258606583156e-06,
"loss": 0.0024,
"reward": 0.3142857253551483,
"reward_std": 0.10690449923276901,
"rewards/code_format_reward": 0.5714285969734192,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 587.2857666015625,
"epoch": 0.11626429479034307,
"grad_norm": 3.12656831741333,
"kl": 0.251953125,
"learning_rate": 3.7941378079551544e-06,
"loss": 0.0025,
"reward": 0.3142857253551483,
"reward_std": 0.15735916793346405,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 328.71429443359375,
"epoch": 0.11689961880559085,
"grad_norm": 2.924964427947998,
"kl": 0.25,
"learning_rate": 3.7812059453207677e-06,
"loss": 0.0025,
"reward": 0.6285714507102966,
"reward_std": 0.5468525290489197,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 0.8571429252624512,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 486.2857360839844,
"epoch": 0.11753494282083862,
"grad_norm": 3.5567517280578613,
"kl": 0.296875,
"learning_rate": 3.768230815350213e-06,
"loss": 0.003,
"reward": 0.37142857909202576,
"reward_std": 0.07559289783239365,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 554.1428833007812,
"epoch": 0.1181702668360864,
"grad_norm": 2.51031494140625,
"kl": 0.30859375,
"learning_rate": 3.7552129624539557e-06,
"loss": 0.0031,
"reward": 0.37142857909202576,
"reward_std": 0.07559289783239365,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 594.5714721679688,
"epoch": 0.11880559085133419,
"grad_norm": 2.589846611022949,
"kl": 0.26171875,
"learning_rate": 3.7421529328350316e-06,
"loss": 0.0026,
"reward": 0.5284404158592224,
"reward_std": 0.19612440466880798,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.12844036519527435,
"rewards/format_reward": 1.0,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 595.857177734375,
"epoch": 0.11944091486658195,
"grad_norm": 2.4991862773895264,
"kl": 0.27734375,
"learning_rate": 3.7290512744661274e-06,
"loss": 0.0028,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 552.0,
"epoch": 0.12007623888182974,
"grad_norm": 2.0362820625305176,
"kl": 0.1845703125,
"learning_rate": 3.715908537066589e-06,
"loss": 0.0019,
"reward": 0.485714316368103,
"reward_std": 0.42983949184417725,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 0.8571429252624512,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 698.5714721679688,
"epoch": 0.1207115628970775,
"grad_norm": 2.4349231719970703,
"kl": 0.25,
"learning_rate": 3.7027252720793538e-06,
"loss": 0.0025,
"reward": 0.2857142984867096,
"reward_std": 0.1951800137758255,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7142857313156128,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 453.2857360839844,
"epoch": 0.12134688691232529,
"grad_norm": 3.212773323059082,
"kl": 0.1796875,
"learning_rate": 3.689502032647817e-06,
"loss": 0.0018,
"reward": 0.36014559864997864,
"reward_std": 0.1179879903793335,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.017288444563746452,
"rewards/format_reward": 1.0,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 671.1428833007812,
"epoch": 0.12198221092757307,
"grad_norm": 0.08436475694179535,
"kl": 0.1767578125,
"learning_rate": 3.6762393735926245e-06,
"loss": 0.0021,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 651.2857666015625,
"epoch": 0.12261753494282084,
"grad_norm": 2.7841696739196777,
"kl": 0.146484375,
"learning_rate": 3.6629378513883852e-06,
"loss": 0.0015,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 545.2857666015625,
"epoch": 0.12325285895806862,
"grad_norm": 3.3902151584625244,
"kl": 0.205078125,
"learning_rate": 3.6495980241403307e-06,
"loss": 0.002,
"reward": 0.2857142984867096,
"reward_std": 0.1951800137758255,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7142857313156128,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 593.1428833007812,
"epoch": 0.12388818297331639,
"grad_norm": 2.5586631298065186,
"kl": 0.197265625,
"learning_rate": 3.636220451560896e-06,
"loss": 0.002,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 589.7142944335938,
"epoch": 0.12452350698856417,
"grad_norm": 0.08236116170883179,
"kl": 0.3203125,
"learning_rate": 3.622805694946235e-06,
"loss": 0.0035,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 780.7142944335938,
"epoch": 0.12515883100381195,
"grad_norm": 0.9724776148796082,
"kl": 0.390625,
"learning_rate": 3.609354317152667e-06,
"loss": 0.0042,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 921.5714721679688,
"epoch": 0.12579415501905972,
"grad_norm": 2.485982894897461,
"kl": 0.263671875,
"learning_rate": 3.595866882573063e-06,
"loss": 0.0026,
"reward": 0.20000001788139343,
"reward_std": 0.20000001788139343,
"rewards/code_format_reward": 0.4285714626312256,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.5714285969734192,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 850.2857666015625,
"epoch": 0.12642947903430748,
"grad_norm": 2.52078914642334,
"kl": 0.451171875,
"learning_rate": 3.5823439571131675e-06,
"loss": 0.0045,
"reward": 0.2857142984867096,
"reward_std": 0.1951800137758255,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7142857313156128,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 924.2857666015625,
"epoch": 0.12706480304955528,
"grad_norm": 2.445126533508301,
"kl": 0.20703125,
"learning_rate": 3.5687861081678477e-06,
"loss": 0.0021,
"reward": 0.22857144474983215,
"reward_std": 0.21380899846553802,
"rewards/code_format_reward": 0.5714285969734192,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.5714285969734192,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 764.1428833007812,
"epoch": 0.12770012706480305,
"grad_norm": 2.1626222133636475,
"kl": 0.3359375,
"learning_rate": 3.555193904597291e-06,
"loss": 0.0034,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 798.2857666015625,
"epoch": 0.12833545108005082,
"grad_norm": 2.010795831680298,
"kl": 0.275390625,
"learning_rate": 3.541567916703138e-06,
"loss": 0.0027,
"reward": 0.485714316368103,
"reward_std": 0.42983949184417725,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 0.8571429252624512,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 675.5714721679688,
"epoch": 0.1289707750952986,
"grad_norm": 2.5785133838653564,
"kl": 0.2431640625,
"learning_rate": 3.5279087162045517e-06,
"loss": 0.0024,
"reward": 0.6285714507102966,
"reward_std": 0.5468525290489197,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 0.8571429252624512,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 696.4285888671875,
"epoch": 0.12960609911054638,
"grad_norm": 0.14385801553726196,
"kl": 0.166015625,
"learning_rate": 3.5142168762142265e-06,
"loss": 0.002,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 673.5714721679688,
"epoch": 0.13024142312579415,
"grad_norm": 29.659584045410156,
"kl": 2.109375,
"learning_rate": 3.500492971214347e-06,
"loss": 0.0212,
"reward": 0.40446433424949646,
"reward_std": 0.00819578766822815,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.004464285913854837,
"rewards/format_reward": 1.0,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 724.857177734375,
"epoch": 0.13087674714104194,
"grad_norm": 3.008819580078125,
"kl": 0.29296875,
"learning_rate": 3.48673757703248e-06,
"loss": 0.0029,
"reward": 0.4012531638145447,
"reward_std": 0.003315483685582876,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0012531329412013292,
"rewards/format_reward": 1.0,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 670.0,
"epoch": 0.1315120711562897,
"grad_norm": 2.4119279384613037,
"kl": 0.1904296875,
"learning_rate": 3.472951270817418e-06,
"loss": 0.0019,
"reward": 0.8285714983940125,
"reward_std": 0.534522533416748,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.4285714626312256,
"rewards/format_reward": 1.0,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 723.5714721679688,
"epoch": 0.13214739517153748,
"grad_norm": 0.09062952548265457,
"kl": 0.26171875,
"learning_rate": 3.4591346310149578e-06,
"loss": 0.0029,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 885.5714721679688,
"epoch": 0.13278271918678525,
"grad_norm": 2.048800468444824,
"kl": 0.2109375,
"learning_rate": 3.445288237343632e-06,
"loss": 0.0021,
"reward": 0.2857142984867096,
"reward_std": 0.1864454597234726,
"rewards/code_format_reward": 0.5714285969734192,
"rewards/code_reward": 0.02857143059372902,
"rewards/format_reward": 0.7142857313156128,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 768.0000610351562,
"epoch": 0.13341804320203304,
"grad_norm": 0.07965610176324844,
"kl": 0.279296875,
"learning_rate": 3.4314126707703895e-06,
"loss": 0.0031,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 668.7142944335938,
"epoch": 0.1340533672172808,
"grad_norm": 2.6163687705993652,
"kl": 0.259765625,
"learning_rate": 3.4175085134862128e-06,
"loss": 0.0026,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 846.5714721679688,
"epoch": 0.13468869123252858,
"grad_norm": 2.003486394882202,
"kl": 0.1591796875,
"learning_rate": 3.4035763488816953e-06,
"loss": 0.0016,
"reward": 0.3142857253551483,
"reward_std": 0.15735916793346405,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 518.5714721679688,
"epoch": 0.13532401524777637,
"grad_norm": 2.321255683898926,
"kl": 0.1865234375,
"learning_rate": 3.3896167615225594e-06,
"loss": 0.0019,
"reward": 0.5428571701049805,
"reward_std": 0.3779645562171936,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 736.4285888671875,
"epoch": 0.13595933926302414,
"grad_norm": 0.06602618098258972,
"kl": 0.25,
"learning_rate": 3.375630337125133e-06,
"loss": 0.0028,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 668.2857666015625,
"epoch": 0.1365946632782719,
"grad_norm": 2.6393556594848633,
"kl": 0.18359375,
"learning_rate": 3.361617662531772e-06,
"loss": 0.0018,
"reward": 0.5445378422737122,
"reward_std": 0.2468453347682953,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.14453783631324768,
"rewards/format_reward": 1.0,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 696.5714721679688,
"epoch": 0.1372299872935197,
"grad_norm": 0.06534316390752792,
"kl": 0.1962890625,
"learning_rate": 3.347579325686237e-06,
"loss": 0.0023,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 728.4285888671875,
"epoch": 0.13786531130876747,
"grad_norm": 2.1525208950042725,
"kl": 0.162109375,
"learning_rate": 3.333515915609027e-06,
"loss": 0.0016,
"reward": 0.3142857253551483,
"reward_std": 0.15735916793346405,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 695.5714721679688,
"epoch": 0.13850063532401524,
"grad_norm": 2.821530342102051,
"kl": 0.271484375,
"learning_rate": 3.3194280223726616e-06,
"loss": 0.0027,
"reward": 0.41904765367507935,
"reward_std": 0.15735916793346405,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0476190522313118,
"rewards/format_reward": 1.0,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 558.4285888671875,
"epoch": 0.13913595933926304,
"grad_norm": 0.07845824211835861,
"kl": 0.322265625,
"learning_rate": 3.305316237076927e-06,
"loss": 0.0035,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 762.2857666015625,
"epoch": 0.1397712833545108,
"grad_norm": 2.1945505142211914,
"kl": 0.15625,
"learning_rate": 3.291181151824071e-06,
"loss": 0.0016,
"reward": 0.3142857253551483,
"reward_std": 0.15735916793346405,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 534.0,
"epoch": 0.14040660736975857,
"grad_norm": 2.6735599040985107,
"kl": 0.19921875,
"learning_rate": 3.27702335969396e-06,
"loss": 0.002,
"reward": 0.8285714983940125,
"reward_std": 0.534522533416748,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.4285714626312256,
"rewards/format_reward": 1.0,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 673.2857666015625,
"epoch": 0.14104193138500634,
"grad_norm": 0.16328755021095276,
"kl": 0.3203125,
"learning_rate": 3.2628434547191985e-06,
"loss": 0.0035,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 690.7142944335938,
"epoch": 0.14167725540025414,
"grad_norm": 0.05350850895047188,
"kl": 0.28515625,
"learning_rate": 3.2486420318601973e-06,
"loss": 0.0032,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 705.7142944335938,
"epoch": 0.1423125794155019,
"grad_norm": 0.05627477914094925,
"kl": 0.255859375,
"learning_rate": 3.2344196869802187e-06,
"loss": 0.0029,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 522.857177734375,
"epoch": 0.14294790343074967,
"grad_norm": 3.546515464782715,
"kl": 0.1943359375,
"learning_rate": 3.2201770168203694e-06,
"loss": 0.0019,
"reward": 0.37142857909202576,
"reward_std": 0.07559289783239365,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 656.857177734375,
"epoch": 0.14358322744599747,
"grad_norm": 0.30215829610824585,
"kl": 0.283203125,
"learning_rate": 3.205914618974563e-06,
"loss": 0.0031,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 680.1428833007812,
"epoch": 0.14421855146124524,
"grad_norm": 2.333859443664551,
"kl": 0.1982421875,
"learning_rate": 3.1916330918644496e-06,
"loss": 0.002,
"reward": 0.290529727935791,
"reward_std": 0.16188988089561462,
"rewards/code_format_reward": 0.5714285969734192,
"rewards/code_reward": 0.004815409425646067,
"rewards/format_reward": 0.8571429252624512,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 702.7142944335938,
"epoch": 0.144853875476493,
"grad_norm": 0.06898491084575653,
"kl": 0.26171875,
"learning_rate": 3.177333034714303e-06,
"loss": 0.0029,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 699.7142944335938,
"epoch": 0.1454891994917408,
"grad_norm": 3.096179246902466,
"kl": 0.26171875,
"learning_rate": 3.1630150475258813e-06,
"loss": 0.0026,
"reward": 0.3428571820259094,
"reward_std": 0.09759000688791275,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 704.4285888671875,
"epoch": 0.14612452350698857,
"grad_norm": 0.06554409861564636,
"kl": 0.302734375,
"learning_rate": 3.148679731053252e-06,
"loss": 0.0033,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 575.5714721679688,
"epoch": 0.14675984752223634,
"grad_norm": 2.8049392700195312,
"kl": 0.2158203125,
"learning_rate": 3.1343276867775805e-06,
"loss": 0.0022,
"reward": 0.4242587983608246,
"reward_std": 0.04178621619939804,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.02425876073539257,
"rewards/format_reward": 1.0,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 611.857177734375,
"epoch": 0.1473951715374841,
"grad_norm": 3.6352577209472656,
"kl": 0.296875,
"learning_rate": 3.1199595168819043e-06,
"loss": 0.003,
"reward": 0.37142860889434814,
"reward_std": 0.07559289783239365,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 638.0,
"epoch": 0.1480304955527319,
"grad_norm": 2.4661004543304443,
"kl": 0.298828125,
"learning_rate": 3.105575824225852e-06,
"loss": 0.003,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 599.857177734375,
"epoch": 0.14866581956797967,
"grad_norm": 2.9908604621887207,
"kl": 0.20703125,
"learning_rate": 3.091177212320363e-06,
"loss": 0.0021,
"reward": 0.37142857909202576,
"reward_std": 0.07559289783239365,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 652.1428833007812,
"epoch": 0.14930114358322744,
"grad_norm": 0.04900944232940674,
"kl": 0.1875,
"learning_rate": 3.0767642853023538e-06,
"loss": 0.0022,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 485.5714416503906,
"epoch": 0.14993646759847523,
"grad_norm": 2.6988017559051514,
"kl": 0.216796875,
"learning_rate": 3.062337647909376e-06,
"loss": 0.0022,
"reward": 0.37142857909202576,
"reward_std": 0.07559289783239365,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 672.857177734375,
"epoch": 0.150571791613723,
"grad_norm": 2.6142776012420654,
"kl": 0.22265625,
"learning_rate": 3.04789790545424e-06,
"loss": 0.0022,
"reward": 0.4169172942638397,
"reward_std": 0.04475894197821617,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.016917293891310692,
"rewards/format_reward": 1.0,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 709.7142944335938,
"epoch": 0.15120711562897077,
"grad_norm": 0.06715590506792068,
"kl": 0.1865234375,
"learning_rate": 3.033445663799621e-06,
"loss": 0.0022,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 642.2857666015625,
"epoch": 0.15184243964421856,
"grad_norm": 2.8286352157592773,
"kl": 0.2197265625,
"learning_rate": 3.018981529332633e-06,
"loss": 0.0022,
"reward": 0.5428571701049805,
"reward_std": 0.3779645264148712,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 567.2857666015625,
"epoch": 0.15247776365946633,
"grad_norm": 0.07005994766950607,
"kl": 0.21875,
"learning_rate": 3.00450610893939e-06,
"loss": 0.0025,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 678.5714721679688,
"epoch": 0.1531130876747141,
"grad_norm": 2.2401628494262695,
"kl": 0.162109375,
"learning_rate": 2.9900200099795396e-06,
"loss": 0.0016,
"reward": 0.40160515904426575,
"reward_std": 0.004246791359037161,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.00160513655282557,
"rewards/format_reward": 1.0,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 640.4285888671875,
"epoch": 0.15374841168996187,
"grad_norm": 2.341778039932251,
"kl": 0.30078125,
"learning_rate": 2.9755238402607826e-06,
"loss": 0.003,
"reward": 0.485714316368103,
"reward_std": 0.22677868604660034,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 0.8571429252624512,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 929.4285888671875,
"epoch": 0.15438373570520966,
"grad_norm": 2.2782864570617676,
"kl": 0.193359375,
"learning_rate": 2.961018208013367e-06,
"loss": 0.0019,
"reward": 0.2857142984867096,
"reward_std": 0.1951800137758255,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7142857313156128,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 655.5714721679688,
"epoch": 0.15501905972045743,
"grad_norm": 0.0515868104994297,
"kl": 0.265625,
"learning_rate": 2.9465037218645694e-06,
"loss": 0.003,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 596.7142944335938,
"epoch": 0.1556543837357052,
"grad_norm": 0.06041451543569565,
"kl": 0.271484375,
"learning_rate": 2.9319809908131604e-06,
"loss": 0.003,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 623.5714721679688,
"epoch": 0.156289707750953,
"grad_norm": 0.11632593721151352,
"kl": 0.33203125,
"learning_rate": 2.917450624203847e-06,
"loss": 0.0036,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 708.7142944335938,
"epoch": 0.15692503176620076,
"grad_norm": 2.282824993133545,
"kl": 0.17578125,
"learning_rate": 2.9029132317017118e-06,
"loss": 0.0018,
"reward": 0.6714285612106323,
"reward_std": 0.27516230940818787,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.30000001192092896,
"rewards/format_reward": 1.0,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 402.2857360839844,
"epoch": 0.15756035578144853,
"grad_norm": 0.05820649862289429,
"kl": 0.263671875,
"learning_rate": 2.888369423266629e-06,
"loss": 0.0029,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 559.1428833007812,
"epoch": 0.15819567979669633,
"grad_norm": 2.360961437225342,
"kl": 0.2431640625,
"learning_rate": 2.8738198091276712e-06,
"loss": 0.0024,
"reward": 0.771428644657135,
"reward_std": 0.48205915093421936,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.4285714626312256,
"rewards/format_reward": 0.8571429252624512,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 673.857177734375,
"epoch": 0.1588310038119441,
"grad_norm": 2.426175832748413,
"kl": 0.2431640625,
"learning_rate": 2.859264999757509e-06,
"loss": 0.0024,
"reward": 0.485714316368103,
"reward_std": 0.42983946204185486,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 0.8571429252624512,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 555.7142944335938,
"epoch": 0.15946632782719186,
"grad_norm": 1.7476675510406494,
"kl": 0.2060546875,
"learning_rate": 2.8447056058467928e-06,
"loss": 0.0021,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 635.1428833007812,
"epoch": 0.16010165184243966,
"grad_norm": 0.10966142266988754,
"kl": 0.296875,
"learning_rate": 2.830142238278531e-06,
"loss": 0.0033,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 569.4285888671875,
"epoch": 0.16073697585768743,
"grad_norm": 0.07483859360218048,
"kl": 0.306640625,
"learning_rate": 2.81557550810246e-06,
"loss": 0.0034,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 696.1428833007812,
"epoch": 0.1613722998729352,
"grad_norm": 557283.4375,
"kl": 19456.0,
"learning_rate": 2.8010060265094026e-06,
"loss": 194.2972,
"reward": 0.485714316368103,
"reward_std": 0.42983946204185486,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 0.8571429252624512,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 572.1428833007812,
"epoch": 0.16200762388818296,
"grad_norm": 2.958310842514038,
"kl": 0.203125,
"learning_rate": 2.786434404805629e-06,
"loss": 0.002,
"reward": 0.6857143640518188,
"reward_std": 0.4879501163959503,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 1.0,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 642.5714721679688,
"epoch": 0.16264294790343076,
"grad_norm": 2.9970836639404297,
"kl": 0.359375,
"learning_rate": 2.771861254387199e-06,
"loss": 0.0036,
"reward": 0.485714316368103,
"reward_std": 0.42983949184417725,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 0.8571429252624512,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 626.5714721679688,
"epoch": 0.16327827191867852,
"grad_norm": 0.052250247448682785,
"kl": 0.203125,
"learning_rate": 2.7572871867143204e-06,
"loss": 0.0023,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 587.0,
"epoch": 0.1639135959339263,
"grad_norm": 0.049148622900247574,
"kl": 0.1845703125,
"learning_rate": 2.742712813285681e-06,
"loss": 0.0021,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 413.71429443359375,
"epoch": 0.1645489199491741,
"grad_norm": 3.1558637619018555,
"kl": 0.296875,
"learning_rate": 2.7281387456128017e-06,
"loss": 0.003,
"reward": 0.6661654710769653,
"reward_std": 0.2892994284629822,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.3233082890510559,
"rewards/format_reward": 0.8571429252624512,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 599.1428833007812,
"epoch": 0.16518424396442186,
"grad_norm": 2.672922372817993,
"kl": 0.333984375,
"learning_rate": 2.7135655951943716e-06,
"loss": 0.0033,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 638.2857666015625,
"epoch": 0.16581956797966962,
"grad_norm": 0.062102027237415314,
"kl": 0.2890625,
"learning_rate": 2.698993973490598e-06,
"loss": 0.0032,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 615.4285888671875,
"epoch": 0.16645489199491742,
"grad_norm": 2.032621145248413,
"kl": 0.349609375,
"learning_rate": 2.6844244918975416e-06,
"loss": 0.0035,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 680.4285888671875,
"epoch": 0.1670902160101652,
"grad_norm": 0.07202436029911041,
"kl": 0.25390625,
"learning_rate": 2.66985776172147e-06,
"loss": 0.0028,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 607.0,
"epoch": 0.16772554002541296,
"grad_norm": 3.1766250133514404,
"kl": 0.3046875,
"learning_rate": 2.6552943941532088e-06,
"loss": 0.003,
"reward": 0.41587308049201965,
"reward_std": 0.0419960655272007,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.01587301678955555,
"rewards/format_reward": 1.0,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 661.5714721679688,
"epoch": 0.16836086404066072,
"grad_norm": 3.2823150157928467,
"kl": 0.2392578125,
"learning_rate": 2.6407350002424927e-06,
"loss": 0.0024,
"reward": 0.5868132710456848,
"reward_std": 0.23857378959655762,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.18681320548057556,
"rewards/format_reward": 1.0,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 598.857177734375,
"epoch": 0.16899618805590852,
"grad_norm": 0.08283974230289459,
"kl": 0.2021484375,
"learning_rate": 2.626180190872329e-06,
"loss": 0.0023,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 439.2857360839844,
"epoch": 0.1696315120711563,
"grad_norm": 3.3979873657226562,
"kl": 0.234375,
"learning_rate": 2.611630576733372e-06,
"loss": 0.0023,
"reward": 0.41260507702827454,
"reward_std": 0.01228986494243145,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.01260504312813282,
"rewards/format_reward": 1.0,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 584.1428833007812,
"epoch": 0.17026683608640406,
"grad_norm": 2.4819862842559814,
"kl": 0.28125,
"learning_rate": 2.5970867682982885e-06,
"loss": 0.0028,
"reward": 0.5428571701049805,
"reward_std": 0.3779645562171936,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 458.14288330078125,
"epoch": 0.17090216010165185,
"grad_norm": 0.08856673538684845,
"kl": 0.349609375,
"learning_rate": 2.582549375796154e-06,
"loss": 0.0038,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 468.71429443359375,
"epoch": 0.17153748411689962,
"grad_norm": 2.835487127304077,
"kl": 0.32421875,
"learning_rate": 2.568019009186841e-06,
"loss": 0.0032,
"reward": 0.5428571701049805,
"reward_std": 0.3779645562171936,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 637.4285888671875,
"epoch": 0.1721728081321474,
"grad_norm": 0.054750654846429825,
"kl": 0.162109375,
"learning_rate": 2.5534962781354317e-06,
"loss": 0.0019,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 757.2857666015625,
"epoch": 0.17280813214739518,
"grad_norm": 2.1099636554718018,
"kl": 0.21484375,
"learning_rate": 2.538981791986634e-06,
"loss": 0.0021,
"reward": 0.2857142984867096,
"reward_std": 0.1951800137758255,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7142857313156128,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 425.2857360839844,
"epoch": 0.17344345616264295,
"grad_norm": 2.960906982421875,
"kl": 0.251953125,
"learning_rate": 2.524476159739218e-06,
"loss": 0.0025,
"reward": 0.5428571701049805,
"reward_std": 0.3779645264148712,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 592.5714721679688,
"epoch": 0.17407878017789072,
"grad_norm": 2.5509631633758545,
"kl": 0.26953125,
"learning_rate": 2.5099799900204607e-06,
"loss": 0.0027,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 436.8571472167969,
"epoch": 0.17471410419313851,
"grad_norm": 2.9435410499572754,
"kl": 0.400390625,
"learning_rate": 2.4954938910606108e-06,
"loss": 0.004,
"reward": 0.563265323638916,
"reward_std": 0.37278667092323303,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.16326531767845154,
"rewards/format_reward": 1.0,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 409.5714416503906,
"epoch": 0.17534942820838628,
"grad_norm": 4.025363922119141,
"kl": 0.275390625,
"learning_rate": 2.481018470667368e-06,
"loss": 0.0028,
"reward": 0.5428571701049805,
"reward_std": 0.37796446681022644,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 467.14288330078125,
"epoch": 0.17598475222363405,
"grad_norm": 0.09415756165981293,
"kl": 0.30078125,
"learning_rate": 2.4665543362003802e-06,
"loss": 0.0033,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 438.0000305175781,
"epoch": 0.17662007623888182,
"grad_norm": 3.590571165084839,
"kl": 0.2216796875,
"learning_rate": 2.4521020945457615e-06,
"loss": 0.0022,
"reward": 0.6857143640518188,
"reward_std": 0.4879501163959503,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 1.0,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 439.2857360839844,
"epoch": 0.1772554002541296,
"grad_norm": 2.6514501571655273,
"kl": 0.1884765625,
"learning_rate": 2.4376623520906255e-06,
"loss": 0.0019,
"reward": 0.37142860889434814,
"reward_std": 0.07559289783239365,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 395.4285888671875,
"epoch": 0.17789072426937738,
"grad_norm": 3.0715246200561523,
"kl": 0.396484375,
"learning_rate": 2.4232357146976478e-06,
"loss": 0.004,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 552.1428833007812,
"epoch": 0.17852604828462515,
"grad_norm": 0.04878819361329079,
"kl": 0.25390625,
"learning_rate": 2.408822787679637e-06,
"loss": 0.0028,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 470.0000305175781,
"epoch": 0.17916137229987295,
"grad_norm": 2.8764052391052246,
"kl": 0.337890625,
"learning_rate": 2.3944241757741475e-06,
"loss": 0.0034,
"reward": 0.7617021799087524,
"reward_std": 0.39519527554512024,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.3617021143436432,
"rewards/format_reward": 1.0,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 552.0,
"epoch": 0.1797966963151207,
"grad_norm": 3.4082202911376953,
"kl": 0.279296875,
"learning_rate": 2.380040483118097e-06,
"loss": 0.0028,
"reward": 0.37142860889434814,
"reward_std": 0.07559289783239365,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 419.0000305175781,
"epoch": 0.18043202033036848,
"grad_norm": 3.0144095420837402,
"kl": 0.236328125,
"learning_rate": 2.365672313222419e-06,
"loss": 0.0024,
"reward": 0.800332248210907,
"reward_std": 0.2776820659637451,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.4003322720527649,
"rewards/format_reward": 1.0,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 514.5714721679688,
"epoch": 0.18106734434561628,
"grad_norm": 0.1193128377199173,
"kl": 0.35546875,
"learning_rate": 2.351320268946749e-06,
"loss": 0.0038,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 638.7142944335938,
"epoch": 0.18170266836086404,
"grad_norm": 0.050928860902786255,
"kl": 0.25390625,
"learning_rate": 2.336984952474119e-06,
"loss": 0.0028,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 496.857177734375,
"epoch": 0.1823379923761118,
"grad_norm": 0.12127784639596939,
"kl": 0.310546875,
"learning_rate": 2.322666965285697e-06,
"loss": 0.0034,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 515.857177734375,
"epoch": 0.18297331639135958,
"grad_norm": 0.04745308309793472,
"kl": 0.302734375,
"learning_rate": 2.3083669081355507e-06,
"loss": 0.0033,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 619.0,
"epoch": 0.18360864040660738,
"grad_norm": 0.04747169092297554,
"kl": 0.2392578125,
"learning_rate": 2.2940853810254377e-06,
"loss": 0.0027,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 427.2857360839844,
"epoch": 0.18424396442185514,
"grad_norm": 0.04503343254327774,
"kl": 0.2109375,
"learning_rate": 2.2798229831796313e-06,
"loss": 0.0024,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 350.8571472167969,
"epoch": 0.1848792884371029,
"grad_norm": 0.15933562815189362,
"kl": 0.29296875,
"learning_rate": 2.2655803130197816e-06,
"loss": 0.0032,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 492.4285888671875,
"epoch": 0.1855146124523507,
"grad_norm": 0.061849016696214676,
"kl": 0.34375,
"learning_rate": 2.2513579681398034e-06,
"loss": 0.0037,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 447.5714416503906,
"epoch": 0.18614993646759848,
"grad_norm": 3.1712300777435303,
"kl": 0.1982421875,
"learning_rate": 2.237156545280803e-06,
"loss": 0.002,
"reward": 1.1142858266830444,
"reward_std": 0.4879501163959503,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.7142857313156128,
"rewards/format_reward": 1.0,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 530.7142944335938,
"epoch": 0.18678526048284624,
"grad_norm": 2.926269769668579,
"kl": 0.27734375,
"learning_rate": 2.2229766403060403e-06,
"loss": 0.0028,
"reward": 0.2857142984867096,
"reward_std": 0.1951800137758255,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7142857313156128,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 370.14288330078125,
"epoch": 0.18742058449809404,
"grad_norm": 3.639002799987793,
"kl": 0.412109375,
"learning_rate": 2.2088188481759305e-06,
"loss": 0.0041,
"reward": 0.44044750928878784,
"reward_std": 0.06550441682338715,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.040447503328323364,
"rewards/format_reward": 1.0,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 411.8571472167969,
"epoch": 0.1880559085133418,
"grad_norm": 3.473376512527466,
"kl": 0.337890625,
"learning_rate": 2.194683762923073e-06,
"loss": 0.0034,
"reward": 0.8047619462013245,
"reward_std": 0.28637492656707764,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.4047619104385376,
"rewards/format_reward": 1.0,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 537.1428833007812,
"epoch": 0.18869123252858958,
"grad_norm": 0.05299937725067139,
"kl": 0.2373046875,
"learning_rate": 2.1805719776273387e-06,
"loss": 0.0027,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 499.0000305175781,
"epoch": 0.18932655654383734,
"grad_norm": 0.04631977900862694,
"kl": 0.2265625,
"learning_rate": 2.166484084390974e-06,
"loss": 0.0026,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 490.14288330078125,
"epoch": 0.18996188055908514,
"grad_norm": 0.08931510150432587,
"kl": 0.244140625,
"learning_rate": 2.1524206743137636e-06,
"loss": 0.0027,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 461.2857360839844,
"epoch": 0.1905972045743329,
"grad_norm": 3.1879539489746094,
"kl": 0.236328125,
"learning_rate": 2.1383823374682287e-06,
"loss": 0.0024,
"reward": 0.971428632736206,
"reward_std": 0.534522533416748,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.5714285969734192,
"rewards/format_reward": 1.0,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 493.14288330078125,
"epoch": 0.19123252858958067,
"grad_norm": 2.5813148021698,
"kl": 0.2294921875,
"learning_rate": 2.124369662874868e-06,
"loss": 0.0023,
"reward": 0.44916945695877075,
"reward_std": 0.2263808697462082,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.10631229728460312,
"rewards/format_reward": 0.8571429252624512,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 467.857177734375,
"epoch": 0.19186785260482847,
"grad_norm": 2.3633549213409424,
"kl": 0.251953125,
"learning_rate": 2.110383238477441e-06,
"loss": 0.0025,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 467.5714416503906,
"epoch": 0.19250317662007624,
"grad_norm": 3.2412614822387695,
"kl": 0.2021484375,
"learning_rate": 2.096423651118305e-06,
"loss": 0.002,
"reward": 0.41476020216941833,
"reward_std": 0.03590288758277893,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.014760148711502552,
"rewards/format_reward": 1.0,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 629.1428833007812,
"epoch": 0.193138500635324,
"grad_norm": 1.8208192586898804,
"kl": 0.1845703125,
"learning_rate": 2.082491486513788e-06,
"loss": 0.0018,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 573.1428833007812,
"epoch": 0.1937738246505718,
"grad_norm": 0.06740770488977432,
"kl": 0.326171875,
"learning_rate": 2.0685873292296116e-06,
"loss": 0.0036,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 469.0000305175781,
"epoch": 0.19440914866581957,
"grad_norm": 0.050011664628982544,
"kl": 0.322265625,
"learning_rate": 2.054711762656369e-06,
"loss": 0.0035,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 501.2857360839844,
"epoch": 0.19504447268106734,
"grad_norm": 2.7797482013702393,
"kl": 0.1865234375,
"learning_rate": 2.040865368985044e-06,
"loss": 0.0019,
"reward": 0.5428571701049805,
"reward_std": 0.3779645562171936,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 483.4285888671875,
"epoch": 0.19567979669631513,
"grad_norm": 0.056970253586769104,
"kl": 0.205078125,
"learning_rate": 2.027048729182583e-06,
"loss": 0.0023,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 708.4285888671875,
"epoch": 0.1963151207115629,
"grad_norm": 1.9797154664993286,
"kl": 0.2421875,
"learning_rate": 2.0132624229675205e-06,
"loss": 0.0024,
"reward": 0.45210087299346924,
"reward_std": 0.11403417587280273,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.10924370586872101,
"rewards/format_reward": 0.8571429252624512,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 432.14288330078125,
"epoch": 0.19695044472681067,
"grad_norm": 0.0952640175819397,
"kl": 0.40625,
"learning_rate": 1.9995070287856546e-06,
"loss": 0.0044,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 533.7142944335938,
"epoch": 0.19758576874205844,
"grad_norm": 3.1438472270965576,
"kl": 0.30859375,
"learning_rate": 1.985783123785774e-06,
"loss": 0.0031,
"reward": 0.49523812532424927,
"reward_std": 0.11878278106451035,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0952381044626236,
"rewards/format_reward": 1.0,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 536.5714721679688,
"epoch": 0.19822109275730623,
"grad_norm": 2.1929433345794678,
"kl": 0.314453125,
"learning_rate": 1.9720912837954486e-06,
"loss": 0.0031,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 502.71429443359375,
"epoch": 0.198856416772554,
"grad_norm": 0.05079368129372597,
"kl": 0.306640625,
"learning_rate": 1.958432083296862e-06,
"loss": 0.0034,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 454.14288330078125,
"epoch": 0.19949174078780177,
"grad_norm": 0.06545651704072952,
"kl": 0.2099609375,
"learning_rate": 1.9448060954027093e-06,
"loss": 0.0024,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 523.1428833007812,
"epoch": 0.20012706480304956,
"grad_norm": 2.852233409881592,
"kl": 0.27734375,
"learning_rate": 1.931213891832153e-06,
"loss": 0.0028,
"reward": 0.40317460894584656,
"reward_std": 0.005421662237495184,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0031746034510433674,
"rewards/format_reward": 1.0,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 518.7142944335938,
"epoch": 0.20076238881829733,
"grad_norm": 0.056687433272600174,
"kl": 0.2158203125,
"learning_rate": 1.9176560428868336e-06,
"loss": 0.0025,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 651.857177734375,
"epoch": 0.2013977128335451,
"grad_norm": 2.1868667602539062,
"kl": 0.1923828125,
"learning_rate": 1.9041331174269373e-06,
"loss": 0.0019,
"reward": 0.3556329905986786,
"reward_std": 0.16032202541828156,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.012775842100381851,
"rewards/format_reward": 0.8571429252624512,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 627.7142944335938,
"epoch": 0.2020330368487929,
"grad_norm": 0.055125512182712555,
"kl": 0.2578125,
"learning_rate": 1.8906456828473341e-06,
"loss": 0.0029,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 585.4285888671875,
"epoch": 0.20266836086404066,
"grad_norm": 0.0708109587430954,
"kl": 0.265625,
"learning_rate": 1.8771943050537656e-06,
"loss": 0.0029,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 568.857177734375,
"epoch": 0.20330368487928843,
"grad_norm": 3.287763833999634,
"kl": 0.1796875,
"learning_rate": 1.8637795484391046e-06,
"loss": 0.0018,
"reward": 0.4266955256462097,
"reward_std": 0.05106709897518158,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.026695528998970985,
"rewards/format_reward": 1.0,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 356.71429443359375,
"epoch": 0.2039390088945362,
"grad_norm": 2.707819700241089,
"kl": 0.271484375,
"learning_rate": 1.8504019758596698e-06,
"loss": 0.0027,
"reward": 1.2571430206298828,
"reward_std": 0.3779645264148712,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.8571429252624512,
"rewards/format_reward": 1.0,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 470.4285888671875,
"epoch": 0.204574332909784,
"grad_norm": 2.942082405090332,
"kl": 0.353515625,
"learning_rate": 1.8370621486116163e-06,
"loss": 0.0035,
"reward": 0.41904765367507935,
"reward_std": 0.05039527267217636,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.01904762163758278,
"rewards/format_reward": 1.0,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 485.14288330078125,
"epoch": 0.20520965692503176,
"grad_norm": 0.06295392662286758,
"kl": 0.330078125,
"learning_rate": 1.823760626407377e-06,
"loss": 0.0036,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 499.4285888671875,
"epoch": 0.20584498094027953,
"grad_norm": 2.458465099334717,
"kl": 0.34375,
"learning_rate": 1.8104979673521838e-06,
"loss": 0.0034,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 526.2857666015625,
"epoch": 0.20648030495552733,
"grad_norm": 3.1990699768066406,
"kl": 0.33203125,
"learning_rate": 1.7972747279206482e-06,
"loss": 0.0033,
"reward": 0.5428571701049805,
"reward_std": 0.3779645562171936,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 601.0,
"epoch": 0.2071156289707751,
"grad_norm": 0.05853183940052986,
"kl": 0.2890625,
"learning_rate": 1.7840914629334122e-06,
"loss": 0.0032,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 498.71429443359375,
"epoch": 0.20775095298602286,
"grad_norm": 0.07322244346141815,
"kl": 0.3125,
"learning_rate": 1.7709487255338731e-06,
"loss": 0.0034,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 553.857177734375,
"epoch": 0.20838627700127066,
"grad_norm": 0.17839957773685455,
"kl": 0.228515625,
"learning_rate": 1.7578470671649684e-06,
"loss": 0.0026,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 530.857177734375,
"epoch": 0.20902160101651843,
"grad_norm": 0.1131465956568718,
"kl": 0.3359375,
"learning_rate": 1.744787037546045e-06,
"loss": 0.0037,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 564.1428833007812,
"epoch": 0.2096569250317662,
"grad_norm": 0.08917635679244995,
"kl": 0.30078125,
"learning_rate": 1.731769184649788e-06,
"loss": 0.0033,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 401.4285888671875,
"epoch": 0.210292249047014,
"grad_norm": 0.15413963794708252,
"kl": 0.36328125,
"learning_rate": 1.7187940546792325e-06,
"loss": 0.0039,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 535.0,
"epoch": 0.21092757306226176,
"grad_norm": 3.1670172214508057,
"kl": 0.306640625,
"learning_rate": 1.7058621920448465e-06,
"loss": 0.0031,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 477.14288330078125,
"epoch": 0.21156289707750953,
"grad_norm": 0.07635504752397537,
"kl": 0.333984375,
"learning_rate": 1.6929741393416855e-06,
"loss": 0.0036,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 405.71429443359375,
"epoch": 0.2121982210927573,
"grad_norm": 0.0835573673248291,
"kl": 0.361328125,
"learning_rate": 1.6801304373266286e-06,
"loss": 0.0039,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 556.0,
"epoch": 0.2128335451080051,
"grad_norm": 0.17580975592136383,
"kl": 0.310546875,
"learning_rate": 1.667331624895689e-06,
"loss": 0.0034,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 382.14288330078125,
"epoch": 0.21346886912325286,
"grad_norm": 2.9555137157440186,
"kl": 0.330078125,
"learning_rate": 1.6545782390614037e-06,
"loss": 0.0033,
"reward": 0.8285714983940125,
"reward_std": 0.534522533416748,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.4285714626312256,
"rewards/format_reward": 1.0,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 400.14288330078125,
"epoch": 0.21410419313850063,
"grad_norm": 0.08389069885015488,
"kl": 0.2275390625,
"learning_rate": 1.6418708149302992e-06,
"loss": 0.0026,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 425.2857360839844,
"epoch": 0.21473951715374842,
"grad_norm": 2.6030876636505127,
"kl": 0.23046875,
"learning_rate": 1.6292098856804423e-06,
"loss": 0.0023,
"reward": 1.2571430206298828,
"reward_std": 0.3779645562171936,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.8571429252624512,
"rewards/format_reward": 1.0,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 369.71429443359375,
"epoch": 0.2153748411689962,
"grad_norm": 0.09957047551870346,
"kl": 0.255859375,
"learning_rate": 1.6165959825390661e-06,
"loss": 0.0029,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 374.5714416503906,
"epoch": 0.21601016518424396,
"grad_norm": 3.5869476795196533,
"kl": 0.2216796875,
"learning_rate": 1.604029634760284e-06,
"loss": 0.0022,
"reward": 0.6857143640518188,
"reward_std": 0.4879501163959503,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 1.0,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 449.14288330078125,
"epoch": 0.21664548919949175,
"grad_norm": 0.04904744401574135,
"kl": 0.208984375,
"learning_rate": 1.59151136960288e-06,
"loss": 0.0024,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 592.0,
"epoch": 0.21728081321473952,
"grad_norm": 0.07155793160200119,
"kl": 0.29296875,
"learning_rate": 1.5790417123081903e-06,
"loss": 0.0032,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 498.14288330078125,
"epoch": 0.2179161372299873,
"grad_norm": 0.07623915374279022,
"kl": 0.34765625,
"learning_rate": 1.5666211860780583e-06,
"loss": 0.0038,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 439.2857360839844,
"epoch": 0.21855146124523506,
"grad_norm": 3.3097286224365234,
"kl": 0.330078125,
"learning_rate": 1.5542503120528918e-06,
"loss": 0.0033,
"reward": 0.5428571701049805,
"reward_std": 0.37796446681022644,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 504.4285888671875,
"epoch": 0.21918678526048285,
"grad_norm": 3.0239760875701904,
"kl": 0.3125,
"learning_rate": 1.5419296092897866e-06,
"loss": 0.0031,
"reward": 0.5071429014205933,
"reward_std": 0.14202801883220673,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.107142873108387,
"rewards/format_reward": 1.0,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 656.857177734375,
"epoch": 0.21982210927573062,
"grad_norm": 0.04605603963136673,
"kl": 0.279296875,
"learning_rate": 1.529659594740755e-06,
"loss": 0.0031,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 411.2857360839844,
"epoch": 0.2204574332909784,
"grad_norm": 0.05368569865822792,
"kl": 0.373046875,
"learning_rate": 1.5174407832310338e-06,
"loss": 0.004,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 626.0,
"epoch": 0.22109275730622618,
"grad_norm": 2.683948040008545,
"kl": 0.26171875,
"learning_rate": 1.5052736874374815e-06,
"loss": 0.0026,
"reward": 0.4285714626312256,
"reward_std": 0.04879499599337578,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.02857143059372902,
"rewards/format_reward": 1.0,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 500.857177734375,
"epoch": 0.22172808132147395,
"grad_norm": 2.875629186630249,
"kl": 0.203125,
"learning_rate": 1.4931588178670695e-06,
"loss": 0.002,
"reward": 0.8285714983940125,
"reward_std": 0.534522533416748,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.4285714626312256,
"rewards/format_reward": 1.0,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 259.5714416503906,
"epoch": 0.22236340533672172,
"grad_norm": 0.2272838056087494,
"kl": 0.353515625,
"learning_rate": 1.4810966828354605e-06,
"loss": 0.0047,
"reward": 1.4000002145767212,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 479.71429443359375,
"epoch": 0.22299872935196952,
"grad_norm": 3.2993814945220947,
"kl": 0.21484375,
"learning_rate": 1.469087788445684e-06,
"loss": 0.0021,
"reward": 0.42500001192092896,
"reward_std": 0.04564352706074715,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.02500000223517418,
"rewards/format_reward": 1.0,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 498.5714416503906,
"epoch": 0.22363405336721728,
"grad_norm": 0.2275489717721939,
"kl": 0.333984375,
"learning_rate": 1.4571326385668965e-06,
"loss": 0.0036,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 470.857177734375,
"epoch": 0.22426937738246505,
"grad_norm": 0.05804005637764931,
"kl": 0.341796875,
"learning_rate": 1.4452317348132434e-06,
"loss": 0.0037,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 255.1428680419922,
"epoch": 0.22490470139771285,
"grad_norm": 0.30045151710510254,
"kl": 0.3125,
"learning_rate": 1.4333855765228104e-06,
"loss": 0.0034,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 401.8571472167969,
"epoch": 0.22554002541296062,
"grad_norm": 0.1624024361371994,
"kl": 0.36328125,
"learning_rate": 1.421594660736675e-06,
"loss": 0.0039,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 458.5714416503906,
"epoch": 0.22617534942820838,
"grad_norm": 0.06793898344039917,
"kl": 0.2294921875,
"learning_rate": 1.4098594821780476e-06,
"loss": 0.0026,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 354.2857360839844,
"epoch": 0.22681067344345615,
"grad_norm": 3.499160051345825,
"kl": 0.2412109375,
"learning_rate": 1.3981805332315174e-06,
"loss": 0.0024,
"reward": 1.057142972946167,
"reward_std": 0.47207754850387573,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.7142857313156128,
"rewards/format_reward": 0.8571429252624512,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 432.71429443359375,
"epoch": 0.22744599745870395,
"grad_norm": 3.1533403396606445,
"kl": 0.30859375,
"learning_rate": 1.3865583039223929e-06,
"loss": 0.0031,
"reward": 0.7381389141082764,
"reward_std": 0.2022811770439148,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.3381389379501343,
"rewards/format_reward": 1.0,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 287.8571472167969,
"epoch": 0.22808132147395171,
"grad_norm": 0.08542405813932419,
"kl": 0.3203125,
"learning_rate": 1.374993281896137e-06,
"loss": 0.0044,
"reward": 1.4000002145767212,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 557.5714721679688,
"epoch": 0.22871664548919948,
"grad_norm": 1.56419038772583,
"kl": 0.1650390625,
"learning_rate": 1.3634859523979134e-06,
"loss": 0.0016,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 356.4285888671875,
"epoch": 0.22935196950444728,
"grad_norm": 3.0232937335968018,
"kl": 0.224609375,
"learning_rate": 1.3520367982522208e-06,
"loss": 0.0022,
"reward": 1.1142858266830444,
"reward_std": 0.4879501163959503,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.7142857313156128,
"rewards/format_reward": 1.0,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 594.1428833007812,
"epoch": 0.22998729351969505,
"grad_norm": 3.2974321842193604,
"kl": 0.1875,
"learning_rate": 1.3406462998426358e-06,
"loss": 0.0019,
"reward": 0.5428571701049805,
"reward_std": 0.3779645562171936,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 456.2857360839844,
"epoch": 0.23062261753494281,
"grad_norm": 3.04162859916687,
"kl": 0.2119140625,
"learning_rate": 1.3293149350916595e-06,
"loss": 0.0021,
"reward": 0.971428632736206,
"reward_std": 0.534522533416748,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.5714285969734192,
"rewards/format_reward": 1.0,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 532.4285888671875,
"epoch": 0.2312579415501906,
"grad_norm": 3.197749376296997,
"kl": 0.1748046875,
"learning_rate": 1.3180431794406623e-06,
"loss": 0.0017,
"reward": 0.5428571701049805,
"reward_std": 0.37796446681022644,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 449.71429443359375,
"epoch": 0.23189326556543838,
"grad_norm": 3.0233240127563477,
"kl": 0.2294921875,
"learning_rate": 1.3068315058299358e-06,
"loss": 0.0023,
"reward": 0.5428571701049805,
"reward_std": 0.3779645562171936,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 362.5714416503906,
"epoch": 0.23252858958068615,
"grad_norm": 0.06233400106430054,
"kl": 0.28515625,
"learning_rate": 1.2956803846788503e-06,
"loss": 0.0032,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 475.2857360839844,
"epoch": 0.2331639135959339,
"grad_norm": 0.04722089692950249,
"kl": 0.193359375,
"learning_rate": 1.284590283866116e-06,
"loss": 0.0022,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 644.5714721679688,
"epoch": 0.2337992376111817,
"grad_norm": 2.4597692489624023,
"kl": 0.265625,
"learning_rate": 1.2735616687101518e-06,
"loss": 0.0027,
"reward": 0.4714285731315613,
"reward_std": 0.18898221850395203,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0714285746216774,
"rewards/format_reward": 1.0,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 353.8571472167969,
"epoch": 0.23443456162642948,
"grad_norm": 3.595822811126709,
"kl": 0.255859375,
"learning_rate": 1.2625950019495614e-06,
"loss": 0.0026,
"reward": 0.6106783151626587,
"reward_std": 0.34680601954460144,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.21067820489406586,
"rewards/format_reward": 1.0,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 585.5714721679688,
"epoch": 0.23506988564167725,
"grad_norm": 0.05966843292117119,
"kl": 0.287109375,
"learning_rate": 1.251690743723718e-06,
"loss": 0.0032,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 315.4285888671875,
"epoch": 0.23570520965692504,
"grad_norm": 3.2647478580474854,
"kl": 0.26171875,
"learning_rate": 1.2408493515534581e-06,
"loss": 0.0026,
"reward": 1.1142858266830444,
"reward_std": 0.4879501163959503,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.7142857313156128,
"rewards/format_reward": 1.0,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 503.0000305175781,
"epoch": 0.2363405336721728,
"grad_norm": 3.232701063156128,
"kl": 0.1650390625,
"learning_rate": 1.2300712803218834e-06,
"loss": 0.0017,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 551.5714721679688,
"epoch": 0.23697585768742058,
"grad_norm": 0.05632919818162918,
"kl": 0.2890625,
"learning_rate": 1.2193569822552772e-06,
"loss": 0.0032,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 407.0000305175781,
"epoch": 0.23761118170266837,
"grad_norm": 0.06528866291046143,
"kl": 0.228515625,
"learning_rate": 1.2087069069041268e-06,
"loss": 0.0026,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 378.8571472167969,
"epoch": 0.23824650571791614,
"grad_norm": 3.4633543491363525,
"kl": 0.29296875,
"learning_rate": 1.1981215011242654e-06,
"loss": 0.0029,
"reward": 0.6857143640518188,
"reward_std": 0.4879501163959503,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 1.0,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 499.0000305175781,
"epoch": 0.2388818297331639,
"grad_norm": 0.06882923096418381,
"kl": 0.2216796875,
"learning_rate": 1.1876012090581184e-06,
"loss": 0.0025,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 424.8571472167969,
"epoch": 0.23951715374841168,
"grad_norm": 0.048563580960035324,
"kl": 0.1806640625,
"learning_rate": 1.177146472116071e-06,
"loss": 0.0021,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 556.1428833007812,
"epoch": 0.24015247776365947,
"grad_norm": 3.4067060947418213,
"kl": 0.263671875,
"learning_rate": 1.1667577289579462e-06,
"loss": 0.0026,
"reward": 0.43296706676483154,
"reward_std": 0.06052277237176895,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.032967038452625275,
"rewards/format_reward": 1.0,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 415.0000305175781,
"epoch": 0.24078780177890724,
"grad_norm": 2.6581125259399414,
"kl": 0.3203125,
"learning_rate": 1.1564354154746007e-06,
"loss": 0.0032,
"reward": 0.5428571701049805,
"reward_std": 0.3779645562171936,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 473.0000305175781,
"epoch": 0.241423125794155,
"grad_norm": 0.12217168509960175,
"kl": 0.2734375,
"learning_rate": 1.146179964769635e-06,
"loss": 0.003,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 358.5714416503906,
"epoch": 0.2420584498094028,
"grad_norm": 0.07087297737598419,
"kl": 0.337890625,
"learning_rate": 1.1359918071412195e-06,
"loss": 0.0037,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 666.7142944335938,
"epoch": 0.24269377382465057,
"grad_norm": 3.078494071960449,
"kl": 0.1669921875,
"learning_rate": 1.1258713700640456e-06,
"loss": 0.0017,
"reward": 0.5428571701049805,
"reward_std": 0.37796446681022644,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 573.5714721679688,
"epoch": 0.24332909783989834,
"grad_norm": 0.0404311902821064,
"kl": 0.25,
"learning_rate": 1.115819078171383e-06,
"loss": 0.0028,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 508.857177734375,
"epoch": 0.24396442185514614,
"grad_norm": 2.985482931137085,
"kl": 0.1787109375,
"learning_rate": 1.1058353532372667e-06,
"loss": 0.0018,
"reward": 0.37460315227508545,
"reward_std": 0.07708179205656052,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0031746032182127237,
"rewards/format_reward": 1.0,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 499.857177734375,
"epoch": 0.2445997458703939,
"grad_norm": 2.4948058128356934,
"kl": 0.251953125,
"learning_rate": 1.0959206141587998e-06,
"loss": 0.0025,
"reward": 0.6857143640518188,
"reward_std": 0.4879501163959503,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 1.0,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 400.2857360839844,
"epoch": 0.24523506988564167,
"grad_norm": 3.1324355602264404,
"kl": 0.2119140625,
"learning_rate": 1.0860752769385766e-06,
"loss": 0.0021,
"reward": 0.6857143640518188,
"reward_std": 0.4879501163959503,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 1.0,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 425.14288330078125,
"epoch": 0.24587039390088947,
"grad_norm": 0.061597954481840134,
"kl": 0.28515625,
"learning_rate": 1.0762997546672279e-06,
"loss": 0.0032,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 500.4285888671875,
"epoch": 0.24650571791613723,
"grad_norm": 0.047814078629016876,
"kl": 0.2021484375,
"learning_rate": 1.0665944575060914e-06,
"loss": 0.0023,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 375.71429443359375,
"epoch": 0.247141041931385,
"grad_norm": 3.590963363647461,
"kl": 0.22265625,
"learning_rate": 1.056959792669997e-06,
"loss": 0.0022,
"reward": 0.5285714268684387,
"reward_std": 0.11126972734928131,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.12857143580913544,
"rewards/format_reward": 1.0,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 470.0000305175781,
"epoch": 0.24777636594663277,
"grad_norm": 0.10802248120307922,
"kl": 0.376953125,
"learning_rate": 1.0473961644101856e-06,
"loss": 0.0041,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 475.14288330078125,
"epoch": 0.24841168996188057,
"grad_norm": 0.18473738431930542,
"kl": 0.318359375,
"learning_rate": 1.037903973997345e-06,
"loss": 0.0035,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 455.5714416503906,
"epoch": 0.24904701397712833,
"grad_norm": 4.052010536193848,
"kl": 0.349609375,
"learning_rate": 1.0284836197047737e-06,
"loss": 0.0035,
"reward": 0.49629050493240356,
"reward_std": 0.17879442870616913,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0962904542684555,
"rewards/format_reward": 1.0,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 437.0000305175781,
"epoch": 0.2496823379923761,
"grad_norm": 2.7656362056732178,
"kl": 0.2158203125,
"learning_rate": 1.0191354967916712e-06,
"loss": 0.0022,
"reward": 0.6857143640518188,
"reward_std": 0.4879501163959503,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 1.0,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 513.2857666015625,
"epoch": 0.2503176620076239,
"grad_norm": 0.08351174741983414,
"kl": 0.275390625,
"learning_rate": 1.0098599974865515e-06,
"loss": 0.003,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 429.4285888671875,
"epoch": 0.25095298602287164,
"grad_norm": 0.061210744082927704,
"kl": 0.248046875,
"learning_rate": 1.0006575109707898e-06,
"loss": 0.0028,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 506.71429443359375,
"epoch": 0.25158831003811943,
"grad_norm": 0.06361155211925507,
"kl": 0.2734375,
"learning_rate": 9.915284233622877e-07,
"loss": 0.003,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 431.4285888671875,
"epoch": 0.25222363405336723,
"grad_norm": 0.06987325847148895,
"kl": 0.203125,
"learning_rate": 9.824731176992796e-07,
"loss": 0.0023,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 355.14288330078125,
"epoch": 0.25285895806861497,
"grad_norm": 0.1327361762523651,
"kl": 0.30859375,
"learning_rate": 9.734919739242543e-07,
"loss": 0.0034,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 417.5714416503906,
"epoch": 0.25349428208386277,
"grad_norm": 3.1446924209594727,
"kl": 0.333984375,
"learning_rate": 9.645853688680177e-07,
"loss": 0.0033,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 389.5714416503906,
"epoch": 0.25412960609911056,
"grad_norm": 0.048515211790800095,
"kl": 0.1787109375,
"learning_rate": 9.557536762338786e-07,
"loss": 0.0021,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 399.14288330078125,
"epoch": 0.2547649301143583,
"grad_norm": 0.053919967263936996,
"kl": 0.2109375,
"learning_rate": 9.46997266581973e-07,
"loss": 0.0024,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 408.2857360839844,
"epoch": 0.2554002541296061,
"grad_norm": 3.3761556148529053,
"kl": 0.310546875,
"learning_rate": 9.383165073137115e-07,
"loss": 0.0031,
"reward": 0.44395607709884644,
"reward_std": 0.04111712798476219,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.04395604878664017,
"rewards/format_reward": 1.0,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 423.8571472167969,
"epoch": 0.2560355781448539,
"grad_norm": 0.17396095395088196,
"kl": 0.3515625,
"learning_rate": 9.297117626563687e-07,
"loss": 0.0038,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 589.7142944335938,
"epoch": 0.25667090216010163,
"grad_norm": 0.44405487179756165,
"kl": 0.318359375,
"learning_rate": 9.211833936477957e-07,
"loss": 0.0035,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 416.0000305175781,
"epoch": 0.25730622617534943,
"grad_norm": 0.06627284735441208,
"kl": 0.29296875,
"learning_rate": 9.127317581212753e-07,
"loss": 0.0032,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 381.0000305175781,
"epoch": 0.2579415501905972,
"grad_norm": 0.10693392902612686,
"kl": 0.30078125,
"learning_rate": 9.043572106905084e-07,
"loss": 0.0033,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 476.0000305175781,
"epoch": 0.25857687420584496,
"grad_norm": 0.04814046248793602,
"kl": 0.26953125,
"learning_rate": 8.960601027347321e-07,
"loss": 0.003,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 390.4285888671875,
"epoch": 0.25921219822109276,
"grad_norm": 0.06620350480079651,
"kl": 0.21484375,
"learning_rate": 8.878407823839788e-07,
"loss": 0.0024,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 444.4285888671875,
"epoch": 0.25984752223634056,
"grad_norm": 3.463074207305908,
"kl": 0.376953125,
"learning_rate": 8.796995945044689e-07,
"loss": 0.0038,
"reward": 0.4129870533943176,
"reward_std": 0.034360405057668686,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.012987013906240463,
"rewards/format_reward": 1.0,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 442.5714416503906,
"epoch": 0.2604828462515883,
"grad_norm": 2.6474926471710205,
"kl": 0.287109375,
"learning_rate": 8.716368806841405e-07,
"loss": 0.0029,
"reward": 0.8285714983940125,
"reward_std": 0.534522533416748,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.4285714626312256,
"rewards/format_reward": 1.0,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 393.4285888671875,
"epoch": 0.2611181702668361,
"grad_norm": 3.3637120723724365,
"kl": 0.203125,
"learning_rate": 8.636529792183171e-07,
"loss": 0.002,
"reward": 0.6857143640518188,
"reward_std": 0.4879501163959503,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 1.0,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 441.857177734375,
"epoch": 0.2617534942820839,
"grad_norm": 2.904768466949463,
"kl": 0.2060546875,
"learning_rate": 8.557482250955144e-07,
"loss": 0.0021,
"reward": 0.6987013220787048,
"reward_std": 0.21078045666217804,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.29870128631591797,
"rewards/format_reward": 1.0,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 399.14288330078125,
"epoch": 0.2623888182973316,
"grad_norm": 0.10355106741189957,
"kl": 0.3359375,
"learning_rate": 8.479229499833844e-07,
"loss": 0.0037,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 456.0000305175781,
"epoch": 0.2630241423125794,
"grad_norm": 0.13930782675743103,
"kl": 0.294921875,
"learning_rate": 8.401774822147976e-07,
"loss": 0.0032,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 443.4285888671875,
"epoch": 0.2636594663278272,
"grad_norm": 3.3897175788879395,
"kl": 0.306640625,
"learning_rate": 8.325121467740695e-07,
"loss": 0.0031,
"reward": 0.5224490165710449,
"reward_std": 0.12853363156318665,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.12244899570941925,
"rewards/format_reward": 1.0,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 550.1428833007812,
"epoch": 0.26429479034307496,
"grad_norm": 0.05424835905432701,
"kl": 0.25390625,
"learning_rate": 8.249272652833226e-07,
"loss": 0.0028,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 482.14288330078125,
"epoch": 0.26493011435832275,
"grad_norm": 0.2863621115684509,
"kl": 0.3203125,
"learning_rate": 8.174231559889931e-07,
"loss": 0.0035,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 486.5714416503906,
"epoch": 0.2655654383735705,
"grad_norm": 0.05610418692231178,
"kl": 0.28515625,
"learning_rate": 8.100001337484787e-07,
"loss": 0.0031,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 466.2857360839844,
"epoch": 0.2662007623888183,
"grad_norm": 0.07580099999904633,
"kl": 0.322265625,
"learning_rate": 8.026585100169251e-07,
"loss": 0.0035,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 478.71429443359375,
"epoch": 0.2668360864040661,
"grad_norm": 3.3928205966949463,
"kl": 0.23046875,
"learning_rate": 7.953985928341601e-07,
"loss": 0.0023,
"reward": 0.4655141234397888,
"reward_std": 0.08343012630939484,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.06551411002874374,
"rewards/format_reward": 1.0,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 494.0000305175781,
"epoch": 0.2674714104193138,
"grad_norm": 0.04618528112769127,
"kl": 0.1669921875,
"learning_rate": 7.882206868117693e-07,
"loss": 0.002,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 441.5714416503906,
"epoch": 0.2681067344345616,
"grad_norm": 2.9898011684417725,
"kl": 0.28515625,
"learning_rate": 7.81125093120313e-07,
"loss": 0.0028,
"reward": 0.8285714983940125,
"reward_std": 0.534522533416748,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.4285714626312256,
"rewards/format_reward": 1.0,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 462.4285888671875,
"epoch": 0.2687420584498094,
"grad_norm": 0.056119803339242935,
"kl": 0.3046875,
"learning_rate": 7.741121094766916e-07,
"loss": 0.0034,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 312.2857360839844,
"epoch": 0.26937738246505716,
"grad_norm": 3.943939447402954,
"kl": 0.314453125,
"learning_rate": 7.671820301316532e-07,
"loss": 0.0031,
"reward": 0.48671239614486694,
"reward_std": 0.044492121785879135,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.08671241253614426,
"rewards/format_reward": 1.0,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 422.4285888671875,
"epoch": 0.27001270648030495,
"grad_norm": 0.04282496124505997,
"kl": 0.16015625,
"learning_rate": 7.603351458574474e-07,
"loss": 0.0019,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 496.2857360839844,
"epoch": 0.27064803049555275,
"grad_norm": 0.059300344437360764,
"kl": 0.26953125,
"learning_rate": 7.535717439356255e-07,
"loss": 0.003,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 476.14288330078125,
"epoch": 0.2712833545108005,
"grad_norm": 2.4999890327453613,
"kl": 0.14453125,
"learning_rate": 7.46892108144986e-07,
"loss": 0.0014,
"reward": 0.7171429395675659,
"reward_std": 0.4510992467403412,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.31714287400245667,
"rewards/format_reward": 1.0,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 488.71429443359375,
"epoch": 0.2719186785260483,
"grad_norm": 0.052872247993946075,
"kl": 0.279296875,
"learning_rate": 7.402965187496697e-07,
"loss": 0.0031,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 475.857177734375,
"epoch": 0.2725540025412961,
"grad_norm": 2.7971818447113037,
"kl": 0.1826171875,
"learning_rate": 7.337852524873974e-07,
"loss": 0.0018,
"reward": 0.5428571701049805,
"reward_std": 0.3779645562171936,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 647.1428833007812,
"epoch": 0.2731893265565438,
"grad_norm": 2.0566720962524414,
"kl": 0.2255859375,
"learning_rate": 7.273585825578608e-07,
"loss": 0.0023,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 432.8571472167969,
"epoch": 0.2738246505717916,
"grad_norm": 2.9699957370758057,
"kl": 0.27734375,
"learning_rate": 7.21016778611259e-07,
"loss": 0.0028,
"reward": 0.8285714983940125,
"reward_std": 0.534522533416748,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.4285714626312256,
"rewards/format_reward": 1.0,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 419.14288330078125,
"epoch": 0.2744599745870394,
"grad_norm": 0.10036282241344452,
"kl": 0.31640625,
"learning_rate": 7.147601067369835e-07,
"loss": 0.0035,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 323.71429443359375,
"epoch": 0.27509529860228715,
"grad_norm": 3.8502118587493896,
"kl": 0.275390625,
"learning_rate": 7.085888294524561e-07,
"loss": 0.0028,
"reward": 0.6857143640518188,
"reward_std": 0.4879501163959503,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 1.0,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 631.2857666015625,
"epoch": 0.27573062261753495,
"grad_norm": 0.05429592728614807,
"kl": 0.23828125,
"learning_rate": 7.025032056921117e-07,
"loss": 0.0027,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 455.71429443359375,
"epoch": 0.27636594663278274,
"grad_norm": 3.2364959716796875,
"kl": 0.29296875,
"learning_rate": 6.965034907965349e-07,
"loss": 0.0029,
"reward": 0.5190476179122925,
"reward_std": 0.09449110925197601,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1190476268529892,
"rewards/format_reward": 1.0,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 419.0000305175781,
"epoch": 0.2770012706480305,
"grad_norm": 3.7509405612945557,
"kl": 0.1748046875,
"learning_rate": 6.905899365017462e-07,
"loss": 0.0017,
"reward": 0.4330357313156128,
"reward_std": 0.0516289584338665,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.03303571790456772,
"rewards/format_reward": 1.0,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 386.5714416503906,
"epoch": 0.2776365946632783,
"grad_norm": 0.05398930609226227,
"kl": 0.314453125,
"learning_rate": 6.847627909286409e-07,
"loss": 0.0034,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 474.4285888671875,
"epoch": 0.2782719186785261,
"grad_norm": 2.960287094116211,
"kl": 0.181640625,
"learning_rate": 6.790222985725761e-07,
"loss": 0.0018,
"reward": 0.6857143640518188,
"reward_std": 0.4879501163959503,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 1.0,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 492.4285888671875,
"epoch": 0.2789072426937738,
"grad_norm": 3.587923288345337,
"kl": 0.2470703125,
"learning_rate": 6.733687002931141e-07,
"loss": 0.0025,
"reward": 0.4714285731315613,
"reward_std": 0.18898221850395203,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0714285746216774,
"rewards/format_reward": 1.0,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 425.5714416503906,
"epoch": 0.2795425667090216,
"grad_norm": 2.850022077560425,
"kl": 0.1650390625,
"learning_rate": 6.678022333039158e-07,
"loss": 0.0016,
"reward": 0.5428571701049805,
"reward_std": 0.3779645562171936,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 397.4285888671875,
"epoch": 0.28017789072426935,
"grad_norm": 0.045945875346660614,
"kl": 0.265625,
"learning_rate": 6.623231311627876e-07,
"loss": 0.003,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 554.5714721679688,
"epoch": 0.28081321473951715,
"grad_norm": 0.06309759616851807,
"kl": 0.265625,
"learning_rate": 6.569316237618811e-07,
"loss": 0.003,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 318.0,
"epoch": 0.28144853875476494,
"grad_norm": 3.4273765087127686,
"kl": 0.234375,
"learning_rate": 6.516279373180499e-07,
"loss": 0.0023,
"reward": 0.971428632736206,
"reward_std": 0.534522533416748,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.5714285969734192,
"rewards/format_reward": 1.0,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 551.4285888671875,
"epoch": 0.2820838627700127,
"grad_norm": 0.059127844870090485,
"kl": 0.2490234375,
"learning_rate": 6.464122943633543e-07,
"loss": 0.0028,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 388.71429443359375,
"epoch": 0.2827191867852605,
"grad_norm": 0.058766093105077744,
"kl": 0.302734375,
"learning_rate": 6.412849137357271e-07,
"loss": 0.0033,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 484.2857360839844,
"epoch": 0.2833545108005083,
"grad_norm": 0.05756891146302223,
"kl": 0.275390625,
"learning_rate": 6.3624601056979e-07,
"loss": 0.003,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 397.4285888671875,
"epoch": 0.283989834815756,
"grad_norm": 3.6736083030700684,
"kl": 0.30859375,
"learning_rate": 6.312957962878278e-07,
"loss": 0.0031,
"reward": 0.4039139151573181,
"reward_std": 0.01035518478602171,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.003913894295692444,
"rewards/format_reward": 1.0,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 528.857177734375,
"epoch": 0.2846251588310038,
"grad_norm": 0.047244369983673096,
"kl": 0.259765625,
"learning_rate": 6.264344785909181e-07,
"loss": 0.0029,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 432.5714416503906,
"epoch": 0.2852604828462516,
"grad_norm": 0.06841956079006195,
"kl": 0.298828125,
"learning_rate": 6.216622614502149e-07,
"loss": 0.0033,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 538.1428833007812,
"epoch": 0.28589580686149935,
"grad_norm": 0.052464455366134644,
"kl": 0.25,
"learning_rate": 6.169793450983916e-07,
"loss": 0.0028,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 490.71429443359375,
"epoch": 0.28653113087674714,
"grad_norm": 0.04570423439145088,
"kl": 0.2470703125,
"learning_rate": 6.123859260212393e-07,
"loss": 0.0028,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 451
},
{
"clip_ratio": 0.0,
"completion_length": 400.14288330078125,
"epoch": 0.28716645489199494,
"grad_norm": 0.05332702025771141,
"kl": 0.306640625,
"learning_rate": 6.07882196949423e-07,
"loss": 0.0034,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 338.8571472167969,
"epoch": 0.2878017789072427,
"grad_norm": 3.625446081161499,
"kl": 0.1982421875,
"learning_rate": 6.034683468503948e-07,
"loss": 0.002,
"reward": 0.8285714983940125,
"reward_std": 0.534522533416748,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.4285714626312256,
"rewards/format_reward": 1.0,
"step": 453
},
{
"clip_ratio": 0.0,
"completion_length": 361.14288330078125,
"epoch": 0.2884371029224905,
"grad_norm": 3.3959381580352783,
"kl": 0.375,
"learning_rate": 5.991445609204641e-07,
"loss": 0.0037,
"reward": 0.8285714983940125,
"reward_std": 0.534522533416748,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.4285714626312256,
"rewards/format_reward": 1.0,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 404.2857360839844,
"epoch": 0.28907242693773827,
"grad_norm": 0.06705067306756973,
"kl": 0.27734375,
"learning_rate": 5.949110205770292e-07,
"loss": 0.0031,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 368.14288330078125,
"epoch": 0.289707750952986,
"grad_norm": 0.0792422741651535,
"kl": 0.30078125,
"learning_rate": 5.90767903450964e-07,
"loss": 0.0033,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 356.0000305175781,
"epoch": 0.2903430749682338,
"grad_norm": 3.4945454597473145,
"kl": 0.1982421875,
"learning_rate": 5.867153833791652e-07,
"loss": 0.002,
"reward": 0.4571428596973419,
"reward_std": 0.053452249616384506,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.05714286118745804,
"rewards/format_reward": 1.0,
"step": 457
},
{
"clip_ratio": 0.0,
"completion_length": 371.5714416503906,
"epoch": 0.2909783989834816,
"grad_norm": 3.341874837875366,
"kl": 0.3203125,
"learning_rate": 5.827536303972587e-07,
"loss": 0.0032,
"reward": 0.44761908054351807,
"reward_std": 0.12598817050457,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0476190522313118,
"rewards/format_reward": 1.0,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 395.8571472167969,
"epoch": 0.29161372299872934,
"grad_norm": 0.0777309313416481,
"kl": 0.30078125,
"learning_rate": 5.78882810732465e-07,
"loss": 0.0033,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 459
},
{
"clip_ratio": 0.0,
"completion_length": 461.857177734375,
"epoch": 0.29224904701397714,
"grad_norm": 0.0932033360004425,
"kl": 0.3203125,
"learning_rate": 5.75103086796625e-07,
"loss": 0.0035,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 419.4285888671875,
"epoch": 0.2928843710292249,
"grad_norm": 3.0678720474243164,
"kl": 0.1650390625,
"learning_rate": 5.714146171793846e-07,
"loss": 0.0016,
"reward": 0.5428571701049805,
"reward_std": 0.3779645562171936,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 461
},
{
"clip_ratio": 0.0,
"completion_length": 438.71429443359375,
"epoch": 0.2935196950444727,
"grad_norm": 0.08228779584169388,
"kl": 0.216796875,
"learning_rate": 5.678175566415422e-07,
"loss": 0.0025,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 443.71429443359375,
"epoch": 0.29415501905972047,
"grad_norm": 2.509739398956299,
"kl": 0.349609375,
"learning_rate": 5.643120561085528e-07,
"loss": 0.0035,
"reward": 0.5428571701049805,
"reward_std": 0.3779645264148712,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 463
},
{
"clip_ratio": 0.0,
"completion_length": 476.857177734375,
"epoch": 0.2947903430749682,
"grad_norm": 0.0598638616502285,
"kl": 0.26171875,
"learning_rate": 5.608982626641991e-07,
"loss": 0.0029,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 508.0000305175781,
"epoch": 0.295425667090216,
"grad_norm": 0.04956334829330444,
"kl": 0.2109375,
"learning_rate": 5.575763195444166e-07,
"loss": 0.0024,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 519.1428833007812,
"epoch": 0.2960609911054638,
"grad_norm": 3.209406614303589,
"kl": 0.275390625,
"learning_rate": 5.543463661312847e-07,
"loss": 0.0027,
"reward": 0.5428571701049805,
"reward_std": 0.37796446681022644,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 440.857177734375,
"epoch": 0.29669631512071154,
"grad_norm": 0.05284934490919113,
"kl": 0.26171875,
"learning_rate": 5.512085379471808e-07,
"loss": 0.0029,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 467
},
{
"clip_ratio": 0.0,
"completion_length": 445.2857360839844,
"epoch": 0.29733163913595934,
"grad_norm": 3.3668439388275146,
"kl": 0.29296875,
"learning_rate": 5.481629666490903e-07,
"loss": 0.0029,
"reward": 0.485714316368103,
"reward_std": 0.08997353911399841,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.08571429550647736,
"rewards/format_reward": 1.0,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 541.0,
"epoch": 0.29796696315120713,
"grad_norm": 2.8946890830993652,
"kl": 0.2216796875,
"learning_rate": 5.452097800230853e-07,
"loss": 0.0022,
"reward": 0.3428571820259094,
"reward_std": 0.1511857956647873,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 469
},
{
"clip_ratio": 0.0,
"completion_length": 425.4285888671875,
"epoch": 0.29860228716645487,
"grad_norm": 0.04910692945122719,
"kl": 0.1669921875,
"learning_rate": 5.423491019789623e-07,
"loss": 0.002,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 238.2857208251953,
"epoch": 0.29923761118170267,
"grad_norm": 0.06361490488052368,
"kl": 0.208984375,
"learning_rate": 5.395810525450425e-07,
"loss": 0.0033,
"reward": 1.4000002145767212,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 471
},
{
"clip_ratio": 0.0,
"completion_length": 395.8571472167969,
"epoch": 0.29987293519695046,
"grad_norm": 0.04494641348719597,
"kl": 0.2041015625,
"learning_rate": 5.369057478631359e-07,
"loss": 0.0023,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 438.0000305175781,
"epoch": 0.3005082592121982,
"grad_norm": 3.899231195449829,
"kl": 0.2099609375,
"learning_rate": 5.343233001836694e-07,
"loss": 0.0021,
"reward": 0.5428571701049805,
"reward_std": 0.3779645562171936,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 473
},
{
"clip_ratio": 0.0,
"completion_length": 566.1428833007812,
"epoch": 0.301143583227446,
"grad_norm": 2.999528646469116,
"kl": 0.1689453125,
"learning_rate": 5.318338178609754e-07,
"loss": 0.0017,
"reward": 0.42418450117111206,
"reward_std": 0.06398611515760422,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.02418447658419609,
"rewards/format_reward": 1.0,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 505.2857360839844,
"epoch": 0.3017789072426938,
"grad_norm": 0.056534409523010254,
"kl": 0.2138671875,
"learning_rate": 5.294374053487459e-07,
"loss": 0.0024,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 376.71429443359375,
"epoch": 0.30241423125794153,
"grad_norm": 3.721620798110962,
"kl": 0.212890625,
"learning_rate": 5.271341631956511e-07,
"loss": 0.0021,
"reward": 0.5836734771728516,
"reward_std": 0.3662114441394806,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.18367348611354828,
"rewards/format_reward": 1.0,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 492.14288330078125,
"epoch": 0.30304955527318933,
"grad_norm": 0.05311813950538635,
"kl": 0.251953125,
"learning_rate": 5.249241880411181e-07,
"loss": 0.0028,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 477
},
{
"clip_ratio": 0.0,
"completion_length": 510.0000305175781,
"epoch": 0.3036848792884371,
"grad_norm": 3.715238332748413,
"kl": 0.240234375,
"learning_rate": 5.228075726112785e-07,
"loss": 0.0024,
"reward": 0.5428571701049805,
"reward_std": 0.3779645264148712,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 455.0000305175781,
"epoch": 0.30432020330368487,
"grad_norm": 3.2647809982299805,
"kl": 0.1640625,
"learning_rate": 5.207844057150768e-07,
"loss": 0.0016,
"reward": 0.5428571701049805,
"reward_std": 0.37796446681022644,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 479
},
{
"clip_ratio": 0.0,
"completion_length": 481.71429443359375,
"epoch": 0.30495552731893266,
"grad_norm": 0.06345637142658234,
"kl": 0.265625,
"learning_rate": 5.188547722405437e-07,
"loss": 0.003,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 439.14288330078125,
"epoch": 0.30559085133418046,
"grad_norm": 0.06628384441137314,
"kl": 0.26953125,
"learning_rate": 5.170187531512351e-07,
"loss": 0.003,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 481
},
{
"clip_ratio": 0.0,
"completion_length": 422.14288330078125,
"epoch": 0.3062261753494282,
"grad_norm": 2.7409112453460693,
"kl": 0.1591796875,
"learning_rate": 5.152764254828348e-07,
"loss": 0.0016,
"reward": 0.5428571701049805,
"reward_std": 0.3779645562171936,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 573.1428833007812,
"epoch": 0.306861499364676,
"grad_norm": 0.041587937623262405,
"kl": 0.140625,
"learning_rate": 5.136278623399225e-07,
"loss": 0.0017,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 483
},
{
"clip_ratio": 0.0,
"completion_length": 375.0000305175781,
"epoch": 0.30749682337992373,
"grad_norm": 2.9096410274505615,
"kl": 0.2001953125,
"learning_rate": 5.120731328929058e-07,
"loss": 0.002,
"reward": 0.971428632736206,
"reward_std": 0.534522533416748,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.5714285969734192,
"rewards/format_reward": 1.0,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 454.0000305175781,
"epoch": 0.30813214739517153,
"grad_norm": 3.31744122505188,
"kl": 0.21484375,
"learning_rate": 5.106123023751187e-07,
"loss": 0.0021,
"reward": 0.8511278033256531,
"reward_std": 0.23666103184223175,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.451127827167511,
"rewards/format_reward": 1.0,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 416.4285888671875,
"epoch": 0.3087674714104193,
"grad_norm": 2.8466956615448,
"kl": 0.19140625,
"learning_rate": 5.092454320800833e-07,
"loss": 0.0019,
"reward": 0.4338059425354004,
"reward_std": 0.06710861623287201,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.03380589187145233,
"rewards/format_reward": 1.0,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 466.4285888671875,
"epoch": 0.30940279542566707,
"grad_norm": 5.7825751304626465,
"kl": 0.65234375,
"learning_rate": 5.079725793589405e-07,
"loss": 0.0068,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 487
},
{
"clip_ratio": 0.0,
"completion_length": 358.4285888671875,
"epoch": 0.31003811944091486,
"grad_norm": 0.06072893738746643,
"kl": 0.1953125,
"learning_rate": 5.067937976180407e-07,
"loss": 0.0022,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 504.2857360839844,
"epoch": 0.31067344345616266,
"grad_norm": 0.03951825201511383,
"kl": 0.1630859375,
"learning_rate": 5.057091363167046e-07,
"loss": 0.0019,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 489
},
{
"clip_ratio": 0.0,
"completion_length": 235.1428680419922,
"epoch": 0.3113087674714104,
"grad_norm": 3.270069122314453,
"kl": 0.224609375,
"learning_rate": 5.047186409651489e-07,
"loss": 0.0022,
"reward": 1.3428572416305542,
"reward_std": 0.15118582546710968,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 1.0,
"rewards/format_reward": 0.8571429252624512,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 509.0000305175781,
"epoch": 0.3119440914866582,
"grad_norm": 0.1327732652425766,
"kl": 0.298828125,
"learning_rate": 5.038223531225742e-07,
"loss": 0.0033,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 491
},
{
"clip_ratio": 0.0,
"completion_length": 507.2857360839844,
"epoch": 0.312579415501906,
"grad_norm": 3.286302328109741,
"kl": 0.2353515625,
"learning_rate": 5.030203103954232e-07,
"loss": 0.0024,
"reward": 0.37142860889434814,
"reward_std": 0.07559289783239365,
"rewards/code_format_reward": 0.8571429252624512,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 433.71429443359375,
"epoch": 0.31321473951715373,
"grad_norm": 0.04749950394034386,
"kl": 0.173828125,
"learning_rate": 5.023125464358026e-07,
"loss": 0.002,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 493
},
{
"clip_ratio": 0.0,
"completion_length": 402.2857360839844,
"epoch": 0.3138500635324015,
"grad_norm": 3.375169515609741,
"kl": 0.26171875,
"learning_rate": 5.016990909400709e-07,
"loss": 0.0026,
"reward": 0.6857143640518188,
"reward_std": 0.4879501163959503,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 1.0,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 491.857177734375,
"epoch": 0.3144853875476493,
"grad_norm": 0.05139714851975441,
"kl": 0.296875,
"learning_rate": 5.011799696475915e-07,
"loss": 0.0033,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 323.5714416503906,
"epoch": 0.31512071156289706,
"grad_norm": 3.7574667930603027,
"kl": 0.181640625,
"learning_rate": 5.007552043396547e-07,
"loss": 0.0018,
"reward": 0.5428571701049805,
"reward_std": 0.3779645264148712,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.1428571492433548,
"rewards/format_reward": 1.0,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 534.857177734375,
"epoch": 0.31575603557814486,
"grad_norm": 2.853102445602417,
"kl": 0.1806640625,
"learning_rate": 5.004248128385618e-07,
"loss": 0.0018,
"reward": 0.6857143640518188,
"reward_std": 0.4879501163959503,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.2857142984867096,
"rewards/format_reward": 1.0,
"step": 497
},
{
"clip_ratio": 0.0,
"completion_length": 422.8571472167969,
"epoch": 0.31639135959339265,
"grad_norm": 3.089301824569702,
"kl": 0.1923828125,
"learning_rate": 5.001888090068784e-07,
"loss": 0.0019,
"reward": 0.8285714983940125,
"reward_std": 0.534522533416748,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.4285714626312256,
"rewards/format_reward": 1.0,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 451.2857360839844,
"epoch": 0.3170266836086404,
"grad_norm": 5.161578178405762,
"kl": 0.57421875,
"learning_rate": 5.000472027468528e-07,
"loss": 0.0057,
"reward": 0.2857142984867096,
"reward_std": 0.1951800137758255,
"rewards/code_format_reward": 0.7142857313156128,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7142857313156128,
"step": 499
},
{
"clip_ratio": 0.0,
"completion_length": 611.4285888671875,
"epoch": 0.3176620076238882,
"grad_norm": 0.048863135278224945,
"kl": 0.2060546875,
"learning_rate": 5.000000000000001e-07,
"loss": 0.0024,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 500
},
{
"clip_ratio": 0.0,
"completion_length": 444.857177734375,
"epoch": 0.318297331639136,
"grad_norm": 0.047799259424209595,
"kl": 0.19921875,
"learning_rate": 5.000472027468528e-07,
"loss": 0.0023,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 501
},
{
"clip_ratio": 0.0,
"completion_length": 446.5714416503906,
"epoch": 0.3189326556543837,
"grad_norm": 0.15391913056373596,
"kl": 0.283203125,
"learning_rate": 5.001888090068784e-07,
"loss": 0.0031,
"reward": 0.40000003576278687,
"reward_std": 0.0,
"rewards/code_format_reward": 1.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 502
},
{
"epoch": 0.3189326556543837,
"step": 502,
"total_flos": 0.0,
"train_loss": 6.229176085843033e-06,
"train_runtime": 154.2548,
"train_samples_per_second": 22.69,
"train_steps_per_second": 3.241
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}