|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 100, |
|
"global_step": 1294, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 552.1678833007812, |
|
"epoch": 0.0038639876352395673, |
|
"grad_norm": 1.1229428052902222, |
|
"kl": 0.0001697540283203125, |
|
"learning_rate": 1.9230769230769234e-07, |
|
"loss": 0.0, |
|
"reward": 0.9830357611179352, |
|
"reward_std": 0.36204318702220917, |
|
"rewards/accuracy_reward": 0.7892857611179351, |
|
"rewards/format_reward": 0.1937500096857548, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 561.3339538574219, |
|
"epoch": 0.0077279752704791345, |
|
"grad_norm": 0.6426151990890503, |
|
"kl": 0.0002849578857421875, |
|
"learning_rate": 3.846153846153847e-07, |
|
"loss": 0.0, |
|
"reward": 0.9151786208152771, |
|
"reward_std": 0.413673534989357, |
|
"rewards/accuracy_reward": 0.7642857551574707, |
|
"rewards/format_reward": 0.15089286640286445, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 571.7214538574219, |
|
"epoch": 0.011591962905718702, |
|
"grad_norm": 1.2495057582855225, |
|
"kl": 0.00039796829223632814, |
|
"learning_rate": 5.76923076923077e-07, |
|
"loss": 0.0, |
|
"reward": 0.9214286088943482, |
|
"reward_std": 0.366368842124939, |
|
"rewards/accuracy_reward": 0.7767857491970063, |
|
"rewards/format_reward": 0.14464286342263222, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 575.4223388671875, |
|
"epoch": 0.015455950540958269, |
|
"grad_norm": 0.5862982869148254, |
|
"kl": 0.0009944915771484375, |
|
"learning_rate": 7.692307692307694e-07, |
|
"loss": 0.0, |
|
"reward": 0.8276786148548126, |
|
"reward_std": 0.350461420416832, |
|
"rewards/accuracy_reward": 0.6660714685916901, |
|
"rewards/format_reward": 0.16160715073347093, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 523.0955596923828, |
|
"epoch": 0.019319938176197836, |
|
"grad_norm": 0.7838996052742004, |
|
"kl": 0.004752349853515625, |
|
"learning_rate": 9.615384615384617e-07, |
|
"loss": 0.0002, |
|
"reward": 0.9107143223285675, |
|
"reward_std": 0.38466152399778364, |
|
"rewards/accuracy_reward": 0.6642857491970062, |
|
"rewards/format_reward": 0.24642858058214187, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 520.2955688476562, |
|
"epoch": 0.023183925811437404, |
|
"grad_norm": 0.7546085119247437, |
|
"kl": 0.002777099609375, |
|
"learning_rate": 1.153846153846154e-06, |
|
"loss": 0.0001, |
|
"reward": 0.979464328289032, |
|
"reward_std": 0.3285092800855637, |
|
"rewards/accuracy_reward": 0.7267857491970062, |
|
"rewards/format_reward": 0.2526785835623741, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 564.7143005371094, |
|
"epoch": 0.02704791344667697, |
|
"grad_norm": 0.8112598657608032, |
|
"kl": 0.00201263427734375, |
|
"learning_rate": 1.3461538461538462e-06, |
|
"loss": 0.0001, |
|
"reward": 1.008035773038864, |
|
"reward_std": 0.3334413096308708, |
|
"rewards/accuracy_reward": 0.8553571760654449, |
|
"rewards/format_reward": 0.15267857983708383, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 553.3607482910156, |
|
"epoch": 0.030911901081916538, |
|
"grad_norm": 0.7396882176399231, |
|
"kl": 0.004928970336914062, |
|
"learning_rate": 1.5384615384615387e-06, |
|
"loss": 0.0002, |
|
"reward": 1.0133929133415223, |
|
"reward_std": 0.337762188911438, |
|
"rewards/accuracy_reward": 0.8500000417232514, |
|
"rewards/format_reward": 0.16339286509901285, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 549.3768127441406, |
|
"epoch": 0.0347758887171561, |
|
"grad_norm": 0.8082935214042664, |
|
"kl": 0.00592498779296875, |
|
"learning_rate": 1.7307692307692308e-06, |
|
"loss": 0.0002, |
|
"reward": 1.0535714626312256, |
|
"reward_std": 0.3635849982500076, |
|
"rewards/accuracy_reward": 0.905357176065445, |
|
"rewards/format_reward": 0.14821429401636124, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 545.2357391357422, |
|
"epoch": 0.03863987635239567, |
|
"grad_norm": 0.467140793800354, |
|
"kl": 0.007464599609375, |
|
"learning_rate": 1.9230769230769234e-06, |
|
"loss": 0.0003, |
|
"reward": 0.92857146859169, |
|
"reward_std": 0.3401346325874329, |
|
"rewards/accuracy_reward": 0.6875000327825547, |
|
"rewards/format_reward": 0.24107143878936768, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 559.1250305175781, |
|
"epoch": 0.04250386398763524, |
|
"grad_norm": 0.6232195496559143, |
|
"kl": 0.00318756103515625, |
|
"learning_rate": 2.1153846153846155e-06, |
|
"loss": 0.0001, |
|
"reward": 1.0241072118282317, |
|
"reward_std": 0.43311036825180055, |
|
"rewards/accuracy_reward": 0.8714286267757416, |
|
"rewards/format_reward": 0.15267857685685157, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 532.1384124755859, |
|
"epoch": 0.04636785162287481, |
|
"grad_norm": 0.6029495000839233, |
|
"kl": 0.009820556640625, |
|
"learning_rate": 2.307692307692308e-06, |
|
"loss": 0.0004, |
|
"reward": 1.0446429073810577, |
|
"reward_std": 0.37980674505233764, |
|
"rewards/accuracy_reward": 0.8410714626312256, |
|
"rewards/format_reward": 0.2035714380443096, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 496.79644775390625, |
|
"epoch": 0.05023183925811438, |
|
"grad_norm": 0.5804664492607117, |
|
"kl": 0.011712646484375, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.0005, |
|
"reward": 1.1000000298023225, |
|
"reward_std": 0.3491546869277954, |
|
"rewards/accuracy_reward": 0.8071428835391998, |
|
"rewards/format_reward": 0.2928571552038193, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 522.5732391357421, |
|
"epoch": 0.05409582689335394, |
|
"grad_norm": 0.29278990626335144, |
|
"kl": 0.0121734619140625, |
|
"learning_rate": 2.6923076923076923e-06, |
|
"loss": 0.0005, |
|
"reward": 1.1071429133415223, |
|
"reward_std": 0.3307413190603256, |
|
"rewards/accuracy_reward": 0.9053571879863739, |
|
"rewards/format_reward": 0.20178572386503218, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 559.1973510742188, |
|
"epoch": 0.05795981452859351, |
|
"grad_norm": 0.3587290644645691, |
|
"kl": 0.0087127685546875, |
|
"learning_rate": 2.8846153846153845e-06, |
|
"loss": 0.0003, |
|
"reward": 1.0446429073810577, |
|
"reward_std": 0.37309455275535586, |
|
"rewards/accuracy_reward": 0.867857176065445, |
|
"rewards/format_reward": 0.17678572237491608, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 563.2482452392578, |
|
"epoch": 0.061823802163833076, |
|
"grad_norm": 0.38649681210517883, |
|
"kl": 0.01405792236328125, |
|
"learning_rate": 3.0769230769230774e-06, |
|
"loss": 0.0006, |
|
"reward": 1.081250047683716, |
|
"reward_std": 0.34369638115167617, |
|
"rewards/accuracy_reward": 0.8875000417232514, |
|
"rewards/format_reward": 0.1937500111758709, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 513.6526977539063, |
|
"epoch": 0.06568778979907264, |
|
"grad_norm": 0.3370773494243622, |
|
"kl": 0.02208251953125, |
|
"learning_rate": 3.2692307692307696e-06, |
|
"loss": 0.0009, |
|
"reward": 1.0651786148548126, |
|
"reward_std": 0.4572626382112503, |
|
"rewards/accuracy_reward": 0.7392857491970062, |
|
"rewards/format_reward": 0.32589287161827085, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 483.4035949707031, |
|
"epoch": 0.0695517774343122, |
|
"grad_norm": 0.6648739576339722, |
|
"kl": 0.02352294921875, |
|
"learning_rate": 3.4615384615384617e-06, |
|
"loss": 0.0009, |
|
"reward": 1.1357143223285675, |
|
"reward_std": 0.3591114327311516, |
|
"rewards/accuracy_reward": 0.7410714626312256, |
|
"rewards/format_reward": 0.3946428716182709, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 460.51698608398436, |
|
"epoch": 0.07341576506955177, |
|
"grad_norm": 0.29094594717025757, |
|
"kl": 0.018267822265625, |
|
"learning_rate": 3.653846153846154e-06, |
|
"loss": 0.0007, |
|
"reward": 1.2437500596046447, |
|
"reward_std": 0.32878804206848145, |
|
"rewards/accuracy_reward": 0.8767857551574707, |
|
"rewards/format_reward": 0.3669643014669418, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 508.53038635253904, |
|
"epoch": 0.07727975270479134, |
|
"grad_norm": 3.374635696411133, |
|
"kl": 0.07086181640625, |
|
"learning_rate": 3.846153846153847e-06, |
|
"loss": 0.0028, |
|
"reward": 1.0633928954601288, |
|
"reward_std": 0.44108102321624754, |
|
"rewards/accuracy_reward": 0.7892857432365418, |
|
"rewards/format_reward": 0.27410715222358706, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07727975270479134, |
|
"eval_completion_length": 472.33049926757815, |
|
"eval_kl": 0.022151692708333334, |
|
"eval_loss": 0.0008794094319455326, |
|
"eval_reward": 1.147619108359019, |
|
"eval_reward_std": 0.36104824443658196, |
|
"eval_rewards/accuracy_reward": 0.7714286088943482, |
|
"eval_rewards/format_reward": 0.3761904915173849, |
|
"eval_runtime": 91.4389, |
|
"eval_samples_per_second": 1.083, |
|
"eval_steps_per_second": 0.044, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 457.9589447021484, |
|
"epoch": 0.08114374034003091, |
|
"grad_norm": 0.35772719979286194, |
|
"kl": 0.07862548828125, |
|
"learning_rate": 4.0384615384615385e-06, |
|
"loss": 0.0031, |
|
"reward": 1.0741071939468383, |
|
"reward_std": 0.36222362220287324, |
|
"rewards/accuracy_reward": 0.6642857491970062, |
|
"rewards/format_reward": 0.4098214417695999, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 409.0098358154297, |
|
"epoch": 0.08500772797527048, |
|
"grad_norm": 0.4931844472885132, |
|
"kl": 0.0488525390625, |
|
"learning_rate": 4.230769230769231e-06, |
|
"loss": 0.002, |
|
"reward": 1.1589286088943482, |
|
"reward_std": 0.4164972364902496, |
|
"rewards/accuracy_reward": 0.6375000357627869, |
|
"rewards/format_reward": 0.5214285969734191, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 437.12233276367186, |
|
"epoch": 0.08887171561051005, |
|
"grad_norm": 0.31475040316581726, |
|
"kl": 0.0398681640625, |
|
"learning_rate": 4.423076923076924e-06, |
|
"loss": 0.0016, |
|
"reward": 1.1946429133415222, |
|
"reward_std": 0.40721654444932937, |
|
"rewards/accuracy_reward": 0.673214316368103, |
|
"rewards/format_reward": 0.5214285910129547, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 468.64287719726565, |
|
"epoch": 0.09273570324574962, |
|
"grad_norm": 0.3709864318370819, |
|
"kl": 0.0320556640625, |
|
"learning_rate": 4.615384615384616e-06, |
|
"loss": 0.0013, |
|
"reward": 1.2848214983940125, |
|
"reward_std": 0.40001600831747053, |
|
"rewards/accuracy_reward": 0.7464286148548126, |
|
"rewards/format_reward": 0.5383928835391998, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 465.69109497070315, |
|
"epoch": 0.09659969088098919, |
|
"grad_norm": 0.35266467928886414, |
|
"kl": 0.0361328125, |
|
"learning_rate": 4.807692307692308e-06, |
|
"loss": 0.0014, |
|
"reward": 1.304464340209961, |
|
"reward_std": 0.3532786279916763, |
|
"rewards/accuracy_reward": 0.7678571701049804, |
|
"rewards/format_reward": 0.5366071611642838, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 417.57412109375, |
|
"epoch": 0.10046367851622875, |
|
"grad_norm": 0.5035552382469177, |
|
"kl": 0.0549560546875, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0022, |
|
"reward": 1.3687500596046447, |
|
"reward_std": 0.41467164605855944, |
|
"rewards/accuracy_reward": 0.7267857432365418, |
|
"rewards/format_reward": 0.641964316368103, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 449.3339447021484, |
|
"epoch": 0.10432766615146831, |
|
"grad_norm": 0.34655991196632385, |
|
"kl": 0.0698974609375, |
|
"learning_rate": 4.99977236595506e-06, |
|
"loss": 0.0028, |
|
"reward": 1.5678572416305543, |
|
"reward_std": 0.5014733135700226, |
|
"rewards/accuracy_reward": 0.8892857551574707, |
|
"rewards/format_reward": 0.6785714566707611, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 497.0714508056641, |
|
"epoch": 0.10819165378670788, |
|
"grad_norm": 0.6115472316741943, |
|
"kl": 0.090966796875, |
|
"learning_rate": 4.999089505274044e-06, |
|
"loss": 0.0036, |
|
"reward": 1.796428620815277, |
|
"reward_std": 0.5465091168880463, |
|
"rewards/accuracy_reward": 1.0767857611179352, |
|
"rewards/format_reward": 0.7196428894996643, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 506.6348541259766, |
|
"epoch": 0.11205564142194745, |
|
"grad_norm": 0.2844839096069336, |
|
"kl": 0.08369140625, |
|
"learning_rate": 4.9979515423108255e-06, |
|
"loss": 0.0033, |
|
"reward": 1.8116072416305542, |
|
"reward_std": 0.5752836406230927, |
|
"rewards/accuracy_reward": 1.0535714745521545, |
|
"rewards/format_reward": 0.7580357551574707, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 514.6750274658203, |
|
"epoch": 0.11591962905718702, |
|
"grad_norm": 0.32185375690460205, |
|
"kl": 0.0857666015625, |
|
"learning_rate": 4.9963586842966925e-06, |
|
"loss": 0.0034, |
|
"reward": 1.715178644657135, |
|
"reward_std": 0.6052441537380219, |
|
"rewards/accuracy_reward": 1.0142857611179352, |
|
"rewards/format_reward": 0.7008929014205932, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 498.2187835693359, |
|
"epoch": 0.11978361669242658, |
|
"grad_norm": 0.30082836747169495, |
|
"kl": 0.07783203125, |
|
"learning_rate": 4.994311221302617e-06, |
|
"loss": 0.0031, |
|
"reward": 1.80714293718338, |
|
"reward_std": 0.5749644249677658, |
|
"rewards/accuracy_reward": 0.9964285969734192, |
|
"rewards/format_reward": 0.810714328289032, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 458.05091552734376, |
|
"epoch": 0.12364760432766615, |
|
"grad_norm": 0.35644015669822693, |
|
"kl": 0.078955078125, |
|
"learning_rate": 4.991809526186424e-06, |
|
"loss": 0.0032, |
|
"reward": 1.7375000715255737, |
|
"reward_std": 0.6288636207580567, |
|
"rewards/accuracy_reward": 0.9339286148548126, |
|
"rewards/format_reward": 0.80357146859169, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 428.93572998046875, |
|
"epoch": 0.1275115919629057, |
|
"grad_norm": 1.0398685932159424, |
|
"kl": 0.097021484375, |
|
"learning_rate": 4.988854054524897e-06, |
|
"loss": 0.0039, |
|
"reward": 1.769642949104309, |
|
"reward_std": 0.6159187257289886, |
|
"rewards/accuracy_reward": 0.9625000476837158, |
|
"rewards/format_reward": 0.8071428954601287, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 447.8910888671875, |
|
"epoch": 0.13137557959814528, |
|
"grad_norm": 0.8748766779899597, |
|
"kl": 0.141357421875, |
|
"learning_rate": 4.985445344530811e-06, |
|
"loss": 0.0057, |
|
"reward": 1.7616072177886963, |
|
"reward_std": 0.6195760637521743, |
|
"rewards/accuracy_reward": 0.9589286148548126, |
|
"rewards/format_reward": 0.8026786029338837, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 443.47234497070315, |
|
"epoch": 0.13523956723338484, |
|
"grad_norm": 3.474867582321167, |
|
"kl": 0.24501953125, |
|
"learning_rate": 4.9815840169549216e-06, |
|
"loss": 0.0098, |
|
"reward": 1.7017857909202576, |
|
"reward_std": 0.6981926560401917, |
|
"rewards/accuracy_reward": 0.8839286029338836, |
|
"rewards/format_reward": 0.8178571879863739, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 463.7705505371094, |
|
"epoch": 0.1391035548686244, |
|
"grad_norm": 20.703824996948242, |
|
"kl": 0.319140625, |
|
"learning_rate": 4.9772707749729205e-06, |
|
"loss": 0.0128, |
|
"reward": 1.5928572058677672, |
|
"reward_std": 0.7676066577434539, |
|
"rewards/accuracy_reward": 0.8089286029338837, |
|
"rewards/format_reward": 0.7839286029338837, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 364.6160858154297, |
|
"epoch": 0.14296754250386398, |
|
"grad_norm": 7.634427070617676, |
|
"kl": 2.84404296875, |
|
"learning_rate": 4.9725064040573824e-06, |
|
"loss": 0.114, |
|
"reward": 1.6312500715255738, |
|
"reward_std": 0.7480017244815826, |
|
"rewards/accuracy_reward": 0.8339286148548126, |
|
"rewards/format_reward": 0.7973214685916901, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 301.68394470214844, |
|
"epoch": 0.14683153013910355, |
|
"grad_norm": 119.83277130126953, |
|
"kl": 7.9796875, |
|
"learning_rate": 4.967291771834727e-06, |
|
"loss": 0.3199, |
|
"reward": 1.4017857670783997, |
|
"reward_std": 0.9306474328041077, |
|
"rewards/accuracy_reward": 0.7446428894996643, |
|
"rewards/format_reward": 0.6571428954601288, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 398.2705535888672, |
|
"epoch": 0.15069551777434312, |
|
"grad_norm": 7.846933841705322, |
|
"kl": 2.8125, |
|
"learning_rate": 4.961627827927214e-06, |
|
"loss": 0.1125, |
|
"reward": 1.2562500476837157, |
|
"reward_std": 0.8410393178462983, |
|
"rewards/accuracy_reward": 0.6660714626312256, |
|
"rewards/format_reward": 0.5901785910129547, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 452.414306640625, |
|
"epoch": 0.1545595054095827, |
|
"grad_norm": 13.303293228149414, |
|
"kl": 3.492578125, |
|
"learning_rate": 4.955515603780013e-06, |
|
"loss": 0.1395, |
|
"reward": 1.2098214864730834, |
|
"reward_std": 0.91947420835495, |
|
"rewards/accuracy_reward": 0.7017857491970062, |
|
"rewards/format_reward": 0.508035734295845, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1545595054095827, |
|
"eval_completion_length": 502.390342203776, |
|
"eval_kl": 2.55625, |
|
"eval_loss": 0.1018948182463646, |
|
"eval_reward": 1.0833333810170491, |
|
"eval_reward_std": 0.8227311591307322, |
|
"eval_rewards/accuracy_reward": 0.7095238486925761, |
|
"eval_rewards/format_reward": 0.3738095432519913, |
|
"eval_runtime": 97.2526, |
|
"eval_samples_per_second": 1.018, |
|
"eval_steps_per_second": 0.041, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 485.92323913574216, |
|
"epoch": 0.15842349304482226, |
|
"grad_norm": 1.919914722442627, |
|
"kl": 3892.1974609375, |
|
"learning_rate": 4.948956212473371e-06, |
|
"loss": 156.0877, |
|
"reward": 1.029464328289032, |
|
"reward_std": 0.8151026546955109, |
|
"rewards/accuracy_reward": 0.6910714685916901, |
|
"rewards/format_reward": 0.33839287161827086, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 452.31519470214846, |
|
"epoch": 0.16228748068006182, |
|
"grad_norm": 3.2309248447418213, |
|
"kl": 2.9755859375, |
|
"learning_rate": 4.9419508485199045e-06, |
|
"loss": 0.1187, |
|
"reward": 1.3482143521308898, |
|
"reward_std": 0.8349850118160248, |
|
"rewards/accuracy_reward": 0.7946428954601288, |
|
"rewards/format_reward": 0.5535714477300644, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 420.15180053710935, |
|
"epoch": 0.1661514683153014, |
|
"grad_norm": 0.746425211429596, |
|
"kl": 0.511328125, |
|
"learning_rate": 4.934500787647083e-06, |
|
"loss": 0.0205, |
|
"reward": 1.7473215103149413, |
|
"reward_std": 0.6757471442222596, |
|
"rewards/accuracy_reward": 0.998214328289032, |
|
"rewards/format_reward": 0.7491071820259094, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 430.4910919189453, |
|
"epoch": 0.17001545595054096, |
|
"grad_norm": 1.5358840227127075, |
|
"kl": 0.2576171875, |
|
"learning_rate": 4.926607386564898e-06, |
|
"loss": 0.0103, |
|
"reward": 1.8616072297096253, |
|
"reward_std": 0.5690989345312119, |
|
"rewards/accuracy_reward": 1.0482143342494965, |
|
"rewards/format_reward": 0.8133928894996643, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 480.8393035888672, |
|
"epoch": 0.17387944358578053, |
|
"grad_norm": 0.8902988433837891, |
|
"kl": 736.20126953125, |
|
"learning_rate": 4.918272082718805e-06, |
|
"loss": 29.354, |
|
"reward": 1.9142858028411864, |
|
"reward_std": 0.5961195319890976, |
|
"rewards/accuracy_reward": 1.0714286267757416, |
|
"rewards/format_reward": 0.8428571879863739, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 521.7080596923828, |
|
"epoch": 0.1777434312210201, |
|
"grad_norm": 0.7753103971481323, |
|
"kl": 0.2408203125, |
|
"learning_rate": 4.909496394027945e-06, |
|
"loss": 0.0096, |
|
"reward": 1.6517857789993287, |
|
"reward_std": 0.6544238030910492, |
|
"rewards/accuracy_reward": 0.9446429014205933, |
|
"rewards/format_reward": 0.7071428954601288, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 471.0625274658203, |
|
"epoch": 0.18160741885625967, |
|
"grad_norm": 0.9643970131874084, |
|
"kl": 1.3505859375, |
|
"learning_rate": 4.900281918608732e-06, |
|
"loss": 0.054, |
|
"reward": 1.738392949104309, |
|
"reward_std": 0.6436418563127517, |
|
"rewards/accuracy_reward": 1.0196429133415221, |
|
"rewards/format_reward": 0.7187500357627868, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 397.68840942382815, |
|
"epoch": 0.18547140649149924, |
|
"grad_norm": 1.6753307580947876, |
|
"kl": 0.4291015625, |
|
"learning_rate": 4.890630334483814e-06, |
|
"loss": 0.0172, |
|
"reward": 1.8008929371833802, |
|
"reward_std": 0.610142993927002, |
|
"rewards/accuracy_reward": 0.996428620815277, |
|
"rewards/format_reward": 0.8044643223285675, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 399.38930358886716, |
|
"epoch": 0.1893353941267388, |
|
"grad_norm": 2.494441509246826, |
|
"kl": 17.448046875, |
|
"learning_rate": 4.880543399276499e-06, |
|
"loss": 0.7002, |
|
"reward": 1.5107143640518188, |
|
"reward_std": 0.76023770570755, |
|
"rewards/accuracy_reward": 0.7892857491970062, |
|
"rewards/format_reward": 0.7214286029338837, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 375.9785858154297, |
|
"epoch": 0.19319938176197837, |
|
"grad_norm": 2.4901890754699707, |
|
"kl": 1.29296875, |
|
"learning_rate": 4.870022949890676e-06, |
|
"loss": 0.0517, |
|
"reward": 1.6321429491043091, |
|
"reward_std": 0.6653113335371017, |
|
"rewards/accuracy_reward": 0.8446428894996643, |
|
"rewards/format_reward": 0.7875000357627868, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 368.9598388671875, |
|
"epoch": 0.19706336939721794, |
|
"grad_norm": 2.3111751079559326, |
|
"kl": 0.7330078125, |
|
"learning_rate": 4.859070902176305e-06, |
|
"loss": 0.0293, |
|
"reward": 1.8500001072883605, |
|
"reward_std": 0.5273477554321289, |
|
"rewards/accuracy_reward": 0.9571428954601288, |
|
"rewards/format_reward": 0.8928571820259095, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 392.04912719726565, |
|
"epoch": 0.2009273570324575, |
|
"grad_norm": 1.6753560304641724, |
|
"kl": 3.5484375, |
|
"learning_rate": 4.8476892505805224e-06, |
|
"loss": 0.1425, |
|
"reward": 1.8562500715255736, |
|
"reward_std": 0.5729426324367524, |
|
"rewards/accuracy_reward": 1.0071429133415222, |
|
"rewards/format_reward": 0.849107164144516, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 461.39019470214845, |
|
"epoch": 0.20479134466769705, |
|
"grad_norm": 7.992901802062988, |
|
"kl": 0.6904296875, |
|
"learning_rate": 4.835880067784441e-06, |
|
"loss": 0.0276, |
|
"reward": 1.7035714983940125, |
|
"reward_std": 0.814025753736496, |
|
"rewards/accuracy_reward": 0.9500000417232514, |
|
"rewards/format_reward": 0.7535714566707611, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 400.0518035888672, |
|
"epoch": 0.20865533230293662, |
|
"grad_norm": 16.02567481994629, |
|
"kl": 0.78994140625, |
|
"learning_rate": 4.823645504325699e-06, |
|
"loss": 0.0317, |
|
"reward": 1.8741072297096253, |
|
"reward_std": 0.6473642587661743, |
|
"rewards/accuracy_reward": 1.0446428954601288, |
|
"rewards/format_reward": 0.8294643223285675, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 366.62322998046875, |
|
"epoch": 0.2125193199381762, |
|
"grad_norm": 3.842503547668457, |
|
"kl": 1.327734375, |
|
"learning_rate": 4.81098778820683e-06, |
|
"loss": 0.0531, |
|
"reward": 1.9205357909202576, |
|
"reward_std": 0.5114573985338211, |
|
"rewards/accuracy_reward": 1.0375000476837157, |
|
"rewards/format_reward": 0.8830357611179351, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 419.6143005371094, |
|
"epoch": 0.21638330757341576, |
|
"grad_norm": 1.8317447900772095, |
|
"kl": 1.608203125, |
|
"learning_rate": 4.797909224489531e-06, |
|
"loss": 0.0644, |
|
"reward": 1.725892925262451, |
|
"reward_std": 0.7076287031173706, |
|
"rewards/accuracy_reward": 0.9500000417232514, |
|
"rewards/format_reward": 0.7758929014205933, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 462.8062713623047, |
|
"epoch": 0.22024729520865532, |
|
"grad_norm": 3.1712355613708496, |
|
"kl": 2.66015625, |
|
"learning_rate": 4.7844121948748904e-06, |
|
"loss": 0.1064, |
|
"reward": 1.6767858028411866, |
|
"reward_std": 0.6928595781326294, |
|
"rewards/accuracy_reward": 0.9517857491970062, |
|
"rewards/format_reward": 0.7250000417232514, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 419.5089477539062, |
|
"epoch": 0.2241112828438949, |
|
"grad_norm": 4.4721269607543945, |
|
"kl": 2.098828125, |
|
"learning_rate": 4.770499157269664e-06, |
|
"loss": 0.084, |
|
"reward": 1.8848215460777282, |
|
"reward_std": 0.6089197903871536, |
|
"rewards/accuracy_reward": 1.0571428894996644, |
|
"rewards/format_reward": 0.8276786029338836, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 382.21519470214844, |
|
"epoch": 0.22797527047913446, |
|
"grad_norm": 3.2568917274475098, |
|
"kl": 1.37666015625, |
|
"learning_rate": 4.756172645338675e-06, |
|
"loss": 0.0551, |
|
"reward": 1.9062500953674317, |
|
"reward_std": 0.5363048523664474, |
|
"rewards/accuracy_reward": 1.0035714626312255, |
|
"rewards/format_reward": 0.9026785969734192, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 400.94912109375, |
|
"epoch": 0.23183925811437403, |
|
"grad_norm": 12.386054992675781, |
|
"kl": 2.53203125, |
|
"learning_rate": 4.741435268043412e-06, |
|
"loss": 0.1013, |
|
"reward": 1.702678656578064, |
|
"reward_std": 0.7214795827865601, |
|
"rewards/accuracy_reward": 0.941071480512619, |
|
"rewards/format_reward": 0.7616071760654449, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.23183925811437403, |
|
"eval_completion_length": 454.9096964518229, |
|
"eval_kl": 4.954166666666667, |
|
"eval_loss": 0.19525596499443054, |
|
"eval_reward": 1.3000000516573589, |
|
"eval_reward_std": 0.8129177749156952, |
|
"eval_rewards/accuracy_reward": 0.7142857472101848, |
|
"eval_rewards/format_reward": 0.585714316368103, |
|
"eval_runtime": 98.4065, |
|
"eval_samples_per_second": 1.006, |
|
"eval_steps_per_second": 0.041, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 470.2223419189453, |
|
"epoch": 0.2357032457496136, |
|
"grad_norm": 3.8275136947631836, |
|
"kl": 2.878125, |
|
"learning_rate": 4.7262897091669195e-06, |
|
"loss": 0.1152, |
|
"reward": 1.290178620815277, |
|
"reward_std": 0.830661517381668, |
|
"rewards/accuracy_reward": 0.6928571701049805, |
|
"rewards/format_reward": 0.5973214536905289, |
|
"step": 305 |
|
}, |
|
{ |
|
"completion_length": 436.9973388671875, |
|
"epoch": 0.23956723338485317, |
|
"grad_norm": 2.0655641555786133, |
|
"kl": 2.27578125, |
|
"learning_rate": 4.710738726825059e-06, |
|
"loss": 0.091, |
|
"reward": 1.608928644657135, |
|
"reward_std": 0.8151212155818939, |
|
"rewards/accuracy_reward": 0.8553571701049805, |
|
"rewards/format_reward": 0.7535714507102966, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 363.5821594238281, |
|
"epoch": 0.24343122102009274, |
|
"grad_norm": 1.809662103652954, |
|
"kl": 1.327734375, |
|
"learning_rate": 4.694785152964244e-06, |
|
"loss": 0.0531, |
|
"reward": 1.8464286565780639, |
|
"reward_std": 0.5842416912317276, |
|
"rewards/accuracy_reward": 0.9767857551574707, |
|
"rewards/format_reward": 0.8696429014205933, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 341.5821563720703, |
|
"epoch": 0.2472952086553323, |
|
"grad_norm": 2.9773175716400146, |
|
"kl": 2.2734375, |
|
"learning_rate": 4.678431892845714e-06, |
|
"loss": 0.0908, |
|
"reward": 1.8232143759727477, |
|
"reward_std": 0.6686225771903992, |
|
"rewards/accuracy_reward": 0.9571428954601288, |
|
"rewards/format_reward": 0.8660714745521545, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 362.4410827636719, |
|
"epoch": 0.2511591962905719, |
|
"grad_norm": 1.7058671712875366, |
|
"kl": 1.4408203125, |
|
"learning_rate": 4.661681924516466e-06, |
|
"loss": 0.0577, |
|
"reward": 1.9660715341567994, |
|
"reward_std": 0.589709809422493, |
|
"rewards/accuracy_reward": 1.0446429073810577, |
|
"rewards/format_reward": 0.9214286148548126, |
|
"step": 325 |
|
}, |
|
{ |
|
"completion_length": 380.65447692871095, |
|
"epoch": 0.2550231839258114, |
|
"grad_norm": 0.6990386843681335, |
|
"kl": 0.7595703125, |
|
"learning_rate": 4.6445382982669365e-06, |
|
"loss": 0.0304, |
|
"reward": 2.0437501072883606, |
|
"reward_std": 0.5305197536945343, |
|
"rewards/accuracy_reward": 1.1089286267757417, |
|
"rewards/format_reward": 0.9348214745521546, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 376.0223419189453, |
|
"epoch": 0.258887171561051, |
|
"grad_norm": 182.0006866455078, |
|
"kl": 1.23125, |
|
"learning_rate": 4.627004136075514e-06, |
|
"loss": 0.0492, |
|
"reward": 2.0428572416305544, |
|
"reward_std": 0.44865317046642306, |
|
"rewards/accuracy_reward": 1.0910714745521546, |
|
"rewards/format_reward": 0.9517857491970062, |
|
"step": 335 |
|
}, |
|
{ |
|
"completion_length": 385.6750122070313, |
|
"epoch": 0.26275115919629055, |
|
"grad_norm": 0.5571011900901794, |
|
"kl": 0.4580078125, |
|
"learning_rate": 4.609082631040012e-06, |
|
"loss": 0.0183, |
|
"reward": 2.083928680419922, |
|
"reward_std": 0.48743111491203306, |
|
"rewards/accuracy_reward": 1.1214286267757416, |
|
"rewards/format_reward": 0.9625000357627869, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 376.5723388671875, |
|
"epoch": 0.26661514683153015, |
|
"grad_norm": 1.1355966329574585, |
|
"kl": 0.5251953125, |
|
"learning_rate": 4.5907770467961755e-06, |
|
"loss": 0.021, |
|
"reward": 1.9705358266830444, |
|
"reward_std": 0.5101535975933075, |
|
"rewards/accuracy_reward": 1.0375000417232514, |
|
"rewards/format_reward": 0.9330357611179352, |
|
"step": 345 |
|
}, |
|
{ |
|
"completion_length": 462.158056640625, |
|
"epoch": 0.2704791344667697, |
|
"grad_norm": 1.7409440279006958, |
|
"kl": 1.1546875, |
|
"learning_rate": 4.572090716923354e-06, |
|
"loss": 0.0462, |
|
"reward": 1.6875000715255737, |
|
"reward_std": 0.7979255437850952, |
|
"rewards/accuracy_reward": 0.8857143223285675, |
|
"rewards/format_reward": 0.8017857491970062, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 482.0419860839844, |
|
"epoch": 0.2743431221020093, |
|
"grad_norm": 4.6754655838012695, |
|
"kl": 1.208203125, |
|
"learning_rate": 4.5530270443374305e-06, |
|
"loss": 0.0484, |
|
"reward": 1.471428632736206, |
|
"reward_std": 0.8514433205127716, |
|
"rewards/accuracy_reward": 0.7285714626312256, |
|
"rewards/format_reward": 0.7428571701049804, |
|
"step": 355 |
|
}, |
|
{ |
|
"completion_length": 484.2116333007813, |
|
"epoch": 0.2782071097372488, |
|
"grad_norm": 11.937480926513672, |
|
"kl": 5.473828125, |
|
"learning_rate": 4.533589500671126e-06, |
|
"loss": 0.219, |
|
"reward": 1.4812500596046447, |
|
"reward_std": 0.8435086131095886, |
|
"rewards/accuracy_reward": 0.7446428775787354, |
|
"rewards/format_reward": 0.7366071760654449, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 538.0169860839844, |
|
"epoch": 0.2820710973724884, |
|
"grad_norm": 1.5927709341049194, |
|
"kl": 2.071875, |
|
"learning_rate": 4.513781625641793e-06, |
|
"loss": 0.0829, |
|
"reward": 1.3562500596046447, |
|
"reward_std": 0.9622864723205566, |
|
"rewards/accuracy_reward": 0.7089285969734191, |
|
"rewards/format_reward": 0.6473214566707611, |
|
"step": 365 |
|
}, |
|
{ |
|
"completion_length": 498.48663330078125, |
|
"epoch": 0.28593508500772796, |
|
"grad_norm": 2.080902338027954, |
|
"kl": 3.178125, |
|
"learning_rate": 4.493607026406802e-06, |
|
"loss": 0.1272, |
|
"reward": 1.2741071939468385, |
|
"reward_std": 0.8825342118740082, |
|
"rewards/accuracy_reward": 0.6517857432365417, |
|
"rewards/format_reward": 0.6223214626312256, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 489.34288330078124, |
|
"epoch": 0.28979907264296756, |
|
"grad_norm": 43.23033142089844, |
|
"kl": 8.0921875, |
|
"learning_rate": 4.473069376906657e-06, |
|
"loss": 0.3234, |
|
"reward": 1.111607199907303, |
|
"reward_std": 0.8139643251895905, |
|
"rewards/accuracy_reward": 0.5375000178813935, |
|
"rewards/format_reward": 0.5741071790456772, |
|
"step": 375 |
|
}, |
|
{ |
|
"completion_length": 547.9125305175781, |
|
"epoch": 0.2936630602782071, |
|
"grad_norm": 3.59335994720459, |
|
"kl": 2.996875, |
|
"learning_rate": 4.4521724171959404e-06, |
|
"loss": 0.1198, |
|
"reward": 0.965178620815277, |
|
"reward_std": 0.776919960975647, |
|
"rewards/accuracy_reward": 0.47142858505249025, |
|
"rewards/format_reward": 0.4937500238418579, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 447.9116271972656, |
|
"epoch": 0.2975270479134467, |
|
"grad_norm": 10.616156578063965, |
|
"kl": 2.9203125, |
|
"learning_rate": 4.430919952762226e-06, |
|
"loss": 0.1167, |
|
"reward": 1.7098215222358704, |
|
"reward_std": 0.7615076899528503, |
|
"rewards/accuracy_reward": 0.9089286088943481, |
|
"rewards/format_reward": 0.8008928894996643, |
|
"step": 385 |
|
}, |
|
{ |
|
"completion_length": 408.0750213623047, |
|
"epoch": 0.30139103554868624, |
|
"grad_norm": 2.474897623062134, |
|
"kl": 1.240625, |
|
"learning_rate": 4.409315853833068e-06, |
|
"loss": 0.0495, |
|
"reward": 1.898214375972748, |
|
"reward_std": 0.6294975191354751, |
|
"rewards/accuracy_reward": 1.0250000357627869, |
|
"rewards/format_reward": 0.873214328289032, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 432.92323608398436, |
|
"epoch": 0.30525502318392583, |
|
"grad_norm": 1.0525386333465576, |
|
"kl": 0.984765625, |
|
"learning_rate": 4.387364054671208e-06, |
|
"loss": 0.0394, |
|
"reward": 1.804464375972748, |
|
"reward_std": 0.7036986917257309, |
|
"rewards/accuracy_reward": 0.9642857491970063, |
|
"rewards/format_reward": 0.8401786148548126, |
|
"step": 395 |
|
}, |
|
{ |
|
"completion_length": 426.0428771972656, |
|
"epoch": 0.3091190108191654, |
|
"grad_norm": 28.755624771118164, |
|
"kl": 0.57890625, |
|
"learning_rate": 4.365068552858116e-06, |
|
"loss": 0.0232, |
|
"reward": 1.8732143640518188, |
|
"reward_std": 0.5758820742368698, |
|
"rewards/accuracy_reward": 1.021428632736206, |
|
"rewards/format_reward": 0.8517857551574707, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.3091190108191654, |
|
"eval_completion_length": 421.4955749511719, |
|
"eval_kl": 0.2569661458333333, |
|
"eval_loss": 0.01039376575499773, |
|
"eval_reward": 1.8190476894378662, |
|
"eval_reward_std": 0.5760457158088684, |
|
"eval_rewards/accuracy_reward": 0.9333333849906922, |
|
"eval_rewards/format_reward": 0.885714328289032, |
|
"eval_runtime": 92.0979, |
|
"eval_samples_per_second": 1.075, |
|
"eval_steps_per_second": 0.043, |
|
"step": 400 |
|
}, |
|
{ |
|
"completion_length": 431.314306640625, |
|
"epoch": 0.31298299845440497, |
|
"grad_norm": 1.367416262626648, |
|
"kl": 0.28056640625, |
|
"learning_rate": 4.342433408566e-06, |
|
"loss": 0.0112, |
|
"reward": 1.8312501072883607, |
|
"reward_std": 0.6301711022853851, |
|
"rewards/accuracy_reward": 0.9732143342494964, |
|
"rewards/format_reward": 0.8580357611179352, |
|
"step": 405 |
|
}, |
|
{ |
|
"completion_length": 430.4732360839844, |
|
"epoch": 0.3168469860896445, |
|
"grad_norm": 0.7723910212516785, |
|
"kl": 0.4091796875, |
|
"learning_rate": 4.3194627438184235e-06, |
|
"loss": 0.0164, |
|
"reward": 1.748214340209961, |
|
"reward_std": 0.6911887288093567, |
|
"rewards/accuracy_reward": 0.9267857611179352, |
|
"rewards/format_reward": 0.8214286088943481, |
|
"step": 410 |
|
}, |
|
{ |
|
"completion_length": 524.7294921875, |
|
"epoch": 0.3207109737248841, |
|
"grad_norm": 1.0905834436416626, |
|
"kl": 0.517578125, |
|
"learning_rate": 4.296160741739652e-06, |
|
"loss": 0.0207, |
|
"reward": 1.440178632736206, |
|
"reward_std": 0.7554588854312897, |
|
"rewards/accuracy_reward": 0.7535714626312255, |
|
"rewards/format_reward": 0.686607176065445, |
|
"step": 415 |
|
}, |
|
{ |
|
"completion_length": 449.8026947021484, |
|
"epoch": 0.32457496136012365, |
|
"grad_norm": 0.6751212477684021, |
|
"kl": 0.26748046875, |
|
"learning_rate": 4.272531645792876e-06, |
|
"loss": 0.0107, |
|
"reward": 1.554464375972748, |
|
"reward_std": 0.7342777848243713, |
|
"rewards/accuracy_reward": 0.7571428775787353, |
|
"rewards/format_reward": 0.7973214685916901, |
|
"step": 420 |
|
}, |
|
{ |
|
"completion_length": 425.3678771972656, |
|
"epoch": 0.3284389489953632, |
|
"grad_norm": 0.857072651386261, |
|
"kl": 0.311328125, |
|
"learning_rate": 4.2485797590074465e-06, |
|
"loss": 0.0124, |
|
"reward": 1.7098215222358704, |
|
"reward_std": 0.6917063415050506, |
|
"rewards/accuracy_reward": 0.8803571879863739, |
|
"rewards/format_reward": 0.829464316368103, |
|
"step": 425 |
|
}, |
|
{ |
|
"completion_length": 368.4768035888672, |
|
"epoch": 0.3323029366306028, |
|
"grad_norm": 1.9811636209487915, |
|
"kl": 0.26328125, |
|
"learning_rate": 4.224309443195261e-06, |
|
"loss": 0.0105, |
|
"reward": 1.8812500953674316, |
|
"reward_std": 0.5524741142988205, |
|
"rewards/accuracy_reward": 0.9732143342494964, |
|
"rewards/format_reward": 0.9080357611179352, |
|
"step": 430 |
|
}, |
|
{ |
|
"completion_length": 443.5598419189453, |
|
"epoch": 0.3361669242658423, |
|
"grad_norm": 1.1125954389572144, |
|
"kl": 0.38134765625, |
|
"learning_rate": 4.199725118156448e-06, |
|
"loss": 0.0152, |
|
"reward": 1.6392857909202576, |
|
"reward_std": 0.662892284989357, |
|
"rewards/accuracy_reward": 0.8517857611179351, |
|
"rewards/format_reward": 0.7875000298023224, |
|
"step": 435 |
|
}, |
|
{ |
|
"completion_length": 445.95270080566405, |
|
"epoch": 0.3400309119010819, |
|
"grad_norm": 1.947538137435913, |
|
"kl": 0.4873046875, |
|
"learning_rate": 4.174831260874489e-06, |
|
"loss": 0.0195, |
|
"reward": 1.7642858147621154, |
|
"reward_std": 0.8089473009109497, |
|
"rewards/accuracy_reward": 0.9946429014205933, |
|
"rewards/format_reward": 0.7696428954601288, |
|
"step": 440 |
|
}, |
|
{ |
|
"completion_length": 463.2759094238281, |
|
"epoch": 0.34389489953632146, |
|
"grad_norm": 0.44283971190452576, |
|
"kl": 0.41533203125, |
|
"learning_rate": 4.149632404700925e-06, |
|
"loss": 0.0166, |
|
"reward": 1.7160715103149413, |
|
"reward_std": 0.7051611065864563, |
|
"rewards/accuracy_reward": 0.9357143342494965, |
|
"rewards/format_reward": 0.780357176065445, |
|
"step": 445 |
|
}, |
|
{ |
|
"completion_length": 396.64288024902345, |
|
"epoch": 0.34775888717156106, |
|
"grad_norm": 0.5284359455108643, |
|
"kl": 0.1916015625, |
|
"learning_rate": 4.124133138529804e-06, |
|
"loss": 0.0077, |
|
"reward": 1.9616072297096252, |
|
"reward_std": 0.5300792083144188, |
|
"rewards/accuracy_reward": 1.046428620815277, |
|
"rewards/format_reward": 0.9151786208152771, |
|
"step": 450 |
|
}, |
|
{ |
|
"completion_length": 359.92144165039065, |
|
"epoch": 0.3516228748068006, |
|
"grad_norm": 0.6384701728820801, |
|
"kl": 0.18466796875, |
|
"learning_rate": 4.098338105962004e-06, |
|
"loss": 0.0074, |
|
"reward": 2.0705358505249025, |
|
"reward_std": 0.48216700553894043, |
|
"rewards/accuracy_reward": 1.1160714685916902, |
|
"rewards/format_reward": 0.9544643223285675, |
|
"step": 455 |
|
}, |
|
{ |
|
"completion_length": 353.79465942382814, |
|
"epoch": 0.3554868624420402, |
|
"grad_norm": 2.704573154449463, |
|
"kl": 0.2443359375, |
|
"learning_rate": 4.072252004459612e-06, |
|
"loss": 0.0098, |
|
"reward": 1.977678644657135, |
|
"reward_std": 0.511484894156456, |
|
"rewards/accuracy_reward": 1.0285714864730835, |
|
"rewards/format_reward": 0.9491071760654449, |
|
"step": 460 |
|
}, |
|
{ |
|
"completion_length": 352.4866180419922, |
|
"epoch": 0.35935085007727974, |
|
"grad_norm": 1.346289038658142, |
|
"kl": 0.3484375, |
|
"learning_rate": 4.045879584490466e-06, |
|
"loss": 0.0139, |
|
"reward": 1.8348215341567993, |
|
"reward_std": 0.6451143264770508, |
|
"rewards/accuracy_reward": 0.917857187986374, |
|
"rewards/format_reward": 0.916964328289032, |
|
"step": 465 |
|
}, |
|
{ |
|
"completion_length": 454.8107391357422, |
|
"epoch": 0.36321483771251933, |
|
"grad_norm": 0.8649754524230957, |
|
"kl": 0.369140625, |
|
"learning_rate": 4.019225648663073e-06, |
|
"loss": 0.0148, |
|
"reward": 1.560714340209961, |
|
"reward_std": 0.811360216140747, |
|
"rewards/accuracy_reward": 0.7607143223285675, |
|
"rewards/format_reward": 0.8000000298023224, |
|
"step": 470 |
|
}, |
|
{ |
|
"completion_length": 382.1169830322266, |
|
"epoch": 0.3670788253477589, |
|
"grad_norm": 2.951951742172241, |
|
"kl": 0.2439453125, |
|
"learning_rate": 3.992295050852013e-06, |
|
"loss": 0.0098, |
|
"reward": 1.8357143878936768, |
|
"reward_std": 0.6485567986965179, |
|
"rewards/accuracy_reward": 0.9232143223285675, |
|
"rewards/format_reward": 0.9125000476837158, |
|
"step": 475 |
|
}, |
|
{ |
|
"completion_length": 377.8848358154297, |
|
"epoch": 0.37094281298299847, |
|
"grad_norm": 3.4858837127685547, |
|
"kl": 0.23447265625, |
|
"learning_rate": 3.965092695314018e-06, |
|
"loss": 0.0094, |
|
"reward": 1.8285714983940125, |
|
"reward_std": 0.5620756894350052, |
|
"rewards/accuracy_reward": 0.8892857611179352, |
|
"rewards/format_reward": 0.9392857670783996, |
|
"step": 480 |
|
}, |
|
{ |
|
"completion_length": 420.1151947021484, |
|
"epoch": 0.374806800618238, |
|
"grad_norm": 0.7943024635314941, |
|
"kl": 0.3015625, |
|
"learning_rate": 3.937623535794864e-06, |
|
"loss": 0.0121, |
|
"reward": 1.8723214864730835, |
|
"reward_std": 0.5844464153051376, |
|
"rewards/accuracy_reward": 0.9607143461704254, |
|
"rewards/format_reward": 0.911607176065445, |
|
"step": 485 |
|
}, |
|
{ |
|
"completion_length": 476.45448608398436, |
|
"epoch": 0.3786707882534776, |
|
"grad_norm": 0.619339108467102, |
|
"kl": 0.3705078125, |
|
"learning_rate": 3.909892574627267e-06, |
|
"loss": 0.0148, |
|
"reward": 1.7160714983940124, |
|
"reward_std": 0.6855497658252716, |
|
"rewards/accuracy_reward": 0.8964286088943482, |
|
"rewards/format_reward": 0.8196428835391998, |
|
"step": 490 |
|
}, |
|
{ |
|
"completion_length": 449.3410919189453, |
|
"epoch": 0.38253477588871715, |
|
"grad_norm": 0.9153338074684143, |
|
"kl": 0.2828125, |
|
"learning_rate": 3.881904861819914e-06, |
|
"loss": 0.0113, |
|
"reward": 1.7375000834465026, |
|
"reward_std": 0.773430997133255, |
|
"rewards/accuracy_reward": 0.9160714864730835, |
|
"rewards/format_reward": 0.8214286029338836, |
|
"step": 495 |
|
}, |
|
{ |
|
"completion_length": 424.5857330322266, |
|
"epoch": 0.38639876352395675, |
|
"grad_norm": 2.4771955013275146, |
|
"kl": 0.30927734375, |
|
"learning_rate": 3.853665494137825e-06, |
|
"loss": 0.0124, |
|
"reward": 1.7491072177886964, |
|
"reward_std": 0.6954550087451935, |
|
"rewards/accuracy_reward": 0.8785714685916901, |
|
"rewards/format_reward": 0.8705357670783996, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.38639876352395675, |
|
"eval_completion_length": 375.0712870279948, |
|
"eval_kl": 0.3849609375, |
|
"eval_loss": 0.015607084147632122, |
|
"eval_reward": 1.8738096157709758, |
|
"eval_reward_std": 0.5338563899199168, |
|
"eval_rewards/accuracy_reward": 0.9523809909820556, |
|
"eval_rewards/format_reward": 0.9214285969734192, |
|
"eval_runtime": 89.8262, |
|
"eval_samples_per_second": 1.102, |
|
"eval_steps_per_second": 0.045, |
|
"step": 500 |
|
}, |
|
{ |
|
"completion_length": 388.8553741455078, |
|
"epoch": 0.3902627511591963, |
|
"grad_norm": 0.7688225507736206, |
|
"kl": 0.4513671875, |
|
"learning_rate": 3.825179614174195e-06, |
|
"loss": 0.0181, |
|
"reward": 1.8348215222358704, |
|
"reward_std": 0.6801443308591842, |
|
"rewards/accuracy_reward": 0.9339286029338837, |
|
"rewards/format_reward": 0.9008928954601287, |
|
"step": 505 |
|
}, |
|
{ |
|
"completion_length": 369.0491241455078, |
|
"epoch": 0.3941267387944359, |
|
"grad_norm": 0.883755624294281, |
|
"kl": 0.4251953125, |
|
"learning_rate": 3.796452409413887e-06, |
|
"loss": 0.017, |
|
"reward": 1.913392972946167, |
|
"reward_std": 0.598648875951767, |
|
"rewards/accuracy_reward": 0.9767857670783997, |
|
"rewards/format_reward": 0.9366071820259094, |
|
"step": 510 |
|
}, |
|
{ |
|
"completion_length": 370.35894775390625, |
|
"epoch": 0.3979907264296754, |
|
"grad_norm": 1.2042393684387207, |
|
"kl": 0.3263671875, |
|
"learning_rate": 3.767489111288757e-06, |
|
"loss": 0.0131, |
|
"reward": 1.8642858028411866, |
|
"reward_std": 0.5265406250953675, |
|
"rewards/accuracy_reward": 0.9267857611179352, |
|
"rewards/format_reward": 0.9375000476837159, |
|
"step": 515 |
|
}, |
|
{ |
|
"completion_length": 396.01519775390625, |
|
"epoch": 0.401854714064915, |
|
"grad_norm": 2.4166510105133057, |
|
"kl": 0.45439453125, |
|
"learning_rate": 3.7382949942249695e-06, |
|
"loss": 0.0182, |
|
"reward": 1.8446429371833801, |
|
"reward_std": 0.5753746211528779, |
|
"rewards/accuracy_reward": 0.9410714566707611, |
|
"rewards/format_reward": 0.90357146859169, |
|
"step": 520 |
|
}, |
|
{ |
|
"completion_length": 387.351806640625, |
|
"epoch": 0.40571870170015456, |
|
"grad_norm": 1.1032941341400146, |
|
"kl": 0.33154296875, |
|
"learning_rate": 3.7088753746824896e-06, |
|
"loss": 0.0133, |
|
"reward": 1.9517857789993287, |
|
"reward_std": 0.62562136054039, |
|
"rewards/accuracy_reward": 1.0428571939468383, |
|
"rewards/format_reward": 0.9089286208152771, |
|
"step": 525 |
|
}, |
|
{ |
|
"completion_length": 399.0643035888672, |
|
"epoch": 0.4095826893353941, |
|
"grad_norm": 0.6850565075874329, |
|
"kl": 0.26015625, |
|
"learning_rate": 3.6792356101869157e-06, |
|
"loss": 0.0104, |
|
"reward": 1.8660715222358704, |
|
"reward_std": 0.6411642551422119, |
|
"rewards/accuracy_reward": 0.9500000476837158, |
|
"rewards/format_reward": 0.9160714626312256, |
|
"step": 530 |
|
}, |
|
{ |
|
"completion_length": 409.71162109375, |
|
"epoch": 0.4134466769706337, |
|
"grad_norm": 0.8124867677688599, |
|
"kl": 0.6361328125, |
|
"learning_rate": 3.649381098353834e-06, |
|
"loss": 0.0255, |
|
"reward": 1.9285714864730834, |
|
"reward_std": 0.6074248850345612, |
|
"rewards/accuracy_reward": 1.0392857670783997, |
|
"rewards/format_reward": 0.8892857611179352, |
|
"step": 535 |
|
}, |
|
{ |
|
"completion_length": 388.5687622070312, |
|
"epoch": 0.41731066460587324, |
|
"grad_norm": 0.6513718366622925, |
|
"kl": 0.31435546875, |
|
"learning_rate": 3.619317275905874e-06, |
|
"loss": 0.0126, |
|
"reward": 1.9750000953674316, |
|
"reward_std": 0.6598183512687683, |
|
"rewards/accuracy_reward": 1.0875000476837158, |
|
"rewards/format_reward": 0.8875000417232514, |
|
"step": 540 |
|
}, |
|
{ |
|
"completion_length": 375.90537109375, |
|
"epoch": 0.42117465224111283, |
|
"grad_norm": 0.9380148649215698, |
|
"kl": 0.3015625, |
|
"learning_rate": 3.589049617682646e-06, |
|
"loss": 0.0121, |
|
"reward": 1.9455357909202575, |
|
"reward_std": 0.6462159514427185, |
|
"rewards/accuracy_reward": 1.0446428954601288, |
|
"rewards/format_reward": 0.9008929073810578, |
|
"step": 545 |
|
}, |
|
{ |
|
"completion_length": 415.32323303222654, |
|
"epoch": 0.4250386398763524, |
|
"grad_norm": 3.0105764865875244, |
|
"kl": 0.60146484375, |
|
"learning_rate": 3.5585836356437266e-06, |
|
"loss": 0.0241, |
|
"reward": 1.81071435213089, |
|
"reward_std": 0.6503873735666275, |
|
"rewards/accuracy_reward": 0.9339286088943481, |
|
"rewards/format_reward": 0.8767857670783996, |
|
"step": 550 |
|
}, |
|
{ |
|
"completion_length": 440.2902008056641, |
|
"epoch": 0.42890262751159197, |
|
"grad_norm": 3.932904005050659, |
|
"kl": 0.581640625, |
|
"learning_rate": 3.5279248778648944e-06, |
|
"loss": 0.0233, |
|
"reward": 1.725892925262451, |
|
"reward_std": 0.7691966652870178, |
|
"rewards/accuracy_reward": 0.9178571760654449, |
|
"rewards/format_reward": 0.8080357551574707, |
|
"step": 555 |
|
}, |
|
{ |
|
"completion_length": 392.79644165039065, |
|
"epoch": 0.4327666151468315, |
|
"grad_norm": 1.320512294769287, |
|
"kl": 0.74921875, |
|
"learning_rate": 3.4970789275277878e-06, |
|
"loss": 0.03, |
|
"reward": 1.7607143640518188, |
|
"reward_std": 0.6493774831295014, |
|
"rewards/accuracy_reward": 0.9071428954601288, |
|
"rewards/format_reward": 0.853571480512619, |
|
"step": 560 |
|
}, |
|
{ |
|
"completion_length": 335.8830505371094, |
|
"epoch": 0.4366306027820711, |
|
"grad_norm": 0.6699833273887634, |
|
"kl": 1.0361328125, |
|
"learning_rate": 3.466051401903162e-06, |
|
"loss": 0.0416, |
|
"reward": 1.929464375972748, |
|
"reward_std": 0.5458748698234558, |
|
"rewards/accuracy_reward": 0.9892857730388641, |
|
"rewards/format_reward": 0.9401786088943481, |
|
"step": 565 |
|
}, |
|
{ |
|
"completion_length": 366.95447998046876, |
|
"epoch": 0.44049459041731065, |
|
"grad_norm": 1.531845211982727, |
|
"kl": 1.183984375, |
|
"learning_rate": 3.434847951327949e-06, |
|
"loss": 0.0474, |
|
"reward": 1.9107143759727478, |
|
"reward_std": 0.57608083486557, |
|
"rewards/accuracy_reward": 0.9750000536441803, |
|
"rewards/format_reward": 0.9357143223285675, |
|
"step": 570 |
|
}, |
|
{ |
|
"completion_length": 406.04376525878905, |
|
"epoch": 0.44435857805255025, |
|
"grad_norm": 2.7006077766418457, |
|
"kl": 1.67265625, |
|
"learning_rate": 3.403474258176283e-06, |
|
"loss": 0.067, |
|
"reward": 1.8223215341567993, |
|
"reward_std": 0.6289676070213318, |
|
"rewards/accuracy_reward": 0.9214286029338836, |
|
"rewards/format_reward": 0.9008929073810578, |
|
"step": 575 |
|
}, |
|
{ |
|
"completion_length": 434.67323608398436, |
|
"epoch": 0.4482225656877898, |
|
"grad_norm": 2.290830135345459, |
|
"kl": 2.16640625, |
|
"learning_rate": 3.3719360358247054e-06, |
|
"loss": 0.0866, |
|
"reward": 1.746428644657135, |
|
"reward_std": 0.7944486320018769, |
|
"rewards/accuracy_reward": 0.9267857491970062, |
|
"rewards/format_reward": 0.8196428894996644, |
|
"step": 580 |
|
}, |
|
{ |
|
"completion_length": 459.25270385742186, |
|
"epoch": 0.4520865533230294, |
|
"grad_norm": 1.7977360486984253, |
|
"kl": 1.92578125, |
|
"learning_rate": 3.3402390276117175e-06, |
|
"loss": 0.077, |
|
"reward": 1.6910715222358703, |
|
"reward_std": 0.8230761885643005, |
|
"rewards/accuracy_reward": 0.8892857611179352, |
|
"rewards/format_reward": 0.8017857432365417, |
|
"step": 585 |
|
}, |
|
{ |
|
"completion_length": 437.5250213623047, |
|
"epoch": 0.4559505409582689, |
|
"grad_norm": 1.8455265760421753, |
|
"kl": 1.071484375, |
|
"learning_rate": 3.308389005791872e-06, |
|
"loss": 0.0429, |
|
"reward": 1.7035714983940125, |
|
"reward_std": 0.7656769216060638, |
|
"rewards/accuracy_reward": 0.878571480512619, |
|
"rewards/format_reward": 0.8250000417232514, |
|
"step": 590 |
|
}, |
|
{ |
|
"completion_length": 435.51698303222656, |
|
"epoch": 0.4598145285935085, |
|
"grad_norm": 4.333515167236328, |
|
"kl": 0.9978515625, |
|
"learning_rate": 3.276391770484606e-06, |
|
"loss": 0.0399, |
|
"reward": 1.6803572058677674, |
|
"reward_std": 0.7570467174053193, |
|
"rewards/accuracy_reward": 0.8267857551574707, |
|
"rewards/format_reward": 0.8535714685916901, |
|
"step": 595 |
|
}, |
|
{ |
|
"completion_length": 410.84645080566406, |
|
"epoch": 0.46367851622874806, |
|
"grad_norm": 2.310718536376953, |
|
"kl": 0.76328125, |
|
"learning_rate": 3.244253148618002e-06, |
|
"loss": 0.0305, |
|
"reward": 1.771428644657135, |
|
"reward_std": 0.7016718983650208, |
|
"rewards/accuracy_reward": 0.9071428954601288, |
|
"rewards/format_reward": 0.8642857551574707, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.46367851622874806, |
|
"eval_completion_length": 385.1636678059896, |
|
"eval_kl": 0.6589192708333333, |
|
"eval_loss": 0.025798479095101357, |
|
"eval_reward": 1.7809524695078531, |
|
"eval_reward_std": 0.6921192049980164, |
|
"eval_rewards/accuracy_reward": 0.9285714546839396, |
|
"eval_rewards/format_reward": 0.8523809949556986, |
|
"eval_runtime": 92.176, |
|
"eval_samples_per_second": 1.074, |
|
"eval_steps_per_second": 0.043, |
|
"step": 600 |
|
}, |
|
{ |
|
"completion_length": 408.04198303222654, |
|
"epoch": 0.46754250386398766, |
|
"grad_norm": 1.6843591928482056, |
|
"kl": 0.76171875, |
|
"learning_rate": 3.211978992867653e-06, |
|
"loss": 0.0305, |
|
"reward": 1.771428656578064, |
|
"reward_std": 0.6202342182397842, |
|
"rewards/accuracy_reward": 0.8875000417232514, |
|
"rewards/format_reward": 0.8839286148548127, |
|
"step": 605 |
|
}, |
|
{ |
|
"completion_length": 395.34287414550784, |
|
"epoch": 0.4714064914992272, |
|
"grad_norm": 1.7479908466339111, |
|
"kl": 0.844140625, |
|
"learning_rate": 3.1795751805908578e-06, |
|
"loss": 0.0338, |
|
"reward": 1.8946429252624513, |
|
"reward_std": 0.6609594583511352, |
|
"rewards/accuracy_reward": 1.0071428954601287, |
|
"rewards/format_reward": 0.8875000476837158, |
|
"step": 610 |
|
}, |
|
{ |
|
"completion_length": 426.66698303222654, |
|
"epoch": 0.4752704791344668, |
|
"grad_norm": 4.6757588386535645, |
|
"kl": 0.909375, |
|
"learning_rate": 3.147047612756302e-06, |
|
"loss": 0.0364, |
|
"reward": 1.8125000596046448, |
|
"reward_std": 0.7211765825748444, |
|
"rewards/accuracy_reward": 0.9125000417232514, |
|
"rewards/format_reward": 0.9000000476837158, |
|
"step": 615 |
|
}, |
|
{ |
|
"completion_length": 409.9562713623047, |
|
"epoch": 0.47913446676970634, |
|
"grad_norm": 0.9073500633239746, |
|
"kl": 0.8787109375, |
|
"learning_rate": 3.1144022128694583e-06, |
|
"loss": 0.0352, |
|
"reward": 1.821428644657135, |
|
"reward_std": 0.6061809420585632, |
|
"rewards/accuracy_reward": 0.9285714626312256, |
|
"rewards/format_reward": 0.8928571879863739, |
|
"step": 620 |
|
}, |
|
{ |
|
"completion_length": 369.6464477539063, |
|
"epoch": 0.48299845440494593, |
|
"grad_norm": 0.41812455654144287, |
|
"kl": 0.39560546875, |
|
"learning_rate": 3.081644925893866e-06, |
|
"loss": 0.0158, |
|
"reward": 1.9875000715255737, |
|
"reward_std": 0.535971587896347, |
|
"rewards/accuracy_reward": 1.051785761117935, |
|
"rewards/format_reward": 0.935714328289032, |
|
"step": 625 |
|
}, |
|
{ |
|
"completion_length": 377.90537109375, |
|
"epoch": 0.4868624420401855, |
|
"grad_norm": 0.5811069011688232, |
|
"kl": 0.44228515625, |
|
"learning_rate": 3.048781717168513e-06, |
|
"loss": 0.0177, |
|
"reward": 1.9285714983940125, |
|
"reward_std": 0.5182920306921005, |
|
"rewards/accuracy_reward": 0.9875000536441803, |
|
"rewards/format_reward": 0.9410714745521546, |
|
"step": 630 |
|
}, |
|
{ |
|
"completion_length": 372.5982299804688, |
|
"epoch": 0.490726429675425, |
|
"grad_norm": 0.9005226492881775, |
|
"kl": 0.37333984375, |
|
"learning_rate": 3.015818571321504e-06, |
|
"loss": 0.0149, |
|
"reward": 2.0464287042617797, |
|
"reward_std": 0.5508933126926422, |
|
"rewards/accuracy_reward": 1.0946429014205932, |
|
"rewards/format_reward": 0.9517857551574707, |
|
"step": 635 |
|
}, |
|
{ |
|
"completion_length": 361.0571594238281, |
|
"epoch": 0.4945904173106646, |
|
"grad_norm": 1.6126573085784912, |
|
"kl": 0.625390625, |
|
"learning_rate": 2.9827614911802205e-06, |
|
"loss": 0.025, |
|
"reward": 1.9017857789993287, |
|
"reward_std": 0.594823956489563, |
|
"rewards/accuracy_reward": 0.9803571939468384, |
|
"rewards/format_reward": 0.9214286148548126, |
|
"step": 640 |
|
}, |
|
{ |
|
"completion_length": 379.9589447021484, |
|
"epoch": 0.49845440494590415, |
|
"grad_norm": 1.6361123323440552, |
|
"kl": 0.5390625, |
|
"learning_rate": 2.949616496678153e-06, |
|
"loss": 0.0215, |
|
"reward": 1.8375000596046447, |
|
"reward_std": 0.5699772477149964, |
|
"rewards/accuracy_reward": 0.9125000298023224, |
|
"rewards/format_reward": 0.9250000357627869, |
|
"step": 645 |
|
}, |
|
{ |
|
"completion_length": 368.4125183105469, |
|
"epoch": 0.5023183925811437, |
|
"grad_norm": 2.0848145484924316, |
|
"kl": 0.698046875, |
|
"learning_rate": 2.9163896237586365e-06, |
|
"loss": 0.0279, |
|
"reward": 1.8410715103149413, |
|
"reward_std": 0.5587106943130493, |
|
"rewards/accuracy_reward": 0.9125000417232514, |
|
"rewards/format_reward": 0.92857146859169, |
|
"step": 650 |
|
}, |
|
{ |
|
"completion_length": 376.9428741455078, |
|
"epoch": 0.5061823802163833, |
|
"grad_norm": 1.3533092737197876, |
|
"kl": 0.916015625, |
|
"learning_rate": 2.883086923275658e-06, |
|
"loss": 0.0366, |
|
"reward": 1.8705358147621154, |
|
"reward_std": 0.7300362765789032, |
|
"rewards/accuracy_reward": 0.9839286029338836, |
|
"rewards/format_reward": 0.886607187986374, |
|
"step": 655 |
|
}, |
|
{ |
|
"completion_length": 385.7473419189453, |
|
"epoch": 0.5100463678516228, |
|
"grad_norm": 2.705904960632324, |
|
"kl": 0.831640625, |
|
"learning_rate": 2.849714459891953e-06, |
|
"loss": 0.0333, |
|
"reward": 1.8303572177886962, |
|
"reward_std": 0.7160472482442856, |
|
"rewards/accuracy_reward": 0.9553571879863739, |
|
"rewards/format_reward": 0.8750000476837159, |
|
"step": 660 |
|
}, |
|
{ |
|
"completion_length": 380.43662109375, |
|
"epoch": 0.5139103554868625, |
|
"grad_norm": 0.9756877422332764, |
|
"kl": 0.5552734375, |
|
"learning_rate": 2.8162783109745833e-06, |
|
"loss": 0.0222, |
|
"reward": 1.9214286804199219, |
|
"reward_std": 0.6294688701629638, |
|
"rewards/accuracy_reward": 1.0196429073810578, |
|
"rewards/format_reward": 0.9017857551574707, |
|
"step": 665 |
|
}, |
|
{ |
|
"completion_length": 384.23841247558596, |
|
"epoch": 0.517774343122102, |
|
"grad_norm": 0.8622917532920837, |
|
"kl": 0.34326171875, |
|
"learning_rate": 2.7827845654882112e-06, |
|
"loss": 0.0137, |
|
"reward": 1.896428680419922, |
|
"reward_std": 0.5160377115011215, |
|
"rewards/accuracy_reward": 0.9660714745521546, |
|
"rewards/format_reward": 0.9303571879863739, |
|
"step": 670 |
|
}, |
|
{ |
|
"completion_length": 362.0500152587891, |
|
"epoch": 0.5216383307573416, |
|
"grad_norm": 0.9457660913467407, |
|
"kl": 0.4822265625, |
|
"learning_rate": 2.749239322886248e-06, |
|
"loss": 0.0193, |
|
"reward": 2.019642949104309, |
|
"reward_std": 0.5436660468578338, |
|
"rewards/accuracy_reward": 1.0803572058677673, |
|
"rewards/format_reward": 0.9392857611179352, |
|
"step": 675 |
|
}, |
|
{ |
|
"completion_length": 367.0884094238281, |
|
"epoch": 0.5255023183925811, |
|
"grad_norm": 0.9137616157531738, |
|
"kl": 0.378515625, |
|
"learning_rate": 2.7156486920001024e-06, |
|
"loss": 0.0151, |
|
"reward": 1.9035715103149413, |
|
"reward_std": 0.5723553836345673, |
|
"rewards/accuracy_reward": 0.9732143402099609, |
|
"rewards/format_reward": 0.9303571820259094, |
|
"step": 680 |
|
}, |
|
{ |
|
"completion_length": 360.2259063720703, |
|
"epoch": 0.5293663060278208, |
|
"grad_norm": 1.603175401687622, |
|
"kl": 0.57578125, |
|
"learning_rate": 2.6820187899267203e-06, |
|
"loss": 0.023, |
|
"reward": 1.9187500834465028, |
|
"reward_std": 0.6464061141014099, |
|
"rewards/accuracy_reward": 1.0089286267757416, |
|
"rewards/format_reward": 0.9098214745521546, |
|
"step": 685 |
|
}, |
|
{ |
|
"completion_length": 362.6777008056641, |
|
"epoch": 0.5332302936630603, |
|
"grad_norm": 1.3457704782485962, |
|
"kl": 0.8306640625, |
|
"learning_rate": 2.6483557409146133e-06, |
|
"loss": 0.0332, |
|
"reward": 1.8000000834465026, |
|
"reward_std": 0.6633238136768341, |
|
"rewards/accuracy_reward": 0.9178571820259094, |
|
"rewards/format_reward": 0.8821429014205933, |
|
"step": 690 |
|
}, |
|
{ |
|
"completion_length": 379.5053741455078, |
|
"epoch": 0.5370942812982998, |
|
"grad_norm": 2.6664907932281494, |
|
"kl": 0.89375, |
|
"learning_rate": 2.6146656752485904e-06, |
|
"loss": 0.0358, |
|
"reward": 1.7928572058677674, |
|
"reward_std": 0.7002339541912079, |
|
"rewards/accuracy_reward": 0.910714328289032, |
|
"rewards/format_reward": 0.8821428954601288, |
|
"step": 695 |
|
}, |
|
{ |
|
"completion_length": 382.989306640625, |
|
"epoch": 0.5409582689335394, |
|
"grad_norm": 2.186499834060669, |
|
"kl": 0.8466796875, |
|
"learning_rate": 2.5809547281333904e-06, |
|
"loss": 0.0339, |
|
"reward": 1.7866072177886962, |
|
"reward_std": 0.6277125418186188, |
|
"rewards/accuracy_reward": 0.8892857551574707, |
|
"rewards/format_reward": 0.8973214805126191, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.5409582689335394, |
|
"eval_completion_length": 327.28858642578126, |
|
"eval_kl": 0.5944010416666666, |
|
"eval_loss": 0.024204090237617493, |
|
"eval_reward": 1.959523892402649, |
|
"eval_reward_std": 0.5741641213496526, |
|
"eval_rewards/accuracy_reward": 1.0190476616223654, |
|
"eval_rewards/format_reward": 0.9404762188593546, |
|
"eval_runtime": 83.2638, |
|
"eval_samples_per_second": 1.189, |
|
"eval_steps_per_second": 0.048, |
|
"step": 700 |
|
}, |
|
{ |
|
"completion_length": 327.90180053710935, |
|
"epoch": 0.544822256568779, |
|
"grad_norm": 2.957988739013672, |
|
"kl": 0.41298828125, |
|
"learning_rate": 2.5472290385764115e-06, |
|
"loss": 0.0166, |
|
"reward": 1.9491072416305542, |
|
"reward_std": 0.5571832716464996, |
|
"rewards/accuracy_reward": 0.9982143402099609, |
|
"rewards/format_reward": 0.9508928894996643, |
|
"step": 705 |
|
}, |
|
{ |
|
"completion_length": 335.6509094238281, |
|
"epoch": 0.5486862442040186, |
|
"grad_norm": 2.2409234046936035, |
|
"kl": 0.6271484375, |
|
"learning_rate": 2.5134947482697615e-06, |
|
"loss": 0.0251, |
|
"reward": 1.8776786565780639, |
|
"reward_std": 0.5538718163967132, |
|
"rewards/accuracy_reward": 0.9535714745521545, |
|
"rewards/format_reward": 0.9241071820259095, |
|
"step": 710 |
|
}, |
|
{ |
|
"completion_length": 316.4598358154297, |
|
"epoch": 0.5525502318392581, |
|
"grad_norm": 4.647254943847656, |
|
"kl": 1.2830078125, |
|
"learning_rate": 2.4797580004718038e-06, |
|
"loss": 0.0513, |
|
"reward": 1.9089286565780639, |
|
"reward_std": 0.6047213137149811, |
|
"rewards/accuracy_reward": 0.9821428954601288, |
|
"rewards/format_reward": 0.9267857551574707, |
|
"step": 715 |
|
}, |
|
{ |
|
"completion_length": 331.44822692871094, |
|
"epoch": 0.5564142194744977, |
|
"grad_norm": 1.3831017017364502, |
|
"kl": 0.9451171875, |
|
"learning_rate": 2.446024938888431e-06, |
|
"loss": 0.0378, |
|
"reward": 1.891964364051819, |
|
"reward_std": 0.60347221493721, |
|
"rewards/accuracy_reward": 0.9500000476837158, |
|
"rewards/format_reward": 0.941964328289032, |
|
"step": 720 |
|
}, |
|
{ |
|
"completion_length": 348.0705505371094, |
|
"epoch": 0.5602782071097373, |
|
"grad_norm": 2.458526849746704, |
|
"kl": 0.82890625, |
|
"learning_rate": 2.412301706554247e-06, |
|
"loss": 0.0332, |
|
"reward": 1.8625000715255737, |
|
"reward_std": 0.5647337824106217, |
|
"rewards/accuracy_reward": 0.9232143223285675, |
|
"rewards/format_reward": 0.9392857611179352, |
|
"step": 725 |
|
}, |
|
{ |
|
"completion_length": 351.4446594238281, |
|
"epoch": 0.5641421947449768, |
|
"grad_norm": 0.9637498259544373, |
|
"kl": 1.144140625, |
|
"learning_rate": 2.3785944447138804e-06, |
|
"loss": 0.0458, |
|
"reward": 1.82589293718338, |
|
"reward_std": 0.6317932158708572, |
|
"rewards/accuracy_reward": 0.92857146859169, |
|
"rewards/format_reward": 0.8973214805126191, |
|
"step": 730 |
|
}, |
|
{ |
|
"completion_length": 368.37501525878906, |
|
"epoch": 0.5680061823802164, |
|
"grad_norm": 0.7366132736206055, |
|
"kl": 0.6658203125, |
|
"learning_rate": 2.344909291703615e-06, |
|
"loss": 0.0266, |
|
"reward": 1.9098215222358703, |
|
"reward_std": 0.6351912558078766, |
|
"rewards/accuracy_reward": 1.0089286148548127, |
|
"rewards/format_reward": 0.9008929014205933, |
|
"step": 735 |
|
}, |
|
{ |
|
"completion_length": 381.0259094238281, |
|
"epoch": 0.5718701700154559, |
|
"grad_norm": 1.8531627655029297, |
|
"kl": 0.59921875, |
|
"learning_rate": 2.3112523818335606e-06, |
|
"loss": 0.024, |
|
"reward": 1.8910715222358703, |
|
"reward_std": 0.5899456530809403, |
|
"rewards/accuracy_reward": 0.9803571939468384, |
|
"rewards/format_reward": 0.910714328289032, |
|
"step": 740 |
|
}, |
|
{ |
|
"completion_length": 362.7080535888672, |
|
"epoch": 0.5757341576506955, |
|
"grad_norm": 1.4462131261825562, |
|
"kl": 0.6189453125, |
|
"learning_rate": 2.2776298442705434e-06, |
|
"loss": 0.0247, |
|
"reward": 1.933928668498993, |
|
"reward_std": 0.5655193716287613, |
|
"rewards/accuracy_reward": 0.9946428894996643, |
|
"rewards/format_reward": 0.9392857491970062, |
|
"step": 745 |
|
}, |
|
{ |
|
"completion_length": 339.10358276367185, |
|
"epoch": 0.5795981452859351, |
|
"grad_norm": 0.6540559530258179, |
|
"kl": 0.383984375, |
|
"learning_rate": 2.244047801921944e-06, |
|
"loss": 0.0154, |
|
"reward": 2.0205358028411866, |
|
"reward_std": 0.542399314045906, |
|
"rewards/accuracy_reward": 1.0642857611179353, |
|
"rewards/format_reward": 0.9562500357627869, |
|
"step": 750 |
|
}, |
|
{ |
|
"completion_length": 370.5160888671875, |
|
"epoch": 0.5834621329211747, |
|
"grad_norm": 0.6533460021018982, |
|
"kl": 0.4796875, |
|
"learning_rate": 2.2105123703206727e-06, |
|
"loss": 0.0192, |
|
"reward": 1.9312501072883606, |
|
"reward_std": 0.6289328157901763, |
|
"rewards/accuracy_reward": 0.998214328289032, |
|
"rewards/format_reward": 0.9330357611179352, |
|
"step": 755 |
|
}, |
|
{ |
|
"completion_length": 372.5491180419922, |
|
"epoch": 0.5873261205564142, |
|
"grad_norm": 3.128967761993408, |
|
"kl": 0.53544921875, |
|
"learning_rate": 2.1770296565114847e-06, |
|
"loss": 0.0214, |
|
"reward": 1.9785715460777282, |
|
"reward_std": 0.5253061711788177, |
|
"rewards/accuracy_reward": 1.0357143342494965, |
|
"rewards/format_reward": 0.942857176065445, |
|
"step": 760 |
|
}, |
|
{ |
|
"completion_length": 366.4009063720703, |
|
"epoch": 0.5911901081916537, |
|
"grad_norm": 0.8277491927146912, |
|
"kl": 0.46806640625, |
|
"learning_rate": 2.1436057579388443e-06, |
|
"loss": 0.0187, |
|
"reward": 1.865178656578064, |
|
"reward_std": 0.5838503152132034, |
|
"rewards/accuracy_reward": 0.9196429252624512, |
|
"rewards/format_reward": 0.9455357491970062, |
|
"step": 765 |
|
}, |
|
{ |
|
"completion_length": 362.5089447021484, |
|
"epoch": 0.5950540958268934, |
|
"grad_norm": 0.9837579131126404, |
|
"kl": 0.43828125, |
|
"learning_rate": 2.1102467613365336e-06, |
|
"loss": 0.0175, |
|
"reward": 2.007142972946167, |
|
"reward_std": 0.5876386165618896, |
|
"rewards/accuracy_reward": 1.0625000476837159, |
|
"rewards/format_reward": 0.9446429014205933, |
|
"step": 770 |
|
}, |
|
{ |
|
"completion_length": 373.8339447021484, |
|
"epoch": 0.5989180834621329, |
|
"grad_norm": 1.1622740030288696, |
|
"kl": 0.44111328125, |
|
"learning_rate": 2.0769587416192212e-06, |
|
"loss": 0.0177, |
|
"reward": 1.949107253551483, |
|
"reward_std": 0.5259672313928604, |
|
"rewards/accuracy_reward": 0.99107146859169, |
|
"rewards/format_reward": 0.9580357670783997, |
|
"step": 775 |
|
}, |
|
{ |
|
"completion_length": 388.4268035888672, |
|
"epoch": 0.6027820710973725, |
|
"grad_norm": 2.432398557662964, |
|
"kl": 0.5046875, |
|
"learning_rate": 2.0437477607761656e-06, |
|
"loss": 0.0202, |
|
"reward": 1.8973215103149415, |
|
"reward_std": 0.6027686059474945, |
|
"rewards/accuracy_reward": 0.960714328289032, |
|
"rewards/format_reward": 0.936607176065445, |
|
"step": 780 |
|
}, |
|
{ |
|
"completion_length": 376.93572692871095, |
|
"epoch": 0.606646058732612, |
|
"grad_norm": 0.8899438977241516, |
|
"kl": 0.475390625, |
|
"learning_rate": 2.0106198667672926e-06, |
|
"loss": 0.019, |
|
"reward": 1.929464340209961, |
|
"reward_std": 0.5717435568571091, |
|
"rewards/accuracy_reward": 0.9946429014205933, |
|
"rewards/format_reward": 0.93482146859169, |
|
"step": 785 |
|
}, |
|
{ |
|
"completion_length": 375.6580505371094, |
|
"epoch": 0.6105100463678517, |
|
"grad_norm": 1.3664709329605103, |
|
"kl": 0.3677734375, |
|
"learning_rate": 1.9775810924218126e-06, |
|
"loss": 0.0147, |
|
"reward": 1.969642949104309, |
|
"reward_std": 0.577250525355339, |
|
"rewards/accuracy_reward": 1.0267857670783997, |
|
"rewards/format_reward": 0.9428571879863739, |
|
"step": 790 |
|
}, |
|
{ |
|
"completion_length": 372.2384094238281, |
|
"epoch": 0.6143740340030912, |
|
"grad_norm": 0.5266821384429932, |
|
"kl": 0.34619140625, |
|
"learning_rate": 1.944637454339601e-06, |
|
"loss": 0.0139, |
|
"reward": 1.9500000715255736, |
|
"reward_std": 0.547850227355957, |
|
"rewards/accuracy_reward": 0.9982143342494965, |
|
"rewards/format_reward": 0.9517857670783997, |
|
"step": 795 |
|
}, |
|
{ |
|
"completion_length": 363.0500122070313, |
|
"epoch": 0.6182380216383307, |
|
"grad_norm": 0.4656059145927429, |
|
"kl": 0.34765625, |
|
"learning_rate": 1.9117949517955313e-06, |
|
"loss": 0.0139, |
|
"reward": 1.9125000834465027, |
|
"reward_std": 0.5355327159166337, |
|
"rewards/accuracy_reward": 0.9553571999073028, |
|
"rewards/format_reward": 0.9571429073810578, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.6182380216383307, |
|
"eval_completion_length": 339.02525024414064, |
|
"eval_kl": 0.5237955729166667, |
|
"eval_loss": 0.020239148288965225, |
|
"eval_reward": 1.8928572177886962, |
|
"eval_reward_std": 0.520437486966451, |
|
"eval_rewards/accuracy_reward": 0.947619092464447, |
|
"eval_rewards/format_reward": 0.9452381173769633, |
|
"eval_runtime": 82.526, |
|
"eval_samples_per_second": 1.2, |
|
"eval_steps_per_second": 0.048, |
|
"step": 800 |
|
}, |
|
{ |
|
"completion_length": 355.1919799804688, |
|
"epoch": 0.6221020092735703, |
|
"grad_norm": 1.5192807912826538, |
|
"kl": 0.4408203125, |
|
"learning_rate": 1.8790595656469628e-06, |
|
"loss": 0.0176, |
|
"reward": 1.9366072177886964, |
|
"reward_std": 0.5646369904279709, |
|
"rewards/accuracy_reward": 0.9892857491970062, |
|
"rewards/format_reward": 0.9473214685916901, |
|
"step": 805 |
|
}, |
|
{ |
|
"completion_length": 354.4946624755859, |
|
"epoch": 0.6259659969088099, |
|
"grad_norm": 1.2222896814346313, |
|
"kl": 0.398046875, |
|
"learning_rate": 1.8464372572445867e-06, |
|
"loss": 0.0159, |
|
"reward": 1.9169643640518188, |
|
"reward_std": 0.5843247085809707, |
|
"rewards/accuracy_reward": 0.9785714685916901, |
|
"rewards/format_reward": 0.9383928954601288, |
|
"step": 810 |
|
}, |
|
{ |
|
"completion_length": 342.8660827636719, |
|
"epoch": 0.6298299845440495, |
|
"grad_norm": 1.4474635124206543, |
|
"kl": 0.32158203125, |
|
"learning_rate": 1.8139339673468142e-06, |
|
"loss": 0.0129, |
|
"reward": 1.9285715222358704, |
|
"reward_std": 0.5523967891931534, |
|
"rewards/accuracy_reward": 0.9678571939468383, |
|
"rewards/format_reward": 0.9607143342494965, |
|
"step": 815 |
|
}, |
|
{ |
|
"completion_length": 368.4000183105469, |
|
"epoch": 0.633693972179289, |
|
"grad_norm": 2.314242362976074, |
|
"kl": 0.63984375, |
|
"learning_rate": 1.7815556150379298e-06, |
|
"loss": 0.0256, |
|
"reward": 1.8758929371833801, |
|
"reward_std": 0.5279008090496063, |
|
"rewards/accuracy_reward": 0.9392857670783996, |
|
"rewards/format_reward": 0.9366071879863739, |
|
"step": 820 |
|
}, |
|
{ |
|
"completion_length": 379.89019470214845, |
|
"epoch": 0.6375579598145286, |
|
"grad_norm": 0.9705050587654114, |
|
"kl": 0.5564453125, |
|
"learning_rate": 1.7493080966501764e-06, |
|
"loss": 0.0222, |
|
"reward": 1.858928644657135, |
|
"reward_std": 0.5612190932035446, |
|
"rewards/accuracy_reward": 0.92857146859169, |
|
"rewards/format_reward": 0.930357176065445, |
|
"step": 825 |
|
}, |
|
{ |
|
"completion_length": 363.3250213623047, |
|
"epoch": 0.6414219474497682, |
|
"grad_norm": 0.9212985038757324, |
|
"kl": 0.2513671875, |
|
"learning_rate": 1.7171972846899942e-06, |
|
"loss": 0.01, |
|
"reward": 1.8767857909202577, |
|
"reward_std": 0.5048141717910767, |
|
"rewards/accuracy_reward": 0.9267857611179352, |
|
"rewards/format_reward": 0.9500000417232514, |
|
"step": 830 |
|
}, |
|
{ |
|
"completion_length": 369.3312683105469, |
|
"epoch": 0.6452859350850078, |
|
"grad_norm": 0.9865968227386475, |
|
"kl": 0.358203125, |
|
"learning_rate": 1.685229026768593e-06, |
|
"loss": 0.0143, |
|
"reward": 1.9616072297096252, |
|
"reward_std": 0.5067572951316833, |
|
"rewards/accuracy_reward": 1.0107143223285675, |
|
"rewards/format_reward": 0.9508929073810577, |
|
"step": 835 |
|
}, |
|
{ |
|
"completion_length": 373.60805053710936, |
|
"epoch": 0.6491499227202473, |
|
"grad_norm": 0.8022987842559814, |
|
"kl": 0.5169921875, |
|
"learning_rate": 1.6534091445370604e-06, |
|
"loss": 0.0207, |
|
"reward": 1.9375001072883606, |
|
"reward_std": 0.6279206275939941, |
|
"rewards/accuracy_reward": 1.0125000476837158, |
|
"rewards/format_reward": 0.9250000476837158, |
|
"step": 840 |
|
}, |
|
{ |
|
"completion_length": 372.23037109375, |
|
"epoch": 0.6530139103554868, |
|
"grad_norm": 1.627290964126587, |
|
"kl": 0.51416015625, |
|
"learning_rate": 1.6217434326261999e-06, |
|
"loss": 0.0206, |
|
"reward": 1.8633929371833802, |
|
"reward_std": 0.5344478011131286, |
|
"rewards/accuracy_reward": 0.9267857611179352, |
|
"rewards/format_reward": 0.936607176065445, |
|
"step": 845 |
|
}, |
|
{ |
|
"completion_length": 368.5562683105469, |
|
"epoch": 0.6568778979907264, |
|
"grad_norm": 0.7936336994171143, |
|
"kl": 0.2919921875, |
|
"learning_rate": 1.5902376575912815e-06, |
|
"loss": 0.0117, |
|
"reward": 1.9357143878936767, |
|
"reward_std": 0.5026874512434005, |
|
"rewards/accuracy_reward": 0.9821428954601288, |
|
"rewards/format_reward": 0.9535714745521545, |
|
"step": 850 |
|
}, |
|
{ |
|
"completion_length": 366.2107360839844, |
|
"epoch": 0.660741885625966, |
|
"grad_norm": 0.8993034958839417, |
|
"kl": 0.40234375, |
|
"learning_rate": 1.5588975568619124e-06, |
|
"loss": 0.0161, |
|
"reward": 1.9267858266830444, |
|
"reward_std": 0.5611187249422074, |
|
"rewards/accuracy_reward": 0.983928632736206, |
|
"rewards/format_reward": 0.9428571820259094, |
|
"step": 855 |
|
}, |
|
{ |
|
"completion_length": 358.8482299804688, |
|
"epoch": 0.6646058732612056, |
|
"grad_norm": 1.1370081901550293, |
|
"kl": 0.6099609375, |
|
"learning_rate": 1.5277288376972116e-06, |
|
"loss": 0.0244, |
|
"reward": 1.8241072416305542, |
|
"reward_std": 0.5759775102138519, |
|
"rewards/accuracy_reward": 0.8892857551574707, |
|
"rewards/format_reward": 0.9348214745521546, |
|
"step": 860 |
|
}, |
|
{ |
|
"completion_length": 369.9125183105469, |
|
"epoch": 0.6684698608964451, |
|
"grad_norm": 0.681843101978302, |
|
"kl": 0.424609375, |
|
"learning_rate": 1.4967371761464738e-06, |
|
"loss": 0.017, |
|
"reward": 1.9053572297096253, |
|
"reward_std": 0.541083812713623, |
|
"rewards/accuracy_reward": 0.9750000417232514, |
|
"rewards/format_reward": 0.9303571879863739, |
|
"step": 865 |
|
}, |
|
{ |
|
"completion_length": 362.09466247558595, |
|
"epoch": 0.6723338485316847, |
|
"grad_norm": 1.1364926099777222, |
|
"kl": 0.491015625, |
|
"learning_rate": 1.4659282160155222e-06, |
|
"loss": 0.0196, |
|
"reward": 1.8607143878936767, |
|
"reward_std": 0.5730058521032333, |
|
"rewards/accuracy_reward": 0.9267857611179352, |
|
"rewards/format_reward": 0.9339286148548126, |
|
"step": 870 |
|
}, |
|
{ |
|
"completion_length": 360.81340942382815, |
|
"epoch": 0.6761978361669243, |
|
"grad_norm": 1.0204797983169556, |
|
"kl": 0.5099609375, |
|
"learning_rate": 1.4353075678389284e-06, |
|
"loss": 0.0204, |
|
"reward": 1.9785715460777282, |
|
"reward_std": 0.5866712421178818, |
|
"rewards/accuracy_reward": 1.0392857611179351, |
|
"rewards/format_reward": 0.9392857491970062, |
|
"step": 875 |
|
}, |
|
{ |
|
"completion_length": 364.41787109375, |
|
"epoch": 0.6800618238021638, |
|
"grad_norm": 1.010473608970642, |
|
"kl": 0.63125, |
|
"learning_rate": 1.4048808078582943e-06, |
|
"loss": 0.0253, |
|
"reward": 1.908928668498993, |
|
"reward_std": 0.611833056807518, |
|
"rewards/accuracy_reward": 0.9928571820259094, |
|
"rewards/format_reward": 0.9160714685916901, |
|
"step": 880 |
|
}, |
|
{ |
|
"completion_length": 354.1143005371094, |
|
"epoch": 0.6839258114374034, |
|
"grad_norm": 1.0668288469314575, |
|
"kl": 0.5849609375, |
|
"learning_rate": 1.3746534770067803e-06, |
|
"loss": 0.0234, |
|
"reward": 2.0035715460777284, |
|
"reward_std": 0.5578104436397553, |
|
"rewards/accuracy_reward": 1.066071480512619, |
|
"rewards/format_reward": 0.9375000417232513, |
|
"step": 885 |
|
}, |
|
{ |
|
"completion_length": 364.14198303222656, |
|
"epoch": 0.6877897990726429, |
|
"grad_norm": 1.7410629987716675, |
|
"kl": 0.37431640625, |
|
"learning_rate": 1.3446310799000578e-06, |
|
"loss": 0.015, |
|
"reward": 1.9598215341567993, |
|
"reward_std": 0.5321772754192352, |
|
"rewards/accuracy_reward": 1.0303571939468383, |
|
"rewards/format_reward": 0.929464328289032, |
|
"step": 890 |
|
}, |
|
{ |
|
"completion_length": 345.7285888671875, |
|
"epoch": 0.6916537867078826, |
|
"grad_norm": 0.8086636662483215, |
|
"kl": 0.453515625, |
|
"learning_rate": 1.3148190838338804e-06, |
|
"loss": 0.0181, |
|
"reward": 1.9848214864730835, |
|
"reward_std": 0.5839688003063201, |
|
"rewards/accuracy_reward": 1.0517857670783997, |
|
"rewards/format_reward": 0.9330357491970063, |
|
"step": 895 |
|
}, |
|
{ |
|
"completion_length": 391.9473358154297, |
|
"epoch": 0.6955177743431221, |
|
"grad_norm": 1.5891263484954834, |
|
"kl": 0.8416015625, |
|
"learning_rate": 1.2852229177884492e-06, |
|
"loss": 0.0337, |
|
"reward": 1.813392949104309, |
|
"reward_std": 0.6605880260467529, |
|
"rewards/accuracy_reward": 0.9160714685916901, |
|
"rewards/format_reward": 0.8973214745521545, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6955177743431221, |
|
"eval_completion_length": 342.9001749674479, |
|
"eval_kl": 0.8555989583333333, |
|
"eval_loss": 0.03408632054924965, |
|
"eval_reward": 2.0619048436482745, |
|
"eval_reward_std": 0.5802303751309713, |
|
"eval_rewards/accuracy_reward": 1.1428571979204813, |
|
"eval_rewards/format_reward": 0.9190476576487223, |
|
"eval_runtime": 90.4473, |
|
"eval_samples_per_second": 1.095, |
|
"eval_steps_per_second": 0.044, |
|
"step": 900 |
|
}, |
|
{ |
|
"completion_length": 354.5241180419922, |
|
"epoch": 0.6993817619783617, |
|
"grad_norm": 2.794023036956787, |
|
"kl": 0.760546875, |
|
"learning_rate": 1.2558479714397585e-06, |
|
"loss": 0.0304, |
|
"reward": 1.9258929371833802, |
|
"reward_std": 0.6209361255168915, |
|
"rewards/accuracy_reward": 0.9946429073810578, |
|
"rewards/format_reward": 0.9312500417232513, |
|
"step": 905 |
|
}, |
|
{ |
|
"completion_length": 375.51787719726565, |
|
"epoch": 0.7032457496136012, |
|
"grad_norm": 1.4545397758483887, |
|
"kl": 0.623828125, |
|
"learning_rate": 1.2266995941780934e-06, |
|
"loss": 0.025, |
|
"reward": 1.858928644657135, |
|
"reward_std": 0.6513124674558639, |
|
"rewards/accuracy_reward": 0.9464286029338836, |
|
"rewards/format_reward": 0.9125000476837158, |
|
"step": 910 |
|
}, |
|
{ |
|
"completion_length": 352.10894775390625, |
|
"epoch": 0.7071097372488409, |
|
"grad_norm": 1.136331558227539, |
|
"kl": 0.5474609375, |
|
"learning_rate": 1.197783094133869e-06, |
|
"loss": 0.0219, |
|
"reward": 1.902678644657135, |
|
"reward_std": 0.5192236006259918, |
|
"rewards/accuracy_reward": 0.973214328289032, |
|
"rewards/format_reward": 0.929464328289032, |
|
"step": 915 |
|
}, |
|
{ |
|
"completion_length": 343.6509094238281, |
|
"epoch": 0.7109737248840804, |
|
"grad_norm": 1.513318419456482, |
|
"kl": 0.474609375, |
|
"learning_rate": 1.1691037372109835e-06, |
|
"loss": 0.019, |
|
"reward": 2.0866072535514832, |
|
"reward_std": 0.4978013187646866, |
|
"rewards/accuracy_reward": 1.1410714983940125, |
|
"rewards/format_reward": 0.9455357551574707, |
|
"step": 920 |
|
}, |
|
{ |
|
"completion_length": 365.5428741455078, |
|
"epoch": 0.7148377125193199, |
|
"grad_norm": 2.5464484691619873, |
|
"kl": 0.6884765625, |
|
"learning_rate": 1.140666746127854e-06, |
|
"loss": 0.0276, |
|
"reward": 1.988392949104309, |
|
"reward_std": 0.5555747985839844, |
|
"rewards/accuracy_reward": 1.048214328289032, |
|
"rewards/format_reward": 0.9401786088943481, |
|
"step": 925 |
|
}, |
|
{ |
|
"completion_length": 359.7901947021484, |
|
"epoch": 0.7187017001545595, |
|
"grad_norm": 0.9951956272125244, |
|
"kl": 0.61796875, |
|
"learning_rate": 1.1124772994663258e-06, |
|
"loss": 0.0247, |
|
"reward": 2.0348215341567992, |
|
"reward_std": 0.5215620249509811, |
|
"rewards/accuracy_reward": 1.0964286148548126, |
|
"rewards/format_reward": 0.9383929014205933, |
|
"step": 930 |
|
}, |
|
{ |
|
"completion_length": 344.65537719726564, |
|
"epoch": 0.7225656877897991, |
|
"grad_norm": 0.8325075507164001, |
|
"kl": 0.4759765625, |
|
"learning_rate": 1.084540530728613e-06, |
|
"loss": 0.0191, |
|
"reward": 2.0116072416305544, |
|
"reward_std": 0.48731706738471986, |
|
"rewards/accuracy_reward": 1.0625000417232513, |
|
"rewards/format_reward": 0.949107187986374, |
|
"step": 935 |
|
}, |
|
{ |
|
"completion_length": 374.4696563720703, |
|
"epoch": 0.7264296754250387, |
|
"grad_norm": 0.7985464930534363, |
|
"kl": 0.4962890625, |
|
"learning_rate": 1.0568615274024521e-06, |
|
"loss": 0.0199, |
|
"reward": 1.9285715460777282, |
|
"reward_std": 0.5753836840391159, |
|
"rewards/accuracy_reward": 0.99107146859169, |
|
"rewards/format_reward": 0.9375000476837159, |
|
"step": 940 |
|
}, |
|
{ |
|
"completion_length": 360.10626831054685, |
|
"epoch": 0.7302936630602782, |
|
"grad_norm": 1.1231186389923096, |
|
"kl": 0.700390625, |
|
"learning_rate": 1.029445330034633e-06, |
|
"loss": 0.028, |
|
"reward": 1.946428644657135, |
|
"reward_std": 0.618572261929512, |
|
"rewards/accuracy_reward": 1.0303571939468383, |
|
"rewards/format_reward": 0.9160714685916901, |
|
"step": 945 |
|
}, |
|
{ |
|
"completion_length": 376.62322998046875, |
|
"epoch": 0.7341576506955177, |
|
"grad_norm": 1.4602620601654053, |
|
"kl": 0.5408203125, |
|
"learning_rate": 1.0022969313130773e-06, |
|
"loss": 0.0216, |
|
"reward": 1.9642858266830445, |
|
"reward_std": 0.5609104305505752, |
|
"rewards/accuracy_reward": 1.028571480512619, |
|
"rewards/format_reward": 0.935714328289032, |
|
"step": 950 |
|
}, |
|
{ |
|
"completion_length": 381.98126525878905, |
|
"epoch": 0.7380216383307573, |
|
"grad_norm": 2.153388023376465, |
|
"kl": 0.7783203125, |
|
"learning_rate": 9.754212751576386e-07, |
|
"loss": 0.0311, |
|
"reward": 1.8812500834465027, |
|
"reward_std": 0.6705505669116973, |
|
"rewards/accuracy_reward": 0.9785714745521545, |
|
"rewards/format_reward": 0.9026786148548126, |
|
"step": 955 |
|
}, |
|
{ |
|
"completion_length": 364.32322387695314, |
|
"epoch": 0.7418856259659969, |
|
"grad_norm": 1.639176845550537, |
|
"kl": 0.65078125, |
|
"learning_rate": 9.488232558197732e-07, |
|
"loss": 0.0261, |
|
"reward": 1.8741072297096253, |
|
"reward_std": 0.6317374408245087, |
|
"rewards/accuracy_reward": 0.948214328289032, |
|
"rewards/format_reward": 0.9258929073810578, |
|
"step": 960 |
|
}, |
|
{ |
|
"completion_length": 371.18751525878906, |
|
"epoch": 0.7457496136012365, |
|
"grad_norm": 1.5717941522598267, |
|
"kl": 0.4546875, |
|
"learning_rate": 9.225077169912644e-07, |
|
"loss": 0.0182, |
|
"reward": 1.9750001072883605, |
|
"reward_std": 0.5823389858007431, |
|
"rewards/accuracy_reward": 1.048214328289032, |
|
"rewards/format_reward": 0.9267857551574707, |
|
"step": 965 |
|
}, |
|
{ |
|
"completion_length": 374.3330505371094, |
|
"epoch": 0.749613601236476, |
|
"grad_norm": 1.1975027322769165, |
|
"kl": 0.566796875, |
|
"learning_rate": 8.964794509221508e-07, |
|
"loss": 0.0227, |
|
"reward": 1.9089286565780639, |
|
"reward_std": 0.5614093959331512, |
|
"rewards/accuracy_reward": 0.9875000357627869, |
|
"rewards/format_reward": 0.9214286208152771, |
|
"step": 970 |
|
}, |
|
{ |
|
"completion_length": 367.66876525878905, |
|
"epoch": 0.7534775888717156, |
|
"grad_norm": 1.064929723739624, |
|
"kl": 0.7947265625, |
|
"learning_rate": 8.707431975480221e-07, |
|
"loss": 0.0318, |
|
"reward": 1.9767858147621156, |
|
"reward_std": 0.611976683139801, |
|
"rewards/accuracy_reward": 1.0571429133415222, |
|
"rewards/format_reward": 0.9196428894996643, |
|
"step": 975 |
|
}, |
|
{ |
|
"completion_length": 393.2553741455078, |
|
"epoch": 0.7573415765069552, |
|
"grad_norm": 1.5490120649337769, |
|
"kl": 0.845703125, |
|
"learning_rate": 8.453036436268458e-07, |
|
"loss": 0.0338, |
|
"reward": 1.8348215222358704, |
|
"reward_std": 0.6252253264188766, |
|
"rewards/accuracy_reward": 0.9321429014205933, |
|
"rewards/format_reward": 0.9026786148548126, |
|
"step": 980 |
|
}, |
|
{ |
|
"completion_length": 385.7125183105469, |
|
"epoch": 0.7612055641421948, |
|
"grad_norm": 1.1330265998840332, |
|
"kl": 0.6615234375, |
|
"learning_rate": 8.20165421885469e-07, |
|
"loss": 0.0265, |
|
"reward": 1.8267858147621154, |
|
"reward_std": 0.641253513097763, |
|
"rewards/accuracy_reward": 0.935714328289032, |
|
"rewards/format_reward": 0.8910714685916901, |
|
"step": 985 |
|
}, |
|
{ |
|
"completion_length": 347.54822998046876, |
|
"epoch": 0.7650695517774343, |
|
"grad_norm": 1.4882538318634033, |
|
"kl": 0.6056640625, |
|
"learning_rate": 7.953331101759706e-07, |
|
"loss": 0.0242, |
|
"reward": 1.9312501072883606, |
|
"reward_std": 0.6073622226715087, |
|
"rewards/accuracy_reward": 1.00357146859169, |
|
"rewards/format_reward": 0.9276786208152771, |
|
"step": 990 |
|
}, |
|
{ |
|
"completion_length": 372.0375183105469, |
|
"epoch": 0.7689335394126738, |
|
"grad_norm": 1.4863989353179932, |
|
"kl": 0.7060546875, |
|
"learning_rate": 7.708112306419968e-07, |
|
"loss": 0.0282, |
|
"reward": 1.8142857909202577, |
|
"reward_std": 0.582288071513176, |
|
"rewards/accuracy_reward": 0.898214328289032, |
|
"rewards/format_reward": 0.9160714626312256, |
|
"step": 995 |
|
}, |
|
{ |
|
"completion_length": 348.9973388671875, |
|
"epoch": 0.7727975270479135, |
|
"grad_norm": 1.5836974382400513, |
|
"kl": 0.795703125, |
|
"learning_rate": 7.466042488952521e-07, |
|
"loss": 0.0318, |
|
"reward": 1.958928656578064, |
|
"reward_std": 0.6530672192573548, |
|
"rewards/accuracy_reward": 1.0357143342494965, |
|
"rewards/format_reward": 0.9232143223285675, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.7727975270479135, |
|
"eval_completion_length": 376.2670003255208, |
|
"eval_kl": 0.5617838541666667, |
|
"eval_loss": 0.022667212411761284, |
|
"eval_reward": 2.011904811859131, |
|
"eval_reward_std": 0.5900813996791839, |
|
"eval_rewards/accuracy_reward": 1.0809524337450662, |
|
"eval_rewards/format_reward": 0.9309524138768513, |
|
"eval_runtime": 86.4556, |
|
"eval_samples_per_second": 1.145, |
|
"eval_steps_per_second": 0.046, |
|
"step": 1000 |
|
}, |
|
{ |
|
"completion_length": 351.85447998046874, |
|
"epoch": 0.776661514683153, |
|
"grad_norm": 1.1455680131912231, |
|
"kl": 0.5607421875, |
|
"learning_rate": 7.227165732022717e-07, |
|
"loss": 0.0224, |
|
"reward": 2.1116072654724123, |
|
"reward_std": 0.5147848486900329, |
|
"rewards/accuracy_reward": 1.1607143521308898, |
|
"rewards/format_reward": 0.9508928954601288, |
|
"step": 1005 |
|
}, |
|
{ |
|
"completion_length": 361.89020080566405, |
|
"epoch": 0.7805255023183926, |
|
"grad_norm": 1.188633680343628, |
|
"kl": 0.5509765625, |
|
"learning_rate": 6.991525536816498e-07, |
|
"loss": 0.022, |
|
"reward": 1.9598215103149415, |
|
"reward_std": 0.48385874927043915, |
|
"rewards/accuracy_reward": 1.0196429014205932, |
|
"rewards/format_reward": 0.9401786148548126, |
|
"step": 1010 |
|
}, |
|
{ |
|
"completion_length": 373.9660919189453, |
|
"epoch": 0.7843894899536321, |
|
"grad_norm": 0.8180363774299622, |
|
"kl": 0.598828125, |
|
"learning_rate": 6.759164815118493e-07, |
|
"loss": 0.024, |
|
"reward": 1.9000000953674316, |
|
"reward_std": 0.5949198305606842, |
|
"rewards/accuracy_reward": 0.9625000357627869, |
|
"rewards/format_reward": 0.9375000417232513, |
|
"step": 1015 |
|
}, |
|
{ |
|
"completion_length": 377.0375152587891, |
|
"epoch": 0.7882534775888718, |
|
"grad_norm": 0.7868214845657349, |
|
"kl": 0.5365234375, |
|
"learning_rate": 6.530125881497473e-07, |
|
"loss": 0.0215, |
|
"reward": 1.8812501072883605, |
|
"reward_std": 0.541077944636345, |
|
"rewards/accuracy_reward": 0.9357143342494965, |
|
"rewards/format_reward": 0.9455357491970062, |
|
"step": 1020 |
|
}, |
|
{ |
|
"completion_length": 365.1553680419922, |
|
"epoch": 0.7921174652241113, |
|
"grad_norm": 2.3109092712402344, |
|
"kl": 0.604296875, |
|
"learning_rate": 6.30445044560056e-07, |
|
"loss": 0.0242, |
|
"reward": 1.9892858147621155, |
|
"reward_std": 0.5382134824991226, |
|
"rewards/accuracy_reward": 1.0464286267757417, |
|
"rewards/format_reward": 0.9428571820259094, |
|
"step": 1025 |
|
}, |
|
{ |
|
"completion_length": 362.01787719726565, |
|
"epoch": 0.7959814528593508, |
|
"grad_norm": 1.4343385696411133, |
|
"kl": 0.7544921875, |
|
"learning_rate": 6.082179604557617e-07, |
|
"loss": 0.0302, |
|
"reward": 1.960714375972748, |
|
"reward_std": 0.5546221494674682, |
|
"rewards/accuracy_reward": 1.0303571939468383, |
|
"rewards/format_reward": 0.9303571820259094, |
|
"step": 1030 |
|
}, |
|
{ |
|
"completion_length": 365.1625122070312, |
|
"epoch": 0.7998454404945904, |
|
"grad_norm": 1.025671362876892, |
|
"kl": 0.5177734375, |
|
"learning_rate": 5.863353835497137e-07, |
|
"loss": 0.0207, |
|
"reward": 1.940178644657135, |
|
"reward_std": 0.6024579167366028, |
|
"rewards/accuracy_reward": 1.0071429014205933, |
|
"rewards/format_reward": 0.9330357551574707, |
|
"step": 1035 |
|
}, |
|
{ |
|
"completion_length": 372.9285888671875, |
|
"epoch": 0.80370942812983, |
|
"grad_norm": 0.9761889576911926, |
|
"kl": 0.61640625, |
|
"learning_rate": 5.648012988175075e-07, |
|
"loss": 0.0247, |
|
"reward": 1.8482143759727478, |
|
"reward_std": 0.6087313055992126, |
|
"rewards/accuracy_reward": 0.9375000476837159, |
|
"rewards/format_reward": 0.9107143342494964, |
|
"step": 1040 |
|
}, |
|
{ |
|
"completion_length": 376.95359191894534, |
|
"epoch": 0.8075734157650696, |
|
"grad_norm": 1.214440941810608, |
|
"kl": 0.875390625, |
|
"learning_rate": 5.436196277717928e-07, |
|
"loss": 0.035, |
|
"reward": 1.90089293718338, |
|
"reward_std": 0.6438783019781112, |
|
"rewards/accuracy_reward": 0.9910714864730835, |
|
"rewards/format_reward": 0.9098214626312255, |
|
"step": 1045 |
|
}, |
|
{ |
|
"completion_length": 365.9160919189453, |
|
"epoch": 0.8114374034003091, |
|
"grad_norm": 1.3267405033111572, |
|
"kl": 0.6033203125, |
|
"learning_rate": 5.227942277481363e-07, |
|
"loss": 0.0241, |
|
"reward": 1.9250001072883607, |
|
"reward_std": 0.5924921661615372, |
|
"rewards/accuracy_reward": 1.000000035762787, |
|
"rewards/format_reward": 0.9250000298023224, |
|
"step": 1050 |
|
}, |
|
{ |
|
"completion_length": 380.3330535888672, |
|
"epoch": 0.8153013910355487, |
|
"grad_norm": 1.3830839395523071, |
|
"kl": 0.4916015625, |
|
"learning_rate": 5.023288912025742e-07, |
|
"loss": 0.0197, |
|
"reward": 1.9303572654724122, |
|
"reward_std": 0.5795833975076675, |
|
"rewards/accuracy_reward": 1.0000000476837159, |
|
"rewards/format_reward": 0.9303571820259094, |
|
"step": 1055 |
|
}, |
|
{ |
|
"completion_length": 343.18394470214844, |
|
"epoch": 0.8191653786707882, |
|
"grad_norm": 1.7928149700164795, |
|
"kl": 0.4869140625, |
|
"learning_rate": 4.822273450209767e-07, |
|
"loss": 0.0195, |
|
"reward": 2.000000071525574, |
|
"reward_std": 0.5124461591243744, |
|
"rewards/accuracy_reward": 1.0571428954601287, |
|
"rewards/format_reward": 0.9428571879863739, |
|
"step": 1060 |
|
}, |
|
{ |
|
"completion_length": 353.4276977539063, |
|
"epoch": 0.8230293663060279, |
|
"grad_norm": 0.9495754837989807, |
|
"kl": 0.83935546875, |
|
"learning_rate": 4.6249324984035863e-07, |
|
"loss": 0.0335, |
|
"reward": 1.9812500834465028, |
|
"reward_std": 0.5092057317495347, |
|
"rewards/accuracy_reward": 1.0214286267757415, |
|
"rewards/format_reward": 0.95982146859169, |
|
"step": 1065 |
|
}, |
|
{ |
|
"completion_length": 375.46876831054686, |
|
"epoch": 0.8268933539412674, |
|
"grad_norm": 0.7771401405334473, |
|
"kl": 0.4986328125, |
|
"learning_rate": 4.431301993822471e-07, |
|
"loss": 0.02, |
|
"reward": 2.0125001072883606, |
|
"reward_std": 0.5730317920446396, |
|
"rewards/accuracy_reward": 1.0839286148548126, |
|
"rewards/format_reward": 0.9285714745521545, |
|
"step": 1070 |
|
}, |
|
{ |
|
"completion_length": 371.25269470214846, |
|
"epoch": 0.8307573415765069, |
|
"grad_norm": 1.0410772562026978, |
|
"kl": 0.4998046875, |
|
"learning_rate": 4.2414171979824e-07, |
|
"loss": 0.02, |
|
"reward": 1.9660715341567994, |
|
"reward_std": 0.5737438589334488, |
|
"rewards/accuracy_reward": 1.0250000476837158, |
|
"rewards/format_reward": 0.9410714745521546, |
|
"step": 1075 |
|
}, |
|
{ |
|
"completion_length": 367.7830474853516, |
|
"epoch": 0.8346213292117465, |
|
"grad_norm": 1.4777237176895142, |
|
"kl": 0.54140625, |
|
"learning_rate": 4.055312690278701e-07, |
|
"loss": 0.0217, |
|
"reward": 1.9062500953674317, |
|
"reward_std": 0.5742262482643128, |
|
"rewards/accuracy_reward": 0.9714286029338837, |
|
"rewards/format_reward": 0.93482146859169, |
|
"step": 1080 |
|
}, |
|
{ |
|
"completion_length": 371.16876831054685, |
|
"epoch": 0.8384853168469861, |
|
"grad_norm": 0.8883810043334961, |
|
"kl": 0.453125, |
|
"learning_rate": 3.8730223616888634e-07, |
|
"loss": 0.0181, |
|
"reward": 1.927678680419922, |
|
"reward_std": 0.47931237816810607, |
|
"rewards/accuracy_reward": 0.9821428954601288, |
|
"rewards/format_reward": 0.9455357551574707, |
|
"step": 1085 |
|
}, |
|
{ |
|
"completion_length": 372.65984497070315, |
|
"epoch": 0.8423493044822257, |
|
"grad_norm": 1.4345813989639282, |
|
"kl": 0.4869140625, |
|
"learning_rate": 3.6945794086007706e-07, |
|
"loss": 0.0195, |
|
"reward": 2.0116072297096252, |
|
"reward_std": 0.565495365858078, |
|
"rewards/accuracy_reward": 1.080357199907303, |
|
"rewards/format_reward": 0.9312500357627869, |
|
"step": 1090 |
|
}, |
|
{ |
|
"completion_length": 370.21876525878906, |
|
"epoch": 0.8462132921174652, |
|
"grad_norm": 1.1525709629058838, |
|
"kl": 0.47900390625, |
|
"learning_rate": 3.520016326767381e-07, |
|
"loss": 0.0191, |
|
"reward": 1.9973214983940124, |
|
"reward_std": 0.5897004574537277, |
|
"rewards/accuracy_reward": 1.067857176065445, |
|
"rewards/format_reward": 0.9294643342494965, |
|
"step": 1095 |
|
}, |
|
{ |
|
"completion_length": 359.56609802246095, |
|
"epoch": 0.8500772797527048, |
|
"grad_norm": 1.2896158695220947, |
|
"kl": 0.5654296875, |
|
"learning_rate": 3.3493649053890325e-07, |
|
"loss": 0.0226, |
|
"reward": 1.9857143998146056, |
|
"reward_std": 0.4953633636236191, |
|
"rewards/accuracy_reward": 1.0446428954601288, |
|
"rewards/format_reward": 0.9410714626312255, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.8500772797527048, |
|
"eval_completion_length": 358.0001708984375, |
|
"eval_kl": 0.4869791666666667, |
|
"eval_loss": 0.019230343401432037, |
|
"eval_reward": 1.9404762903849284, |
|
"eval_reward_std": 0.522600182890892, |
|
"eval_rewards/accuracy_reward": 1.009523856639862, |
|
"eval_rewards/format_reward": 0.9309524138768513, |
|
"eval_runtime": 90.3216, |
|
"eval_samples_per_second": 1.096, |
|
"eval_steps_per_second": 0.044, |
|
"step": 1100 |
|
}, |
|
{ |
|
"completion_length": 364.4919799804687, |
|
"epoch": 0.8539412673879444, |
|
"grad_norm": 1.5501704216003418, |
|
"kl": 0.46357421875, |
|
"learning_rate": 3.182656221324384e-07, |
|
"loss": 0.0185, |
|
"reward": 1.9089286923408508, |
|
"reward_std": 0.5076944470405579, |
|
"rewards/accuracy_reward": 0.9750000536441803, |
|
"rewards/format_reward": 0.9339286088943481, |
|
"step": 1105 |
|
}, |
|
{ |
|
"completion_length": 349.36341247558596, |
|
"epoch": 0.8578052550231839, |
|
"grad_norm": 0.8084538578987122, |
|
"kl": 0.41845703125, |
|
"learning_rate": 3.019920633431095e-07, |
|
"loss": 0.0167, |
|
"reward": 2.065178656578064, |
|
"reward_std": 0.4948657438158989, |
|
"rewards/accuracy_reward": 1.1071428954601288, |
|
"rewards/format_reward": 0.9580357551574707, |
|
"step": 1110 |
|
}, |
|
{ |
|
"completion_length": 371.67323303222656, |
|
"epoch": 0.8616692426584235, |
|
"grad_norm": 1.2597169876098633, |
|
"kl": 0.4419921875, |
|
"learning_rate": 2.861187777037269e-07, |
|
"loss": 0.0177, |
|
"reward": 1.9366072297096253, |
|
"reward_std": 0.5391754776239395, |
|
"rewards/accuracy_reward": 0.9982143342494965, |
|
"rewards/format_reward": 0.9383928954601288, |
|
"step": 1115 |
|
}, |
|
{ |
|
"completion_length": 374.87412414550784, |
|
"epoch": 0.865533230293663, |
|
"grad_norm": 0.7964634895324707, |
|
"kl": 0.5240234375, |
|
"learning_rate": 2.706486558544644e-07, |
|
"loss": 0.0209, |
|
"reward": 1.927678644657135, |
|
"reward_std": 0.5668253153562546, |
|
"rewards/accuracy_reward": 0.9928571820259094, |
|
"rewards/format_reward": 0.93482146859169, |
|
"step": 1120 |
|
}, |
|
{ |
|
"completion_length": 357.7071563720703, |
|
"epoch": 0.8693972179289027, |
|
"grad_norm": 0.9206239581108093, |
|
"kl": 0.470703125, |
|
"learning_rate": 2.55584515016451e-07, |
|
"loss": 0.0188, |
|
"reward": 1.9598214864730834, |
|
"reward_std": 0.546750283241272, |
|
"rewards/accuracy_reward": 1.0267857611179352, |
|
"rewards/format_reward": 0.9330357670783996, |
|
"step": 1125 |
|
}, |
|
{ |
|
"completion_length": 366.42144775390625, |
|
"epoch": 0.8732612055641422, |
|
"grad_norm": 1.1973419189453125, |
|
"kl": 0.5232421875, |
|
"learning_rate": 2.4092909847873713e-07, |
|
"loss": 0.0209, |
|
"reward": 1.9580357909202575, |
|
"reward_std": 0.5517275601625442, |
|
"rewards/accuracy_reward": 1.023214340209961, |
|
"rewards/format_reward": 0.93482146859169, |
|
"step": 1130 |
|
}, |
|
{ |
|
"completion_length": 365.56787109375, |
|
"epoch": 0.8771251931993818, |
|
"grad_norm": 0.7583184838294983, |
|
"kl": 0.587109375, |
|
"learning_rate": 2.2668507509871957e-07, |
|
"loss": 0.0235, |
|
"reward": 1.894642949104309, |
|
"reward_std": 0.5627562046051026, |
|
"rewards/accuracy_reward": 0.9553571879863739, |
|
"rewards/format_reward": 0.9392857491970062, |
|
"step": 1135 |
|
}, |
|
{ |
|
"completion_length": 354.81340942382815, |
|
"epoch": 0.8809891808346213, |
|
"grad_norm": 2.083658218383789, |
|
"kl": 0.714453125, |
|
"learning_rate": 2.128550388161263e-07, |
|
"loss": 0.0286, |
|
"reward": 1.93571435213089, |
|
"reward_std": 0.570975062251091, |
|
"rewards/accuracy_reward": 1.0071428775787354, |
|
"rewards/format_reward": 0.92857146859169, |
|
"step": 1140 |
|
}, |
|
{ |
|
"completion_length": 351.0473327636719, |
|
"epoch": 0.884853168469861, |
|
"grad_norm": 1.6536860466003418, |
|
"kl": 0.641796875, |
|
"learning_rate": 1.9944150818063667e-07, |
|
"loss": 0.0257, |
|
"reward": 1.952678632736206, |
|
"reward_std": 0.5870893836021424, |
|
"rewards/accuracy_reward": 1.0178571820259095, |
|
"rewards/format_reward": 0.93482146859169, |
|
"step": 1145 |
|
}, |
|
{ |
|
"completion_length": 371.1955535888672, |
|
"epoch": 0.8887171561051005, |
|
"grad_norm": 0.8971360325813293, |
|
"kl": 0.6400390625, |
|
"learning_rate": 1.864469258932397e-07, |
|
"loss": 0.0256, |
|
"reward": 1.9348215222358705, |
|
"reward_std": 0.5955395519733429, |
|
"rewards/accuracy_reward": 1.008928608894348, |
|
"rewards/format_reward": 0.9258928954601288, |
|
"step": 1150 |
|
}, |
|
{ |
|
"completion_length": 350.9419799804688, |
|
"epoch": 0.89258114374034, |
|
"grad_norm": 1.2180782556533813, |
|
"kl": 0.6533203125, |
|
"learning_rate": 1.7387365836139785e-07, |
|
"loss": 0.0261, |
|
"reward": 1.952678668498993, |
|
"reward_std": 0.5585884839296341, |
|
"rewards/accuracy_reward": 1.005357176065445, |
|
"rewards/format_reward": 0.9473214626312256, |
|
"step": 1155 |
|
}, |
|
{ |
|
"completion_length": 376.7419769287109, |
|
"epoch": 0.8964451313755796, |
|
"grad_norm": 1.8317776918411255, |
|
"kl": 0.723046875, |
|
"learning_rate": 1.6172399526810822e-07, |
|
"loss": 0.0289, |
|
"reward": 1.846428632736206, |
|
"reward_std": 0.5882213652133942, |
|
"rewards/accuracy_reward": 0.9250000357627869, |
|
"rewards/format_reward": 0.9214286208152771, |
|
"step": 1160 |
|
}, |
|
{ |
|
"completion_length": 366.1000244140625, |
|
"epoch": 0.9003091190108191, |
|
"grad_norm": 2.5519237518310547, |
|
"kl": 0.687109375, |
|
"learning_rate": 1.5000014915493467e-07, |
|
"loss": 0.0275, |
|
"reward": 1.9366072297096253, |
|
"reward_std": 0.6008837521076202, |
|
"rewards/accuracy_reward": 1.003571480512619, |
|
"rewards/format_reward": 0.9330357491970063, |
|
"step": 1165 |
|
}, |
|
{ |
|
"completion_length": 355.3768035888672, |
|
"epoch": 0.9041731066460588, |
|
"grad_norm": 1.4227640628814697, |
|
"kl": 0.80859375, |
|
"learning_rate": 1.3870425501908674e-07, |
|
"loss": 0.0324, |
|
"reward": 1.908928680419922, |
|
"reward_std": 0.5863888055086136, |
|
"rewards/accuracy_reward": 0.9839286148548126, |
|
"rewards/format_reward": 0.9250000476837158, |
|
"step": 1170 |
|
}, |
|
{ |
|
"completion_length": 355.98841247558596, |
|
"epoch": 0.9080370942812983, |
|
"grad_norm": 2.7810275554656982, |
|
"kl": 0.644921875, |
|
"learning_rate": 1.278383699246244e-07, |
|
"loss": 0.0258, |
|
"reward": 1.9991072177886964, |
|
"reward_std": 0.549174913764, |
|
"rewards/accuracy_reward": 1.0678571879863739, |
|
"rewards/format_reward": 0.9312500476837158, |
|
"step": 1175 |
|
}, |
|
{ |
|
"completion_length": 354.95090942382814, |
|
"epoch": 0.9119010819165378, |
|
"grad_norm": 1.349503517150879, |
|
"kl": 0.557421875, |
|
"learning_rate": 1.1740447262784782e-07, |
|
"loss": 0.0223, |
|
"reward": 1.9553572297096253, |
|
"reward_std": 0.6008891820907593, |
|
"rewards/accuracy_reward": 1.02857146859169, |
|
"rewards/format_reward": 0.9267857551574707, |
|
"step": 1180 |
|
}, |
|
{ |
|
"completion_length": 358.05895080566404, |
|
"epoch": 0.9157650695517774, |
|
"grad_norm": 1.2601906061172485, |
|
"kl": 0.625, |
|
"learning_rate": 1.0740446321695408e-07, |
|
"loss": 0.025, |
|
"reward": 1.9705358266830444, |
|
"reward_std": 0.6071323782205582, |
|
"rewards/accuracy_reward": 1.0517857551574707, |
|
"rewards/format_reward": 0.9187500417232514, |
|
"step": 1185 |
|
}, |
|
{ |
|
"completion_length": 356.47412109375, |
|
"epoch": 0.919629057187017, |
|
"grad_norm": 1.0005254745483398, |
|
"kl": 0.508984375, |
|
"learning_rate": 9.78401627660161e-08, |
|
"loss": 0.0204, |
|
"reward": 1.9919643878936768, |
|
"reward_std": 0.5772496670484543, |
|
"rewards/accuracy_reward": 1.0553572058677674, |
|
"rewards/format_reward": 0.9366071939468383, |
|
"step": 1190 |
|
}, |
|
{ |
|
"completion_length": 352.22947692871094, |
|
"epoch": 0.9234930448222566, |
|
"grad_norm": 2.757625102996826, |
|
"kl": 0.5763671875, |
|
"learning_rate": 8.871331300335322e-08, |
|
"loss": 0.023, |
|
"reward": 2.016964375972748, |
|
"reward_std": 0.5439181506633759, |
|
"rewards/accuracy_reward": 1.0732143223285675, |
|
"rewards/format_reward": 0.9437500476837158, |
|
"step": 1195 |
|
}, |
|
{ |
|
"completion_length": 354.0669860839844, |
|
"epoch": 0.9273570324574961, |
|
"grad_norm": 1.34922456741333, |
|
"kl": 0.555078125, |
|
"learning_rate": 8.002557599434802e-08, |
|
"loss": 0.0222, |
|
"reward": 1.9580357670783997, |
|
"reward_std": 0.5712159514427185, |
|
"rewards/accuracy_reward": 1.023214328289032, |
|
"rewards/format_reward": 0.93482146859169, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.9273570324574961, |
|
"eval_completion_length": 342.553505452474, |
|
"eval_kl": 0.5052734375, |
|
"eval_loss": 0.02041592448949814, |
|
"eval_reward": 2.0404762744903566, |
|
"eval_reward_std": 0.4870568384726842, |
|
"eval_rewards/accuracy_reward": 1.085714336236318, |
|
"eval_rewards/format_reward": 0.9547619263331095, |
|
"eval_runtime": 80.4099, |
|
"eval_samples_per_second": 1.231, |
|
"eval_steps_per_second": 0.05, |
|
"step": 1200 |
|
}, |
|
{ |
|
"completion_length": 378.85805358886716, |
|
"epoch": 0.9312210200927357, |
|
"grad_norm": 2.145590305328369, |
|
"kl": 0.67890625, |
|
"learning_rate": 7.177853383877498e-08, |
|
"loss": 0.0272, |
|
"reward": 1.8732143640518188, |
|
"reward_std": 0.5757515370845795, |
|
"rewards/accuracy_reward": 0.9535714745521545, |
|
"rewards/format_reward": 0.9196429073810577, |
|
"step": 1205 |
|
}, |
|
{ |
|
"completion_length": 346.4785888671875, |
|
"epoch": 0.9350850077279753, |
|
"grad_norm": 1.1914241313934326, |
|
"kl": 0.5564453125, |
|
"learning_rate": 6.397368838268497e-08, |
|
"loss": 0.0223, |
|
"reward": 1.9446429371833802, |
|
"reward_std": 0.49446034133434297, |
|
"rewards/accuracy_reward": 0.9946429014205933, |
|
"rewards/format_reward": 0.9500000357627869, |
|
"step": 1210 |
|
}, |
|
{ |
|
"completion_length": 355.9580535888672, |
|
"epoch": 0.9389489953632149, |
|
"grad_norm": 1.2249726057052612, |
|
"kl": 0.544140625, |
|
"learning_rate": 5.661246094491146e-08, |
|
"loss": 0.0218, |
|
"reward": 1.9660715103149413, |
|
"reward_std": 0.5542481303215027, |
|
"rewards/accuracy_reward": 1.0303571760654449, |
|
"rewards/format_reward": 0.935714328289032, |
|
"step": 1215 |
|
}, |
|
{ |
|
"completion_length": 365.9223388671875, |
|
"epoch": 0.9428129829984544, |
|
"grad_norm": 1.0840822458267212, |
|
"kl": 0.63984375, |
|
"learning_rate": 4.969619205823617e-08, |
|
"loss": 0.0256, |
|
"reward": 1.9410715103149414, |
|
"reward_std": 0.5451422989368438, |
|
"rewards/accuracy_reward": 0.9946429073810578, |
|
"rewards/format_reward": 0.9464286208152771, |
|
"step": 1220 |
|
}, |
|
{ |
|
"completion_length": 347.46519165039064, |
|
"epoch": 0.9466769706336939, |
|
"grad_norm": 1.4779506921768188, |
|
"kl": 0.5994140625, |
|
"learning_rate": 4.3226141225268804e-08, |
|
"loss": 0.024, |
|
"reward": 1.9232143759727478, |
|
"reward_std": 0.5508476465940475, |
|
"rewards/accuracy_reward": 0.980357187986374, |
|
"rewards/format_reward": 0.9428571879863739, |
|
"step": 1225 |
|
}, |
|
{ |
|
"completion_length": 358.8866302490234, |
|
"epoch": 0.9505409582689336, |
|
"grad_norm": 0.8061991930007935, |
|
"kl": 0.5189453125, |
|
"learning_rate": 3.7203486689083857e-08, |
|
"loss": 0.0208, |
|
"reward": 1.983928668498993, |
|
"reward_std": 0.6267418980598449, |
|
"rewards/accuracy_reward": 1.0500000536441803, |
|
"rewards/format_reward": 0.9339286088943481, |
|
"step": 1230 |
|
}, |
|
{ |
|
"completion_length": 342.37412414550784, |
|
"epoch": 0.9544049459041731, |
|
"grad_norm": 1.062484622001648, |
|
"kl": 0.5380859375, |
|
"learning_rate": 3.1629325218651695e-08, |
|
"loss": 0.0215, |
|
"reward": 2.0116072297096252, |
|
"reward_std": 0.501123908162117, |
|
"rewards/accuracy_reward": 1.060714316368103, |
|
"rewards/format_reward": 0.9508928954601288, |
|
"step": 1235 |
|
}, |
|
{ |
|
"completion_length": 353.96519775390624, |
|
"epoch": 0.9582689335394127, |
|
"grad_norm": 1.2145483493804932, |
|
"kl": 0.5751953125, |
|
"learning_rate": 2.6504671909109993e-08, |
|
"loss": 0.023, |
|
"reward": 1.9794643640518188, |
|
"reward_std": 0.4939621418714523, |
|
"rewards/accuracy_reward": 1.0357143342494965, |
|
"rewards/format_reward": 0.9437500417232514, |
|
"step": 1240 |
|
}, |
|
{ |
|
"completion_length": 357.8018035888672, |
|
"epoch": 0.9621329211746522, |
|
"grad_norm": 0.7816463708877563, |
|
"kl": 0.5390625, |
|
"learning_rate": 2.1830459996908527e-08, |
|
"loss": 0.0216, |
|
"reward": 1.9553572177886962, |
|
"reward_std": 0.5724190145730972, |
|
"rewards/accuracy_reward": 1.0125000536441804, |
|
"rewards/format_reward": 0.9428571820259094, |
|
"step": 1245 |
|
}, |
|
{ |
|
"completion_length": 347.95180053710936, |
|
"epoch": 0.9659969088098919, |
|
"grad_norm": 1.0017303228378296, |
|
"kl": 0.5392578125, |
|
"learning_rate": 1.7607540689859036e-08, |
|
"loss": 0.0216, |
|
"reward": 1.9767858147621156, |
|
"reward_std": 0.5618195921182633, |
|
"rewards/accuracy_reward": 1.0446429073810577, |
|
"rewards/format_reward": 0.9321429073810578, |
|
"step": 1250 |
|
}, |
|
{ |
|
"completion_length": 357.79555053710936, |
|
"epoch": 0.9698608964451314, |
|
"grad_norm": 1.0843381881713867, |
|
"kl": 0.5859375, |
|
"learning_rate": 1.383668301212393e-08, |
|
"loss": 0.0234, |
|
"reward": 1.9732144117355346, |
|
"reward_std": 0.5734216451644898, |
|
"rewards/accuracy_reward": 1.0410714626312256, |
|
"rewards/format_reward": 0.9321428954601287, |
|
"step": 1255 |
|
}, |
|
{ |
|
"completion_length": 338.9107269287109, |
|
"epoch": 0.973724884080371, |
|
"grad_norm": 0.9808955192565918, |
|
"kl": 0.611328125, |
|
"learning_rate": 1.0518573664172193e-08, |
|
"loss": 0.0245, |
|
"reward": 1.9794643878936768, |
|
"reward_std": 0.566856500506401, |
|
"rewards/accuracy_reward": 1.035714340209961, |
|
"rewards/format_reward": 0.9437500417232514, |
|
"step": 1260 |
|
}, |
|
{ |
|
"completion_length": 368.7884094238281, |
|
"epoch": 0.9775888717156105, |
|
"grad_norm": 1.4401775598526, |
|
"kl": 0.498046875, |
|
"learning_rate": 7.653816897725819e-09, |
|
"loss": 0.0199, |
|
"reward": 1.9937500834465027, |
|
"reward_std": 0.5333459317684174, |
|
"rewards/accuracy_reward": 1.04821435213089, |
|
"rewards/format_reward": 0.9455357551574707, |
|
"step": 1265 |
|
}, |
|
{ |
|
"completion_length": 365.1580535888672, |
|
"epoch": 0.98145285935085, |
|
"grad_norm": 1.1229665279388428, |
|
"kl": 0.6189453125, |
|
"learning_rate": 5.242934405720879e-09, |
|
"loss": 0.0248, |
|
"reward": 1.9437500715255738, |
|
"reward_std": 0.5873760730028152, |
|
"rewards/accuracy_reward": 0.9982143342494965, |
|
"rewards/format_reward": 0.9455357670783997, |
|
"step": 1270 |
|
}, |
|
{ |
|
"completion_length": 360.70001831054685, |
|
"epoch": 0.9853168469860897, |
|
"grad_norm": 1.2572500705718994, |
|
"kl": 0.56953125, |
|
"learning_rate": 3.286365227304633e-09, |
|
"loss": 0.0228, |
|
"reward": 2.0000000953674317, |
|
"reward_std": 0.5724943190813064, |
|
"rewards/accuracy_reward": 1.0642857611179353, |
|
"rewards/format_reward": 0.935714328289032, |
|
"step": 1275 |
|
}, |
|
{ |
|
"completion_length": 357.1821533203125, |
|
"epoch": 0.9891808346213292, |
|
"grad_norm": 0.9178944826126099, |
|
"kl": 0.537109375, |
|
"learning_rate": 1.7844656678817074e-09, |
|
"loss": 0.0215, |
|
"reward": 1.9071429371833801, |
|
"reward_std": 0.5804499536752701, |
|
"rewards/accuracy_reward": 0.9732143223285675, |
|
"rewards/format_reward": 0.9339286088943481, |
|
"step": 1280 |
|
}, |
|
{ |
|
"completion_length": 360.2035888671875, |
|
"epoch": 0.9930448222565688, |
|
"grad_norm": 1.0505741834640503, |
|
"kl": 0.52841796875, |
|
"learning_rate": 7.375092342298828e-10, |
|
"loss": 0.0212, |
|
"reward": 1.9723215103149414, |
|
"reward_std": 0.5573354363441467, |
|
"rewards/accuracy_reward": 1.0339286267757415, |
|
"rewards/format_reward": 0.9383928954601288, |
|
"step": 1285 |
|
}, |
|
{ |
|
"completion_length": 353.73841247558596, |
|
"epoch": 0.9969088098918083, |
|
"grad_norm": 1.0907429456710815, |
|
"kl": 0.5439453125, |
|
"learning_rate": 1.4568658469132913e-10, |
|
"loss": 0.0218, |
|
"reward": 2.0357143759727476, |
|
"reward_std": 0.5376188308000565, |
|
"rewards/accuracy_reward": 1.0821429014205932, |
|
"rewards/format_reward": 0.9535714745521545, |
|
"step": 1290 |
|
}, |
|
{ |
|
"completion_length": 359.22038650512695, |
|
"epoch": 1.0, |
|
"kl": 0.52099609375, |
|
"reward": 1.9821429252624512, |
|
"reward_std": 0.5921385176479816, |
|
"rewards/accuracy_reward": 1.051339328289032, |
|
"rewards/format_reward": 0.9308036118745804, |
|
"step": 1294, |
|
"total_flos": 0.0, |
|
"train_loss": 0.7486699576652505, |
|
"train_runtime": 29158.9846, |
|
"train_samples_per_second": 2.484, |
|
"train_steps_per_second": 0.044 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1294, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|