|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9991024490319271, |
|
"eval_steps": 400, |
|
"global_step": 974, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"count/fg_chosen": 14.079999923706055, |
|
"count/fg_rejected": 3.7826087474823, |
|
"epoch": 0.01025772534940377, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.78272247314453, |
|
"fg_logps/policy_chosen": -15.784832000732422, |
|
"fg_logps/policy_rejected": -17.75761604309082, |
|
"fg_logps/reference_KL": -21.78750991821289, |
|
"fg_logps/reference_chosen": -15.790050506591797, |
|
"fg_logps/reference_rejected": -17.756908416748047, |
|
"fg_loss": 0.808033287525177, |
|
"fg_rewards/chosen_sum": 0.006448890082538128, |
|
"fg_rewards/rejected_sum": -2.7408803362050094e-05, |
|
"grad_norm": 18.746104238140656, |
|
"kl": 0.03848903253674507, |
|
"learning_rate": 2.1220544726955587e-07, |
|
"logps/chosen": -305.5322529560811, |
|
"logps/rejected": -327.7266306322674, |
|
"loss": 0.6362, |
|
"rewards/chosen": -0.001747507501292873, |
|
"rewards/margins": -0.002081668250248688, |
|
"rewards/rejected": 0.00033416074895581535, |
|
"step": 10 |
|
}, |
|
{ |
|
"count/fg_chosen": 13.947368621826172, |
|
"count/fg_rejected": 3.5714285373687744, |
|
"epoch": 0.02051545069880754, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -20.42728614807129, |
|
"fg_logps/policy_chosen": -15.204089164733887, |
|
"fg_logps/policy_rejected": -20.7266902923584, |
|
"fg_logps/reference_KL": -20.41632652282715, |
|
"fg_logps/reference_chosen": -15.188560485839844, |
|
"fg_logps/reference_rejected": -20.721162796020508, |
|
"fg_loss": 0.7661333084106445, |
|
"fg_rewards/chosen_sum": -0.025545625016093254, |
|
"fg_rewards/rejected_sum": -0.00522112101316452, |
|
"grad_norm": 19.071229786810708, |
|
"kl": 0.006695461459457874, |
|
"learning_rate": 3.0896765977473373e-07, |
|
"logps/chosen": -313.60243626644734, |
|
"logps/rejected": -306.7811569940476, |
|
"loss": 0.5877, |
|
"rewards/chosen": -0.012838767547356454, |
|
"rewards/margins": 0.004141214908215037, |
|
"rewards/rejected": -0.01697998245557149, |
|
"step": 20 |
|
}, |
|
{ |
|
"count/fg_chosen": 15.894737243652344, |
|
"count/fg_rejected": 3.5, |
|
"epoch": 0.03077317604821131, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -19.23096466064453, |
|
"fg_logps/policy_chosen": -14.642033576965332, |
|
"fg_logps/policy_rejected": -16.705078125, |
|
"fg_logps/reference_KL": -19.1680850982666, |
|
"fg_logps/reference_chosen": -14.53127384185791, |
|
"fg_logps/reference_rejected": -16.613784790039062, |
|
"fg_loss": 0.7854437232017517, |
|
"fg_rewards/chosen_sum": -0.16305610537528992, |
|
"fg_rewards/rejected_sum": -0.03273481875658035, |
|
"grad_norm": 27.51124230932652, |
|
"kl": 0.0037935494910925627, |
|
"learning_rate": 3.5530203356711474e-07, |
|
"logps/chosen": -328.168653612013, |
|
"logps/rejected": -307.9456419427711, |
|
"loss": 0.5906, |
|
"rewards/chosen": -0.04671859121941901, |
|
"rewards/margins": 0.03360887939913921, |
|
"rewards/rejected": -0.08032747061855822, |
|
"step": 30 |
|
}, |
|
{ |
|
"count/fg_chosen": 16.538461685180664, |
|
"count/fg_rejected": 3.4761905670166016, |
|
"epoch": 0.04103090139761508, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -19.668180465698242, |
|
"fg_logps/policy_chosen": -13.914729118347168, |
|
"fg_logps/policy_rejected": -17.876625061035156, |
|
"fg_logps/reference_KL": -19.54073143005371, |
|
"fg_logps/reference_chosen": -13.71956729888916, |
|
"fg_logps/reference_rejected": -17.639841079711914, |
|
"fg_loss": 0.7382856011390686, |
|
"fg_rewards/chosen_sum": -0.32776451110839844, |
|
"fg_rewards/rejected_sum": -0.08581020683050156, |
|
"grad_norm": 18.904689541215447, |
|
"kl": 0.0, |
|
"learning_rate": 3.907900061589056e-07, |
|
"logps/chosen": -324.57538132440476, |
|
"logps/rejected": -341.6456363075658, |
|
"loss": 0.6186, |
|
"rewards/chosen": -0.12434386071704683, |
|
"rewards/margins": 0.07653144666724336, |
|
"rewards/rejected": -0.20087530738429019, |
|
"step": 40 |
|
}, |
|
{ |
|
"count/fg_chosen": 13.052631378173828, |
|
"count/fg_rejected": 3.5999999046325684, |
|
"epoch": 0.051288626747018846, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -22.696481704711914, |
|
"fg_logps/policy_chosen": -14.908551216125488, |
|
"fg_logps/policy_rejected": -17.680747985839844, |
|
"fg_logps/reference_KL": -22.48174476623535, |
|
"fg_logps/reference_chosen": -14.603896141052246, |
|
"fg_logps/reference_rejected": -17.34904670715332, |
|
"fg_loss": 0.7404539585113525, |
|
"fg_rewards/chosen_sum": -0.35476553440093994, |
|
"fg_rewards/rejected_sum": -0.14453592896461487, |
|
"grad_norm": 14.07450204568209, |
|
"kl": 0.014550399966537952, |
|
"learning_rate": 4.175211072161436e-07, |
|
"logps/chosen": -291.7388392857143, |
|
"logps/rejected": -314.5031061746988, |
|
"loss": 0.5781, |
|
"rewards/chosen": -0.10712227264007965, |
|
"rewards/margins": 0.12518125482426995, |
|
"rewards/rejected": -0.2323035274643496, |
|
"step": 50 |
|
}, |
|
{ |
|
"count/fg_chosen": 14.5, |
|
"count/fg_rejected": 4.166666507720947, |
|
"epoch": 0.06154635209642262, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -20.49053955078125, |
|
"fg_logps/policy_chosen": -15.30705738067627, |
|
"fg_logps/policy_rejected": -18.280315399169922, |
|
"fg_logps/reference_KL": -20.184072494506836, |
|
"fg_logps/reference_chosen": -14.819132804870605, |
|
"fg_logps/reference_rejected": -17.81497573852539, |
|
"fg_loss": 0.7277103066444397, |
|
"fg_rewards/chosen_sum": -0.6670733690261841, |
|
"fg_rewards/rejected_sum": -0.22372370958328247, |
|
"grad_norm": 17.53900194639282, |
|
"kl": 0.0, |
|
"learning_rate": 4.389727636522206e-07, |
|
"logps/chosen": -306.5228568412162, |
|
"logps/rejected": -308.6946538880814, |
|
"loss": 0.5582, |
|
"rewards/chosen": -0.19411424688390783, |
|
"rewards/margins": 0.22253594857053438, |
|
"rewards/rejected": -0.4166501954544422, |
|
"step": 60 |
|
}, |
|
{ |
|
"count/fg_chosen": 15.692307472229004, |
|
"count/fg_rejected": 4.458333492279053, |
|
"epoch": 0.07180407744582638, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -20.50032615661621, |
|
"fg_logps/policy_chosen": -15.366405487060547, |
|
"fg_logps/policy_rejected": -20.129003524780273, |
|
"fg_logps/reference_KL": -20.127044677734375, |
|
"fg_logps/reference_chosen": -14.826613426208496, |
|
"fg_logps/reference_rejected": -19.553565979003906, |
|
"fg_loss": 0.7948136925697327, |
|
"fg_rewards/chosen_sum": -0.8405532836914062, |
|
"fg_rewards/rejected_sum": -0.24530810117721558, |
|
"grad_norm": 17.707988179125024, |
|
"kl": 0.016248703002929688, |
|
"learning_rate": 4.568903445353587e-07, |
|
"logps/chosen": -266.67440580985914, |
|
"logps/rejected": -392.16322858146066, |
|
"loss": 0.5804, |
|
"rewards/chosen": -0.3093531433965119, |
|
"rewards/margins": 0.4345650290326051, |
|
"rewards/rejected": -0.743918172429117, |
|
"step": 70 |
|
}, |
|
{ |
|
"count/fg_chosen": 13.82608699798584, |
|
"count/fg_rejected": 4.736842155456543, |
|
"epoch": 0.08206180279523016, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -20.310537338256836, |
|
"fg_logps/policy_chosen": -15.233660697937012, |
|
"fg_logps/policy_rejected": -19.051939010620117, |
|
"fg_logps/reference_KL": -19.766571044921875, |
|
"fg_logps/reference_chosen": -14.48674201965332, |
|
"fg_logps/reference_rejected": -18.337350845336914, |
|
"fg_loss": 0.840079665184021, |
|
"fg_rewards/chosen_sum": -1.008780837059021, |
|
"fg_rewards/rejected_sum": -0.38555219769477844, |
|
"grad_norm": 16.529452607844135, |
|
"kl": 0.0, |
|
"learning_rate": 4.7227525172567226e-07, |
|
"logps/chosen": -258.29019579475306, |
|
"logps/rejected": -339.7275761471519, |
|
"loss": 0.5753, |
|
"rewards/chosen": -0.38390738875777636, |
|
"rewards/margins": 0.5281775528648007, |
|
"rewards/rejected": -0.9120849416225771, |
|
"step": 80 |
|
}, |
|
{ |
|
"count/fg_chosen": 16.94444465637207, |
|
"count/fg_rejected": 4.5625, |
|
"epoch": 0.09231952814463393, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -18.532676696777344, |
|
"fg_logps/policy_chosen": -14.168781280517578, |
|
"fg_logps/policy_rejected": -15.193059921264648, |
|
"fg_logps/reference_KL": -17.916645050048828, |
|
"fg_logps/reference_chosen": -13.280251502990723, |
|
"fg_logps/reference_rejected": -14.359517097473145, |
|
"fg_loss": 0.7089979648590088, |
|
"fg_rewards/chosen_sum": -1.4005531072616577, |
|
"fg_rewards/rejected_sum": -0.3471396267414093, |
|
"grad_norm": 22.36999453602448, |
|
"kl": 0.013136100955307484, |
|
"learning_rate": 4.857556042627658e-07, |
|
"logps/chosen": -354.51475432981925, |
|
"logps/rejected": -378.41106939935065, |
|
"loss": 0.55, |
|
"rewards/chosen": -0.5630912321159639, |
|
"rewards/margins": 0.6156807405402861, |
|
"rewards/rejected": -1.17877197265625, |
|
"step": 90 |
|
}, |
|
{ |
|
"count/fg_chosen": 17.559999465942383, |
|
"count/fg_rejected": 3.4583332538604736, |
|
"epoch": 0.10257725349403769, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -19.625595092773438, |
|
"fg_logps/policy_chosen": -14.884417533874512, |
|
"fg_logps/policy_rejected": -19.52889060974121, |
|
"fg_logps/reference_KL": -18.972139358520508, |
|
"fg_logps/reference_chosen": -13.96281623840332, |
|
"fg_logps/reference_rejected": -18.573102951049805, |
|
"fg_loss": 0.7564048171043396, |
|
"fg_rewards/chosen_sum": -1.531907081604004, |
|
"fg_rewards/rejected_sum": -0.3621842861175537, |
|
"grad_norm": 16.9566179157687, |
|
"kl": 0.0, |
|
"learning_rate": 4.977514249230057e-07, |
|
"logps/chosen": -296.1236049107143, |
|
"logps/rejected": -352.51732113486844, |
|
"loss": 0.6056, |
|
"rewards/chosen": -0.8061084747314453, |
|
"rewards/margins": 0.3998990309865851, |
|
"rewards/rejected": -1.2060075057180304, |
|
"step": 100 |
|
}, |
|
{ |
|
"count/fg_chosen": 17.423076629638672, |
|
"count/fg_rejected": 3.277777671813965, |
|
"epoch": 0.11283497884344147, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -19.713420867919922, |
|
"fg_logps/policy_chosen": -14.724455833435059, |
|
"fg_logps/policy_rejected": -17.52227020263672, |
|
"fg_logps/reference_KL": -19.24376678466797, |
|
"fg_logps/reference_chosen": -14.040182113647461, |
|
"fg_logps/reference_rejected": -16.54549789428711, |
|
"fg_loss": 0.6636641621589661, |
|
"fg_rewards/chosen_sum": -0.9770669341087341, |
|
"fg_rewards/rejected_sum": -0.3046890199184418, |
|
"grad_norm": 15.8810865349395, |
|
"kl": 0.013418341055512428, |
|
"learning_rate": 4.960045662100457e-07, |
|
"logps/chosen": -319.42288602941176, |
|
"logps/rejected": -328.1127604166667, |
|
"loss": 0.5794, |
|
"rewards/chosen": -0.6423200719496783, |
|
"rewards/margins": 0.5826459518133426, |
|
"rewards/rejected": -1.2249660237630209, |
|
"step": 110 |
|
}, |
|
{ |
|
"count/fg_chosen": 17.038461685180664, |
|
"count/fg_rejected": 3.549999952316284, |
|
"epoch": 0.12309270419284524, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.378173828125, |
|
"fg_logps/policy_chosen": -15.218153953552246, |
|
"fg_logps/policy_rejected": -17.869609832763672, |
|
"fg_logps/reference_KL": -20.997371673583984, |
|
"fg_logps/reference_chosen": -14.897927284240723, |
|
"fg_logps/reference_rejected": -17.389179229736328, |
|
"fg_loss": 0.7092593908309937, |
|
"fg_rewards/chosen_sum": -0.43586668372154236, |
|
"fg_rewards/rejected_sum": -0.22385962307453156, |
|
"grad_norm": 13.44458367896507, |
|
"kl": 0.11229915916919708, |
|
"learning_rate": 4.90296803652968e-07, |
|
"logps/chosen": -279.3057077891791, |
|
"logps/rejected": -315.11082409274195, |
|
"loss": 0.5637, |
|
"rewards/chosen": -0.42403762020281893, |
|
"rewards/margins": 0.6563830071860841, |
|
"rewards/rejected": -1.080420627388903, |
|
"step": 120 |
|
}, |
|
{ |
|
"count/fg_chosen": 18.882352828979492, |
|
"count/fg_rejected": 4.5333333015441895, |
|
"epoch": 0.13335042954224902, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -18.839466094970703, |
|
"fg_logps/policy_chosen": -14.21582317352295, |
|
"fg_logps/policy_rejected": -15.989607810974121, |
|
"fg_logps/reference_KL": -18.35956382751465, |
|
"fg_logps/reference_chosen": -13.778548240661621, |
|
"fg_logps/reference_rejected": -15.061809539794922, |
|
"fg_loss": 0.752821147441864, |
|
"fg_rewards/chosen_sum": -0.353499174118042, |
|
"fg_rewards/rejected_sum": -0.5261321663856506, |
|
"grad_norm": 18.943079661475036, |
|
"kl": 0.0, |
|
"learning_rate": 4.845890410958904e-07, |
|
"logps/chosen": -399.24955100574715, |
|
"logps/rejected": -355.0519852311644, |
|
"loss": 0.5522, |
|
"rewards/chosen": -0.5169386589664152, |
|
"rewards/margins": 0.7560970959297749, |
|
"rewards/rejected": -1.2730357548961901, |
|
"step": 130 |
|
}, |
|
{ |
|
"count/fg_chosen": 14.600000381469727, |
|
"count/fg_rejected": 3.2941176891326904, |
|
"epoch": 0.14360815489165277, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -22.31153106689453, |
|
"fg_logps/policy_chosen": -15.935779571533203, |
|
"fg_logps/policy_rejected": -19.569046020507812, |
|
"fg_logps/reference_KL": -21.87625503540039, |
|
"fg_logps/reference_chosen": -15.854820251464844, |
|
"fg_logps/reference_rejected": -19.18140983581543, |
|
"fg_loss": 0.6795336604118347, |
|
"fg_rewards/chosen_sum": 0.07530271261930466, |
|
"fg_rewards/rejected_sum": -0.0755414366722107, |
|
"grad_norm": 14.329744807366827, |
|
"kl": 0.010508442297577858, |
|
"learning_rate": 4.788812785388127e-07, |
|
"logps/chosen": -288.93102254746833, |
|
"logps/rejected": -339.4910059799383, |
|
"loss": 0.5424, |
|
"rewards/chosen": -0.38591092749486994, |
|
"rewards/margins": 0.9102081973062006, |
|
"rewards/rejected": -1.2961191248010706, |
|
"step": 140 |
|
}, |
|
{ |
|
"count/fg_chosen": 17.049999237060547, |
|
"count/fg_rejected": 3.2352941036224365, |
|
"epoch": 0.15386588024105655, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.11859703063965, |
|
"fg_logps/policy_chosen": -15.353170394897461, |
|
"fg_logps/policy_rejected": -19.041038513183594, |
|
"fg_logps/reference_KL": -20.675270080566406, |
|
"fg_logps/reference_chosen": -15.19269847869873, |
|
"fg_logps/reference_rejected": -18.65860366821289, |
|
"fg_loss": 0.6720137596130371, |
|
"fg_rewards/chosen_sum": -0.040867485105991364, |
|
"fg_rewards/rejected_sum": -0.07588446885347366, |
|
"grad_norm": 15.943835226437113, |
|
"kl": 0.0, |
|
"learning_rate": 4.7317351598173515e-07, |
|
"logps/chosen": -372.51135706018516, |
|
"logps/rejected": -344.8247132120253, |
|
"loss": 0.5326, |
|
"rewards/chosen": -0.4752860363618827, |
|
"rewards/margins": 0.8493531029342507, |
|
"rewards/rejected": -1.3246391392961334, |
|
"step": 150 |
|
}, |
|
{ |
|
"count/fg_chosen": 14.833333015441895, |
|
"count/fg_rejected": 2.642857074737549, |
|
"epoch": 0.16412360559046033, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.43903350830078, |
|
"fg_logps/policy_chosen": -14.500401496887207, |
|
"fg_logps/policy_rejected": -17.249717712402344, |
|
"fg_logps/reference_KL": -21.081066131591797, |
|
"fg_logps/reference_chosen": -14.807108879089355, |
|
"fg_logps/reference_rejected": -17.323772430419922, |
|
"fg_loss": 0.6379384994506836, |
|
"fg_rewards/chosen_sum": 0.5964276194572449, |
|
"fg_rewards/rejected_sum": -0.0015845863381400704, |
|
"grad_norm": 13.675438870382441, |
|
"kl": 0.0012632369762286544, |
|
"learning_rate": 4.6746575342465747e-07, |
|
"logps/chosen": -327.4292566636029, |
|
"logps/rejected": -389.55515455163044, |
|
"loss": 0.4826, |
|
"rewards/chosen": -0.45451857061947093, |
|
"rewards/margins": 0.7520349300121103, |
|
"rewards/rejected": -1.2065535006315813, |
|
"step": 160 |
|
}, |
|
{ |
|
"count/fg_chosen": 18.764705657958984, |
|
"count/fg_rejected": 3.538461446762085, |
|
"epoch": 0.17438133093986408, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -19.866031646728516, |
|
"fg_logps/policy_chosen": -14.610063552856445, |
|
"fg_logps/policy_rejected": -15.764613151550293, |
|
"fg_logps/reference_KL": -19.758529663085938, |
|
"fg_logps/reference_chosen": -15.149678230285645, |
|
"fg_logps/reference_rejected": -15.872199058532715, |
|
"fg_loss": 0.7423752546310425, |
|
"fg_rewards/chosen_sum": 1.0474270582199097, |
|
"fg_rewards/rejected_sum": -0.00038578076055273414, |
|
"grad_norm": 14.85184229504073, |
|
"kl": 0.05892143398523331, |
|
"learning_rate": 4.617579908675799e-07, |
|
"logps/chosen": -289.5171331091772, |
|
"logps/rejected": -364.5406298225309, |
|
"loss": 0.5105, |
|
"rewards/chosen": -0.3850521618806863, |
|
"rewards/margins": 0.9919037888209918, |
|
"rewards/rejected": -1.3769559507016782, |
|
"step": 170 |
|
}, |
|
{ |
|
"count/fg_chosen": 14.399999618530273, |
|
"count/fg_rejected": 4.5, |
|
"epoch": 0.18463905628926786, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.385639190673828, |
|
"fg_logps/policy_chosen": -15.719804763793945, |
|
"fg_logps/policy_rejected": -21.617740631103516, |
|
"fg_logps/reference_KL": -21.118305206298828, |
|
"fg_logps/reference_chosen": -15.717147827148438, |
|
"fg_logps/reference_rejected": -21.633119583129883, |
|
"fg_loss": 0.875866711139679, |
|
"fg_rewards/chosen_sum": 0.0969744473695755, |
|
"fg_rewards/rejected_sum": -0.13854043185710907, |
|
"grad_norm": 14.688764944309868, |
|
"kl": 0.0, |
|
"learning_rate": 4.5605022831050226e-07, |
|
"logps/chosen": -251.9897216796875, |
|
"logps/rejected": -383.8666259765625, |
|
"loss": 0.5364, |
|
"rewards/chosen": -0.6086512088775635, |
|
"rewards/margins": 1.0637765407562256, |
|
"rewards/rejected": -1.6724277496337892, |
|
"step": 180 |
|
}, |
|
{ |
|
"count/fg_chosen": 19.217391967773438, |
|
"count/fg_rejected": 4.523809432983398, |
|
"epoch": 0.19489678163867163, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -18.718454360961914, |
|
"fg_logps/policy_chosen": -13.899271965026855, |
|
"fg_logps/policy_rejected": -18.827178955078125, |
|
"fg_logps/reference_KL": -18.324739456176758, |
|
"fg_logps/reference_chosen": -13.679468154907227, |
|
"fg_logps/reference_rejected": -18.46164321899414, |
|
"fg_loss": 0.7202942967414856, |
|
"fg_rewards/chosen_sum": -0.006031676661223173, |
|
"fg_rewards/rejected_sum": -0.15737372636795044, |
|
"grad_norm": 14.404560894432748, |
|
"kl": 0.05560264736413956, |
|
"learning_rate": 4.503424657534247e-07, |
|
"logps/chosen": -338.58002804487177, |
|
"logps/rejected": -338.0175543064024, |
|
"loss": 0.5344, |
|
"rewards/chosen": -0.542063737526918, |
|
"rewards/margins": 1.0375677887688135, |
|
"rewards/rejected": -1.5796315262957317, |
|
"step": 190 |
|
}, |
|
{ |
|
"count/fg_chosen": 17.399999618530273, |
|
"count/fg_rejected": 4.789473533630371, |
|
"epoch": 0.20515450698807539, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -19.66611099243164, |
|
"fg_logps/policy_chosen": -14.653355598449707, |
|
"fg_logps/policy_rejected": -17.5184383392334, |
|
"fg_logps/reference_KL": -19.25302505493164, |
|
"fg_logps/reference_chosen": -14.066553115844727, |
|
"fg_logps/reference_rejected": -17.042163848876953, |
|
"fg_loss": 0.9406682848930359, |
|
"fg_rewards/chosen_sum": -0.7670488357543945, |
|
"fg_rewards/rejected_sum": -0.16450081765651703, |
|
"grad_norm": 13.472817295849191, |
|
"kl": 0.0, |
|
"learning_rate": 4.44634703196347e-07, |
|
"logps/chosen": -328.9538845486111, |
|
"logps/rejected": -382.8152622767857, |
|
"loss": 0.5533, |
|
"rewards/chosen": -0.6610004001193577, |
|
"rewards/margins": 1.229798332093254, |
|
"rewards/rejected": -1.8907987322126116, |
|
"step": 200 |
|
}, |
|
{ |
|
"count/fg_chosen": 17.6875, |
|
"count/fg_rejected": 5.384615421295166, |
|
"epoch": 0.21541223233747916, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -20.451927185058594, |
|
"fg_logps/policy_chosen": -14.831332206726074, |
|
"fg_logps/policy_rejected": -17.854537963867188, |
|
"fg_logps/reference_KL": -20.010757446289062, |
|
"fg_logps/reference_chosen": -14.1827392578125, |
|
"fg_logps/reference_rejected": -16.983478546142578, |
|
"fg_loss": 0.7702024579048157, |
|
"fg_rewards/chosen_sum": -1.0590859651565552, |
|
"fg_rewards/rejected_sum": -0.4240546226501465, |
|
"grad_norm": 16.379366840236408, |
|
"kl": 0.0, |
|
"learning_rate": 4.3892694063926936e-07, |
|
"logps/chosen": -336.93881138392857, |
|
"logps/rejected": -410.259375, |
|
"loss": 0.4634, |
|
"rewards/chosen": -0.4613551548549107, |
|
"rewards/margins": 1.4628642733134922, |
|
"rewards/rejected": -1.9242194281684029, |
|
"step": 210 |
|
}, |
|
{ |
|
"count/fg_chosen": 14.0, |
|
"count/fg_rejected": 4.181818008422852, |
|
"epoch": 0.22566995768688294, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.021493911743164, |
|
"fg_logps/policy_chosen": -16.28066062927246, |
|
"fg_logps/policy_rejected": -20.48377799987793, |
|
"fg_logps/reference_KL": -20.17574691772461, |
|
"fg_logps/reference_chosen": -15.166167259216309, |
|
"fg_logps/reference_rejected": -19.03717803955078, |
|
"fg_loss": 0.6626110672950745, |
|
"fg_rewards/chosen_sum": -1.1887147426605225, |
|
"fg_rewards/rejected_sum": -0.47799989581108093, |
|
"grad_norm": 13.99674523050283, |
|
"kl": 0.0, |
|
"learning_rate": 4.332191780821918e-07, |
|
"logps/chosen": -288.56021278782896, |
|
"logps/rejected": -361.5675455729167, |
|
"loss": 0.4491, |
|
"rewards/chosen": -0.5620980011789423, |
|
"rewards/margins": 1.510199412964938, |
|
"rewards/rejected": -2.0722974141438804, |
|
"step": 220 |
|
}, |
|
{ |
|
"count/fg_chosen": 14.4399995803833, |
|
"count/fg_rejected": 4.888888835906982, |
|
"epoch": 0.2359276830362867, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -20.916282653808594, |
|
"fg_logps/policy_chosen": -15.752463340759277, |
|
"fg_logps/policy_rejected": -19.13568878173828, |
|
"fg_logps/reference_KL": -20.100528717041016, |
|
"fg_logps/reference_chosen": -14.519099235534668, |
|
"fg_logps/reference_rejected": -17.75797462463379, |
|
"fg_loss": 0.8086566925048828, |
|
"fg_rewards/chosen_sum": -1.4387943744659424, |
|
"fg_rewards/rejected_sum": -0.7463889122009277, |
|
"grad_norm": 9.096647980350944, |
|
"kl": 0.0, |
|
"learning_rate": 4.2751141552511415e-07, |
|
"logps/chosen": -325.9561071908602, |
|
"logps/rejected": -342.84818097014926, |
|
"loss": 0.5797, |
|
"rewards/chosen": -0.93502930671938, |
|
"rewards/margins": 1.3941938706068793, |
|
"rewards/rejected": -2.3292231773262593, |
|
"step": 230 |
|
}, |
|
{ |
|
"count/fg_chosen": 19.450000762939453, |
|
"count/fg_rejected": 3.882352828979492, |
|
"epoch": 0.24618540838569047, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -19.968975067138672, |
|
"fg_logps/policy_chosen": -15.759946823120117, |
|
"fg_logps/policy_rejected": -18.90660858154297, |
|
"fg_logps/reference_KL": -19.11962890625, |
|
"fg_logps/reference_chosen": -14.694534301757812, |
|
"fg_logps/reference_rejected": -17.608003616333008, |
|
"fg_loss": 0.7771142721176147, |
|
"fg_rewards/chosen_sum": -2.0326945781707764, |
|
"fg_rewards/rejected_sum": -0.5817491412162781, |
|
"grad_norm": 13.085165433618942, |
|
"kl": 0.0, |
|
"learning_rate": 4.218036529680365e-07, |
|
"logps/chosen": -324.1948939732143, |
|
"logps/rejected": -355.4172092013889, |
|
"loss": 0.4984, |
|
"rewards/chosen": -0.8741681780133929, |
|
"rewards/margins": 0.9406820145864335, |
|
"rewards/rejected": -1.8148501925998264, |
|
"step": 240 |
|
}, |
|
{ |
|
"count/fg_chosen": 15.941176414489746, |
|
"count/fg_rejected": 3.5625, |
|
"epoch": 0.25644313373509425, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.16470718383789, |
|
"fg_logps/policy_chosen": -15.544363975524902, |
|
"fg_logps/policy_rejected": -22.452640533447266, |
|
"fg_logps/reference_KL": -20.34503746032715, |
|
"fg_logps/reference_chosen": -14.364212989807129, |
|
"fg_logps/reference_rejected": -20.861942291259766, |
|
"fg_loss": 0.8010571599006653, |
|
"fg_rewards/chosen_sum": -1.5489312410354614, |
|
"fg_rewards/rejected_sum": -0.5739867687225342, |
|
"grad_norm": 20.269756145960844, |
|
"kl": 0.0004318237188272178, |
|
"learning_rate": 4.160958904109589e-07, |
|
"logps/chosen": -284.01116415895063, |
|
"logps/rejected": -337.00729331487344, |
|
"loss": 0.4967, |
|
"rewards/chosen": -0.7318777390468267, |
|
"rewards/margins": 1.3799085359235055, |
|
"rewards/rejected": -2.1117862749703322, |
|
"step": 250 |
|
}, |
|
{ |
|
"count/fg_chosen": 14.647058486938477, |
|
"count/fg_rejected": 3.8333332538604736, |
|
"epoch": 0.26670085908449803, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.48432159423828, |
|
"fg_logps/policy_chosen": -15.442632675170898, |
|
"fg_logps/policy_rejected": -21.998336791992188, |
|
"fg_logps/reference_KL": -20.76389503479004, |
|
"fg_logps/reference_chosen": -14.410895347595215, |
|
"fg_logps/reference_rejected": -20.8482723236084, |
|
"fg_loss": 0.7149848937988281, |
|
"fg_rewards/chosen_sum": -0.9253503680229187, |
|
"fg_rewards/rejected_sum": -0.5536249279975891, |
|
"grad_norm": 13.117998935505918, |
|
"kl": 0.0, |
|
"learning_rate": 4.1038812785388125e-07, |
|
"logps/chosen": -288.13616943359375, |
|
"logps/rejected": -351.7694905598958, |
|
"loss": 0.4464, |
|
"rewards/chosen": -0.422292560338974, |
|
"rewards/margins": 1.4216915667057037, |
|
"rewards/rejected": -1.8439841270446777, |
|
"step": 260 |
|
}, |
|
{ |
|
"count/fg_chosen": 14.785714149475098, |
|
"count/fg_rejected": 3.642857074737549, |
|
"epoch": 0.27695858443390176, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -22.614913940429688, |
|
"fg_logps/policy_chosen": -16.907217025756836, |
|
"fg_logps/policy_rejected": -19.078645706176758, |
|
"fg_logps/reference_KL": -21.76515007019043, |
|
"fg_logps/reference_chosen": -16.108179092407227, |
|
"fg_logps/reference_rejected": -18.381593704223633, |
|
"fg_loss": 0.819545567035675, |
|
"fg_rewards/chosen_sum": -0.7028852105140686, |
|
"fg_rewards/rejected_sum": -0.34045127034187317, |
|
"grad_norm": 18.77208905467823, |
|
"kl": 0.0, |
|
"learning_rate": 4.046803652968037e-07, |
|
"logps/chosen": -345.2209114583333, |
|
"logps/rejected": -358.41994485294117, |
|
"loss": 0.475, |
|
"rewards/chosen": -0.8343807983398438, |
|
"rewards/margins": 1.4960064517750458, |
|
"rewards/rejected": -2.3303872501148897, |
|
"step": 270 |
|
}, |
|
{ |
|
"count/fg_chosen": 17.962963104248047, |
|
"count/fg_rejected": 5.884615421295166, |
|
"epoch": 0.28721630978330553, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -20.402196884155273, |
|
"fg_logps/policy_chosen": -15.454611778259277, |
|
"fg_logps/policy_rejected": -17.580909729003906, |
|
"fg_logps/reference_KL": -19.303863525390625, |
|
"fg_logps/reference_chosen": -14.586993217468262, |
|
"fg_logps/reference_rejected": -16.358680725097656, |
|
"fg_loss": 0.8306005597114563, |
|
"fg_rewards/chosen_sum": -1.1611218452453613, |
|
"fg_rewards/rejected_sum": -0.7079381942749023, |
|
"grad_norm": 13.80952507861355, |
|
"kl": 0.0, |
|
"learning_rate": 3.98972602739726e-07, |
|
"logps/chosen": -283.1960693359375, |
|
"logps/rejected": -412.07763671875, |
|
"loss": 0.5757, |
|
"rewards/chosen": -0.4877366542816162, |
|
"rewards/margins": 1.6564238071441653, |
|
"rewards/rejected": -2.1441604614257814, |
|
"step": 280 |
|
}, |
|
{ |
|
"count/fg_chosen": 16.549999237060547, |
|
"count/fg_rejected": 5.277777671813965, |
|
"epoch": 0.2974740351327093, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -22.114633560180664, |
|
"fg_logps/policy_chosen": -15.825154304504395, |
|
"fg_logps/policy_rejected": -20.11035919189453, |
|
"fg_logps/reference_KL": -21.259445190429688, |
|
"fg_logps/reference_chosen": -15.046856880187988, |
|
"fg_logps/reference_rejected": -19.045265197753906, |
|
"fg_loss": 0.8168869614601135, |
|
"fg_rewards/chosen_sum": -0.7166315913200378, |
|
"fg_rewards/rejected_sum": -0.4433734118938446, |
|
"grad_norm": 14.177325128931766, |
|
"kl": 0.007422161288559437, |
|
"learning_rate": 3.9326484018264836e-07, |
|
"logps/chosen": -319.059375, |
|
"logps/rejected": -395.9953125, |
|
"loss": 0.5099, |
|
"rewards/chosen": -0.6542438761393229, |
|
"rewards/margins": 1.4790777288698682, |
|
"rewards/rejected": -2.133321605009191, |
|
"step": 290 |
|
}, |
|
{ |
|
"count/fg_chosen": 13.82608699798584, |
|
"count/fg_rejected": 4.7727274894714355, |
|
"epoch": 0.3077317604821131, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.241731643676758, |
|
"fg_logps/policy_chosen": -16.656099319458008, |
|
"fg_logps/policy_rejected": -18.52686882019043, |
|
"fg_logps/reference_KL": -20.161357879638672, |
|
"fg_logps/reference_chosen": -15.559574127197266, |
|
"fg_logps/reference_rejected": -17.188322067260742, |
|
"fg_loss": 0.8407912850379944, |
|
"fg_rewards/chosen_sum": -1.1080502271652222, |
|
"fg_rewards/rejected_sum": -0.7194547057151794, |
|
"grad_norm": 11.479402820786039, |
|
"kl": 0.0, |
|
"learning_rate": 3.875570776255708e-07, |
|
"logps/chosen": -260.6196986607143, |
|
"logps/rejected": -403.26040097891564, |
|
"loss": 0.547, |
|
"rewards/chosen": -1.1976228193803267, |
|
"rewards/margins": 1.5034547452728173, |
|
"rewards/rejected": -2.701077564653144, |
|
"step": 300 |
|
}, |
|
{ |
|
"count/fg_chosen": 17.0, |
|
"count/fg_rejected": 5.5333333015441895, |
|
"epoch": 0.31798948583151687, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.364051818847656, |
|
"fg_logps/policy_chosen": -15.91810417175293, |
|
"fg_logps/policy_rejected": -18.95937728881836, |
|
"fg_logps/reference_KL": -20.408035278320312, |
|
"fg_logps/reference_chosen": -14.995894432067871, |
|
"fg_logps/reference_rejected": -17.87520408630371, |
|
"fg_loss": 0.8350895047187805, |
|
"fg_rewards/chosen_sum": -1.1522879600524902, |
|
"fg_rewards/rejected_sum": -0.6541081070899963, |
|
"grad_norm": 12.005685770721653, |
|
"kl": 0.05910682678222656, |
|
"learning_rate": 3.8184931506849315e-07, |
|
"logps/chosen": -314.4315476190476, |
|
"logps/rejected": -347.2052580180921, |
|
"loss": 0.5311, |
|
"rewards/chosen": -0.8637207576206752, |
|
"rewards/margins": 1.5000286962752951, |
|
"rewards/rejected": -2.3637494538959705, |
|
"step": 310 |
|
}, |
|
{ |
|
"count/fg_chosen": 14.047618865966797, |
|
"count/fg_rejected": 5.5, |
|
"epoch": 0.32824721118092065, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.606046676635742, |
|
"fg_logps/policy_chosen": -15.302529335021973, |
|
"fg_logps/policy_rejected": -20.181062698364258, |
|
"fg_logps/reference_KL": -20.86503791809082, |
|
"fg_logps/reference_chosen": -14.479813575744629, |
|
"fg_logps/reference_rejected": -19.489940643310547, |
|
"fg_loss": 0.7766517996788025, |
|
"fg_rewards/chosen_sum": -0.8992184400558472, |
|
"fg_rewards/rejected_sum": -0.4495273232460022, |
|
"grad_norm": 16.284183960507036, |
|
"kl": 0.0, |
|
"learning_rate": 3.761415525114155e-07, |
|
"logps/chosen": -260.18896484375, |
|
"logps/rejected": -417.2262073863636, |
|
"loss": 0.4704, |
|
"rewards/chosen": -0.6337332195705838, |
|
"rewards/margins": 1.9360473035561916, |
|
"rewards/rejected": -2.5697805231267754, |
|
"step": 320 |
|
}, |
|
{ |
|
"count/fg_chosen": 16.22222137451172, |
|
"count/fg_rejected": 3.3333332538604736, |
|
"epoch": 0.3385049365303244, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -20.270763397216797, |
|
"fg_logps/policy_chosen": -16.191619873046875, |
|
"fg_logps/policy_rejected": -19.492420196533203, |
|
"fg_logps/reference_KL": -19.561973571777344, |
|
"fg_logps/reference_chosen": -15.197806358337402, |
|
"fg_logps/reference_rejected": -18.365306854248047, |
|
"fg_loss": 0.7132466435432434, |
|
"fg_rewards/chosen_sum": -1.5309462547302246, |
|
"fg_rewards/rejected_sum": -0.2937175929546356, |
|
"grad_norm": 7.818574791129848, |
|
"kl": 0.0, |
|
"learning_rate": 3.704337899543379e-07, |
|
"logps/chosen": -285.40846946022725, |
|
"logps/rejected": -327.64527925531917, |
|
"loss": 0.4679, |
|
"rewards/chosen": -1.0337238889752012, |
|
"rewards/margins": 1.5292926372673188, |
|
"rewards/rejected": -2.56301652624252, |
|
"step": 330 |
|
}, |
|
{ |
|
"count/fg_chosen": 16.36842155456543, |
|
"count/fg_rejected": 8.75, |
|
"epoch": 0.34876266187972815, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -18.425670623779297, |
|
"fg_logps/policy_chosen": -13.722508430480957, |
|
"fg_logps/policy_rejected": -16.77787971496582, |
|
"fg_logps/reference_KL": -17.679426193237305, |
|
"fg_logps/reference_chosen": -13.155657768249512, |
|
"fg_logps/reference_rejected": -15.926578521728516, |
|
"fg_loss": 0.7438153624534607, |
|
"fg_rewards/chosen_sum": -0.5837015509605408, |
|
"fg_rewards/rejected_sum": -0.5333263874053955, |
|
"grad_norm": 9.802323506414373, |
|
"kl": 0.0, |
|
"learning_rate": 3.6472602739726025e-07, |
|
"logps/chosen": -300.561328125, |
|
"logps/rejected": -314.146240234375, |
|
"loss": 0.5005, |
|
"rewards/chosen": -0.9040058135986329, |
|
"rewards/margins": 1.4882362365722657, |
|
"rewards/rejected": -2.3922420501708985, |
|
"step": 340 |
|
}, |
|
{ |
|
"count/fg_chosen": 16.14285659790039, |
|
"count/fg_rejected": 5.315789699554443, |
|
"epoch": 0.35902038722913193, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -20.929489135742188, |
|
"fg_logps/policy_chosen": -15.678800582885742, |
|
"fg_logps/policy_rejected": -19.949668884277344, |
|
"fg_logps/reference_KL": -19.97319984436035, |
|
"fg_logps/reference_chosen": -14.735365867614746, |
|
"fg_logps/reference_rejected": -18.699697494506836, |
|
"fg_loss": 0.9030398726463318, |
|
"fg_rewards/chosen_sum": -1.2398407459259033, |
|
"fg_rewards/rejected_sum": -0.8212930560112, |
|
"grad_norm": 8.643717223258967, |
|
"kl": 0.0, |
|
"learning_rate": 3.590182648401826e-07, |
|
"logps/chosen": -351.5288783482143, |
|
"logps/rejected": -359.00362356085526, |
|
"loss": 0.5488, |
|
"rewards/chosen": -1.1266803741455078, |
|
"rewards/margins": 1.3310967495566919, |
|
"rewards/rejected": -2.4577771237021997, |
|
"step": 350 |
|
}, |
|
{ |
|
"count/fg_chosen": 14.214285850524902, |
|
"count/fg_rejected": 2.6666667461395264, |
|
"epoch": 0.3692781125785357, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -23.768299102783203, |
|
"fg_logps/policy_chosen": -16.969907760620117, |
|
"fg_logps/policy_rejected": -20.13111114501953, |
|
"fg_logps/reference_KL": -22.658279418945312, |
|
"fg_logps/reference_chosen": -15.871198654174805, |
|
"fg_logps/reference_rejected": -18.806407928466797, |
|
"fg_loss": 0.7307645678520203, |
|
"fg_rewards/chosen_sum": -0.9898194670677185, |
|
"fg_rewards/rejected_sum": -0.20603026449680328, |
|
"grad_norm": 8.142135660669938, |
|
"kl": 0.0, |
|
"learning_rate": 3.53310502283105e-07, |
|
"logps/chosen": -282.40988869863014, |
|
"logps/rejected": -349.8757632902299, |
|
"loss": 0.464, |
|
"rewards/chosen": -1.211091551062179, |
|
"rewards/margins": 1.277019747597573, |
|
"rewards/rejected": -2.488111298659752, |
|
"step": 360 |
|
}, |
|
{ |
|
"count/fg_chosen": 15.15999984741211, |
|
"count/fg_rejected": 3.4166667461395264, |
|
"epoch": 0.3795358379279395, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.647724151611328, |
|
"fg_logps/policy_chosen": -15.575891494750977, |
|
"fg_logps/policy_rejected": -18.71537971496582, |
|
"fg_logps/reference_KL": -20.737442016601562, |
|
"fg_logps/reference_chosen": -14.95889663696289, |
|
"fg_logps/reference_rejected": -18.047760009765625, |
|
"fg_loss": 0.7305776476860046, |
|
"fg_rewards/chosen_sum": -0.11250638961791992, |
|
"fg_rewards/rejected_sum": -0.22363699972629547, |
|
"grad_norm": 10.677432473272551, |
|
"kl": 0.09735889732837677, |
|
"learning_rate": 3.476027397260274e-07, |
|
"logps/chosen": -302.0405040922619, |
|
"logps/rejected": -364.09403268914474, |
|
"loss": 0.5405, |
|
"rewards/chosen": -0.9889777047293526, |
|
"rewards/margins": 1.4359847907733199, |
|
"rewards/rejected": -2.4249624955026725, |
|
"step": 370 |
|
}, |
|
{ |
|
"count/fg_chosen": 16.35714340209961, |
|
"count/fg_rejected": 4.461538314819336, |
|
"epoch": 0.38979356327734327, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -23.035646438598633, |
|
"fg_logps/policy_chosen": -16.462678909301758, |
|
"fg_logps/policy_rejected": -19.290132522583008, |
|
"fg_logps/reference_KL": -22.22718620300293, |
|
"fg_logps/reference_chosen": -16.05594825744629, |
|
"fg_logps/reference_rejected": -18.451143264770508, |
|
"fg_loss": 0.772994875907898, |
|
"fg_rewards/chosen_sum": -0.47333768010139465, |
|
"fg_rewards/rejected_sum": -0.43138498067855835, |
|
"grad_norm": 12.589055564062804, |
|
"kl": 0.0, |
|
"learning_rate": 3.418949771689498e-07, |
|
"logps/chosen": -303.3091415777439, |
|
"logps/rejected": -422.3029346955128, |
|
"loss": 0.4738, |
|
"rewards/chosen": -1.0245074760623094, |
|
"rewards/margins": 1.969195730317899, |
|
"rewards/rejected": -2.9937032063802085, |
|
"step": 380 |
|
}, |
|
{ |
|
"count/fg_chosen": 15.047618865966797, |
|
"count/fg_rejected": 3.6666667461395264, |
|
"epoch": 0.400051288626747, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -22.204946517944336, |
|
"fg_logps/policy_chosen": -16.388538360595703, |
|
"fg_logps/policy_rejected": -19.46855926513672, |
|
"fg_logps/reference_KL": -21.356998443603516, |
|
"fg_logps/reference_chosen": -15.460061073303223, |
|
"fg_logps/reference_rejected": -17.982017517089844, |
|
"fg_loss": 0.7031415700912476, |
|
"fg_rewards/chosen_sum": -1.0803974866867065, |
|
"fg_rewards/rejected_sum": -0.587272047996521, |
|
"grad_norm": 13.352594752925569, |
|
"kl": 0.013134384527802467, |
|
"learning_rate": 3.361872146118721e-07, |
|
"logps/chosen": -344.073916153169, |
|
"logps/rejected": -408.09998244382024, |
|
"loss": 0.5212, |
|
"rewards/chosen": -1.473697286256602, |
|
"rewards/margins": 1.0633102491849358, |
|
"rewards/rejected": -2.537007535441538, |
|
"step": 390 |
|
}, |
|
{ |
|
"count/fg_chosen": 14.714285850524902, |
|
"count/fg_rejected": 3.444444417953491, |
|
"epoch": 0.41030901397615077, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.212703704833984, |
|
"fg_logps/policy_chosen": -15.624214172363281, |
|
"fg_logps/policy_rejected": -18.56574249267578, |
|
"fg_logps/reference_KL": -20.04758071899414, |
|
"fg_logps/reference_chosen": -14.2003173828125, |
|
"fg_logps/reference_rejected": -17.362810134887695, |
|
"fg_loss": 0.7282993197441101, |
|
"fg_rewards/chosen_sum": -1.263887643814087, |
|
"fg_rewards/rejected_sum": -0.6069123148918152, |
|
"grad_norm": 10.430038786087746, |
|
"kl": 0.0, |
|
"learning_rate": 3.304794520547945e-07, |
|
"logps/chosen": -331.27660778985506, |
|
"logps/rejected": -399.446042239011, |
|
"loss": 0.4495, |
|
"rewards/chosen": -0.9196471615114074, |
|
"rewards/margins": 1.7936450546952378, |
|
"rewards/rejected": -2.713292216206645, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.41030901397615077, |
|
"eval_count/fg_chosen": 16.01369857788086, |
|
"eval_count/fg_rejected": 4.0824174880981445, |
|
"eval_fg_kl": NaN, |
|
"eval_fg_logps/policy_KL": -21.182559967041016, |
|
"eval_fg_logps/policy_chosen": -16.106983184814453, |
|
"eval_fg_logps/policy_rejected": -20.267147064208984, |
|
"eval_fg_logps/reference_KL": -20.207021713256836, |
|
"eval_fg_logps/reference_chosen": -14.929487228393555, |
|
"eval_fg_logps/reference_rejected": -18.78682518005371, |
|
"eval_fg_loss": 0.744895875453949, |
|
"eval_fg_rewards/chosen_sum": -1.3183764219284058, |
|
"eval_fg_rewards/rejected_sum": -0.5732491612434387, |
|
"eval_kl": 0.005370728671550751, |
|
"eval_logps/chosen": -303.5075557511737, |
|
"eval_logps/rejected": -365.12120890022675, |
|
"eval_loss": 0.4977550208568573, |
|
"eval_rewards/chosen": -1.0396734246625587, |
|
"eval_rewards/margins": 1.678511342733983, |
|
"eval_rewards/rejected": -2.718184767396542, |
|
"eval_runtime": 806.8942, |
|
"eval_samples_per_second": 2.148, |
|
"eval_steps_per_second": 1.074, |
|
"step": 400 |
|
}, |
|
{ |
|
"count/fg_chosen": 13.666666984558105, |
|
"count/fg_rejected": 3.3181817531585693, |
|
"epoch": 0.42056673932555455, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -26.42474937438965, |
|
"fg_logps/policy_chosen": -20.587556838989258, |
|
"fg_logps/policy_rejected": -19.216453552246094, |
|
"fg_logps/reference_KL": -25.272415161132812, |
|
"fg_logps/reference_chosen": -18.695642471313477, |
|
"fg_logps/reference_rejected": -16.948997497558594, |
|
"fg_loss": 0.8068851232528687, |
|
"fg_rewards/chosen_sum": -1.4713115692138672, |
|
"fg_rewards/rejected_sum": -0.7158498167991638, |
|
"grad_norm": 7.617410625793688, |
|
"kl": 0.0, |
|
"learning_rate": 3.247716894977169e-07, |
|
"logps/chosen": -285.80265184859155, |
|
"logps/rejected": -366.92332338483146, |
|
"loss": 0.4728, |
|
"rewards/chosen": -0.8456243058325539, |
|
"rewards/margins": 1.8824731597683062, |
|
"rewards/rejected": -2.72809746560086, |
|
"step": 410 |
|
}, |
|
{ |
|
"count/fg_chosen": 15.4375, |
|
"count/fg_rejected": 3.799999952316284, |
|
"epoch": 0.43082446467495833, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.31452178955078, |
|
"fg_logps/policy_chosen": -16.454280853271484, |
|
"fg_logps/policy_rejected": -22.019365310668945, |
|
"fg_logps/reference_KL": -20.437578201293945, |
|
"fg_logps/reference_chosen": -15.224520683288574, |
|
"fg_logps/reference_rejected": -20.551807403564453, |
|
"fg_loss": 0.8755480647087097, |
|
"fg_rewards/chosen_sum": -1.7564505338668823, |
|
"fg_rewards/rejected_sum": -0.6421544551849365, |
|
"grad_norm": 14.132450428785305, |
|
"kl": 0.0, |
|
"learning_rate": 3.1906392694063925e-07, |
|
"logps/chosen": -319.19919463734567, |
|
"logps/rejected": -400.0357001582278, |
|
"loss": 0.494, |
|
"rewards/chosen": -1.2088648478190105, |
|
"rewards/margins": 1.8420809653237902, |
|
"rewards/rejected": -3.0509458131428007, |
|
"step": 420 |
|
}, |
|
{ |
|
"count/fg_chosen": 15.55555534362793, |
|
"count/fg_rejected": 3.8235294818878174, |
|
"epoch": 0.4410821900243621, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.4989013671875, |
|
"fg_logps/policy_chosen": -16.78192138671875, |
|
"fg_logps/policy_rejected": -19.125896453857422, |
|
"fg_logps/reference_KL": -20.556718826293945, |
|
"fg_logps/reference_chosen": -15.185633659362793, |
|
"fg_logps/reference_rejected": -17.72509002685547, |
|
"fg_loss": 0.8559948205947876, |
|
"fg_rewards/chosen_sum": -1.8476042747497559, |
|
"fg_rewards/rejected_sum": -0.710954487323761, |
|
"grad_norm": 11.780117349913754, |
|
"kl": 0.0, |
|
"learning_rate": 3.133561643835616e-07, |
|
"logps/chosen": -301.3424876143293, |
|
"logps/rejected": -394.38636818910254, |
|
"loss": 0.4855, |
|
"rewards/chosen": -0.9122947134622713, |
|
"rewards/margins": 2.028106999591114, |
|
"rewards/rejected": -2.9404017130533853, |
|
"step": 430 |
|
}, |
|
{ |
|
"count/fg_chosen": 16.0, |
|
"count/fg_rejected": 4.75, |
|
"epoch": 0.4513399153737659, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -18.407926559448242, |
|
"fg_logps/policy_chosen": -14.23660945892334, |
|
"fg_logps/policy_rejected": -19.2972354888916, |
|
"fg_logps/reference_KL": -17.730350494384766, |
|
"fg_logps/reference_chosen": -13.197031021118164, |
|
"fg_logps/reference_rejected": -17.424985885620117, |
|
"fg_loss": 0.9003034234046936, |
|
"fg_rewards/chosen_sum": -1.1896860599517822, |
|
"fg_rewards/rejected_sum": -0.8951053023338318, |
|
"grad_norm": 13.19151786370417, |
|
"kl": 0.0, |
|
"learning_rate": 3.07648401826484e-07, |
|
"logps/chosen": -381.12164834104937, |
|
"logps/rejected": -368.0486797863924, |
|
"loss": 0.4868, |
|
"rewards/chosen": -1.618426099235629, |
|
"rewards/margins": 0.9258255479409927, |
|
"rewards/rejected": -2.5442516471766217, |
|
"step": 440 |
|
}, |
|
{ |
|
"count/fg_chosen": 14.588234901428223, |
|
"count/fg_rejected": 4.25, |
|
"epoch": 0.4615976407231696, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.48902130126953, |
|
"fg_logps/policy_chosen": -16.183223724365234, |
|
"fg_logps/policy_rejected": -20.021495819091797, |
|
"fg_logps/reference_KL": -20.65131950378418, |
|
"fg_logps/reference_chosen": -15.696285247802734, |
|
"fg_logps/reference_rejected": -18.851573944091797, |
|
"fg_loss": 0.8272340297698975, |
|
"fg_rewards/chosen_sum": -0.4996866285800934, |
|
"fg_rewards/rejected_sum": -0.5080854296684265, |
|
"grad_norm": 11.534503591617172, |
|
"kl": 0.038506411015987396, |
|
"learning_rate": 3.019406392694064e-07, |
|
"logps/chosen": -257.8545692845395, |
|
"logps/rejected": -389.2383277529762, |
|
"loss": 0.4618, |
|
"rewards/chosen": -1.0474451968544407, |
|
"rewards/margins": 1.9993883864323896, |
|
"rewards/rejected": -3.0468335832868303, |
|
"step": 450 |
|
}, |
|
{ |
|
"count/fg_chosen": 13.0, |
|
"count/fg_rejected": 3.5999999046325684, |
|
"epoch": 0.4718553660725734, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -25.045570373535156, |
|
"fg_logps/policy_chosen": -18.018939971923828, |
|
"fg_logps/policy_rejected": -24.219295501708984, |
|
"fg_logps/reference_KL": -23.617990493774414, |
|
"fg_logps/reference_chosen": -16.746540069580078, |
|
"fg_logps/reference_rejected": -22.393386840820312, |
|
"fg_loss": 0.7469481825828552, |
|
"fg_rewards/chosen_sum": -0.8979846239089966, |
|
"fg_rewards/rejected_sum": -0.5428987145423889, |
|
"grad_norm": 9.60552438901234, |
|
"kl": 0.045855142176151276, |
|
"learning_rate": 2.9623287671232877e-07, |
|
"logps/chosen": -297.8484563253012, |
|
"logps/rejected": -350.15962357954544, |
|
"loss": 0.4823, |
|
"rewards/chosen": -1.071040233933782, |
|
"rewards/margins": 1.3700017867896352, |
|
"rewards/rejected": -2.441042020723417, |
|
"step": 460 |
|
}, |
|
{ |
|
"count/fg_chosen": 18.479999542236328, |
|
"count/fg_rejected": 2.7727272510528564, |
|
"epoch": 0.48211309142197717, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -18.86458969116211, |
|
"fg_logps/policy_chosen": -13.49354362487793, |
|
"fg_logps/policy_rejected": -17.55750274658203, |
|
"fg_logps/reference_KL": -18.0754337310791, |
|
"fg_logps/reference_chosen": -13.031634330749512, |
|
"fg_logps/reference_rejected": -17.194589614868164, |
|
"fg_loss": 0.6403311491012573, |
|
"fg_rewards/chosen_sum": -0.32538533210754395, |
|
"fg_rewards/rejected_sum": -0.19457949697971344, |
|
"grad_norm": 10.665362097968403, |
|
"kl": 0.0, |
|
"learning_rate": 2.905251141552511e-07, |
|
"logps/chosen": -284.00316540948273, |
|
"logps/rejected": -342.78692208904107, |
|
"loss": 0.5004, |
|
"rewards/chosen": -0.6770254990150189, |
|
"rewards/margins": 2.199749119055401, |
|
"rewards/rejected": -2.8767746180704195, |
|
"step": 470 |
|
}, |
|
{ |
|
"count/fg_chosen": 15.523809432983398, |
|
"count/fg_rejected": 4.5714287757873535, |
|
"epoch": 0.49237081677138095, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -20.194419860839844, |
|
"fg_logps/policy_chosen": -15.528361320495605, |
|
"fg_logps/policy_rejected": -17.91610336303711, |
|
"fg_logps/reference_KL": -19.58913803100586, |
|
"fg_logps/reference_chosen": -15.298972129821777, |
|
"fg_logps/reference_rejected": -17.643596649169922, |
|
"fg_loss": 0.7117694616317749, |
|
"fg_rewards/chosen_sum": 0.15593282878398895, |
|
"fg_rewards/rejected_sum": -0.3202955722808838, |
|
"grad_norm": 8.89018776362184, |
|
"kl": 0.0, |
|
"learning_rate": 2.848173515981735e-07, |
|
"logps/chosen": -366.11363389756946, |
|
"logps/rejected": -336.78042879971593, |
|
"loss": 0.5079, |
|
"rewards/chosen": -1.2623120413886175, |
|
"rewards/margins": 1.1003893746270075, |
|
"rewards/rejected": -2.362701416015625, |
|
"step": 480 |
|
}, |
|
{ |
|
"count/fg_chosen": 18.47058868408203, |
|
"count/fg_rejected": 2.3333332538604736, |
|
"epoch": 0.5026285421207847, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -22.13791275024414, |
|
"fg_logps/policy_chosen": -15.788673400878906, |
|
"fg_logps/policy_rejected": -16.09772491455078, |
|
"fg_logps/reference_KL": -21.38779640197754, |
|
"fg_logps/reference_chosen": -15.412402153015137, |
|
"fg_logps/reference_rejected": -15.27255916595459, |
|
"fg_loss": 0.5969161987304688, |
|
"fg_rewards/chosen_sum": -0.09672492742538452, |
|
"fg_rewards/rejected_sum": -0.2052246332168579, |
|
"grad_norm": 9.190876919128005, |
|
"kl": 0.0, |
|
"learning_rate": 2.791095890410959e-07, |
|
"logps/chosen": -293.7143950591216, |
|
"logps/rejected": -421.82026707848837, |
|
"loss": 0.454, |
|
"rewards/chosen": -0.9402487471296981, |
|
"rewards/margins": 2.0355014884943934, |
|
"rewards/rejected": -2.9757502356240915, |
|
"step": 490 |
|
}, |
|
{ |
|
"count/fg_chosen": 19.263158798217773, |
|
"count/fg_rejected": 3.1764705181121826, |
|
"epoch": 0.5128862674701885, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -18.44475746154785, |
|
"fg_logps/policy_chosen": -13.578866958618164, |
|
"fg_logps/policy_rejected": -16.77104949951172, |
|
"fg_logps/reference_KL": -18.07316780090332, |
|
"fg_logps/reference_chosen": -13.79211139678955, |
|
"fg_logps/reference_rejected": -16.677942276000977, |
|
"fg_loss": 0.8001216650009155, |
|
"fg_rewards/chosen_sum": 0.6667798161506653, |
|
"fg_rewards/rejected_sum": -0.12318305671215057, |
|
"grad_norm": 11.958302083795623, |
|
"kl": 0.06755809485912323, |
|
"learning_rate": 2.734018264840183e-07, |
|
"logps/chosen": -306.4125142911585, |
|
"logps/rejected": -373.8045873397436, |
|
"loss": 0.4904, |
|
"rewards/chosen": -0.8678309742997332, |
|
"rewards/margins": 1.8353534958525104, |
|
"rewards/rejected": -2.7031844701522436, |
|
"step": 500 |
|
}, |
|
{ |
|
"count/fg_chosen": 17.157894134521484, |
|
"count/fg_rejected": 3.8888888359069824, |
|
"epoch": 0.5231439928195922, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -20.518945693969727, |
|
"fg_logps/policy_chosen": -14.510261535644531, |
|
"fg_logps/policy_rejected": -19.071426391601562, |
|
"fg_logps/reference_KL": -19.78034782409668, |
|
"fg_logps/reference_chosen": -15.087010383605957, |
|
"fg_logps/reference_rejected": -18.8708553314209, |
|
"fg_loss": 0.7365949749946594, |
|
"fg_rewards/chosen_sum": 1.3324851989746094, |
|
"fg_rewards/rejected_sum": -0.13402292132377625, |
|
"grad_norm": 11.840579393816673, |
|
"kl": 0.0, |
|
"learning_rate": 2.676940639269406e-07, |
|
"logps/chosen": -276.75473257211536, |
|
"logps/rejected": -376.3443216463415, |
|
"loss": 0.4727, |
|
"rewards/chosen": -0.42943524091671675, |
|
"rewards/margins": 2.161930108085284, |
|
"rewards/rejected": -2.5913653490020008, |
|
"step": 510 |
|
}, |
|
{ |
|
"count/fg_chosen": 14.416666984558105, |
|
"count/fg_rejected": 4.473684310913086, |
|
"epoch": 0.5334017181689961, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -23.022239685058594, |
|
"fg_logps/policy_chosen": -15.866630554199219, |
|
"fg_logps/policy_rejected": -18.78508949279785, |
|
"fg_logps/reference_KL": -21.714487075805664, |
|
"fg_logps/reference_chosen": -15.187236785888672, |
|
"fg_logps/reference_rejected": -18.08281898498535, |
|
"fg_loss": 0.7238275408744812, |
|
"fg_rewards/chosen_sum": -0.16382475197315216, |
|
"fg_rewards/rejected_sum": -0.5171225070953369, |
|
"grad_norm": 12.176540624619156, |
|
"kl": 0.04069461673498154, |
|
"learning_rate": 2.61986301369863e-07, |
|
"logps/chosen": -308.16112012987014, |
|
"logps/rejected": -321.8852362575301, |
|
"loss": 0.5229, |
|
"rewards/chosen": -0.8690564044110187, |
|
"rewards/margins": 1.6282350686099714, |
|
"rewards/rejected": -2.49729147302099, |
|
"step": 520 |
|
}, |
|
{ |
|
"count/fg_chosen": 16.81818199157715, |
|
"count/fg_rejected": 3.588235378265381, |
|
"epoch": 0.5436594435183998, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -20.363672256469727, |
|
"fg_logps/policy_chosen": -14.961847305297852, |
|
"fg_logps/policy_rejected": -21.47898292541504, |
|
"fg_logps/reference_KL": -19.768022537231445, |
|
"fg_logps/reference_chosen": -14.601174354553223, |
|
"fg_logps/reference_rejected": -20.11042594909668, |
|
"fg_loss": 0.6309139728546143, |
|
"fg_rewards/chosen_sum": 0.3970043957233429, |
|
"fg_rewards/rejected_sum": -0.6216399669647217, |
|
"grad_norm": 9.438462165579672, |
|
"kl": 0.017117690294981003, |
|
"learning_rate": 2.562785388127854e-07, |
|
"logps/chosen": -286.2251333841463, |
|
"logps/rejected": -331.3464793669872, |
|
"loss": 0.512, |
|
"rewards/chosen": -1.0500634356242855, |
|
"rewards/margins": 1.6451627935298612, |
|
"rewards/rejected": -2.6952262291541467, |
|
"step": 530 |
|
}, |
|
{ |
|
"count/fg_chosen": 15.850000381469727, |
|
"count/fg_rejected": 2.6875, |
|
"epoch": 0.5539171688678035, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -19.712533950805664, |
|
"fg_logps/policy_chosen": -14.07727336883545, |
|
"fg_logps/policy_rejected": -18.43448829650879, |
|
"fg_logps/reference_KL": -18.728797912597656, |
|
"fg_logps/reference_chosen": -13.999521255493164, |
|
"fg_logps/reference_rejected": -17.450742721557617, |
|
"fg_loss": 0.6835896372795105, |
|
"fg_rewards/chosen_sum": 0.26210981607437134, |
|
"fg_rewards/rejected_sum": -0.32241931557655334, |
|
"grad_norm": 12.291319609660407, |
|
"kl": 0.0, |
|
"learning_rate": 2.5057077625570777e-07, |
|
"logps/chosen": -306.774658203125, |
|
"logps/rejected": -356.60145399305554, |
|
"loss": 0.5117, |
|
"rewards/chosen": -0.8661181709983132, |
|
"rewards/margins": 1.7041260883061573, |
|
"rewards/rejected": -2.5702442593044705, |
|
"step": 540 |
|
}, |
|
{ |
|
"count/fg_chosen": 18.714284896850586, |
|
"count/fg_rejected": 4.722222328186035, |
|
"epoch": 0.5641748942172073, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -19.954633712768555, |
|
"fg_logps/policy_chosen": -13.810935974121094, |
|
"fg_logps/policy_rejected": -17.9754638671875, |
|
"fg_logps/reference_KL": -19.242063522338867, |
|
"fg_logps/reference_chosen": -13.879505157470703, |
|
"fg_logps/reference_rejected": -17.572124481201172, |
|
"fg_loss": 0.7275994420051575, |
|
"fg_rewards/chosen_sum": 0.31742504239082336, |
|
"fg_rewards/rejected_sum": -0.11265398561954498, |
|
"grad_norm": 8.68692731946405, |
|
"kl": 0.03491802141070366, |
|
"learning_rate": 2.4486301369863014e-07, |
|
"logps/chosen": -307.91368272569446, |
|
"logps/rejected": -344.2369939630682, |
|
"loss": 0.4965, |
|
"rewards/chosen": -0.7141922314961752, |
|
"rewards/margins": 1.454848968621456, |
|
"rewards/rejected": -2.1690412001176314, |
|
"step": 550 |
|
}, |
|
{ |
|
"count/fg_chosen": 17.190475463867188, |
|
"count/fg_rejected": 3.5, |
|
"epoch": 0.5744326195666111, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -19.567123413085938, |
|
"fg_logps/policy_chosen": -14.335834503173828, |
|
"fg_logps/policy_rejected": -19.222152709960938, |
|
"fg_logps/reference_KL": -19.11014747619629, |
|
"fg_logps/reference_chosen": -14.28973388671875, |
|
"fg_logps/reference_rejected": -18.501300811767578, |
|
"fg_loss": 0.7595401406288147, |
|
"fg_rewards/chosen_sum": 0.21668264269828796, |
|
"fg_rewards/rejected_sum": -0.24018199741840363, |
|
"grad_norm": 13.295487655176196, |
|
"kl": 0.01426544226706028, |
|
"learning_rate": 2.391552511415525e-07, |
|
"logps/chosen": -340.173639871988, |
|
"logps/rejected": -346.4476207386364, |
|
"loss": 0.5126, |
|
"rewards/chosen": -0.6746720463396555, |
|
"rewards/margins": 1.3085595353111157, |
|
"rewards/rejected": -1.983231581650771, |
|
"step": 560 |
|
}, |
|
{ |
|
"count/fg_chosen": 15.61111068725586, |
|
"count/fg_rejected": 3.230769157409668, |
|
"epoch": 0.5846903449160149, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -20.59079933166504, |
|
"fg_logps/policy_chosen": -15.83668327331543, |
|
"fg_logps/policy_rejected": -20.33367919921875, |
|
"fg_logps/reference_KL": -19.622900009155273, |
|
"fg_logps/reference_chosen": -14.985246658325195, |
|
"fg_logps/reference_rejected": -19.151033401489258, |
|
"fg_loss": 0.7142822742462158, |
|
"fg_rewards/chosen_sum": -1.0015321969985962, |
|
"fg_rewards/rejected_sum": -0.4118236005306244, |
|
"grad_norm": 14.880021607580103, |
|
"kl": 0.0, |
|
"learning_rate": 2.3344748858447487e-07, |
|
"logps/chosen": -321.2857349537037, |
|
"logps/rejected": -375.5545638844937, |
|
"loss": 0.4752, |
|
"rewards/chosen": -1.0196603845666956, |
|
"rewards/margins": 1.8235435825639412, |
|
"rewards/rejected": -2.843203967130637, |
|
"step": 570 |
|
}, |
|
{ |
|
"count/fg_chosen": 14.615385055541992, |
|
"count/fg_rejected": 3.2727272510528564, |
|
"epoch": 0.5949480702654186, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -22.73748779296875, |
|
"fg_logps/policy_chosen": -16.519458770751953, |
|
"fg_logps/policy_rejected": -19.338417053222656, |
|
"fg_logps/reference_KL": -21.60148048400879, |
|
"fg_logps/reference_chosen": -15.273176193237305, |
|
"fg_logps/reference_rejected": -18.030052185058594, |
|
"fg_loss": 0.7020931243896484, |
|
"fg_rewards/chosen_sum": -1.3088314533233643, |
|
"fg_rewards/rejected_sum": -0.47078707814216614, |
|
"grad_norm": 16.46811452693527, |
|
"kl": 0.010451125912368298, |
|
"learning_rate": 2.2773972602739724e-07, |
|
"logps/chosen": -263.2815152391975, |
|
"logps/rejected": -350.35057357594934, |
|
"loss": 0.531, |
|
"rewards/chosen": -1.0503113358109086, |
|
"rewards/margins": 1.5224891194031933, |
|
"rewards/rejected": -2.572800455214102, |
|
"step": 580 |
|
}, |
|
{ |
|
"count/fg_chosen": 15.391304016113281, |
|
"count/fg_rejected": 3.941176414489746, |
|
"epoch": 0.6052057956148225, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -22.143524169921875, |
|
"fg_logps/policy_chosen": -16.308269500732422, |
|
"fg_logps/policy_rejected": -21.572189331054688, |
|
"fg_logps/reference_KL": -20.941722869873047, |
|
"fg_logps/reference_chosen": -15.216353416442871, |
|
"fg_logps/reference_rejected": -19.638399124145508, |
|
"fg_loss": 0.7051105499267578, |
|
"fg_rewards/chosen_sum": -1.149840235710144, |
|
"fg_rewards/rejected_sum": -0.7558993101119995, |
|
"grad_norm": 10.114785462372456, |
|
"kl": 0.0, |
|
"learning_rate": 2.2203196347031963e-07, |
|
"logps/chosen": -397.5863940746753, |
|
"logps/rejected": -363.87947100903614, |
|
"loss": 0.4982, |
|
"rewards/chosen": -1.2836351270799513, |
|
"rewards/margins": 1.5669618408699733, |
|
"rewards/rejected": -2.8505969679499246, |
|
"step": 590 |
|
}, |
|
{ |
|
"count/fg_chosen": 13.0, |
|
"count/fg_rejected": 3.461538553237915, |
|
"epoch": 0.6154635209642262, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.54435920715332, |
|
"fg_logps/policy_chosen": -16.62588119506836, |
|
"fg_logps/policy_rejected": -24.728574752807617, |
|
"fg_logps/reference_KL": -20.335765838623047, |
|
"fg_logps/reference_chosen": -15.187963485717773, |
|
"fg_logps/reference_rejected": -22.167306900024414, |
|
"fg_loss": 0.6712305545806885, |
|
"fg_rewards/chosen_sum": -1.496664047241211, |
|
"fg_rewards/rejected_sum": -0.9014762043952942, |
|
"grad_norm": 13.343339163568743, |
|
"kl": 0.0, |
|
"learning_rate": 2.16324200913242e-07, |
|
"logps/chosen": -308.1182432432432, |
|
"logps/rejected": -434.2431413517442, |
|
"loss": 0.4397, |
|
"rewards/chosen": -0.7798370155128272, |
|
"rewards/margins": 2.0589501520584848, |
|
"rewards/rejected": -2.838787167571312, |
|
"step": 600 |
|
}, |
|
{ |
|
"count/fg_chosen": 14.222222328186035, |
|
"count/fg_rejected": 4.470588207244873, |
|
"epoch": 0.6257212463136299, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.16152572631836, |
|
"fg_logps/policy_chosen": -16.751249313354492, |
|
"fg_logps/policy_rejected": -19.572917938232422, |
|
"fg_logps/reference_KL": -20.091609954833984, |
|
"fg_logps/reference_chosen": -15.278923988342285, |
|
"fg_logps/reference_rejected": -17.713302612304688, |
|
"fg_loss": 0.7949765920639038, |
|
"fg_rewards/chosen_sum": -1.593385100364685, |
|
"fg_rewards/rejected_sum": -0.8538177609443665, |
|
"grad_norm": 13.466395129856458, |
|
"kl": 0.0, |
|
"learning_rate": 2.106164383561644e-07, |
|
"logps/chosen": -293.43453125, |
|
"logps/rejected": -324.9224264705882, |
|
"loss": 0.4617, |
|
"rewards/chosen": -1.0275954182942708, |
|
"rewards/margins": 1.7896552710439646, |
|
"rewards/rejected": -2.8172506893382354, |
|
"step": 610 |
|
}, |
|
{ |
|
"count/fg_chosen": 18.760000228881836, |
|
"count/fg_rejected": 4.304347991943359, |
|
"epoch": 0.6359789716630337, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -18.953258514404297, |
|
"fg_logps/policy_chosen": -14.654019355773926, |
|
"fg_logps/policy_rejected": -18.075929641723633, |
|
"fg_logps/reference_KL": -18.280025482177734, |
|
"fg_logps/reference_chosen": -14.031706809997559, |
|
"fg_logps/reference_rejected": -16.499387741088867, |
|
"fg_loss": 0.753377377986908, |
|
"fg_rewards/chosen_sum": -0.787641704082489, |
|
"fg_rewards/rejected_sum": -0.6412522792816162, |
|
"grad_norm": 11.259123987500764, |
|
"kl": 0.0, |
|
"learning_rate": 2.0490867579908674e-07, |
|
"logps/chosen": -303.9673755787037, |
|
"logps/rejected": -421.6932357594937, |
|
"loss": 0.4916, |
|
"rewards/chosen": -0.5601168502995997, |
|
"rewards/margins": 2.57166944784864, |
|
"rewards/rejected": -3.13178629814824, |
|
"step": 620 |
|
}, |
|
{ |
|
"count/fg_chosen": 19.100000381469727, |
|
"count/fg_rejected": 3.9375, |
|
"epoch": 0.6462366970124375, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -17.8834171295166, |
|
"fg_logps/policy_chosen": -12.904516220092773, |
|
"fg_logps/policy_rejected": -18.650463104248047, |
|
"fg_logps/reference_KL": -17.336223602294922, |
|
"fg_logps/reference_chosen": -12.578201293945312, |
|
"fg_logps/reference_rejected": -18.051301956176758, |
|
"fg_loss": 0.8024751543998718, |
|
"fg_rewards/chosen_sum": -0.19778959453105927, |
|
"fg_rewards/rejected_sum": -0.45491790771484375, |
|
"grad_norm": 11.633599458809858, |
|
"kl": 0.0, |
|
"learning_rate": 1.9920091324200913e-07, |
|
"logps/chosen": -319.94747740963857, |
|
"logps/rejected": -482.38976258116884, |
|
"loss": 0.4819, |
|
"rewards/chosen": -0.8397377657603069, |
|
"rewards/margins": 2.030176697827863, |
|
"rewards/rejected": -2.8699144635881697, |
|
"step": 630 |
|
}, |
|
{ |
|
"count/fg_chosen": 14.666666984558105, |
|
"count/fg_rejected": 4.599999904632568, |
|
"epoch": 0.6564944223618413, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.435789108276367, |
|
"fg_logps/policy_chosen": -14.770197868347168, |
|
"fg_logps/policy_rejected": -21.445520401000977, |
|
"fg_logps/reference_KL": -20.57337760925293, |
|
"fg_logps/reference_chosen": -14.659423828125, |
|
"fg_logps/reference_rejected": -20.503273010253906, |
|
"fg_loss": 0.7505216598510742, |
|
"fg_rewards/chosen_sum": 0.48458048701286316, |
|
"fg_rewards/rejected_sum": -0.5640232563018799, |
|
"grad_norm": 10.675237142753284, |
|
"kl": 0.0, |
|
"learning_rate": 1.934931506849315e-07, |
|
"logps/chosen": -296.4902083333333, |
|
"logps/rejected": -395.98147977941176, |
|
"loss": 0.4865, |
|
"rewards/chosen": -0.7284186808268229, |
|
"rewards/margins": 1.9219920498717067, |
|
"rewards/rejected": -2.6504107306985296, |
|
"step": 640 |
|
}, |
|
{ |
|
"count/fg_chosen": 13.600000381469727, |
|
"count/fg_rejected": 3.642857074737549, |
|
"epoch": 0.666752147711245, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.876583099365234, |
|
"fg_logps/policy_chosen": -15.880990982055664, |
|
"fg_logps/policy_rejected": -18.033313751220703, |
|
"fg_logps/reference_KL": -20.89072608947754, |
|
"fg_logps/reference_chosen": -15.644922256469727, |
|
"fg_logps/reference_rejected": -17.162748336791992, |
|
"fg_loss": 0.8513435125350952, |
|
"fg_rewards/chosen_sum": 0.21025246381759644, |
|
"fg_rewards/rejected_sum": -0.4860546588897705, |
|
"grad_norm": 11.01420885518033, |
|
"kl": 0.0, |
|
"learning_rate": 1.877853881278539e-07, |
|
"logps/chosen": -283.9858993902439, |
|
"logps/rejected": -359.59252303685895, |
|
"loss": 0.4902, |
|
"rewards/chosen": -0.6547578951207603, |
|
"rewards/margins": 1.8769426304075854, |
|
"rewards/rejected": -2.5317005255283456, |
|
"step": 650 |
|
}, |
|
{ |
|
"count/fg_chosen": 15.045454978942871, |
|
"count/fg_rejected": 4.222222328186035, |
|
"epoch": 0.6770098730606487, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.071636199951172, |
|
"fg_logps/policy_chosen": -15.461947441101074, |
|
"fg_logps/policy_rejected": -16.1773738861084, |
|
"fg_logps/reference_KL": -20.327823638916016, |
|
"fg_logps/reference_chosen": -14.584894180297852, |
|
"fg_logps/reference_rejected": -15.22111988067627, |
|
"fg_loss": 0.7847881317138672, |
|
"fg_rewards/chosen_sum": -0.37347137928009033, |
|
"fg_rewards/rejected_sum": -0.5809666514396667, |
|
"grad_norm": 10.971292603882963, |
|
"kl": 0.0, |
|
"learning_rate": 1.8207762557077624e-07, |
|
"logps/chosen": -292.3511575838415, |
|
"logps/rejected": -411.4802684294872, |
|
"loss": 0.4927, |
|
"rewards/chosen": -0.615756523318407, |
|
"rewards/margins": 2.001151623466449, |
|
"rewards/rejected": -2.616908146784856, |
|
"step": 660 |
|
}, |
|
{ |
|
"count/fg_chosen": 17.521739959716797, |
|
"count/fg_rejected": 3.8947367668151855, |
|
"epoch": 0.6872675984100526, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.02415657043457, |
|
"fg_logps/policy_chosen": -16.135929107666016, |
|
"fg_logps/policy_rejected": -17.662372589111328, |
|
"fg_logps/reference_KL": -20.120943069458008, |
|
"fg_logps/reference_chosen": -15.36439037322998, |
|
"fg_logps/reference_rejected": -16.619319915771484, |
|
"fg_loss": 0.6095057725906372, |
|
"fg_rewards/chosen_sum": -0.9091306924819946, |
|
"fg_rewards/rejected_sum": -0.6099987030029297, |
|
"grad_norm": 11.277192107560657, |
|
"kl": 0.0, |
|
"learning_rate": 1.7636986301369863e-07, |
|
"logps/chosen": -329.480522260274, |
|
"logps/rejected": -394.565149066092, |
|
"loss": 0.5064, |
|
"rewards/chosen": -1.2582495702456122, |
|
"rewards/margins": 1.6947859816805733, |
|
"rewards/rejected": -2.9530355519261855, |
|
"step": 670 |
|
}, |
|
{ |
|
"count/fg_chosen": 17.882352828979492, |
|
"count/fg_rejected": 2.9166667461395264, |
|
"epoch": 0.6975253237594563, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -18.715160369873047, |
|
"fg_logps/policy_chosen": -13.929475784301758, |
|
"fg_logps/policy_rejected": -18.618078231811523, |
|
"fg_logps/reference_KL": -18.27341651916504, |
|
"fg_logps/reference_chosen": -13.522153854370117, |
|
"fg_logps/reference_rejected": -17.502342224121094, |
|
"fg_loss": 0.6171839237213135, |
|
"fg_rewards/chosen_sum": -0.3216637372970581, |
|
"fg_rewards/rejected_sum": -0.12122553586959839, |
|
"grad_norm": 10.871199583632286, |
|
"kl": 0.07263422012329102, |
|
"learning_rate": 1.70662100456621e-07, |
|
"logps/chosen": -290.58342978395063, |
|
"logps/rejected": -358.15182456487344, |
|
"loss": 0.473, |
|
"rewards/chosen": -0.5404636712721836, |
|
"rewards/margins": 2.1079875423230496, |
|
"rewards/rejected": -2.648451213595233, |
|
"step": 680 |
|
}, |
|
{ |
|
"count/fg_chosen": 16.10714340209961, |
|
"count/fg_rejected": 3.5714285373687744, |
|
"epoch": 0.7077830491088601, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -20.88849449157715, |
|
"fg_logps/policy_chosen": -15.352334022521973, |
|
"fg_logps/policy_rejected": -19.385269165039062, |
|
"fg_logps/reference_KL": -19.914424896240234, |
|
"fg_logps/reference_chosen": -14.209139823913574, |
|
"fg_logps/reference_rejected": -18.167560577392578, |
|
"fg_loss": 0.6616089344024658, |
|
"fg_rewards/chosen_sum": -0.9578856229782104, |
|
"fg_rewards/rejected_sum": -0.687567949295044, |
|
"grad_norm": 7.140501610514079, |
|
"kl": 0.0, |
|
"learning_rate": 1.649543378995434e-07, |
|
"logps/chosen": -293.50786458333334, |
|
"logps/rejected": -386.10523897058823, |
|
"loss": 0.4551, |
|
"rewards/chosen": -0.3227819315592448, |
|
"rewards/margins": 2.913227015476601, |
|
"rewards/rejected": -3.2360089470358457, |
|
"step": 690 |
|
}, |
|
{ |
|
"count/fg_chosen": 18.0, |
|
"count/fg_rejected": 4.599999904632568, |
|
"epoch": 0.7180407744582639, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.225204467773438, |
|
"fg_logps/policy_chosen": -15.224019050598145, |
|
"fg_logps/policy_rejected": -17.033180236816406, |
|
"fg_logps/reference_KL": -19.995820999145508, |
|
"fg_logps/reference_chosen": -13.913763046264648, |
|
"fg_logps/reference_rejected": -15.521989822387695, |
|
"fg_loss": 0.799098014831543, |
|
"fg_rewards/chosen_sum": -1.0094504356384277, |
|
"fg_rewards/rejected_sum": -0.5853248238563538, |
|
"grad_norm": 12.12648011854002, |
|
"kl": 0.0, |
|
"learning_rate": 1.5924657534246573e-07, |
|
"logps/chosen": -344.3683810763889, |
|
"logps/rejected": -375.64266183035716, |
|
"loss": 0.4812, |
|
"rewards/chosen": -0.6706111907958985, |
|
"rewards/margins": 2.2407723018101287, |
|
"rewards/rejected": -2.911383492606027, |
|
"step": 700 |
|
}, |
|
{ |
|
"count/fg_chosen": 17.1200008392334, |
|
"count/fg_rejected": 3.095238208770752, |
|
"epoch": 0.7282984998076677, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -19.293447494506836, |
|
"fg_logps/policy_chosen": -13.494970321655273, |
|
"fg_logps/policy_rejected": -16.938003540039062, |
|
"fg_logps/reference_KL": -18.482240676879883, |
|
"fg_logps/reference_chosen": -13.644604682922363, |
|
"fg_logps/reference_rejected": -16.76604652404785, |
|
"fg_loss": 0.7007278800010681, |
|
"fg_rewards/chosen_sum": 0.5598518252372742, |
|
"fg_rewards/rejected_sum": -0.21593458950519562, |
|
"grad_norm": 14.686077759756456, |
|
"kl": 0.005865669343620539, |
|
"learning_rate": 1.5353881278538813e-07, |
|
"logps/chosen": -292.7752586570946, |
|
"logps/rejected": -331.10535519622096, |
|
"loss": 0.4337, |
|
"rewards/chosen": -0.09871104601267222, |
|
"rewards/margins": 2.7478330866635634, |
|
"rewards/rejected": -2.8465441326762355, |
|
"step": 710 |
|
}, |
|
{ |
|
"count/fg_chosen": 16.35714340209961, |
|
"count/fg_rejected": 3.0, |
|
"epoch": 0.7385562251570714, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.972545623779297, |
|
"fg_logps/policy_chosen": -14.150968551635742, |
|
"fg_logps/policy_rejected": -22.872663497924805, |
|
"fg_logps/reference_KL": -20.97467041015625, |
|
"fg_logps/reference_chosen": -14.420245170593262, |
|
"fg_logps/reference_rejected": -23.34649658203125, |
|
"fg_loss": 0.6548932194709778, |
|
"fg_rewards/chosen_sum": 1.0662188529968262, |
|
"fg_rewards/rejected_sum": 0.059594761580228806, |
|
"grad_norm": 10.578828543316053, |
|
"kl": 0.0, |
|
"learning_rate": 1.478310502283105e-07, |
|
"logps/chosen": -315.2850294237013, |
|
"logps/rejected": -390.93107586596386, |
|
"loss": 0.4322, |
|
"rewards/chosen": -0.855069940740412, |
|
"rewards/margins": 1.7850939864527353, |
|
"rewards/rejected": -2.6401639271931474, |
|
"step": 720 |
|
}, |
|
{ |
|
"count/fg_chosen": 18.545454025268555, |
|
"count/fg_rejected": 4.666666507720947, |
|
"epoch": 0.7488139505064751, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -22.168006896972656, |
|
"fg_logps/policy_chosen": -15.429848670959473, |
|
"fg_logps/policy_rejected": -20.59855079650879, |
|
"fg_logps/reference_KL": -21.434024810791016, |
|
"fg_logps/reference_chosen": -15.218304634094238, |
|
"fg_logps/reference_rejected": -20.059797286987305, |
|
"fg_loss": 0.7126922607421875, |
|
"fg_rewards/chosen_sum": 0.7979265451431274, |
|
"fg_rewards/rejected_sum": -0.11843083798885345, |
|
"grad_norm": 14.792127676836103, |
|
"kl": 0.06886587291955948, |
|
"learning_rate": 1.421232876712329e-07, |
|
"logps/chosen": -287.5471157962329, |
|
"logps/rejected": -396.7852460488506, |
|
"loss": 0.437, |
|
"rewards/chosen": 0.011234314474340987, |
|
"rewards/margins": 3.0592356586508855, |
|
"rewards/rejected": -3.0480013441765443, |
|
"step": 730 |
|
}, |
|
{ |
|
"count/fg_chosen": 14.793103218078613, |
|
"count/fg_rejected": 4.148148059844971, |
|
"epoch": 0.759071675855879, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.16313934326172, |
|
"fg_logps/policy_chosen": -15.684663772583008, |
|
"fg_logps/policy_rejected": -19.98002052307129, |
|
"fg_logps/reference_KL": -20.154132843017578, |
|
"fg_logps/reference_chosen": -14.688092231750488, |
|
"fg_logps/reference_rejected": -18.67272186279297, |
|
"fg_loss": 0.8155136704444885, |
|
"fg_rewards/chosen_sum": -0.6768214106559753, |
|
"fg_rewards/rejected_sum": -0.5260854959487915, |
|
"grad_norm": 15.891406104752123, |
|
"kl": 0.0, |
|
"learning_rate": 1.3641552511415523e-07, |
|
"logps/chosen": -287.50723958333333, |
|
"logps/rejected": -408.88382352941176, |
|
"loss": 0.5193, |
|
"rewards/chosen": -0.7413622029622395, |
|
"rewards/margins": 2.1912876562978707, |
|
"rewards/rejected": -2.9326498592601102, |
|
"step": 740 |
|
}, |
|
{ |
|
"count/fg_chosen": 17.5, |
|
"count/fg_rejected": 4.222222328186035, |
|
"epoch": 0.7693294012052827, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -18.54020881652832, |
|
"fg_logps/policy_chosen": -14.070555686950684, |
|
"fg_logps/policy_rejected": -19.403593063354492, |
|
"fg_logps/reference_KL": -17.662343978881836, |
|
"fg_logps/reference_chosen": -13.439438819885254, |
|
"fg_logps/reference_rejected": -18.562532424926758, |
|
"fg_loss": 0.7397133111953735, |
|
"fg_rewards/chosen_sum": -0.7740154266357422, |
|
"fg_rewards/rejected_sum": -0.5391371846199036, |
|
"grad_norm": 10.009694005800096, |
|
"kl": 0.0, |
|
"learning_rate": 1.3070776255707763e-07, |
|
"logps/chosen": -312.1899646577381, |
|
"logps/rejected": -437.6450966282895, |
|
"loss": 0.5177, |
|
"rewards/chosen": -1.1885592142740886, |
|
"rewards/margins": 1.7126708448978893, |
|
"rewards/rejected": -2.901230059171978, |
|
"step": 750 |
|
}, |
|
{ |
|
"count/fg_chosen": 15.52173900604248, |
|
"count/fg_rejected": 3.25, |
|
"epoch": 0.7795871265546865, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.690628051757812, |
|
"fg_logps/policy_chosen": -15.920393943786621, |
|
"fg_logps/policy_rejected": -17.173805236816406, |
|
"fg_logps/reference_KL": -20.706315994262695, |
|
"fg_logps/reference_chosen": -15.248817443847656, |
|
"fg_logps/reference_rejected": -16.315380096435547, |
|
"fg_loss": 0.7596006393432617, |
|
"fg_rewards/chosen_sum": -0.31831929087638855, |
|
"fg_rewards/rejected_sum": -0.46696019172668457, |
|
"grad_norm": 11.001152687682147, |
|
"kl": 0.1602586805820465, |
|
"learning_rate": 1.25e-07, |
|
"logps/chosen": -315.33207514044943, |
|
"logps/rejected": -346.9461652728873, |
|
"loss": 0.5177, |
|
"rewards/chosen": -0.877791586886631, |
|
"rewards/margins": 1.9533982805897114, |
|
"rewards/rejected": -2.8311898674763425, |
|
"step": 760 |
|
}, |
|
{ |
|
"count/fg_chosen": 13.954545021057129, |
|
"count/fg_rejected": 5.0625, |
|
"epoch": 0.7898448519040903, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.25006866455078, |
|
"fg_logps/policy_chosen": -15.683282852172852, |
|
"fg_logps/policy_rejected": -20.789840698242188, |
|
"fg_logps/reference_KL": -20.35251808166504, |
|
"fg_logps/reference_chosen": -15.138045310974121, |
|
"fg_logps/reference_rejected": -19.545236587524414, |
|
"fg_loss": 0.7877352833747864, |
|
"fg_rewards/chosen_sum": -0.0805804431438446, |
|
"fg_rewards/rejected_sum": -0.919762909412384, |
|
"grad_norm": 8.914441470208311, |
|
"kl": 0.0056018829345703125, |
|
"learning_rate": 1.1929223744292236e-07, |
|
"logps/chosen": -287.3155048076923, |
|
"logps/rejected": -411.3341749237805, |
|
"loss": 0.4771, |
|
"rewards/chosen": -0.5340637304844, |
|
"rewards/margins": 2.177349211052852, |
|
"rewards/rejected": -2.7114129415372523, |
|
"step": 770 |
|
}, |
|
{ |
|
"count/fg_chosen": 16.94444465637207, |
|
"count/fg_rejected": 3.8125, |
|
"epoch": 0.800102577253494, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -19.90068244934082, |
|
"fg_logps/policy_chosen": -14.70145034790039, |
|
"fg_logps/policy_rejected": -18.811277389526367, |
|
"fg_logps/reference_KL": -19.578754425048828, |
|
"fg_logps/reference_chosen": -14.712576866149902, |
|
"fg_logps/reference_rejected": -18.912837982177734, |
|
"fg_loss": 0.7731603980064392, |
|
"fg_rewards/chosen_sum": 0.47520291805267334, |
|
"fg_rewards/rejected_sum": -0.34054040908813477, |
|
"grad_norm": 8.587072014013287, |
|
"kl": 0.09591908752918243, |
|
"learning_rate": 1.1358447488584474e-07, |
|
"logps/chosen": -269.6305419921875, |
|
"logps/rejected": -346.35361328125, |
|
"loss": 0.4703, |
|
"rewards/chosen": -0.6143415451049805, |
|
"rewards/margins": 1.9741443634033204, |
|
"rewards/rejected": -2.588485908508301, |
|
"step": 780 |
|
}, |
|
{ |
|
"count/fg_chosen": 16.72222137451172, |
|
"count/fg_rejected": 4.916666507720947, |
|
"epoch": 0.8103603026028978, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -19.655256271362305, |
|
"fg_logps/policy_chosen": -14.51056957244873, |
|
"fg_logps/policy_rejected": -17.415449142456055, |
|
"fg_logps/reference_KL": -19.09449005126953, |
|
"fg_logps/reference_chosen": -14.248104095458984, |
|
"fg_logps/reference_rejected": -16.654375076293945, |
|
"fg_loss": 0.6147817969322205, |
|
"fg_rewards/chosen_sum": -0.14415740966796875, |
|
"fg_rewards/rejected_sum": -0.5222800970077515, |
|
"grad_norm": 15.457265468907643, |
|
"kl": 0.08856544643640518, |
|
"learning_rate": 1.0787671232876712e-07, |
|
"logps/chosen": -288.4888599537037, |
|
"logps/rejected": -336.5465783227848, |
|
"loss": 0.4963, |
|
"rewards/chosen": -0.8326729668511285, |
|
"rewards/margins": 1.523808464554627, |
|
"rewards/rejected": -2.3564814314057556, |
|
"step": 790 |
|
}, |
|
{ |
|
"count/fg_chosen": 18.117647171020508, |
|
"count/fg_rejected": 5.333333492279053, |
|
"epoch": 0.8206180279523015, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -18.511550903320312, |
|
"fg_logps/policy_chosen": -13.713569641113281, |
|
"fg_logps/policy_rejected": -20.020904541015625, |
|
"fg_logps/reference_KL": -17.690017700195312, |
|
"fg_logps/reference_chosen": -13.43185806274414, |
|
"fg_logps/reference_rejected": -19.19187355041504, |
|
"fg_loss": 0.7455827593803406, |
|
"fg_rewards/chosen_sum": 0.051257286220788956, |
|
"fg_rewards/rejected_sum": -0.5338117480278015, |
|
"grad_norm": 15.017949800951369, |
|
"kl": 0.0, |
|
"learning_rate": 1.0216894977168949e-07, |
|
"logps/chosen": -324.4031684027778, |
|
"logps/rejected": -468.073828125, |
|
"loss": 0.5189, |
|
"rewards/chosen": -0.744783443874783, |
|
"rewards/margins": 1.6178517053997705, |
|
"rewards/rejected": -2.3626351492745536, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.8206180279523015, |
|
"eval_count/fg_chosen": 16.01369857788086, |
|
"eval_count/fg_rejected": 4.0824174880981445, |
|
"eval_fg_kl": NaN, |
|
"eval_fg_logps/policy_KL": -21.125959396362305, |
|
"eval_fg_logps/policy_chosen": -15.278054237365723, |
|
"eval_fg_logps/policy_rejected": -19.65519905090332, |
|
"eval_fg_logps/reference_KL": -20.207021713256836, |
|
"eval_fg_logps/reference_chosen": -14.929487228393555, |
|
"eval_fg_logps/reference_rejected": -18.78682518005371, |
|
"eval_fg_loss": 0.736534595489502, |
|
"eval_fg_rewards/chosen_sum": 0.0694078877568245, |
|
"eval_fg_rewards/rejected_sum": -0.3622537851333618, |
|
"eval_kl": 0.008128926157951355, |
|
"eval_logps/chosen": -299.7121112089202, |
|
"eval_logps/rejected": -364.37436224489795, |
|
"eval_loss": 0.4815324544906616, |
|
"eval_rewards/chosen": -0.6601278815470951, |
|
"eval_rewards/margins": 1.9833717220703937, |
|
"eval_rewards/rejected": -2.6434996036174887, |
|
"eval_runtime": 808.1531, |
|
"eval_samples_per_second": 2.144, |
|
"eval_steps_per_second": 1.073, |
|
"step": 800 |
|
}, |
|
{ |
|
"count/fg_chosen": 16.33333396911621, |
|
"count/fg_rejected": 2.7857143878936768, |
|
"epoch": 0.8308757533017054, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -23.438562393188477, |
|
"fg_logps/policy_chosen": -16.027084350585938, |
|
"fg_logps/policy_rejected": -18.6650390625, |
|
"fg_logps/reference_KL": -22.405675888061523, |
|
"fg_logps/reference_chosen": -15.913044929504395, |
|
"fg_logps/reference_rejected": -18.558101654052734, |
|
"fg_loss": 0.6452624201774597, |
|
"fg_rewards/chosen_sum": 0.40020328760147095, |
|
"fg_rewards/rejected_sum": -0.3264440894126892, |
|
"grad_norm": 7.011400584597267, |
|
"kl": 0.0, |
|
"learning_rate": 9.646118721461187e-08, |
|
"logps/chosen": -305.2624952936747, |
|
"logps/rejected": -375.5898944805195, |
|
"loss": 0.4673, |
|
"rewards/chosen": -0.7119015613234186, |
|
"rewards/margins": 2.1217946801792196, |
|
"rewards/rejected": -2.833696241502638, |
|
"step": 810 |
|
}, |
|
{ |
|
"count/fg_chosen": 17.05555534362793, |
|
"count/fg_rejected": 4.8125, |
|
"epoch": 0.8411334786511091, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -20.090803146362305, |
|
"fg_logps/policy_chosen": -13.760422706604004, |
|
"fg_logps/policy_rejected": -19.657625198364258, |
|
"fg_logps/reference_KL": -19.27540397644043, |
|
"fg_logps/reference_chosen": -13.429532051086426, |
|
"fg_logps/reference_rejected": -18.345483779907227, |
|
"fg_loss": 0.7558559775352478, |
|
"fg_rewards/chosen_sum": 0.17663703858852386, |
|
"fg_rewards/rejected_sum": -0.7809099555015564, |
|
"grad_norm": 12.43361776512671, |
|
"kl": 0.0, |
|
"learning_rate": 9.075342465753424e-08, |
|
"logps/chosen": -316.12268350290697, |
|
"logps/rejected": -377.34243032094594, |
|
"loss": 0.4807, |
|
"rewards/chosen": -0.8466514764830123, |
|
"rewards/margins": 2.1515591211996314, |
|
"rewards/rejected": -2.9982105976826436, |
|
"step": 820 |
|
}, |
|
{ |
|
"count/fg_chosen": 18.95833396911621, |
|
"count/fg_rejected": 3.5999999046325684, |
|
"epoch": 0.8513912040005129, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -20.093595504760742, |
|
"fg_logps/policy_chosen": -14.394129753112793, |
|
"fg_logps/policy_rejected": -17.22884178161621, |
|
"fg_logps/reference_KL": -19.071619033813477, |
|
"fg_logps/reference_chosen": -13.635478019714355, |
|
"fg_logps/reference_rejected": -16.50658416748047, |
|
"fg_loss": 0.7184759974479675, |
|
"fg_rewards/chosen_sum": 0.43327081203460693, |
|
"fg_rewards/rejected_sum": -0.22198212146759033, |
|
"grad_norm": 13.25277133030139, |
|
"kl": 0.04477081447839737, |
|
"learning_rate": 8.504566210045662e-08, |
|
"logps/chosen": -338.75426793981484, |
|
"logps/rejected": -406.7663419699367, |
|
"loss": 0.5598, |
|
"rewards/chosen": -0.7480648653006848, |
|
"rewards/margins": 1.1518268552417399, |
|
"rewards/rejected": -1.8998917205424248, |
|
"step": 830 |
|
}, |
|
{ |
|
"count/fg_chosen": 16.428571701049805, |
|
"count/fg_rejected": 4.277777671813965, |
|
"epoch": 0.8616489293499167, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -19.675304412841797, |
|
"fg_logps/policy_chosen": -14.73193073272705, |
|
"fg_logps/policy_rejected": -17.31828498840332, |
|
"fg_logps/reference_KL": -19.20144271850586, |
|
"fg_logps/reference_chosen": -14.565011024475098, |
|
"fg_logps/reference_rejected": -17.02730941772461, |
|
"fg_loss": 0.7666863799095154, |
|
"fg_rewards/chosen_sum": 0.5958693623542786, |
|
"fg_rewards/rejected_sum": -0.01988927833735943, |
|
"grad_norm": 11.745424230577617, |
|
"kl": 0.0641721710562706, |
|
"learning_rate": 7.933789954337899e-08, |
|
"logps/chosen": -295.41467126623377, |
|
"logps/rejected": -367.2850856551205, |
|
"loss": 0.5065, |
|
"rewards/chosen": -0.7968547375171215, |
|
"rewards/margins": 1.916948473373315, |
|
"rewards/rejected": -2.7138032108904366, |
|
"step": 840 |
|
}, |
|
{ |
|
"count/fg_chosen": 16.1200008392334, |
|
"count/fg_rejected": 4.5714287757873535, |
|
"epoch": 0.8719066546993204, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -19.93068504333496, |
|
"fg_logps/policy_chosen": -14.386673927307129, |
|
"fg_logps/policy_rejected": -18.88810157775879, |
|
"fg_logps/reference_KL": -19.163719177246094, |
|
"fg_logps/reference_chosen": -13.809516906738281, |
|
"fg_logps/reference_rejected": -17.88907241821289, |
|
"fg_loss": 0.7888970375061035, |
|
"fg_rewards/chosen_sum": -0.009096489287912846, |
|
"fg_rewards/rejected_sum": -0.5189210772514343, |
|
"grad_norm": 10.115452996310198, |
|
"kl": 0.027889441698789597, |
|
"learning_rate": 7.363013698630137e-08, |
|
"logps/chosen": -345.9682221283784, |
|
"logps/rejected": -356.40765806686045, |
|
"loss": 0.5075, |
|
"rewards/chosen": -0.9859014975058066, |
|
"rewards/margins": 1.6604730496984996, |
|
"rewards/rejected": -2.646374547204306, |
|
"step": 850 |
|
}, |
|
{ |
|
"count/fg_chosen": 14.9375, |
|
"count/fg_rejected": 4.214285850524902, |
|
"epoch": 0.8821643800487242, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -22.199317932128906, |
|
"fg_logps/policy_chosen": -14.535076141357422, |
|
"fg_logps/policy_rejected": -18.46771240234375, |
|
"fg_logps/reference_KL": -20.52707862854004, |
|
"fg_logps/reference_chosen": -14.20512866973877, |
|
"fg_logps/reference_rejected": -17.470090866088867, |
|
"fg_loss": 0.8201218247413635, |
|
"fg_rewards/chosen_sum": -0.14333221316337585, |
|
"fg_rewards/rejected_sum": -0.3655402958393097, |
|
"grad_norm": 10.49340621720449, |
|
"kl": 0.0, |
|
"learning_rate": 6.792237442922374e-08, |
|
"logps/chosen": -259.49934050324674, |
|
"logps/rejected": -336.4366999246988, |
|
"loss": 0.4497, |
|
"rewards/chosen": -0.43501192563539975, |
|
"rewards/margins": 2.218185581188772, |
|
"rewards/rejected": -2.6531975068241715, |
|
"step": 860 |
|
}, |
|
{ |
|
"count/fg_chosen": 15.117647171020508, |
|
"count/fg_rejected": 3.076923131942749, |
|
"epoch": 0.8924221053981279, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -22.28684425354004, |
|
"fg_logps/policy_chosen": -14.350448608398438, |
|
"fg_logps/policy_rejected": -20.323551177978516, |
|
"fg_logps/reference_KL": -21.138818740844727, |
|
"fg_logps/reference_chosen": -14.148969650268555, |
|
"fg_logps/reference_rejected": -19.21809959411621, |
|
"fg_loss": 0.7609926462173462, |
|
"fg_rewards/chosen_sum": 0.2540108561515808, |
|
"fg_rewards/rejected_sum": -0.4159541428089142, |
|
"grad_norm": 14.052655899212164, |
|
"kl": 0.0, |
|
"learning_rate": 6.221461187214611e-08, |
|
"logps/chosen": -313.48386548913044, |
|
"logps/rejected": -408.48944024725273, |
|
"loss": 0.4252, |
|
"rewards/chosen": -0.7578884622325068, |
|
"rewards/margins": 2.286565588355577, |
|
"rewards/rejected": -3.044454050588084, |
|
"step": 870 |
|
}, |
|
{ |
|
"count/fg_chosen": 13.785714149475098, |
|
"count/fg_rejected": 4.363636493682861, |
|
"epoch": 0.9026798307475318, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -23.224103927612305, |
|
"fg_logps/policy_chosen": -17.700504302978516, |
|
"fg_logps/policy_rejected": -18.443458557128906, |
|
"fg_logps/reference_KL": -21.812795639038086, |
|
"fg_logps/reference_chosen": -16.529930114746094, |
|
"fg_logps/reference_rejected": -16.90056800842285, |
|
"fg_loss": 0.861905038356781, |
|
"fg_rewards/chosen_sum": -1.0540215969085693, |
|
"fg_rewards/rejected_sum": -0.6782628297805786, |
|
"grad_norm": 10.169312742642402, |
|
"kl": 0.0, |
|
"learning_rate": 5.650684931506849e-08, |
|
"logps/chosen": -271.11566840277777, |
|
"logps/rejected": -324.0809215198864, |
|
"loss": 0.4212, |
|
"rewards/chosen": -0.5925836563110352, |
|
"rewards/margins": 1.9108110774647105, |
|
"rewards/rejected": -2.5033947337757456, |
|
"step": 880 |
|
}, |
|
{ |
|
"count/fg_chosen": 17.526315689086914, |
|
"count/fg_rejected": 6.0, |
|
"epoch": 0.9129375560969355, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -19.294906616210938, |
|
"fg_logps/policy_chosen": -13.935691833496094, |
|
"fg_logps/policy_rejected": -16.94110107421875, |
|
"fg_logps/reference_KL": -18.384138107299805, |
|
"fg_logps/reference_chosen": -13.67742919921875, |
|
"fg_logps/reference_rejected": -15.758844375610352, |
|
"fg_loss": 0.6794268488883972, |
|
"fg_rewards/chosen_sum": 0.0076361955143511295, |
|
"fg_rewards/rejected_sum": -0.4533938765525818, |
|
"grad_norm": 9.707055272049164, |
|
"kl": 0.0, |
|
"learning_rate": 5.0799086757990863e-08, |
|
"logps/chosen": -275.1984604779412, |
|
"logps/rejected": -400.91375, |
|
"loss": 0.4898, |
|
"rewards/chosen": -0.607273774988511, |
|
"rewards/margins": 2.4971288942823224, |
|
"rewards/rejected": -3.1044026692708333, |
|
"step": 890 |
|
}, |
|
{ |
|
"count/fg_chosen": 16.18181800842285, |
|
"count/fg_rejected": 5.333333492279053, |
|
"epoch": 0.9231952814463392, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -19.024534225463867, |
|
"fg_logps/policy_chosen": -14.489638328552246, |
|
"fg_logps/policy_rejected": -17.007856369018555, |
|
"fg_logps/reference_KL": -18.624956130981445, |
|
"fg_logps/reference_chosen": -14.467068672180176, |
|
"fg_logps/reference_rejected": -16.36551284790039, |
|
"fg_loss": 0.7552723288536072, |
|
"fg_rewards/chosen_sum": 0.37475159764289856, |
|
"fg_rewards/rejected_sum": -0.4549465775489807, |
|
"grad_norm": 12.023866951403578, |
|
"kl": 0.0394529327750206, |
|
"learning_rate": 4.509132420091324e-08, |
|
"logps/chosen": -344.09639537183546, |
|
"logps/rejected": -425.6873553240741, |
|
"loss": 0.4817, |
|
"rewards/chosen": -0.8015480524376978, |
|
"rewards/margins": 2.245464465043977, |
|
"rewards/rejected": -3.0470125174816745, |
|
"step": 900 |
|
}, |
|
{ |
|
"count/fg_chosen": 16.54166603088379, |
|
"count/fg_rejected": 4.318181991577148, |
|
"epoch": 0.9334530067957431, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -20.088489532470703, |
|
"fg_logps/policy_chosen": -15.405600547790527, |
|
"fg_logps/policy_rejected": -16.747190475463867, |
|
"fg_logps/reference_KL": -19.137371063232422, |
|
"fg_logps/reference_chosen": -14.657370567321777, |
|
"fg_logps/reference_rejected": -15.770890235900879, |
|
"fg_loss": 0.6983441114425659, |
|
"fg_rewards/chosen_sum": -0.3915860652923584, |
|
"fg_rewards/rejected_sum": -0.3197772204875946, |
|
"grad_norm": 11.088479553468312, |
|
"kl": 0.0, |
|
"learning_rate": 3.938356164383561e-08, |
|
"logps/chosen": -306.00605715981015, |
|
"logps/rejected": -396.05049189814815, |
|
"loss": 0.4692, |
|
"rewards/chosen": -0.3026618957519531, |
|
"rewards/margins": 2.5836239567509405, |
|
"rewards/rejected": -2.8862858525028936, |
|
"step": 910 |
|
}, |
|
{ |
|
"count/fg_chosen": 17.454545974731445, |
|
"count/fg_rejected": 3.4000000953674316, |
|
"epoch": 0.9437107321451468, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -24.18083953857422, |
|
"fg_logps/policy_chosen": -15.12738037109375, |
|
"fg_logps/policy_rejected": -22.837459564208984, |
|
"fg_logps/reference_KL": -22.198238372802734, |
|
"fg_logps/reference_chosen": -13.988216400146484, |
|
"fg_logps/reference_rejected": -20.653087615966797, |
|
"fg_loss": 0.7821993231773376, |
|
"fg_rewards/chosen_sum": -1.5599383115768433, |
|
"fg_rewards/rejected_sum": -0.6531942486763, |
|
"grad_norm": 13.573243756082668, |
|
"kl": 0.0, |
|
"learning_rate": 3.367579908675799e-08, |
|
"logps/chosen": -334.25313527960526, |
|
"logps/rejected": -345.7750651041667, |
|
"loss": 0.5174, |
|
"rewards/chosen": -1.169078224583676, |
|
"rewards/margins": 1.8482150696871575, |
|
"rewards/rejected": -3.0172932942708335, |
|
"step": 920 |
|
}, |
|
{ |
|
"count/fg_chosen": 14.75, |
|
"count/fg_rejected": 3.5, |
|
"epoch": 0.9539684574945506, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.988378524780273, |
|
"fg_logps/policy_chosen": -15.057655334472656, |
|
"fg_logps/policy_rejected": -21.019062042236328, |
|
"fg_logps/reference_KL": -20.459566116333008, |
|
"fg_logps/reference_chosen": -14.091443061828613, |
|
"fg_logps/reference_rejected": -20.186948776245117, |
|
"fg_loss": 0.7185892462730408, |
|
"fg_rewards/chosen_sum": -0.44822633266448975, |
|
"fg_rewards/rejected_sum": -0.26188215613365173, |
|
"grad_norm": 16.393627175860047, |
|
"kl": 0.0, |
|
"learning_rate": 2.796803652968036e-08, |
|
"logps/chosen": -283.63548677884614, |
|
"logps/rejected": -336.15740131578946, |
|
"loss": 0.4039, |
|
"rewards/chosen": -0.3187708928034856, |
|
"rewards/margins": 2.802559930592896, |
|
"rewards/rejected": -3.1213308233963817, |
|
"step": 930 |
|
}, |
|
{ |
|
"count/fg_chosen": 18.83333396911621, |
|
"count/fg_rejected": 5.933333396911621, |
|
"epoch": 0.9642261828439543, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -20.48985481262207, |
|
"fg_logps/policy_chosen": -15.466116905212402, |
|
"fg_logps/policy_rejected": -21.363821029663086, |
|
"fg_logps/reference_KL": -19.345491409301758, |
|
"fg_logps/reference_chosen": -14.387165069580078, |
|
"fg_logps/reference_rejected": -19.8946533203125, |
|
"fg_loss": 0.7254918217658997, |
|
"fg_rewards/chosen_sum": -1.1295708417892456, |
|
"fg_rewards/rejected_sum": -0.7322808504104614, |
|
"grad_norm": 12.073302615788874, |
|
"kl": 0.0, |
|
"learning_rate": 2.2260273972602736e-08, |
|
"logps/chosen": -311.50599500868054, |
|
"logps/rejected": -380.01899857954544, |
|
"loss": 0.4599, |
|
"rewards/chosen": -0.8199851247999403, |
|
"rewards/margins": 2.1154369055622757, |
|
"rewards/rejected": -2.935422030362216, |
|
"step": 940 |
|
}, |
|
{ |
|
"count/fg_chosen": 15.636363983154297, |
|
"count/fg_rejected": 3.4000000953674316, |
|
"epoch": 0.9744839081933582, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -18.018638610839844, |
|
"fg_logps/policy_chosen": -14.090137481689453, |
|
"fg_logps/policy_rejected": -13.102763175964355, |
|
"fg_logps/reference_KL": -16.83769416809082, |
|
"fg_logps/reference_chosen": -13.336016654968262, |
|
"fg_logps/reference_rejected": -12.24677848815918, |
|
"fg_loss": 0.8313923478126526, |
|
"fg_rewards/chosen_sum": -0.822435736656189, |
|
"fg_rewards/rejected_sum": -0.4055534303188324, |
|
"grad_norm": 8.942456920046205, |
|
"kl": 0.027703475207090378, |
|
"learning_rate": 1.6552511415525114e-08, |
|
"logps/chosen": -319.5060622970779, |
|
"logps/rejected": -353.5703830948795, |
|
"loss": 0.4562, |
|
"rewards/chosen": -1.2102871188869724, |
|
"rewards/margins": 1.2892790167493229, |
|
"rewards/rejected": -2.4995661356362953, |
|
"step": 950 |
|
}, |
|
{ |
|
"count/fg_chosen": 14.909090995788574, |
|
"count/fg_rejected": 3.6315789222717285, |
|
"epoch": 0.9847416335427619, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -21.546527862548828, |
|
"fg_logps/policy_chosen": -15.540050506591797, |
|
"fg_logps/policy_rejected": -20.36016273498535, |
|
"fg_logps/reference_KL": -20.43901252746582, |
|
"fg_logps/reference_chosen": -14.649651527404785, |
|
"fg_logps/reference_rejected": -18.264400482177734, |
|
"fg_loss": 0.7249688506126404, |
|
"fg_rewards/chosen_sum": -0.600669801235199, |
|
"fg_rewards/rejected_sum": -0.7846862077713013, |
|
"grad_norm": 13.343937741670935, |
|
"kl": 0.0, |
|
"learning_rate": 1.0844748858447487e-08, |
|
"logps/chosen": -326.93775699013156, |
|
"logps/rejected": -363.2979445684524, |
|
"loss": 0.5157, |
|
"rewards/chosen": -1.1762562801963405, |
|
"rewards/margins": 1.8776036121492694, |
|
"rewards/rejected": -3.05385989234561, |
|
"step": 960 |
|
}, |
|
{ |
|
"count/fg_chosen": 14.61111068725586, |
|
"count/fg_rejected": 3.3333332538604736, |
|
"epoch": 0.9949993588921656, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -24.816984176635742, |
|
"fg_logps/policy_chosen": -17.12736701965332, |
|
"fg_logps/policy_rejected": -22.242828369140625, |
|
"fg_logps/reference_KL": -22.654382705688477, |
|
"fg_logps/reference_chosen": -15.033406257629395, |
|
"fg_logps/reference_rejected": -21.171716690063477, |
|
"fg_loss": 0.7910918593406677, |
|
"fg_rewards/chosen_sum": -1.5589678287506104, |
|
"fg_rewards/rejected_sum": -0.5098855495452881, |
|
"grad_norm": 12.686683889493278, |
|
"kl": 0.0, |
|
"learning_rate": 5.136986301369862e-09, |
|
"logps/chosen": -375.69940350506755, |
|
"logps/rejected": -364.8770212572674, |
|
"loss": 0.481, |
|
"rewards/chosen": -1.2619872222075592, |
|
"rewards/margins": 1.2282116672364813, |
|
"rewards/rejected": -2.4901988894440406, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.9991024490319271, |
|
"step": 974, |
|
"total_flos": 0.0, |
|
"train_loss": 0.5029905087159644, |
|
"train_runtime": 13638.2416, |
|
"train_samples_per_second": 1.144, |
|
"train_steps_per_second": 0.071 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 974, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|