diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10280 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 100, + "global_step": 6573, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 7.598784194528875e-10, + "logits/chosen": -2.901771306991577, + "logits/rejected": -2.8884711265563965, + "logps/chosen": -77.62923431396484, + "logps/rejected": -64.06584167480469, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 7.598784194528875e-09, + "logits/chosen": -2.9898242950439453, + "logits/rejected": -2.947841167449951, + "logps/chosen": -95.11986541748047, + "logps/rejected": -74.35153198242188, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0006865501054562628, + "rewards/margins": -0.00023379885533358902, + "rewards/rejected": 0.0009203488007187843, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 1.519756838905775e-08, + "logits/chosen": -3.020481824874878, + "logits/rejected": -2.9794812202453613, + "logps/chosen": -91.61888122558594, + "logps/rejected": -73.55238342285156, + "loss": 0.6899, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0016287328908219934, + "rewards/margins": 0.006309092044830322, + "rewards/rejected": -0.004680359270423651, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 2.2796352583586623e-08, + "logits/chosen": -3.0296730995178223, + "logits/rejected": -2.9928271770477295, + "logps/chosen": -94.35389709472656, + "logps/rejected": -70.74224853515625, + "loss": 0.6781, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.02023407630622387, + "rewards/margins": 0.029119813814759254, + "rewards/rejected": -0.008885735645890236, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 3.03951367781155e-08, + "logits/chosen": -3.002256393432617, + "logits/rejected": -2.9691174030303955, + "logps/chosen": -96.49156188964844, + "logps/rejected": -69.25764465332031, + "loss": 0.6513, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.05460807681083679, + "rewards/margins": 0.0900125652551651, + "rewards/rejected": -0.03540449216961861, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 3.799392097264438e-08, + "logits/chosen": -3.0232231616973877, + "logits/rejected": -2.9752824306488037, + "logps/chosen": -97.46116638183594, + "logps/rejected": -74.12408447265625, + "loss": 0.6124, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.09053327888250351, + "rewards/margins": 0.17471307516098022, + "rewards/rejected": -0.08417979627847672, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 4.559270516717325e-08, + "logits/chosen": -2.9987754821777344, + "logits/rejected": -2.9764158725738525, + "logps/chosen": -90.7174072265625, + "logps/rejected": -77.50119018554688, + "loss": 0.5669, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 0.06676627695560455, + "rewards/margins": 0.2405029535293579, + "rewards/rejected": -0.17373664677143097, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 5.3191489361702123e-08, + "logits/chosen": -2.999079465866089, + "logits/rejected": -2.957610607147217, + "logps/chosen": -85.68513488769531, + "logps/rejected": -73.17382049560547, + "loss": 0.4937, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.11260080337524414, + "rewards/margins": 0.46337103843688965, + "rewards/rejected": -0.3507702052593231, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 6.0790273556231e-08, + "logits/chosen": -2.9929611682891846, + "logits/rejected": -2.9430298805236816, + "logps/chosen": -95.99066925048828, + "logps/rejected": -78.26673126220703, + "loss": 0.4392, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.16532005369663239, + "rewards/margins": 0.6958799362182617, + "rewards/rejected": -0.5305598378181458, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 6.838905775075987e-08, + "logits/chosen": -2.9992611408233643, + "logits/rejected": -2.9776453971862793, + "logps/chosen": -92.69783020019531, + "logps/rejected": -75.43290710449219, + "loss": 0.419, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.025346357375383377, + "rewards/margins": 0.6570864915847778, + "rewards/rejected": -0.6824327707290649, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 7.598784194528875e-08, + "logits/chosen": -2.982177257537842, + "logits/rejected": -2.935204029083252, + "logps/chosen": -93.23124694824219, + "logps/rejected": -81.80736541748047, + "loss": 0.3601, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.016981299966573715, + "rewards/margins": 1.017513632774353, + "rewards/rejected": -1.0005323886871338, + "step": 100 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -3.0686004161834717, + "eval_logits/rejected": -3.0200653076171875, + "eval_logps/chosen": -91.95072937011719, + "eval_logps/rejected": -80.77359008789062, + "eval_loss": 0.3409937024116516, + "eval_rewards/accuracies": 0.9444444179534912, + "eval_rewards/chosen": -0.07135287672281265, + "eval_rewards/margins": 0.9955466985702515, + "eval_rewards/rejected": -1.0668996572494507, + "eval_runtime": 54.6773, + "eval_samples_per_second": 52.343, + "eval_steps_per_second": 1.646, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 8.358662613981762e-08, + "logits/chosen": -3.0181825160980225, + "logits/rejected": -2.9785044193267822, + "logps/chosen": -92.45232391357422, + "logps/rejected": -82.97605895996094, + "loss": 0.3187, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.013919335789978504, + "rewards/margins": 1.1460493803024292, + "rewards/rejected": -1.1321300268173218, + "step": 110 + }, + { + "epoch": 0.05, + "learning_rate": 9.11854103343465e-08, + "logits/chosen": -2.996326208114624, + "logits/rejected": -2.960134506225586, + "logps/chosen": -91.56250762939453, + "logps/rejected": -86.30479431152344, + "loss": 0.2826, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.2517639994621277, + "rewards/margins": 1.317551612854004, + "rewards/rejected": -1.5693156719207764, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 9.878419452887538e-08, + "logits/chosen": -2.9952170848846436, + "logits/rejected": -2.970567226409912, + "logps/chosen": -93.91593933105469, + "logps/rejected": -90.50540161132812, + "loss": 0.2448, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0845116376876831, + "rewards/margins": 1.7149988412857056, + "rewards/rejected": -1.7995105981826782, + "step": 130 + }, + { + "epoch": 0.06, + "learning_rate": 1.0638297872340425e-07, + "logits/chosen": -3.0097122192382812, + "logits/rejected": -2.9792752265930176, + "logps/chosen": -95.60907745361328, + "logps/rejected": -92.10499572753906, + "loss": 0.2252, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.3711767792701721, + "rewards/margins": 1.8317139148712158, + "rewards/rejected": -2.202890634536743, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 1.1398176291793313e-07, + "logits/chosen": -3.0108351707458496, + "logits/rejected": -2.9584834575653076, + "logps/chosen": -99.74345397949219, + "logps/rejected": -95.83207702636719, + "loss": 0.2017, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.43104758858680725, + "rewards/margins": 1.9699010848999023, + "rewards/rejected": -2.4009485244750977, + "step": 150 + }, + { + "epoch": 0.07, + "learning_rate": 1.21580547112462e-07, + "logits/chosen": -2.986536979675293, + "logits/rejected": -2.9522886276245117, + "logps/chosen": -101.01667785644531, + "logps/rejected": -100.94587707519531, + "loss": 0.1703, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.44724518060684204, + "rewards/margins": 2.5366241931915283, + "rewards/rejected": -2.9838690757751465, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 1.2917933130699087e-07, + "logits/chosen": -2.9869437217712402, + "logits/rejected": -2.9626340866088867, + "logps/chosen": -92.66294860839844, + "logps/rejected": -100.02458190917969, + "loss": 0.1652, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.6140026450157166, + "rewards/margins": 2.5113110542297363, + "rewards/rejected": -3.1253137588500977, + "step": 170 + }, + { + "epoch": 0.08, + "learning_rate": 1.3677811550151974e-07, + "logits/chosen": -2.9454994201660156, + "logits/rejected": -2.95060396194458, + "logps/chosen": -94.37382507324219, + "logps/rejected": -105.94987487792969, + "loss": 0.162, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.31808796525001526, + "rewards/margins": 3.3045032024383545, + "rewards/rejected": -3.622591495513916, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 1.4437689969604864e-07, + "logits/chosen": -2.9558322429656982, + "logits/rejected": -2.9460699558258057, + "logps/chosen": -96.69801330566406, + "logps/rejected": -107.38899230957031, + "loss": 0.1188, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.5944967269897461, + "rewards/margins": 3.4235737323760986, + "rewards/rejected": -4.018071174621582, + "step": 190 + }, + { + "epoch": 0.09, + "learning_rate": 1.519756838905775e-07, + "logits/chosen": -2.918544292449951, + "logits/rejected": -2.918253183364868, + "logps/chosen": -94.17301177978516, + "logps/rejected": -117.73758697509766, + "loss": 0.113, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9829786419868469, + "rewards/margins": 3.9101309776306152, + "rewards/rejected": -4.8931097984313965, + "step": 200 + }, + { + "epoch": 0.09, + "eval_logits/chosen": -3.029731273651123, + "eval_logits/rejected": -3.0103673934936523, + "eval_logps/chosen": -103.97273254394531, + "eval_logps/rejected": -118.95235443115234, + "eval_loss": 0.11712019145488739, + "eval_rewards/accuracies": 0.9611111283302307, + "eval_rewards/chosen": -1.2735543251037598, + "eval_rewards/margins": 3.6112213134765625, + "eval_rewards/rejected": -4.884775638580322, + "eval_runtime": 46.8222, + "eval_samples_per_second": 61.125, + "eval_steps_per_second": 1.922, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 1.5957446808510638e-07, + "logits/chosen": -2.966219663619995, + "logits/rejected": -2.940929889678955, + "logps/chosen": -102.27266693115234, + "logps/rejected": -124.5510482788086, + "loss": 0.1011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.482550859451294, + "rewards/margins": 3.7979836463928223, + "rewards/rejected": -5.280534744262695, + "step": 210 + }, + { + "epoch": 0.1, + "learning_rate": 1.6717325227963525e-07, + "logits/chosen": -2.9579319953918457, + "logits/rejected": -2.9625790119171143, + "logps/chosen": -108.02095794677734, + "logps/rejected": -129.6359100341797, + "loss": 0.1094, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.5162633657455444, + "rewards/margins": 4.2703537940979, + "rewards/rejected": -5.786617279052734, + "step": 220 + }, + { + "epoch": 0.1, + "learning_rate": 1.7477203647416414e-07, + "logits/chosen": -2.9127144813537598, + "logits/rejected": -2.913215398788452, + "logps/chosen": -110.11048889160156, + "logps/rejected": -133.09237670898438, + "loss": 0.0955, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6744592189788818, + "rewards/margins": 4.5804924964904785, + "rewards/rejected": -6.254951000213623, + "step": 230 + }, + { + "epoch": 0.11, + "learning_rate": 1.82370820668693e-07, + "logits/chosen": -2.966217517852783, + "logits/rejected": -2.9733166694641113, + "logps/chosen": -105.95328521728516, + "logps/rejected": -129.27061462402344, + "loss": 0.1055, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.3941030502319336, + "rewards/margins": 4.414058208465576, + "rewards/rejected": -5.80816125869751, + "step": 240 + }, + { + "epoch": 0.11, + "learning_rate": 1.8996960486322188e-07, + "logits/chosen": -2.907132148742676, + "logits/rejected": -2.9205126762390137, + "logps/chosen": -109.4215316772461, + "logps/rejected": -137.55592346191406, + "loss": 0.0969, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6198132038116455, + "rewards/margins": 5.059004783630371, + "rewards/rejected": -6.6788177490234375, + "step": 250 + }, + { + "epoch": 0.12, + "learning_rate": 1.9756838905775075e-07, + "logits/chosen": -2.8727142810821533, + "logits/rejected": -2.857184886932373, + "logps/chosen": -111.29930114746094, + "logps/rejected": -142.2863311767578, + "loss": 0.0981, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.1142921447753906, + "rewards/margins": 5.329843997955322, + "rewards/rejected": -7.444136142730713, + "step": 260 + }, + { + "epoch": 0.12, + "learning_rate": 2.0516717325227962e-07, + "logits/chosen": -2.8235843181610107, + "logits/rejected": -2.819584608078003, + "logps/chosen": -122.10441589355469, + "logps/rejected": -152.1237335205078, + "loss": 0.1112, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.893561601638794, + "rewards/margins": 4.696758270263672, + "rewards/rejected": -7.590319633483887, + "step": 270 + }, + { + "epoch": 0.13, + "learning_rate": 2.127659574468085e-07, + "logits/chosen": -2.894500255584717, + "logits/rejected": -2.8776357173919678, + "logps/chosen": -117.2232666015625, + "logps/rejected": -143.18972778320312, + "loss": 0.0812, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.439744710922241, + "rewards/margins": 4.922734260559082, + "rewards/rejected": -7.362478733062744, + "step": 280 + }, + { + "epoch": 0.13, + "learning_rate": 2.2036474164133736e-07, + "logits/chosen": -2.8788487911224365, + "logits/rejected": -2.88078236579895, + "logps/chosen": -117.56974029541016, + "logps/rejected": -153.3238525390625, + "loss": 0.0703, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.690272092819214, + "rewards/margins": 5.456068992614746, + "rewards/rejected": -8.146341323852539, + "step": 290 + }, + { + "epoch": 0.14, + "learning_rate": 2.2796352583586626e-07, + "logits/chosen": -2.8312017917633057, + "logits/rejected": -2.8483071327209473, + "logps/chosen": -119.8138656616211, + "logps/rejected": -156.4137725830078, + "loss": 0.0734, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.0532338619232178, + "rewards/margins": 5.62264347076416, + "rewards/rejected": -8.675877571105957, + "step": 300 + }, + { + "epoch": 0.14, + "eval_logits/chosen": -2.933035373687744, + "eval_logits/rejected": -2.90442156791687, + "eval_logps/chosen": -113.29481506347656, + "eval_logps/rejected": -142.68540954589844, + "eval_loss": 0.07679181545972824, + "eval_rewards/accuracies": 0.9777777791023254, + "eval_rewards/chosen": -2.2057607173919678, + "eval_rewards/margins": 5.05232048034668, + "eval_rewards/rejected": -7.258080959320068, + "eval_runtime": 47.4353, + "eval_samples_per_second": 60.335, + "eval_steps_per_second": 1.897, + "step": 300 + }, + { + "epoch": 0.14, + "learning_rate": 2.3556231003039513e-07, + "logits/chosen": -2.80320405960083, + "logits/rejected": -2.8362550735473633, + "logps/chosen": -113.18946838378906, + "logps/rejected": -147.78456115722656, + "loss": 0.0763, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.022726058959961, + "rewards/margins": 5.786845684051514, + "rewards/rejected": -7.809571743011475, + "step": 310 + }, + { + "epoch": 0.15, + "learning_rate": 2.43161094224924e-07, + "logits/chosen": -2.795830488204956, + "logits/rejected": -2.808887004852295, + "logps/chosen": -118.27098083496094, + "logps/rejected": -154.96951293945312, + "loss": 0.0719, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.9285690784454346, + "rewards/margins": 5.742952823638916, + "rewards/rejected": -8.67152214050293, + "step": 320 + }, + { + "epoch": 0.15, + "learning_rate": 2.507598784194529e-07, + "logits/chosen": -2.8046109676361084, + "logits/rejected": -2.817551612854004, + "logps/chosen": -118.660888671875, + "logps/rejected": -157.84669494628906, + "loss": 0.0757, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.268319606781006, + "rewards/margins": 6.382529258728027, + "rewards/rejected": -8.650848388671875, + "step": 330 + }, + { + "epoch": 0.16, + "learning_rate": 2.5835866261398174e-07, + "logits/chosen": -2.766819477081299, + "logits/rejected": -2.7905983924865723, + "logps/chosen": -114.24540710449219, + "logps/rejected": -155.21456909179688, + "loss": 0.0639, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.6502342224121094, + "rewards/margins": 6.039946556091309, + "rewards/rejected": -8.690180778503418, + "step": 340 + }, + { + "epoch": 0.16, + "learning_rate": 2.659574468085106e-07, + "logits/chosen": -2.7104740142822266, + "logits/rejected": -2.7129950523376465, + "logps/chosen": -118.3838882446289, + "logps/rejected": -163.64157104492188, + "loss": 0.0714, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.7546608448028564, + "rewards/margins": 6.270176887512207, + "rewards/rejected": -9.024836540222168, + "step": 350 + }, + { + "epoch": 0.16, + "learning_rate": 2.735562310030395e-07, + "logits/chosen": -2.6810834407806396, + "logits/rejected": -2.6747491359710693, + "logps/chosen": -117.26749420166016, + "logps/rejected": -165.12850952148438, + "loss": 0.0661, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.4232752323150635, + "rewards/margins": 7.114753723144531, + "rewards/rejected": -9.538028717041016, + "step": 360 + }, + { + "epoch": 0.17, + "learning_rate": 2.811550151975684e-07, + "logits/chosen": -2.7305426597595215, + "logits/rejected": -2.7588391304016113, + "logps/chosen": -119.80528259277344, + "logps/rejected": -160.10391235351562, + "loss": 0.0738, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.611847400665283, + "rewards/margins": 5.952840328216553, + "rewards/rejected": -8.564687728881836, + "step": 370 + }, + { + "epoch": 0.17, + "learning_rate": 2.887537993920973e-07, + "logits/chosen": -2.700817584991455, + "logits/rejected": -2.7069475650787354, + "logps/chosen": -127.48515319824219, + "logps/rejected": -173.31446838378906, + "loss": 0.0615, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.6581039428710938, + "rewards/margins": 6.759530544281006, + "rewards/rejected": -10.417635917663574, + "step": 380 + }, + { + "epoch": 0.18, + "learning_rate": 2.9635258358662614e-07, + "logits/chosen": -2.72932767868042, + "logits/rejected": -2.7426867485046387, + "logps/chosen": -128.69351196289062, + "logps/rejected": -170.44024658203125, + "loss": 0.0596, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.062074899673462, + "rewards/margins": 6.59927225112915, + "rewards/rejected": -9.661347389221191, + "step": 390 + }, + { + "epoch": 0.18, + "learning_rate": 3.03951367781155e-07, + "logits/chosen": -2.737363815307617, + "logits/rejected": -2.728015184402466, + "logps/chosen": -118.87040710449219, + "logps/rejected": -177.7161102294922, + "loss": 0.0587, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.8841772079467773, + "rewards/margins": 7.693885803222656, + "rewards/rejected": -10.57806396484375, + "step": 400 + }, + { + "epoch": 0.18, + "eval_logits/chosen": -2.727881669998169, + "eval_logits/rejected": -2.6966190338134766, + "eval_logps/chosen": -131.62547302246094, + "eval_logps/rejected": -187.50515747070312, + "eval_loss": 0.055923737585544586, + "eval_rewards/accuracies": 0.9694444537162781, + "eval_rewards/chosen": -4.0388264656066895, + "eval_rewards/margins": 7.701231479644775, + "eval_rewards/rejected": -11.740057945251465, + "eval_runtime": 49.6576, + "eval_samples_per_second": 57.635, + "eval_steps_per_second": 1.812, + "step": 400 + }, + { + "epoch": 0.19, + "learning_rate": 3.1155015197568383e-07, + "logits/chosen": -2.699958086013794, + "logits/rejected": -2.672335624694824, + "logps/chosen": -121.584716796875, + "logps/rejected": -185.4914093017578, + "loss": 0.0693, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.9695181846618652, + "rewards/margins": 8.059054374694824, + "rewards/rejected": -11.028572082519531, + "step": 410 + }, + { + "epoch": 0.19, + "learning_rate": 3.1914893617021275e-07, + "logits/chosen": -2.724929094314575, + "logits/rejected": -2.737008810043335, + "logps/chosen": -131.0947265625, + "logps/rejected": -174.7019805908203, + "loss": 0.0579, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.8909332752227783, + "rewards/margins": 6.2964396476745605, + "rewards/rejected": -10.187372207641602, + "step": 420 + }, + { + "epoch": 0.2, + "learning_rate": 3.267477203647416e-07, + "logits/chosen": -2.742372512817383, + "logits/rejected": -2.7384090423583984, + "logps/chosen": -120.4808120727539, + "logps/rejected": -170.55133056640625, + "loss": 0.0618, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.03888201713562, + "rewards/margins": 6.642469882965088, + "rewards/rejected": -9.681352615356445, + "step": 430 + }, + { + "epoch": 0.2, + "learning_rate": 3.343465045592705e-07, + "logits/chosen": -2.6667733192443848, + "logits/rejected": -2.596818447113037, + "logps/chosen": -120.44290924072266, + "logps/rejected": -168.67884826660156, + "loss": 0.0558, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.524181842803955, + "rewards/margins": 7.524356842041016, + "rewards/rejected": -10.048540115356445, + "step": 440 + }, + { + "epoch": 0.21, + "learning_rate": 3.4194528875379936e-07, + "logits/chosen": -2.6612601280212402, + "logits/rejected": -2.640688180923462, + "logps/chosen": -136.09475708007812, + "logps/rejected": -195.4031982421875, + "loss": 0.0597, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.248106002807617, + "rewards/margins": 8.086824417114258, + "rewards/rejected": -12.334931373596191, + "step": 450 + }, + { + "epoch": 0.21, + "learning_rate": 3.495440729483283e-07, + "logits/chosen": -2.6613001823425293, + "logits/rejected": -2.640634298324585, + "logps/chosen": -143.10586547851562, + "logps/rejected": -187.25064086914062, + "loss": 0.0674, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.273663520812988, + "rewards/margins": 7.212340354919434, + "rewards/rejected": -11.486001968383789, + "step": 460 + }, + { + "epoch": 0.21, + "learning_rate": 3.5714285714285716e-07, + "logits/chosen": -2.694119453430176, + "logits/rejected": -2.6654534339904785, + "logps/chosen": -135.16006469726562, + "logps/rejected": -191.01541137695312, + "loss": 0.0554, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.870119571685791, + "rewards/margins": 8.046853065490723, + "rewards/rejected": -11.916971206665039, + "step": 470 + }, + { + "epoch": 0.22, + "learning_rate": 3.64741641337386e-07, + "logits/chosen": -2.6668903827667236, + "logits/rejected": -2.668224334716797, + "logps/chosen": -135.64004516601562, + "logps/rejected": -191.2487335205078, + "loss": 0.0591, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9338130950927734, + "rewards/margins": 8.0016450881958, + "rewards/rejected": -11.935457229614258, + "step": 480 + }, + { + "epoch": 0.22, + "learning_rate": 3.7234042553191484e-07, + "logits/chosen": -2.7070086002349854, + "logits/rejected": -2.6857943534851074, + "logps/chosen": -123.0178451538086, + "logps/rejected": -179.15554809570312, + "loss": 0.0499, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.212670087814331, + "rewards/margins": 7.310342311859131, + "rewards/rejected": -10.5230131149292, + "step": 490 + }, + { + "epoch": 0.23, + "learning_rate": 3.7993920972644377e-07, + "logits/chosen": -2.6694531440734863, + "logits/rejected": -2.672365665435791, + "logps/chosen": -126.01292419433594, + "logps/rejected": -187.0902557373047, + "loss": 0.0379, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.865762710571289, + "rewards/margins": 7.9018754959106445, + "rewards/rejected": -11.767638206481934, + "step": 500 + }, + { + "epoch": 0.23, + "eval_logits/chosen": -2.701693296432495, + "eval_logits/rejected": -2.6612892150878906, + "eval_logps/chosen": -130.73841857910156, + "eval_logps/rejected": -192.8392791748047, + "eval_loss": 0.04595184698700905, + "eval_rewards/accuracies": 0.9750000238418579, + "eval_rewards/chosen": -3.950122117996216, + "eval_rewards/margins": 8.323347091674805, + "eval_rewards/rejected": -12.273469924926758, + "eval_runtime": 50.7966, + "eval_samples_per_second": 56.342, + "eval_steps_per_second": 1.772, + "step": 500 + }, + { + "epoch": 0.23, + "learning_rate": 3.8753799392097264e-07, + "logits/chosen": -2.6502652168273926, + "logits/rejected": -2.642637014389038, + "logps/chosen": -124.89344787597656, + "logps/rejected": -187.1622314453125, + "loss": 0.0445, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.712517499923706, + "rewards/margins": 7.8403496742248535, + "rewards/rejected": -11.55286693572998, + "step": 510 + }, + { + "epoch": 0.24, + "learning_rate": 3.951367781155015e-07, + "logits/chosen": -2.6770434379577637, + "logits/rejected": -2.6957175731658936, + "logps/chosen": -135.3203125, + "logps/rejected": -196.0937957763672, + "loss": 0.041, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.613642454147339, + "rewards/margins": 8.412160873413086, + "rewards/rejected": -12.025801658630371, + "step": 520 + }, + { + "epoch": 0.24, + "learning_rate": 4.027355623100304e-07, + "logits/chosen": -2.6510725021362305, + "logits/rejected": -2.675741195678711, + "logps/chosen": -118.3866958618164, + "logps/rejected": -188.92434692382812, + "loss": 0.0476, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.810616970062256, + "rewards/margins": 9.09221076965332, + "rewards/rejected": -11.902826309204102, + "step": 530 + }, + { + "epoch": 0.25, + "learning_rate": 4.1033434650455925e-07, + "logits/chosen": -2.644740581512451, + "logits/rejected": -2.6704249382019043, + "logps/chosen": -132.53656005859375, + "logps/rejected": -188.7894287109375, + "loss": 0.0484, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.122117042541504, + "rewards/margins": 7.6324262619018555, + "rewards/rejected": -11.754544258117676, + "step": 540 + }, + { + "epoch": 0.25, + "learning_rate": 4.179331306990881e-07, + "logits/chosen": -2.683001756668091, + "logits/rejected": -2.6826956272125244, + "logps/chosen": -139.59835815429688, + "logps/rejected": -194.3646240234375, + "loss": 0.037, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.069397926330566, + "rewards/margins": 8.341604232788086, + "rewards/rejected": -12.411002159118652, + "step": 550 + }, + { + "epoch": 0.26, + "learning_rate": 4.25531914893617e-07, + "logits/chosen": -2.7073171138763428, + "logits/rejected": -2.7193078994750977, + "logps/chosen": -121.34649658203125, + "logps/rejected": -194.57553100585938, + "loss": 0.0433, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9804375171661377, + "rewards/margins": 9.331927299499512, + "rewards/rejected": -12.312365531921387, + "step": 560 + }, + { + "epoch": 0.26, + "learning_rate": 4.3313069908814586e-07, + "logits/chosen": -2.630943775177002, + "logits/rejected": -2.6553235054016113, + "logps/chosen": -128.59909057617188, + "logps/rejected": -181.20632934570312, + "loss": 0.0505, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.6925315856933594, + "rewards/margins": 7.2038726806640625, + "rewards/rejected": -10.896404266357422, + "step": 570 + }, + { + "epoch": 0.26, + "learning_rate": 4.4072948328267473e-07, + "logits/chosen": -2.540254831314087, + "logits/rejected": -2.5618560314178467, + "logps/chosen": -124.41963195800781, + "logps/rejected": -184.33001708984375, + "loss": 0.0367, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.6125075817108154, + "rewards/margins": 8.0702543258667, + "rewards/rejected": -11.682764053344727, + "step": 580 + }, + { + "epoch": 0.27, + "learning_rate": 4.4832826747720365e-07, + "logits/chosen": -2.605494260787964, + "logits/rejected": -2.5571651458740234, + "logps/chosen": -130.73892211914062, + "logps/rejected": -192.65866088867188, + "loss": 0.0363, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.696895122528076, + "rewards/margins": 8.656723976135254, + "rewards/rejected": -12.353619575500488, + "step": 590 + }, + { + "epoch": 0.27, + "learning_rate": 4.559270516717325e-07, + "logits/chosen": -2.52860426902771, + "logits/rejected": -2.4886059761047363, + "logps/chosen": -126.6506118774414, + "logps/rejected": -203.23599243164062, + "loss": 0.0394, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.3456413745880127, + "rewards/margins": 9.531149864196777, + "rewards/rejected": -12.876792907714844, + "step": 600 + }, + { + "epoch": 0.27, + "eval_logits/chosen": -2.478564739227295, + "eval_logits/rejected": -2.390133857727051, + "eval_logps/chosen": -139.76361083984375, + "eval_logps/rejected": -215.0513916015625, + "eval_loss": 0.04050706699490547, + "eval_rewards/accuracies": 0.9750000238418579, + "eval_rewards/chosen": -4.85264253616333, + "eval_rewards/margins": 9.642037391662598, + "eval_rewards/rejected": -14.494680404663086, + "eval_runtime": 50.1674, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 1.794, + "step": 600 + }, + { + "epoch": 0.28, + "learning_rate": 4.635258358662614e-07, + "logits/chosen": -2.534883975982666, + "logits/rejected": -2.4874258041381836, + "logps/chosen": -131.05517578125, + "logps/rejected": -187.19418334960938, + "loss": 0.0537, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.8685455322265625, + "rewards/margins": 7.740345001220703, + "rewards/rejected": -11.60888957977295, + "step": 610 + }, + { + "epoch": 0.28, + "learning_rate": 4.7112462006079026e-07, + "logits/chosen": -2.5169684886932373, + "logits/rejected": -2.4521279335021973, + "logps/chosen": -126.732421875, + "logps/rejected": -196.65960693359375, + "loss": 0.0424, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.4762802124023438, + "rewards/margins": 9.320673942565918, + "rewards/rejected": -12.796956062316895, + "step": 620 + }, + { + "epoch": 0.29, + "learning_rate": 4.787234042553192e-07, + "logits/chosen": -2.539543867111206, + "logits/rejected": -2.4839327335357666, + "logps/chosen": -130.13665771484375, + "logps/rejected": -192.86544799804688, + "loss": 0.0569, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.813121795654297, + "rewards/margins": 7.917851448059082, + "rewards/rejected": -11.730974197387695, + "step": 630 + }, + { + "epoch": 0.29, + "learning_rate": 4.86322188449848e-07, + "logits/chosen": -2.5205817222595215, + "logits/rejected": -2.4679293632507324, + "logps/chosen": -114.6985092163086, + "logps/rejected": -168.59475708007812, + "loss": 0.0509, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.7615535259246826, + "rewards/margins": 6.972909450531006, + "rewards/rejected": -9.73446273803711, + "step": 640 + }, + { + "epoch": 0.3, + "learning_rate": 4.939209726443769e-07, + "logits/chosen": -2.5492513179779053, + "logits/rejected": -2.473130941390991, + "logps/chosen": -136.21096801757812, + "logps/rejected": -197.6907958984375, + "loss": 0.0363, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.211759090423584, + "rewards/margins": 9.240852355957031, + "rewards/rejected": -12.452611923217773, + "step": 650 + }, + { + "epoch": 0.3, + "learning_rate": 4.998309382924767e-07, + "logits/chosen": -2.629535675048828, + "logits/rejected": -2.5611090660095215, + "logps/chosen": -122.98139953613281, + "logps/rejected": -183.54830932617188, + "loss": 0.0477, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.009155750274658, + "rewards/margins": 8.353796005249023, + "rewards/rejected": -11.362951278686523, + "step": 660 + }, + { + "epoch": 0.31, + "learning_rate": 4.989856297548605e-07, + "logits/chosen": -2.580986738204956, + "logits/rejected": -2.527766704559326, + "logps/chosen": -131.6427001953125, + "logps/rejected": -198.3678741455078, + "loss": 0.0281, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.6659603118896484, + "rewards/margins": 8.929367065429688, + "rewards/rejected": -12.59532642364502, + "step": 670 + }, + { + "epoch": 0.31, + "learning_rate": 4.981403212172442e-07, + "logits/chosen": -2.6364097595214844, + "logits/rejected": -2.56335186958313, + "logps/chosen": -131.08419799804688, + "logps/rejected": -209.58523559570312, + "loss": 0.0257, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.2999167442321777, + "rewards/margins": 10.14362621307373, + "rewards/rejected": -13.44354248046875, + "step": 680 + }, + { + "epoch": 0.31, + "learning_rate": 4.97295012679628e-07, + "logits/chosen": -2.5585570335388184, + "logits/rejected": -2.560281276702881, + "logps/chosen": -135.70570373535156, + "logps/rejected": -207.83718872070312, + "loss": 0.0497, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.453864097595215, + "rewards/margins": 9.309991836547852, + "rewards/rejected": -13.76385498046875, + "step": 690 + }, + { + "epoch": 0.32, + "learning_rate": 4.964497041420119e-07, + "logits/chosen": -2.6161398887634277, + "logits/rejected": -2.57515811920166, + "logps/chosen": -128.3286895751953, + "logps/rejected": -198.37640380859375, + "loss": 0.0375, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.2940287590026855, + "rewards/margins": 9.220344543457031, + "rewards/rejected": -12.514373779296875, + "step": 700 + }, + { + "epoch": 0.32, + "eval_logits/chosen": -2.714183807373047, + "eval_logits/rejected": -2.6707613468170166, + "eval_logps/chosen": -128.3318634033203, + "eval_logps/rejected": -196.30885314941406, + "eval_loss": 0.037593573331832886, + "eval_rewards/accuracies": 0.9750000238418579, + "eval_rewards/chosen": -3.709465980529785, + "eval_rewards/margins": 8.910959243774414, + "eval_rewards/rejected": -12.6204252243042, + "eval_runtime": 48.2423, + "eval_samples_per_second": 59.326, + "eval_steps_per_second": 1.866, + "step": 700 + }, + { + "epoch": 0.32, + "learning_rate": 4.956043956043956e-07, + "logits/chosen": -2.620093822479248, + "logits/rejected": -2.5982906818389893, + "logps/chosen": -123.15608978271484, + "logps/rejected": -197.61636352539062, + "loss": 0.0404, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.266045331954956, + "rewards/margins": 9.135406494140625, + "rewards/rejected": -12.401453018188477, + "step": 710 + }, + { + "epoch": 0.33, + "learning_rate": 4.947590870667794e-07, + "logits/chosen": -2.595463275909424, + "logits/rejected": -2.5666444301605225, + "logps/chosen": -125.81497955322266, + "logps/rejected": -189.8935546875, + "loss": 0.0391, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.0596022605895996, + "rewards/margins": 8.901065826416016, + "rewards/rejected": -11.960668563842773, + "step": 720 + }, + { + "epoch": 0.33, + "learning_rate": 4.939137785291631e-07, + "logits/chosen": -2.632361650466919, + "logits/rejected": -2.584230899810791, + "logps/chosen": -112.73439025878906, + "logps/rejected": -181.985595703125, + "loss": 0.0396, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.658551812171936, + "rewards/margins": 9.849242210388184, + "rewards/rejected": -11.507793426513672, + "step": 730 + }, + { + "epoch": 0.34, + "learning_rate": 4.930684699915469e-07, + "logits/chosen": -2.561582565307617, + "logits/rejected": -2.539332151412964, + "logps/chosen": -106.41217041015625, + "logps/rejected": -188.4742889404297, + "loss": 0.0229, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.267642021179199, + "rewards/margins": 9.631690979003906, + "rewards/rejected": -11.899332046508789, + "step": 740 + }, + { + "epoch": 0.34, + "learning_rate": 4.922231614539306e-07, + "logits/chosen": -2.5166029930114746, + "logits/rejected": -2.4404571056365967, + "logps/chosen": -111.54460144042969, + "logps/rejected": -179.897216796875, + "loss": 0.047, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.5172572135925293, + "rewards/margins": 8.489733695983887, + "rewards/rejected": -11.006990432739258, + "step": 750 + }, + { + "epoch": 0.35, + "learning_rate": 4.913778529163144e-07, + "logits/chosen": -2.363140106201172, + "logits/rejected": -2.2299087047576904, + "logps/chosen": -137.92788696289062, + "logps/rejected": -216.5609893798828, + "loss": 0.0331, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.3311238288879395, + "rewards/margins": 9.85867691040039, + "rewards/rejected": -14.189801216125488, + "step": 760 + }, + { + "epoch": 0.35, + "learning_rate": 4.905325443786982e-07, + "logits/chosen": -2.3814444541931152, + "logits/rejected": -2.3055179119110107, + "logps/chosen": -131.71412658691406, + "logps/rejected": -202.88714599609375, + "loss": 0.0316, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.11533260345459, + "rewards/margins": 9.29047679901123, + "rewards/rejected": -13.405810356140137, + "step": 770 + }, + { + "epoch": 0.36, + "learning_rate": 4.896872358410819e-07, + "logits/chosen": -2.4633522033691406, + "logits/rejected": -2.408992290496826, + "logps/chosen": -125.52821350097656, + "logps/rejected": -182.40823364257812, + "loss": 0.0554, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.523876667022705, + "rewards/margins": 7.773106575012207, + "rewards/rejected": -11.296982765197754, + "step": 780 + }, + { + "epoch": 0.36, + "learning_rate": 4.888419273034658e-07, + "logits/chosen": -2.465280055999756, + "logits/rejected": -2.4070966243743896, + "logps/chosen": -126.39984130859375, + "logps/rejected": -201.7521514892578, + "loss": 0.0206, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.4645187854766846, + "rewards/margins": 9.297203063964844, + "rewards/rejected": -12.76172161102295, + "step": 790 + }, + { + "epoch": 0.37, + "learning_rate": 4.879966187658495e-07, + "logits/chosen": -2.2858943939208984, + "logits/rejected": -2.219574451446533, + "logps/chosen": -141.95816040039062, + "logps/rejected": -227.12216186523438, + "loss": 0.043, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.920865058898926, + "rewards/margins": 10.735105514526367, + "rewards/rejected": -15.655969619750977, + "step": 800 + }, + { + "epoch": 0.37, + "eval_logits/chosen": -2.282928705215454, + "eval_logits/rejected": -2.165402889251709, + "eval_logps/chosen": -143.88894653320312, + "eval_logps/rejected": -218.6447296142578, + "eval_loss": 0.03752221167087555, + "eval_rewards/accuracies": 0.9694444537162781, + "eval_rewards/chosen": -5.26517391204834, + "eval_rewards/margins": 9.588841438293457, + "eval_rewards/rejected": -14.85401439666748, + "eval_runtime": 49.0421, + "eval_samples_per_second": 58.358, + "eval_steps_per_second": 1.835, + "step": 800 + }, + { + "epoch": 0.37, + "learning_rate": 4.871513102282333e-07, + "logits/chosen": -2.3610994815826416, + "logits/rejected": -2.256997585296631, + "logps/chosen": -133.1490478515625, + "logps/rejected": -206.186279296875, + "loss": 0.0394, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.193153381347656, + "rewards/margins": 9.370244026184082, + "rewards/rejected": -13.563395500183105, + "step": 810 + }, + { + "epoch": 0.37, + "learning_rate": 4.863060016906171e-07, + "logits/chosen": -2.439469814300537, + "logits/rejected": -2.413252115249634, + "logps/chosen": -144.38931274414062, + "logps/rejected": -226.8816680908203, + "loss": 0.027, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.668347358703613, + "rewards/margins": 10.250940322875977, + "rewards/rejected": -15.919286727905273, + "step": 820 + }, + { + "epoch": 0.38, + "learning_rate": 4.854606931530008e-07, + "logits/chosen": -2.410789728164673, + "logits/rejected": -2.3912742137908936, + "logps/chosen": -149.9718017578125, + "logps/rejected": -236.6019287109375, + "loss": 0.0313, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.080855846405029, + "rewards/margins": 10.09882926940918, + "rewards/rejected": -16.179683685302734, + "step": 830 + }, + { + "epoch": 0.38, + "learning_rate": 4.846153846153846e-07, + "logits/chosen": -2.477858066558838, + "logits/rejected": -2.388430118560791, + "logps/chosen": -127.3630142211914, + "logps/rejected": -198.08287048339844, + "loss": 0.0423, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.9977195262908936, + "rewards/margins": 9.791177749633789, + "rewards/rejected": -12.788896560668945, + "step": 840 + }, + { + "epoch": 0.39, + "learning_rate": 4.837700760777683e-07, + "logits/chosen": -2.3127739429473877, + "logits/rejected": -2.2208588123321533, + "logps/chosen": -137.19290161132812, + "logps/rejected": -220.7669677734375, + "loss": 0.0374, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.068154811859131, + "rewards/margins": 9.750088691711426, + "rewards/rejected": -14.818242073059082, + "step": 850 + }, + { + "epoch": 0.39, + "learning_rate": 4.829247675401522e-07, + "logits/chosen": -2.288473606109619, + "logits/rejected": -2.130735158920288, + "logps/chosen": -147.62582397460938, + "logps/rejected": -239.2997589111328, + "loss": 0.0348, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.74985408782959, + "rewards/margins": 11.903969764709473, + "rewards/rejected": -16.653823852539062, + "step": 860 + }, + { + "epoch": 0.4, + "learning_rate": 4.820794590025358e-07, + "logits/chosen": -2.256855010986328, + "logits/rejected": -2.172144651412964, + "logps/chosen": -141.7345733642578, + "logps/rejected": -226.32278442382812, + "loss": 0.0351, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.722945213317871, + "rewards/margins": 10.656365394592285, + "rewards/rejected": -15.379310607910156, + "step": 870 + }, + { + "epoch": 0.4, + "learning_rate": 4.812341504649197e-07, + "logits/chosen": -2.3097877502441406, + "logits/rejected": -2.2116963863372803, + "logps/chosen": -128.104248046875, + "logps/rejected": -195.17971801757812, + "loss": 0.0512, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.658078193664551, + "rewards/margins": 9.50889778137207, + "rewards/rejected": -12.166976928710938, + "step": 880 + }, + { + "epoch": 0.41, + "learning_rate": 4.803888419273035e-07, + "logits/chosen": -2.209960460662842, + "logits/rejected": -2.1225123405456543, + "logps/chosen": -127.14057922363281, + "logps/rejected": -200.28134155273438, + "loss": 0.0407, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.6977546215057373, + "rewards/margins": 9.140750885009766, + "rewards/rejected": -12.838505744934082, + "step": 890 + }, + { + "epoch": 0.41, + "learning_rate": 4.795435333896872e-07, + "logits/chosen": -2.12211275100708, + "logits/rejected": -2.003075361251831, + "logps/chosen": -134.07156372070312, + "logps/rejected": -211.1444549560547, + "loss": 0.0304, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.154332160949707, + "rewards/margins": 9.661005973815918, + "rewards/rejected": -13.815338134765625, + "step": 900 + }, + { + "epoch": 0.41, + "eval_logits/chosen": -2.201496124267578, + "eval_logits/rejected": -2.1041057109832764, + "eval_logps/chosen": -133.43316650390625, + "eval_logps/rejected": -207.507080078125, + "eval_loss": 0.0380096472799778, + "eval_rewards/accuracies": 0.9750000238418579, + "eval_rewards/chosen": -4.219595432281494, + "eval_rewards/margins": 9.520654678344727, + "eval_rewards/rejected": -13.740249633789062, + "eval_runtime": 48.7922, + "eval_samples_per_second": 58.657, + "eval_steps_per_second": 1.845, + "step": 900 + }, + { + "epoch": 0.42, + "learning_rate": 4.78698224852071e-07, + "logits/chosen": -2.263471841812134, + "logits/rejected": -2.182316303253174, + "logps/chosen": -127.5316162109375, + "logps/rejected": -203.9865264892578, + "loss": 0.0425, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.45963978767395, + "rewards/margins": 9.635172843933105, + "rewards/rejected": -13.094813346862793, + "step": 910 + }, + { + "epoch": 0.42, + "learning_rate": 4.778529163144547e-07, + "logits/chosen": -2.2711265087127686, + "logits/rejected": -2.161175489425659, + "logps/chosen": -135.1485137939453, + "logps/rejected": -213.77218627929688, + "loss": 0.0296, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.044560432434082, + "rewards/margins": 10.215932846069336, + "rewards/rejected": -14.260492324829102, + "step": 920 + }, + { + "epoch": 0.42, + "learning_rate": 4.770076077768385e-07, + "logits/chosen": -2.209508180618286, + "logits/rejected": -2.0596439838409424, + "logps/chosen": -139.1815948486328, + "logps/rejected": -226.0141143798828, + "loss": 0.0457, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.9361088275909424, + "rewards/margins": 11.679204940795898, + "rewards/rejected": -15.615313529968262, + "step": 930 + }, + { + "epoch": 0.43, + "learning_rate": 4.761622992392223e-07, + "logits/chosen": -2.2528536319732666, + "logits/rejected": -2.1429479122161865, + "logps/chosen": -134.35842895507812, + "logps/rejected": -205.593994140625, + "loss": 0.0432, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.7717106342315674, + "rewards/margins": 9.324674606323242, + "rewards/rejected": -13.096386909484863, + "step": 940 + }, + { + "epoch": 0.43, + "learning_rate": 4.7531699070160606e-07, + "logits/chosen": -2.2223353385925293, + "logits/rejected": -2.1092305183410645, + "logps/chosen": -128.02865600585938, + "logps/rejected": -215.7021484375, + "loss": 0.032, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.7584633827209473, + "rewards/margins": 10.545815467834473, + "rewards/rejected": -14.304278373718262, + "step": 950 + }, + { + "epoch": 0.44, + "learning_rate": 4.7447168216398987e-07, + "logits/chosen": -2.3026297092437744, + "logits/rejected": -2.2772367000579834, + "logps/chosen": -122.90687561035156, + "logps/rejected": -187.3572998046875, + "loss": 0.0507, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.7807776927948, + "rewards/margins": 8.745773315429688, + "rewards/rejected": -11.526552200317383, + "step": 960 + }, + { + "epoch": 0.44, + "learning_rate": 4.7362637362637357e-07, + "logits/chosen": -2.32552170753479, + "logits/rejected": -2.2577829360961914, + "logps/chosen": -131.49630737304688, + "logps/rejected": -190.88980102539062, + "loss": 0.0398, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.0015668869018555, + "rewards/margins": 8.384259223937988, + "rewards/rejected": -12.385825157165527, + "step": 970 + }, + { + "epoch": 0.45, + "learning_rate": 4.727810650887574e-07, + "logits/chosen": -2.3235411643981934, + "logits/rejected": -2.289680004119873, + "logps/chosen": -123.10699462890625, + "logps/rejected": -198.24154663085938, + "loss": 0.0344, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.2337048053741455, + "rewards/margins": 9.35986328125, + "rewards/rejected": -12.593567848205566, + "step": 980 + }, + { + "epoch": 0.45, + "learning_rate": 4.7193575655114114e-07, + "logits/chosen": -2.24157977104187, + "logits/rejected": -2.1801581382751465, + "logps/chosen": -118.62461853027344, + "logps/rejected": -207.7017059326172, + "loss": 0.0297, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0132336616516113, + "rewards/margins": 10.910429954528809, + "rewards/rejected": -13.923663139343262, + "step": 990 + }, + { + "epoch": 0.46, + "learning_rate": 4.7109044801352495e-07, + "logits/chosen": -2.2588751316070557, + "logits/rejected": -2.164956569671631, + "logps/chosen": -125.435546875, + "logps/rejected": -215.0337371826172, + "loss": 0.0254, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2047088146209717, + "rewards/margins": 11.243847846984863, + "rewards/rejected": -14.44855785369873, + "step": 1000 + }, + { + "epoch": 0.46, + "eval_logits/chosen": -2.1849193572998047, + "eval_logits/rejected": -2.0859615802764893, + "eval_logps/chosen": -140.97811889648438, + "eval_logps/rejected": -231.99063110351562, + "eval_loss": 0.03243358060717583, + "eval_rewards/accuracies": 0.9722222089767456, + "eval_rewards/chosen": -4.974091529846191, + "eval_rewards/margins": 11.21451187133789, + "eval_rewards/rejected": -16.1886043548584, + "eval_runtime": 48.3459, + "eval_samples_per_second": 59.198, + "eval_steps_per_second": 1.862, + "step": 1000 + }, + { + "epoch": 0.46, + "learning_rate": 4.7024513947590865e-07, + "logits/chosen": -2.1504714488983154, + "logits/rejected": -2.056974411010742, + "logps/chosen": -134.57516479492188, + "logps/rejected": -228.6068878173828, + "loss": 0.0434, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.5758376121521, + "rewards/margins": 11.010690689086914, + "rewards/rejected": -15.586526870727539, + "step": 1010 + }, + { + "epoch": 0.47, + "learning_rate": 4.6939983093829246e-07, + "logits/chosen": -2.0870413780212402, + "logits/rejected": -2.02657151222229, + "logps/chosen": -139.79901123046875, + "logps/rejected": -235.48812866210938, + "loss": 0.0333, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.881598949432373, + "rewards/margins": 11.311357498168945, + "rewards/rejected": -16.192956924438477, + "step": 1020 + }, + { + "epoch": 0.47, + "learning_rate": 4.685545224006762e-07, + "logits/chosen": -2.180068016052246, + "logits/rejected": -2.1508913040161133, + "logps/chosen": -133.02784729003906, + "logps/rejected": -224.31314086914062, + "loss": 0.0336, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.727643013000488, + "rewards/margins": 10.623067855834961, + "rewards/rejected": -15.35071086883545, + "step": 1030 + }, + { + "epoch": 0.47, + "learning_rate": 4.6770921386306003e-07, + "logits/chosen": -2.2807860374450684, + "logits/rejected": -2.279236316680908, + "logps/chosen": -123.20567321777344, + "logps/rejected": -196.570068359375, + "loss": 0.0382, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.346115827560425, + "rewards/margins": 10.093769073486328, + "rewards/rejected": -12.4398832321167, + "step": 1040 + }, + { + "epoch": 0.48, + "learning_rate": 4.668639053254438e-07, + "logits/chosen": -2.246729850769043, + "logits/rejected": -2.1844513416290283, + "logps/chosen": -132.6374969482422, + "logps/rejected": -225.3860321044922, + "loss": 0.0263, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7644131183624268, + "rewards/margins": 11.513562202453613, + "rewards/rejected": -15.277974128723145, + "step": 1050 + }, + { + "epoch": 0.48, + "learning_rate": 4.660185967878275e-07, + "logits/chosen": -2.301955223083496, + "logits/rejected": -2.2792880535125732, + "logps/chosen": -125.2342758178711, + "logps/rejected": -197.12930297851562, + "loss": 0.0469, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.531769275665283, + "rewards/margins": 9.205384254455566, + "rewards/rejected": -12.737154006958008, + "step": 1060 + }, + { + "epoch": 0.49, + "learning_rate": 4.651732882502113e-07, + "logits/chosen": -2.2779886722564697, + "logits/rejected": -2.2681050300598145, + "logps/chosen": -126.11590576171875, + "logps/rejected": -211.461181640625, + "loss": 0.0292, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.280942440032959, + "rewards/margins": 10.733200073242188, + "rewards/rejected": -14.014142990112305, + "step": 1070 + }, + { + "epoch": 0.49, + "learning_rate": 4.6432797971259506e-07, + "logits/chosen": -2.2388012409210205, + "logits/rejected": -2.249535083770752, + "logps/chosen": -120.04434967041016, + "logps/rejected": -208.3643341064453, + "loss": 0.0316, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.521498203277588, + "rewards/margins": 11.035587310791016, + "rewards/rejected": -13.557085037231445, + "step": 1080 + }, + { + "epoch": 0.5, + "learning_rate": 4.6348267117497887e-07, + "logits/chosen": -2.1909823417663574, + "logits/rejected": -2.1376616954803467, + "logps/chosen": -125.14691162109375, + "logps/rejected": -222.58547973632812, + "loss": 0.0212, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.6583189964294434, + "rewards/margins": 11.045095443725586, + "rewards/rejected": -14.703417778015137, + "step": 1090 + }, + { + "epoch": 0.5, + "learning_rate": 4.626373626373626e-07, + "logits/chosen": -2.2102527618408203, + "logits/rejected": -2.206092119216919, + "logps/chosen": -128.36289978027344, + "logps/rejected": -216.8212432861328, + "loss": 0.03, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.7434325218200684, + "rewards/margins": 10.838982582092285, + "rewards/rejected": -14.582415580749512, + "step": 1100 + }, + { + "epoch": 0.5, + "eval_logits/chosen": -2.174874782562256, + "eval_logits/rejected": -2.11321759223938, + "eval_logps/chosen": -142.76707458496094, + "eval_logps/rejected": -234.18118286132812, + "eval_loss": 0.03423836827278137, + "eval_rewards/accuracies": 0.9666666388511658, + "eval_rewards/chosen": -5.152987480163574, + "eval_rewards/margins": 11.254671096801758, + "eval_rewards/rejected": -16.40765953063965, + "eval_runtime": 48.9677, + "eval_samples_per_second": 58.447, + "eval_steps_per_second": 1.838, + "step": 1100 + }, + { + "epoch": 0.51, + "learning_rate": 4.617920540997464e-07, + "logits/chosen": -2.216888904571533, + "logits/rejected": -2.1993260383605957, + "logps/chosen": -136.57183837890625, + "logps/rejected": -238.6022491455078, + "loss": 0.0393, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9422149658203125, + "rewards/margins": 12.420379638671875, + "rewards/rejected": -16.362594604492188, + "step": 1110 + }, + { + "epoch": 0.51, + "learning_rate": 4.6094674556213014e-07, + "logits/chosen": -2.2214226722717285, + "logits/rejected": -2.162144660949707, + "logps/chosen": -121.65068054199219, + "logps/rejected": -201.74705505371094, + "loss": 0.0358, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.384767532348633, + "rewards/margins": 10.979083061218262, + "rewards/rejected": -13.363850593566895, + "step": 1120 + }, + { + "epoch": 0.52, + "learning_rate": 4.6010143702451395e-07, + "logits/chosen": -2.278418779373169, + "logits/rejected": -2.2508838176727295, + "logps/chosen": -119.28004455566406, + "logps/rejected": -186.50210571289062, + "loss": 0.0381, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.392752170562744, + "rewards/margins": 8.599018096923828, + "rewards/rejected": -10.991769790649414, + "step": 1130 + }, + { + "epoch": 0.52, + "learning_rate": 4.592561284868977e-07, + "logits/chosen": -2.179384469985962, + "logits/rejected": -2.1243185997009277, + "logps/chosen": -121.19415283203125, + "logps/rejected": -204.47756958007812, + "loss": 0.0325, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.072291374206543, + "rewards/margins": 10.187277793884277, + "rewards/rejected": -13.25956916809082, + "step": 1140 + }, + { + "epoch": 0.52, + "learning_rate": 4.584108199492815e-07, + "logits/chosen": -2.1386260986328125, + "logits/rejected": -2.0709643363952637, + "logps/chosen": -129.10621643066406, + "logps/rejected": -223.4142608642578, + "loss": 0.0164, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.289857387542725, + "rewards/margins": 11.014361381530762, + "rewards/rejected": -15.304219245910645, + "step": 1150 + }, + { + "epoch": 0.53, + "learning_rate": 4.575655114116652e-07, + "logits/chosen": -2.069509744644165, + "logits/rejected": -1.9732223749160767, + "logps/chosen": -141.01705932617188, + "logps/rejected": -229.3037109375, + "loss": 0.0345, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.0492401123046875, + "rewards/margins": 11.49431324005127, + "rewards/rejected": -15.543553352355957, + "step": 1160 + }, + { + "epoch": 0.53, + "learning_rate": 4.56720202874049e-07, + "logits/chosen": -2.0551540851593018, + "logits/rejected": -1.969403862953186, + "logps/chosen": -128.4068603515625, + "logps/rejected": -243.1908416748047, + "loss": 0.0324, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.475301742553711, + "rewards/margins": 13.750211715698242, + "rewards/rejected": -17.225513458251953, + "step": 1170 + }, + { + "epoch": 0.54, + "learning_rate": 4.558748943364328e-07, + "logits/chosen": -1.974999189376831, + "logits/rejected": -1.8650972843170166, + "logps/chosen": -133.85598754882812, + "logps/rejected": -262.3040466308594, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.252058506011963, + "rewards/margins": 14.034210205078125, + "rewards/rejected": -19.286270141601562, + "step": 1180 + }, + { + "epoch": 0.54, + "learning_rate": 4.5502958579881655e-07, + "logits/chosen": -2.0906567573547363, + "logits/rejected": -1.9362144470214844, + "logps/chosen": -136.1277618408203, + "logps/rejected": -231.7950897216797, + "loss": 0.0395, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.7756996154785156, + "rewards/margins": 12.124837875366211, + "rewards/rejected": -15.900537490844727, + "step": 1190 + }, + { + "epoch": 0.55, + "learning_rate": 4.5418427726120036e-07, + "logits/chosen": -2.1612088680267334, + "logits/rejected": -2.0685601234436035, + "logps/chosen": -118.42198181152344, + "logps/rejected": -212.84268188476562, + "loss": 0.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.731152296066284, + "rewards/margins": 11.037375450134277, + "rewards/rejected": -13.768527030944824, + "step": 1200 + }, + { + "epoch": 0.55, + "eval_logits/chosen": -2.1124844551086426, + "eval_logits/rejected": -2.000330686569214, + "eval_logps/chosen": -118.87854766845703, + "eval_logps/rejected": -190.10202026367188, + "eval_loss": 0.031096385791897774, + "eval_rewards/accuracies": 0.9750000238418579, + "eval_rewards/chosen": -2.764134407043457, + "eval_rewards/margins": 9.23560619354248, + "eval_rewards/rejected": -11.999740600585938, + "eval_runtime": 48.5808, + "eval_samples_per_second": 58.912, + "eval_steps_per_second": 1.853, + "step": 1200 + }, + { + "epoch": 0.55, + "learning_rate": 4.5333896872358406e-07, + "logits/chosen": -2.116041898727417, + "logits/rejected": -1.9751415252685547, + "logps/chosen": -126.99690246582031, + "logps/rejected": -211.7047576904297, + "loss": 0.0263, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.922422409057617, + "rewards/margins": 10.941381454467773, + "rewards/rejected": -13.863802909851074, + "step": 1210 + }, + { + "epoch": 0.56, + "learning_rate": 4.5249366018596787e-07, + "logits/chosen": -2.0528061389923096, + "logits/rejected": -1.9215404987335205, + "logps/chosen": -123.11860656738281, + "logps/rejected": -220.2886199951172, + "loss": 0.0252, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4065489768981934, + "rewards/margins": 11.671735763549805, + "rewards/rejected": -15.078285217285156, + "step": 1220 + }, + { + "epoch": 0.56, + "learning_rate": 4.5164835164835163e-07, + "logits/chosen": -2.0894837379455566, + "logits/rejected": -1.946913480758667, + "logps/chosen": -125.61395263671875, + "logps/rejected": -206.34445190429688, + "loss": 0.0319, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.333749771118164, + "rewards/margins": 10.101171493530273, + "rewards/rejected": -13.434921264648438, + "step": 1230 + }, + { + "epoch": 0.57, + "learning_rate": 4.5080304311073544e-07, + "logits/chosen": -1.9018142223358154, + "logits/rejected": -1.7255821228027344, + "logps/chosen": -130.88623046875, + "logps/rejected": -230.283935546875, + "loss": 0.033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.72715425491333, + "rewards/margins": 12.118368148803711, + "rewards/rejected": -15.845524787902832, + "step": 1240 + }, + { + "epoch": 0.57, + "learning_rate": 4.4995773457311914e-07, + "logits/chosen": -1.8912999629974365, + "logits/rejected": -1.6973804235458374, + "logps/chosen": -140.52346801757812, + "logps/rejected": -240.0919189453125, + "loss": 0.0254, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.971592903137207, + "rewards/margins": 11.932929992675781, + "rewards/rejected": -16.904523849487305, + "step": 1250 + }, + { + "epoch": 0.58, + "learning_rate": 4.491124260355029e-07, + "logits/chosen": -2.0328290462493896, + "logits/rejected": -1.9123872518539429, + "logps/chosen": -135.49826049804688, + "logps/rejected": -225.2454833984375, + "loss": 0.0315, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.119205951690674, + "rewards/margins": 11.104761123657227, + "rewards/rejected": -15.223965644836426, + "step": 1260 + }, + { + "epoch": 0.58, + "learning_rate": 4.482671174978867e-07, + "logits/chosen": -2.009664297103882, + "logits/rejected": -1.873490333557129, + "logps/chosen": -130.07424926757812, + "logps/rejected": -234.64404296875, + "loss": 0.0303, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.6427292823791504, + "rewards/margins": 12.3717041015625, + "rewards/rejected": -16.01443099975586, + "step": 1270 + }, + { + "epoch": 0.58, + "learning_rate": 4.4742180896027047e-07, + "logits/chosen": -1.9943698644638062, + "logits/rejected": -1.8417961597442627, + "logps/chosen": -128.2240447998047, + "logps/rejected": -208.1425018310547, + "loss": 0.0423, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.6072349548339844, + "rewards/margins": 10.532011985778809, + "rewards/rejected": -14.139246940612793, + "step": 1280 + }, + { + "epoch": 0.59, + "learning_rate": 4.465765004226543e-07, + "logits/chosen": -2.0972723960876465, + "logits/rejected": -1.9810869693756104, + "logps/chosen": -124.97412109375, + "logps/rejected": -192.53623962402344, + "loss": 0.0272, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.600572109222412, + "rewards/margins": 8.653288841247559, + "rewards/rejected": -12.253861427307129, + "step": 1290 + }, + { + "epoch": 0.59, + "learning_rate": 4.45731191885038e-07, + "logits/chosen": -2.0243687629699707, + "logits/rejected": -1.8825792074203491, + "logps/chosen": -124.19642639160156, + "logps/rejected": -210.21359252929688, + "loss": 0.0489, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.705894708633423, + "rewards/margins": 10.930899620056152, + "rewards/rejected": -13.636795043945312, + "step": 1300 + }, + { + "epoch": 0.59, + "eval_logits/chosen": -2.2572124004364014, + "eval_logits/rejected": -2.173858880996704, + "eval_logps/chosen": -119.61163330078125, + "eval_logps/rejected": -194.4359130859375, + "eval_loss": 0.0271957665681839, + "eval_rewards/accuracies": 0.9833333492279053, + "eval_rewards/chosen": -2.837442398071289, + "eval_rewards/margins": 9.59568977355957, + "eval_rewards/rejected": -12.43313217163086, + "eval_runtime": 48.8158, + "eval_samples_per_second": 58.629, + "eval_steps_per_second": 1.844, + "step": 1300 + }, + { + "epoch": 0.6, + "learning_rate": 4.448858833474218e-07, + "logits/chosen": -2.195492744445801, + "logits/rejected": -2.124648094177246, + "logps/chosen": -123.89532470703125, + "logps/rejected": -201.10519409179688, + "loss": 0.031, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.475634813308716, + "rewards/margins": 9.291130065917969, + "rewards/rejected": -12.766763687133789, + "step": 1310 + }, + { + "epoch": 0.6, + "learning_rate": 4.4404057480980555e-07, + "logits/chosen": -2.1314010620117188, + "logits/rejected": -2.0682311058044434, + "logps/chosen": -120.6261215209961, + "logps/rejected": -214.7279815673828, + "loss": 0.0371, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.261993885040283, + "rewards/margins": 10.915969848632812, + "rewards/rejected": -14.177961349487305, + "step": 1320 + }, + { + "epoch": 0.61, + "learning_rate": 4.4319526627218936e-07, + "logits/chosen": -2.127857208251953, + "logits/rejected": -2.0334818363189697, + "logps/chosen": -123.88873291015625, + "logps/rejected": -227.59896850585938, + "loss": 0.0286, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.45524525642395, + "rewards/margins": 12.025127410888672, + "rewards/rejected": -15.480372428894043, + "step": 1330 + }, + { + "epoch": 0.61, + "learning_rate": 4.423499577345731e-07, + "logits/chosen": -2.1861257553100586, + "logits/rejected": -2.081003427505493, + "logps/chosen": -119.9664535522461, + "logps/rejected": -222.65853881835938, + "loss": 0.0317, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.040316343307495, + "rewards/margins": 12.255244255065918, + "rewards/rejected": -15.295560836791992, + "step": 1340 + }, + { + "epoch": 0.62, + "learning_rate": 4.4150464919695687e-07, + "logits/chosen": -2.072154998779297, + "logits/rejected": -1.9294564723968506, + "logps/chosen": -128.0748748779297, + "logps/rejected": -225.40939331054688, + "loss": 0.0233, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.229937553405762, + "rewards/margins": 11.25472354888916, + "rewards/rejected": -15.484660148620605, + "step": 1350 + }, + { + "epoch": 0.62, + "learning_rate": 4.4065934065934063e-07, + "logits/chosen": -2.095121383666992, + "logits/rejected": -1.9508041143417358, + "logps/chosen": -139.701904296875, + "logps/rejected": -224.1685791015625, + "loss": 0.0244, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.606893062591553, + "rewards/margins": 10.666409492492676, + "rewards/rejected": -15.273303031921387, + "step": 1360 + }, + { + "epoch": 0.63, + "learning_rate": 4.398140321217244e-07, + "logits/chosen": -2.2028756141662598, + "logits/rejected": -2.0891079902648926, + "logps/chosen": -124.32647705078125, + "logps/rejected": -217.9824981689453, + "loss": 0.0314, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.51623797416687, + "rewards/margins": 11.21560287475586, + "rewards/rejected": -14.731842041015625, + "step": 1370 + }, + { + "epoch": 0.63, + "learning_rate": 4.389687235841082e-07, + "logits/chosen": -2.18900990486145, + "logits/rejected": -2.0879733562469482, + "logps/chosen": -128.83123779296875, + "logps/rejected": -219.22506713867188, + "loss": 0.0356, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.9812655448913574, + "rewards/margins": 10.82696533203125, + "rewards/rejected": -14.80823040008545, + "step": 1380 + }, + { + "epoch": 0.63, + "learning_rate": 4.3812341504649195e-07, + "logits/chosen": -2.2501654624938965, + "logits/rejected": -2.13492488861084, + "logps/chosen": -117.8704833984375, + "logps/rejected": -204.0887451171875, + "loss": 0.0496, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5138015747070312, + "rewards/margins": 10.41077995300293, + "rewards/rejected": -12.924581527709961, + "step": 1390 + }, + { + "epoch": 0.64, + "learning_rate": 4.372781065088757e-07, + "logits/chosen": -2.152750253677368, + "logits/rejected": -2.0153543949127197, + "logps/chosen": -136.95753479003906, + "logps/rejected": -223.27490234375, + "loss": 0.0263, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.8503575325012207, + "rewards/margins": 11.449549674987793, + "rewards/rejected": -15.299906730651855, + "step": 1400 + }, + { + "epoch": 0.64, + "eval_logits/chosen": -2.2102935314178467, + "eval_logits/rejected": -2.081444263458252, + "eval_logps/chosen": -126.17691802978516, + "eval_logps/rejected": -202.63734436035156, + "eval_loss": 0.029057901352643967, + "eval_rewards/accuracies": 0.9833333492279053, + "eval_rewards/chosen": -3.4939706325531006, + "eval_rewards/margins": 9.75930404663086, + "eval_rewards/rejected": -13.253273963928223, + "eval_runtime": 49.2397, + "eval_samples_per_second": 58.124, + "eval_steps_per_second": 1.828, + "step": 1400 + }, + { + "epoch": 0.64, + "learning_rate": 4.3643279797125947e-07, + "logits/chosen": -2.223374128341675, + "logits/rejected": -2.125884771347046, + "logps/chosen": -130.22470092773438, + "logps/rejected": -208.01644897460938, + "loss": 0.0235, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.154832124710083, + "rewards/margins": 10.32645320892334, + "rewards/rejected": -13.481285095214844, + "step": 1410 + }, + { + "epoch": 0.65, + "learning_rate": 4.355874894336433e-07, + "logits/chosen": -2.1675026416778564, + "logits/rejected": -2.0408825874328613, + "logps/chosen": -129.0258331298828, + "logps/rejected": -226.79629516601562, + "loss": 0.0287, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.7398815155029297, + "rewards/margins": 11.839478492736816, + "rewards/rejected": -15.579358100891113, + "step": 1420 + }, + { + "epoch": 0.65, + "learning_rate": 4.3474218089602703e-07, + "logits/chosen": -2.213987350463867, + "logits/rejected": -2.105513572692871, + "logps/chosen": -130.8446807861328, + "logps/rejected": -215.7467041015625, + "loss": 0.04, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.016310214996338, + "rewards/margins": 10.332174301147461, + "rewards/rejected": -14.348484992980957, + "step": 1430 + }, + { + "epoch": 0.66, + "learning_rate": 4.3389687235841084e-07, + "logits/chosen": -2.158055067062378, + "logits/rejected": -2.044100284576416, + "logps/chosen": -128.62557983398438, + "logps/rejected": -219.5900115966797, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4990944862365723, + "rewards/margins": 11.490110397338867, + "rewards/rejected": -14.989204406738281, + "step": 1440 + }, + { + "epoch": 0.66, + "learning_rate": 4.3305156382079455e-07, + "logits/chosen": -2.1493687629699707, + "logits/rejected": -2.0363144874572754, + "logps/chosen": -134.43417358398438, + "logps/rejected": -224.71237182617188, + "loss": 0.0338, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8965301513671875, + "rewards/margins": 11.341170310974121, + "rewards/rejected": -15.237699508666992, + "step": 1450 + }, + { + "epoch": 0.67, + "learning_rate": 4.3220625528317836e-07, + "logits/chosen": -2.225033760070801, + "logits/rejected": -2.1324281692504883, + "logps/chosen": -142.65785217285156, + "logps/rejected": -234.6016387939453, + "loss": 0.0233, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.940446376800537, + "rewards/margins": 11.09235954284668, + "rewards/rejected": -16.03280258178711, + "step": 1460 + }, + { + "epoch": 0.67, + "learning_rate": 4.313609467455621e-07, + "logits/chosen": -2.2476277351379395, + "logits/rejected": -2.195188045501709, + "logps/chosen": -134.739990234375, + "logps/rejected": -223.62112426757812, + "loss": 0.0327, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.509375095367432, + "rewards/margins": 11.092538833618164, + "rewards/rejected": -15.601913452148438, + "step": 1470 + }, + { + "epoch": 0.68, + "learning_rate": 4.3051563820794587e-07, + "logits/chosen": -2.212526798248291, + "logits/rejected": -2.1159298419952393, + "logps/chosen": -132.11997985839844, + "logps/rejected": -234.01931762695312, + "loss": 0.0337, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.121611595153809, + "rewards/margins": 12.537824630737305, + "rewards/rejected": -16.659435272216797, + "step": 1480 + }, + { + "epoch": 0.68, + "learning_rate": 4.2967032967032963e-07, + "logits/chosen": -2.276444911956787, + "logits/rejected": -2.1930670738220215, + "logps/chosen": -121.98631286621094, + "logps/rejected": -213.0667724609375, + "loss": 0.0219, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8731472492218018, + "rewards/margins": 11.492844581604004, + "rewards/rejected": -14.365991592407227, + "step": 1490 + }, + { + "epoch": 0.68, + "learning_rate": 4.288250211327134e-07, + "logits/chosen": -2.245380163192749, + "logits/rejected": -2.15441632270813, + "logps/chosen": -124.43641662597656, + "logps/rejected": -232.03225708007812, + "loss": 0.0301, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0712151527404785, + "rewards/margins": 12.98668098449707, + "rewards/rejected": -16.05789566040039, + "step": 1500 + }, + { + "epoch": 0.68, + "eval_logits/chosen": -2.1682088375091553, + "eval_logits/rejected": -2.058032751083374, + "eval_logps/chosen": -138.2372283935547, + "eval_logps/rejected": -232.24818420410156, + "eval_loss": 0.026560302823781967, + "eval_rewards/accuracies": 0.9777777791023254, + "eval_rewards/chosen": -4.700002670288086, + "eval_rewards/margins": 11.514355659484863, + "eval_rewards/rejected": -16.2143611907959, + "eval_runtime": 48.5449, + "eval_samples_per_second": 58.956, + "eval_steps_per_second": 1.854, + "step": 1500 + }, + { + "epoch": 0.69, + "learning_rate": 4.279797125950972e-07, + "logits/chosen": -2.2608067989349365, + "logits/rejected": -2.193211555480957, + "logps/chosen": -124.4505386352539, + "logps/rejected": -219.6403350830078, + "loss": 0.0305, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.85394549369812, + "rewards/margins": 11.083495140075684, + "rewards/rejected": -14.937440872192383, + "step": 1510 + }, + { + "epoch": 0.69, + "learning_rate": 4.2713440405748095e-07, + "logits/chosen": -2.330735206604004, + "logits/rejected": -2.2386350631713867, + "logps/chosen": -121.85264587402344, + "logps/rejected": -214.347412109375, + "loss": 0.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3249454498291016, + "rewards/margins": 11.747182846069336, + "rewards/rejected": -14.07213020324707, + "step": 1520 + }, + { + "epoch": 0.7, + "learning_rate": 4.2628909551986476e-07, + "logits/chosen": -2.280710458755493, + "logits/rejected": -2.2095468044281006, + "logps/chosen": -127.70649719238281, + "logps/rejected": -224.08901977539062, + "loss": 0.0268, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.8593363761901855, + "rewards/margins": 11.323004722595215, + "rewards/rejected": -15.182344436645508, + "step": 1530 + }, + { + "epoch": 0.7, + "learning_rate": 4.2544378698224847e-07, + "logits/chosen": -2.1825404167175293, + "logits/rejected": -2.1189706325531006, + "logps/chosen": -132.66131591796875, + "logps/rejected": -222.04931640625, + "loss": 0.0252, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.108658790588379, + "rewards/margins": 10.970823287963867, + "rewards/rejected": -15.07948112487793, + "step": 1540 + }, + { + "epoch": 0.71, + "learning_rate": 4.245984784446323e-07, + "logits/chosen": -2.1728508472442627, + "logits/rejected": -2.0841972827911377, + "logps/chosen": -140.6467742919922, + "logps/rejected": -246.8183135986328, + "loss": 0.0284, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.1096906661987305, + "rewards/margins": 12.453695297241211, + "rewards/rejected": -17.563385009765625, + "step": 1550 + }, + { + "epoch": 0.71, + "learning_rate": 4.2375316990701604e-07, + "logits/chosen": -2.0678842067718506, + "logits/rejected": -1.8705580234527588, + "logps/chosen": -127.19903564453125, + "logps/rejected": -243.9872589111328, + "loss": 0.037, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.363131523132324, + "rewards/margins": 13.238340377807617, + "rewards/rejected": -17.60147476196289, + "step": 1560 + }, + { + "epoch": 0.72, + "learning_rate": 4.2290786136939985e-07, + "logits/chosen": -2.123220205307007, + "logits/rejected": -1.9888830184936523, + "logps/chosen": -130.83587646484375, + "logps/rejected": -227.7192840576172, + "loss": 0.0253, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.2909469604492188, + "rewards/margins": 12.203306198120117, + "rewards/rejected": -15.494253158569336, + "step": 1570 + }, + { + "epoch": 0.72, + "learning_rate": 4.220625528317836e-07, + "logits/chosen": -2.1660406589508057, + "logits/rejected": -2.0749552249908447, + "logps/chosen": -129.2559356689453, + "logps/rejected": -239.82839965820312, + "loss": 0.0281, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.8316562175750732, + "rewards/margins": 12.731201171875, + "rewards/rejected": -16.562856674194336, + "step": 1580 + }, + { + "epoch": 0.73, + "learning_rate": 4.212172442941673e-07, + "logits/chosen": -2.0430922508239746, + "logits/rejected": -1.9348751306533813, + "logps/chosen": -156.99769592285156, + "logps/rejected": -265.02130126953125, + "loss": 0.0288, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.00015926361084, + "rewards/margins": 12.735962867736816, + "rewards/rejected": -18.736122131347656, + "step": 1590 + }, + { + "epoch": 0.73, + "learning_rate": 4.203719357565511e-07, + "logits/chosen": -2.006765604019165, + "logits/rejected": -1.8920835256576538, + "logps/chosen": -159.15304565429688, + "logps/rejected": -254.2825469970703, + "loss": 0.0272, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.988160610198975, + "rewards/margins": 12.050902366638184, + "rewards/rejected": -18.0390625, + "step": 1600 + }, + { + "epoch": 0.73, + "eval_logits/chosen": -1.9654126167297363, + "eval_logits/rejected": -1.8091415166854858, + "eval_logps/chosen": -152.25767517089844, + "eval_logps/rejected": -251.39105224609375, + "eval_loss": 0.028284309431910515, + "eval_rewards/accuracies": 0.9666666388511658, + "eval_rewards/chosen": -6.102046489715576, + "eval_rewards/margins": 12.026601791381836, + "eval_rewards/rejected": -18.128646850585938, + "eval_runtime": 48.6434, + "eval_samples_per_second": 58.836, + "eval_steps_per_second": 1.85, + "step": 1600 + }, + { + "epoch": 0.73, + "learning_rate": 4.195266272189349e-07, + "logits/chosen": -1.9909400939941406, + "logits/rejected": -1.8486502170562744, + "logps/chosen": -139.93109130859375, + "logps/rejected": -240.50790405273438, + "loss": 0.0267, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.681200981140137, + "rewards/margins": 12.465145111083984, + "rewards/rejected": -17.146345138549805, + "step": 1610 + }, + { + "epoch": 0.74, + "learning_rate": 4.186813186813187e-07, + "logits/chosen": -1.9522764682769775, + "logits/rejected": -1.859828233718872, + "logps/chosen": -135.3563995361328, + "logps/rejected": -242.638916015625, + "loss": 0.0324, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.077095985412598, + "rewards/margins": 12.791072845458984, + "rewards/rejected": -16.868167877197266, + "step": 1620 + }, + { + "epoch": 0.74, + "learning_rate": 4.1783601014370244e-07, + "logits/chosen": -2.0774741172790527, + "logits/rejected": -1.9764686822891235, + "logps/chosen": -127.30684661865234, + "logps/rejected": -224.0484619140625, + "loss": 0.028, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.535048723220825, + "rewards/margins": 11.87446403503418, + "rewards/rejected": -15.409512519836426, + "step": 1630 + }, + { + "epoch": 0.75, + "learning_rate": 4.169907016060862e-07, + "logits/chosen": -2.080463409423828, + "logits/rejected": -1.9346845149993896, + "logps/chosen": -139.84872436523438, + "logps/rejected": -237.13134765625, + "loss": 0.0275, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.80663800239563, + "rewards/margins": 12.872383117675781, + "rewards/rejected": -16.679019927978516, + "step": 1640 + }, + { + "epoch": 0.75, + "learning_rate": 4.1614539306846996e-07, + "logits/chosen": -2.105950355529785, + "logits/rejected": -1.9851375818252563, + "logps/chosen": -136.1564178466797, + "logps/rejected": -218.47354125976562, + "loss": 0.0251, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.8997530937194824, + "rewards/margins": 10.47309398651123, + "rewards/rejected": -14.372848510742188, + "step": 1650 + }, + { + "epoch": 0.76, + "learning_rate": 4.1530008453085377e-07, + "logits/chosen": -2.0644431114196777, + "logits/rejected": -1.9539110660552979, + "logps/chosen": -122.35018157958984, + "logps/rejected": -213.6524200439453, + "loss": 0.0352, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.2302792072296143, + "rewards/margins": 11.085116386413574, + "rewards/rejected": -14.315394401550293, + "step": 1660 + }, + { + "epoch": 0.76, + "learning_rate": 4.144547759932375e-07, + "logits/chosen": -2.0494117736816406, + "logits/rejected": -1.943672776222229, + "logps/chosen": -128.10084533691406, + "logps/rejected": -206.52450561523438, + "loss": 0.0257, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.70310640335083, + "rewards/margins": 10.135705947875977, + "rewards/rejected": -13.838810920715332, + "step": 1670 + }, + { + "epoch": 0.77, + "learning_rate": 4.1360946745562133e-07, + "logits/chosen": -2.0471606254577637, + "logits/rejected": -1.9206764698028564, + "logps/chosen": -134.4086456298828, + "logps/rejected": -238.36083984375, + "loss": 0.0248, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.129271984100342, + "rewards/margins": 12.306023597717285, + "rewards/rejected": -16.435293197631836, + "step": 1680 + }, + { + "epoch": 0.77, + "learning_rate": 4.1276415891800504e-07, + "logits/chosen": -2.1813762187957764, + "logits/rejected": -2.0932717323303223, + "logps/chosen": -117.71382904052734, + "logps/rejected": -211.7388153076172, + "loss": 0.0204, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.775641918182373, + "rewards/margins": 11.458349227905273, + "rewards/rejected": -14.233988761901855, + "step": 1690 + }, + { + "epoch": 0.78, + "learning_rate": 4.119188503803888e-07, + "logits/chosen": -2.187344789505005, + "logits/rejected": -2.0936601161956787, + "logps/chosen": -127.6551513671875, + "logps/rejected": -225.92593383789062, + "loss": 0.0278, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.2443504333496094, + "rewards/margins": 11.978067398071289, + "rewards/rejected": -15.222417831420898, + "step": 1700 + }, + { + "epoch": 0.78, + "eval_logits/chosen": -2.244328260421753, + "eval_logits/rejected": -2.165691375732422, + "eval_logps/chosen": -121.15006256103516, + "eval_logps/rejected": -200.86061096191406, + "eval_loss": 0.02541803941130638, + "eval_rewards/accuracies": 0.9750000238418579, + "eval_rewards/chosen": -2.991286277770996, + "eval_rewards/margins": 10.084315299987793, + "eval_rewards/rejected": -13.075600624084473, + "eval_runtime": 48.58, + "eval_samples_per_second": 58.913, + "eval_steps_per_second": 1.853, + "step": 1700 + }, + { + "epoch": 0.78, + "learning_rate": 4.110735418427726e-07, + "logits/chosen": -2.2411341667175293, + "logits/rejected": -2.151401996612549, + "logps/chosen": -121.0505142211914, + "logps/rejected": -218.67831420898438, + "loss": 0.0161, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.7752506732940674, + "rewards/margins": 12.061071395874023, + "rewards/rejected": -14.836321830749512, + "step": 1710 + }, + { + "epoch": 0.79, + "learning_rate": 4.1022823330515636e-07, + "logits/chosen": -2.269857883453369, + "logits/rejected": -2.196040153503418, + "logps/chosen": -112.02171325683594, + "logps/rejected": -191.1605224609375, + "loss": 0.0237, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.076202869415283, + "rewards/margins": 10.068809509277344, + "rewards/rejected": -12.145011901855469, + "step": 1720 + }, + { + "epoch": 0.79, + "learning_rate": 4.0938292476754017e-07, + "logits/chosen": -2.2377099990844727, + "logits/rejected": -2.1565918922424316, + "logps/chosen": -110.47419738769531, + "logps/rejected": -200.7766571044922, + "loss": 0.0311, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1801040172576904, + "rewards/margins": 10.705169677734375, + "rewards/rejected": -12.885273933410645, + "step": 1730 + }, + { + "epoch": 0.79, + "learning_rate": 4.085376162299239e-07, + "logits/chosen": -2.2225890159606934, + "logits/rejected": -2.1323955059051514, + "logps/chosen": -120.49271392822266, + "logps/rejected": -214.49478149414062, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7662570476531982, + "rewards/margins": 11.611419677734375, + "rewards/rejected": -14.377676010131836, + "step": 1740 + }, + { + "epoch": 0.8, + "learning_rate": 4.076923076923077e-07, + "logits/chosen": -2.264892101287842, + "logits/rejected": -2.1854147911071777, + "logps/chosen": -111.61662292480469, + "logps/rejected": -202.98321533203125, + "loss": 0.0303, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.9836992025375366, + "rewards/margins": 11.16712474822998, + "rewards/rejected": -13.150823593139648, + "step": 1750 + }, + { + "epoch": 0.8, + "learning_rate": 4.0684699915469144e-07, + "logits/chosen": -2.3072378635406494, + "logits/rejected": -2.242619037628174, + "logps/chosen": -129.24737548828125, + "logps/rejected": -198.75900268554688, + "loss": 0.0347, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.685056209564209, + "rewards/margins": 8.488093376159668, + "rewards/rejected": -12.173150062561035, + "step": 1760 + }, + { + "epoch": 0.81, + "learning_rate": 4.0600169061707525e-07, + "logits/chosen": -2.2348296642303467, + "logits/rejected": -2.1900715827941895, + "logps/chosen": -114.0647964477539, + "logps/rejected": -217.90072631835938, + "loss": 0.0193, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.479679584503174, + "rewards/margins": 12.287931442260742, + "rewards/rejected": -14.767611503601074, + "step": 1770 + }, + { + "epoch": 0.81, + "learning_rate": 4.0515638207945896e-07, + "logits/chosen": -2.238379716873169, + "logits/rejected": -2.193352699279785, + "logps/chosen": -128.66403198242188, + "logps/rejected": -215.4800262451172, + "loss": 0.0408, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.537277936935425, + "rewards/margins": 10.639458656311035, + "rewards/rejected": -14.176734924316406, + "step": 1780 + }, + { + "epoch": 0.82, + "learning_rate": 4.043110735418427e-07, + "logits/chosen": -2.1671791076660156, + "logits/rejected": -2.1061270236968994, + "logps/chosen": -123.48826599121094, + "logps/rejected": -213.7099151611328, + "loss": 0.019, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.847043514251709, + "rewards/margins": 11.608076095581055, + "rewards/rejected": -14.455119132995605, + "step": 1790 + }, + { + "epoch": 0.82, + "learning_rate": 4.034657650042265e-07, + "logits/chosen": -2.0503573417663574, + "logits/rejected": -1.9893563985824585, + "logps/chosen": -140.46481323242188, + "logps/rejected": -256.7433166503906, + "loss": 0.0291, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.495582580566406, + "rewards/margins": 13.987375259399414, + "rewards/rejected": -18.48295783996582, + "step": 1800 + }, + { + "epoch": 0.82, + "eval_logits/chosen": -2.036374807357788, + "eval_logits/rejected": -1.9426956176757812, + "eval_logps/chosen": -140.2257843017578, + "eval_logps/rejected": -234.42259216308594, + "eval_loss": 0.025952113792300224, + "eval_rewards/accuracies": 0.9722222089767456, + "eval_rewards/chosen": -4.898858070373535, + "eval_rewards/margins": 11.532942771911621, + "eval_rewards/rejected": -16.43180274963379, + "eval_runtime": 49.2187, + "eval_samples_per_second": 58.149, + "eval_steps_per_second": 1.829, + "step": 1800 + }, + { + "epoch": 0.83, + "learning_rate": 4.026204564666103e-07, + "logits/chosen": -1.964010238647461, + "logits/rejected": -1.896442174911499, + "logps/chosen": -146.27426147460938, + "logps/rejected": -250.4436798095703, + "loss": 0.0145, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.305529594421387, + "rewards/margins": 12.814935684204102, + "rewards/rejected": -18.120464324951172, + "step": 1810 + }, + { + "epoch": 0.83, + "learning_rate": 4.017751479289941e-07, + "logits/chosen": -2.0868825912475586, + "logits/rejected": -1.9830644130706787, + "logps/chosen": -137.53465270996094, + "logps/rejected": -241.1505126953125, + "loss": 0.0254, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.553969383239746, + "rewards/margins": 12.338434219360352, + "rewards/rejected": -16.892404556274414, + "step": 1820 + }, + { + "epoch": 0.84, + "learning_rate": 4.009298393913778e-07, + "logits/chosen": -2.0947585105895996, + "logits/rejected": -2.0051965713500977, + "logps/chosen": -123.03267669677734, + "logps/rejected": -234.7122344970703, + "loss": 0.0212, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8716492652893066, + "rewards/margins": 12.589926719665527, + "rewards/rejected": -16.46157455444336, + "step": 1830 + }, + { + "epoch": 0.84, + "learning_rate": 4.000845308537616e-07, + "logits/chosen": -2.092618465423584, + "logits/rejected": -1.9781370162963867, + "logps/chosen": -134.5900421142578, + "logps/rejected": -260.53485107421875, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.325318336486816, + "rewards/margins": 13.917451858520508, + "rewards/rejected": -18.242769241333008, + "step": 1840 + }, + { + "epoch": 0.84, + "learning_rate": 3.9923922231614536e-07, + "logits/chosen": -2.052762508392334, + "logits/rejected": -1.9207212924957275, + "logps/chosen": -128.0731658935547, + "logps/rejected": -237.6293487548828, + "loss": 0.0192, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.202644348144531, + "rewards/margins": 12.438554763793945, + "rewards/rejected": -16.641199111938477, + "step": 1850 + }, + { + "epoch": 0.85, + "learning_rate": 3.9839391377852917e-07, + "logits/chosen": -2.051992416381836, + "logits/rejected": -1.9083547592163086, + "logps/chosen": -139.25662231445312, + "logps/rejected": -266.41278076171875, + "loss": 0.0262, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.40474271774292, + "rewards/margins": 14.871357917785645, + "rewards/rejected": -19.276100158691406, + "step": 1860 + }, + { + "epoch": 0.85, + "learning_rate": 3.9754860524091293e-07, + "logits/chosen": -1.98797607421875, + "logits/rejected": -1.8896089792251587, + "logps/chosen": -143.28067016601562, + "logps/rejected": -239.6479949951172, + "loss": 0.0244, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.469711780548096, + "rewards/margins": 12.147082328796387, + "rewards/rejected": -16.61679458618164, + "step": 1870 + }, + { + "epoch": 0.86, + "learning_rate": 3.967032967032967e-07, + "logits/chosen": -2.1141982078552246, + "logits/rejected": -2.009969711303711, + "logps/chosen": -125.37733459472656, + "logps/rejected": -214.2640380859375, + "loss": 0.0455, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.7521488666534424, + "rewards/margins": 10.471065521240234, + "rewards/rejected": -14.223215103149414, + "step": 1880 + }, + { + "epoch": 0.86, + "learning_rate": 3.9585798816568044e-07, + "logits/chosen": -2.096876621246338, + "logits/rejected": -2.0146164894104004, + "logps/chosen": -142.41055297851562, + "logps/rejected": -233.24954223632812, + "loss": 0.0222, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.191798686981201, + "rewards/margins": 11.954326629638672, + "rewards/rejected": -16.14612579345703, + "step": 1890 + }, + { + "epoch": 0.87, + "learning_rate": 3.950126796280642e-07, + "logits/chosen": -2.0792036056518555, + "logits/rejected": -1.9863141775131226, + "logps/chosen": -134.00045776367188, + "logps/rejected": -232.57553100585938, + "loss": 0.0253, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.6598198413848877, + "rewards/margins": 11.928800582885742, + "rewards/rejected": -15.588618278503418, + "step": 1900 + }, + { + "epoch": 0.87, + "eval_logits/chosen": -2.0360984802246094, + "eval_logits/rejected": -1.9098594188690186, + "eval_logps/chosen": -131.64044189453125, + "eval_logps/rejected": -223.32122802734375, + "eval_loss": 0.02517438866198063, + "eval_rewards/accuracies": 0.9777777791023254, + "eval_rewards/chosen": -4.0403242111206055, + "eval_rewards/margins": 11.28133773803711, + "eval_rewards/rejected": -15.321663856506348, + "eval_runtime": 49.6332, + "eval_samples_per_second": 57.663, + "eval_steps_per_second": 1.813, + "step": 1900 + }, + { + "epoch": 0.87, + "learning_rate": 3.94167371090448e-07, + "logits/chosen": -2.0065677165985107, + "logits/rejected": -1.8861503601074219, + "logps/chosen": -125.51326751708984, + "logps/rejected": -241.58425903320312, + "loss": 0.0167, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.468381881713867, + "rewards/margins": 13.503756523132324, + "rewards/rejected": -16.972139358520508, + "step": 1910 + }, + { + "epoch": 0.88, + "learning_rate": 3.9332206255283177e-07, + "logits/chosen": -2.0069472789764404, + "logits/rejected": -1.880659818649292, + "logps/chosen": -126.3309555053711, + "logps/rejected": -214.9152069091797, + "loss": 0.0505, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.8646080493927, + "rewards/margins": 10.696456909179688, + "rewards/rejected": -14.561065673828125, + "step": 1920 + }, + { + "epoch": 0.88, + "learning_rate": 3.924767540152155e-07, + "logits/chosen": -2.0810904502868652, + "logits/rejected": -1.9871727228164673, + "logps/chosen": -123.64485931396484, + "logps/rejected": -200.4116973876953, + "loss": 0.022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4952170848846436, + "rewards/margins": 10.431968688964844, + "rewards/rejected": -12.92718505859375, + "step": 1930 + }, + { + "epoch": 0.89, + "learning_rate": 3.916314454775993e-07, + "logits/chosen": -1.965213418006897, + "logits/rejected": -1.8638643026351929, + "logps/chosen": -123.869384765625, + "logps/rejected": -213.0116424560547, + "loss": 0.026, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.5767924785614014, + "rewards/margins": 11.060912132263184, + "rewards/rejected": -14.63770580291748, + "step": 1940 + }, + { + "epoch": 0.89, + "learning_rate": 3.907861369399831e-07, + "logits/chosen": -2.027801036834717, + "logits/rejected": -1.8885034322738647, + "logps/chosen": -115.36368560791016, + "logps/rejected": -218.797607421875, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2753968238830566, + "rewards/margins": 12.602838516235352, + "rewards/rejected": -14.87823486328125, + "step": 1950 + }, + { + "epoch": 0.89, + "learning_rate": 3.8994082840236685e-07, + "logits/chosen": -2.0216193199157715, + "logits/rejected": -1.917438268661499, + "logps/chosen": -136.36891174316406, + "logps/rejected": -224.12875366210938, + "loss": 0.0196, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.036121368408203, + "rewards/margins": 11.403396606445312, + "rewards/rejected": -15.4395170211792, + "step": 1960 + }, + { + "epoch": 0.9, + "learning_rate": 3.8909551986475066e-07, + "logits/chosen": -2.039957284927368, + "logits/rejected": -1.9109203815460205, + "logps/chosen": -123.08699798583984, + "logps/rejected": -234.58975219726562, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9215152263641357, + "rewards/margins": 13.935195922851562, + "rewards/rejected": -16.85671043395996, + "step": 1970 + }, + { + "epoch": 0.9, + "learning_rate": 3.8825021132713436e-07, + "logits/chosen": -2.0436453819274902, + "logits/rejected": -1.9319490194320679, + "logps/chosen": -127.397216796875, + "logps/rejected": -237.54476928710938, + "loss": 0.0271, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.8864715099334717, + "rewards/margins": 12.645808219909668, + "rewards/rejected": -16.53228187561035, + "step": 1980 + }, + { + "epoch": 0.91, + "learning_rate": 3.874049027895182e-07, + "logits/chosen": -2.097534656524658, + "logits/rejected": -1.9975534677505493, + "logps/chosen": -117.48333740234375, + "logps/rejected": -218.18301391601562, + "loss": 0.0189, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.8377158641815186, + "rewards/margins": 11.74644660949707, + "rewards/rejected": -14.584162712097168, + "step": 1990 + }, + { + "epoch": 0.91, + "learning_rate": 3.8655959425190193e-07, + "logits/chosen": -2.1223201751708984, + "logits/rejected": -2.0484108924865723, + "logps/chosen": -127.0828857421875, + "logps/rejected": -223.9336395263672, + "loss": 0.0235, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3377633094787598, + "rewards/margins": 11.691206932067871, + "rewards/rejected": -15.028970718383789, + "step": 2000 + }, + { + "epoch": 0.91, + "eval_logits/chosen": -2.157291889190674, + "eval_logits/rejected": -2.044750213623047, + "eval_logps/chosen": -123.21009826660156, + "eval_logps/rejected": -218.69635009765625, + "eval_loss": 0.022263653576374054, + "eval_rewards/accuracies": 0.9750000238418579, + "eval_rewards/chosen": -3.1972897052764893, + "eval_rewards/margins": 11.661887168884277, + "eval_rewards/rejected": -14.859176635742188, + "eval_runtime": 49.2513, + "eval_samples_per_second": 58.11, + "eval_steps_per_second": 1.827, + "step": 2000 + }, + { + "epoch": 0.92, + "learning_rate": 3.857142857142857e-07, + "logits/chosen": -2.145784378051758, + "logits/rejected": -2.039353132247925, + "logps/chosen": -118.86814880371094, + "logps/rejected": -231.6697540283203, + "loss": 0.0229, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.6437346935272217, + "rewards/margins": 13.244104385375977, + "rewards/rejected": -15.887837409973145, + "step": 2010 + }, + { + "epoch": 0.92, + "learning_rate": 3.8486897717666945e-07, + "logits/chosen": -2.0739400386810303, + "logits/rejected": -1.9888814687728882, + "logps/chosen": -115.21482849121094, + "logps/rejected": -220.90774536132812, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.819551706314087, + "rewards/margins": 11.673441886901855, + "rewards/rejected": -14.49299144744873, + "step": 2020 + }, + { + "epoch": 0.93, + "learning_rate": 3.840236686390532e-07, + "logits/chosen": -2.0994973182678223, + "logits/rejected": -2.0037684440612793, + "logps/chosen": -115.40962219238281, + "logps/rejected": -210.97677612304688, + "loss": 0.028, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4768853187561035, + "rewards/margins": 12.01233959197998, + "rewards/rejected": -14.489225387573242, + "step": 2030 + }, + { + "epoch": 0.93, + "learning_rate": 3.83178360101437e-07, + "logits/chosen": -2.139526844024658, + "logits/rejected": -2.0501785278320312, + "logps/chosen": -122.64337158203125, + "logps/rejected": -215.5990447998047, + "loss": 0.0437, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.588408946990967, + "rewards/margins": 11.165366172790527, + "rewards/rejected": -14.753776550292969, + "step": 2040 + }, + { + "epoch": 0.94, + "learning_rate": 3.8233305156382077e-07, + "logits/chosen": -2.2497003078460693, + "logits/rejected": -2.1790757179260254, + "logps/chosen": -126.99018859863281, + "logps/rejected": -205.12118530273438, + "loss": 0.0373, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9164180755615234, + "rewards/margins": 10.327306747436523, + "rewards/rejected": -13.243725776672363, + "step": 2050 + }, + { + "epoch": 0.94, + "learning_rate": 3.814877430262046e-07, + "logits/chosen": -2.2753875255584717, + "logits/rejected": -2.2370798587799072, + "logps/chosen": -117.94771575927734, + "logps/rejected": -200.80496215820312, + "loss": 0.0271, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.391221523284912, + "rewards/margins": 10.8762788772583, + "rewards/rejected": -13.267499923706055, + "step": 2060 + }, + { + "epoch": 0.94, + "learning_rate": 3.806424344885883e-07, + "logits/chosen": -2.2422733306884766, + "logits/rejected": -2.1572372913360596, + "logps/chosen": -113.2877426147461, + "logps/rejected": -220.81710815429688, + "loss": 0.0262, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4801509380340576, + "rewards/margins": 12.62053394317627, + "rewards/rejected": -15.100683212280273, + "step": 2070 + }, + { + "epoch": 0.95, + "learning_rate": 3.797971259509721e-07, + "logits/chosen": -2.186211109161377, + "logits/rejected": -2.132835865020752, + "logps/chosen": -116.13663482666016, + "logps/rejected": -209.317626953125, + "loss": 0.0248, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.663703441619873, + "rewards/margins": 11.38145637512207, + "rewards/rejected": -14.045160293579102, + "step": 2080 + }, + { + "epoch": 0.95, + "learning_rate": 3.7895181741335585e-07, + "logits/chosen": -2.203897476196289, + "logits/rejected": -2.1628737449645996, + "logps/chosen": -118.30888366699219, + "logps/rejected": -203.86724853515625, + "loss": 0.0264, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.612675428390503, + "rewards/margins": 10.86390495300293, + "rewards/rejected": -13.476580619812012, + "step": 2090 + }, + { + "epoch": 0.96, + "learning_rate": 3.7810650887573966e-07, + "logits/chosen": -2.103102922439575, + "logits/rejected": -2.016601800918579, + "logps/chosen": -119.75636291503906, + "logps/rejected": -212.2992401123047, + "loss": 0.0272, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.717388868331909, + "rewards/margins": 11.71783447265625, + "rewards/rejected": -14.435221672058105, + "step": 2100 + }, + { + "epoch": 0.96, + "eval_logits/chosen": -2.089155673980713, + "eval_logits/rejected": -1.9834376573562622, + "eval_logps/chosen": -127.06487274169922, + "eval_logps/rejected": -223.385498046875, + "eval_loss": 0.02362261526286602, + "eval_rewards/accuracies": 0.9750000238418579, + "eval_rewards/chosen": -3.5827677249908447, + "eval_rewards/margins": 11.745320320129395, + "eval_rewards/rejected": -15.328089714050293, + "eval_runtime": 48.1236, + "eval_samples_per_second": 59.472, + "eval_steps_per_second": 1.87, + "step": 2100 + }, + { + "epoch": 0.96, + "learning_rate": 3.772612003381234e-07, + "logits/chosen": -2.150611162185669, + "logits/rejected": -2.0834269523620605, + "logps/chosen": -117.68843078613281, + "logps/rejected": -215.09017944335938, + "loss": 0.0252, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.300182342529297, + "rewards/margins": 12.045984268188477, + "rewards/rejected": -14.346165657043457, + "step": 2110 + }, + { + "epoch": 0.97, + "learning_rate": 3.764158918005071e-07, + "logits/chosen": -2.176175832748413, + "logits/rejected": -2.113129138946533, + "logps/chosen": -121.611083984375, + "logps/rejected": -220.4056854248047, + "loss": 0.0175, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.795717477798462, + "rewards/margins": 12.352693557739258, + "rewards/rejected": -15.148412704467773, + "step": 2120 + }, + { + "epoch": 0.97, + "learning_rate": 3.7557058326289093e-07, + "logits/chosen": -2.181874990463257, + "logits/rejected": -2.119330883026123, + "logps/chosen": -128.18763732910156, + "logps/rejected": -238.7275848388672, + "loss": 0.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2725563049316406, + "rewards/margins": 13.210433959960938, + "rewards/rejected": -16.482990264892578, + "step": 2130 + }, + { + "epoch": 0.98, + "learning_rate": 3.747252747252747e-07, + "logits/chosen": -2.2190940380096436, + "logits/rejected": -2.1502795219421387, + "logps/chosen": -118.45751953125, + "logps/rejected": -222.6544189453125, + "loss": 0.0196, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5364670753479004, + "rewards/margins": 12.805526733398438, + "rewards/rejected": -15.34199333190918, + "step": 2140 + }, + { + "epoch": 0.98, + "learning_rate": 3.738799661876585e-07, + "logits/chosen": -2.198096990585327, + "logits/rejected": -2.1333017349243164, + "logps/chosen": -120.4395751953125, + "logps/rejected": -215.0843963623047, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.054147243499756, + "rewards/margins": 11.440935134887695, + "rewards/rejected": -14.495083808898926, + "step": 2150 + }, + { + "epoch": 0.99, + "learning_rate": 3.7303465765004226e-07, + "logits/chosen": -2.176610231399536, + "logits/rejected": -2.1423356533050537, + "logps/chosen": -126.62446594238281, + "logps/rejected": -223.6035614013672, + "loss": 0.0245, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.833204984664917, + "rewards/margins": 12.73131275177002, + "rewards/rejected": -15.564518928527832, + "step": 2160 + }, + { + "epoch": 0.99, + "learning_rate": 3.72189349112426e-07, + "logits/chosen": -2.1511826515197754, + "logits/rejected": -2.072523355484009, + "logps/chosen": -133.12416076660156, + "logps/rejected": -233.90939331054688, + "loss": 0.0227, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2800636291503906, + "rewards/margins": 12.904279708862305, + "rewards/rejected": -16.184341430664062, + "step": 2170 + }, + { + "epoch": 0.99, + "learning_rate": 3.7134404057480977e-07, + "logits/chosen": -2.2596356868743896, + "logits/rejected": -2.1797854900360107, + "logps/chosen": -109.3218994140625, + "logps/rejected": -208.5428009033203, + "loss": 0.0244, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.163811206817627, + "rewards/margins": 11.893083572387695, + "rewards/rejected": -14.056894302368164, + "step": 2180 + }, + { + "epoch": 1.0, + "learning_rate": 3.704987320371936e-07, + "logits/chosen": -2.264486789703369, + "logits/rejected": -2.2104756832122803, + "logps/chosen": -116.32193756103516, + "logps/rejected": -205.1827392578125, + "loss": 0.0121, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.8635953664779663, + "rewards/margins": 11.354402542114258, + "rewards/rejected": -13.217997550964355, + "step": 2190 + }, + { + "epoch": 1.0, + "learning_rate": 3.6965342349957734e-07, + "logits/chosen": -2.2398998737335205, + "logits/rejected": -2.144261121749878, + "logps/chosen": -117.9054183959961, + "logps/rejected": -221.7399139404297, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8189895153045654, + "rewards/margins": 12.08039665222168, + "rewards/rejected": -14.899385452270508, + "step": 2200 + }, + { + "epoch": 1.0, + "eval_logits/chosen": -2.265789270401001, + "eval_logits/rejected": -2.1606945991516113, + "eval_logps/chosen": -117.26538848876953, + "eval_logps/rejected": -213.8473358154297, + "eval_loss": 0.020574109628796577, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -2.6028175354003906, + "eval_rewards/margins": 11.771455764770508, + "eval_rewards/rejected": -14.374273300170898, + "eval_runtime": 48.9012, + "eval_samples_per_second": 58.526, + "eval_steps_per_second": 1.84, + "step": 2200 + }, + { + "epoch": 1.01, + "learning_rate": 3.6880811496196115e-07, + "logits/chosen": -2.2239885330200195, + "logits/rejected": -2.099027156829834, + "logps/chosen": -117.2328872680664, + "logps/rejected": -240.2711944580078, + "loss": 0.0122, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.659773588180542, + "rewards/margins": 13.949972152709961, + "rewards/rejected": -16.6097469329834, + "step": 2210 + }, + { + "epoch": 1.01, + "learning_rate": 3.6796280642434485e-07, + "logits/chosen": -2.224212169647217, + "logits/rejected": -2.1168534755706787, + "logps/chosen": -119.9342269897461, + "logps/rejected": -238.63882446289062, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.521542549133301, + "rewards/margins": 13.85914134979248, + "rewards/rejected": -16.38068199157715, + "step": 2220 + }, + { + "epoch": 1.02, + "learning_rate": 3.671174978867286e-07, + "logits/chosen": -2.223118543624878, + "logits/rejected": -2.092538356781006, + "logps/chosen": -116.990234375, + "logps/rejected": -240.6658477783203, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.179953098297119, + "rewards/margins": 14.794723510742188, + "rewards/rejected": -16.974674224853516, + "step": 2230 + }, + { + "epoch": 1.02, + "learning_rate": 3.662721893491124e-07, + "logits/chosen": -2.176121234893799, + "logits/rejected": -2.0341637134552, + "logps/chosen": -129.12535095214844, + "logps/rejected": -250.8475341796875, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3982722759246826, + "rewards/margins": 14.736839294433594, + "rewards/rejected": -18.135112762451172, + "step": 2240 + }, + { + "epoch": 1.03, + "learning_rate": 3.654268808114962e-07, + "logits/chosen": -2.2421722412109375, + "logits/rejected": -2.151844024658203, + "logps/chosen": -111.7896957397461, + "logps/rejected": -223.0345458984375, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0418715476989746, + "rewards/margins": 13.331483840942383, + "rewards/rejected": -15.373356819152832, + "step": 2250 + }, + { + "epoch": 1.03, + "learning_rate": 3.6458157227387994e-07, + "logits/chosen": -2.268448829650879, + "logits/rejected": -2.1708810329437256, + "logps/chosen": -111.51778411865234, + "logps/rejected": -217.6341552734375, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3873281478881836, + "rewards/margins": 13.243179321289062, + "rewards/rejected": -14.63050651550293, + "step": 2260 + }, + { + "epoch": 1.04, + "learning_rate": 3.637362637362637e-07, + "logits/chosen": -2.2276995182037354, + "logits/rejected": -2.134836196899414, + "logps/chosen": -118.00798034667969, + "logps/rejected": -232.2769775390625, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6417064666748047, + "rewards/margins": 13.054730415344238, + "rewards/rejected": -15.696436882019043, + "step": 2270 + }, + { + "epoch": 1.04, + "learning_rate": 3.628909551986475e-07, + "logits/chosen": -2.2070775032043457, + "logits/rejected": -2.111219644546509, + "logps/chosen": -123.40742492675781, + "logps/rejected": -239.35726928710938, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2174816131591797, + "rewards/margins": 13.53497314453125, + "rewards/rejected": -16.752452850341797, + "step": 2280 + }, + { + "epoch": 1.05, + "learning_rate": 3.6204564666103126e-07, + "logits/chosen": -2.104203701019287, + "logits/rejected": -1.9640731811523438, + "logps/chosen": -128.63992309570312, + "logps/rejected": -239.8550262451172, + "loss": 0.0094, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.4442787170410156, + "rewards/margins": 13.817400932312012, + "rewards/rejected": -17.261680603027344, + "step": 2290 + }, + { + "epoch": 1.05, + "learning_rate": 3.6120033812341507e-07, + "logits/chosen": -2.0865983963012695, + "logits/rejected": -1.9611084461212158, + "logps/chosen": -129.01255798339844, + "logps/rejected": -271.47900390625, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7618050575256348, + "rewards/margins": 16.001571655273438, + "rewards/rejected": -19.763378143310547, + "step": 2300 + }, + { + "epoch": 1.05, + "eval_logits/chosen": -2.055070638656616, + "eval_logits/rejected": -1.8950544595718384, + "eval_logps/chosen": -133.615234375, + "eval_logps/rejected": -254.9934844970703, + "eval_loss": 0.020873844623565674, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -4.237802505493164, + "eval_rewards/margins": 14.251086235046387, + "eval_rewards/rejected": -18.488889694213867, + "eval_runtime": 48.722, + "eval_samples_per_second": 58.741, + "eval_steps_per_second": 1.847, + "step": 2300 + }, + { + "epoch": 1.05, + "learning_rate": 3.603550295857988e-07, + "logits/chosen": -2.0648961067199707, + "logits/rejected": -1.911163568496704, + "logps/chosen": -124.85935974121094, + "logps/rejected": -266.6103210449219, + "loss": 0.0118, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.6062381267547607, + "rewards/margins": 15.78801155090332, + "rewards/rejected": -19.394250869750977, + "step": 2310 + }, + { + "epoch": 1.06, + "learning_rate": 3.5950972104818253e-07, + "logits/chosen": -2.039515733718872, + "logits/rejected": -1.8882137537002563, + "logps/chosen": -128.62649536132812, + "logps/rejected": -267.51885986328125, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.149783134460449, + "rewards/margins": 15.50981616973877, + "rewards/rejected": -19.65959930419922, + "step": 2320 + }, + { + "epoch": 1.06, + "learning_rate": 3.5866441251056634e-07, + "logits/chosen": -2.0585365295410156, + "logits/rejected": -1.901302695274353, + "logps/chosen": -135.6258087158203, + "logps/rejected": -253.69482421875, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.203586578369141, + "rewards/margins": 14.143606185913086, + "rewards/rejected": -18.347192764282227, + "step": 2330 + }, + { + "epoch": 1.07, + "learning_rate": 3.578191039729501e-07, + "logits/chosen": -2.0926547050476074, + "logits/rejected": -1.9490468502044678, + "logps/chosen": -123.04887390136719, + "logps/rejected": -267.77203369140625, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0432724952697754, + "rewards/margins": 16.452411651611328, + "rewards/rejected": -19.495681762695312, + "step": 2340 + }, + { + "epoch": 1.07, + "learning_rate": 3.569737954353339e-07, + "logits/chosen": -2.0938773155212402, + "logits/rejected": -1.9247863292694092, + "logps/chosen": -127.2049331665039, + "logps/rejected": -267.74725341796875, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.875643730163574, + "rewards/margins": 16.645126342773438, + "rewards/rejected": -19.520769119262695, + "step": 2350 + }, + { + "epoch": 1.08, + "learning_rate": 3.561284868977176e-07, + "logits/chosen": -2.120603322982788, + "logits/rejected": -1.9793163537979126, + "logps/chosen": -119.56201171875, + "logps/rejected": -280.1813659667969, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0645883083343506, + "rewards/margins": 17.966808319091797, + "rewards/rejected": -21.03139877319336, + "step": 2360 + }, + { + "epoch": 1.08, + "learning_rate": 3.552831783601014e-07, + "logits/chosen": -2.135152578353882, + "logits/rejected": -1.9979181289672852, + "logps/chosen": -132.65444946289062, + "logps/rejected": -299.92486572265625, + "loss": 0.011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.212158679962158, + "rewards/margins": 18.336050033569336, + "rewards/rejected": -22.548208236694336, + "step": 2370 + }, + { + "epoch": 1.09, + "learning_rate": 3.544378698224852e-07, + "logits/chosen": -2.131582260131836, + "logits/rejected": -1.9988138675689697, + "logps/chosen": -138.9375457763672, + "logps/rejected": -296.9496154785156, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.4065937995910645, + "rewards/margins": 17.19466781616211, + "rewards/rejected": -22.601261138916016, + "step": 2380 + }, + { + "epoch": 1.09, + "learning_rate": 3.53592561284869e-07, + "logits/chosen": -2.23891282081604, + "logits/rejected": -2.1590425968170166, + "logps/chosen": -122.85921478271484, + "logps/rejected": -241.72793579101562, + "loss": 0.0136, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.6361231803894043, + "rewards/margins": 13.326749801635742, + "rewards/rejected": -16.962873458862305, + "step": 2390 + }, + { + "epoch": 1.1, + "learning_rate": 3.5274725274725275e-07, + "logits/chosen": -2.228616714477539, + "logits/rejected": -2.143519878387451, + "logps/chosen": -121.30659484863281, + "logps/rejected": -244.88333129882812, + "loss": 0.0103, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.2300007343292236, + "rewards/margins": 13.734130859375, + "rewards/rejected": -16.964130401611328, + "step": 2400 + }, + { + "epoch": 1.1, + "eval_logits/chosen": -2.257730722427368, + "eval_logits/rejected": -2.151594877243042, + "eval_logps/chosen": -129.5041961669922, + "eval_logps/rejected": -237.86705017089844, + "eval_loss": 0.022165490314364433, + "eval_rewards/accuracies": 0.9777777791023254, + "eval_rewards/chosen": -3.826699733734131, + "eval_rewards/margins": 12.949548721313477, + "eval_rewards/rejected": -16.776248931884766, + "eval_runtime": 48.3089, + "eval_samples_per_second": 59.244, + "eval_steps_per_second": 1.863, + "step": 2400 + }, + { + "epoch": 1.1, + "learning_rate": 3.519019442096365e-07, + "logits/chosen": -2.211165428161621, + "logits/rejected": -2.1267261505126953, + "logps/chosen": -128.97445678710938, + "logps/rejected": -244.82559204101562, + "loss": 0.0184, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.3847079277038574, + "rewards/margins": 13.957148551940918, + "rewards/rejected": -17.341854095458984, + "step": 2410 + }, + { + "epoch": 1.1, + "learning_rate": 3.5105663567202026e-07, + "logits/chosen": -2.083887815475464, + "logits/rejected": -1.9280351400375366, + "logps/chosen": -133.95697021484375, + "logps/rejected": -273.7509765625, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.235071659088135, + "rewards/margins": 16.102216720581055, + "rewards/rejected": -20.33728790283203, + "step": 2420 + }, + { + "epoch": 1.11, + "learning_rate": 3.50211327134404e-07, + "logits/chosen": -2.0661416053771973, + "logits/rejected": -1.9433352947235107, + "logps/chosen": -131.2800750732422, + "logps/rejected": -283.74798583984375, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.225616931915283, + "rewards/margins": 16.740453720092773, + "rewards/rejected": -20.9660701751709, + "step": 2430 + }, + { + "epoch": 1.11, + "learning_rate": 3.4936601859678783e-07, + "logits/chosen": -2.0689034461975098, + "logits/rejected": -1.9006198644638062, + "logps/chosen": -140.2222900390625, + "logps/rejected": -282.52154541015625, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.639592170715332, + "rewards/margins": 16.51576805114746, + "rewards/rejected": -21.155359268188477, + "step": 2440 + }, + { + "epoch": 1.12, + "learning_rate": 3.485207100591716e-07, + "logits/chosen": -2.069180965423584, + "logits/rejected": -1.8828113079071045, + "logps/chosen": -132.38973999023438, + "logps/rejected": -282.66888427734375, + "loss": 0.008, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.3987526893615723, + "rewards/margins": 17.470693588256836, + "rewards/rejected": -20.86944580078125, + "step": 2450 + }, + { + "epoch": 1.12, + "learning_rate": 3.4767540152155534e-07, + "logits/chosen": -2.0478618144989014, + "logits/rejected": -1.8923364877700806, + "logps/chosen": -137.94956970214844, + "logps/rejected": -262.4928283691406, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.1068878173828125, + "rewards/margins": 14.977930068969727, + "rewards/rejected": -19.08481788635254, + "step": 2460 + }, + { + "epoch": 1.13, + "learning_rate": 3.468300929839391e-07, + "logits/chosen": -2.032320499420166, + "logits/rejected": -1.8694692850112915, + "logps/chosen": -129.439697265625, + "logps/rejected": -256.06854248046875, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.800882339477539, + "rewards/margins": 14.736040115356445, + "rewards/rejected": -18.53692054748535, + "step": 2470 + }, + { + "epoch": 1.13, + "learning_rate": 3.459847844463229e-07, + "logits/chosen": -1.9632114171981812, + "logits/rejected": -1.772853136062622, + "logps/chosen": -137.02362060546875, + "logps/rejected": -271.421875, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.54182243347168, + "rewards/margins": 15.439663887023926, + "rewards/rejected": -19.981487274169922, + "step": 2480 + }, + { + "epoch": 1.14, + "learning_rate": 3.4513947590870667e-07, + "logits/chosen": -1.9685713052749634, + "logits/rejected": -1.7855579853057861, + "logps/chosen": -116.3403549194336, + "logps/rejected": -269.75408935546875, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9411520957946777, + "rewards/margins": 16.898277282714844, + "rewards/rejected": -19.839427947998047, + "step": 2490 + }, + { + "epoch": 1.14, + "learning_rate": 3.442941673710904e-07, + "logits/chosen": -1.9229958057403564, + "logits/rejected": -1.7180604934692383, + "logps/chosen": -141.33531188964844, + "logps/rejected": -287.43255615234375, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.679624080657959, + "rewards/margins": 17.07419204711914, + "rewards/rejected": -21.753812789916992, + "step": 2500 + }, + { + "epoch": 1.14, + "eval_logits/chosen": -1.8552100658416748, + "eval_logits/rejected": -1.660325288772583, + "eval_logps/chosen": -147.8096923828125, + "eval_logps/rejected": -279.1315002441406, + "eval_loss": 0.022998971864581108, + "eval_rewards/accuracies": 0.9750000238418579, + "eval_rewards/chosen": -5.65725040435791, + "eval_rewards/margins": 15.245442390441895, + "eval_rewards/rejected": -20.902692794799805, + "eval_runtime": 48.7103, + "eval_samples_per_second": 58.756, + "eval_steps_per_second": 1.848, + "step": 2500 + }, + { + "epoch": 1.15, + "learning_rate": 3.434488588334742e-07, + "logits/chosen": -1.945347785949707, + "logits/rejected": -1.7307497262954712, + "logps/chosen": -135.29379272460938, + "logps/rejected": -286.13726806640625, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.308348178863525, + "rewards/margins": 17.14834976196289, + "rewards/rejected": -21.45669937133789, + "step": 2510 + }, + { + "epoch": 1.15, + "learning_rate": 3.42603550295858e-07, + "logits/chosen": -1.9633537530899048, + "logits/rejected": -1.753211259841919, + "logps/chosen": -133.03085327148438, + "logps/rejected": -293.79693603515625, + "loss": 0.0056, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.341958999633789, + "rewards/margins": 17.850200653076172, + "rewards/rejected": -22.19215965270996, + "step": 2520 + }, + { + "epoch": 1.15, + "learning_rate": 3.4175824175824175e-07, + "logits/chosen": -1.9777822494506836, + "logits/rejected": -1.8326003551483154, + "logps/chosen": -136.5177459716797, + "logps/rejected": -278.86993408203125, + "loss": 0.0194, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.523541450500488, + "rewards/margins": 16.020450592041016, + "rewards/rejected": -20.543991088867188, + "step": 2530 + }, + { + "epoch": 1.16, + "learning_rate": 3.409129332206255e-07, + "logits/chosen": -2.049921989440918, + "logits/rejected": -1.9096603393554688, + "logps/chosen": -128.6142578125, + "logps/rejected": -250.35678100585938, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5399975776672363, + "rewards/margins": 14.6093111038208, + "rewards/rejected": -18.149309158325195, + "step": 2540 + }, + { + "epoch": 1.16, + "learning_rate": 3.4006762468300926e-07, + "logits/chosen": -2.027036666870117, + "logits/rejected": -1.8620399236679077, + "logps/chosen": -134.22491455078125, + "logps/rejected": -270.4568786621094, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9924380779266357, + "rewards/margins": 16.017200469970703, + "rewards/rejected": -20.009639739990234, + "step": 2550 + }, + { + "epoch": 1.17, + "learning_rate": 3.39222316145393e-07, + "logits/chosen": -1.9359347820281982, + "logits/rejected": -1.7949869632720947, + "logps/chosen": -141.54837036132812, + "logps/rejected": -281.8461608886719, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.612332344055176, + "rewards/margins": 15.11158561706543, + "rewards/rejected": -20.723918914794922, + "step": 2560 + }, + { + "epoch": 1.17, + "learning_rate": 3.3837700760777683e-07, + "logits/chosen": -1.879448652267456, + "logits/rejected": -1.6673691272735596, + "logps/chosen": -160.62313842773438, + "logps/rejected": -294.2522277832031, + "loss": 0.0097, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.579423427581787, + "rewards/margins": 15.896377563476562, + "rewards/rejected": -22.47580337524414, + "step": 2570 + }, + { + "epoch": 1.18, + "learning_rate": 3.375316990701606e-07, + "logits/chosen": -2.0175633430480957, + "logits/rejected": -1.8550834655761719, + "logps/chosen": -142.2264404296875, + "logps/rejected": -282.3786926269531, + "loss": 0.0061, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.80888032913208, + "rewards/margins": 16.41634178161621, + "rewards/rejected": -21.225223541259766, + "step": 2580 + }, + { + "epoch": 1.18, + "learning_rate": 3.366863905325444e-07, + "logits/chosen": -1.9725208282470703, + "logits/rejected": -1.8113142251968384, + "logps/chosen": -134.61044311523438, + "logps/rejected": -288.10003662109375, + "loss": 0.0188, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.315024375915527, + "rewards/margins": 16.41974639892578, + "rewards/rejected": -21.73476791381836, + "step": 2590 + }, + { + "epoch": 1.19, + "learning_rate": 3.358410819949281e-07, + "logits/chosen": -2.056762218475342, + "logits/rejected": -1.913812279701233, + "logps/chosen": -122.4278793334961, + "logps/rejected": -275.90936279296875, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6767172813415527, + "rewards/margins": 16.875553131103516, + "rewards/rejected": -20.552268981933594, + "step": 2600 + }, + { + "epoch": 1.19, + "eval_logits/chosen": -2.0171782970428467, + "eval_logits/rejected": -1.8500170707702637, + "eval_logps/chosen": -140.48916625976562, + "eval_logps/rejected": -263.9971618652344, + "eval_loss": 0.02173527143895626, + "eval_rewards/accuracies": 0.9833333492279053, + "eval_rewards/chosen": -4.925196170806885, + "eval_rewards/margins": 14.46406078338623, + "eval_rewards/rejected": -19.38925552368164, + "eval_runtime": 48.2981, + "eval_samples_per_second": 59.257, + "eval_steps_per_second": 1.863, + "step": 2600 + }, + { + "epoch": 1.19, + "learning_rate": 3.349957734573119e-07, + "logits/chosen": -2.0181078910827637, + "logits/rejected": -1.8825185298919678, + "logps/chosen": -137.3406982421875, + "logps/rejected": -271.9612121582031, + "loss": 0.0104, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.693453788757324, + "rewards/margins": 15.234161376953125, + "rewards/rejected": -19.927616119384766, + "step": 2610 + }, + { + "epoch": 1.2, + "learning_rate": 3.3415046491969567e-07, + "logits/chosen": -2.041337251663208, + "logits/rejected": -1.866619348526001, + "logps/chosen": -137.9915313720703, + "logps/rejected": -269.0408020019531, + "loss": 0.0136, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.2950334548950195, + "rewards/margins": 15.195487976074219, + "rewards/rejected": -19.490522384643555, + "step": 2620 + }, + { + "epoch": 1.2, + "learning_rate": 3.333051563820795e-07, + "logits/chosen": -2.074596881866455, + "logits/rejected": -1.9203710556030273, + "logps/chosen": -132.4041748046875, + "logps/rejected": -252.46218872070312, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.1531267166137695, + "rewards/margins": 13.75012493133545, + "rewards/rejected": -17.903249740600586, + "step": 2630 + }, + { + "epoch": 1.2, + "learning_rate": 3.3245984784446324e-07, + "logits/chosen": -2.0898518562316895, + "logits/rejected": -1.9567184448242188, + "logps/chosen": -140.36544799804688, + "logps/rejected": -271.5333557128906, + "loss": 0.0187, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.7434005737304688, + "rewards/margins": 15.86798095703125, + "rewards/rejected": -19.611379623413086, + "step": 2640 + }, + { + "epoch": 1.21, + "learning_rate": 3.3161453930684694e-07, + "logits/chosen": -1.94447922706604, + "logits/rejected": -1.8102020025253296, + "logps/chosen": -150.39633178710938, + "logps/rejected": -288.087158203125, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.932501792907715, + "rewards/margins": 15.888578414916992, + "rewards/rejected": -21.821081161499023, + "step": 2650 + }, + { + "epoch": 1.21, + "learning_rate": 3.3076923076923075e-07, + "logits/chosen": -1.962162733078003, + "logits/rejected": -1.796449065208435, + "logps/chosen": -137.46371459960938, + "logps/rejected": -309.79241943359375, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.807311534881592, + "rewards/margins": 19.060232162475586, + "rewards/rejected": -23.867542266845703, + "step": 2660 + }, + { + "epoch": 1.22, + "learning_rate": 3.299239222316145e-07, + "logits/chosen": -2.0989317893981934, + "logits/rejected": -1.9328184127807617, + "logps/chosen": -129.36871337890625, + "logps/rejected": -271.81866455078125, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.052420139312744, + "rewards/margins": 16.742918014526367, + "rewards/rejected": -19.795337677001953, + "step": 2670 + }, + { + "epoch": 1.22, + "learning_rate": 3.290786136939983e-07, + "logits/chosen": -2.135300397872925, + "logits/rejected": -2.0134975910186768, + "logps/chosen": -128.8970489501953, + "logps/rejected": -272.21417236328125, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.772590160369873, + "rewards/margins": 16.183759689331055, + "rewards/rejected": -19.956350326538086, + "step": 2680 + }, + { + "epoch": 1.23, + "learning_rate": 3.282333051563821e-07, + "logits/chosen": -2.1228604316711426, + "logits/rejected": -1.9986953735351562, + "logps/chosen": -134.7543182373047, + "logps/rejected": -265.08306884765625, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.4857401847839355, + "rewards/margins": 15.023765563964844, + "rewards/rejected": -19.509506225585938, + "step": 2690 + }, + { + "epoch": 1.23, + "learning_rate": 3.2738799661876583e-07, + "logits/chosen": -2.0504837036132812, + "logits/rejected": -1.9117425680160522, + "logps/chosen": -145.67308044433594, + "logps/rejected": -270.6615905761719, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.501971244812012, + "rewards/margins": 14.847882270812988, + "rewards/rejected": -20.349851608276367, + "step": 2700 + }, + { + "epoch": 1.23, + "eval_logits/chosen": -2.017770290374756, + "eval_logits/rejected": -1.867299199104309, + "eval_logps/chosen": -145.2077178955078, + "eval_logps/rejected": -271.73480224609375, + "eval_loss": 0.023520223796367645, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -5.397050380706787, + "eval_rewards/margins": 14.765971183776855, + "eval_rewards/rejected": -20.163022994995117, + "eval_runtime": 49.0085, + "eval_samples_per_second": 58.398, + "eval_steps_per_second": 1.836, + "step": 2700 + }, + { + "epoch": 1.24, + "learning_rate": 3.265426880811496e-07, + "logits/chosen": -2.0202062129974365, + "logits/rejected": -1.8841352462768555, + "logps/chosen": -151.43978881835938, + "logps/rejected": -286.70758056640625, + "loss": 0.0055, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.582226753234863, + "rewards/margins": 15.728716850280762, + "rewards/rejected": -21.310945510864258, + "step": 2710 + }, + { + "epoch": 1.24, + "learning_rate": 3.256973795435334e-07, + "logits/chosen": -2.0473127365112305, + "logits/rejected": -1.9340074062347412, + "logps/chosen": -134.10830688476562, + "logps/rejected": -270.43170166015625, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.385322570800781, + "rewards/margins": 15.43200969696045, + "rewards/rejected": -19.817331314086914, + "step": 2720 + }, + { + "epoch": 1.25, + "learning_rate": 3.2485207100591716e-07, + "logits/chosen": -2.0043697357177734, + "logits/rejected": -1.8764839172363281, + "logps/chosen": -128.83837890625, + "logps/rejected": -265.72991943359375, + "loss": 0.0118, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.232174873352051, + "rewards/margins": 15.495455741882324, + "rewards/rejected": -19.727630615234375, + "step": 2730 + }, + { + "epoch": 1.25, + "learning_rate": 3.2400676246830097e-07, + "logits/chosen": -1.9958341121673584, + "logits/rejected": -1.8539024591445923, + "logps/chosen": -134.306884765625, + "logps/rejected": -271.220458984375, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.187023162841797, + "rewards/margins": 15.482475280761719, + "rewards/rejected": -19.66950035095215, + "step": 2740 + }, + { + "epoch": 1.26, + "learning_rate": 3.2316145393068467e-07, + "logits/chosen": -1.9477897882461548, + "logits/rejected": -1.8062279224395752, + "logps/chosen": -127.55613708496094, + "logps/rejected": -272.15313720703125, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4938323497772217, + "rewards/margins": 16.706642150878906, + "rewards/rejected": -20.20047378540039, + "step": 2750 + }, + { + "epoch": 1.26, + "learning_rate": 3.2231614539306843e-07, + "logits/chosen": -1.957558035850525, + "logits/rejected": -1.832824468612671, + "logps/chosen": -131.24839782714844, + "logps/rejected": -260.27166748046875, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.663699150085449, + "rewards/margins": 14.462875366210938, + "rewards/rejected": -19.12657356262207, + "step": 2760 + }, + { + "epoch": 1.26, + "learning_rate": 3.2147083685545224e-07, + "logits/chosen": -2.097468852996826, + "logits/rejected": -2.0030126571655273, + "logps/chosen": -131.1923828125, + "logps/rejected": -261.2097473144531, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6247544288635254, + "rewards/margins": 15.17779541015625, + "rewards/rejected": -18.802549362182617, + "step": 2770 + }, + { + "epoch": 1.27, + "learning_rate": 3.20625528317836e-07, + "logits/chosen": -2.0422565937042236, + "logits/rejected": -1.9468787908554077, + "logps/chosen": -135.0767822265625, + "logps/rejected": -260.1372985839844, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.394379615783691, + "rewards/margins": 14.503211975097656, + "rewards/rejected": -18.897592544555664, + "step": 2780 + }, + { + "epoch": 1.27, + "learning_rate": 3.1978021978021975e-07, + "logits/chosen": -2.0489416122436523, + "logits/rejected": -1.9344393014907837, + "logps/chosen": -137.5537109375, + "logps/rejected": -256.15869140625, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.1104559898376465, + "rewards/margins": 14.742477416992188, + "rewards/rejected": -18.852933883666992, + "step": 2790 + }, + { + "epoch": 1.28, + "learning_rate": 3.189349112426035e-07, + "logits/chosen": -2.0865895748138428, + "logits/rejected": -1.9760797023773193, + "logps/chosen": -133.05752563476562, + "logps/rejected": -261.711181640625, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.011336326599121, + "rewards/margins": 15.45531940460205, + "rewards/rejected": -19.466655731201172, + "step": 2800 + }, + { + "epoch": 1.28, + "eval_logits/chosen": -2.152179002761841, + "eval_logits/rejected": -2.0417330265045166, + "eval_logps/chosen": -135.65919494628906, + "eval_logps/rejected": -252.37962341308594, + "eval_loss": 0.025323208421468735, + "eval_rewards/accuracies": 0.9777777791023254, + "eval_rewards/chosen": -4.44219970703125, + "eval_rewards/margins": 13.785304069519043, + "eval_rewards/rejected": -18.22750473022461, + "eval_runtime": 49.1425, + "eval_samples_per_second": 58.239, + "eval_steps_per_second": 1.831, + "step": 2800 + }, + { + "epoch": 1.28, + "learning_rate": 3.180896027049873e-07, + "logits/chosen": -2.1095643043518066, + "logits/rejected": -2.021012783050537, + "logps/chosen": -137.2529296875, + "logps/rejected": -278.6671447753906, + "loss": 0.0047, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.625119686126709, + "rewards/margins": 16.213436126708984, + "rewards/rejected": -20.83855628967285, + "step": 2810 + }, + { + "epoch": 1.29, + "learning_rate": 3.172442941673711e-07, + "logits/chosen": -2.064513683319092, + "logits/rejected": -1.9486706256866455, + "logps/chosen": -141.18331909179688, + "logps/rejected": -278.5491638183594, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.658184051513672, + "rewards/margins": 16.239248275756836, + "rewards/rejected": -20.897432327270508, + "step": 2820 + }, + { + "epoch": 1.29, + "learning_rate": 3.163989856297549e-07, + "logits/chosen": -2.0233511924743652, + "logits/rejected": -1.8923966884613037, + "logps/chosen": -143.33706665039062, + "logps/rejected": -295.3087463378906, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.4150919914245605, + "rewards/margins": 17.231006622314453, + "rewards/rejected": -22.64609718322754, + "step": 2830 + }, + { + "epoch": 1.3, + "learning_rate": 3.155536770921386e-07, + "logits/chosen": -2.0582056045532227, + "logits/rejected": -1.9745107889175415, + "logps/chosen": -147.27874755859375, + "logps/rejected": -292.4425354003906, + "loss": 0.0175, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.995936870574951, + "rewards/margins": 16.957895278930664, + "rewards/rejected": -21.95383071899414, + "step": 2840 + }, + { + "epoch": 1.3, + "learning_rate": 3.147083685545224e-07, + "logits/chosen": -2.301547050476074, + "logits/rejected": -2.2333405017852783, + "logps/chosen": -114.71829986572266, + "logps/rejected": -219.6496124267578, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4453327655792236, + "rewards/margins": 12.367315292358398, + "rewards/rejected": -14.812649726867676, + "step": 2850 + }, + { + "epoch": 1.31, + "learning_rate": 3.1386306001690616e-07, + "logits/chosen": -2.29642391204834, + "logits/rejected": -2.225308656692505, + "logps/chosen": -117.47834777832031, + "logps/rejected": -219.9888153076172, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5083842277526855, + "rewards/margins": 12.631601333618164, + "rewards/rejected": -15.139986991882324, + "step": 2860 + }, + { + "epoch": 1.31, + "learning_rate": 3.130177514792899e-07, + "logits/chosen": -2.269688129425049, + "logits/rejected": -2.1952171325683594, + "logps/chosen": -123.7551498413086, + "logps/rejected": -238.8106689453125, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.254749298095703, + "rewards/margins": 13.318034172058105, + "rewards/rejected": -16.572784423828125, + "step": 2870 + }, + { + "epoch": 1.31, + "learning_rate": 3.121724429416737e-07, + "logits/chosen": -2.225463628768921, + "logits/rejected": -2.132333993911743, + "logps/chosen": -126.89964294433594, + "logps/rejected": -262.2653503417969, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3202064037323, + "rewards/margins": 15.432042121887207, + "rewards/rejected": -18.752248764038086, + "step": 2880 + }, + { + "epoch": 1.32, + "learning_rate": 3.1132713440405743e-07, + "logits/chosen": -2.1518263816833496, + "logits/rejected": -2.0612220764160156, + "logps/chosen": -134.36441040039062, + "logps/rejected": -267.93707275390625, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.020511627197266, + "rewards/margins": 15.682164192199707, + "rewards/rejected": -19.70267677307129, + "step": 2890 + }, + { + "epoch": 1.32, + "learning_rate": 3.1048182586644124e-07, + "logits/chosen": -2.1560277938842773, + "logits/rejected": -2.056459426879883, + "logps/chosen": -131.1350555419922, + "logps/rejected": -267.2361755371094, + "loss": 0.0064, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.494834899902344, + "rewards/margins": 14.966562271118164, + "rewards/rejected": -19.461395263671875, + "step": 2900 + }, + { + "epoch": 1.32, + "eval_logits/chosen": -2.141598701477051, + "eval_logits/rejected": -2.027343273162842, + "eval_logps/chosen": -134.41439819335938, + "eval_logps/rejected": -257.9463195800781, + "eval_loss": 0.02687516249716282, + "eval_rewards/accuracies": 0.9750000238418579, + "eval_rewards/chosen": -4.317718505859375, + "eval_rewards/margins": 14.466452598571777, + "eval_rewards/rejected": -18.784168243408203, + "eval_runtime": 48.4811, + "eval_samples_per_second": 59.033, + "eval_steps_per_second": 1.856, + "step": 2900 + }, + { + "epoch": 1.33, + "learning_rate": 3.09636517328825e-07, + "logits/chosen": -2.116936683654785, + "logits/rejected": -2.014040231704712, + "logps/chosen": -132.79800415039062, + "logps/rejected": -271.3309020996094, + "loss": 0.0094, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.4362220764160156, + "rewards/margins": 16.35221290588379, + "rewards/rejected": -19.788434982299805, + "step": 2910 + }, + { + "epoch": 1.33, + "learning_rate": 3.087912087912088e-07, + "logits/chosen": -2.075852632522583, + "logits/rejected": -2.0212950706481934, + "logps/chosen": -141.65907287597656, + "logps/rejected": -264.9969482421875, + "loss": 0.0156, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.6742095947265625, + "rewards/margins": 15.127553939819336, + "rewards/rejected": -19.8017635345459, + "step": 2920 + }, + { + "epoch": 1.34, + "learning_rate": 3.0794590025359256e-07, + "logits/chosen": -2.0961661338806152, + "logits/rejected": -1.9971954822540283, + "logps/chosen": -130.35186767578125, + "logps/rejected": -270.8241271972656, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.06374454498291, + "rewards/margins": 16.1806697845459, + "rewards/rejected": -20.244413375854492, + "step": 2930 + }, + { + "epoch": 1.34, + "learning_rate": 3.071005917159763e-07, + "logits/chosen": -2.0451531410217285, + "logits/rejected": -1.938254952430725, + "logps/chosen": -130.60731506347656, + "logps/rejected": -266.6194763183594, + "loss": 0.0138, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.539628505706787, + "rewards/margins": 15.338510513305664, + "rewards/rejected": -19.878137588500977, + "step": 2940 + }, + { + "epoch": 1.35, + "learning_rate": 3.062552831783601e-07, + "logits/chosen": -2.068493366241455, + "logits/rejected": -1.9677813053131104, + "logps/chosen": -131.9779510498047, + "logps/rejected": -259.90789794921875, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.384075164794922, + "rewards/margins": 14.50419807434082, + "rewards/rejected": -18.88827133178711, + "step": 2950 + }, + { + "epoch": 1.35, + "learning_rate": 3.054099746407439e-07, + "logits/chosen": -2.090090751647949, + "logits/rejected": -1.969435691833496, + "logps/chosen": -131.5757598876953, + "logps/rejected": -254.5954132080078, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.72786021232605, + "rewards/margins": 14.49266529083252, + "rewards/rejected": -18.220523834228516, + "step": 2960 + }, + { + "epoch": 1.36, + "learning_rate": 3.0456466610312764e-07, + "logits/chosen": -1.991431474685669, + "logits/rejected": -1.859548568725586, + "logps/chosen": -126.8050765991211, + "logps/rejected": -244.7470245361328, + "loss": 0.0064, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.15117883682251, + "rewards/margins": 13.41639232635498, + "rewards/rejected": -17.56757164001465, + "step": 2970 + }, + { + "epoch": 1.36, + "learning_rate": 3.037193575655114e-07, + "logits/chosen": -2.0137648582458496, + "logits/rejected": -1.9008442163467407, + "logps/chosen": -134.04664611816406, + "logps/rejected": -252.976318359375, + "loss": 0.0052, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.397458553314209, + "rewards/margins": 13.81383228302002, + "rewards/rejected": -18.211292266845703, + "step": 2980 + }, + { + "epoch": 1.36, + "learning_rate": 3.0287404902789516e-07, + "logits/chosen": -2.012768268585205, + "logits/rejected": -1.8527495861053467, + "logps/chosen": -140.04957580566406, + "logps/rejected": -276.19427490234375, + "loss": 0.0221, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.443761348724365, + "rewards/margins": 15.549433708190918, + "rewards/rejected": -19.993196487426758, + "step": 2990 + }, + { + "epoch": 1.37, + "learning_rate": 3.020287404902789e-07, + "logits/chosen": -2.1158039569854736, + "logits/rejected": -1.957754373550415, + "logps/chosen": -128.07528686523438, + "logps/rejected": -259.60809326171875, + "loss": 0.0093, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.460177183151245, + "rewards/margins": 15.643338203430176, + "rewards/rejected": -19.103515625, + "step": 3000 + }, + { + "epoch": 1.37, + "eval_logits/chosen": -2.078118324279785, + "eval_logits/rejected": -1.9133172035217285, + "eval_logps/chosen": -137.57186889648438, + "eval_logps/rejected": -264.0636291503906, + "eval_loss": 0.023424891754984856, + "eval_rewards/accuracies": 0.9722222089767456, + "eval_rewards/chosen": -4.633467674255371, + "eval_rewards/margins": 14.76243782043457, + "eval_rewards/rejected": -19.395904541015625, + "eval_runtime": 49.0999, + "eval_samples_per_second": 58.289, + "eval_steps_per_second": 1.833, + "step": 3000 + }, + { + "epoch": 1.37, + "learning_rate": 3.011834319526627e-07, + "logits/chosen": -2.0927577018737793, + "logits/rejected": -1.9399240016937256, + "logps/chosen": -135.44723510742188, + "logps/rejected": -270.5841979980469, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9826979637145996, + "rewards/margins": 16.10317611694336, + "rewards/rejected": -20.085874557495117, + "step": 3010 + }, + { + "epoch": 1.38, + "learning_rate": 3.003381234150465e-07, + "logits/chosen": -1.9384254217147827, + "logits/rejected": -1.738167405128479, + "logps/chosen": -132.06727600097656, + "logps/rejected": -276.01617431640625, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.956315279006958, + "rewards/margins": 16.69976043701172, + "rewards/rejected": -20.65607452392578, + "step": 3020 + }, + { + "epoch": 1.38, + "learning_rate": 2.9949281487743024e-07, + "logits/chosen": -1.9904791116714478, + "logits/rejected": -1.8166635036468506, + "logps/chosen": -134.52444458007812, + "logps/rejected": -280.89031982421875, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.553272247314453, + "rewards/margins": 16.27239990234375, + "rewards/rejected": -20.82567024230957, + "step": 3030 + }, + { + "epoch": 1.39, + "learning_rate": 2.98647506339814e-07, + "logits/chosen": -1.996466040611267, + "logits/rejected": -1.8200994729995728, + "logps/chosen": -126.57682800292969, + "logps/rejected": -277.4131774902344, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.318772554397583, + "rewards/margins": 17.410900115966797, + "rewards/rejected": -20.729673385620117, + "step": 3040 + }, + { + "epoch": 1.39, + "learning_rate": 2.978021978021978e-07, + "logits/chosen": -1.935706377029419, + "logits/rejected": -1.7807499170303345, + "logps/chosen": -138.53733825683594, + "logps/rejected": -271.65582275390625, + "loss": 0.0182, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.194025993347168, + "rewards/margins": 15.971760749816895, + "rewards/rejected": -20.16578483581543, + "step": 3050 + }, + { + "epoch": 1.4, + "learning_rate": 2.9695688926458157e-07, + "logits/chosen": -1.9109838008880615, + "logits/rejected": -1.7679493427276611, + "logps/chosen": -138.1450958251953, + "logps/rejected": -284.1612243652344, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.85819149017334, + "rewards/margins": 16.53154754638672, + "rewards/rejected": -21.389739990234375, + "step": 3060 + }, + { + "epoch": 1.4, + "learning_rate": 2.961115807269654e-07, + "logits/chosen": -1.9031598567962646, + "logits/rejected": -1.7178453207015991, + "logps/chosen": -133.40499877929688, + "logps/rejected": -313.7869567871094, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.607630729675293, + "rewards/margins": 19.082489013671875, + "rewards/rejected": -23.690120697021484, + "step": 3070 + }, + { + "epoch": 1.41, + "learning_rate": 2.952662721893491e-07, + "logits/chosen": -1.9240505695343018, + "logits/rejected": -1.7546335458755493, + "logps/chosen": -140.02825927734375, + "logps/rejected": -313.56463623046875, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.939627647399902, + "rewards/margins": 18.84872817993164, + "rewards/rejected": -23.788354873657227, + "step": 3080 + }, + { + "epoch": 1.41, + "learning_rate": 2.9442096365173284e-07, + "logits/chosen": -1.9146785736083984, + "logits/rejected": -1.750884771347046, + "logps/chosen": -136.1601104736328, + "logps/rejected": -272.26361083984375, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.955554962158203, + "rewards/margins": 15.811803817749023, + "rewards/rejected": -19.767358779907227, + "step": 3090 + }, + { + "epoch": 1.41, + "learning_rate": 2.9357565511411665e-07, + "logits/chosen": -1.888639211654663, + "logits/rejected": -1.7459430694580078, + "logps/chosen": -144.42047119140625, + "logps/rejected": -279.8617248535156, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.060923099517822, + "rewards/margins": 15.9983549118042, + "rewards/rejected": -21.059276580810547, + "step": 3100 + }, + { + "epoch": 1.41, + "eval_logits/chosen": -1.909785270690918, + "eval_logits/rejected": -1.7623956203460693, + "eval_logps/chosen": -142.03558349609375, + "eval_logps/rejected": -271.3341064453125, + "eval_loss": 0.022984443232417107, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -5.079836845397949, + "eval_rewards/margins": 15.043112754821777, + "eval_rewards/rejected": -20.122955322265625, + "eval_runtime": 49.0951, + "eval_samples_per_second": 58.295, + "eval_steps_per_second": 1.833, + "step": 3100 + }, + { + "epoch": 1.42, + "learning_rate": 2.927303465765004e-07, + "logits/chosen": -1.9462015628814697, + "logits/rejected": -1.7958303689956665, + "logps/chosen": -140.11074829101562, + "logps/rejected": -295.42303466796875, + "loss": 0.0057, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.818387508392334, + "rewards/margins": 17.30405044555664, + "rewards/rejected": -22.1224365234375, + "step": 3110 + }, + { + "epoch": 1.42, + "learning_rate": 2.918850380388842e-07, + "logits/chosen": -1.9005266427993774, + "logits/rejected": -1.7481244802474976, + "logps/chosen": -152.64340209960938, + "logps/rejected": -297.1541442871094, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.619594573974609, + "rewards/margins": 16.809993743896484, + "rewards/rejected": -22.429584503173828, + "step": 3120 + }, + { + "epoch": 1.43, + "learning_rate": 2.910397295012679e-07, + "logits/chosen": -1.849311113357544, + "logits/rejected": -1.6929523944854736, + "logps/chosen": -148.63531494140625, + "logps/rejected": -290.8329772949219, + "loss": 0.0122, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.668116569519043, + "rewards/margins": 16.208858489990234, + "rewards/rejected": -21.87697410583496, + "step": 3130 + }, + { + "epoch": 1.43, + "learning_rate": 2.9019442096365173e-07, + "logits/chosen": -2.0505404472351074, + "logits/rejected": -1.8951294422149658, + "logps/chosen": -141.0372314453125, + "logps/rejected": -290.4275817871094, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.243703842163086, + "rewards/margins": 17.539813995361328, + "rewards/rejected": -21.78351593017578, + "step": 3140 + }, + { + "epoch": 1.44, + "learning_rate": 2.893491124260355e-07, + "logits/chosen": -2.0169425010681152, + "logits/rejected": -1.894471526145935, + "logps/chosen": -141.3474578857422, + "logps/rejected": -292.564453125, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.002017974853516, + "rewards/margins": 16.942005157470703, + "rewards/rejected": -21.944019317626953, + "step": 3150 + }, + { + "epoch": 1.44, + "learning_rate": 2.885038038884193e-07, + "logits/chosen": -2.069235324859619, + "logits/rejected": -1.9347785711288452, + "logps/chosen": -137.05416870117188, + "logps/rejected": -259.1018981933594, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.348564147949219, + "rewards/margins": 14.3115234375, + "rewards/rejected": -18.66008949279785, + "step": 3160 + }, + { + "epoch": 1.45, + "learning_rate": 2.8765849535080305e-07, + "logits/chosen": -2.0588865280151367, + "logits/rejected": -1.9449745416641235, + "logps/chosen": -130.26327514648438, + "logps/rejected": -272.8006286621094, + "loss": 0.0085, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.459763526916504, + "rewards/margins": 15.614460945129395, + "rewards/rejected": -20.0742244720459, + "step": 3170 + }, + { + "epoch": 1.45, + "learning_rate": 2.8681318681318676e-07, + "logits/chosen": -1.9862502813339233, + "logits/rejected": -1.8353523015975952, + "logps/chosen": -128.68417358398438, + "logps/rejected": -297.4949645996094, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.29546594619751, + "rewards/margins": 18.210203170776367, + "rewards/rejected": -22.50567054748535, + "step": 3180 + }, + { + "epoch": 1.46, + "learning_rate": 2.8596787827557057e-07, + "logits/chosen": -2.0963692665100098, + "logits/rejected": -1.99569833278656, + "logps/chosen": -130.62600708007812, + "logps/rejected": -252.66830444335938, + "loss": 0.008, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.6897976398468018, + "rewards/margins": 14.709360122680664, + "rewards/rejected": -18.399158477783203, + "step": 3190 + }, + { + "epoch": 1.46, + "learning_rate": 2.851225697379543e-07, + "logits/chosen": -2.1021742820739746, + "logits/rejected": -1.9880530834197998, + "logps/chosen": -132.0804443359375, + "logps/rejected": -262.18572998046875, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.257116317749023, + "rewards/margins": 14.728506088256836, + "rewards/rejected": -18.98562240600586, + "step": 3200 + }, + { + "epoch": 1.46, + "eval_logits/chosen": -2.074456214904785, + "eval_logits/rejected": -1.949985384941101, + "eval_logps/chosen": -130.15731811523438, + "eval_logps/rejected": -251.4248504638672, + "eval_loss": 0.02166706882417202, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -3.892010450363159, + "eval_rewards/margins": 14.240015983581543, + "eval_rewards/rejected": -18.13202667236328, + "eval_runtime": 49.0701, + "eval_samples_per_second": 58.325, + "eval_steps_per_second": 1.834, + "step": 3200 + }, + { + "epoch": 1.47, + "learning_rate": 2.8427726120033813e-07, + "logits/chosen": -2.067147731781006, + "logits/rejected": -1.9356224536895752, + "logps/chosen": -124.56694030761719, + "logps/rejected": -271.8417053222656, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7113606929779053, + "rewards/margins": 17.175228118896484, + "rewards/rejected": -19.886587142944336, + "step": 3210 + }, + { + "epoch": 1.47, + "learning_rate": 2.834319526627219e-07, + "logits/chosen": -1.9816830158233643, + "logits/rejected": -1.8282426595687866, + "logps/chosen": -140.33413696289062, + "logps/rejected": -292.79754638671875, + "loss": 0.0076, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.944754123687744, + "rewards/margins": 17.03341293334961, + "rewards/rejected": -21.978168487548828, + "step": 3220 + }, + { + "epoch": 1.47, + "learning_rate": 2.8258664412510565e-07, + "logits/chosen": -1.9779627323150635, + "logits/rejected": -1.847848892211914, + "logps/chosen": -149.28030395507812, + "logps/rejected": -304.99969482421875, + "loss": 0.0215, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.514216899871826, + "rewards/margins": 18.037212371826172, + "rewards/rejected": -23.551427841186523, + "step": 3230 + }, + { + "epoch": 1.48, + "learning_rate": 2.817413355874894e-07, + "logits/chosen": -1.9442542791366577, + "logits/rejected": -1.8127539157867432, + "logps/chosen": -143.69705200195312, + "logps/rejected": -310.5235900878906, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.017391204833984, + "rewards/margins": 18.894908905029297, + "rewards/rejected": -23.91229820251465, + "step": 3240 + }, + { + "epoch": 1.48, + "learning_rate": 2.808960270498732e-07, + "logits/chosen": -1.882361650466919, + "logits/rejected": -1.78091561794281, + "logps/chosen": -144.84976196289062, + "logps/rejected": -306.044921875, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.687883377075195, + "rewards/margins": 17.577857971191406, + "rewards/rejected": -23.265743255615234, + "step": 3250 + }, + { + "epoch": 1.49, + "learning_rate": 2.8005071851225697e-07, + "logits/chosen": -1.921932578086853, + "logits/rejected": -1.798011064529419, + "logps/chosen": -145.02206420898438, + "logps/rejected": -306.07464599609375, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.535216331481934, + "rewards/margins": 17.976417541503906, + "rewards/rejected": -23.51163101196289, + "step": 3260 + }, + { + "epoch": 1.49, + "learning_rate": 2.7920540997464073e-07, + "logits/chosen": -1.9915612936019897, + "logits/rejected": -1.8652572631835938, + "logps/chosen": -144.52069091796875, + "logps/rejected": -326.04132080078125, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.750706672668457, + "rewards/margins": 19.223308563232422, + "rewards/rejected": -24.974014282226562, + "step": 3270 + }, + { + "epoch": 1.5, + "learning_rate": 2.783601014370245e-07, + "logits/chosen": -2.0454087257385254, + "logits/rejected": -1.9038234949111938, + "logps/chosen": -143.76519775390625, + "logps/rejected": -308.48480224609375, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.117907524108887, + "rewards/margins": 18.567800521850586, + "rewards/rejected": -23.685707092285156, + "step": 3280 + }, + { + "epoch": 1.5, + "learning_rate": 2.7751479289940824e-07, + "logits/chosen": -2.01210355758667, + "logits/rejected": -1.9171651601791382, + "logps/chosen": -149.78817749023438, + "logps/rejected": -302.19879150390625, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.596650123596191, + "rewards/margins": 17.594165802001953, + "rewards/rejected": -23.190814971923828, + "step": 3290 + }, + { + "epoch": 1.51, + "learning_rate": 2.7666948436179205e-07, + "logits/chosen": -2.067559003829956, + "logits/rejected": -1.9251827001571655, + "logps/chosen": -160.0213623046875, + "logps/rejected": -330.7239074707031, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.127202033996582, + "rewards/margins": 19.1259822845459, + "rewards/rejected": -25.253185272216797, + "step": 3300 + }, + { + "epoch": 1.51, + "eval_logits/chosen": -2.0357284545898438, + "eval_logits/rejected": -1.9011106491088867, + "eval_logps/chosen": -148.20668029785156, + "eval_logps/rejected": -288.69830322265625, + "eval_loss": 0.022296199575066566, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -5.696948051452637, + "eval_rewards/margins": 16.16242218017578, + "eval_rewards/rejected": -21.8593692779541, + "eval_runtime": 47.7021, + "eval_samples_per_second": 59.997, + "eval_steps_per_second": 1.887, + "step": 3300 + }, + { + "epoch": 1.51, + "learning_rate": 2.758241758241758e-07, + "logits/chosen": -2.0515849590301514, + "logits/rejected": -1.942239761352539, + "logps/chosen": -131.13009643554688, + "logps/rejected": -284.80010986328125, + "loss": 0.0073, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.422455787658691, + "rewards/margins": 16.477149963378906, + "rewards/rejected": -20.899606704711914, + "step": 3310 + }, + { + "epoch": 1.52, + "learning_rate": 2.7497886728655957e-07, + "logits/chosen": -2.049119710922241, + "logits/rejected": -1.9266822338104248, + "logps/chosen": -127.2447509765625, + "logps/rejected": -258.5926208496094, + "loss": 0.0221, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.6637301445007324, + "rewards/margins": 15.236650466918945, + "rewards/rejected": -18.900381088256836, + "step": 3320 + }, + { + "epoch": 1.52, + "learning_rate": 2.741335587489433e-07, + "logits/chosen": -2.138721466064453, + "logits/rejected": -2.068021535873413, + "logps/chosen": -126.6335678100586, + "logps/rejected": -244.0006103515625, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2384860515594482, + "rewards/margins": 13.476173400878906, + "rewards/rejected": -16.714656829833984, + "step": 3330 + }, + { + "epoch": 1.52, + "learning_rate": 2.7328825021132714e-07, + "logits/chosen": -2.1016390323638916, + "logits/rejected": -1.9984807968139648, + "logps/chosen": -135.56008911132812, + "logps/rejected": -260.299560546875, + "loss": 0.0088, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.3465962409973145, + "rewards/margins": 14.227472305297852, + "rewards/rejected": -18.574068069458008, + "step": 3340 + }, + { + "epoch": 1.53, + "learning_rate": 2.724429416737109e-07, + "logits/chosen": -2.007490634918213, + "logits/rejected": -1.8925546407699585, + "logps/chosen": -142.90896606445312, + "logps/rejected": -262.3196716308594, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.897563457489014, + "rewards/margins": 14.112276077270508, + "rewards/rejected": -19.009838104248047, + "step": 3350 + }, + { + "epoch": 1.53, + "learning_rate": 2.715976331360947e-07, + "logits/chosen": -2.0170576572418213, + "logits/rejected": -1.9034395217895508, + "logps/chosen": -139.79205322265625, + "logps/rejected": -262.28289794921875, + "loss": 0.011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.756741046905518, + "rewards/margins": 14.202244758605957, + "rewards/rejected": -18.958986282348633, + "step": 3360 + }, + { + "epoch": 1.54, + "learning_rate": 2.707523245984784e-07, + "logits/chosen": -1.954490065574646, + "logits/rejected": -1.847299337387085, + "logps/chosen": -137.61386108398438, + "logps/rejected": -256.4851379394531, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.715554237365723, + "rewards/margins": 13.87446403503418, + "rewards/rejected": -18.59002113342285, + "step": 3370 + }, + { + "epoch": 1.54, + "learning_rate": 2.699070160608622e-07, + "logits/chosen": -1.930686593055725, + "logits/rejected": -1.7932345867156982, + "logps/chosen": -137.0919647216797, + "logps/rejected": -268.186767578125, + "loss": 0.0057, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.4739766120910645, + "rewards/margins": 15.397375106811523, + "rewards/rejected": -19.87135124206543, + "step": 3380 + }, + { + "epoch": 1.55, + "learning_rate": 2.69061707523246e-07, + "logits/chosen": -1.902661681175232, + "logits/rejected": -1.7563111782073975, + "logps/chosen": -132.42283630371094, + "logps/rejected": -278.3194885253906, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.570847034454346, + "rewards/margins": 16.12563133239746, + "rewards/rejected": -20.69647979736328, + "step": 3390 + }, + { + "epoch": 1.55, + "learning_rate": 2.6821639898562973e-07, + "logits/chosen": -1.9157822132110596, + "logits/rejected": -1.7864373922348022, + "logps/chosen": -136.57540893554688, + "logps/rejected": -260.8625793457031, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.147719383239746, + "rewards/margins": 15.035512924194336, + "rewards/rejected": -19.183231353759766, + "step": 3400 + }, + { + "epoch": 1.55, + "eval_logits/chosen": -1.8729345798492432, + "eval_logits/rejected": -1.7467317581176758, + "eval_logps/chosen": -139.7764129638672, + "eval_logps/rejected": -252.1146697998047, + "eval_loss": 0.021492039784789085, + "eval_rewards/accuracies": 0.9861111044883728, + "eval_rewards/chosen": -4.8539228439331055, + "eval_rewards/margins": 13.347084999084473, + "eval_rewards/rejected": -18.20100975036621, + "eval_runtime": 48.9371, + "eval_samples_per_second": 58.483, + "eval_steps_per_second": 1.839, + "step": 3400 + }, + { + "epoch": 1.56, + "learning_rate": 2.6737109044801354e-07, + "logits/chosen": -1.9103200435638428, + "logits/rejected": -1.763331651687622, + "logps/chosen": -137.3577117919922, + "logps/rejected": -259.80828857421875, + "loss": 0.0142, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.467274188995361, + "rewards/margins": 14.363429069519043, + "rewards/rejected": -18.830707550048828, + "step": 3410 + }, + { + "epoch": 1.56, + "learning_rate": 2.6652578191039725e-07, + "logits/chosen": -1.8430370092391968, + "logits/rejected": -1.711909532546997, + "logps/chosen": -148.16954040527344, + "logps/rejected": -279.33135986328125, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.628726482391357, + "rewards/margins": 15.455856323242188, + "rewards/rejected": -21.08458137512207, + "step": 3420 + }, + { + "epoch": 1.57, + "learning_rate": 2.6568047337278106e-07, + "logits/chosen": -1.8523824214935303, + "logits/rejected": -1.7088029384613037, + "logps/chosen": -142.4571990966797, + "logps/rejected": -282.54547119140625, + "loss": 0.0082, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.997100830078125, + "rewards/margins": 15.815884590148926, + "rewards/rejected": -20.81298828125, + "step": 3430 + }, + { + "epoch": 1.57, + "learning_rate": 2.648351648351648e-07, + "logits/chosen": -1.9678691625595093, + "logits/rejected": -1.867297887802124, + "logps/chosen": -135.94393920898438, + "logps/rejected": -251.23452758789062, + "loss": 0.0127, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.613978624343872, + "rewards/margins": 13.765337944030762, + "rewards/rejected": -17.379316329956055, + "step": 3440 + }, + { + "epoch": 1.57, + "learning_rate": 2.639898562975486e-07, + "logits/chosen": -1.9418270587921143, + "logits/rejected": -1.833435297012329, + "logps/chosen": -131.86508178710938, + "logps/rejected": -259.078857421875, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.501054286956787, + "rewards/margins": 15.120953559875488, + "rewards/rejected": -18.622005462646484, + "step": 3450 + }, + { + "epoch": 1.58, + "learning_rate": 2.631445477599324e-07, + "logits/chosen": -1.9227845668792725, + "logits/rejected": -1.8239774703979492, + "logps/chosen": -133.7466278076172, + "logps/rejected": -259.53369140625, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.523486614227295, + "rewards/margins": 14.523590087890625, + "rewards/rejected": -19.047077178955078, + "step": 3460 + }, + { + "epoch": 1.58, + "learning_rate": 2.6229923922231614e-07, + "logits/chosen": -2.066107988357544, + "logits/rejected": -1.9604514837265015, + "logps/chosen": -122.82148742675781, + "logps/rejected": -241.93679809570312, + "loss": 0.0126, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.367501735687256, + "rewards/margins": 13.92534065246582, + "rewards/rejected": -17.2928409576416, + "step": 3470 + }, + { + "epoch": 1.59, + "learning_rate": 2.614539306846999e-07, + "logits/chosen": -2.0362019538879395, + "logits/rejected": -1.9253215789794922, + "logps/chosen": -126.20308685302734, + "logps/rejected": -241.9541473388672, + "loss": 0.0126, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.430163860321045, + "rewards/margins": 13.962501525878906, + "rewards/rejected": -17.39266586303711, + "step": 3480 + }, + { + "epoch": 1.59, + "learning_rate": 2.606086221470837e-07, + "logits/chosen": -2.0087404251098633, + "logits/rejected": -1.8917499780654907, + "logps/chosen": -138.81875610351562, + "logps/rejected": -268.839111328125, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.0037407875061035, + "rewards/margins": 15.63740348815918, + "rewards/rejected": -19.641145706176758, + "step": 3490 + }, + { + "epoch": 1.6, + "learning_rate": 2.5976331360946746e-07, + "logits/chosen": -1.9867855310440063, + "logits/rejected": -1.867781400680542, + "logps/chosen": -122.08673095703125, + "logps/rejected": -257.277587890625, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2326064109802246, + "rewards/margins": 15.269567489624023, + "rewards/rejected": -18.502174377441406, + "step": 3500 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.8975638151168823, + "eval_logits/rejected": -1.7721211910247803, + "eval_logps/chosen": -142.01966857910156, + "eval_logps/rejected": -266.510009765625, + "eval_loss": 0.021842440590262413, + "eval_rewards/accuracies": 0.9833333492279053, + "eval_rewards/chosen": -5.078246116638184, + "eval_rewards/margins": 14.562296867370605, + "eval_rewards/rejected": -19.64054298400879, + "eval_runtime": 48.4963, + "eval_samples_per_second": 59.015, + "eval_steps_per_second": 1.856, + "step": 3500 + }, + { + "epoch": 1.6, + "learning_rate": 2.5891800507185117e-07, + "logits/chosen": -1.9047114849090576, + "logits/rejected": -1.7761516571044922, + "logps/chosen": -140.54165649414062, + "logps/rejected": -272.0914611816406, + "loss": 0.0052, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.641226768493652, + "rewards/margins": 15.182680130004883, + "rewards/rejected": -19.82390594482422, + "step": 3510 + }, + { + "epoch": 1.61, + "learning_rate": 2.58072696534235e-07, + "logits/chosen": -1.892962098121643, + "logits/rejected": -1.7451508045196533, + "logps/chosen": -150.09315490722656, + "logps/rejected": -296.66632080078125, + "loss": 0.0107, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.678755760192871, + "rewards/margins": 16.952327728271484, + "rewards/rejected": -22.63108253479004, + "step": 3520 + }, + { + "epoch": 1.61, + "learning_rate": 2.5722738799661873e-07, + "logits/chosen": -1.8864864110946655, + "logits/rejected": -1.7408676147460938, + "logps/chosen": -141.12594604492188, + "logps/rejected": -289.6314697265625, + "loss": 0.0091, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.263436794281006, + "rewards/margins": 16.858762741088867, + "rewards/rejected": -22.122201919555664, + "step": 3530 + }, + { + "epoch": 1.62, + "learning_rate": 2.5638207945900254e-07, + "logits/chosen": -1.8301013708114624, + "logits/rejected": -1.6557966470718384, + "logps/chosen": -144.84866333007812, + "logps/rejected": -302.3169860839844, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.072390079498291, + "rewards/margins": 17.961421966552734, + "rewards/rejected": -23.033809661865234, + "step": 3540 + }, + { + "epoch": 1.62, + "learning_rate": 2.555367709213863e-07, + "logits/chosen": -1.874068021774292, + "logits/rejected": -1.708491325378418, + "logps/chosen": -133.3509063720703, + "logps/rejected": -283.3302917480469, + "loss": 0.0113, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.035571098327637, + "rewards/margins": 17.044132232666016, + "rewards/rejected": -21.079702377319336, + "step": 3550 + }, + { + "epoch": 1.62, + "learning_rate": 2.5469146238377006e-07, + "logits/chosen": -1.8753341436386108, + "logits/rejected": -1.7035486698150635, + "logps/chosen": -132.8619384765625, + "logps/rejected": -260.8714294433594, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.194260835647583, + "rewards/margins": 15.806689262390137, + "rewards/rejected": -19.00094985961914, + "step": 3560 + }, + { + "epoch": 1.63, + "learning_rate": 2.538461538461538e-07, + "logits/chosen": -1.8306610584259033, + "logits/rejected": -1.6790813207626343, + "logps/chosen": -129.2947540283203, + "logps/rejected": -261.3028259277344, + "loss": 0.0061, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.236842632293701, + "rewards/margins": 14.93622875213623, + "rewards/rejected": -19.173070907592773, + "step": 3570 + }, + { + "epoch": 1.63, + "learning_rate": 2.530008453085376e-07, + "logits/chosen": -1.8658215999603271, + "logits/rejected": -1.695593237876892, + "logps/chosen": -122.03929138183594, + "logps/rejected": -287.39764404296875, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4829065799713135, + "rewards/margins": 19.046436309814453, + "rewards/rejected": -21.529342651367188, + "step": 3580 + }, + { + "epoch": 1.64, + "learning_rate": 2.521555367709214e-07, + "logits/chosen": -1.784368872642517, + "logits/rejected": -1.6300216913223267, + "logps/chosen": -140.92726135253906, + "logps/rejected": -296.0577087402344, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.633268356323242, + "rewards/margins": 16.99161148071289, + "rewards/rejected": -21.6248779296875, + "step": 3590 + }, + { + "epoch": 1.64, + "learning_rate": 2.513102282333052e-07, + "logits/chosen": -1.7988407611846924, + "logits/rejected": -1.5915801525115967, + "logps/chosen": -130.07949829101562, + "logps/rejected": -284.1544494628906, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.352548122406006, + "rewards/margins": 18.076284408569336, + "rewards/rejected": -21.428834915161133, + "step": 3600 + }, + { + "epoch": 1.64, + "eval_logits/chosen": -1.7237591743469238, + "eval_logits/rejected": -1.5604692697525024, + "eval_logps/chosen": -142.53948974609375, + "eval_logps/rejected": -272.5372314453125, + "eval_loss": 0.021309753879904747, + "eval_rewards/accuracies": 0.9777777791023254, + "eval_rewards/chosen": -5.13023042678833, + "eval_rewards/margins": 15.113033294677734, + "eval_rewards/rejected": -20.243263244628906, + "eval_runtime": 48.3893, + "eval_samples_per_second": 59.145, + "eval_steps_per_second": 1.86, + "step": 3600 + }, + { + "epoch": 1.65, + "learning_rate": 2.504649196956889e-07, + "logits/chosen": -1.7095403671264648, + "logits/rejected": -1.5458117723464966, + "logps/chosen": -145.26849365234375, + "logps/rejected": -284.03741455078125, + "loss": 0.0075, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.109766960144043, + "rewards/margins": 16.189342498779297, + "rewards/rejected": -21.299108505249023, + "step": 3610 + }, + { + "epoch": 1.65, + "learning_rate": 2.4961961115807265e-07, + "logits/chosen": -1.779497504234314, + "logits/rejected": -1.641000747680664, + "logps/chosen": -131.517333984375, + "logps/rejected": -281.7998352050781, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.562431335449219, + "rewards/margins": 16.709604263305664, + "rewards/rejected": -21.272035598754883, + "step": 3620 + }, + { + "epoch": 1.66, + "learning_rate": 2.4877430262045646e-07, + "logits/chosen": -1.8060439825057983, + "logits/rejected": -1.6531273126602173, + "logps/chosen": -133.15786743164062, + "logps/rejected": -289.7575988769531, + "loss": 0.0101, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.9135138988494873, + "rewards/margins": 17.889179229736328, + "rewards/rejected": -21.802692413330078, + "step": 3630 + }, + { + "epoch": 1.66, + "learning_rate": 2.479289940828402e-07, + "logits/chosen": -1.8380343914031982, + "logits/rejected": -1.681099534034729, + "logps/chosen": -142.34280395507812, + "logps/rejected": -281.3111267089844, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.688026428222656, + "rewards/margins": 16.42477035522461, + "rewards/rejected": -21.112796783447266, + "step": 3640 + }, + { + "epoch": 1.67, + "learning_rate": 2.47083685545224e-07, + "logits/chosen": -1.7750627994537354, + "logits/rejected": -1.6246888637542725, + "logps/chosen": -138.1237335205078, + "logps/rejected": -296.9304504394531, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.636274337768555, + "rewards/margins": 17.649145126342773, + "rewards/rejected": -22.285419464111328, + "step": 3650 + }, + { + "epoch": 1.67, + "learning_rate": 2.462383770076078e-07, + "logits/chosen": -1.7485411167144775, + "logits/rejected": -1.5784364938735962, + "logps/chosen": -146.9974365234375, + "logps/rejected": -324.06549072265625, + "loss": 0.0072, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.838747978210449, + "rewards/margins": 19.2270565032959, + "rewards/rejected": -25.065805435180664, + "step": 3660 + }, + { + "epoch": 1.68, + "learning_rate": 2.4539306846999154e-07, + "logits/chosen": -1.648329496383667, + "logits/rejected": -1.4489551782608032, + "logps/chosen": -146.84127807617188, + "logps/rejected": -308.2528381347656, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.316080570220947, + "rewards/margins": 18.388378143310547, + "rewards/rejected": -23.70446014404297, + "step": 3670 + }, + { + "epoch": 1.68, + "learning_rate": 2.445477599323753e-07, + "logits/chosen": -1.696155309677124, + "logits/rejected": -1.4816488027572632, + "logps/chosen": -144.35195922851562, + "logps/rejected": -316.3251953125, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.834813594818115, + "rewards/margins": 19.11313819885254, + "rewards/rejected": -23.947948455810547, + "step": 3680 + }, + { + "epoch": 1.68, + "learning_rate": 2.437024513947591e-07, + "logits/chosen": -1.6801557540893555, + "logits/rejected": -1.4912935495376587, + "logps/chosen": -148.6354522705078, + "logps/rejected": -324.12432861328125, + "loss": 0.0073, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.01719856262207, + "rewards/margins": 20.103734970092773, + "rewards/rejected": -25.120933532714844, + "step": 3690 + }, + { + "epoch": 1.69, + "learning_rate": 2.4285714285714287e-07, + "logits/chosen": -1.7024446725845337, + "logits/rejected": -1.4968335628509521, + "logps/chosen": -130.99986267089844, + "logps/rejected": -303.06231689453125, + "loss": 0.0076, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.061335563659668, + "rewards/margins": 18.91769027709961, + "rewards/rejected": -22.979026794433594, + "step": 3700 + }, + { + "epoch": 1.69, + "eval_logits/chosen": -1.6558696031570435, + "eval_logits/rejected": -1.4704645872116089, + "eval_logps/chosen": -147.81668090820312, + "eval_logps/rejected": -295.7735290527344, + "eval_loss": 0.022238925099372864, + "eval_rewards/accuracies": 0.9777777791023254, + "eval_rewards/chosen": -5.657946586608887, + "eval_rewards/margins": 16.908945083618164, + "eval_rewards/rejected": -22.566892623901367, + "eval_runtime": 48.4889, + "eval_samples_per_second": 59.024, + "eval_steps_per_second": 1.856, + "step": 3700 + }, + { + "epoch": 1.69, + "learning_rate": 2.420118343195266e-07, + "logits/chosen": -1.7344181537628174, + "logits/rejected": -1.5882803201675415, + "logps/chosen": -140.1300506591797, + "logps/rejected": -290.84539794921875, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.182340145111084, + "rewards/margins": 17.883474349975586, + "rewards/rejected": -22.065814971923828, + "step": 3710 + }, + { + "epoch": 1.7, + "learning_rate": 2.411665257819104e-07, + "logits/chosen": -1.741615653038025, + "logits/rejected": -1.5667476654052734, + "logps/chosen": -133.61981201171875, + "logps/rejected": -274.44439697265625, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.086359977722168, + "rewards/margins": 16.344524383544922, + "rewards/rejected": -20.43088150024414, + "step": 3720 + }, + { + "epoch": 1.7, + "learning_rate": 2.4032121724429414e-07, + "logits/chosen": -1.8133243322372437, + "logits/rejected": -1.6576799154281616, + "logps/chosen": -137.39431762695312, + "logps/rejected": -262.45318603515625, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.237823486328125, + "rewards/margins": 14.64301872253418, + "rewards/rejected": -18.880840301513672, + "step": 3730 + }, + { + "epoch": 1.71, + "learning_rate": 2.394759087066779e-07, + "logits/chosen": -1.7763252258300781, + "logits/rejected": -1.5898211002349854, + "logps/chosen": -134.40628051757812, + "logps/rejected": -280.9346008300781, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.767348527908325, + "rewards/margins": 17.052318572998047, + "rewards/rejected": -20.81966781616211, + "step": 3740 + }, + { + "epoch": 1.71, + "learning_rate": 2.386306001690617e-07, + "logits/chosen": -1.6497255563735962, + "logits/rejected": -1.4708611965179443, + "logps/chosen": -136.38125610351562, + "logps/rejected": -289.7961730957031, + "loss": 0.0054, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.592088222503662, + "rewards/margins": 17.545528411865234, + "rewards/rejected": -22.13762092590332, + "step": 3750 + }, + { + "epoch": 1.72, + "learning_rate": 2.377852916314455e-07, + "logits/chosen": -1.7078943252563477, + "logits/rejected": -1.5805634260177612, + "logps/chosen": -135.6635284423828, + "logps/rejected": -278.7815856933594, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.852513313293457, + "rewards/margins": 15.829513549804688, + "rewards/rejected": -20.68202781677246, + "step": 3760 + }, + { + "epoch": 1.72, + "learning_rate": 2.3693998309382922e-07, + "logits/chosen": -1.7141424417495728, + "logits/rejected": -1.585723638534546, + "logps/chosen": -130.74488830566406, + "logps/rejected": -281.3462829589844, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7584216594696045, + "rewards/margins": 16.653173446655273, + "rewards/rejected": -20.41159439086914, + "step": 3770 + }, + { + "epoch": 1.73, + "learning_rate": 2.36094674556213e-07, + "logits/chosen": -1.7240254878997803, + "logits/rejected": -1.5711407661437988, + "logps/chosen": -131.56781005859375, + "logps/rejected": -273.98455810546875, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.972919464111328, + "rewards/margins": 16.38176155090332, + "rewards/rejected": -20.354679107666016, + "step": 3780 + }, + { + "epoch": 1.73, + "learning_rate": 2.3524936601859676e-07, + "logits/chosen": -1.7031100988388062, + "logits/rejected": -1.5339725017547607, + "logps/chosen": -139.65846252441406, + "logps/rejected": -275.16375732421875, + "loss": 0.0161, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.4210100173950195, + "rewards/margins": 15.863576889038086, + "rewards/rejected": -20.284587860107422, + "step": 3790 + }, + { + "epoch": 1.73, + "learning_rate": 2.3440405748098055e-07, + "logits/chosen": -1.8004308938980103, + "logits/rejected": -1.6649887561798096, + "logps/chosen": -116.216552734375, + "logps/rejected": -240.1760711669922, + "loss": 0.0124, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.5760066509246826, + "rewards/margins": 14.410638809204102, + "rewards/rejected": -16.98664665222168, + "step": 3800 + }, + { + "epoch": 1.73, + "eval_logits/chosen": -1.7606146335601807, + "eval_logits/rejected": -1.6381984949111938, + "eval_logps/chosen": -126.71410369873047, + "eval_logps/rejected": -231.4264373779297, + "eval_loss": 0.022137422114610672, + "eval_rewards/accuracies": 0.9861111044883728, + "eval_rewards/chosen": -3.5476911067962646, + "eval_rewards/margins": 12.584492683410645, + "eval_rewards/rejected": -16.132184982299805, + "eval_runtime": 49.0971, + "eval_samples_per_second": 58.293, + "eval_steps_per_second": 1.833, + "step": 3800 + }, + { + "epoch": 1.74, + "learning_rate": 2.3355874894336433e-07, + "logits/chosen": -1.7343565225601196, + "logits/rejected": -1.6265987157821655, + "logps/chosen": -127.9572982788086, + "logps/rejected": -241.6963653564453, + "loss": 0.0048, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.091909408569336, + "rewards/margins": 13.281129837036133, + "rewards/rejected": -17.373037338256836, + "step": 3810 + }, + { + "epoch": 1.74, + "learning_rate": 2.327134404057481e-07, + "logits/chosen": -1.6919950246810913, + "logits/rejected": -1.5495738983154297, + "logps/chosen": -132.85983276367188, + "logps/rejected": -267.4934387207031, + "loss": 0.0059, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.089395046234131, + "rewards/margins": 15.883015632629395, + "rewards/rejected": -19.972410202026367, + "step": 3820 + }, + { + "epoch": 1.75, + "learning_rate": 2.3186813186813187e-07, + "logits/chosen": -1.7256600856781006, + "logits/rejected": -1.553453803062439, + "logps/chosen": -134.67343139648438, + "logps/rejected": -279.3134765625, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.113102912902832, + "rewards/margins": 16.799463272094727, + "rewards/rejected": -20.912565231323242, + "step": 3830 + }, + { + "epoch": 1.75, + "learning_rate": 2.3102282333051563e-07, + "logits/chosen": -1.6534430980682373, + "logits/rejected": -1.5118329524993896, + "logps/chosen": -132.36349487304688, + "logps/rejected": -284.98406982421875, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.512646198272705, + "rewards/margins": 17.21428871154785, + "rewards/rejected": -21.7269344329834, + "step": 3840 + }, + { + "epoch": 1.76, + "learning_rate": 2.301775147928994e-07, + "logits/chosen": -1.6359570026397705, + "logits/rejected": -1.4759365320205688, + "logps/chosen": -138.23097229003906, + "logps/rejected": -283.275146484375, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.575328350067139, + "rewards/margins": 16.573030471801758, + "rewards/rejected": -21.148357391357422, + "step": 3850 + }, + { + "epoch": 1.76, + "learning_rate": 2.2933220625528317e-07, + "logits/chosen": -1.7314882278442383, + "logits/rejected": -1.6037628650665283, + "logps/chosen": -136.9398651123047, + "logps/rejected": -272.9326171875, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.366036415100098, + "rewards/margins": 15.990483283996582, + "rewards/rejected": -20.35651969909668, + "step": 3860 + }, + { + "epoch": 1.77, + "learning_rate": 2.2848689771766693e-07, + "logits/chosen": -1.7622716426849365, + "logits/rejected": -1.6511011123657227, + "logps/chosen": -128.43710327148438, + "logps/rejected": -265.1844787597656, + "loss": 0.0087, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.14860200881958, + "rewards/margins": 15.2080659866333, + "rewards/rejected": -19.356666564941406, + "step": 3870 + }, + { + "epoch": 1.77, + "learning_rate": 2.276415891800507e-07, + "logits/chosen": -1.8153717517852783, + "logits/rejected": -1.6824384927749634, + "logps/chosen": -138.5544891357422, + "logps/rejected": -276.2153015136719, + "loss": 0.0031, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.317188739776611, + "rewards/margins": 15.966781616210938, + "rewards/rejected": -20.28396987915039, + "step": 3880 + }, + { + "epoch": 1.78, + "learning_rate": 2.2679628064243447e-07, + "logits/chosen": -1.8020213842391968, + "logits/rejected": -1.6542917490005493, + "logps/chosen": -123.3238754272461, + "logps/rejected": -290.70306396484375, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0738091468811035, + "rewards/margins": 18.742576599121094, + "rewards/rejected": -21.81638526916504, + "step": 3890 + }, + { + "epoch": 1.78, + "learning_rate": 2.2595097210481825e-07, + "logits/chosen": -1.7502750158309937, + "logits/rejected": -1.5834579467773438, + "logps/chosen": -141.47828674316406, + "logps/rejected": -303.58599853515625, + "loss": 0.0065, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.480618476867676, + "rewards/margins": 18.75547981262207, + "rewards/rejected": -23.236099243164062, + "step": 3900 + }, + { + "epoch": 1.78, + "eval_logits/chosen": -1.713238000869751, + "eval_logits/rejected": -1.565711259841919, + "eval_logps/chosen": -141.85250854492188, + "eval_logps/rejected": -270.1767272949219, + "eval_loss": 0.02007424458861351, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -5.06152868270874, + "eval_rewards/margins": 14.945685386657715, + "eval_rewards/rejected": -20.007213592529297, + "eval_runtime": 48.854, + "eval_samples_per_second": 58.583, + "eval_steps_per_second": 1.842, + "step": 3900 + }, + { + "epoch": 1.78, + "learning_rate": 2.25105663567202e-07, + "logits/chosen": -1.744927167892456, + "logits/rejected": -1.593875527381897, + "logps/chosen": -132.0017547607422, + "logps/rejected": -284.510498046875, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.235764026641846, + "rewards/margins": 17.11020278930664, + "rewards/rejected": -21.345964431762695, + "step": 3910 + }, + { + "epoch": 1.79, + "learning_rate": 2.242603550295858e-07, + "logits/chosen": -1.7243611812591553, + "logits/rejected": -1.575244426727295, + "logps/chosen": -135.41140747070312, + "logps/rejected": -294.58331298828125, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.456456184387207, + "rewards/margins": 17.783428192138672, + "rewards/rejected": -22.23988151550293, + "step": 3920 + }, + { + "epoch": 1.79, + "learning_rate": 2.2341504649196957e-07, + "logits/chosen": -1.7868388891220093, + "logits/rejected": -1.6776450872421265, + "logps/chosen": -143.6422576904297, + "logps/rejected": -281.7991638183594, + "loss": 0.0085, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.448180198669434, + "rewards/margins": 16.026264190673828, + "rewards/rejected": -20.474443435668945, + "step": 3930 + }, + { + "epoch": 1.8, + "learning_rate": 2.2256973795435333e-07, + "logits/chosen": -1.7921279668807983, + "logits/rejected": -1.6391985416412354, + "logps/chosen": -136.37741088867188, + "logps/rejected": -279.2124328613281, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.4318318367004395, + "rewards/margins": 16.44356918334961, + "rewards/rejected": -20.87540054321289, + "step": 3940 + }, + { + "epoch": 1.8, + "learning_rate": 2.2172442941673711e-07, + "logits/chosen": -1.7016105651855469, + "logits/rejected": -1.5557249784469604, + "logps/chosen": -137.5961151123047, + "logps/rejected": -281.9116516113281, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.446589469909668, + "rewards/margins": 16.51205825805664, + "rewards/rejected": -20.958646774291992, + "step": 3950 + }, + { + "epoch": 1.81, + "learning_rate": 2.2087912087912087e-07, + "logits/chosen": -1.70087468624115, + "logits/rejected": -1.5272462368011475, + "logps/chosen": -134.16659545898438, + "logps/rejected": -300.7137145996094, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.51452112197876, + "rewards/margins": 18.287456512451172, + "rewards/rejected": -22.80197525024414, + "step": 3960 + }, + { + "epoch": 1.81, + "learning_rate": 2.2003381234150466e-07, + "logits/chosen": -1.7029697895050049, + "logits/rejected": -1.5384533405303955, + "logps/chosen": -135.23904418945312, + "logps/rejected": -311.7080993652344, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.424857139587402, + "rewards/margins": 19.508625030517578, + "rewards/rejected": -23.933483123779297, + "step": 3970 + }, + { + "epoch": 1.82, + "learning_rate": 2.1918850380388839e-07, + "logits/chosen": -1.7048301696777344, + "logits/rejected": -1.5668028593063354, + "logps/chosen": -155.3131103515625, + "logps/rejected": -305.61492919921875, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.269586086273193, + "rewards/margins": 17.230533599853516, + "rewards/rejected": -23.5001163482666, + "step": 3980 + }, + { + "epoch": 1.82, + "learning_rate": 2.1834319526627217e-07, + "logits/chosen": -1.716619849205017, + "logits/rejected": -1.5665405988693237, + "logps/chosen": -146.93238830566406, + "logps/rejected": -297.7852783203125, + "loss": 0.0243, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.378264427185059, + "rewards/margins": 17.037546157836914, + "rewards/rejected": -22.41581153869629, + "step": 3990 + }, + { + "epoch": 1.83, + "learning_rate": 2.1749788672865595e-07, + "logits/chosen": -1.7857425212860107, + "logits/rejected": -1.641950011253357, + "logps/chosen": -142.95773315429688, + "logps/rejected": -291.2868347167969, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.872292518615723, + "rewards/margins": 17.185726165771484, + "rewards/rejected": -22.058019638061523, + "step": 4000 + }, + { + "epoch": 1.83, + "eval_logits/chosen": -1.762734293937683, + "eval_logits/rejected": -1.618021011352539, + "eval_logps/chosen": -146.5255126953125, + "eval_logps/rejected": -275.50830078125, + "eval_loss": 0.019976630806922913, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -5.528830528259277, + "eval_rewards/margins": 15.011541366577148, + "eval_rewards/rejected": -20.540372848510742, + "eval_runtime": 49.1871, + "eval_samples_per_second": 58.186, + "eval_steps_per_second": 1.83, + "step": 4000 + }, + { + "epoch": 1.83, + "learning_rate": 2.166525781910397e-07, + "logits/chosen": -1.8257217407226562, + "logits/rejected": -1.7216644287109375, + "logps/chosen": -149.7119598388672, + "logps/rejected": -270.9111633300781, + "loss": 0.0134, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.1475443840026855, + "rewards/margins": 14.822622299194336, + "rewards/rejected": -19.970165252685547, + "step": 4010 + }, + { + "epoch": 1.83, + "learning_rate": 2.158072696534235e-07, + "logits/chosen": -1.81096613407135, + "logits/rejected": -1.7163082361221313, + "logps/chosen": -126.8602294921875, + "logps/rejected": -255.57363891601562, + "loss": 0.0145, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.170827865600586, + "rewards/margins": 14.215136528015137, + "rewards/rejected": -18.385963439941406, + "step": 4020 + }, + { + "epoch": 1.84, + "learning_rate": 2.1496196111580725e-07, + "logits/chosen": -1.9823904037475586, + "logits/rejected": -1.87985360622406, + "logps/chosen": -136.60366821289062, + "logps/rejected": -257.38446044921875, + "loss": 0.0059, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.11444091796875, + "rewards/margins": 14.040242195129395, + "rewards/rejected": -18.154682159423828, + "step": 4030 + }, + { + "epoch": 1.84, + "learning_rate": 2.1411665257819104e-07, + "logits/chosen": -1.9636681079864502, + "logits/rejected": -1.8081839084625244, + "logps/chosen": -132.60195922851562, + "logps/rejected": -262.4747619628906, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0300045013427734, + "rewards/margins": 15.940587043762207, + "rewards/rejected": -18.970592498779297, + "step": 4040 + }, + { + "epoch": 1.85, + "learning_rate": 2.1327134404057482e-07, + "logits/chosen": -1.9444208145141602, + "logits/rejected": -1.8486173152923584, + "logps/chosen": -131.70828247070312, + "logps/rejected": -259.8726501464844, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.042513370513916, + "rewards/margins": 16.066484451293945, + "rewards/rejected": -19.108997344970703, + "step": 4050 + }, + { + "epoch": 1.85, + "learning_rate": 2.1242603550295858e-07, + "logits/chosen": -1.8983211517333984, + "logits/rejected": -1.7571252584457397, + "logps/chosen": -134.80789184570312, + "logps/rejected": -263.4311218261719, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.634955644607544, + "rewards/margins": 15.251360893249512, + "rewards/rejected": -18.886316299438477, + "step": 4060 + }, + { + "epoch": 1.86, + "learning_rate": 2.1158072696534236e-07, + "logits/chosen": -1.8505229949951172, + "logits/rejected": -1.7038662433624268, + "logps/chosen": -133.12786865234375, + "logps/rejected": -275.3601379394531, + "loss": 0.0042, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.181855201721191, + "rewards/margins": 15.866403579711914, + "rewards/rejected": -20.04825782775879, + "step": 4070 + }, + { + "epoch": 1.86, + "learning_rate": 2.107354184277261e-07, + "logits/chosen": -1.8010523319244385, + "logits/rejected": -1.6853234767913818, + "logps/chosen": -131.32154846191406, + "logps/rejected": -284.50250244140625, + "loss": 0.0065, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.455392837524414, + "rewards/margins": 16.990983963012695, + "rewards/rejected": -21.44637680053711, + "step": 4080 + }, + { + "epoch": 1.87, + "learning_rate": 2.0989010989010987e-07, + "logits/chosen": -1.8266456127166748, + "logits/rejected": -1.6853101253509521, + "logps/chosen": -136.2552947998047, + "logps/rejected": -295.13653564453125, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.681432723999023, + "rewards/margins": 17.44306182861328, + "rewards/rejected": -22.12449073791504, + "step": 4090 + }, + { + "epoch": 1.87, + "learning_rate": 2.0904480135249363e-07, + "logits/chosen": -1.8539117574691772, + "logits/rejected": -1.6934545040130615, + "logps/chosen": -141.95208740234375, + "logps/rejected": -311.4937438964844, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.835737228393555, + "rewards/margins": 18.804773330688477, + "rewards/rejected": -23.64051055908203, + "step": 4100 + }, + { + "epoch": 1.87, + "eval_logits/chosen": -1.799795150756836, + "eval_logits/rejected": -1.637331247329712, + "eval_logps/chosen": -147.08396911621094, + "eval_logps/rejected": -292.2131652832031, + "eval_loss": 0.020981300622224808, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -5.584676265716553, + "eval_rewards/margins": 16.62618064880371, + "eval_rewards/rejected": -22.21085548400879, + "eval_runtime": 48.7877, + "eval_samples_per_second": 58.662, + "eval_steps_per_second": 1.845, + "step": 4100 + }, + { + "epoch": 1.88, + "learning_rate": 2.0819949281487741e-07, + "logits/chosen": -1.8189401626586914, + "logits/rejected": -1.630281686782837, + "logps/chosen": -142.0605010986328, + "logps/rejected": -307.12725830078125, + "loss": 0.0057, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.467617034912109, + "rewards/margins": 19.189804077148438, + "rewards/rejected": -23.657419204711914, + "step": 4110 + }, + { + "epoch": 1.88, + "learning_rate": 2.073541842772612e-07, + "logits/chosen": -1.814287781715393, + "logits/rejected": -1.632960557937622, + "logps/chosen": -142.54632568359375, + "logps/rejected": -323.35992431640625, + "loss": 0.005, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.713547706604004, + "rewards/margins": 20.30032730102539, + "rewards/rejected": -25.013872146606445, + "step": 4120 + }, + { + "epoch": 1.88, + "learning_rate": 2.0650887573964496e-07, + "logits/chosen": -1.8119704723358154, + "logits/rejected": -1.6724140644073486, + "logps/chosen": -145.4338836669922, + "logps/rejected": -319.1186218261719, + "loss": 0.009, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.1287946701049805, + "rewards/margins": 18.313566207885742, + "rewards/rejected": -24.44236183166504, + "step": 4130 + }, + { + "epoch": 1.89, + "learning_rate": 2.0566356720202874e-07, + "logits/chosen": -1.8369214534759521, + "logits/rejected": -1.675672173500061, + "logps/chosen": -154.06927490234375, + "logps/rejected": -305.5340576171875, + "loss": 0.0025, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.684229850769043, + "rewards/margins": 17.68790626525879, + "rewards/rejected": -23.372135162353516, + "step": 4140 + }, + { + "epoch": 1.89, + "learning_rate": 2.048182586644125e-07, + "logits/chosen": -1.8302981853485107, + "logits/rejected": -1.6432266235351562, + "logps/chosen": -141.11495971679688, + "logps/rejected": -297.094482421875, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.820776462554932, + "rewards/margins": 18.21160316467285, + "rewards/rejected": -23.032377243041992, + "step": 4150 + }, + { + "epoch": 1.9, + "learning_rate": 2.0397295012679628e-07, + "logits/chosen": -1.8787815570831299, + "logits/rejected": -1.7506415843963623, + "logps/chosen": -139.04129028320312, + "logps/rejected": -285.0088806152344, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.390732765197754, + "rewards/margins": 17.23700523376465, + "rewards/rejected": -21.627737045288086, + "step": 4160 + }, + { + "epoch": 1.9, + "learning_rate": 2.0312764158918006e-07, + "logits/chosen": -1.8781499862670898, + "logits/rejected": -1.731313705444336, + "logps/chosen": -134.88818359375, + "logps/rejected": -275.73553466796875, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5239157676696777, + "rewards/margins": 16.483020782470703, + "rewards/rejected": -20.006938934326172, + "step": 4170 + }, + { + "epoch": 1.91, + "learning_rate": 2.0228233305156382e-07, + "logits/chosen": -1.7998660802841187, + "logits/rejected": -1.6408767700195312, + "logps/chosen": -135.11972045898438, + "logps/rejected": -277.9287414550781, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2441205978393555, + "rewards/margins": 16.757963180541992, + "rewards/rejected": -21.002086639404297, + "step": 4180 + }, + { + "epoch": 1.91, + "learning_rate": 2.0143702451394758e-07, + "logits/chosen": -1.7851566076278687, + "logits/rejected": -1.6121336221694946, + "logps/chosen": -137.66769409179688, + "logps/rejected": -306.99420166015625, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.519832134246826, + "rewards/margins": 18.73788070678711, + "rewards/rejected": -23.257715225219727, + "step": 4190 + }, + { + "epoch": 1.92, + "learning_rate": 2.0059171597633133e-07, + "logits/chosen": -1.7903058528900146, + "logits/rejected": -1.6327577829360962, + "logps/chosen": -142.868896484375, + "logps/rejected": -303.55194091796875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.210319519042969, + "rewards/margins": 18.065744400024414, + "rewards/rejected": -23.27606201171875, + "step": 4200 + }, + { + "epoch": 1.92, + "eval_logits/chosen": -1.7379149198532104, + "eval_logits/rejected": -1.5687103271484375, + "eval_logps/chosen": -143.3500213623047, + "eval_logps/rejected": -284.3860778808594, + "eval_loss": 0.02061247080564499, + "eval_rewards/accuracies": 0.9833333492279053, + "eval_rewards/chosen": -5.211281776428223, + "eval_rewards/margins": 16.216867446899414, + "eval_rewards/rejected": -21.428150177001953, + "eval_runtime": 49.1094, + "eval_samples_per_second": 58.278, + "eval_steps_per_second": 1.833, + "step": 4200 + }, + { + "epoch": 1.92, + "learning_rate": 1.9974640743871512e-07, + "logits/chosen": -1.810712456703186, + "logits/rejected": -1.6843817234039307, + "logps/chosen": -139.18353271484375, + "logps/rejected": -281.2368469238281, + "loss": 0.0172, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.270883560180664, + "rewards/margins": 15.727676391601562, + "rewards/rejected": -20.998559951782227, + "step": 4210 + }, + { + "epoch": 1.93, + "learning_rate": 1.9890109890109888e-07, + "logits/chosen": -1.8806030750274658, + "logits/rejected": -1.7819697856903076, + "logps/chosen": -130.38177490234375, + "logps/rejected": -267.53607177734375, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8572211265563965, + "rewards/margins": 15.687589645385742, + "rewards/rejected": -19.544809341430664, + "step": 4220 + }, + { + "epoch": 1.93, + "learning_rate": 1.9805579036348266e-07, + "logits/chosen": -1.7186689376831055, + "logits/rejected": -1.5683701038360596, + "logps/chosen": -141.64805603027344, + "logps/rejected": -306.37347412109375, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.315321922302246, + "rewards/margins": 18.028160095214844, + "rewards/rejected": -23.343482971191406, + "step": 4230 + }, + { + "epoch": 1.94, + "learning_rate": 1.9721048182586644e-07, + "logits/chosen": -1.793931007385254, + "logits/rejected": -1.5868475437164307, + "logps/chosen": -127.1364974975586, + "logps/rejected": -309.6034240722656, + "loss": 0.0128, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.7159628868103027, + "rewards/margins": 20.01531982421875, + "rewards/rejected": -23.731281280517578, + "step": 4240 + }, + { + "epoch": 1.94, + "learning_rate": 1.963651732882502e-07, + "logits/chosen": -1.8754409551620483, + "logits/rejected": -1.7193734645843506, + "logps/chosen": -130.42030334472656, + "logps/rejected": -281.035400390625, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.504946231842041, + "rewards/margins": 16.956892013549805, + "rewards/rejected": -20.461841583251953, + "step": 4250 + }, + { + "epoch": 1.94, + "learning_rate": 1.9551986475063398e-07, + "logits/chosen": -1.868322730064392, + "logits/rejected": -1.758927583694458, + "logps/chosen": -137.1085968017578, + "logps/rejected": -269.8552551269531, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.547211647033691, + "rewards/margins": 14.988856315612793, + "rewards/rejected": -19.536067962646484, + "step": 4260 + }, + { + "epoch": 1.95, + "learning_rate": 1.9467455621301774e-07, + "logits/chosen": -1.849692702293396, + "logits/rejected": -1.6743018627166748, + "logps/chosen": -134.50845336914062, + "logps/rejected": -278.919677734375, + "loss": 0.0112, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.260858058929443, + "rewards/margins": 16.32442283630371, + "rewards/rejected": -20.585281372070312, + "step": 4270 + }, + { + "epoch": 1.95, + "learning_rate": 1.9382924767540152e-07, + "logits/chosen": -1.8525829315185547, + "logits/rejected": -1.713321328163147, + "logps/chosen": -135.14071655273438, + "logps/rejected": -287.2046813964844, + "loss": 0.0037, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.9108681678771973, + "rewards/margins": 17.29202651977539, + "rewards/rejected": -21.202892303466797, + "step": 4280 + }, + { + "epoch": 1.96, + "learning_rate": 1.929839391377853e-07, + "logits/chosen": -1.8243554830551147, + "logits/rejected": -1.714613914489746, + "logps/chosen": -139.86141967773438, + "logps/rejected": -265.8287658691406, + "loss": 0.0143, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.9251556396484375, + "rewards/margins": 14.7702054977417, + "rewards/rejected": -19.695362091064453, + "step": 4290 + }, + { + "epoch": 1.96, + "learning_rate": 1.9213863060016904e-07, + "logits/chosen": -1.7749605178833008, + "logits/rejected": -1.6101394891738892, + "logps/chosen": -127.2191162109375, + "logps/rejected": -277.84893798828125, + "loss": 0.0065, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.9290008544921875, + "rewards/margins": 16.82949447631836, + "rewards/rejected": -20.758495330810547, + "step": 4300 + }, + { + "epoch": 1.96, + "eval_logits/chosen": -1.74306321144104, + "eval_logits/rejected": -1.588865041732788, + "eval_logps/chosen": -135.57318115234375, + "eval_logps/rejected": -258.4649658203125, + "eval_loss": 0.020533427596092224, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -4.433597087860107, + "eval_rewards/margins": 14.402440071105957, + "eval_rewards/rejected": -18.836036682128906, + "eval_runtime": 48.6151, + "eval_samples_per_second": 58.871, + "eval_steps_per_second": 1.851, + "step": 4300 + }, + { + "epoch": 1.97, + "learning_rate": 1.9129332206255282e-07, + "logits/chosen": -1.7776132822036743, + "logits/rejected": -1.624101996421814, + "logps/chosen": -125.65428161621094, + "logps/rejected": -264.5531005859375, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.207595109939575, + "rewards/margins": 16.06104850769043, + "rewards/rejected": -19.26864242553711, + "step": 4310 + }, + { + "epoch": 1.97, + "learning_rate": 1.9044801352493658e-07, + "logits/chosen": -1.7687575817108154, + "logits/rejected": -1.5868823528289795, + "logps/chosen": -138.88534545898438, + "logps/rejected": -266.2568359375, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.900606632232666, + "rewards/margins": 15.659704208374023, + "rewards/rejected": -19.560312271118164, + "step": 4320 + }, + { + "epoch": 1.98, + "learning_rate": 1.8960270498732036e-07, + "logits/chosen": -1.6516854763031006, + "logits/rejected": -1.4852774143218994, + "logps/chosen": -138.44424438476562, + "logps/rejected": -276.5455322265625, + "loss": 0.0093, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.644902229309082, + "rewards/margins": 15.883813858032227, + "rewards/rejected": -20.528715133666992, + "step": 4330 + }, + { + "epoch": 1.98, + "learning_rate": 1.8875739644970412e-07, + "logits/chosen": -1.6602948904037476, + "logits/rejected": -1.497868299484253, + "logps/chosen": -138.5507354736328, + "logps/rejected": -286.0603942871094, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.921391487121582, + "rewards/margins": 16.544239044189453, + "rewards/rejected": -21.46563148498535, + "step": 4340 + }, + { + "epoch": 1.99, + "learning_rate": 1.879120879120879e-07, + "logits/chosen": -1.8231804370880127, + "logits/rejected": -1.7026121616363525, + "logps/chosen": -118.9980697631836, + "logps/rejected": -241.5241241455078, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.846381187438965, + "rewards/margins": 14.30328369140625, + "rewards/rejected": -17.149662017822266, + "step": 4350 + }, + { + "epoch": 1.99, + "learning_rate": 1.870667793744717e-07, + "logits/chosen": -1.856925368309021, + "logits/rejected": -1.760263442993164, + "logps/chosen": -126.03316497802734, + "logps/rejected": -230.318115234375, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0299785137176514, + "rewards/margins": 12.88720417022705, + "rewards/rejected": -15.917182922363281, + "step": 4360 + }, + { + "epoch": 1.99, + "learning_rate": 1.8622147083685544e-07, + "logits/chosen": -1.8362869024276733, + "logits/rejected": -1.7145744562149048, + "logps/chosen": -123.3587875366211, + "logps/rejected": -238.42489624023438, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1427197456359863, + "rewards/margins": 13.606206893920898, + "rewards/rejected": -16.74892807006836, + "step": 4370 + }, + { + "epoch": 2.0, + "learning_rate": 1.8537616229923923e-07, + "logits/chosen": -1.8129431009292603, + "logits/rejected": -1.6960035562515259, + "logps/chosen": -118.24918365478516, + "logps/rejected": -240.0215606689453, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.538170576095581, + "rewards/margins": 14.578079223632812, + "rewards/rejected": -17.116249084472656, + "step": 4380 + }, + { + "epoch": 2.0, + "learning_rate": 1.8453085376162298e-07, + "logits/chosen": -1.8822195529937744, + "logits/rejected": -1.7461490631103516, + "logps/chosen": -131.69345092773438, + "logps/rejected": -253.98416137695312, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.098832845687866, + "rewards/margins": 14.841659545898438, + "rewards/rejected": -17.940494537353516, + "step": 4390 + }, + { + "epoch": 2.01, + "learning_rate": 1.8368554522400674e-07, + "logits/chosen": -1.83819580078125, + "logits/rejected": -1.7222115993499756, + "logps/chosen": -127.88763427734375, + "logps/rejected": -243.81893920898438, + "loss": 0.0038, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.604341983795166, + "rewards/margins": 13.784266471862793, + "rewards/rejected": -17.388608932495117, + "step": 4400 + }, + { + "epoch": 2.01, + "eval_logits/chosen": -1.7572911977767944, + "eval_logits/rejected": -1.6138116121292114, + "eval_logps/chosen": -129.26815795898438, + "eval_logps/rejected": -244.04730224609375, + "eval_loss": 0.02128242887556553, + "eval_rewards/accuracies": 0.9833333492279053, + "eval_rewards/chosen": -3.803096294403076, + "eval_rewards/margins": 13.591176986694336, + "eval_rewards/rejected": -17.39427375793457, + "eval_runtime": 48.6945, + "eval_samples_per_second": 58.775, + "eval_steps_per_second": 1.848, + "step": 4400 + }, + { + "epoch": 2.01, + "learning_rate": 1.8284023668639053e-07, + "logits/chosen": -1.8123576641082764, + "logits/rejected": -1.6658599376678467, + "logps/chosen": -133.7542266845703, + "logps/rejected": -276.7223815917969, + "loss": 0.0066, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.7751827239990234, + "rewards/margins": 16.35031509399414, + "rewards/rejected": -20.125499725341797, + "step": 4410 + }, + { + "epoch": 2.02, + "learning_rate": 1.8199492814877428e-07, + "logits/chosen": -1.8025636672973633, + "logits/rejected": -1.6769447326660156, + "logps/chosen": -130.52369689941406, + "logps/rejected": -256.2034912109375, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9366908073425293, + "rewards/margins": 14.231219291687012, + "rewards/rejected": -18.167909622192383, + "step": 4420 + }, + { + "epoch": 2.02, + "learning_rate": 1.8114961961115807e-07, + "logits/chosen": -1.7440084218978882, + "logits/rejected": -1.6063076257705688, + "logps/chosen": -114.4801025390625, + "logps/rejected": -257.86651611328125, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5977623462677, + "rewards/margins": 15.983970642089844, + "rewards/rejected": -18.581729888916016, + "step": 4430 + }, + { + "epoch": 2.03, + "learning_rate": 1.8030431107354182e-07, + "logits/chosen": -1.795680284500122, + "logits/rejected": -1.661131501197815, + "logps/chosen": -127.7900619506836, + "logps/rejected": -260.941162109375, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.44555401802063, + "rewards/margins": 15.371556282043457, + "rewards/rejected": -18.817108154296875, + "step": 4440 + }, + { + "epoch": 2.03, + "learning_rate": 1.794590025359256e-07, + "logits/chosen": -1.7703081369400024, + "logits/rejected": -1.6057188510894775, + "logps/chosen": -125.89506530761719, + "logps/rejected": -273.585205078125, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.688451051712036, + "rewards/margins": 16.332950592041016, + "rewards/rejected": -20.02140235900879, + "step": 4450 + }, + { + "epoch": 2.04, + "learning_rate": 1.7861369399830936e-07, + "logits/chosen": -1.7694547176361084, + "logits/rejected": -1.6058471202850342, + "logps/chosen": -137.6060791015625, + "logps/rejected": -278.2977600097656, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.031754970550537, + "rewards/margins": 17.0496826171875, + "rewards/rejected": -21.081439971923828, + "step": 4460 + }, + { + "epoch": 2.04, + "learning_rate": 1.7776838546069315e-07, + "logits/chosen": -1.7135499715805054, + "logits/rejected": -1.5273189544677734, + "logps/chosen": -123.12062072753906, + "logps/rejected": -284.689208984375, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.752751588821411, + "rewards/margins": 17.560134887695312, + "rewards/rejected": -21.31288719177246, + "step": 4470 + }, + { + "epoch": 2.04, + "learning_rate": 1.7692307692307693e-07, + "logits/chosen": -1.724585771560669, + "logits/rejected": -1.5345683097839355, + "logps/chosen": -123.19392395019531, + "logps/rejected": -282.0025939941406, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.395287036895752, + "rewards/margins": 17.836999893188477, + "rewards/rejected": -21.23228645324707, + "step": 4480 + }, + { + "epoch": 2.05, + "learning_rate": 1.760777683854607e-07, + "logits/chosen": -1.7193084955215454, + "logits/rejected": -1.5445727109909058, + "logps/chosen": -131.0773468017578, + "logps/rejected": -282.26812744140625, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6276519298553467, + "rewards/margins": 17.360347747802734, + "rewards/rejected": -20.988000869750977, + "step": 4490 + }, + { + "epoch": 2.05, + "learning_rate": 1.7523245984784447e-07, + "logits/chosen": -1.7349720001220703, + "logits/rejected": -1.5807268619537354, + "logps/chosen": -132.5692596435547, + "logps/rejected": -283.40283203125, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.059118270874023, + "rewards/margins": 17.672191619873047, + "rewards/rejected": -21.73130989074707, + "step": 4500 + }, + { + "epoch": 2.05, + "eval_logits/chosen": -1.6392802000045776, + "eval_logits/rejected": -1.457102656364441, + "eval_logps/chosen": -139.0337371826172, + "eval_logps/rejected": -276.6232604980469, + "eval_loss": 0.020971935242414474, + "eval_rewards/accuracies": 0.9833333492279053, + "eval_rewards/chosen": -4.779654502868652, + "eval_rewards/margins": 15.872212409973145, + "eval_rewards/rejected": -20.651866912841797, + "eval_runtime": 49.0643, + "eval_samples_per_second": 58.332, + "eval_steps_per_second": 1.834, + "step": 4500 + }, + { + "epoch": 2.06, + "learning_rate": 1.743871513102282e-07, + "logits/chosen": -1.683262825012207, + "logits/rejected": -1.528511643409729, + "logps/chosen": -142.10989379882812, + "logps/rejected": -294.67877197265625, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.435738563537598, + "rewards/margins": 17.726200103759766, + "rewards/rejected": -22.161941528320312, + "step": 4510 + }, + { + "epoch": 2.06, + "learning_rate": 1.7354184277261199e-07, + "logits/chosen": -1.707883596420288, + "logits/rejected": -1.5181795358657837, + "logps/chosen": -138.83139038085938, + "logps/rejected": -298.7277526855469, + "loss": 0.0061, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.555205345153809, + "rewards/margins": 17.840625762939453, + "rewards/rejected": -22.395830154418945, + "step": 4520 + }, + { + "epoch": 2.07, + "learning_rate": 1.7269653423499577e-07, + "logits/chosen": -1.706210732460022, + "logits/rejected": -1.4901323318481445, + "logps/chosen": -141.46340942382812, + "logps/rejected": -306.2601623535156, + "loss": 0.0063, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.152948379516602, + "rewards/margins": 19.567630767822266, + "rewards/rejected": -23.720579147338867, + "step": 4530 + }, + { + "epoch": 2.07, + "learning_rate": 1.7185122569737953e-07, + "logits/chosen": -1.6828571557998657, + "logits/rejected": -1.473172903060913, + "logps/chosen": -137.48602294921875, + "logps/rejected": -301.6971740722656, + "loss": 0.0056, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.706709861755371, + "rewards/margins": 18.48426055908203, + "rewards/rejected": -23.190969467163086, + "step": 4540 + }, + { + "epoch": 2.08, + "learning_rate": 1.710059171597633e-07, + "logits/chosen": -1.6979420185089111, + "logits/rejected": -1.5147123336791992, + "logps/chosen": -126.75444030761719, + "logps/rejected": -299.8186340332031, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.0670061111450195, + "rewards/margins": 18.573415756225586, + "rewards/rejected": -22.640424728393555, + "step": 4550 + }, + { + "epoch": 2.08, + "learning_rate": 1.7016060862214707e-07, + "logits/chosen": -1.7006571292877197, + "logits/rejected": -1.5056321620941162, + "logps/chosen": -123.4179916381836, + "logps/rejected": -292.84478759765625, + "loss": 0.0079, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.3246817588806152, + "rewards/margins": 19.180789947509766, + "rewards/rejected": -22.50547218322754, + "step": 4560 + }, + { + "epoch": 2.09, + "learning_rate": 1.6931530008453085e-07, + "logits/chosen": -1.684525489807129, + "logits/rejected": -1.4895942211151123, + "logps/chosen": -134.53253173828125, + "logps/rejected": -290.8706359863281, + "loss": 0.0056, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.204358100891113, + "rewards/margins": 18.149574279785156, + "rewards/rejected": -22.353931427001953, + "step": 4570 + }, + { + "epoch": 2.09, + "learning_rate": 1.684699915469146e-07, + "logits/chosen": -1.6476118564605713, + "logits/rejected": -1.4635541439056396, + "logps/chosen": -141.87469482421875, + "logps/rejected": -301.1070251464844, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.55865478515625, + "rewards/margins": 18.13406753540039, + "rewards/rejected": -22.69272232055664, + "step": 4580 + }, + { + "epoch": 2.09, + "learning_rate": 1.676246830092984e-07, + "logits/chosen": -1.6086757183074951, + "logits/rejected": -1.4354054927825928, + "logps/chosen": -141.43751525878906, + "logps/rejected": -300.6250305175781, + "loss": 0.0025, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.488182544708252, + "rewards/margins": 17.32658576965332, + "rewards/rejected": -22.814769744873047, + "step": 4590 + }, + { + "epoch": 2.1, + "learning_rate": 1.6677937447168218e-07, + "logits/chosen": -1.6444076299667358, + "logits/rejected": -1.440915584564209, + "logps/chosen": -131.90924072265625, + "logps/rejected": -313.8179016113281, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.731848955154419, + "rewards/margins": 20.591495513916016, + "rewards/rejected": -24.32334327697754, + "step": 4600 + }, + { + "epoch": 2.1, + "eval_logits/chosen": -1.611390233039856, + "eval_logits/rejected": -1.4136455059051514, + "eval_logps/chosen": -144.26724243164062, + "eval_logps/rejected": -291.9064025878906, + "eval_loss": 0.022027108818292618, + "eval_rewards/accuracies": 0.9833333492279053, + "eval_rewards/chosen": -5.30300235748291, + "eval_rewards/margins": 16.877180099487305, + "eval_rewards/rejected": -22.18018341064453, + "eval_runtime": 48.9227, + "eval_samples_per_second": 58.5, + "eval_steps_per_second": 1.84, + "step": 4600 + }, + { + "epoch": 2.1, + "learning_rate": 1.6593406593406593e-07, + "logits/chosen": -1.66204035282135, + "logits/rejected": -1.4702497720718384, + "logps/chosen": -149.71023559570312, + "logps/rejected": -299.20025634765625, + "loss": 0.0038, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.900949001312256, + "rewards/margins": 16.8973388671875, + "rewards/rejected": -22.79828643798828, + "step": 4610 + }, + { + "epoch": 2.11, + "learning_rate": 1.650887573964497e-07, + "logits/chosen": -1.6408029794692993, + "logits/rejected": -1.4777195453643799, + "logps/chosen": -144.4774627685547, + "logps/rejected": -303.2586364746094, + "loss": 0.0024, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.714861869812012, + "rewards/margins": 17.49419593811035, + "rewards/rejected": -23.20905876159668, + "step": 4620 + }, + { + "epoch": 2.11, + "learning_rate": 1.6424344885883345e-07, + "logits/chosen": -1.6856443881988525, + "logits/rejected": -1.4800993204116821, + "logps/chosen": -140.2975311279297, + "logps/rejected": -325.49261474609375, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.508673667907715, + "rewards/margins": 21.040714263916016, + "rewards/rejected": -25.549386978149414, + "step": 4630 + }, + { + "epoch": 2.12, + "learning_rate": 1.6339814032121723e-07, + "logits/chosen": -1.6470167636871338, + "logits/rejected": -1.4312283992767334, + "logps/chosen": -144.17864990234375, + "logps/rejected": -332.71923828125, + "loss": 0.0047, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.611395835876465, + "rewards/margins": 20.437816619873047, + "rewards/rejected": -26.049213409423828, + "step": 4640 + }, + { + "epoch": 2.12, + "learning_rate": 1.6255283178360101e-07, + "logits/chosen": -1.6887588500976562, + "logits/rejected": -1.4585940837860107, + "logps/chosen": -145.76132202148438, + "logps/rejected": -341.21124267578125, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.22983455657959, + "rewards/margins": 21.57925796508789, + "rewards/rejected": -26.809091567993164, + "step": 4650 + }, + { + "epoch": 2.13, + "learning_rate": 1.6170752324598477e-07, + "logits/chosen": -1.6384716033935547, + "logits/rejected": -1.452253818511963, + "logps/chosen": -140.97543334960938, + "logps/rejected": -318.2596130371094, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.9640793800354, + "rewards/margins": 19.87236785888672, + "rewards/rejected": -24.836448669433594, + "step": 4660 + }, + { + "epoch": 2.13, + "learning_rate": 1.6086221470836856e-07, + "logits/chosen": -1.6505107879638672, + "logits/rejected": -1.484649658203125, + "logps/chosen": -153.21498107910156, + "logps/rejected": -319.28643798828125, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.582076072692871, + "rewards/margins": 19.079242706298828, + "rewards/rejected": -24.661317825317383, + "step": 4670 + }, + { + "epoch": 2.14, + "learning_rate": 1.600169061707523e-07, + "logits/chosen": -1.6539497375488281, + "logits/rejected": -1.4012236595153809, + "logps/chosen": -136.5476837158203, + "logps/rejected": -337.0199279785156, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.0985212326049805, + "rewards/margins": 22.486196517944336, + "rewards/rejected": -26.584720611572266, + "step": 4680 + }, + { + "epoch": 2.14, + "learning_rate": 1.591715976331361e-07, + "logits/chosen": -1.6659095287322998, + "logits/rejected": -1.4670120477676392, + "logps/chosen": -157.36988830566406, + "logps/rejected": -344.1231384277344, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.303195476531982, + "rewards/margins": 20.90297508239746, + "rewards/rejected": -27.2061710357666, + "step": 4690 + }, + { + "epoch": 2.15, + "learning_rate": 1.5832628909551985e-07, + "logits/chosen": -1.6164276599884033, + "logits/rejected": -1.414585828781128, + "logps/chosen": -165.75399780273438, + "logps/rejected": -359.6187438964844, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.301906585693359, + "rewards/margins": 20.94976043701172, + "rewards/rejected": -28.25166893005371, + "step": 4700 + }, + { + "epoch": 2.15, + "eval_logits/chosen": -1.5344696044921875, + "eval_logits/rejected": -1.310213327407837, + "eval_logps/chosen": -160.243408203125, + "eval_logps/rejected": -335.3616638183594, + "eval_loss": 0.023973895236849785, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -6.900619029998779, + "eval_rewards/margins": 19.625089645385742, + "eval_rewards/rejected": -26.525711059570312, + "eval_runtime": 48.6837, + "eval_samples_per_second": 58.788, + "eval_steps_per_second": 1.849, + "step": 4700 + }, + { + "epoch": 2.15, + "learning_rate": 1.5748098055790364e-07, + "logits/chosen": -1.5866243839263916, + "logits/rejected": -1.3655788898468018, + "logps/chosen": -151.45236206054688, + "logps/rejected": -345.5313720703125, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.042023181915283, + "rewards/margins": 21.149606704711914, + "rewards/rejected": -27.19162940979004, + "step": 4710 + }, + { + "epoch": 2.15, + "learning_rate": 1.5663567202028742e-07, + "logits/chosen": -1.568950891494751, + "logits/rejected": -1.3398112058639526, + "logps/chosen": -144.99575805664062, + "logps/rejected": -346.5789794921875, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.015862464904785, + "rewards/margins": 22.201229095458984, + "rewards/rejected": -27.217090606689453, + "step": 4720 + }, + { + "epoch": 2.16, + "learning_rate": 1.5579036348267115e-07, + "logits/chosen": -1.6242564916610718, + "logits/rejected": -1.4060070514678955, + "logps/chosen": -163.28323364257812, + "logps/rejected": -360.0334777832031, + "loss": 0.0067, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.851768493652344, + "rewards/margins": 21.580278396606445, + "rewards/rejected": -28.43204689025879, + "step": 4730 + }, + { + "epoch": 2.16, + "learning_rate": 1.5494505494505493e-07, + "logits/chosen": -1.5484068393707275, + "logits/rejected": -1.3234050273895264, + "logps/chosen": -156.7251434326172, + "logps/rejected": -364.34539794921875, + "loss": 0.0024, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.725734710693359, + "rewards/margins": 22.758655548095703, + "rewards/rejected": -29.484386444091797, + "step": 4740 + }, + { + "epoch": 2.17, + "learning_rate": 1.540997464074387e-07, + "logits/chosen": -1.5197080373764038, + "logits/rejected": -1.2975013256072998, + "logps/chosen": -163.72894287109375, + "logps/rejected": -360.9460754394531, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.5403289794921875, + "rewards/margins": 22.118616104125977, + "rewards/rejected": -28.658945083618164, + "step": 4750 + }, + { + "epoch": 2.17, + "learning_rate": 1.5325443786982248e-07, + "logits/chosen": -1.5585333108901978, + "logits/rejected": -1.3211743831634521, + "logps/chosen": -158.20669555664062, + "logps/rejected": -374.6647033691406, + "loss": 0.0015, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.051785945892334, + "rewards/margins": 24.1824893951416, + "rewards/rejected": -30.234272003173828, + "step": 4760 + }, + { + "epoch": 2.18, + "learning_rate": 1.5240912933220626e-07, + "logits/chosen": -1.5456629991531372, + "logits/rejected": -1.304001808166504, + "logps/chosen": -154.15481567382812, + "logps/rejected": -383.1095886230469, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.04030179977417, + "rewards/margins": 24.318885803222656, + "rewards/rejected": -30.35919189453125, + "step": 4770 + }, + { + "epoch": 2.18, + "learning_rate": 1.5156382079459002e-07, + "logits/chosen": -1.554931402206421, + "logits/rejected": -1.3219283819198608, + "logps/chosen": -156.97434997558594, + "logps/rejected": -355.1632995605469, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.934172630310059, + "rewards/margins": 22.271665573120117, + "rewards/rejected": -28.205841064453125, + "step": 4780 + }, + { + "epoch": 2.19, + "learning_rate": 1.507185122569738e-07, + "logits/chosen": -1.5992720127105713, + "logits/rejected": -1.357313871383667, + "logps/chosen": -159.3846893310547, + "logps/rejected": -356.4526062011719, + "loss": 0.0048, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.093033790588379, + "rewards/margins": 22.349348068237305, + "rewards/rejected": -28.4423828125, + "step": 4790 + }, + { + "epoch": 2.19, + "learning_rate": 1.4987320371935756e-07, + "logits/chosen": -1.603276014328003, + "logits/rejected": -1.3721181154251099, + "logps/chosen": -150.90286254882812, + "logps/rejected": -358.8915100097656, + "loss": 0.0075, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.906771183013916, + "rewards/margins": 22.509685516357422, + "rewards/rejected": -28.416454315185547, + "step": 4800 + }, + { + "epoch": 2.19, + "eval_logits/chosen": -1.4933305978775024, + "eval_logits/rejected": -1.2593837976455688, + "eval_logps/chosen": -157.8497314453125, + "eval_logps/rejected": -338.1903076171875, + "eval_loss": 0.025256937369704247, + "eval_rewards/accuracies": 0.9777777791023254, + "eval_rewards/chosen": -6.661252975463867, + "eval_rewards/margins": 20.14731788635254, + "eval_rewards/rejected": -26.80857276916504, + "eval_runtime": 48.1166, + "eval_samples_per_second": 59.481, + "eval_steps_per_second": 1.87, + "step": 4800 + }, + { + "epoch": 2.2, + "learning_rate": 1.4902789518174134e-07, + "logits/chosen": -1.5837562084197998, + "logits/rejected": -1.3809980154037476, + "logps/chosen": -147.4069366455078, + "logps/rejected": -335.81890869140625, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.58919620513916, + "rewards/margins": 20.910274505615234, + "rewards/rejected": -26.49947166442871, + "step": 4810 + }, + { + "epoch": 2.2, + "learning_rate": 1.4818258664412512e-07, + "logits/chosen": -1.6084327697753906, + "logits/rejected": -1.3756760358810425, + "logps/chosen": -147.60806274414062, + "logps/rejected": -345.25799560546875, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.7579145431518555, + "rewards/margins": 21.470531463623047, + "rewards/rejected": -27.22844886779785, + "step": 4820 + }, + { + "epoch": 2.2, + "learning_rate": 1.4733727810650885e-07, + "logits/chosen": -1.6319348812103271, + "logits/rejected": -1.444595456123352, + "logps/chosen": -136.5446014404297, + "logps/rejected": -318.79443359375, + "loss": 0.0028, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.821673393249512, + "rewards/margins": 20.090484619140625, + "rewards/rejected": -24.91215705871582, + "step": 4830 + }, + { + "epoch": 2.21, + "learning_rate": 1.4649196956889264e-07, + "logits/chosen": -1.6026424169540405, + "logits/rejected": -1.3949694633483887, + "logps/chosen": -138.6693572998047, + "logps/rejected": -329.12579345703125, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.085145473480225, + "rewards/margins": 20.640195846557617, + "rewards/rejected": -25.725341796875, + "step": 4840 + }, + { + "epoch": 2.21, + "learning_rate": 1.456466610312764e-07, + "logits/chosen": -1.6504093408584595, + "logits/rejected": -1.434548020362854, + "logps/chosen": -141.36990356445312, + "logps/rejected": -325.2200622558594, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.669219017028809, + "rewards/margins": 20.838899612426758, + "rewards/rejected": -25.50812339782715, + "step": 4850 + }, + { + "epoch": 2.22, + "learning_rate": 1.4480135249366018e-07, + "logits/chosen": -1.621881127357483, + "logits/rejected": -1.3952808380126953, + "logps/chosen": -139.6595458984375, + "logps/rejected": -318.80841064453125, + "loss": 0.0014, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.290844917297363, + "rewards/margins": 20.612688064575195, + "rewards/rejected": -24.903533935546875, + "step": 4860 + }, + { + "epoch": 2.22, + "learning_rate": 1.4395604395604394e-07, + "logits/chosen": -1.5858741998672485, + "logits/rejected": -1.3580384254455566, + "logps/chosen": -136.5399627685547, + "logps/rejected": -336.61773681640625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.7228240966796875, + "rewards/margins": 21.94614601135254, + "rewards/rejected": -26.668970108032227, + "step": 4870 + }, + { + "epoch": 2.23, + "learning_rate": 1.4311073541842772e-07, + "logits/chosen": -1.5768780708312988, + "logits/rejected": -1.3338980674743652, + "logps/chosen": -148.2299346923828, + "logps/rejected": -345.2416076660156, + "loss": 0.0025, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.761619567871094, + "rewards/margins": 21.782390594482422, + "rewards/rejected": -27.54401206970215, + "step": 4880 + }, + { + "epoch": 2.23, + "learning_rate": 1.422654268808115e-07, + "logits/chosen": -1.598933458328247, + "logits/rejected": -1.406333088874817, + "logps/chosen": -157.4684295654297, + "logps/rejected": -347.3194274902344, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.242190361022949, + "rewards/margins": 21.24979591369629, + "rewards/rejected": -27.49198341369629, + "step": 4890 + }, + { + "epoch": 2.24, + "learning_rate": 1.4142011834319526e-07, + "logits/chosen": -1.6200568675994873, + "logits/rejected": -1.3976492881774902, + "logps/chosen": -132.42913818359375, + "logps/rejected": -337.7710876464844, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.944077968597412, + "rewards/margins": 22.762954711914062, + "rewards/rejected": -26.70703125, + "step": 4900 + }, + { + "epoch": 2.24, + "eval_logits/chosen": -1.531797170639038, + "eval_logits/rejected": -1.3038618564605713, + "eval_logps/chosen": -150.62075805664062, + "eval_logps/rejected": -323.0674133300781, + "eval_loss": 0.023812316358089447, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -5.938355922698975, + "eval_rewards/margins": 19.35792350769043, + "eval_rewards/rejected": -25.296281814575195, + "eval_runtime": 48.8333, + "eval_samples_per_second": 58.608, + "eval_steps_per_second": 1.843, + "step": 4900 + }, + { + "epoch": 2.24, + "learning_rate": 1.4057480980557904e-07, + "logits/chosen": -1.5462377071380615, + "logits/rejected": -1.3417539596557617, + "logps/chosen": -150.1999969482422, + "logps/rejected": -328.15264892578125, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.9226484298706055, + "rewards/margins": 19.603456497192383, + "rewards/rejected": -25.526103973388672, + "step": 4910 + }, + { + "epoch": 2.25, + "learning_rate": 1.397295012679628e-07, + "logits/chosen": -1.648712158203125, + "logits/rejected": -1.4413989782333374, + "logps/chosen": -148.29783630371094, + "logps/rejected": -334.0263977050781, + "loss": 0.0073, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.927051544189453, + "rewards/margins": 21.278085708618164, + "rewards/rejected": -26.205135345458984, + "step": 4920 + }, + { + "epoch": 2.25, + "learning_rate": 1.3888419273034658e-07, + "logits/chosen": -1.710524320602417, + "logits/rejected": -1.5135892629623413, + "logps/chosen": -137.21377563476562, + "logps/rejected": -329.8485412597656, + "loss": 0.0036, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.232450008392334, + "rewards/margins": 21.307376861572266, + "rewards/rejected": -25.539825439453125, + "step": 4930 + }, + { + "epoch": 2.25, + "learning_rate": 1.3803888419273034e-07, + "logits/chosen": -1.6205447912216187, + "logits/rejected": -1.3998703956604004, + "logps/chosen": -141.4749755859375, + "logps/rejected": -336.75555419921875, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.847267150878906, + "rewards/margins": 21.587642669677734, + "rewards/rejected": -26.434911727905273, + "step": 4940 + }, + { + "epoch": 2.26, + "learning_rate": 1.371935756551141e-07, + "logits/chosen": -1.5820177793502808, + "logits/rejected": -1.3905309438705444, + "logps/chosen": -152.05067443847656, + "logps/rejected": -321.4071960449219, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.205025672912598, + "rewards/margins": 18.604568481445312, + "rewards/rejected": -24.80959701538086, + "step": 4950 + }, + { + "epoch": 2.26, + "learning_rate": 1.3634826711749788e-07, + "logits/chosen": -1.5784461498260498, + "logits/rejected": -1.381110429763794, + "logps/chosen": -140.77227783203125, + "logps/rejected": -335.337158203125, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.567250728607178, + "rewards/margins": 21.762025833129883, + "rewards/rejected": -26.329275131225586, + "step": 4960 + }, + { + "epoch": 2.27, + "learning_rate": 1.3550295857988164e-07, + "logits/chosen": -1.5645751953125, + "logits/rejected": -1.3427592515945435, + "logps/chosen": -148.5469207763672, + "logps/rejected": -318.39874267578125, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.7441606521606445, + "rewards/margins": 18.878780364990234, + "rewards/rejected": -24.62293815612793, + "step": 4970 + }, + { + "epoch": 2.27, + "learning_rate": 1.3465765004226542e-07, + "logits/chosen": -1.573047161102295, + "logits/rejected": -1.3629062175750732, + "logps/chosen": -141.57052612304688, + "logps/rejected": -312.4440002441406, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.663500785827637, + "rewards/margins": 19.396190643310547, + "rewards/rejected": -24.059690475463867, + "step": 4980 + }, + { + "epoch": 2.28, + "learning_rate": 1.3381234150464918e-07, + "logits/chosen": -1.5677963495254517, + "logits/rejected": -1.3477671146392822, + "logps/chosen": -147.5939483642578, + "logps/rejected": -329.4576416015625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.594753742218018, + "rewards/margins": 19.6628360748291, + "rewards/rejected": -25.257587432861328, + "step": 4990 + }, + { + "epoch": 2.28, + "learning_rate": 1.3296703296703296e-07, + "logits/chosen": -1.5639418363571167, + "logits/rejected": -1.344002604484558, + "logps/chosen": -148.0439910888672, + "logps/rejected": -328.4283752441406, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.538041591644287, + "rewards/margins": 19.774883270263672, + "rewards/rejected": -25.312923431396484, + "step": 5000 + }, + { + "epoch": 2.28, + "eval_logits/chosen": -1.5276734828948975, + "eval_logits/rejected": -1.3104116916656494, + "eval_logps/chosen": -148.38333129882812, + "eval_logps/rejected": -302.6257019042969, + "eval_loss": 0.021709125488996506, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -5.71461296081543, + "eval_rewards/margins": 17.53749656677246, + "eval_rewards/rejected": -23.25210952758789, + "eval_runtime": 48.0344, + "eval_samples_per_second": 59.582, + "eval_steps_per_second": 1.874, + "step": 5000 + }, + { + "epoch": 2.29, + "learning_rate": 1.3212172442941675e-07, + "logits/chosen": -1.5523064136505127, + "logits/rejected": -1.3519313335418701, + "logps/chosen": -147.11253356933594, + "logps/rejected": -307.6585693359375, + "loss": 0.0036, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.264976501464844, + "rewards/margins": 18.00934600830078, + "rewards/rejected": -23.27431869506836, + "step": 5010 + }, + { + "epoch": 2.29, + "learning_rate": 1.312764158918005e-07, + "logits/chosen": -1.6357170343399048, + "logits/rejected": -1.4496716260910034, + "logps/chosen": -143.47427368164062, + "logps/rejected": -306.81475830078125, + "loss": 0.0067, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.771214485168457, + "rewards/margins": 19.028942108154297, + "rewards/rejected": -23.800159454345703, + "step": 5020 + }, + { + "epoch": 2.3, + "learning_rate": 1.304311073541843e-07, + "logits/chosen": -1.6925318241119385, + "logits/rejected": -1.476401925086975, + "logps/chosen": -133.5230255126953, + "logps/rejected": -321.890869140625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.030728340148926, + "rewards/margins": 21.044105529785156, + "rewards/rejected": -25.0748291015625, + "step": 5030 + }, + { + "epoch": 2.3, + "learning_rate": 1.2958579881656802e-07, + "logits/chosen": -1.63198983669281, + "logits/rejected": -1.4392688274383545, + "logps/chosen": -147.86672973632812, + "logps/rejected": -322.14739990234375, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.490933418273926, + "rewards/margins": 19.4083194732666, + "rewards/rejected": -24.899255752563477, + "step": 5040 + }, + { + "epoch": 2.3, + "learning_rate": 1.287404902789518e-07, + "logits/chosen": -1.5643590688705444, + "logits/rejected": -1.3293625116348267, + "logps/chosen": -137.88485717773438, + "logps/rejected": -315.028564453125, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.360368251800537, + "rewards/margins": 19.42208480834961, + "rewards/rejected": -24.782455444335938, + "step": 5050 + }, + { + "epoch": 2.31, + "learning_rate": 1.2789518174133559e-07, + "logits/chosen": -1.5989701747894287, + "logits/rejected": -1.4192179441452026, + "logps/chosen": -151.6392364501953, + "logps/rejected": -300.0302734375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.988175392150879, + "rewards/margins": 17.0905704498291, + "rewards/rejected": -23.078744888305664, + "step": 5060 + }, + { + "epoch": 2.31, + "learning_rate": 1.2704987320371934e-07, + "logits/chosen": -1.6071436405181885, + "logits/rejected": -1.3919312953948975, + "logps/chosen": -153.93104553222656, + "logps/rejected": -318.19659423828125, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.994771957397461, + "rewards/margins": 18.597782135009766, + "rewards/rejected": -24.592554092407227, + "step": 5070 + }, + { + "epoch": 2.32, + "learning_rate": 1.2620456466610313e-07, + "logits/chosen": -1.5697176456451416, + "logits/rejected": -1.3543591499328613, + "logps/chosen": -148.27279663085938, + "logps/rejected": -330.57574462890625, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.1298627853393555, + "rewards/margins": 20.94051742553711, + "rewards/rejected": -26.070384979248047, + "step": 5080 + }, + { + "epoch": 2.32, + "learning_rate": 1.2535925612848688e-07, + "logits/chosen": -1.525294542312622, + "logits/rejected": -1.2792097330093384, + "logps/chosen": -144.99293518066406, + "logps/rejected": -362.4373474121094, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.554945945739746, + "rewards/margins": 23.6368350982666, + "rewards/rejected": -29.191781997680664, + "step": 5090 + }, + { + "epoch": 2.33, + "learning_rate": 1.2451394759087067e-07, + "logits/chosen": -1.530333399772644, + "logits/rejected": -1.2804116010665894, + "logps/chosen": -157.20462036132812, + "logps/rejected": -360.80126953125, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.6529669761657715, + "rewards/margins": 22.147747039794922, + "rewards/rejected": -28.800708770751953, + "step": 5100 + }, + { + "epoch": 2.33, + "eval_logits/chosen": -1.4798411130905151, + "eval_logits/rejected": -1.236836552619934, + "eval_logps/chosen": -159.94859313964844, + "eval_logps/rejected": -338.69610595703125, + "eval_loss": 0.023448189720511436, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -6.871140003204346, + "eval_rewards/margins": 19.988006591796875, + "eval_rewards/rejected": -26.859148025512695, + "eval_runtime": 48.5272, + "eval_samples_per_second": 58.977, + "eval_steps_per_second": 1.855, + "step": 5100 + }, + { + "epoch": 2.33, + "learning_rate": 1.2366863905325443e-07, + "logits/chosen": -1.570920705795288, + "logits/rejected": -1.345963954925537, + "logps/chosen": -157.03396606445312, + "logps/rejected": -333.129150390625, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.871922969818115, + "rewards/margins": 20.749706268310547, + "rewards/rejected": -26.621631622314453, + "step": 5110 + }, + { + "epoch": 2.34, + "learning_rate": 1.228233305156382e-07, + "logits/chosen": -1.5429118871688843, + "logits/rejected": -1.3001679182052612, + "logps/chosen": -141.0686492919922, + "logps/rejected": -349.8362731933594, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.485856056213379, + "rewards/margins": 22.739933013916016, + "rewards/rejected": -28.22579002380371, + "step": 5120 + }, + { + "epoch": 2.34, + "learning_rate": 1.2197802197802197e-07, + "logits/chosen": -1.5656944513320923, + "logits/rejected": -1.3424060344696045, + "logps/chosen": -164.17990112304688, + "logps/rejected": -354.5301208496094, + "loss": 0.0036, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.938088893890381, + "rewards/margins": 21.791156768798828, + "rewards/rejected": -28.7292423248291, + "step": 5130 + }, + { + "epoch": 2.35, + "learning_rate": 1.2113271344040575e-07, + "logits/chosen": -1.5167311429977417, + "logits/rejected": -1.2887309789657593, + "logps/chosen": -162.03640747070312, + "logps/rejected": -361.96142578125, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.25159215927124, + "rewards/margins": 22.400646209716797, + "rewards/rejected": -28.652240753173828, + "step": 5140 + }, + { + "epoch": 2.35, + "learning_rate": 1.202874049027895e-07, + "logits/chosen": -1.601619005203247, + "logits/rejected": -1.3553307056427002, + "logps/chosen": -149.4628448486328, + "logps/rejected": -330.8864440917969, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.38382625579834, + "rewards/margins": 20.474346160888672, + "rewards/rejected": -25.85817527770996, + "step": 5150 + }, + { + "epoch": 2.36, + "learning_rate": 1.194420963651733e-07, + "logits/chosen": -1.597741723060608, + "logits/rejected": -1.4130442142486572, + "logps/chosen": -158.59152221679688, + "logps/rejected": -321.4584045410156, + "loss": 0.0047, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.9201979637146, + "rewards/margins": 18.02041244506836, + "rewards/rejected": -24.940608978271484, + "step": 5160 + }, + { + "epoch": 2.36, + "learning_rate": 1.1859678782755706e-07, + "logits/chosen": -1.563659906387329, + "logits/rejected": -1.3519216775894165, + "logps/chosen": -145.5685272216797, + "logps/rejected": -319.1047668457031, + "loss": 0.0045, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.211328029632568, + "rewards/margins": 19.444351196289062, + "rewards/rejected": -24.655681610107422, + "step": 5170 + }, + { + "epoch": 2.36, + "learning_rate": 1.1775147928994082e-07, + "logits/chosen": -1.58176589012146, + "logits/rejected": -1.3772716522216797, + "logps/chosen": -152.3541259765625, + "logps/rejected": -326.29095458984375, + "loss": 0.0024, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.837064743041992, + "rewards/margins": 19.613500595092773, + "rewards/rejected": -25.450565338134766, + "step": 5180 + }, + { + "epoch": 2.37, + "learning_rate": 1.1690617075232459e-07, + "logits/chosen": -1.5791524648666382, + "logits/rejected": -1.3527902364730835, + "logps/chosen": -157.03770446777344, + "logps/rejected": -346.9322204589844, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.26717472076416, + "rewards/margins": 21.00448226928711, + "rewards/rejected": -27.271657943725586, + "step": 5190 + }, + { + "epoch": 2.37, + "learning_rate": 1.1606086221470836e-07, + "logits/chosen": -1.599036455154419, + "logits/rejected": -1.371401309967041, + "logps/chosen": -143.78948974609375, + "logps/rejected": -343.8502502441406, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.401170253753662, + "rewards/margins": 22.104997634887695, + "rewards/rejected": -27.506168365478516, + "step": 5200 + }, + { + "epoch": 2.37, + "eval_logits/chosen": -1.5095221996307373, + "eval_logits/rejected": -1.2791478633880615, + "eval_logps/chosen": -155.79994201660156, + "eval_logps/rejected": -324.2215270996094, + "eval_loss": 0.023295849561691284, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -6.456273555755615, + "eval_rewards/margins": 18.955419540405273, + "eval_rewards/rejected": -25.411693572998047, + "eval_runtime": 49.6179, + "eval_samples_per_second": 57.681, + "eval_steps_per_second": 1.814, + "step": 5200 + }, + { + "epoch": 2.38, + "learning_rate": 1.1521555367709214e-07, + "logits/chosen": -1.4285168647766113, + "logits/rejected": -1.1557750701904297, + "logps/chosen": -154.73887634277344, + "logps/rejected": -342.5699157714844, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.326033592224121, + "rewards/margins": 20.766834259033203, + "rewards/rejected": -27.09286880493164, + "step": 5210 + }, + { + "epoch": 2.38, + "learning_rate": 1.1437024513947591e-07, + "logits/chosen": -1.3615461587905884, + "logits/rejected": -1.1402201652526855, + "logps/chosen": -144.0945587158203, + "logps/rejected": -319.58355712890625, + "loss": 0.0029, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.830936431884766, + "rewards/margins": 18.93877601623535, + "rewards/rejected": -24.769710540771484, + "step": 5220 + }, + { + "epoch": 2.39, + "learning_rate": 1.1352493660185967e-07, + "logits/chosen": -1.5037003755569458, + "logits/rejected": -1.2450432777404785, + "logps/chosen": -141.416015625, + "logps/rejected": -322.6919860839844, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.933599948883057, + "rewards/margins": 20.433279037475586, + "rewards/rejected": -25.366878509521484, + "step": 5230 + }, + { + "epoch": 2.39, + "learning_rate": 1.1267962806424344e-07, + "logits/chosen": -1.6027101278305054, + "logits/rejected": -1.3594125509262085, + "logps/chosen": -143.9319610595703, + "logps/rejected": -300.5264587402344, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.489424705505371, + "rewards/margins": 18.510822296142578, + "rewards/rejected": -23.000247955322266, + "step": 5240 + }, + { + "epoch": 2.4, + "learning_rate": 1.1183431952662721e-07, + "logits/chosen": -1.5492119789123535, + "logits/rejected": -1.3233853578567505, + "logps/chosen": -142.66136169433594, + "logps/rejected": -319.67828369140625, + "loss": 0.005, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.087625980377197, + "rewards/margins": 19.76156234741211, + "rewards/rejected": -24.849185943603516, + "step": 5250 + }, + { + "epoch": 2.4, + "learning_rate": 1.1098901098901098e-07, + "logits/chosen": -1.5399987697601318, + "logits/rejected": -1.2904198169708252, + "logps/chosen": -142.1669464111328, + "logps/rejected": -317.7565612792969, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.012362003326416, + "rewards/margins": 19.859041213989258, + "rewards/rejected": -24.871402740478516, + "step": 5260 + }, + { + "epoch": 2.41, + "learning_rate": 1.1014370245139476e-07, + "logits/chosen": -1.5936027765274048, + "logits/rejected": -1.3584023714065552, + "logps/chosen": -147.6734619140625, + "logps/rejected": -336.40167236328125, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.050702095031738, + "rewards/margins": 20.767940521240234, + "rewards/rejected": -25.81864356994629, + "step": 5270 + }, + { + "epoch": 2.41, + "learning_rate": 1.0929839391377852e-07, + "logits/chosen": -1.5691344738006592, + "logits/rejected": -1.3617496490478516, + "logps/chosen": -140.85934448242188, + "logps/rejected": -320.1832580566406, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.1365251541137695, + "rewards/margins": 19.71951675415039, + "rewards/rejected": -24.856042861938477, + "step": 5280 + }, + { + "epoch": 2.41, + "learning_rate": 1.0845308537616229e-07, + "logits/chosen": -1.5475467443466187, + "logits/rejected": -1.3007447719573975, + "logps/chosen": -139.46876525878906, + "logps/rejected": -312.297607421875, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.21682071685791, + "rewards/margins": 20.048919677734375, + "rewards/rejected": -24.26573944091797, + "step": 5290 + }, + { + "epoch": 2.42, + "learning_rate": 1.0760777683854606e-07, + "logits/chosen": -1.5608899593353271, + "logits/rejected": -1.3258769512176514, + "logps/chosen": -141.82247924804688, + "logps/rejected": -311.65533447265625, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.637762546539307, + "rewards/margins": 19.574703216552734, + "rewards/rejected": -24.212467193603516, + "step": 5300 + }, + { + "epoch": 2.42, + "eval_logits/chosen": -1.4908230304718018, + "eval_logits/rejected": -1.2663732767105103, + "eval_logps/chosen": -151.76470947265625, + "eval_logps/rejected": -307.0641784667969, + "eval_loss": 0.022586598992347717, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -6.0527520179748535, + "eval_rewards/margins": 17.64320945739746, + "eval_rewards/rejected": -23.69596290588379, + "eval_runtime": 48.3485, + "eval_samples_per_second": 59.195, + "eval_steps_per_second": 1.861, + "step": 5300 + }, + { + "epoch": 2.42, + "learning_rate": 1.0676246830092983e-07, + "logits/chosen": -1.626143217086792, + "logits/rejected": -1.4086579084396362, + "logps/chosen": -145.34234619140625, + "logps/rejected": -313.7509765625, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.302502632141113, + "rewards/margins": 18.770069122314453, + "rewards/rejected": -24.07257080078125, + "step": 5310 + }, + { + "epoch": 2.43, + "learning_rate": 1.059171597633136e-07, + "logits/chosen": -1.6133781671524048, + "logits/rejected": -1.3685221672058105, + "logps/chosen": -137.14315795898438, + "logps/rejected": -323.71917724609375, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.440985679626465, + "rewards/margins": 20.653667449951172, + "rewards/rejected": -25.094654083251953, + "step": 5320 + }, + { + "epoch": 2.43, + "learning_rate": 1.0507185122569739e-07, + "logits/chosen": -1.6073236465454102, + "logits/rejected": -1.4107120037078857, + "logps/chosen": -142.0266571044922, + "logps/rejected": -305.3078308105469, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.942056655883789, + "rewards/margins": 18.439678192138672, + "rewards/rejected": -23.381732940673828, + "step": 5330 + }, + { + "epoch": 2.44, + "learning_rate": 1.0422654268808114e-07, + "logits/chosen": -1.6311960220336914, + "logits/rejected": -1.3799140453338623, + "logps/chosen": -144.93405151367188, + "logps/rejected": -332.131103515625, + "loss": 0.0036, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.900788307189941, + "rewards/margins": 20.720251083374023, + "rewards/rejected": -25.621042251586914, + "step": 5340 + }, + { + "epoch": 2.44, + "learning_rate": 1.0338123415046491e-07, + "logits/chosen": -1.599922776222229, + "logits/rejected": -1.3647892475128174, + "logps/chosen": -144.65731811523438, + "logps/rejected": -324.754638671875, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.055871486663818, + "rewards/margins": 19.931934356689453, + "rewards/rejected": -24.987808227539062, + "step": 5350 + }, + { + "epoch": 2.45, + "learning_rate": 1.0253592561284868e-07, + "logits/chosen": -1.5935919284820557, + "logits/rejected": -1.3738867044448853, + "logps/chosen": -143.09603881835938, + "logps/rejected": -315.74591064453125, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.438485145568848, + "rewards/margins": 19.96566390991211, + "rewards/rejected": -24.40414810180664, + "step": 5360 + }, + { + "epoch": 2.45, + "learning_rate": 1.0169061707523245e-07, + "logits/chosen": -1.609093427658081, + "logits/rejected": -1.3772783279418945, + "logps/chosen": -140.55917358398438, + "logps/rejected": -324.67547607421875, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.489682197570801, + "rewards/margins": 20.565767288208008, + "rewards/rejected": -25.055450439453125, + "step": 5370 + }, + { + "epoch": 2.46, + "learning_rate": 1.0084530853761623e-07, + "logits/chosen": -1.5920069217681885, + "logits/rejected": -1.364201545715332, + "logps/chosen": -141.46438598632812, + "logps/rejected": -338.9762268066406, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.8665313720703125, + "rewards/margins": 21.738967895507812, + "rewards/rejected": -26.605499267578125, + "step": 5380 + }, + { + "epoch": 2.46, + "learning_rate": 1e-07, + "logits/chosen": -1.5002410411834717, + "logits/rejected": -1.2806379795074463, + "logps/chosen": -155.71710205078125, + "logps/rejected": -321.7295837402344, + "loss": 0.0058, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.903067588806152, + "rewards/margins": 18.76999855041504, + "rewards/rejected": -24.673063278198242, + "step": 5390 + }, + { + "epoch": 2.46, + "learning_rate": 9.915469146238377e-08, + "logits/chosen": -1.5670658349990845, + "logits/rejected": -1.3476722240447998, + "logps/chosen": -156.94216918945312, + "logps/rejected": -331.74029541015625, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.655655860900879, + "rewards/margins": 19.565311431884766, + "rewards/rejected": -26.220966339111328, + "step": 5400 + }, + { + "epoch": 2.46, + "eval_logits/chosen": -1.478770136833191, + "eval_logits/rejected": -1.2360926866531372, + "eval_logps/chosen": -157.89659118652344, + "eval_logps/rejected": -326.7684326171875, + "eval_loss": 0.02334408089518547, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -6.665939807891846, + "eval_rewards/margins": 19.000446319580078, + "eval_rewards/rejected": -25.666383743286133, + "eval_runtime": 48.9157, + "eval_samples_per_second": 58.509, + "eval_steps_per_second": 1.84, + "step": 5400 + }, + { + "epoch": 2.47, + "learning_rate": 9.830938292476754e-08, + "logits/chosen": -1.5275108814239502, + "logits/rejected": -1.2839040756225586, + "logps/chosen": -142.76083374023438, + "logps/rejected": -345.90472412109375, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.783443450927734, + "rewards/margins": 22.5191650390625, + "rewards/rejected": -27.302608489990234, + "step": 5410 + }, + { + "epoch": 2.47, + "learning_rate": 9.74640743871513e-08, + "logits/chosen": -1.548032522201538, + "logits/rejected": -1.3077924251556396, + "logps/chosen": -156.00958251953125, + "logps/rejected": -349.4509582519531, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.358696460723877, + "rewards/margins": 21.26180648803711, + "rewards/rejected": -27.62050437927246, + "step": 5420 + }, + { + "epoch": 2.48, + "learning_rate": 9.661876584953508e-08, + "logits/chosen": -1.5474439859390259, + "logits/rejected": -1.3252089023590088, + "logps/chosen": -151.78616333007812, + "logps/rejected": -343.47808837890625, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.073981285095215, + "rewards/margins": 21.182497024536133, + "rewards/rejected": -27.2564754486084, + "step": 5430 + }, + { + "epoch": 2.48, + "learning_rate": 9.577345731191883e-08, + "logits/chosen": -1.5336410999298096, + "logits/rejected": -1.2730783224105835, + "logps/chosen": -150.72671508789062, + "logps/rejected": -336.71807861328125, + "loss": 0.0012, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.051814556121826, + "rewards/margins": 20.846153259277344, + "rewards/rejected": -26.89797019958496, + "step": 5440 + }, + { + "epoch": 2.49, + "learning_rate": 9.492814877430262e-08, + "logits/chosen": -1.5212593078613281, + "logits/rejected": -1.2231850624084473, + "logps/chosen": -145.03448486328125, + "logps/rejected": -345.9190368652344, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.964097023010254, + "rewards/margins": 22.741249084472656, + "rewards/rejected": -27.705347061157227, + "step": 5450 + }, + { + "epoch": 2.49, + "learning_rate": 9.408284023668639e-08, + "logits/chosen": -1.5088036060333252, + "logits/rejected": -1.2679582834243774, + "logps/chosen": -146.006103515625, + "logps/rejected": -361.91949462890625, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.680733680725098, + "rewards/margins": 23.57914161682129, + "rewards/rejected": -29.259876251220703, + "step": 5460 + }, + { + "epoch": 2.5, + "learning_rate": 9.323753169907016e-08, + "logits/chosen": -1.5367735624313354, + "logits/rejected": -1.2960542440414429, + "logps/chosen": -145.0201873779297, + "logps/rejected": -355.9728088378906, + "loss": 0.0026, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.884902000427246, + "rewards/margins": 22.25425148010254, + "rewards/rejected": -28.1391544342041, + "step": 5470 + }, + { + "epoch": 2.5, + "learning_rate": 9.239222316145393e-08, + "logits/chosen": -1.5144281387329102, + "logits/rejected": -1.254509449005127, + "logps/chosen": -157.70468139648438, + "logps/rejected": -352.5044860839844, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.387197971343994, + "rewards/margins": 21.703859329223633, + "rewards/rejected": -28.0910587310791, + "step": 5480 + }, + { + "epoch": 2.51, + "learning_rate": 9.15469146238377e-08, + "logits/chosen": -1.5131080150604248, + "logits/rejected": -1.2870782613754272, + "logps/chosen": -148.97203063964844, + "logps/rejected": -355.46661376953125, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.038233757019043, + "rewards/margins": 22.466554641723633, + "rewards/rejected": -28.50478744506836, + "step": 5490 + }, + { + "epoch": 2.51, + "learning_rate": 9.070160608622146e-08, + "logits/chosen": -1.5873641967773438, + "logits/rejected": -1.3531397581100464, + "logps/chosen": -159.92434692382812, + "logps/rejected": -348.5457458496094, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.174721717834473, + "rewards/margins": 21.510509490966797, + "rewards/rejected": -27.685232162475586, + "step": 5500 + }, + { + "epoch": 2.51, + "eval_logits/chosen": -1.494807481765747, + "eval_logits/rejected": -1.2609878778457642, + "eval_logps/chosen": -158.99363708496094, + "eval_logps/rejected": -331.5590515136719, + "eval_loss": 0.024974165484309196, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -6.775642395019531, + "eval_rewards/margins": 19.36980438232422, + "eval_rewards/rejected": -26.145444869995117, + "eval_runtime": 48.9031, + "eval_samples_per_second": 58.524, + "eval_steps_per_second": 1.84, + "step": 5500 + }, + { + "epoch": 2.51, + "learning_rate": 8.985629754860524e-08, + "logits/chosen": -1.6191699504852295, + "logits/rejected": -1.376314640045166, + "logps/chosen": -148.91360473632812, + "logps/rejected": -344.6647644042969, + "loss": 0.0037, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.875352382659912, + "rewards/margins": 21.492692947387695, + "rewards/rejected": -27.3680419921875, + "step": 5510 + }, + { + "epoch": 2.52, + "learning_rate": 8.901098901098901e-08, + "logits/chosen": -1.5830731391906738, + "logits/rejected": -1.3820655345916748, + "logps/chosen": -151.00320434570312, + "logps/rejected": -325.35076904296875, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.6005425453186035, + "rewards/margins": 18.949426651000977, + "rewards/rejected": -25.549968719482422, + "step": 5520 + }, + { + "epoch": 2.52, + "learning_rate": 8.816568047337278e-08, + "logits/chosen": -1.613149881362915, + "logits/rejected": -1.3727449178695679, + "logps/chosen": -159.81369018554688, + "logps/rejected": -357.45867919921875, + "loss": 0.0035, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.34648323059082, + "rewards/margins": 22.07329559326172, + "rewards/rejected": -28.41977882385254, + "step": 5530 + }, + { + "epoch": 2.53, + "learning_rate": 8.732037193575655e-08, + "logits/chosen": -1.6033811569213867, + "logits/rejected": -1.3570531606674194, + "logps/chosen": -151.11459350585938, + "logps/rejected": -353.0989074707031, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.0476179122924805, + "rewards/margins": 22.07670783996582, + "rewards/rejected": -28.124324798583984, + "step": 5540 + }, + { + "epoch": 2.53, + "learning_rate": 8.647506339814031e-08, + "logits/chosen": -1.6857588291168213, + "logits/rejected": -1.4397852420806885, + "logps/chosen": -150.65194702148438, + "logps/rejected": -328.0426025390625, + "loss": 0.0048, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.40293025970459, + "rewards/margins": 20.280824661254883, + "rewards/rejected": -25.68375587463379, + "step": 5550 + }, + { + "epoch": 2.54, + "learning_rate": 8.562975486052408e-08, + "logits/chosen": -1.6101045608520508, + "logits/rejected": -1.3583890199661255, + "logps/chosen": -145.62823486328125, + "logps/rejected": -346.24053955078125, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.573197364807129, + "rewards/margins": 21.59356689453125, + "rewards/rejected": -27.166759490966797, + "step": 5560 + }, + { + "epoch": 2.54, + "learning_rate": 8.478444632290786e-08, + "logits/chosen": -1.6645548343658447, + "logits/rejected": -1.454132318496704, + "logps/chosen": -150.60423278808594, + "logps/rejected": -328.65740966796875, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.984841346740723, + "rewards/margins": 20.694120407104492, + "rewards/rejected": -25.678964614868164, + "step": 5570 + }, + { + "epoch": 2.55, + "learning_rate": 8.393913778529163e-08, + "logits/chosen": -1.6236995458602905, + "logits/rejected": -1.3915711641311646, + "logps/chosen": -140.1810302734375, + "logps/rejected": -341.6992492675781, + "loss": 0.0035, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.804213523864746, + "rewards/margins": 22.202869415283203, + "rewards/rejected": -27.007083892822266, + "step": 5580 + }, + { + "epoch": 2.55, + "learning_rate": 8.30938292476754e-08, + "logits/chosen": -1.602725625038147, + "logits/rejected": -1.38978111743927, + "logps/chosen": -160.79714965820312, + "logps/rejected": -334.1737365722656, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.724407196044922, + "rewards/margins": 19.819522857666016, + "rewards/rejected": -26.543926239013672, + "step": 5590 + }, + { + "epoch": 2.56, + "learning_rate": 8.224852071005916e-08, + "logits/chosen": -1.6001827716827393, + "logits/rejected": -1.327143907546997, + "logps/chosen": -159.7926025390625, + "logps/rejected": -371.45941162109375, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.600203514099121, + "rewards/margins": 23.463272094726562, + "rewards/rejected": -30.063480377197266, + "step": 5600 + }, + { + "epoch": 2.56, + "eval_logits/chosen": -1.5472838878631592, + "eval_logits/rejected": -1.3104890584945679, + "eval_logps/chosen": -157.7054443359375, + "eval_logps/rejected": -330.0379638671875, + "eval_loss": 0.02463771402835846, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -6.646824359893799, + "eval_rewards/margins": 19.346515655517578, + "eval_rewards/rejected": -25.99333953857422, + "eval_runtime": 48.2452, + "eval_samples_per_second": 59.322, + "eval_steps_per_second": 1.865, + "step": 5600 + }, + { + "epoch": 2.56, + "learning_rate": 8.140321217244293e-08, + "logits/chosen": -1.5861504077911377, + "logits/rejected": -1.3507585525512695, + "logps/chosen": -149.94241333007812, + "logps/rejected": -344.6899719238281, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.634446620941162, + "rewards/margins": 21.981483459472656, + "rewards/rejected": -27.615930557250977, + "step": 5610 + }, + { + "epoch": 2.57, + "learning_rate": 8.05579036348267e-08, + "logits/chosen": -1.5509120225906372, + "logits/rejected": -1.3108220100402832, + "logps/chosen": -155.71743774414062, + "logps/rejected": -339.0517578125, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.951332092285156, + "rewards/margins": 19.823192596435547, + "rewards/rejected": -26.774524688720703, + "step": 5620 + }, + { + "epoch": 2.57, + "learning_rate": 7.971259509721048e-08, + "logits/chosen": -1.5993841886520386, + "logits/rejected": -1.3499586582183838, + "logps/chosen": -142.0042724609375, + "logps/rejected": -348.3353576660156, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.063478946685791, + "rewards/margins": 22.494701385498047, + "rewards/rejected": -27.558177947998047, + "step": 5630 + }, + { + "epoch": 2.57, + "learning_rate": 7.886728655959425e-08, + "logits/chosen": -1.5573492050170898, + "logits/rejected": -1.3277238607406616, + "logps/chosen": -149.28855895996094, + "logps/rejected": -334.1047668457031, + "loss": 0.0025, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.636155128479004, + "rewards/margins": 20.696395874023438, + "rewards/rejected": -26.332550048828125, + "step": 5640 + }, + { + "epoch": 2.58, + "learning_rate": 7.802197802197803e-08, + "logits/chosen": -1.5743649005889893, + "logits/rejected": -1.3265464305877686, + "logps/chosen": -152.75967407226562, + "logps/rejected": -342.3771057128906, + "loss": 0.0025, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.603058815002441, + "rewards/margins": 21.599857330322266, + "rewards/rejected": -27.20291519165039, + "step": 5650 + }, + { + "epoch": 2.58, + "learning_rate": 7.717666948436178e-08, + "logits/chosen": -1.5480163097381592, + "logits/rejected": -1.304038405418396, + "logps/chosen": -148.1897430419922, + "logps/rejected": -345.70404052734375, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.109408378601074, + "rewards/margins": 21.218894958496094, + "rewards/rejected": -27.328304290771484, + "step": 5660 + }, + { + "epoch": 2.59, + "learning_rate": 7.633136094674555e-08, + "logits/chosen": -1.564328908920288, + "logits/rejected": -1.3425943851470947, + "logps/chosen": -167.41494750976562, + "logps/rejected": -334.5479431152344, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.401378631591797, + "rewards/margins": 18.970346450805664, + "rewards/rejected": -26.371723175048828, + "step": 5670 + }, + { + "epoch": 2.59, + "learning_rate": 7.548605240912932e-08, + "logits/chosen": -1.5382803678512573, + "logits/rejected": -1.3093044757843018, + "logps/chosen": -149.05081176757812, + "logps/rejected": -340.0987854003906, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.222987174987793, + "rewards/margins": 20.633708953857422, + "rewards/rejected": -26.8566951751709, + "step": 5680 + }, + { + "epoch": 2.6, + "learning_rate": 7.464074387151311e-08, + "logits/chosen": -1.5770385265350342, + "logits/rejected": -1.3411109447479248, + "logps/chosen": -162.41824340820312, + "logps/rejected": -363.8817443847656, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.601351737976074, + "rewards/margins": 21.970367431640625, + "rewards/rejected": -28.57172203063965, + "step": 5690 + }, + { + "epoch": 2.6, + "learning_rate": 7.379543533389688e-08, + "logits/chosen": -1.576453447341919, + "logits/rejected": -1.3042480945587158, + "logps/chosen": -161.98483276367188, + "logps/rejected": -331.50091552734375, + "loss": 0.0017, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.679347991943359, + "rewards/margins": 19.423847198486328, + "rewards/rejected": -26.103191375732422, + "step": 5700 + }, + { + "epoch": 2.6, + "eval_logits/chosen": -1.4879993200302124, + "eval_logits/rejected": -1.2445040941238403, + "eval_logps/chosen": -160.33197021484375, + "eval_logps/rejected": -330.4330749511719, + "eval_loss": 0.02477310597896576, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -6.909476280212402, + "eval_rewards/margins": 19.123369216918945, + "eval_rewards/rejected": -26.03284454345703, + "eval_runtime": 49.4109, + "eval_samples_per_second": 57.922, + "eval_steps_per_second": 1.821, + "step": 5700 + }, + { + "epoch": 2.61, + "learning_rate": 7.295012679628063e-08, + "logits/chosen": -1.5742652416229248, + "logits/rejected": -1.3203189373016357, + "logps/chosen": -144.94931030273438, + "logps/rejected": -353.5227355957031, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.223757743835449, + "rewards/margins": 23.126510620117188, + "rewards/rejected": -28.350269317626953, + "step": 5710 + }, + { + "epoch": 2.61, + "learning_rate": 7.21048182586644e-08, + "logits/chosen": -1.5544811487197876, + "logits/rejected": -1.3336037397384644, + "logps/chosen": -150.41961669921875, + "logps/rejected": -340.2244873046875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.044550895690918, + "rewards/margins": 20.950010299682617, + "rewards/rejected": -26.99456214904785, + "step": 5720 + }, + { + "epoch": 2.62, + "learning_rate": 7.125950972104817e-08, + "logits/chosen": -1.5454206466674805, + "logits/rejected": -1.2845289707183838, + "logps/chosen": -153.46827697753906, + "logps/rejected": -342.5339660644531, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.792252063751221, + "rewards/margins": 21.301063537597656, + "rewards/rejected": -27.09331703186035, + "step": 5730 + }, + { + "epoch": 2.62, + "learning_rate": 7.041420118343195e-08, + "logits/chosen": -1.501680612564087, + "logits/rejected": -1.2636568546295166, + "logps/chosen": -157.52508544921875, + "logps/rejected": -365.23486328125, + "loss": 0.0038, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -6.290675163269043, + "rewards/margins": 22.773101806640625, + "rewards/rejected": -29.06377601623535, + "step": 5740 + }, + { + "epoch": 2.62, + "learning_rate": 6.956889264581573e-08, + "logits/chosen": -1.498518943786621, + "logits/rejected": -1.2680155038833618, + "logps/chosen": -157.18557739257812, + "logps/rejected": -360.58941650390625, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.826033115386963, + "rewards/margins": 22.10856819152832, + "rewards/rejected": -28.934600830078125, + "step": 5750 + }, + { + "epoch": 2.63, + "learning_rate": 6.872358410819949e-08, + "logits/chosen": -1.4816702604293823, + "logits/rejected": -1.2337892055511475, + "logps/chosen": -147.61468505859375, + "logps/rejected": -333.8875732421875, + "loss": 0.0035, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.283675670623779, + "rewards/margins": 19.97292709350586, + "rewards/rejected": -26.256603240966797, + "step": 5760 + }, + { + "epoch": 2.63, + "learning_rate": 6.787827557058326e-08, + "logits/chosen": -1.5251476764678955, + "logits/rejected": -1.2596557140350342, + "logps/chosen": -158.60626220703125, + "logps/rejected": -352.7878723144531, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.330402374267578, + "rewards/margins": 21.95157241821289, + "rewards/rejected": -28.2819766998291, + "step": 5770 + }, + { + "epoch": 2.64, + "learning_rate": 6.703296703296703e-08, + "logits/chosen": -1.4867092370986938, + "logits/rejected": -1.262742042541504, + "logps/chosen": -171.3079071044922, + "logps/rejected": -350.98760986328125, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.340981960296631, + "rewards/margins": 20.05600929260254, + "rewards/rejected": -27.396991729736328, + "step": 5780 + }, + { + "epoch": 2.64, + "learning_rate": 6.61876584953508e-08, + "logits/chosen": -1.4862596988677979, + "logits/rejected": -1.2417380809783936, + "logps/chosen": -163.3212432861328, + "logps/rejected": -348.84796142578125, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.111495018005371, + "rewards/margins": 20.963470458984375, + "rewards/rejected": -28.074966430664062, + "step": 5790 + }, + { + "epoch": 2.65, + "learning_rate": 6.534234995773457e-08, + "logits/chosen": -1.4852240085601807, + "logits/rejected": -1.2178817987442017, + "logps/chosen": -149.77035522460938, + "logps/rejected": -368.5277099609375, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.752679347991943, + "rewards/margins": 23.98304557800293, + "rewards/rejected": -29.735727310180664, + "step": 5800 + }, + { + "epoch": 2.65, + "eval_logits/chosen": -1.4105994701385498, + "eval_logits/rejected": -1.1525661945343018, + "eval_logps/chosen": -166.90325927734375, + "eval_logps/rejected": -348.3179931640625, + "eval_loss": 0.025658363476395607, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -7.566605567932129, + "eval_rewards/margins": 20.25473403930664, + "eval_rewards/rejected": -27.821340560913086, + "eval_runtime": 48.6793, + "eval_samples_per_second": 58.793, + "eval_steps_per_second": 1.849, + "step": 5800 + }, + { + "epoch": 2.65, + "learning_rate": 6.449704142011835e-08, + "logits/chosen": -1.5362986326217651, + "logits/rejected": -1.2705743312835693, + "logps/chosen": -162.11782836914062, + "logps/rejected": -355.174072265625, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.987889289855957, + "rewards/margins": 21.60100746154785, + "rewards/rejected": -28.588897705078125, + "step": 5810 + }, + { + "epoch": 2.66, + "learning_rate": 6.365173288250211e-08, + "logits/chosen": -1.5174287557601929, + "logits/rejected": -1.2580162286758423, + "logps/chosen": -152.82260131835938, + "logps/rejected": -358.1809997558594, + "loss": 0.0024, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.536625862121582, + "rewards/margins": 22.3090763092041, + "rewards/rejected": -28.845699310302734, + "step": 5820 + }, + { + "epoch": 2.66, + "learning_rate": 6.280642434488588e-08, + "logits/chosen": -1.5420914888381958, + "logits/rejected": -1.278857707977295, + "logps/chosen": -142.987548828125, + "logps/rejected": -353.4092712402344, + "loss": 0.0037, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.59360408782959, + "rewards/margins": 22.749876022338867, + "rewards/rejected": -28.343481063842773, + "step": 5830 + }, + { + "epoch": 2.67, + "learning_rate": 6.196111580726965e-08, + "logits/chosen": -1.4608005285263062, + "logits/rejected": -1.1506479978561401, + "logps/chosen": -152.3258056640625, + "logps/rejected": -392.17266845703125, + "loss": 0.0036, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.077305793762207, + "rewards/margins": 25.937328338623047, + "rewards/rejected": -32.01463317871094, + "step": 5840 + }, + { + "epoch": 2.67, + "learning_rate": 6.111580726965342e-08, + "logits/chosen": -1.4075108766555786, + "logits/rejected": -1.1176444292068481, + "logps/chosen": -152.30905151367188, + "logps/rejected": -353.33258056640625, + "loss": 0.0058, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.439084053039551, + "rewards/margins": 22.003307342529297, + "rewards/rejected": -28.4423885345459, + "step": 5850 + }, + { + "epoch": 2.67, + "learning_rate": 6.027049873203719e-08, + "logits/chosen": -1.5007381439208984, + "logits/rejected": -1.2174341678619385, + "logps/chosen": -160.6419677734375, + "logps/rejected": -369.1076354980469, + "loss": 0.0065, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.2555928230285645, + "rewards/margins": 23.638750076293945, + "rewards/rejected": -29.894338607788086, + "step": 5860 + }, + { + "epoch": 2.68, + "learning_rate": 5.942519019442096e-08, + "logits/chosen": -1.4120880365371704, + "logits/rejected": -1.1486588716506958, + "logps/chosen": -165.69900512695312, + "logps/rejected": -370.86114501953125, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.068509101867676, + "rewards/margins": 22.668869018554688, + "rewards/rejected": -29.737377166748047, + "step": 5870 + }, + { + "epoch": 2.68, + "learning_rate": 5.857988165680473e-08, + "logits/chosen": -1.4748567342758179, + "logits/rejected": -1.2470533847808838, + "logps/chosen": -152.8289794921875, + "logps/rejected": -346.9162292480469, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.39940881729126, + "rewards/margins": 21.182292938232422, + "rewards/rejected": -27.581701278686523, + "step": 5880 + }, + { + "epoch": 2.69, + "learning_rate": 5.77345731191885e-08, + "logits/chosen": -1.3950707912445068, + "logits/rejected": -1.1131783723831177, + "logps/chosen": -155.28848266601562, + "logps/rejected": -378.28997802734375, + "loss": 0.0091, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.076868534088135, + "rewards/margins": 24.460920333862305, + "rewards/rejected": -30.537792205810547, + "step": 5890 + }, + { + "epoch": 2.69, + "learning_rate": 5.688926458157227e-08, + "logits/chosen": -1.5006979703903198, + "logits/rejected": -1.1969038248062134, + "logps/chosen": -157.77194213867188, + "logps/rejected": -364.2388000488281, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.46331787109375, + "rewards/margins": 22.99692726135254, + "rewards/rejected": -29.46024513244629, + "step": 5900 + }, + { + "epoch": 2.69, + "eval_logits/chosen": -1.3563249111175537, + "eval_logits/rejected": -1.0876926183700562, + "eval_logps/chosen": -167.66436767578125, + "eval_logps/rejected": -355.3266296386719, + "eval_loss": 0.026315541937947273, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -7.6427178382873535, + "eval_rewards/margins": 20.879486083984375, + "eval_rewards/rejected": -28.522199630737305, + "eval_runtime": 49.1795, + "eval_samples_per_second": 58.195, + "eval_steps_per_second": 1.83, + "step": 5900 + }, + { + "epoch": 2.7, + "learning_rate": 5.604395604395604e-08, + "logits/chosen": -1.4665958881378174, + "logits/rejected": -1.1959645748138428, + "logps/chosen": -149.21376037597656, + "logps/rejected": -355.60443115234375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.6861572265625, + "rewards/margins": 22.935123443603516, + "rewards/rejected": -28.621280670166016, + "step": 5910 + }, + { + "epoch": 2.7, + "learning_rate": 5.519864750633981e-08, + "logits/chosen": -1.4556392431259155, + "logits/rejected": -1.1394058465957642, + "logps/chosen": -160.46078491210938, + "logps/rejected": -371.42120361328125, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.608606815338135, + "rewards/margins": 23.34994125366211, + "rewards/rejected": -29.958547592163086, + "step": 5920 + }, + { + "epoch": 2.71, + "learning_rate": 5.435333896872358e-08, + "logits/chosen": -1.4221923351287842, + "logits/rejected": -1.1590121984481812, + "logps/chosen": -154.56790161132812, + "logps/rejected": -368.48944091796875, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.784639835357666, + "rewards/margins": 23.924802780151367, + "rewards/rejected": -29.709442138671875, + "step": 5930 + }, + { + "epoch": 2.71, + "learning_rate": 5.350803043110735e-08, + "logits/chosen": -1.4274301528930664, + "logits/rejected": -1.168290376663208, + "logps/chosen": -167.0966796875, + "logps/rejected": -396.8948669433594, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.695293426513672, + "rewards/margins": 24.249807357788086, + "rewards/rejected": -31.945098876953125, + "step": 5940 + }, + { + "epoch": 2.72, + "learning_rate": 5.266272189349112e-08, + "logits/chosen": -1.474579095840454, + "logits/rejected": -1.196025013923645, + "logps/chosen": -152.70132446289062, + "logps/rejected": -369.7622985839844, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.682868003845215, + "rewards/margins": 23.80048370361328, + "rewards/rejected": -29.483348846435547, + "step": 5950 + }, + { + "epoch": 2.72, + "learning_rate": 5.181741335587489e-08, + "logits/chosen": -1.4360884428024292, + "logits/rejected": -1.2026808261871338, + "logps/chosen": -161.60153198242188, + "logps/rejected": -364.6895751953125, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.776439666748047, + "rewards/margins": 21.810924530029297, + "rewards/rejected": -28.58736228942871, + "step": 5960 + }, + { + "epoch": 2.72, + "learning_rate": 5.0972104818258664e-08, + "logits/chosen": -1.4061036109924316, + "logits/rejected": -1.1320244073867798, + "logps/chosen": -158.87969970703125, + "logps/rejected": -367.72894287109375, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.624148368835449, + "rewards/margins": 23.071802139282227, + "rewards/rejected": -29.695947647094727, + "step": 5970 + }, + { + "epoch": 2.73, + "learning_rate": 5.0126796280642434e-08, + "logits/chosen": -1.4696153402328491, + "logits/rejected": -1.1484925746917725, + "logps/chosen": -165.90017700195312, + "logps/rejected": -379.2989501953125, + "loss": 0.0045, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -7.214005947113037, + "rewards/margins": 23.535503387451172, + "rewards/rejected": -30.74951171875, + "step": 5980 + }, + { + "epoch": 2.73, + "learning_rate": 4.92814877430262e-08, + "logits/chosen": -1.5003819465637207, + "logits/rejected": -1.194960355758667, + "logps/chosen": -153.00140380859375, + "logps/rejected": -358.32098388671875, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.173357963562012, + "rewards/margins": 23.310537338256836, + "rewards/rejected": -28.483896255493164, + "step": 5990 + }, + { + "epoch": 2.74, + "learning_rate": 4.8436179205409975e-08, + "logits/chosen": -1.4279770851135254, + "logits/rejected": -1.1825916767120361, + "logps/chosen": -152.53591918945312, + "logps/rejected": -368.3629455566406, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.5308427810668945, + "rewards/margins": 23.156478881835938, + "rewards/rejected": -29.687320709228516, + "step": 6000 + }, + { + "epoch": 2.74, + "eval_logits/chosen": -1.4632593393325806, + "eval_logits/rejected": -1.2176082134246826, + "eval_logps/chosen": -159.17491149902344, + "eval_logps/rejected": -333.15716552734375, + "eval_loss": 0.024197373539209366, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -6.793771743774414, + "eval_rewards/margins": 19.51148796081543, + "eval_rewards/rejected": -26.30525779724121, + "eval_runtime": 48.3964, + "eval_samples_per_second": 59.137, + "eval_steps_per_second": 1.86, + "step": 6000 + }, + { + "epoch": 2.74, + "learning_rate": 4.7590870667793745e-08, + "logits/chosen": -1.5039942264556885, + "logits/rejected": -1.2951033115386963, + "logps/chosen": -154.0543975830078, + "logps/rejected": -350.24090576171875, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.628406524658203, + "rewards/margins": 21.300485610961914, + "rewards/rejected": -27.928890228271484, + "step": 6010 + }, + { + "epoch": 2.75, + "learning_rate": 4.674556213017751e-08, + "logits/chosen": -1.557823896408081, + "logits/rejected": -1.3056771755218506, + "logps/chosen": -140.63388061523438, + "logps/rejected": -359.533447265625, + "loss": 0.0034, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.889941215515137, + "rewards/margins": 23.574077606201172, + "rewards/rejected": -28.46401596069336, + "step": 6020 + }, + { + "epoch": 2.75, + "learning_rate": 4.5900253592561286e-08, + "logits/chosen": -1.5295337438583374, + "logits/rejected": -1.261516809463501, + "logps/chosen": -152.05203247070312, + "logps/rejected": -349.45220947265625, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.756598472595215, + "rewards/margins": 22.195484161376953, + "rewards/rejected": -27.95208168029785, + "step": 6030 + }, + { + "epoch": 2.76, + "learning_rate": 4.505494505494505e-08, + "logits/chosen": -1.489527940750122, + "logits/rejected": -1.2501986026763916, + "logps/chosen": -149.98753356933594, + "logps/rejected": -355.73828125, + "loss": 0.0063, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.916062355041504, + "rewards/margins": 22.23210906982422, + "rewards/rejected": -28.14817237854004, + "step": 6040 + }, + { + "epoch": 2.76, + "learning_rate": 4.420963651732882e-08, + "logits/chosen": -1.5801336765289307, + "logits/rejected": -1.3497835397720337, + "logps/chosen": -158.12344360351562, + "logps/rejected": -346.0705871582031, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.168856620788574, + "rewards/margins": 20.862895965576172, + "rewards/rejected": -27.031749725341797, + "step": 6050 + }, + { + "epoch": 2.77, + "learning_rate": 4.33643279797126e-08, + "logits/chosen": -1.532447338104248, + "logits/rejected": -1.340090274810791, + "logps/chosen": -151.39772033691406, + "logps/rejected": -325.19647216796875, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.766188621520996, + "rewards/margins": 18.759479522705078, + "rewards/rejected": -25.525672912597656, + "step": 6060 + }, + { + "epoch": 2.77, + "learning_rate": 4.251901944209636e-08, + "logits/chosen": -1.5740821361541748, + "logits/rejected": -1.3083857297897339, + "logps/chosen": -155.3162841796875, + "logps/rejected": -347.370361328125, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.801811218261719, + "rewards/margins": 21.371374130249023, + "rewards/rejected": -27.173187255859375, + "step": 6070 + }, + { + "epoch": 2.77, + "learning_rate": 4.167371090448013e-08, + "logits/chosen": -1.5464942455291748, + "logits/rejected": -1.306731104850769, + "logps/chosen": -142.73570251464844, + "logps/rejected": -342.77777099609375, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.27818489074707, + "rewards/margins": 21.76172637939453, + "rewards/rejected": -27.0399112701416, + "step": 6080 + }, + { + "epoch": 2.78, + "learning_rate": 4.082840236686391e-08, + "logits/chosen": -1.4867476224899292, + "logits/rejected": -1.2121250629425049, + "logps/chosen": -142.33984375, + "logps/rejected": -352.05255126953125, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.472867012023926, + "rewards/margins": 22.67243003845215, + "rewards/rejected": -28.145299911499023, + "step": 6090 + }, + { + "epoch": 2.78, + "learning_rate": 3.998309382924767e-08, + "logits/chosen": -1.5143063068389893, + "logits/rejected": -1.2820769548416138, + "logps/chosen": -154.45347595214844, + "logps/rejected": -353.0608215332031, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.095731258392334, + "rewards/margins": 22.116703033447266, + "rewards/rejected": -28.21243667602539, + "step": 6100 + }, + { + "epoch": 2.78, + "eval_logits/chosen": -1.4429353475570679, + "eval_logits/rejected": -1.1958377361297607, + "eval_logps/chosen": -159.91256713867188, + "eval_logps/rejected": -335.501953125, + "eval_loss": 0.02424330823123455, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -6.8675360679626465, + "eval_rewards/margins": 19.67220115661621, + "eval_rewards/rejected": -26.539735794067383, + "eval_runtime": 48.4944, + "eval_samples_per_second": 59.017, + "eval_steps_per_second": 1.856, + "step": 6100 + }, + { + "epoch": 2.79, + "learning_rate": 3.913778529163144e-08, + "logits/chosen": -1.4917789697647095, + "logits/rejected": -1.2775121927261353, + "logps/chosen": -166.1258544921875, + "logps/rejected": -356.85479736328125, + "loss": 0.0034, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -7.622159004211426, + "rewards/margins": 20.657934188842773, + "rewards/rejected": -28.280094146728516, + "step": 6110 + }, + { + "epoch": 2.79, + "learning_rate": 3.829247675401521e-08, + "logits/chosen": -1.5115435123443604, + "logits/rejected": -1.2851530313491821, + "logps/chosen": -157.784912109375, + "logps/rejected": -343.08648681640625, + "loss": 0.0035, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.815495491027832, + "rewards/margins": 21.354597091674805, + "rewards/rejected": -27.170089721679688, + "step": 6120 + }, + { + "epoch": 2.8, + "learning_rate": 3.744716821639898e-08, + "logits/chosen": -1.4754083156585693, + "logits/rejected": -1.2475736141204834, + "logps/chosen": -153.1291046142578, + "logps/rejected": -343.4634704589844, + "loss": 0.0045, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.1217546463012695, + "rewards/margins": 21.024320602416992, + "rewards/rejected": -27.146076202392578, + "step": 6130 + }, + { + "epoch": 2.8, + "learning_rate": 3.6601859678782753e-08, + "logits/chosen": -1.540523648262024, + "logits/rejected": -1.3162726163864136, + "logps/chosen": -160.690185546875, + "logps/rejected": -339.6878967285156, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.333074569702148, + "rewards/margins": 20.471248626708984, + "rewards/rejected": -26.804325103759766, + "step": 6140 + }, + { + "epoch": 2.81, + "learning_rate": 3.5756551141166524e-08, + "logits/chosen": -1.5146772861480713, + "logits/rejected": -1.2612508535385132, + "logps/chosen": -160.58139038085938, + "logps/rejected": -364.7797546386719, + "loss": 0.0048, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.714122772216797, + "rewards/margins": 22.714826583862305, + "rewards/rejected": -29.428951263427734, + "step": 6150 + }, + { + "epoch": 2.81, + "learning_rate": 3.4911242603550294e-08, + "logits/chosen": -1.504428744316101, + "logits/rejected": -1.236289620399475, + "logps/chosen": -149.89620971679688, + "logps/rejected": -360.3150939941406, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.815154075622559, + "rewards/margins": 23.259376525878906, + "rewards/rejected": -29.07452964782715, + "step": 6160 + }, + { + "epoch": 2.82, + "learning_rate": 3.4065934065934065e-08, + "logits/chosen": -1.5386370420455933, + "logits/rejected": -1.2299727201461792, + "logps/chosen": -150.00607299804688, + "logps/rejected": -350.6068115234375, + "loss": 0.0019, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.3740010261535645, + "rewards/margins": 22.33721923828125, + "rewards/rejected": -27.71121597290039, + "step": 6170 + }, + { + "epoch": 2.82, + "learning_rate": 3.3220625528317835e-08, + "logits/chosen": -1.4936457872390747, + "logits/rejected": -1.2583879232406616, + "logps/chosen": -163.72991943359375, + "logps/rejected": -359.2135925292969, + "loss": 0.0024, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.795522212982178, + "rewards/margins": 21.916730880737305, + "rewards/rejected": -28.712255477905273, + "step": 6180 + }, + { + "epoch": 2.83, + "learning_rate": 3.2375316990701605e-08, + "logits/chosen": -1.4773077964782715, + "logits/rejected": -1.2353664636611938, + "logps/chosen": -158.53857421875, + "logps/rejected": -351.96295166015625, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.630317687988281, + "rewards/margins": 21.631898880004883, + "rewards/rejected": -28.262216567993164, + "step": 6190 + }, + { + "epoch": 2.83, + "learning_rate": 3.153000845308537e-08, + "logits/chosen": -1.4404528141021729, + "logits/rejected": -1.2168524265289307, + "logps/chosen": -151.5290069580078, + "logps/rejected": -339.2045593261719, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.218883991241455, + "rewards/margins": 20.83730125427246, + "rewards/rejected": -27.05618667602539, + "step": 6200 + }, + { + "epoch": 2.83, + "eval_logits/chosen": -1.400346279144287, + "eval_logits/rejected": -1.1466337442398071, + "eval_logps/chosen": -162.3729248046875, + "eval_logps/rejected": -342.3287048339844, + "eval_loss": 0.024862240999937057, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -7.11357307434082, + "eval_rewards/margins": 20.108837127685547, + "eval_rewards/rejected": -27.222412109375, + "eval_runtime": 49.356, + "eval_samples_per_second": 57.987, + "eval_steps_per_second": 1.823, + "step": 6200 + }, + { + "epoch": 2.83, + "learning_rate": 3.0684699915469146e-08, + "logits/chosen": -1.5231372117996216, + "logits/rejected": -1.2724891901016235, + "logps/chosen": -153.64529418945312, + "logps/rejected": -357.3453369140625, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.938612937927246, + "rewards/margins": 22.537324905395508, + "rewards/rejected": -28.475936889648438, + "step": 6210 + }, + { + "epoch": 2.84, + "learning_rate": 2.9839391377852916e-08, + "logits/chosen": -1.4560668468475342, + "logits/rejected": -1.2280206680297852, + "logps/chosen": -169.55458068847656, + "logps/rejected": -367.2767639160156, + "loss": 0.0024, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -7.545860290527344, + "rewards/margins": 21.704364776611328, + "rewards/rejected": -29.250225067138672, + "step": 6220 + }, + { + "epoch": 2.84, + "learning_rate": 2.8994082840236687e-08, + "logits/chosen": -1.512521743774414, + "logits/rejected": -1.2356914281845093, + "logps/chosen": -156.36895751953125, + "logps/rejected": -352.8377380371094, + "loss": 0.0034, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.077000617980957, + "rewards/margins": 22.118440628051758, + "rewards/rejected": -28.1954402923584, + "step": 6230 + }, + { + "epoch": 2.85, + "learning_rate": 2.8148774302620457e-08, + "logits/chosen": -1.5163729190826416, + "logits/rejected": -1.242244839668274, + "logps/chosen": -152.9321746826172, + "logps/rejected": -368.83221435546875, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.0060224533081055, + "rewards/margins": 23.76858139038086, + "rewards/rejected": -29.774608612060547, + "step": 6240 + }, + { + "epoch": 2.85, + "learning_rate": 2.7303465765004224e-08, + "logits/chosen": -1.4778908491134644, + "logits/rejected": -1.2573697566986084, + "logps/chosen": -162.7844696044922, + "logps/rejected": -369.68560791015625, + "loss": 0.0036, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -7.185385227203369, + "rewards/margins": 22.159021377563477, + "rewards/rejected": -29.344406127929688, + "step": 6250 + }, + { + "epoch": 2.86, + "learning_rate": 2.6458157227387995e-08, + "logits/chosen": -1.4954620599746704, + "logits/rejected": -1.2360405921936035, + "logps/chosen": -147.39080810546875, + "logps/rejected": -339.0125732421875, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.425978660583496, + "rewards/margins": 21.780513763427734, + "rewards/rejected": -27.206493377685547, + "step": 6260 + }, + { + "epoch": 2.86, + "learning_rate": 2.5612848689771768e-08, + "logits/chosen": -1.482560634613037, + "logits/rejected": -1.218420386314392, + "logps/chosen": -162.4448699951172, + "logps/rejected": -368.6003723144531, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.563105583190918, + "rewards/margins": 23.120697021484375, + "rewards/rejected": -29.68380355834961, + "step": 6270 + }, + { + "epoch": 2.87, + "learning_rate": 2.4767540152155535e-08, + "logits/chosen": -1.4622437953948975, + "logits/rejected": -1.205731749534607, + "logps/chosen": -158.74293518066406, + "logps/rejected": -361.62799072265625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.524346351623535, + "rewards/margins": 22.388973236083984, + "rewards/rejected": -28.913320541381836, + "step": 6280 + }, + { + "epoch": 2.87, + "learning_rate": 2.3922231614539306e-08, + "logits/chosen": -1.429924488067627, + "logits/rejected": -1.2010109424591064, + "logps/chosen": -162.57614135742188, + "logps/rejected": -357.17596435546875, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.556378364562988, + "rewards/margins": 21.281539916992188, + "rewards/rejected": -28.83791732788086, + "step": 6290 + }, + { + "epoch": 2.88, + "learning_rate": 2.3076923076923076e-08, + "logits/chosen": -1.4317352771759033, + "logits/rejected": -1.1608693599700928, + "logps/chosen": -144.91014099121094, + "logps/rejected": -360.8427734375, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.919538974761963, + "rewards/margins": 24.008647918701172, + "rewards/rejected": -28.92818832397461, + "step": 6300 + }, + { + "epoch": 2.88, + "eval_logits/chosen": -1.4024568796157837, + "eval_logits/rejected": -1.1501376628875732, + "eval_logps/chosen": -160.83221435546875, + "eval_logps/rejected": -341.0271911621094, + "eval_loss": 0.02510543167591095, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -6.95950174331665, + "eval_rewards/margins": 20.132761001586914, + "eval_rewards/rejected": -27.092260360717773, + "eval_runtime": 49.4653, + "eval_samples_per_second": 57.859, + "eval_steps_per_second": 1.819, + "step": 6300 + }, + { + "epoch": 2.88, + "learning_rate": 2.2231614539306847e-08, + "logits/chosen": -1.5092194080352783, + "logits/rejected": -1.2494463920593262, + "logps/chosen": -160.4544677734375, + "logps/rejected": -357.55181884765625, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.256556034088135, + "rewards/margins": 22.368412017822266, + "rewards/rejected": -28.624969482421875, + "step": 6310 + }, + { + "epoch": 2.88, + "learning_rate": 2.1386306001690617e-08, + "logits/chosen": -1.4365084171295166, + "logits/rejected": -1.1854063272476196, + "logps/chosen": -153.9835662841797, + "logps/rejected": -371.0440979003906, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.483023166656494, + "rewards/margins": 23.244380950927734, + "rewards/rejected": -29.727405548095703, + "step": 6320 + }, + { + "epoch": 2.89, + "learning_rate": 2.0540997464074387e-08, + "logits/chosen": -1.4596434831619263, + "logits/rejected": -1.1587064266204834, + "logps/chosen": -154.91305541992188, + "logps/rejected": -355.8430480957031, + "loss": 0.0081, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.246144771575928, + "rewards/margins": 22.51506996154785, + "rewards/rejected": -28.761215209960938, + "step": 6330 + }, + { + "epoch": 2.89, + "learning_rate": 1.9695688926458154e-08, + "logits/chosen": -1.4274392127990723, + "logits/rejected": -1.1878191232681274, + "logps/chosen": -157.2246551513672, + "logps/rejected": -373.7826232910156, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.614645481109619, + "rewards/margins": 23.510032653808594, + "rewards/rejected": -30.124679565429688, + "step": 6340 + }, + { + "epoch": 2.9, + "learning_rate": 1.8850380388841928e-08, + "logits/chosen": -1.468880534172058, + "logits/rejected": -1.1757152080535889, + "logps/chosen": -150.7034912109375, + "logps/rejected": -348.62127685546875, + "loss": 0.0023, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.9861860275268555, + "rewards/margins": 21.781715393066406, + "rewards/rejected": -27.767902374267578, + "step": 6350 + }, + { + "epoch": 2.9, + "learning_rate": 1.80050718512257e-08, + "logits/chosen": -1.419793725013733, + "logits/rejected": -1.1910134553909302, + "logps/chosen": -155.99221801757812, + "logps/rejected": -349.697265625, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.731202602386475, + "rewards/margins": 21.212007522583008, + "rewards/rejected": -27.94321060180664, + "step": 6360 + }, + { + "epoch": 2.91, + "learning_rate": 1.7159763313609465e-08, + "logits/chosen": -1.4699496030807495, + "logits/rejected": -1.1939094066619873, + "logps/chosen": -159.7974090576172, + "logps/rejected": -363.49420166015625, + "loss": 0.0088, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.286614418029785, + "rewards/margins": 22.9273681640625, + "rewards/rejected": -29.213979721069336, + "step": 6370 + }, + { + "epoch": 2.91, + "learning_rate": 1.6314454775993236e-08, + "logits/chosen": -1.443658709526062, + "logits/rejected": -1.1851694583892822, + "logps/chosen": -140.8810272216797, + "logps/rejected": -347.7745056152344, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.21396541595459, + "rewards/margins": 22.442602157592773, + "rewards/rejected": -27.656566619873047, + "step": 6380 + }, + { + "epoch": 2.92, + "learning_rate": 1.5469146238377006e-08, + "logits/chosen": -1.4029505252838135, + "logits/rejected": -1.1696784496307373, + "logps/chosen": -152.52932739257812, + "logps/rejected": -353.049560546875, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.933673858642578, + "rewards/margins": 22.368228912353516, + "rewards/rejected": -28.301904678344727, + "step": 6390 + }, + { + "epoch": 2.92, + "learning_rate": 1.4623837700760778e-08, + "logits/chosen": -1.5081956386566162, + "logits/rejected": -1.2300573587417603, + "logps/chosen": -142.51382446289062, + "logps/rejected": -364.7895202636719, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.941340446472168, + "rewards/margins": 24.369924545288086, + "rewards/rejected": -29.311267852783203, + "step": 6400 + }, + { + "epoch": 2.92, + "eval_logits/chosen": -1.386672854423523, + "eval_logits/rejected": -1.128792405128479, + "eval_logps/chosen": -162.50047302246094, + "eval_logps/rejected": -347.1543884277344, + "eval_loss": 0.025260092690587044, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -7.126327037811279, + "eval_rewards/margins": 20.57865333557129, + "eval_rewards/rejected": -27.70498275756836, + "eval_runtime": 48.9439, + "eval_samples_per_second": 58.475, + "eval_steps_per_second": 1.839, + "step": 6400 + }, + { + "epoch": 2.93, + "learning_rate": 1.3778529163144547e-08, + "logits/chosen": -1.3817229270935059, + "logits/rejected": -1.1325594186782837, + "logps/chosen": -154.70213317871094, + "logps/rejected": -367.01953125, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.565983772277832, + "rewards/margins": 22.861417770385742, + "rewards/rejected": -29.42740249633789, + "step": 6410 + }, + { + "epoch": 2.93, + "learning_rate": 1.2933220625528317e-08, + "logits/chosen": -1.4914562702178955, + "logits/rejected": -1.2700700759887695, + "logps/chosen": -153.0315399169922, + "logps/rejected": -350.91888427734375, + "loss": 0.0047, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.609368801116943, + "rewards/margins": 21.178762435913086, + "rewards/rejected": -27.788131713867188, + "step": 6420 + }, + { + "epoch": 2.93, + "learning_rate": 1.2087912087912088e-08, + "logits/chosen": -1.445056676864624, + "logits/rejected": -1.1885985136032104, + "logps/chosen": -154.37966918945312, + "logps/rejected": -371.3355407714844, + "loss": 0.0036, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.280908107757568, + "rewards/margins": 23.554365158081055, + "rewards/rejected": -29.835275650024414, + "step": 6430 + }, + { + "epoch": 2.94, + "learning_rate": 1.1242603550295858e-08, + "logits/chosen": -1.4893832206726074, + "logits/rejected": -1.2540134191513062, + "logps/chosen": -157.16061401367188, + "logps/rejected": -351.6845703125, + "loss": 0.0034, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.9279327392578125, + "rewards/margins": 20.94320297241211, + "rewards/rejected": -27.871135711669922, + "step": 6440 + }, + { + "epoch": 2.94, + "learning_rate": 1.0397295012679627e-08, + "logits/chosen": -1.4614284038543701, + "logits/rejected": -1.2071059942245483, + "logps/chosen": -154.94326782226562, + "logps/rejected": -356.627197265625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.525387763977051, + "rewards/margins": 21.915996551513672, + "rewards/rejected": -28.441381454467773, + "step": 6450 + }, + { + "epoch": 2.95, + "learning_rate": 9.551986475063399e-09, + "logits/chosen": -1.4893442392349243, + "logits/rejected": -1.2231934070587158, + "logps/chosen": -154.13449096679688, + "logps/rejected": -371.9717712402344, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.246697425842285, + "rewards/margins": 23.48638153076172, + "rewards/rejected": -29.733081817626953, + "step": 6460 + }, + { + "epoch": 2.95, + "learning_rate": 8.706677937447167e-09, + "logits/chosen": -1.5489857196807861, + "logits/rejected": -1.2381068468093872, + "logps/chosen": -157.33367919921875, + "logps/rejected": -368.0112609863281, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.018952369689941, + "rewards/margins": 23.636343002319336, + "rewards/rejected": -29.655292510986328, + "step": 6470 + }, + { + "epoch": 2.96, + "learning_rate": 7.861369399830938e-09, + "logits/chosen": -1.4903433322906494, + "logits/rejected": -1.2489861249923706, + "logps/chosen": -157.29364013671875, + "logps/rejected": -350.2141418457031, + "loss": 0.0035, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.610640525817871, + "rewards/margins": 21.394311904907227, + "rewards/rejected": -28.004953384399414, + "step": 6480 + }, + { + "epoch": 2.96, + "learning_rate": 7.016060862214708e-09, + "logits/chosen": -1.4743247032165527, + "logits/rejected": -1.2524207830429077, + "logps/chosen": -162.22581481933594, + "logps/rejected": -361.66302490234375, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.312684535980225, + "rewards/margins": 21.844139099121094, + "rewards/rejected": -29.15682601928711, + "step": 6490 + }, + { + "epoch": 2.97, + "learning_rate": 6.170752324598478e-09, + "logits/chosen": -1.4401403665542603, + "logits/rejected": -1.2095723152160645, + "logps/chosen": -158.8749237060547, + "logps/rejected": -342.3064880371094, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.759598731994629, + "rewards/margins": 20.1064510345459, + "rewards/rejected": -26.866052627563477, + "step": 6500 + }, + { + "epoch": 2.97, + "eval_logits/chosen": -1.3866673707962036, + "eval_logits/rejected": -1.12844979763031, + "eval_logps/chosen": -162.0652313232422, + "eval_logps/rejected": -347.0838317871094, + "eval_loss": 0.02533269301056862, + "eval_rewards/accuracies": 0.980555534362793, + "eval_rewards/chosen": -7.0828022956848145, + "eval_rewards/margins": 20.615121841430664, + "eval_rewards/rejected": -27.697925567626953, + "eval_runtime": 48.9445, + "eval_samples_per_second": 58.474, + "eval_steps_per_second": 1.839, + "step": 6500 + }, + { + "epoch": 2.97, + "learning_rate": 5.325443786982248e-09, + "logits/chosen": -1.5094420909881592, + "logits/rejected": -1.2922451496124268, + "logps/chosen": -153.08706665039062, + "logps/rejected": -348.4012756347656, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.878302097320557, + "rewards/margins": 22.082895278930664, + "rewards/rejected": -27.961196899414062, + "step": 6510 + }, + { + "epoch": 2.98, + "learning_rate": 4.4801352493660185e-09, + "logits/chosen": -1.4939616918563843, + "logits/rejected": -1.2416346073150635, + "logps/chosen": -144.56320190429688, + "logps/rejected": -368.6065673828125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.104874610900879, + "rewards/margins": 24.626834869384766, + "rewards/rejected": -29.73171043395996, + "step": 6520 + }, + { + "epoch": 2.98, + "learning_rate": 3.6348267117497885e-09, + "logits/chosen": -1.456238031387329, + "logits/rejected": -1.2182656526565552, + "logps/chosen": -169.69577026367188, + "logps/rejected": -350.4523620605469, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.985285758972168, + "rewards/margins": 20.02364158630371, + "rewards/rejected": -28.008926391601562, + "step": 6530 + }, + { + "epoch": 2.98, + "learning_rate": 2.7895181741335584e-09, + "logits/chosen": -1.459014654159546, + "logits/rejected": -1.2330175638198853, + "logps/chosen": -160.15809631347656, + "logps/rejected": -352.0977478027344, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.381222724914551, + "rewards/margins": 21.789058685302734, + "rewards/rejected": -28.170278549194336, + "step": 6540 + }, + { + "epoch": 2.99, + "learning_rate": 1.944209636517329e-09, + "logits/chosen": -1.5096580982208252, + "logits/rejected": -1.2079827785491943, + "logps/chosen": -148.11973571777344, + "logps/rejected": -359.211181640625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.379202365875244, + "rewards/margins": 23.468761444091797, + "rewards/rejected": -28.847965240478516, + "step": 6550 + }, + { + "epoch": 2.99, + "learning_rate": 1.0989010989010988e-09, + "logits/chosen": -1.4669349193572998, + "logits/rejected": -1.241938591003418, + "logps/chosen": -158.7514190673828, + "logps/rejected": -364.5976257324219, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.362316131591797, + "rewards/margins": 22.672849655151367, + "rewards/rejected": -29.035167694091797, + "step": 6560 + }, + { + "epoch": 3.0, + "learning_rate": 2.53592561284869e-10, + "logits/chosen": -1.4910657405853271, + "logits/rejected": -1.1942976713180542, + "logps/chosen": -152.2435760498047, + "logps/rejected": -353.9536437988281, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.943681716918945, + "rewards/margins": 22.530424118041992, + "rewards/rejected": -28.474105834960938, + "step": 6570 + }, + { + "epoch": 3.0, + "step": 6573, + "total_flos": 0.0, + "train_loss": 0.026962732661432107, + "train_runtime": 14195.2291, + "train_samples_per_second": 29.63, + "train_steps_per_second": 0.463 + } + ], + "logging_steps": 10, + "max_steps": 6573, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}