{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.952662721893491, "eval_steps": 1, "global_step": 672, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011834319526627219, "grad_norm": 55.38106439710106, "learning_rate": 7.352941176470588e-09, "logits/chosen": -0.587167501449585, "logits/rejected": -0.6672874093055725, "logps/chosen": -39.686065673828125, "logps/rejected": -46.94537353515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.023668639053254437, "grad_norm": 54.65079993565704, "learning_rate": 1.4705882352941176e-08, "logits/chosen": -0.3381628394126892, "logits/rejected": -0.2981947958469391, "logps/chosen": -38.55506134033203, "logps/rejected": -47.09852600097656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.03550295857988166, "grad_norm": 54.205478847015584, "learning_rate": 2.2058823529411764e-08, "logits/chosen": -0.47191303968429565, "logits/rejected": -0.5924246311187744, "logps/chosen": -36.32940673828125, "logps/rejected": -37.75663375854492, "loss": 0.6975, "rewards/accuracies": 0.625, "rewards/chosen": -0.004663002677261829, "rewards/margins": 0.020077597349882126, "rewards/rejected": -0.02474059723317623, "step": 3 }, { "epoch": 0.047337278106508875, "grad_norm": 53.90641855823955, "learning_rate": 2.941176470588235e-08, "logits/chosen": -0.715237021446228, "logits/rejected": -0.8035542964935303, "logps/chosen": -39.215999603271484, "logps/rejected": -47.370750427246094, "loss": 0.6839, "rewards/accuracies": 0.5, "rewards/chosen": -0.01616680435836315, "rewards/margins": 0.024854015558958054, "rewards/rejected": -0.04102082550525665, "step": 4 }, { "epoch": 0.05917159763313609, "grad_norm": 52.25558555929606, "learning_rate": 3.676470588235294e-08, "logits/chosen": -0.8661510944366455, "logits/rejected": -0.7564424276351929, "logps/chosen": -46.795005798339844, "logps/rejected": -44.85298538208008, "loss": 0.6964, "rewards/accuracies": 0.4375, "rewards/chosen": -0.021712014451622963, "rewards/margins": -0.05135791748762131, "rewards/rejected": 0.0296458937227726, "step": 5 }, { "epoch": 0.07100591715976332, "grad_norm": 61.52227993189212, "learning_rate": 4.411764705882353e-08, "logits/chosen": -0.6120268106460571, "logits/rejected": -0.5849899053573608, "logps/chosen": -38.418251037597656, "logps/rejected": -42.02568054199219, "loss": 0.7025, "rewards/accuracies": 0.5625, "rewards/chosen": -0.042597055435180664, "rewards/margins": -0.009472893550992012, "rewards/rejected": -0.0331241637468338, "step": 6 }, { "epoch": 0.08284023668639054, "grad_norm": 54.74581505857575, "learning_rate": 5.147058823529411e-08, "logits/chosen": -0.382029265165329, "logits/rejected": -0.3890838623046875, "logps/chosen": -38.0916633605957, "logps/rejected": -48.64350509643555, "loss": 0.6941, "rewards/accuracies": 0.625, "rewards/chosen": -0.0039642686024308205, "rewards/margins": 0.00704039353877306, "rewards/rejected": -0.01100466400384903, "step": 7 }, { "epoch": 0.09467455621301775, "grad_norm": 55.737912792059, "learning_rate": 5.88235294117647e-08, "logits/chosen": -0.4821730852127075, "logits/rejected": -0.497173935174942, "logps/chosen": -41.562705993652344, "logps/rejected": -36.803367614746094, "loss": 0.6874, "rewards/accuracies": 0.4375, "rewards/chosen": -0.02364335022866726, "rewards/margins": -0.013885259628295898, "rewards/rejected": -0.009758088737726212, "step": 8 }, { "epoch": 0.10650887573964497, "grad_norm": 57.095682285700036, "learning_rate": 6.617647058823529e-08, "logits/chosen": -0.3777148723602295, "logits/rejected": -0.5628172755241394, "logps/chosen": -39.834068298339844, "logps/rejected": -40.3427734375, "loss": 0.6969, "rewards/accuracies": 0.5, "rewards/chosen": -0.025170041248202324, "rewards/margins": -0.03208901360630989, "rewards/rejected": 0.006918976083397865, "step": 9 }, { "epoch": 0.11834319526627218, "grad_norm": 54.157479939562684, "learning_rate": 7.352941176470588e-08, "logits/chosen": -0.7878235578536987, "logits/rejected": -0.9192472696304321, "logps/chosen": -42.91743469238281, "logps/rejected": -41.28839874267578, "loss": 0.6933, "rewards/accuracies": 0.5, "rewards/chosen": -0.03524742275476456, "rewards/margins": 0.016644442453980446, "rewards/rejected": -0.051891863346099854, "step": 10 }, { "epoch": 0.1301775147928994, "grad_norm": 57.291604252818274, "learning_rate": 8.088235294117647e-08, "logits/chosen": -0.8743460774421692, "logits/rejected": -0.8040152788162231, "logps/chosen": -46.50861358642578, "logps/rejected": -48.608097076416016, "loss": 0.6911, "rewards/accuracies": 0.8125, "rewards/chosen": 0.04767334461212158, "rewards/margins": 0.07948525249958038, "rewards/rejected": -0.0318119041621685, "step": 11 }, { "epoch": 0.14201183431952663, "grad_norm": 54.33703237230041, "learning_rate": 8.823529411764706e-08, "logits/chosen": -0.7734875679016113, "logits/rejected": -0.7457428574562073, "logps/chosen": -39.402000427246094, "logps/rejected": -47.64637756347656, "loss": 0.6986, "rewards/accuracies": 0.5, "rewards/chosen": -0.00954131968319416, "rewards/margins": -0.019432254135608673, "rewards/rejected": 0.009890936315059662, "step": 12 }, { "epoch": 0.15384615384615385, "grad_norm": 52.16368650444846, "learning_rate": 9.558823529411763e-08, "logits/chosen": -0.6576703786849976, "logits/rejected": -0.6208328008651733, "logps/chosen": -38.19804000854492, "logps/rejected": -49.989341735839844, "loss": 0.6868, "rewards/accuracies": 0.5625, "rewards/chosen": 0.007916189730167389, "rewards/margins": 0.02689201757311821, "rewards/rejected": -0.01897583156824112, "step": 13 }, { "epoch": 0.16568047337278108, "grad_norm": 59.87799896888102, "learning_rate": 1.0294117647058822e-07, "logits/chosen": -0.4240450859069824, "logits/rejected": -0.5086762309074402, "logps/chosen": -31.308774948120117, "logps/rejected": -37.28623580932617, "loss": 0.6977, "rewards/accuracies": 0.625, "rewards/chosen": -0.005666483659297228, "rewards/margins": -0.00042060669511556625, "rewards/rejected": -0.005245877429842949, "step": 14 }, { "epoch": 0.17751479289940827, "grad_norm": 51.93559977619766, "learning_rate": 1.1029411764705881e-07, "logits/chosen": -0.6712979674339294, "logits/rejected": -0.40396082401275635, "logps/chosen": -32.445457458496094, "logps/rejected": -51.878196716308594, "loss": 0.6975, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0392225980758667, "rewards/margins": -0.022353485226631165, "rewards/rejected": -0.016869116574525833, "step": 15 }, { "epoch": 0.1893491124260355, "grad_norm": 54.783459080693845, "learning_rate": 1.176470588235294e-07, "logits/chosen": -0.5053819417953491, "logits/rejected": -0.7511165738105774, "logps/chosen": -39.814659118652344, "logps/rejected": -38.76427459716797, "loss": 0.6901, "rewards/accuracies": 0.5625, "rewards/chosen": -0.004495692439377308, "rewards/margins": 0.00818572100251913, "rewards/rejected": -0.01268141157925129, "step": 16 }, { "epoch": 0.20118343195266272, "grad_norm": 50.13716138258182, "learning_rate": 1.25e-07, "logits/chosen": -0.38788411021232605, "logits/rejected": -0.5259600877761841, "logps/chosen": -39.752655029296875, "logps/rejected": -35.34233856201172, "loss": 0.6873, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0529358834028244, "rewards/margins": 0.05223493278026581, "rewards/rejected": 0.0007009506225585938, "step": 17 }, { "epoch": 0.21301775147928995, "grad_norm": 53.56051736985278, "learning_rate": 1.3235294117647057e-07, "logits/chosen": -0.6249361038208008, "logits/rejected": -0.2918693721294403, "logps/chosen": -37.1358528137207, "logps/rejected": -54.117340087890625, "loss": 0.6908, "rewards/accuracies": 0.625, "rewards/chosen": -0.01282811164855957, "rewards/margins": 0.035147905349731445, "rewards/rejected": -0.047976016998291016, "step": 18 }, { "epoch": 0.22485207100591717, "grad_norm": 55.4929889677955, "learning_rate": 1.3970588235294117e-07, "logits/chosen": -0.7626937627792358, "logits/rejected": -0.6267987489700317, "logps/chosen": -34.65644073486328, "logps/rejected": -41.60660934448242, "loss": 0.687, "rewards/accuracies": 0.5, "rewards/chosen": 0.019578397274017334, "rewards/margins": 0.030142582952976227, "rewards/rejected": -0.010564185678958893, "step": 19 }, { "epoch": 0.23668639053254437, "grad_norm": 53.81366773458671, "learning_rate": 1.4705882352941175e-07, "logits/chosen": -0.9788577556610107, "logits/rejected": -0.9218689203262329, "logps/chosen": -35.290775299072266, "logps/rejected": -40.17462158203125, "loss": 0.6949, "rewards/accuracies": 0.5, "rewards/chosen": -0.02336425706744194, "rewards/margins": -0.008133504539728165, "rewards/rejected": -0.015230750665068626, "step": 20 }, { "epoch": 0.2485207100591716, "grad_norm": 51.609039023069855, "learning_rate": 1.5441176470588236e-07, "logits/chosen": -0.7643724083900452, "logits/rejected": -0.8248336911201477, "logps/chosen": -39.32087707519531, "logps/rejected": -44.490169525146484, "loss": 0.6859, "rewards/accuracies": 0.625, "rewards/chosen": 0.023657750338315964, "rewards/margins": 0.06776070594787598, "rewards/rejected": -0.04410295560956001, "step": 21 }, { "epoch": 0.2603550295857988, "grad_norm": 52.42721385787476, "learning_rate": 1.6176470588235293e-07, "logits/chosen": -0.6979169249534607, "logits/rejected": -0.5965849161148071, "logps/chosen": -35.5829963684082, "logps/rejected": -46.63249206542969, "loss": 0.686, "rewards/accuracies": 0.5625, "rewards/chosen": 0.017513036727905273, "rewards/margins": 0.06794863194227219, "rewards/rejected": -0.05043559521436691, "step": 22 }, { "epoch": 0.27218934911242604, "grad_norm": 52.14376613382141, "learning_rate": 1.6911764705882354e-07, "logits/chosen": -0.26172494888305664, "logits/rejected": -0.34854474663734436, "logps/chosen": -43.53734588623047, "logps/rejected": -47.87934494018555, "loss": 0.684, "rewards/accuracies": 0.5, "rewards/chosen": -0.08613558113574982, "rewards/margins": -0.0002110544592142105, "rewards/rejected": -0.08592452853918076, "step": 23 }, { "epoch": 0.28402366863905326, "grad_norm": 50.932647643811045, "learning_rate": 1.764705882352941e-07, "logits/chosen": -0.5405491590499878, "logits/rejected": -0.6527445316314697, "logps/chosen": -41.2894287109375, "logps/rejected": -42.569549560546875, "loss": 0.6868, "rewards/accuracies": 0.6875, "rewards/chosen": -0.017691707238554955, "rewards/margins": 0.07997651398181915, "rewards/rejected": -0.09766822308301926, "step": 24 }, { "epoch": 0.2958579881656805, "grad_norm": 52.98859914460327, "learning_rate": 1.8382352941176472e-07, "logits/chosen": -0.8704635500907898, "logits/rejected": -0.8379695415496826, "logps/chosen": -28.815597534179688, "logps/rejected": -40.756103515625, "loss": 0.6874, "rewards/accuracies": 0.5625, "rewards/chosen": -0.02325289323925972, "rewards/margins": 0.040268998593091965, "rewards/rejected": -0.06352189183235168, "step": 25 }, { "epoch": 0.3076923076923077, "grad_norm": 48.75103011196724, "learning_rate": 1.9117647058823527e-07, "logits/chosen": -0.7580403089523315, "logits/rejected": -0.5783815979957581, "logps/chosen": -37.27444839477539, "logps/rejected": -50.22969055175781, "loss": 0.679, "rewards/accuracies": 0.6875, "rewards/chosen": -0.04801575839519501, "rewards/margins": 0.08580491691827774, "rewards/rejected": -0.13382068276405334, "step": 26 }, { "epoch": 0.31952662721893493, "grad_norm": 50.933141633314996, "learning_rate": 1.9852941176470587e-07, "logits/chosen": -0.5684085488319397, "logits/rejected": -0.41421839594841003, "logps/chosen": -29.699729919433594, "logps/rejected": -45.36609649658203, "loss": 0.6783, "rewards/accuracies": 0.5, "rewards/chosen": -0.1085064709186554, "rewards/margins": -0.0001874007284641266, "rewards/rejected": -0.10831907391548157, "step": 27 }, { "epoch": 0.33136094674556216, "grad_norm": 51.922779020745416, "learning_rate": 2.0588235294117645e-07, "logits/chosen": -0.6725336909294128, "logits/rejected": -0.6964073777198792, "logps/chosen": -35.67041015625, "logps/rejected": -41.71068572998047, "loss": 0.6749, "rewards/accuracies": 0.6875, "rewards/chosen": -0.059418633580207825, "rewards/margins": 0.05292558670043945, "rewards/rejected": -0.11234420537948608, "step": 28 }, { "epoch": 0.3431952662721893, "grad_norm": 50.897993802285555, "learning_rate": 2.1323529411764705e-07, "logits/chosen": -0.45363789796829224, "logits/rejected": -0.5877288579940796, "logps/chosen": -41.74443817138672, "logps/rejected": -40.014991760253906, "loss": 0.6687, "rewards/accuracies": 0.5, "rewards/chosen": -0.09418225288391113, "rewards/margins": 0.036634661257267, "rewards/rejected": -0.13081692159175873, "step": 29 }, { "epoch": 0.35502958579881655, "grad_norm": 47.17244651587936, "learning_rate": 2.2058823529411763e-07, "logits/chosen": -0.2768702805042267, "logits/rejected": -0.2787284255027771, "logps/chosen": -42.12376403808594, "logps/rejected": -51.230594635009766, "loss": 0.6608, "rewards/accuracies": 0.625, "rewards/chosen": -0.11093667894601822, "rewards/margins": 0.1094408631324768, "rewards/rejected": -0.22037753462791443, "step": 30 }, { "epoch": 0.3668639053254438, "grad_norm": 47.9586113592662, "learning_rate": 2.2794117647058823e-07, "logits/chosen": -0.4938819110393524, "logits/rejected": -0.5225380659103394, "logps/chosen": -32.536563873291016, "logps/rejected": -38.28359603881836, "loss": 0.6474, "rewards/accuracies": 0.6875, "rewards/chosen": -0.12956911325454712, "rewards/margins": 0.13196122646331787, "rewards/rejected": -0.261530339717865, "step": 31 }, { "epoch": 0.378698224852071, "grad_norm": 52.2429961485054, "learning_rate": 2.352941176470588e-07, "logits/chosen": -0.7137393951416016, "logits/rejected": -0.7953929305076599, "logps/chosen": -34.03690719604492, "logps/rejected": -42.32886505126953, "loss": 0.6439, "rewards/accuracies": 0.625, "rewards/chosen": -0.13849107921123505, "rewards/margins": 0.14229899644851685, "rewards/rejected": -0.2807900607585907, "step": 32 }, { "epoch": 0.3905325443786982, "grad_norm": 48.070320304458484, "learning_rate": 2.426470588235294e-07, "logits/chosen": -0.709818959236145, "logits/rejected": -0.4917900562286377, "logps/chosen": -32.813262939453125, "logps/rejected": -52.584774017333984, "loss": 0.6496, "rewards/accuracies": 0.875, "rewards/chosen": -0.10162033885717392, "rewards/margins": 0.21768181025981903, "rewards/rejected": -0.31930214166641235, "step": 33 }, { "epoch": 0.40236686390532544, "grad_norm": 50.501375226344486, "learning_rate": 2.5e-07, "logits/chosen": -1.2210612297058105, "logits/rejected": -1.069713830947876, "logps/chosen": -28.096343994140625, "logps/rejected": -42.77171325683594, "loss": 0.6609, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06562051922082901, "rewards/margins": 0.1742829531431198, "rewards/rejected": -0.23990347981452942, "step": 34 }, { "epoch": 0.41420118343195267, "grad_norm": 50.18256699637281, "learning_rate": 2.5735294117647057e-07, "logits/chosen": -0.5130794048309326, "logits/rejected": -0.5719175934791565, "logps/chosen": -37.741825103759766, "logps/rejected": -45.18247985839844, "loss": 0.6448, "rewards/accuracies": 0.8125, "rewards/chosen": -0.18347424268722534, "rewards/margins": 0.13004478812217712, "rewards/rejected": -0.31351903080940247, "step": 35 }, { "epoch": 0.4260355029585799, "grad_norm": 49.643545907741306, "learning_rate": 2.6470588235294114e-07, "logits/chosen": -0.9145612716674805, "logits/rejected": -0.8272578120231628, "logps/chosen": -34.39950180053711, "logps/rejected": -54.22731018066406, "loss": 0.6389, "rewards/accuracies": 0.625, "rewards/chosen": -0.22830158472061157, "rewards/margins": 0.19368687272071838, "rewards/rejected": -0.42198845744132996, "step": 36 }, { "epoch": 0.4378698224852071, "grad_norm": 46.37858153447604, "learning_rate": 2.720588235294117e-07, "logits/chosen": -0.8228033781051636, "logits/rejected": -0.6848031282424927, "logps/chosen": -30.318634033203125, "logps/rejected": -42.5218505859375, "loss": 0.6339, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1820099651813507, "rewards/margins": 0.23087123036384583, "rewards/rejected": -0.41288119554519653, "step": 37 }, { "epoch": 0.44970414201183434, "grad_norm": 47.68646874364111, "learning_rate": 2.7941176470588235e-07, "logits/chosen": -0.41094350814819336, "logits/rejected": -0.457292765378952, "logps/chosen": -40.93132019042969, "logps/rejected": -44.33328628540039, "loss": 0.6219, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2081282138824463, "rewards/margins": 0.17258334159851074, "rewards/rejected": -0.38071155548095703, "step": 38 }, { "epoch": 0.46153846153846156, "grad_norm": 46.99139408792661, "learning_rate": 2.8676470588235293e-07, "logits/chosen": -0.9174866676330566, "logits/rejected": -0.8664580583572388, "logps/chosen": -34.22110366821289, "logps/rejected": -45.25912857055664, "loss": 0.6077, "rewards/accuracies": 0.875, "rewards/chosen": -0.24102823436260223, "rewards/margins": 0.3032039701938629, "rewards/rejected": -0.544232189655304, "step": 39 }, { "epoch": 0.47337278106508873, "grad_norm": 44.92025317341631, "learning_rate": 2.941176470588235e-07, "logits/chosen": -0.5698567628860474, "logits/rejected": -0.6562440395355225, "logps/chosen": -35.16241455078125, "logps/rejected": -43.87369155883789, "loss": 0.6074, "rewards/accuracies": 0.625, "rewards/chosen": -0.3447490930557251, "rewards/margins": 0.1720276027917862, "rewards/rejected": -0.5167766809463501, "step": 40 }, { "epoch": 0.48520710059171596, "grad_norm": 43.61954903949639, "learning_rate": 3.014705882352941e-07, "logits/chosen": -0.9510785341262817, "logits/rejected": -1.0125081539154053, "logps/chosen": -43.066551208496094, "logps/rejected": -50.81156921386719, "loss": 0.5926, "rewards/accuracies": 0.75, "rewards/chosen": -0.42604392766952515, "rewards/margins": 0.2472018599510193, "rewards/rejected": -0.6732457876205444, "step": 41 }, { "epoch": 0.4970414201183432, "grad_norm": 48.79189553867012, "learning_rate": 3.088235294117647e-07, "logits/chosen": -1.0401208400726318, "logits/rejected": -0.9916763305664062, "logps/chosen": -36.0028076171875, "logps/rejected": -47.314449310302734, "loss": 0.6188, "rewards/accuracies": 0.75, "rewards/chosen": -0.3239697813987732, "rewards/margins": 0.3483524024486542, "rewards/rejected": -0.672322154045105, "step": 42 }, { "epoch": 0.5088757396449705, "grad_norm": 45.29293394827699, "learning_rate": 3.161764705882353e-07, "logits/chosen": -0.6100953817367554, "logits/rejected": -0.5119140148162842, "logps/chosen": -36.110836029052734, "logps/rejected": -55.10944366455078, "loss": 0.5727, "rewards/accuracies": 0.8125, "rewards/chosen": -0.42954936623573303, "rewards/margins": 0.548741340637207, "rewards/rejected": -0.9782906770706177, "step": 43 }, { "epoch": 0.5207100591715976, "grad_norm": 43.75505094492696, "learning_rate": 3.2352941176470586e-07, "logits/chosen": -0.5729663372039795, "logits/rejected": -0.4085710644721985, "logps/chosen": -37.87840270996094, "logps/rejected": -50.89206314086914, "loss": 0.6021, "rewards/accuracies": 0.5, "rewards/chosen": -0.3693719506263733, "rewards/margins": 0.343736857175827, "rewards/rejected": -0.7131087779998779, "step": 44 }, { "epoch": 0.5325443786982249, "grad_norm": 46.651471884498214, "learning_rate": 3.3088235294117644e-07, "logits/chosen": -0.5654891133308411, "logits/rejected": -0.689947783946991, "logps/chosen": -41.97024154663086, "logps/rejected": -46.030967712402344, "loss": 0.573, "rewards/accuracies": 0.75, "rewards/chosen": -0.35547372698783875, "rewards/margins": 0.4130653738975525, "rewards/rejected": -0.7685391902923584, "step": 45 }, { "epoch": 0.5443786982248521, "grad_norm": 43.74968458385381, "learning_rate": 3.3823529411764707e-07, "logits/chosen": -0.6354714632034302, "logits/rejected": -0.7248853445053101, "logps/chosen": -34.041595458984375, "logps/rejected": -39.55751037597656, "loss": 0.5944, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3268294930458069, "rewards/margins": 0.2843489646911621, "rewards/rejected": -0.611178457736969, "step": 46 }, { "epoch": 0.5562130177514792, "grad_norm": 44.663170642243635, "learning_rate": 3.4558823529411765e-07, "logits/chosen": -0.43914204835891724, "logits/rejected": -0.41449761390686035, "logps/chosen": -32.82707595825195, "logps/rejected": -37.95698547363281, "loss": 0.5942, "rewards/accuracies": 0.625, "rewards/chosen": -0.3479520082473755, "rewards/margins": 0.3092345595359802, "rewards/rejected": -0.6571865677833557, "step": 47 }, { "epoch": 0.5680473372781065, "grad_norm": 43.36900384363229, "learning_rate": 3.529411764705882e-07, "logits/chosen": -0.6816117167472839, "logits/rejected": -0.6095008254051208, "logps/chosen": -33.6269416809082, "logps/rejected": -47.18854522705078, "loss": 0.5445, "rewards/accuracies": 0.8125, "rewards/chosen": -0.40539008378982544, "rewards/margins": 0.5039654970169067, "rewards/rejected": -0.909355640411377, "step": 48 }, { "epoch": 0.5798816568047337, "grad_norm": 44.70080178640567, "learning_rate": 3.602941176470588e-07, "logits/chosen": -0.49769917130470276, "logits/rejected": -0.33341550827026367, "logps/chosen": -37.26922607421875, "logps/rejected": -45.382469177246094, "loss": 0.5435, "rewards/accuracies": 0.6875, "rewards/chosen": -0.41994184255599976, "rewards/margins": 0.42326265573501587, "rewards/rejected": -0.8432044982910156, "step": 49 }, { "epoch": 0.591715976331361, "grad_norm": 42.832062592121694, "learning_rate": 3.6764705882352943e-07, "logits/chosen": -0.6227316856384277, "logits/rejected": -0.6291834712028503, "logps/chosen": -37.443336486816406, "logps/rejected": -56.363712310791016, "loss": 0.564, "rewards/accuracies": 0.875, "rewards/chosen": -0.3350003659725189, "rewards/margins": 0.5508270263671875, "rewards/rejected": -0.8858274817466736, "step": 50 }, { "epoch": 0.6035502958579881, "grad_norm": 40.537726641128835, "learning_rate": 3.75e-07, "logits/chosen": -0.5767950415611267, "logits/rejected": -0.40233951807022095, "logps/chosen": -34.15302658081055, "logps/rejected": -41.203514099121094, "loss": 0.5447, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4783581495285034, "rewards/margins": 0.36022478342056274, "rewards/rejected": -0.8385828733444214, "step": 51 }, { "epoch": 0.6153846153846154, "grad_norm": 37.3320312919307, "learning_rate": 3.8235294117647053e-07, "logits/chosen": -0.33203282952308655, "logits/rejected": -0.39482319355010986, "logps/chosen": -35.42144012451172, "logps/rejected": -38.94422149658203, "loss": 0.52, "rewards/accuracies": 0.75, "rewards/chosen": -0.3685012459754944, "rewards/margins": 0.5317042469978333, "rewards/rejected": -0.9002054929733276, "step": 52 }, { "epoch": 0.6272189349112426, "grad_norm": 44.962671519597414, "learning_rate": 3.8970588235294116e-07, "logits/chosen": -0.7768542766571045, "logits/rejected": -0.7662684917449951, "logps/chosen": -40.90711975097656, "logps/rejected": -43.82652282714844, "loss": 0.5597, "rewards/accuracies": 0.75, "rewards/chosen": -0.5888789892196655, "rewards/margins": 0.7121487259864807, "rewards/rejected": -1.3010276556015015, "step": 53 }, { "epoch": 0.6390532544378699, "grad_norm": 40.11509893356046, "learning_rate": 3.9705882352941174e-07, "logits/chosen": -0.44021129608154297, "logits/rejected": -0.33102425932884216, "logps/chosen": -35.29465103149414, "logps/rejected": -48.99385070800781, "loss": 0.4633, "rewards/accuracies": 0.75, "rewards/chosen": -0.4893972873687744, "rewards/margins": 0.4504549503326416, "rewards/rejected": -0.939852237701416, "step": 54 }, { "epoch": 0.650887573964497, "grad_norm": 43.113444451145575, "learning_rate": 4.044117647058823e-07, "logits/chosen": -0.546736478805542, "logits/rejected": -0.6314712762832642, "logps/chosen": -43.929542541503906, "logps/rejected": -49.84693908691406, "loss": 0.5246, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6373113393783569, "rewards/margins": 0.9987615346908569, "rewards/rejected": -1.6360729932785034, "step": 55 }, { "epoch": 0.6627218934911243, "grad_norm": 44.49441569027861, "learning_rate": 4.117647058823529e-07, "logits/chosen": -0.5550782084465027, "logits/rejected": -0.5997810959815979, "logps/chosen": -39.631263732910156, "logps/rejected": -41.65299987792969, "loss": 0.5692, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5492139458656311, "rewards/margins": 0.6075623631477356, "rewards/rejected": -1.1567763090133667, "step": 56 }, { "epoch": 0.6745562130177515, "grad_norm": 49.99500479519036, "learning_rate": 4.191176470588235e-07, "logits/chosen": -0.7147572040557861, "logits/rejected": -0.809965193271637, "logps/chosen": -39.72154998779297, "logps/rejected": -44.34455871582031, "loss": 0.5461, "rewards/accuracies": 0.75, "rewards/chosen": -0.4619465172290802, "rewards/margins": 0.43853867053985596, "rewards/rejected": -0.9004851579666138, "step": 57 }, { "epoch": 0.6863905325443787, "grad_norm": 41.686424338259535, "learning_rate": 4.264705882352941e-07, "logits/chosen": -0.4276542663574219, "logits/rejected": -0.6359944939613342, "logps/chosen": -38.81695556640625, "logps/rejected": -44.787322998046875, "loss": 0.4877, "rewards/accuracies": 0.875, "rewards/chosen": -0.31248077750205994, "rewards/margins": 0.9359371662139893, "rewards/rejected": -1.248417854309082, "step": 58 }, { "epoch": 0.6982248520710059, "grad_norm": 39.36173522050298, "learning_rate": 4.338235294117647e-07, "logits/chosen": -0.6473718285560608, "logits/rejected": -0.7196489572525024, "logps/chosen": -39.105743408203125, "logps/rejected": -43.301734924316406, "loss": 0.4509, "rewards/accuracies": 0.75, "rewards/chosen": -0.5941877365112305, "rewards/margins": 0.8893916010856628, "rewards/rejected": -1.483579397201538, "step": 59 }, { "epoch": 0.7100591715976331, "grad_norm": 37.06656262082701, "learning_rate": 4.4117647058823526e-07, "logits/chosen": -0.4076049029827118, "logits/rejected": -0.20640525221824646, "logps/chosen": -32.933773040771484, "logps/rejected": -50.366722106933594, "loss": 0.4341, "rewards/accuracies": 0.875, "rewards/chosen": -0.39851048588752747, "rewards/margins": 0.7430359125137329, "rewards/rejected": -1.141546368598938, "step": 60 }, { "epoch": 0.7218934911242604, "grad_norm": 36.19815173586054, "learning_rate": 4.485294117647059e-07, "logits/chosen": -0.7895621061325073, "logits/rejected": -0.8529470562934875, "logps/chosen": -41.30886459350586, "logps/rejected": -52.98210906982422, "loss": 0.4463, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4878369867801666, "rewards/margins": 1.4172559976577759, "rewards/rejected": -1.9050930738449097, "step": 61 }, { "epoch": 0.7337278106508875, "grad_norm": 37.646252584686685, "learning_rate": 4.5588235294117646e-07, "logits/chosen": -0.46224623918533325, "logits/rejected": -0.5132017731666565, "logps/chosen": -35.61504364013672, "logps/rejected": -45.02849578857422, "loss": 0.446, "rewards/accuracies": 0.75, "rewards/chosen": -0.31568989157676697, "rewards/margins": 1.0938689708709717, "rewards/rejected": -1.409558892250061, "step": 62 }, { "epoch": 0.7455621301775148, "grad_norm": 40.99920932267914, "learning_rate": 4.6323529411764704e-07, "logits/chosen": -0.24282173812389374, "logits/rejected": -0.41654685139656067, "logps/chosen": -40.87584686279297, "logps/rejected": -45.074466705322266, "loss": 0.4864, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5393080711364746, "rewards/margins": 0.729945182800293, "rewards/rejected": -1.2692532539367676, "step": 63 }, { "epoch": 0.757396449704142, "grad_norm": 39.08132618780929, "learning_rate": 4.705882352941176e-07, "logits/chosen": -0.7257353067398071, "logits/rejected": -0.6613651514053345, "logps/chosen": -31.330947875976562, "logps/rejected": -46.061580657958984, "loss": 0.431, "rewards/accuracies": 0.75, "rewards/chosen": -0.2516774535179138, "rewards/margins": 1.220921277999878, "rewards/rejected": -1.472598910331726, "step": 64 }, { "epoch": 0.7692307692307693, "grad_norm": 41.475907121673465, "learning_rate": 4.779411764705882e-07, "logits/chosen": -0.609380841255188, "logits/rejected": -0.6923696994781494, "logps/chosen": -42.626522064208984, "logps/rejected": -47.41869354248047, "loss": 0.4345, "rewards/accuracies": 0.625, "rewards/chosen": -0.4935339689254761, "rewards/margins": 0.8159966468811035, "rewards/rejected": -1.30953049659729, "step": 65 }, { "epoch": 0.7810650887573964, "grad_norm": 40.932537298379174, "learning_rate": 4.852941176470588e-07, "logits/chosen": -1.1143670082092285, "logits/rejected": -1.1686656475067139, "logps/chosen": -49.556392669677734, "logps/rejected": -38.85927200317383, "loss": 0.4568, "rewards/accuracies": 0.75, "rewards/chosen": -0.46525275707244873, "rewards/margins": 0.4739494323730469, "rewards/rejected": -0.9392022490501404, "step": 66 }, { "epoch": 0.7928994082840237, "grad_norm": 36.93572313017648, "learning_rate": 4.926470588235295e-07, "logits/chosen": -0.659454882144928, "logits/rejected": -0.7761635184288025, "logps/chosen": -36.797096252441406, "logps/rejected": -46.951560974121094, "loss": 0.4149, "rewards/accuracies": 0.875, "rewards/chosen": -0.47968119382858276, "rewards/margins": 1.5291297435760498, "rewards/rejected": -2.0088109970092773, "step": 67 }, { "epoch": 0.8047337278106509, "grad_norm": 44.10212062947347, "learning_rate": 5e-07, "logits/chosen": -0.22745545208454132, "logits/rejected": -0.3337535858154297, "logps/chosen": -43.72065734863281, "logps/rejected": -46.58799743652344, "loss": 0.451, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11050184071063995, "rewards/margins": 1.0298746824264526, "rewards/rejected": -1.1403765678405762, "step": 68 }, { "epoch": 0.8165680473372781, "grad_norm": 41.21338166081281, "learning_rate": 4.999966183013662e-07, "logits/chosen": -0.5445861220359802, "logits/rejected": -0.8019598126411438, "logps/chosen": -41.4810791015625, "logps/rejected": -36.41514587402344, "loss": 0.4593, "rewards/accuracies": 0.625, "rewards/chosen": -0.505042314529419, "rewards/margins": 0.4535430669784546, "rewards/rejected": -0.9585853815078735, "step": 69 }, { "epoch": 0.8284023668639053, "grad_norm": 35.095204565269476, "learning_rate": 4.999864732969518e-07, "logits/chosen": -0.7983517050743103, "logits/rejected": -0.7316067814826965, "logps/chosen": -34.78556442260742, "logps/rejected": -48.34429931640625, "loss": 0.409, "rewards/accuracies": 0.75, "rewards/chosen": -0.520796537399292, "rewards/margins": 1.445544958114624, "rewards/rejected": -1.966341495513916, "step": 70 }, { "epoch": 0.8402366863905325, "grad_norm": 40.30265829692534, "learning_rate": 4.999695652612155e-07, "logits/chosen": -0.9361473917961121, "logits/rejected": -1.0400917530059814, "logps/chosen": -34.00445556640625, "logps/rejected": -45.20707702636719, "loss": 0.4569, "rewards/accuracies": 0.875, "rewards/chosen": -0.12054769694805145, "rewards/margins": 1.341543197631836, "rewards/rejected": -1.4620908498764038, "step": 71 }, { "epoch": 0.8520710059171598, "grad_norm": 40.415075736728824, "learning_rate": 4.999458946515807e-07, "logits/chosen": -0.5303232669830322, "logits/rejected": -0.6230794191360474, "logps/chosen": -39.0274658203125, "logps/rejected": -45.97068786621094, "loss": 0.4644, "rewards/accuracies": 0.6875, "rewards/chosen": -0.48386603593826294, "rewards/margins": 0.9363454580307007, "rewards/rejected": -1.4202115535736084, "step": 72 }, { "epoch": 0.863905325443787, "grad_norm": 38.77891075432772, "learning_rate": 4.999154621084221e-07, "logits/chosen": -0.6193030476570129, "logits/rejected": -0.6146333813667297, "logps/chosen": -43.54295349121094, "logps/rejected": -48.63895034790039, "loss": 0.4151, "rewards/accuracies": 0.8125, "rewards/chosen": -0.49407535791397095, "rewards/margins": 1.051137924194336, "rewards/rejected": -1.5452133417129517, "step": 73 }, { "epoch": 0.8757396449704142, "grad_norm": 37.77568612282111, "learning_rate": 4.998782684550491e-07, "logits/chosen": -0.36244261264801025, "logits/rejected": -0.44385403394699097, "logps/chosen": -42.133628845214844, "logps/rejected": -43.378028869628906, "loss": 0.4466, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2831823229789734, "rewards/margins": 0.7465813159942627, "rewards/rejected": -1.0297636985778809, "step": 74 }, { "epoch": 0.8875739644970414, "grad_norm": 41.66281833701898, "learning_rate": 4.998343146976837e-07, "logits/chosen": -0.9405574202537537, "logits/rejected": -0.9447466135025024, "logps/chosen": -37.16999053955078, "logps/rejected": -50.3101921081543, "loss": 0.4843, "rewards/accuracies": 0.875, "rewards/chosen": -0.34810084104537964, "rewards/margins": 1.4797228574752808, "rewards/rejected": -1.8278236389160156, "step": 75 }, { "epoch": 0.8994082840236687, "grad_norm": 41.38364147290228, "learning_rate": 4.997836020254328e-07, "logits/chosen": -0.760882556438446, "logits/rejected": -0.8380026817321777, "logps/chosen": -32.650291442871094, "logps/rejected": -44.52061462402344, "loss": 0.4716, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0004928633570671082, "rewards/margins": 1.002596378326416, "rewards/rejected": -1.003089189529419, "step": 76 }, { "epoch": 0.9112426035502958, "grad_norm": 37.663962046996495, "learning_rate": 4.99726131810256e-07, "logits/chosen": -0.6014434099197388, "logits/rejected": -0.8074021935462952, "logps/chosen": -40.66423797607422, "logps/rejected": -30.966102600097656, "loss": 0.414, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06758510321378708, "rewards/margins": 0.7224689722061157, "rewards/rejected": -0.7900540232658386, "step": 77 }, { "epoch": 0.9230769230769231, "grad_norm": 35.62116536902, "learning_rate": 4.996619056069291e-07, "logits/chosen": -0.8005455732345581, "logits/rejected": -1.0737569332122803, "logps/chosen": -47.74522399902344, "logps/rejected": -40.22245788574219, "loss": 0.3874, "rewards/accuracies": 0.625, "rewards/chosen": 0.11318114399909973, "rewards/margins": 0.9375712275505066, "rewards/rejected": -0.8243900537490845, "step": 78 }, { "epoch": 0.9349112426035503, "grad_norm": 41.14929411833454, "learning_rate": 4.995909251530013e-07, "logits/chosen": -0.764075517654419, "logits/rejected": -0.7951244115829468, "logps/chosen": -37.42706298828125, "logps/rejected": -49.54686737060547, "loss": 0.4372, "rewards/accuracies": 0.75, "rewards/chosen": -0.2584840655326843, "rewards/margins": 1.3500514030456543, "rewards/rejected": -1.6085355281829834, "step": 79 }, { "epoch": 0.9467455621301775, "grad_norm": 37.43972651882339, "learning_rate": 4.995131923687487e-07, "logits/chosen": -0.6362431049346924, "logits/rejected": -0.4747117757797241, "logps/chosen": -37.49502944946289, "logps/rejected": -55.17448806762695, "loss": 0.3999, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2551206946372986, "rewards/margins": 0.9605333209037781, "rewards/rejected": -1.2156541347503662, "step": 80 }, { "epoch": 0.9585798816568047, "grad_norm": 43.94031843678492, "learning_rate": 4.994287093571221e-07, "logits/chosen": -0.7600383758544922, "logits/rejected": -0.537135660648346, "logps/chosen": -39.631553649902344, "logps/rejected": -59.48577880859375, "loss": 0.4709, "rewards/accuracies": 0.8125, "rewards/chosen": -0.30705738067626953, "rewards/margins": 1.0295747518539429, "rewards/rejected": -1.3366321325302124, "step": 81 }, { "epoch": 0.9704142011834319, "grad_norm": 43.25344667732382, "learning_rate": 4.993374784036901e-07, "logits/chosen": -0.8365252017974854, "logits/rejected": -0.7885805368423462, "logps/chosen": -38.859161376953125, "logps/rejected": -54.36302947998047, "loss": 0.3922, "rewards/accuracies": 0.875, "rewards/chosen": -0.3023694157600403, "rewards/margins": 2.4610185623168945, "rewards/rejected": -2.763388156890869, "step": 82 }, { "epoch": 0.9822485207100592, "grad_norm": 37.685612874220524, "learning_rate": 4.992395019765775e-07, "logits/chosen": -0.6076855659484863, "logits/rejected": -0.6970850229263306, "logps/chosen": -37.532691955566406, "logps/rejected": -53.90351486206055, "loss": 0.3857, "rewards/accuracies": 0.75, "rewards/chosen": -0.5352523326873779, "rewards/margins": 1.9930164813995361, "rewards/rejected": -2.528268814086914, "step": 83 }, { "epoch": 0.9940828402366864, "grad_norm": 36.826262132209905, "learning_rate": 4.991347827263982e-07, "logits/chosen": -0.8733373880386353, "logits/rejected": -0.7132407426834106, "logps/chosen": -33.2780647277832, "logps/rejected": -47.66956329345703, "loss": 0.4261, "rewards/accuracies": 0.75, "rewards/chosen": 0.13416826725006104, "rewards/margins": 1.7117326259613037, "rewards/rejected": -1.5775643587112427, "step": 84 }, { "epoch": 1.0059171597633136, "grad_norm": 40.77876491907824, "learning_rate": 4.990233234861839e-07, "logits/chosen": -0.6671120524406433, "logits/rejected": -0.961428701877594, "logps/chosen": -37.66517639160156, "logps/rejected": -42.085426330566406, "loss": 0.375, "rewards/accuracies": 1.0, "rewards/chosen": -0.03282582014799118, "rewards/margins": 2.404395580291748, "rewards/rejected": -2.4372215270996094, "step": 85 }, { "epoch": 1.017751479289941, "grad_norm": 35.343447731733626, "learning_rate": 4.989051272713069e-07, "logits/chosen": -0.9536569118499756, "logits/rejected": -1.1731913089752197, "logps/chosen": -35.637535095214844, "logps/rejected": -42.364036560058594, "loss": 0.3999, "rewards/accuracies": 0.75, "rewards/chosen": -0.12619183957576752, "rewards/margins": 2.5188794136047363, "rewards/rejected": -2.645071268081665, "step": 86 }, { "epoch": 1.029585798816568, "grad_norm": 33.223106342915955, "learning_rate": 4.987801972793993e-07, "logits/chosen": -0.7712712287902832, "logits/rejected": -0.9531198740005493, "logps/chosen": -45.71641540527344, "logps/rejected": -58.85322952270508, "loss": 0.3053, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2449014037847519, "rewards/margins": 3.286306142807007, "rewards/rejected": -3.0414042472839355, "step": 87 }, { "epoch": 1.0414201183431953, "grad_norm": 35.64530002995125, "learning_rate": 4.986485368902656e-07, "logits/chosen": -0.6423381567001343, "logits/rejected": -0.6168457865715027, "logps/chosen": -31.16773223876953, "logps/rejected": -45.484893798828125, "loss": 0.3375, "rewards/accuracies": 0.75, "rewards/chosen": -0.22324074804782867, "rewards/margins": 1.2495863437652588, "rewards/rejected": -1.4728271961212158, "step": 88 }, { "epoch": 1.0532544378698225, "grad_norm": 37.20259605794752, "learning_rate": 4.985101496657918e-07, "logits/chosen": -1.1634238958358765, "logits/rejected": -1.103615164756775, "logps/chosen": -41.237342834472656, "logps/rejected": -62.546531677246094, "loss": 0.344, "rewards/accuracies": 0.875, "rewards/chosen": -0.3175942599773407, "rewards/margins": 2.2205255031585693, "rewards/rejected": -2.5381197929382324, "step": 89 }, { "epoch": 1.0650887573964498, "grad_norm": 32.598385557495284, "learning_rate": 4.983650393498489e-07, "logits/chosen": -0.922864556312561, "logits/rejected": -0.8289706707000732, "logps/chosen": -29.635332107543945, "logps/rejected": -45.355323791503906, "loss": 0.3229, "rewards/accuracies": 0.875, "rewards/chosen": -0.7630589008331299, "rewards/margins": 1.704155683517456, "rewards/rejected": -2.467214584350586, "step": 90 }, { "epoch": 1.0769230769230769, "grad_norm": 38.26125209425027, "learning_rate": 4.982132098681923e-07, "logits/chosen": -1.0017441511154175, "logits/rejected": -0.9490557909011841, "logps/chosen": -40.40355682373047, "logps/rejected": -53.06212615966797, "loss": 0.3536, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7460950016975403, "rewards/margins": 2.146270513534546, "rewards/rejected": -2.8923654556274414, "step": 91 }, { "epoch": 1.0887573964497042, "grad_norm": 35.48709030046864, "learning_rate": 4.980546653283537e-07, "logits/chosen": -0.8610115051269531, "logits/rejected": -0.8497661352157593, "logps/chosen": -33.734352111816406, "logps/rejected": -51.21432113647461, "loss": 0.3493, "rewards/accuracies": 0.875, "rewards/chosen": 0.08930137753486633, "rewards/margins": 2.130716562271118, "rewards/rejected": -2.041415214538574, "step": 92 }, { "epoch": 1.1005917159763314, "grad_norm": 33.05555278643355, "learning_rate": 4.978894100195324e-07, "logits/chosen": -0.7339059114456177, "logits/rejected": -0.7007895112037659, "logps/chosen": -43.019500732421875, "logps/rejected": -54.64815902709961, "loss": 0.2908, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5389220118522644, "rewards/margins": 2.059274673461914, "rewards/rejected": -2.5981967449188232, "step": 93 }, { "epoch": 1.1124260355029585, "grad_norm": 32.64806139806569, "learning_rate": 4.977174484124775e-07, "logits/chosen": -0.6110660433769226, "logits/rejected": -0.6840114593505859, "logps/chosen": -39.90541458129883, "logps/rejected": -41.75571060180664, "loss": 0.322, "rewards/accuracies": 0.625, "rewards/chosen": -0.3695860803127289, "rewards/margins": 1.257794737815857, "rewards/rejected": -1.6273807287216187, "step": 94 }, { "epoch": 1.1242603550295858, "grad_norm": 28.73456224275126, "learning_rate": 4.975387851593676e-07, "logits/chosen": -1.0169286727905273, "logits/rejected": -0.9107025861740112, "logps/chosen": -38.64891052246094, "logps/rejected": -54.81282043457031, "loss": 0.3026, "rewards/accuracies": 0.9375, "rewards/chosen": -0.09714408218860626, "rewards/margins": 1.6703901290893555, "rewards/rejected": -1.7675341367721558, "step": 95 }, { "epoch": 1.136094674556213, "grad_norm": 40.46066004537338, "learning_rate": 4.97353425093685e-07, "logits/chosen": -0.5108687877655029, "logits/rejected": -0.5603746175765991, "logps/chosen": -38.765052795410156, "logps/rejected": -44.74797058105469, "loss": 0.3836, "rewards/accuracies": 0.75, "rewards/chosen": -0.35121989250183105, "rewards/margins": 1.2221122980117798, "rewards/rejected": -1.5733323097229004, "step": 96 }, { "epoch": 1.1479289940828403, "grad_norm": 33.115171654161905, "learning_rate": 4.971613732300848e-07, "logits/chosen": -0.5614181756973267, "logits/rejected": -0.44806018471717834, "logps/chosen": -34.740657806396484, "logps/rejected": -51.19026184082031, "loss": 0.3033, "rewards/accuracies": 1.0, "rewards/chosen": -0.37993931770324707, "rewards/margins": 2.2012622356414795, "rewards/rejected": -2.5812015533447266, "step": 97 }, { "epoch": 1.1597633136094674, "grad_norm": 30.314211745480826, "learning_rate": 4.96962634764259e-07, "logits/chosen": -0.6665958762168884, "logits/rejected": -0.9045260548591614, "logps/chosen": -50.40956115722656, "logps/rejected": -52.3349494934082, "loss": 0.2727, "rewards/accuracies": 0.875, "rewards/chosen": -0.28259751200675964, "rewards/margins": 2.2699124813079834, "rewards/rejected": -2.5525100231170654, "step": 98 }, { "epoch": 1.1715976331360947, "grad_norm": 35.25579514857924, "learning_rate": 4.967572150727964e-07, "logits/chosen": -0.6161059737205505, "logits/rejected": -0.619467556476593, "logps/chosen": -41.40134048461914, "logps/rejected": -47.676292419433594, "loss": 0.3182, "rewards/accuracies": 0.8125, "rewards/chosen": -1.002119541168213, "rewards/margins": 1.5968635082244873, "rewards/rejected": -2.598982810974121, "step": 99 }, { "epoch": 1.183431952662722, "grad_norm": 31.87031408729149, "learning_rate": 4.965451197130372e-07, "logits/chosen": -0.6040835380554199, "logits/rejected": -0.6801787614822388, "logps/chosen": -35.400917053222656, "logps/rejected": -46.94811248779297, "loss": 0.3003, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07005947083234787, "rewards/margins": 2.207547903060913, "rewards/rejected": -2.2776074409484863, "step": 100 }, { "epoch": 1.195266272189349, "grad_norm": 34.95313563934469, "learning_rate": 4.963263544229219e-07, "logits/chosen": -0.7886058688163757, "logits/rejected": -0.8541610240936279, "logps/chosen": -37.76707077026367, "logps/rejected": -52.185367584228516, "loss": 0.2959, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6315910816192627, "rewards/margins": 2.7951231002807617, "rewards/rejected": -3.4267144203186035, "step": 101 }, { "epoch": 1.2071005917159763, "grad_norm": 32.38499457275457, "learning_rate": 4.961009251208367e-07, "logits/chosen": -0.9285825490951538, "logits/rejected": -0.9554150700569153, "logps/chosen": -34.80419921875, "logps/rejected": -44.33222961425781, "loss": 0.3286, "rewards/accuracies": 0.9375, "rewards/chosen": -0.12241765856742859, "rewards/margins": 1.6503454446792603, "rewards/rejected": -1.7727631330490112, "step": 102 }, { "epoch": 1.2189349112426036, "grad_norm": 35.88300586588521, "learning_rate": 4.958688379054535e-07, "logits/chosen": -0.717126727104187, "logits/rejected": -0.5765900611877441, "logps/chosen": -32.79739761352539, "logps/rejected": -56.2498893737793, "loss": 0.338, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11334749311208725, "rewards/margins": 2.23618745803833, "rewards/rejected": -2.3495349884033203, "step": 103 }, { "epoch": 1.2307692307692308, "grad_norm": 32.32700164175202, "learning_rate": 4.956300990555643e-07, "logits/chosen": -0.8154267072677612, "logits/rejected": -0.9468405246734619, "logps/chosen": -38.78935623168945, "logps/rejected": -56.32616424560547, "loss": 0.2922, "rewards/accuracies": 1.0, "rewards/chosen": -0.04591973125934601, "rewards/margins": 3.170891761779785, "rewards/rejected": -3.2168116569519043, "step": 104 }, { "epoch": 1.242603550295858, "grad_norm": 31.39200334822418, "learning_rate": 4.953847150299118e-07, "logits/chosen": -0.6333639621734619, "logits/rejected": -0.5471930503845215, "logps/chosen": -36.49355697631836, "logps/rejected": -41.39311218261719, "loss": 0.3301, "rewards/accuracies": 0.75, "rewards/chosen": -0.8302067518234253, "rewards/margins": 0.9772886037826538, "rewards/rejected": -1.8074952363967896, "step": 105 }, { "epoch": 1.2544378698224852, "grad_norm": 28.680251994430197, "learning_rate": 4.951326924670147e-07, "logits/chosen": -0.7039244771003723, "logits/rejected": -0.6648294925689697, "logps/chosen": -34.96596145629883, "logps/rejected": -54.543174743652344, "loss": 0.2705, "rewards/accuracies": 1.0, "rewards/chosen": -0.4160834550857544, "rewards/margins": 4.053628444671631, "rewards/rejected": -4.469712257385254, "step": 106 }, { "epoch": 1.2662721893491125, "grad_norm": 34.660972787041885, "learning_rate": 4.948740381849879e-07, "logits/chosen": -0.436050683259964, "logits/rejected": -0.49883347749710083, "logps/chosen": -42.65693664550781, "logps/rejected": -48.37257766723633, "loss": 0.2924, "rewards/accuracies": 0.875, "rewards/chosen": -0.3416898846626282, "rewards/margins": 2.6665897369384766, "rewards/rejected": -3.00827956199646, "step": 107 }, { "epoch": 1.2781065088757395, "grad_norm": 35.73754376882297, "learning_rate": 4.94608759181358e-07, "logits/chosen": -1.3829025030136108, "logits/rejected": -1.270089864730835, "logps/chosen": -42.553672790527344, "logps/rejected": -68.03410339355469, "loss": 0.3177, "rewards/accuracies": 0.875, "rewards/chosen": -0.4059150815010071, "rewards/margins": 2.707810878753662, "rewards/rejected": -3.1137256622314453, "step": 108 }, { "epoch": 1.2899408284023668, "grad_norm": 29.030913535413347, "learning_rate": 4.943368626328741e-07, "logits/chosen": -0.7719374895095825, "logits/rejected": -0.8931103348731995, "logps/chosen": -37.83287048339844, "logps/rejected": -55.789669036865234, "loss": 0.2498, "rewards/accuracies": 1.0, "rewards/chosen": -0.5285115838050842, "rewards/margins": 3.6779208183288574, "rewards/rejected": -4.206432819366455, "step": 109 }, { "epoch": 1.301775147928994, "grad_norm": 32.18624732441535, "learning_rate": 4.940583558953137e-07, "logits/chosen": -0.8106911778450012, "logits/rejected": -0.6266003847122192, "logps/chosen": -27.738155364990234, "logps/rejected": -53.347442626953125, "loss": 0.2545, "rewards/accuracies": 1.0, "rewards/chosen": -0.49206268787384033, "rewards/margins": 3.2540321350097656, "rewards/rejected": -3.7460951805114746, "step": 110 }, { "epoch": 1.3136094674556213, "grad_norm": 29.81498338818304, "learning_rate": 4.937732465032838e-07, "logits/chosen": -0.9487054347991943, "logits/rejected": -0.7686504125595093, "logps/chosen": -37.48987579345703, "logps/rejected": -61.35884475708008, "loss": 0.2985, "rewards/accuracies": 0.9375, "rewards/chosen": -0.44103536009788513, "rewards/margins": 2.5024337768554688, "rewards/rejected": -2.9434690475463867, "step": 111 }, { "epoch": 1.3254437869822486, "grad_norm": 31.489716700318738, "learning_rate": 4.934815421700164e-07, "logits/chosen": -0.6628305315971375, "logits/rejected": -0.5205198526382446, "logps/chosen": -38.25563430786133, "logps/rejected": -62.12502670288086, "loss": 0.2841, "rewards/accuracies": 0.875, "rewards/chosen": -0.37647631764411926, "rewards/margins": 2.712799549102783, "rewards/rejected": -3.08927583694458, "step": 112 }, { "epoch": 1.3372781065088757, "grad_norm": 26.849776060745437, "learning_rate": 4.93183250787161e-07, "logits/chosen": -0.8486281633377075, "logits/rejected": -0.6799747347831726, "logps/chosen": -36.62342834472656, "logps/rejected": -51.13971710205078, "loss": 0.2563, "rewards/accuracies": 1.0, "rewards/chosen": -0.19857852160930634, "rewards/margins": 2.1364779472351074, "rewards/rejected": -2.3350563049316406, "step": 113 }, { "epoch": 1.349112426035503, "grad_norm": 30.82600982658151, "learning_rate": 4.928783804245699e-07, "logits/chosen": -0.9438729286193848, "logits/rejected": -1.0174190998077393, "logps/chosen": -40.94575119018555, "logps/rejected": -47.26662826538086, "loss": 0.2841, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4203495681285858, "rewards/margins": 2.5113162994384766, "rewards/rejected": -2.9316658973693848, "step": 114 }, { "epoch": 1.3609467455621302, "grad_norm": 37.63814438854822, "learning_rate": 4.925669393300807e-07, "logits/chosen": -0.5685192942619324, "logits/rejected": -0.6837583184242249, "logps/chosen": -35.69676208496094, "logps/rejected": -42.57113265991211, "loss": 0.3434, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2215765118598938, "rewards/margins": 2.7080235481262207, "rewards/rejected": -2.9296000003814697, "step": 115 }, { "epoch": 1.3727810650887573, "grad_norm": 27.187951013233324, "learning_rate": 4.922489359292927e-07, "logits/chosen": -0.8806582689285278, "logits/rejected": -0.8417138457298279, "logps/chosen": -48.617950439453125, "logps/rejected": -63.693763732910156, "loss": 0.258, "rewards/accuracies": 0.9375, "rewards/chosen": -0.31812140345573425, "rewards/margins": 3.5926265716552734, "rewards/rejected": -3.910747766494751, "step": 116 }, { "epoch": 1.3846153846153846, "grad_norm": 33.37578095279378, "learning_rate": 4.919243788253393e-07, "logits/chosen": -0.6644065380096436, "logits/rejected": -0.8404392004013062, "logps/chosen": -34.850852966308594, "logps/rejected": -44.51880645751953, "loss": 0.3009, "rewards/accuracies": 0.875, "rewards/chosen": -0.36010777950286865, "rewards/margins": 2.36444091796875, "rewards/rejected": -2.724548816680908, "step": 117 }, { "epoch": 1.3964497041420119, "grad_norm": 31.874390557999053, "learning_rate": 4.915932767986551e-07, "logits/chosen": -0.3398258686065674, "logits/rejected": -0.35200411081314087, "logps/chosen": -29.706180572509766, "logps/rejected": -36.340633392333984, "loss": 0.2804, "rewards/accuracies": 0.8125, "rewards/chosen": 0.17402225732803345, "rewards/margins": 1.5369799137115479, "rewards/rejected": -1.3629577159881592, "step": 118 }, { "epoch": 1.4082840236686391, "grad_norm": 31.69981967422697, "learning_rate": 4.912556388067381e-07, "logits/chosen": -0.8166269063949585, "logits/rejected": -0.867180347442627, "logps/chosen": -31.888137817382812, "logps/rejected": -43.59930419921875, "loss": 0.2591, "rewards/accuracies": 0.875, "rewards/chosen": -0.0748639851808548, "rewards/margins": 1.9631797075271606, "rewards/rejected": -2.038043737411499, "step": 119 }, { "epoch": 1.4201183431952662, "grad_norm": 31.59528234355252, "learning_rate": 4.909114739839079e-07, "logits/chosen": -0.6452093124389648, "logits/rejected": -0.5552669763565063, "logps/chosen": -29.506380081176758, "logps/rejected": -50.04014587402344, "loss": 0.2972, "rewards/accuracies": 1.0, "rewards/chosen": 0.0014930292963981628, "rewards/margins": 2.9174089431762695, "rewards/rejected": -2.9159157276153564, "step": 120 }, { "epoch": 1.4319526627218935, "grad_norm": 29.665894281084412, "learning_rate": 4.90560791641058e-07, "logits/chosen": -0.9231401085853577, "logits/rejected": -0.9149960279464722, "logps/chosen": -39.093299865722656, "logps/rejected": -58.45146942138672, "loss": 0.2505, "rewards/accuracies": 0.75, "rewards/chosen": 0.0397074818611145, "rewards/margins": 3.4828078746795654, "rewards/rejected": -3.4431004524230957, "step": 121 }, { "epoch": 1.4437869822485208, "grad_norm": 26.842826013555836, "learning_rate": 4.902036012654048e-07, "logits/chosen": -0.743898332118988, "logits/rejected": -0.85213303565979, "logps/chosen": -37.486961364746094, "logps/rejected": -45.735897064208984, "loss": 0.2211, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2370671182870865, "rewards/margins": 2.456345558166504, "rewards/rejected": -2.219278335571289, "step": 122 }, { "epoch": 1.4556213017751478, "grad_norm": 33.17228544783056, "learning_rate": 4.898399125202295e-07, "logits/chosen": -0.6975520849227905, "logits/rejected": -0.5711613893508911, "logps/chosen": -34.24563217163086, "logps/rejected": -54.74011993408203, "loss": 0.2873, "rewards/accuracies": 0.8125, "rewards/chosen": 0.04336928948760033, "rewards/margins": 2.063955307006836, "rewards/rejected": -2.0205860137939453, "step": 123 }, { "epoch": 1.467455621301775, "grad_norm": 23.661270990308882, "learning_rate": 4.894697352446182e-07, "logits/chosen": -1.0904819965362549, "logits/rejected": -0.9323499798774719, "logps/chosen": -32.2510986328125, "logps/rejected": -49.083892822265625, "loss": 0.2113, "rewards/accuracies": 1.0, "rewards/chosen": -0.38007357716560364, "rewards/margins": 2.8477623462677, "rewards/rejected": -3.2278361320495605, "step": 124 }, { "epoch": 1.4792899408284024, "grad_norm": 34.604687836314845, "learning_rate": 4.890930794531947e-07, "logits/chosen": -0.8184336423873901, "logits/rejected": -0.7664436101913452, "logps/chosen": -33.786197662353516, "logps/rejected": -51.256500244140625, "loss": 0.3319, "rewards/accuracies": 0.8125, "rewards/chosen": -0.057498496025800705, "rewards/margins": 2.2278025150299072, "rewards/rejected": -2.2853007316589355, "step": 125 }, { "epoch": 1.4911242603550297, "grad_norm": 32.152626338805746, "learning_rate": 4.887099553358501e-07, "logits/chosen": -0.557933509349823, "logits/rejected": -0.46904832124710083, "logps/chosen": -32.54315948486328, "logps/rejected": -43.39186477661133, "loss": 0.2547, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3194792866706848, "rewards/margins": 2.087386131286621, "rewards/rejected": -1.767906665802002, "step": 126 }, { "epoch": 1.502958579881657, "grad_norm": 27.486858140339642, "learning_rate": 4.883203732574667e-07, "logits/chosen": -0.6726119518280029, "logits/rejected": -0.9127836227416992, "logps/chosen": -37.70806884765625, "logps/rejected": -49.78596878051758, "loss": 0.2222, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04751332104206085, "rewards/margins": 4.030301094055176, "rewards/rejected": -4.07781457901001, "step": 127 }, { "epoch": 1.514792899408284, "grad_norm": 31.048864646685523, "learning_rate": 4.879243437576383e-07, "logits/chosen": -0.7582688927650452, "logits/rejected": -0.5859317183494568, "logps/chosen": -36.134395599365234, "logps/rejected": -54.693885803222656, "loss": 0.2645, "rewards/accuracies": 0.9375, "rewards/chosen": 0.12503023445606232, "rewards/margins": 2.837843894958496, "rewards/rejected": -2.712813377380371, "step": 128 }, { "epoch": 1.5266272189349113, "grad_norm": 24.406363199351727, "learning_rate": 4.875218775503837e-07, "logits/chosen": -0.8559905290603638, "logits/rejected": -0.5925788879394531, "logps/chosen": -30.623798370361328, "logps/rejected": -54.57696533203125, "loss": 0.1985, "rewards/accuracies": 0.875, "rewards/chosen": -0.09095414727926254, "rewards/margins": 2.6515233516693115, "rewards/rejected": -2.7424774169921875, "step": 129 }, { "epoch": 1.5384615384615383, "grad_norm": 29.935002372070645, "learning_rate": 4.871129855238588e-07, "logits/chosen": -0.6436349749565125, "logits/rejected": -0.6043447852134705, "logps/chosen": -39.27113723754883, "logps/rejected": -53.498077392578125, "loss": 0.2637, "rewards/accuracies": 0.75, "rewards/chosen": -0.1115611121058464, "rewards/margins": 2.6680796146392822, "rewards/rejected": -2.7796406745910645, "step": 130 }, { "epoch": 1.5502958579881656, "grad_norm": 34.141742145577254, "learning_rate": 4.866976787400601e-07, "logits/chosen": -0.6176570653915405, "logits/rejected": -0.6334538459777832, "logps/chosen": -32.522979736328125, "logps/rejected": -41.17702865600586, "loss": 0.3054, "rewards/accuracies": 0.8125, "rewards/chosen": 0.09118051826953888, "rewards/margins": 1.9793744087219238, "rewards/rejected": -1.8881936073303223, "step": 131 }, { "epoch": 1.5621301775147929, "grad_norm": 32.31778466187239, "learning_rate": 4.862759684345269e-07, "logits/chosen": -0.8636021614074707, "logits/rejected": -0.8571298122406006, "logps/chosen": -27.641830444335938, "logps/rejected": -47.43977737426758, "loss": 0.2662, "rewards/accuracies": 0.9375, "rewards/chosen": -0.15791228413581848, "rewards/margins": 3.5890026092529297, "rewards/rejected": -3.746914863586426, "step": 132 }, { "epoch": 1.5739644970414202, "grad_norm": 32.025278441483124, "learning_rate": 4.858478660160363e-07, "logits/chosen": -0.7859846353530884, "logits/rejected": -0.855846643447876, "logps/chosen": -42.2872314453125, "logps/rejected": -59.02558135986328, "loss": 0.2718, "rewards/accuracies": 0.9375, "rewards/chosen": -0.20162326097488403, "rewards/margins": 4.159268856048584, "rewards/rejected": -4.360892295837402, "step": 133 }, { "epoch": 1.5857988165680474, "grad_norm": 31.88463706660638, "learning_rate": 4.854133830662955e-07, "logits/chosen": -1.0088858604431152, "logits/rejected": -1.0345858335494995, "logps/chosen": -36.01057052612305, "logps/rejected": -40.89894104003906, "loss": 0.2697, "rewards/accuracies": 0.8125, "rewards/chosen": -0.32271209359169006, "rewards/margins": 1.9244492053985596, "rewards/rejected": -2.247161388397217, "step": 134 }, { "epoch": 1.5976331360946747, "grad_norm": 32.09028994814745, "learning_rate": 4.849725313396274e-07, "logits/chosen": -0.8029739856719971, "logits/rejected": -0.668880820274353, "logps/chosen": -31.962514877319336, "logps/rejected": -56.866539001464844, "loss": 0.2921, "rewards/accuracies": 0.875, "rewards/chosen": 0.030226286500692368, "rewards/margins": 3.690258026123047, "rewards/rejected": -3.660031795501709, "step": 135 }, { "epoch": 1.6094674556213018, "grad_norm": 25.510937214506086, "learning_rate": 4.845253227626536e-07, "logits/chosen": -1.067764401435852, "logits/rejected": -1.126332402229309, "logps/chosen": -29.248348236083984, "logps/rejected": -38.563045501708984, "loss": 0.221, "rewards/accuracies": 0.875, "rewards/chosen": -0.5202600359916687, "rewards/margins": 2.1755950450897217, "rewards/rejected": -2.695855140686035, "step": 136 }, { "epoch": 1.6213017751479288, "grad_norm": 35.45937914643292, "learning_rate": 4.84071769433971e-07, "logits/chosen": -0.9243749380111694, "logits/rejected": -1.0414302349090576, "logps/chosen": -43.77132797241211, "logps/rejected": -46.58074951171875, "loss": 0.2904, "rewards/accuracies": 0.8125, "rewards/chosen": -0.23414787650108337, "rewards/margins": 2.354924201965332, "rewards/rejected": -2.5890719890594482, "step": 137 }, { "epoch": 1.6331360946745561, "grad_norm": 27.49463867525898, "learning_rate": 4.836118836238252e-07, "logits/chosen": -0.8824262619018555, "logits/rejected": -0.8479146361351013, "logps/chosen": -38.267608642578125, "logps/rejected": -54.601646423339844, "loss": 0.2142, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2675169110298157, "rewards/margins": 3.23266339302063, "rewards/rejected": -3.500180244445801, "step": 138 }, { "epoch": 1.6449704142011834, "grad_norm": 25.03352720952285, "learning_rate": 4.831456777737779e-07, "logits/chosen": -0.87726891040802, "logits/rejected": -0.6373116374015808, "logps/chosen": -37.56227111816406, "logps/rejected": -62.3519287109375, "loss": 0.2154, "rewards/accuracies": 0.875, "rewards/chosen": -0.22444042563438416, "rewards/margins": 2.3676767349243164, "rewards/rejected": -2.5921175479888916, "step": 139 }, { "epoch": 1.6568047337278107, "grad_norm": 30.77350000619414, "learning_rate": 4.826731644963704e-07, "logits/chosen": -0.7138141989707947, "logits/rejected": -0.746364951133728, "logps/chosen": -56.473121643066406, "logps/rejected": -67.74131774902344, "loss": 0.2472, "rewards/accuracies": 0.875, "rewards/chosen": -1.1767007112503052, "rewards/margins": 3.261204719543457, "rewards/rejected": -4.437905311584473, "step": 140 }, { "epoch": 1.668639053254438, "grad_norm": 28.0953853311355, "learning_rate": 4.82194356574783e-07, "logits/chosen": -1.0583032369613647, "logits/rejected": -1.1049402952194214, "logps/chosen": -34.80393981933594, "logps/rejected": -44.68275451660156, "loss": 0.2266, "rewards/accuracies": 1.0, "rewards/chosen": -0.5045955181121826, "rewards/margins": 1.9453740119934082, "rewards/rejected": -2.4499692916870117, "step": 141 }, { "epoch": 1.6804733727810652, "grad_norm": 29.806685142730963, "learning_rate": 4.817092669624882e-07, "logits/chosen": -0.9740761518478394, "logits/rejected": -0.8185986876487732, "logps/chosen": -39.042179107666016, "logps/rejected": -60.931121826171875, "loss": 0.2278, "rewards/accuracies": 0.9375, "rewards/chosen": -0.528903067111969, "rewards/margins": 3.3279364109039307, "rewards/rejected": -3.856839656829834, "step": 142 }, { "epoch": 1.6923076923076923, "grad_norm": 29.649829246510613, "learning_rate": 4.812179087829012e-07, "logits/chosen": -0.6173474788665771, "logits/rejected": -0.5431764125823975, "logps/chosen": -27.147438049316406, "logps/rejected": -43.8935546875, "loss": 0.2404, "rewards/accuracies": 1.0, "rewards/chosen": -0.24039342999458313, "rewards/margins": 2.5034737586975098, "rewards/rejected": -2.7438669204711914, "step": 143 }, { "epoch": 1.7041420118343196, "grad_norm": 30.07302253382677, "learning_rate": 4.807202953290243e-07, "logits/chosen": -0.523086428642273, "logits/rejected": -0.5003029108047485, "logps/chosen": -37.44425582885742, "logps/rejected": -49.62798309326172, "loss": 0.2756, "rewards/accuracies": 0.875, "rewards/chosen": 0.09302498400211334, "rewards/margins": 1.9907047748565674, "rewards/rejected": -1.8976799249649048, "step": 144 }, { "epoch": 1.7159763313609466, "grad_norm": 30.457439377224762, "learning_rate": 4.802164400630872e-07, "logits/chosen": -1.0345890522003174, "logits/rejected": -1.0042998790740967, "logps/chosen": -41.94634246826172, "logps/rejected": -54.282379150390625, "loss": 0.2416, "rewards/accuracies": 1.0, "rewards/chosen": -0.555735170841217, "rewards/margins": 3.464998960494995, "rewards/rejected": -4.020733833312988, "step": 145 }, { "epoch": 1.727810650887574, "grad_norm": 29.365588226716095, "learning_rate": 4.797063566161834e-07, "logits/chosen": -0.5482521653175354, "logits/rejected": -0.8150917291641235, "logps/chosen": -45.255279541015625, "logps/rejected": -44.48121643066406, "loss": 0.2289, "rewards/accuracies": 1.0, "rewards/chosen": -0.004989638924598694, "rewards/margins": 2.8408713340759277, "rewards/rejected": -2.845860719680786, "step": 146 }, { "epoch": 1.7396449704142012, "grad_norm": 23.051187654817465, "learning_rate": 4.791900587879009e-07, "logits/chosen": -1.079697608947754, "logits/rejected": -1.1642544269561768, "logps/chosen": -35.86004638671875, "logps/rejected": -55.203956604003906, "loss": 0.2049, "rewards/accuracies": 1.0, "rewards/chosen": -0.20117734372615814, "rewards/margins": 4.757822513580322, "rewards/rejected": -4.959000110626221, "step": 147 }, { "epoch": 1.7514792899408285, "grad_norm": 23.873165296629157, "learning_rate": 4.786675605459487e-07, "logits/chosen": -0.6171930432319641, "logits/rejected": -0.7254853844642639, "logps/chosen": -37.1898193359375, "logps/rejected": -58.64811706542969, "loss": 0.1909, "rewards/accuracies": 1.0, "rewards/chosen": 0.25659316778182983, "rewards/margins": 4.113015174865723, "rewards/rejected": -3.856421947479248, "step": 148 }, { "epoch": 1.7633136094674557, "grad_norm": 28.365749172580184, "learning_rate": 4.781388760257799e-07, "logits/chosen": -0.6407098174095154, "logits/rejected": -0.5406571626663208, "logps/chosen": -32.85944366455078, "logps/rejected": -45.028846740722656, "loss": 0.2134, "rewards/accuracies": 0.9375, "rewards/chosen": 0.13824699819087982, "rewards/margins": 1.9902299642562866, "rewards/rejected": -1.8519829511642456, "step": 149 }, { "epoch": 1.7751479289940828, "grad_norm": 26.001580942778393, "learning_rate": 4.776040195302079e-07, "logits/chosen": -0.7967870235443115, "logits/rejected": -0.925701916217804, "logps/chosen": -34.21118927001953, "logps/rejected": -47.52817916870117, "loss": 0.1779, "rewards/accuracies": 0.875, "rewards/chosen": 0.07275627553462982, "rewards/margins": 3.335747003555298, "rewards/rejected": -3.262990951538086, "step": 150 }, { "epoch": 1.78698224852071, "grad_norm": 31.836026789786995, "learning_rate": 4.770630055290208e-07, "logits/chosen": -1.0263866186141968, "logits/rejected": -0.8518810868263245, "logps/chosen": -40.71949768066406, "logps/rejected": -57.82533264160156, "loss": 0.2399, "rewards/accuracies": 0.875, "rewards/chosen": -0.18406735360622406, "rewards/margins": 2.9664227962493896, "rewards/rejected": -3.1504902839660645, "step": 151 }, { "epoch": 1.7988165680473371, "grad_norm": 28.63311269352365, "learning_rate": 4.76515848658589e-07, "logits/chosen": -0.5594848394393921, "logits/rejected": -0.8863150477409363, "logps/chosen": -45.70458221435547, "logps/rejected": -38.8409538269043, "loss": 0.2137, "rewards/accuracies": 0.875, "rewards/chosen": -0.2182983160018921, "rewards/margins": 2.2750673294067383, "rewards/rejected": -2.4933652877807617, "step": 152 }, { "epoch": 1.8106508875739644, "grad_norm": 34.34513187647427, "learning_rate": 4.759625637214696e-07, "logits/chosen": -0.7560346126556396, "logits/rejected": -0.8882582187652588, "logps/chosen": -30.11992073059082, "logps/rejected": -39.5860595703125, "loss": 0.2751, "rewards/accuracies": 0.75, "rewards/chosen": -0.3838905096054077, "rewards/margins": 1.65618097782135, "rewards/rejected": -2.040071487426758, "step": 153 }, { "epoch": 1.8224852071005917, "grad_norm": 32.10962096929911, "learning_rate": 4.754031656860059e-07, "logits/chosen": -0.8416492342948914, "logits/rejected": -0.6819140911102295, "logps/chosen": -33.661521911621094, "logps/rejected": -44.78226852416992, "loss": 0.2833, "rewards/accuracies": 0.6875, "rewards/chosen": 0.014047026634216309, "rewards/margins": 2.2703661918640137, "rewards/rejected": -2.256319046020508, "step": 154 }, { "epoch": 1.834319526627219, "grad_norm": 30.000217414301275, "learning_rate": 4.748376696859226e-07, "logits/chosen": -0.9286520481109619, "logits/rejected": -0.9778493642807007, "logps/chosen": -41.20292663574219, "logps/rejected": -48.67192077636719, "loss": 0.2596, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4870182275772095, "rewards/margins": 2.4510464668273926, "rewards/rejected": -1.9640284776687622, "step": 155 }, { "epoch": 1.8461538461538463, "grad_norm": 32.4118395869003, "learning_rate": 4.74266091019916e-07, "logits/chosen": -0.8907891511917114, "logits/rejected": -0.9750989079475403, "logps/chosen": -45.65682601928711, "logps/rejected": -50.134220123291016, "loss": 0.2678, "rewards/accuracies": 0.875, "rewards/chosen": -0.07820607721805573, "rewards/margins": 3.1200551986694336, "rewards/rejected": -3.198261260986328, "step": 156 }, { "epoch": 1.8579881656804735, "grad_norm": 28.928479525804008, "learning_rate": 4.7368844515124046e-07, "logits/chosen": -0.6818917393684387, "logits/rejected": -0.8876403570175171, "logps/chosen": -38.4929084777832, "logps/rejected": -44.042869567871094, "loss": 0.2156, "rewards/accuracies": 0.875, "rewards/chosen": 0.11902601271867752, "rewards/margins": 3.1182360649108887, "rewards/rejected": -2.9992103576660156, "step": 157 }, { "epoch": 1.8698224852071006, "grad_norm": 29.65958384914858, "learning_rate": 4.7310474770728996e-07, "logits/chosen": -0.6357210278511047, "logits/rejected": -0.5424870252609253, "logps/chosen": -37.36800765991211, "logps/rejected": -58.64324951171875, "loss": 0.2376, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6739649176597595, "rewards/margins": 3.992888927459717, "rewards/rejected": -4.666853904724121, "step": 158 }, { "epoch": 1.8816568047337277, "grad_norm": 34.26684990746155, "learning_rate": 4.725150144791753e-07, "logits/chosen": -0.46081531047821045, "logits/rejected": -0.5821120142936707, "logps/chosen": -37.67076110839844, "logps/rejected": -41.890037536621094, "loss": 0.3107, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4266676902770996, "rewards/margins": 1.9253289699554443, "rewards/rejected": -2.351996660232544, "step": 159 }, { "epoch": 1.893491124260355, "grad_norm": 35.05902021201059, "learning_rate": 4.719192614212969e-07, "logits/chosen": -0.6496328711509705, "logits/rejected": -0.5829223990440369, "logps/chosen": -36.085716247558594, "logps/rejected": -51.334327697753906, "loss": 0.2594, "rewards/accuracies": 1.0, "rewards/chosen": -0.33468520641326904, "rewards/margins": 2.861034631729126, "rewards/rejected": -3.1957197189331055, "step": 160 }, { "epoch": 1.9053254437869822, "grad_norm": 30.04841620365748, "learning_rate": 4.713175046509131e-07, "logits/chosen": -0.9227581024169922, "logits/rejected": -0.8245525360107422, "logps/chosen": -39.57933807373047, "logps/rejected": -66.19513702392578, "loss": 0.23, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03423337638378143, "rewards/margins": 5.112496376037598, "rewards/rejected": -5.146729469299316, "step": 161 }, { "epoch": 1.9171597633136095, "grad_norm": 31.04728022665797, "learning_rate": 4.707097604477045e-07, "logits/chosen": -1.0367375612258911, "logits/rejected": -1.0377088785171509, "logps/chosen": -46.939781188964844, "logps/rejected": -56.965232849121094, "loss": 0.2494, "rewards/accuracies": 0.6875, "rewards/chosen": -0.24659229815006256, "rewards/margins": 2.464108467102051, "rewards/rejected": -2.710700750350952, "step": 162 }, { "epoch": 1.9289940828402368, "grad_norm": 28.85191727468816, "learning_rate": 4.700960452533328e-07, "logits/chosen": -0.5171566605567932, "logits/rejected": -0.5847671627998352, "logps/chosen": -41.256561279296875, "logps/rejected": -48.218727111816406, "loss": 0.2072, "rewards/accuracies": 0.9375, "rewards/chosen": -0.48802250623703003, "rewards/margins": 2.6758077144622803, "rewards/rejected": -3.163830280303955, "step": 163 }, { "epoch": 1.940828402366864, "grad_norm": 34.99511678470972, "learning_rate": 4.694763756709967e-07, "logits/chosen": -0.9630662202835083, "logits/rejected": -1.0143767595291138, "logps/chosen": -33.528255462646484, "logps/rejected": -41.041446685791016, "loss": 0.2857, "rewards/accuracies": 0.9375, "rewards/chosen": -0.00032395869493484497, "rewards/margins": 2.951721429824829, "rewards/rejected": -2.952045440673828, "step": 164 }, { "epoch": 1.952662721893491, "grad_norm": 32.6916182628487, "learning_rate": 4.688507684649825e-07, "logits/chosen": -0.5867289304733276, "logits/rejected": -0.6317815184593201, "logps/chosen": -34.935386657714844, "logps/rejected": -50.50403594970703, "loss": 0.299, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3650425374507904, "rewards/margins": 3.305384635925293, "rewards/rejected": -2.9403419494628906, "step": 165 }, { "epoch": 1.9644970414201184, "grad_norm": 25.572043392572215, "learning_rate": 4.6821924056021053e-07, "logits/chosen": -0.8966530561447144, "logits/rejected": -0.9081934094429016, "logps/chosen": -39.484596252441406, "logps/rejected": -59.22516632080078, "loss": 0.2167, "rewards/accuracies": 1.0, "rewards/chosen": -0.0013990327715873718, "rewards/margins": 4.235882759094238, "rewards/rejected": -4.237281799316406, "step": 166 }, { "epoch": 1.9763313609467454, "grad_norm": 33.32850848147448, "learning_rate": 4.6758180904177715e-07, "logits/chosen": -0.9043580293655396, "logits/rejected": -0.9197327494621277, "logps/chosen": -30.749338150024414, "logps/rejected": -46.336402893066406, "loss": 0.2978, "rewards/accuracies": 0.9375, "rewards/chosen": 0.061946846544742584, "rewards/margins": 3.1583476066589355, "rewards/rejected": -3.0964009761810303, "step": 167 }, { "epoch": 1.9881656804733727, "grad_norm": 30.12339676116848, "learning_rate": 4.669384911544926e-07, "logits/chosen": -0.7334628701210022, "logits/rejected": -0.6194922924041748, "logps/chosen": -33.2913818359375, "logps/rejected": -58.153072357177734, "loss": 0.2359, "rewards/accuracies": 0.875, "rewards/chosen": -0.35717520117759705, "rewards/margins": 3.8730297088623047, "rewards/rejected": -4.230205059051514, "step": 168 }, { "epoch": 2.0, "grad_norm": 21.4554135691754, "learning_rate": 4.6628930430241495e-07, "logits/chosen": -0.5781211853027344, "logits/rejected": -0.6272490620613098, "logps/chosen": -33.10559844970703, "logps/rejected": -47.765830993652344, "loss": 0.1717, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07726180553436279, "rewards/margins": 3.996021270751953, "rewards/rejected": -4.0732831954956055, "step": 169 }, { "epoch": 2.0118343195266273, "grad_norm": 17.804389758687186, "learning_rate": 4.6563426604837817e-07, "logits/chosen": -0.7812290787696838, "logits/rejected": -0.6464096307754517, "logps/chosen": -37.48862838745117, "logps/rejected": -57.36015319824219, "loss": 0.1516, "rewards/accuracies": 1.0, "rewards/chosen": -0.08392468094825745, "rewards/margins": 3.2438626289367676, "rewards/rejected": -3.327787160873413, "step": 170 }, { "epoch": 2.0236686390532546, "grad_norm": 16.40347419407978, "learning_rate": 4.649733941135183e-07, "logits/chosen": -0.9235811233520508, "logits/rejected": -0.966956615447998, "logps/chosen": -46.7747802734375, "logps/rejected": -61.03520584106445, "loss": 0.1229, "rewards/accuracies": 1.0, "rewards/chosen": -0.24425256252288818, "rewards/margins": 3.003178834915161, "rewards/rejected": -3.2474312782287598, "step": 171 }, { "epoch": 2.035502958579882, "grad_norm": 17.315280346491143, "learning_rate": 4.6430670637679294e-07, "logits/chosen": -0.8347846865653992, "logits/rejected": -0.8632474541664124, "logps/chosen": -40.783226013183594, "logps/rejected": -54.76463317871094, "loss": 0.1394, "rewards/accuracies": 1.0, "rewards/chosen": -0.18245816230773926, "rewards/margins": 3.924067497253418, "rewards/rejected": -4.106525421142578, "step": 172 }, { "epoch": 2.0473372781065087, "grad_norm": 17.457800428549817, "learning_rate": 4.636342208744981e-07, "logits/chosen": -0.6723726987838745, "logits/rejected": -0.7173234820365906, "logps/chosen": -28.166179656982422, "logps/rejected": -44.342166900634766, "loss": 0.1436, "rewards/accuracies": 1.0, "rewards/chosen": 0.3225460350513458, "rewards/margins": 3.5667474269866943, "rewards/rejected": -3.244201421737671, "step": 173 }, { "epoch": 2.059171597633136, "grad_norm": 15.061594900995148, "learning_rate": 4.629559557997804e-07, "logits/chosen": -0.6233261823654175, "logits/rejected": -0.7046049237251282, "logps/chosen": -51.49015426635742, "logps/rejected": -72.83427429199219, "loss": 0.1249, "rewards/accuracies": 1.0, "rewards/chosen": -0.3748975396156311, "rewards/margins": 4.756686210632324, "rewards/rejected": -5.1315836906433105, "step": 174 }, { "epoch": 2.0710059171597632, "grad_norm": 15.61277777512069, "learning_rate": 4.6227192950214435e-07, "logits/chosen": -0.8263496160507202, "logits/rejected": -0.7221701145172119, "logps/chosen": -37.889495849609375, "logps/rejected": -54.355918884277344, "loss": 0.1236, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1404503583908081, "rewards/margins": 3.4195923805236816, "rewards/rejected": -3.279142379760742, "step": 175 }, { "epoch": 2.0828402366863905, "grad_norm": 15.2773228213431, "learning_rate": 4.615821604869563e-07, "logits/chosen": -0.9737514853477478, "logits/rejected": -0.8237433433532715, "logps/chosen": -39.199546813964844, "logps/rejected": -62.34404754638672, "loss": 0.111, "rewards/accuracies": 1.0, "rewards/chosen": -0.041255027055740356, "rewards/margins": 3.107170581817627, "rewards/rejected": -3.148425579071045, "step": 176 }, { "epoch": 2.094674556213018, "grad_norm": 15.829900533787184, "learning_rate": 4.6088666741494384e-07, "logits/chosen": -1.1444345712661743, "logits/rejected": -1.1528784036636353, "logps/chosen": -38.0962028503418, "logps/rejected": -62.006046295166016, "loss": 0.1109, "rewards/accuracies": 1.0, "rewards/chosen": -0.7900293469429016, "rewards/margins": 4.543603420257568, "rewards/rejected": -5.3336334228515625, "step": 177 }, { "epoch": 2.106508875739645, "grad_norm": 16.116982532695665, "learning_rate": 4.6018546910169067e-07, "logits/chosen": -1.0515778064727783, "logits/rejected": -0.9833648800849915, "logps/chosen": -40.967002868652344, "logps/rejected": -65.30772399902344, "loss": 0.1273, "rewards/accuracies": 0.9375, "rewards/chosen": -0.15716314315795898, "rewards/margins": 3.4091856479644775, "rewards/rejected": -3.5663485527038574, "step": 178 }, { "epoch": 2.1183431952662723, "grad_norm": 15.361490349201445, "learning_rate": 4.5947858451712773e-07, "logits/chosen": -0.819975733757019, "logits/rejected": -0.8770939111709595, "logps/chosen": -33.2042236328125, "logps/rejected": -50.4263916015625, "loss": 0.1302, "rewards/accuracies": 1.0, "rewards/chosen": 0.4746937155723572, "rewards/margins": 4.132750511169434, "rewards/rejected": -3.6580564975738525, "step": 179 }, { "epoch": 2.1301775147928996, "grad_norm": 14.5720865184576, "learning_rate": 4.5876603278502027e-07, "logits/chosen": -0.7717497944831848, "logits/rejected": -0.7804837822914124, "logps/chosen": -43.37432861328125, "logps/rejected": -59.71336364746094, "loss": 0.1184, "rewards/accuracies": 1.0, "rewards/chosen": -0.3754761517047882, "rewards/margins": 3.8695340156555176, "rewards/rejected": -4.2450103759765625, "step": 180 }, { "epoch": 2.1420118343195265, "grad_norm": 17.24169816378187, "learning_rate": 4.580478331824498e-07, "logits/chosen": -0.7291906476020813, "logits/rejected": -0.7426820993423462, "logps/chosen": -35.4974365234375, "logps/rejected": -49.904014587402344, "loss": 0.1635, "rewards/accuracies": 1.0, "rewards/chosen": 0.2558782994747162, "rewards/margins": 2.3701655864715576, "rewards/rejected": -2.1142873764038086, "step": 181 }, { "epoch": 2.1538461538461537, "grad_norm": 17.93219012330719, "learning_rate": 4.573240051392935e-07, "logits/chosen": -0.6299488544464111, "logits/rejected": -0.5285680294036865, "logps/chosen": -27.371360778808594, "logps/rejected": -41.3109245300293, "loss": 0.1323, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1316337138414383, "rewards/margins": 2.6835739612579346, "rewards/rejected": -2.8152074813842773, "step": 182 }, { "epoch": 2.165680473372781, "grad_norm": 19.89980345891806, "learning_rate": 4.565945682376977e-07, "logits/chosen": -0.9120879173278809, "logits/rejected": -0.6737322211265564, "logps/chosen": -43.5380859375, "logps/rejected": -75.94633483886719, "loss": 0.1551, "rewards/accuracies": 0.9375, "rewards/chosen": -0.55577552318573, "rewards/margins": 4.484833717346191, "rewards/rejected": -5.040609359741211, "step": 183 }, { "epoch": 2.1775147928994083, "grad_norm": 16.09665942372561, "learning_rate": 4.5585954221154853e-07, "logits/chosen": -0.8184518814086914, "logits/rejected": -0.7758468985557556, "logps/chosen": -30.036758422851562, "logps/rejected": -45.972900390625, "loss": 0.1244, "rewards/accuracies": 1.0, "rewards/chosen": 0.20884810388088226, "rewards/margins": 3.8934361934661865, "rewards/rejected": -3.6845881938934326, "step": 184 }, { "epoch": 2.1893491124260356, "grad_norm": 16.783451535199216, "learning_rate": 4.551189469459382e-07, "logits/chosen": -0.9438289403915405, "logits/rejected": -0.7987072467803955, "logps/chosen": -31.17508888244629, "logps/rejected": -50.338836669921875, "loss": 0.1361, "rewards/accuracies": 1.0, "rewards/chosen": -0.023317746818065643, "rewards/margins": 2.8164429664611816, "rewards/rejected": -2.8397610187530518, "step": 185 }, { "epoch": 2.201183431952663, "grad_norm": 20.143036888697704, "learning_rate": 4.5437280247662646e-07, "logits/chosen": -0.7536525130271912, "logits/rejected": -0.855215311050415, "logps/chosen": -54.403778076171875, "logps/rejected": -66.35308074951172, "loss": 0.1466, "rewards/accuracies": 1.0, "rewards/chosen": -1.0882877111434937, "rewards/margins": 4.7153778076171875, "rewards/rejected": -5.803666114807129, "step": 186 }, { "epoch": 2.21301775147929, "grad_norm": 14.808692042518373, "learning_rate": 4.5362112898949947e-07, "logits/chosen": -0.48705536127090454, "logits/rejected": -0.572258710861206, "logps/chosen": -37.71576690673828, "logps/rejected": -45.96680450439453, "loss": 0.1157, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4106942117214203, "rewards/margins": 4.168295860290527, "rewards/rejected": -4.5789899826049805, "step": 187 }, { "epoch": 2.224852071005917, "grad_norm": 18.376017289584023, "learning_rate": 4.528639468200226e-07, "logits/chosen": -0.9109969735145569, "logits/rejected": -0.9377925992012024, "logps/chosen": -35.95753479003906, "logps/rejected": -53.1741943359375, "loss": 0.1315, "rewards/accuracies": 1.0, "rewards/chosen": -0.26368990540504456, "rewards/margins": 4.901925563812256, "rewards/rejected": -5.165615558624268, "step": 188 }, { "epoch": 2.2366863905325443, "grad_norm": 17.965404516840803, "learning_rate": 4.5210127645269125e-07, "logits/chosen": -0.7325922846794128, "logits/rejected": -0.7497273683547974, "logps/chosen": -32.5103759765625, "logps/rejected": -51.267616271972656, "loss": 0.1185, "rewards/accuracies": 1.0, "rewards/chosen": -0.3522591292858124, "rewards/margins": 4.243406295776367, "rewards/rejected": -4.595664978027344, "step": 189 }, { "epoch": 2.2485207100591715, "grad_norm": 16.570545177365638, "learning_rate": 4.5133313852047613e-07, "logits/chosen": -0.351344496011734, "logits/rejected": -0.3544999659061432, "logps/chosen": -30.0961971282959, "logps/rejected": -47.018310546875, "loss": 0.1119, "rewards/accuracies": 1.0, "rewards/chosen": -0.17489728331565857, "rewards/margins": 3.530579090118408, "rewards/rejected": -3.7054765224456787, "step": 190 }, { "epoch": 2.260355029585799, "grad_norm": 15.819997196077969, "learning_rate": 4.5055955380426514e-07, "logits/chosen": -0.6617011427879333, "logits/rejected": -0.6919267773628235, "logps/chosen": -31.14508819580078, "logps/rejected": -49.937660217285156, "loss": 0.1371, "rewards/accuracies": 0.875, "rewards/chosen": 0.010032139718532562, "rewards/margins": 4.074734687805176, "rewards/rejected": -4.06470251083374, "step": 191 }, { "epoch": 2.272189349112426, "grad_norm": 14.990393051749372, "learning_rate": 4.4978054323230144e-07, "logits/chosen": -0.8150711059570312, "logits/rejected": -0.8408107757568359, "logps/chosen": -27.763072967529297, "logps/rejected": -43.43778991699219, "loss": 0.1182, "rewards/accuracies": 1.0, "rewards/chosen": 0.2238546460866928, "rewards/margins": 3.774090051651001, "rewards/rejected": -3.5502355098724365, "step": 192 }, { "epoch": 2.2840236686390534, "grad_norm": 16.48968206240205, "learning_rate": 4.489961278796167e-07, "logits/chosen": -1.1060757637023926, "logits/rejected": -0.9984903335571289, "logps/chosen": -39.745208740234375, "logps/rejected": -61.31498718261719, "loss": 0.1238, "rewards/accuracies": 1.0, "rewards/chosen": -0.17969991266727448, "rewards/margins": 4.915201663970947, "rewards/rejected": -5.0949015617370605, "step": 193 }, { "epoch": 2.2958579881656807, "grad_norm": 19.190560048976156, "learning_rate": 4.482063289674618e-07, "logits/chosen": -0.8471282720565796, "logits/rejected": -0.7247289419174194, "logps/chosen": -32.47812271118164, "logps/rejected": -54.26726150512695, "loss": 0.1459, "rewards/accuracies": 1.0, "rewards/chosen": 0.25966179370880127, "rewards/margins": 3.339050531387329, "rewards/rejected": -3.0793888568878174, "step": 194 }, { "epoch": 2.3076923076923075, "grad_norm": 14.39242536769752, "learning_rate": 4.4741116786273176e-07, "logits/chosen": -0.8266146779060364, "logits/rejected": -0.815448522567749, "logps/chosen": -34.32430648803711, "logps/rejected": -54.35994338989258, "loss": 0.1023, "rewards/accuracies": 1.0, "rewards/chosen": -0.08938950300216675, "rewards/margins": 4.4216461181640625, "rewards/rejected": -4.511035919189453, "step": 195 }, { "epoch": 2.3195266272189348, "grad_norm": 15.55346828770515, "learning_rate": 4.466106660773884e-07, "logits/chosen": -0.7416298389434814, "logits/rejected": -0.7652521133422852, "logps/chosen": -38.31879425048828, "logps/rejected": -59.721343994140625, "loss": 0.103, "rewards/accuracies": 1.0, "rewards/chosen": 0.3820110857486725, "rewards/margins": 4.027256488800049, "rewards/rejected": -3.645245313644409, "step": 196 }, { "epoch": 2.331360946745562, "grad_norm": 20.33506239450445, "learning_rate": 4.4580484526787807e-07, "logits/chosen": -0.9200114011764526, "logits/rejected": -0.8991610407829285, "logps/chosen": -34.80711364746094, "logps/rejected": -54.14820861816406, "loss": 0.1412, "rewards/accuracies": 1.0, "rewards/chosen": -0.4271819293498993, "rewards/margins": 4.234493732452393, "rewards/rejected": -4.661675930023193, "step": 197 }, { "epoch": 2.3431952662721893, "grad_norm": 16.966597309776112, "learning_rate": 4.44993727234546e-07, "logits/chosen": -0.9545549154281616, "logits/rejected": -0.9994832277297974, "logps/chosen": -41.37195587158203, "logps/rejected": -53.36943817138672, "loss": 0.1497, "rewards/accuracies": 1.0, "rewards/chosen": -0.6568608283996582, "rewards/margins": 4.235930442810059, "rewards/rejected": -4.892791748046875, "step": 198 }, { "epoch": 2.3550295857988166, "grad_norm": 16.008353794022174, "learning_rate": 4.4417733392104585e-07, "logits/chosen": -0.9546022415161133, "logits/rejected": -1.0305328369140625, "logps/chosen": -37.549339294433594, "logps/rejected": -52.69127655029297, "loss": 0.1028, "rewards/accuracies": 1.0, "rewards/chosen": -0.14469043910503387, "rewards/margins": 4.475038528442383, "rewards/rejected": -4.619729042053223, "step": 199 }, { "epoch": 2.366863905325444, "grad_norm": 12.501500717461248, "learning_rate": 4.4335568741374695e-07, "logits/chosen": -0.9863907098770142, "logits/rejected": -0.9025635123252869, "logps/chosen": -31.974454879760742, "logps/rejected": -47.34270477294922, "loss": 0.0902, "rewards/accuracies": 1.0, "rewards/chosen": -0.17588046193122864, "rewards/margins": 4.036709308624268, "rewards/rejected": -4.212589740753174, "step": 200 }, { "epoch": 2.378698224852071, "grad_norm": 12.198066199001053, "learning_rate": 4.425288099411364e-07, "logits/chosen": -1.135861873626709, "logits/rejected": -1.1184909343719482, "logps/chosen": -42.55821228027344, "logps/rejected": -58.36648178100586, "loss": 0.077, "rewards/accuracies": 1.0, "rewards/chosen": -0.1261935532093048, "rewards/margins": 6.448820114135742, "rewards/rejected": -6.575014591217041, "step": 201 }, { "epoch": 2.390532544378698, "grad_norm": 18.343824652909003, "learning_rate": 4.4169672387321735e-07, "logits/chosen": -0.7927027344703674, "logits/rejected": -0.8605716228485107, "logps/chosen": -44.98955535888672, "logps/rejected": -52.32347106933594, "loss": 0.1207, "rewards/accuracies": 0.9375, "rewards/chosen": -0.08800630271434784, "rewards/margins": 4.026900768280029, "rewards/rejected": -4.1149067878723145, "step": 202 }, { "epoch": 2.4023668639053253, "grad_norm": 14.754318186294096, "learning_rate": 4.408594517209045e-07, "logits/chosen": -0.9587286710739136, "logits/rejected": -0.9846871495246887, "logps/chosen": -34.01704025268555, "logps/rejected": -53.6927375793457, "loss": 0.108, "rewards/accuracies": 1.0, "rewards/chosen": -0.4293210208415985, "rewards/margins": 5.7534356117248535, "rewards/rejected": -6.182756423950195, "step": 203 }, { "epoch": 2.4142011834319526, "grad_norm": 16.259714501612795, "learning_rate": 4.4001701613541454e-07, "logits/chosen": -0.8172876834869385, "logits/rejected": -1.0052968263626099, "logps/chosen": -50.09556198120117, "logps/rejected": -48.03728103637695, "loss": 0.1187, "rewards/accuracies": 0.9375, "rewards/chosen": -0.36984017491340637, "rewards/margins": 3.9901065826416016, "rewards/rejected": -4.3599467277526855, "step": 204 }, { "epoch": 2.42603550295858, "grad_norm": 19.017744132865513, "learning_rate": 4.391694399076536e-07, "logits/chosen": -0.9216486215591431, "logits/rejected": -0.8954623937606812, "logps/chosen": -29.45151710510254, "logps/rejected": -53.07656478881836, "loss": 0.131, "rewards/accuracies": 1.0, "rewards/chosen": -0.30826300382614136, "rewards/margins": 5.172004699707031, "rewards/rejected": -5.4802680015563965, "step": 205 }, { "epoch": 2.437869822485207, "grad_norm": 16.95344970048546, "learning_rate": 4.383167459676008e-07, "logits/chosen": -0.8062804341316223, "logits/rejected": -0.5907716155052185, "logps/chosen": -37.306583404541016, "logps/rejected": -65.82327270507812, "loss": 0.1077, "rewards/accuracies": 0.9375, "rewards/chosen": 0.03246502950787544, "rewards/margins": 3.6565213203430176, "rewards/rejected": -3.624056100845337, "step": 206 }, { "epoch": 2.4497041420118344, "grad_norm": 14.56061263170088, "learning_rate": 4.374589573836874e-07, "logits/chosen": -0.9072690606117249, "logits/rejected": -1.0106024742126465, "logps/chosen": -39.9183349609375, "logps/rejected": -51.78164291381836, "loss": 0.0865, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8933297395706177, "rewards/margins": 4.566455841064453, "rewards/rejected": -5.459786415100098, "step": 207 }, { "epoch": 2.4615384615384617, "grad_norm": 14.148619010746572, "learning_rate": 4.365960973621734e-07, "logits/chosen": -0.9763280749320984, "logits/rejected": -1.2256582975387573, "logps/chosen": -43.78369903564453, "logps/rejected": -54.03715896606445, "loss": 0.1012, "rewards/accuracies": 1.0, "rewards/chosen": -0.2019457072019577, "rewards/margins": 4.386034965515137, "rewards/rejected": -4.587980270385742, "step": 208 }, { "epoch": 2.4733727810650885, "grad_norm": 11.952569068353256, "learning_rate": 4.357281892465191e-07, "logits/chosen": -0.850165069103241, "logits/rejected": -0.9085839986801147, "logps/chosen": -30.494304656982422, "logps/rejected": -45.906429290771484, "loss": 0.0831, "rewards/accuracies": 1.0, "rewards/chosen": 0.06118796020746231, "rewards/margins": 4.031259536743164, "rewards/rejected": -3.970071792602539, "step": 209 }, { "epoch": 2.485207100591716, "grad_norm": 14.94650375178369, "learning_rate": 4.348552565167542e-07, "logits/chosen": -0.9337579011917114, "logits/rejected": -0.8570343255996704, "logps/chosen": -37.36121368408203, "logps/rejected": -63.51266098022461, "loss": 0.1052, "rewards/accuracies": 1.0, "rewards/chosen": -0.1832229197025299, "rewards/margins": 5.160886287689209, "rewards/rejected": -5.344109058380127, "step": 210 }, { "epoch": 2.497041420118343, "grad_norm": 17.82179694890819, "learning_rate": 4.3397732278884194e-07, "logits/chosen": -0.7793615460395813, "logits/rejected": -0.7347290515899658, "logps/chosen": -38.538475036621094, "logps/rejected": -51.74705123901367, "loss": 0.1133, "rewards/accuracies": 0.9375, "rewards/chosen": 0.012450069189071655, "rewards/margins": 2.9992494583129883, "rewards/rejected": -2.9867992401123047, "step": 211 }, { "epoch": 2.5088757396449703, "grad_norm": 17.00316151116976, "learning_rate": 4.330944118140406e-07, "logits/chosen": -0.671549916267395, "logits/rejected": -0.530448317527771, "logps/chosen": -36.966880798339844, "logps/rejected": -57.631778717041016, "loss": 0.0971, "rewards/accuracies": 1.0, "rewards/chosen": -0.0925275981426239, "rewards/margins": 5.078682899475098, "rewards/rejected": -5.171210289001465, "step": 212 }, { "epoch": 2.5207100591715976, "grad_norm": 12.753485442227316, "learning_rate": 4.322065474782609e-07, "logits/chosen": -0.9630086421966553, "logits/rejected": -0.7986509799957275, "logps/chosen": -34.51360321044922, "logps/rejected": -55.286319732666016, "loss": 0.0986, "rewards/accuracies": 1.0, "rewards/chosen": -0.17616373300552368, "rewards/margins": 4.3915815353393555, "rewards/rejected": -4.567745208740234, "step": 213 }, { "epoch": 2.532544378698225, "grad_norm": 19.996942513731593, "learning_rate": 4.313137538014198e-07, "logits/chosen": -0.6898477077484131, "logits/rejected": -0.636679470539093, "logps/chosen": -35.616844177246094, "logps/rejected": -56.1121826171875, "loss": 0.119, "rewards/accuracies": 1.0, "rewards/chosen": -0.24041543900966644, "rewards/margins": 4.271088600158691, "rewards/rejected": -4.51150369644165, "step": 214 }, { "epoch": 2.544378698224852, "grad_norm": 16.48295901806472, "learning_rate": 4.304160549367906e-07, "logits/chosen": -1.0932834148406982, "logits/rejected": -1.1098268032073975, "logps/chosen": -29.987680435180664, "logps/rejected": -46.118587493896484, "loss": 0.0842, "rewards/accuracies": 0.9375, "rewards/chosen": 0.31450319290161133, "rewards/margins": 3.8299648761749268, "rewards/rejected": -3.5154621601104736, "step": 215 }, { "epoch": 2.556213017751479, "grad_norm": 17.61093634952303, "learning_rate": 4.295134751703492e-07, "logits/chosen": -0.9821122884750366, "logits/rejected": -1.0032066106796265, "logps/chosen": -29.145097732543945, "logps/rejected": -53.80962371826172, "loss": 0.1108, "rewards/accuracies": 1.0, "rewards/chosen": -0.3220353424549103, "rewards/margins": 5.978228569030762, "rewards/rejected": -6.300264358520508, "step": 216 }, { "epoch": 2.5680473372781067, "grad_norm": 12.734227646855976, "learning_rate": 4.28606038920118e-07, "logits/chosen": -0.8004301190376282, "logits/rejected": -0.6481240391731262, "logps/chosen": -33.56897735595703, "logps/rejected": -59.778018951416016, "loss": 0.0721, "rewards/accuracies": 1.0, "rewards/chosen": -0.31138908863067627, "rewards/margins": 5.115996837615967, "rewards/rejected": -5.427386283874512, "step": 217 }, { "epoch": 2.5798816568047336, "grad_norm": 19.316829502268995, "learning_rate": 4.276937707355044e-07, "logits/chosen": -0.8966995477676392, "logits/rejected": -0.970150887966156, "logps/chosen": -37.73095703125, "logps/rejected": -48.6308708190918, "loss": 0.1289, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6531825065612793, "rewards/margins": 4.240229606628418, "rewards/rejected": -4.893411636352539, "step": 218 }, { "epoch": 2.591715976331361, "grad_norm": 15.35867876097276, "learning_rate": 4.2677669529663686e-07, "logits/chosen": -0.7338119745254517, "logits/rejected": -0.7964376211166382, "logps/chosen": -35.940223693847656, "logps/rejected": -49.16367721557617, "loss": 0.0979, "rewards/accuracies": 1.0, "rewards/chosen": -0.5470548868179321, "rewards/margins": 4.513436317443848, "rewards/rejected": -5.060491561889648, "step": 219 }, { "epoch": 2.603550295857988, "grad_norm": 15.463745366871002, "learning_rate": 4.2585483741369755e-07, "logits/chosen": -0.7903125882148743, "logits/rejected": -0.6668514609336853, "logps/chosen": -39.22986602783203, "logps/rejected": -56.949462890625, "loss": 0.109, "rewards/accuracies": 1.0, "rewards/chosen": -0.4734441637992859, "rewards/margins": 3.438096523284912, "rewards/rejected": -3.9115407466888428, "step": 220 }, { "epoch": 2.6153846153846154, "grad_norm": 11.656065312741738, "learning_rate": 4.2492822202625065e-07, "logits/chosen": -0.746453046798706, "logits/rejected": -0.5671579837799072, "logps/chosen": -29.767318725585938, "logps/rejected": -55.21434020996094, "loss": 0.0675, "rewards/accuracies": 1.0, "rewards/chosen": -0.5708121657371521, "rewards/margins": 4.064294338226318, "rewards/rejected": -4.635106563568115, "step": 221 }, { "epoch": 2.6272189349112427, "grad_norm": 21.80300920307137, "learning_rate": 4.239968742025684e-07, "logits/chosen": -1.0824371576309204, "logits/rejected": -1.0959596633911133, "logps/chosen": -43.30390548706055, "logps/rejected": -61.727516174316406, "loss": 0.1193, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6028754711151123, "rewards/margins": 4.46554708480835, "rewards/rejected": -5.068422317504883, "step": 222 }, { "epoch": 2.63905325443787, "grad_norm": 14.049260804964101, "learning_rate": 4.2306081913895177e-07, "logits/chosen": -0.9191200137138367, "logits/rejected": -1.0407882928848267, "logps/chosen": -32.684852600097656, "logps/rejected": -46.769744873046875, "loss": 0.0912, "rewards/accuracies": 1.0, "rewards/chosen": -0.8108103275299072, "rewards/margins": 4.522951126098633, "rewards/rejected": -5.333761215209961, "step": 223 }, { "epoch": 2.6508875739644973, "grad_norm": 15.994418099284374, "learning_rate": 4.2212008215905e-07, "logits/chosen": -0.8396817445755005, "logits/rejected": -0.7051962018013, "logps/chosen": -41.968048095703125, "logps/rejected": -68.28170776367188, "loss": 0.0873, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5884815454483032, "rewards/margins": 5.965177536010742, "rewards/rejected": -6.553658962249756, "step": 224 }, { "epoch": 2.662721893491124, "grad_norm": 17.35636622879192, "learning_rate": 4.2117468871317465e-07, "logits/chosen": -1.0077329874038696, "logits/rejected": -1.021416425704956, "logps/chosen": -33.865623474121094, "logps/rejected": -55.850704193115234, "loss": 0.1052, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3826393485069275, "rewards/margins": 5.580774784088135, "rewards/rejected": -5.963414192199707, "step": 225 }, { "epoch": 2.6745562130177514, "grad_norm": 12.60270784440611, "learning_rate": 4.2022466437761154e-07, "logits/chosen": -1.1540577411651611, "logits/rejected": -1.0161852836608887, "logps/chosen": -32.255516052246094, "logps/rejected": -64.34480285644531, "loss": 0.0715, "rewards/accuracies": 1.0, "rewards/chosen": 0.15079544484615326, "rewards/margins": 4.2565717697143555, "rewards/rejected": -4.105776786804199, "step": 226 }, { "epoch": 2.6863905325443787, "grad_norm": 11.837128157661406, "learning_rate": 4.1927003485392873e-07, "logits/chosen": -1.1032685041427612, "logits/rejected": -1.2223795652389526, "logps/chosen": -30.195100784301758, "logps/rejected": -43.28514099121094, "loss": 0.0599, "rewards/accuracies": 1.0, "rewards/chosen": -0.1709037721157074, "rewards/margins": 3.4937398433685303, "rewards/rejected": -3.6646437644958496, "step": 227 }, { "epoch": 2.698224852071006, "grad_norm": 16.871047034781974, "learning_rate": 4.18310825968281e-07, "logits/chosen": -1.022660732269287, "logits/rejected": -1.1288559436798096, "logps/chosen": -40.125972747802734, "logps/rejected": -48.385841369628906, "loss": 0.1092, "rewards/accuracies": 1.0, "rewards/chosen": 0.03397001326084137, "rewards/margins": 3.533397674560547, "rewards/rejected": -3.4994277954101562, "step": 228 }, { "epoch": 2.710059171597633, "grad_norm": 14.91535959862739, "learning_rate": 4.173470636707115e-07, "logits/chosen": -1.0440349578857422, "logits/rejected": -1.0170382261276245, "logps/chosen": -31.154685974121094, "logps/rejected": -54.95928192138672, "loss": 0.0941, "rewards/accuracies": 1.0, "rewards/chosen": -0.514030933380127, "rewards/margins": 5.214803695678711, "rewards/rejected": -5.72883415222168, "step": 229 }, { "epoch": 2.7218934911242605, "grad_norm": 16.4841578502459, "learning_rate": 4.1637877403444923e-07, "logits/chosen": -0.771393358707428, "logits/rejected": -0.7641423940658569, "logps/chosen": -42.56052017211914, "logps/rejected": -58.539730072021484, "loss": 0.0918, "rewards/accuracies": 1.0, "rewards/chosen": -1.2780624628067017, "rewards/margins": 5.585483551025391, "rewards/rejected": -6.863546371459961, "step": 230 }, { "epoch": 2.7337278106508878, "grad_norm": 12.082861048786787, "learning_rate": 4.1540598325520406e-07, "logits/chosen": -0.8163421154022217, "logits/rejected": -0.8682371377944946, "logps/chosen": -26.423309326171875, "logps/rejected": -41.69921875, "loss": 0.0766, "rewards/accuracies": 1.0, "rewards/chosen": -0.16670477390289307, "rewards/margins": 4.770580291748047, "rewards/rejected": -4.93728494644165, "step": 231 }, { "epoch": 2.7455621301775146, "grad_norm": 13.206244471660067, "learning_rate": 4.144287176504582e-07, "logits/chosen": -1.0770142078399658, "logits/rejected": -1.100901484489441, "logps/chosen": -43.73130416870117, "logps/rejected": -58.910987854003906, "loss": 0.0782, "rewards/accuracies": 1.0, "rewards/chosen": -0.775425136089325, "rewards/margins": 5.230464935302734, "rewards/rejected": -6.005890369415283, "step": 232 }, { "epoch": 2.757396449704142, "grad_norm": 18.831608964137178, "learning_rate": 4.1344700365875353e-07, "logits/chosen": -1.2907536029815674, "logits/rejected": -1.1133619546890259, "logps/chosen": -42.22511672973633, "logps/rejected": -80.1695327758789, "loss": 0.1184, "rewards/accuracies": 1.0, "rewards/chosen": -0.7888930439949036, "rewards/margins": 4.728762149810791, "rewards/rejected": -5.517655372619629, "step": 233 }, { "epoch": 2.769230769230769, "grad_norm": 18.13801394049149, "learning_rate": 4.1246086783897713e-07, "logits/chosen": -0.7159754037857056, "logits/rejected": -0.9155410528182983, "logps/chosen": -38.486732482910156, "logps/rejected": -46.4632568359375, "loss": 0.1165, "rewards/accuracies": 0.9375, "rewards/chosen": -0.28693825006484985, "rewards/margins": 4.148380756378174, "rewards/rejected": -4.435318946838379, "step": 234 }, { "epoch": 2.7810650887573964, "grad_norm": 14.561324524299124, "learning_rate": 4.1147033686964213e-07, "logits/chosen": -1.0087186098098755, "logits/rejected": -1.0201172828674316, "logps/chosen": -38.35453796386719, "logps/rejected": -61.89058303833008, "loss": 0.082, "rewards/accuracies": 1.0, "rewards/chosen": -0.6061041951179504, "rewards/margins": 6.066676139831543, "rewards/rejected": -6.6727800369262695, "step": 235 }, { "epoch": 2.7928994082840237, "grad_norm": 18.20449643988736, "learning_rate": 4.104754375481664e-07, "logits/chosen": -1.0226837396621704, "logits/rejected": -1.1159076690673828, "logps/chosen": -33.471160888671875, "logps/rejected": -49.130104064941406, "loss": 0.1018, "rewards/accuracies": 1.0, "rewards/chosen": -0.9132391214370728, "rewards/margins": 4.530313491821289, "rewards/rejected": -5.443552494049072, "step": 236 }, { "epoch": 2.804733727810651, "grad_norm": 18.714482911950206, "learning_rate": 4.0947619679014733e-07, "logits/chosen": -1.3095102310180664, "logits/rejected": -1.3122498989105225, "logps/chosen": -37.17940902709961, "logps/rejected": -56.99421691894531, "loss": 0.0986, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6820850372314453, "rewards/margins": 5.377782821655273, "rewards/rejected": -6.059867858886719, "step": 237 }, { "epoch": 2.8165680473372783, "grad_norm": 11.370290693432937, "learning_rate": 4.084726416286337e-07, "logits/chosen": -1.0340602397918701, "logits/rejected": -0.8559252023696899, "logps/chosen": -36.95317840576172, "logps/rejected": -69.14118957519531, "loss": 0.0699, "rewards/accuracies": 1.0, "rewards/chosen": -0.7623500227928162, "rewards/margins": 6.243594169616699, "rewards/rejected": -7.00594425201416, "step": 238 }, { "epoch": 2.828402366863905, "grad_norm": 18.470159692479328, "learning_rate": 4.0746479921339456e-07, "logits/chosen": -1.0048712491989136, "logits/rejected": -1.0136165618896484, "logps/chosen": -51.31761169433594, "logps/rejected": -61.482666015625, "loss": 0.102, "rewards/accuracies": 1.0, "rewards/chosen": -0.9726583361625671, "rewards/margins": 3.5378739833831787, "rewards/rejected": -4.510532379150391, "step": 239 }, { "epoch": 2.8402366863905324, "grad_norm": 11.915657733560364, "learning_rate": 4.0645269681018434e-07, "logits/chosen": -1.22614324092865, "logits/rejected": -1.3402845859527588, "logps/chosen": -37.68153381347656, "logps/rejected": -55.563926696777344, "loss": 0.0739, "rewards/accuracies": 1.0, "rewards/chosen": -0.6234626173973083, "rewards/margins": 4.600487232208252, "rewards/rejected": -5.223949909210205, "step": 240 }, { "epoch": 2.8520710059171597, "grad_norm": 15.928696873524284, "learning_rate": 4.054363618000057e-07, "logits/chosen": -1.0241750478744507, "logits/rejected": -0.9920932650566101, "logps/chosen": -41.79422378540039, "logps/rejected": -73.42777252197266, "loss": 0.1032, "rewards/accuracies": 1.0, "rewards/chosen": 0.0987391397356987, "rewards/margins": 6.280758857727051, "rewards/rejected": -6.18202018737793, "step": 241 }, { "epoch": 2.863905325443787, "grad_norm": 14.965124803326795, "learning_rate": 4.044158216783684e-07, "logits/chosen": -1.2023519277572632, "logits/rejected": -1.1554661989212036, "logps/chosen": -50.22661590576172, "logps/rejected": -60.445709228515625, "loss": 0.0811, "rewards/accuracies": 1.0, "rewards/chosen": -0.7524704933166504, "rewards/margins": 4.429455280303955, "rewards/rejected": -5.1819257736206055, "step": 242 }, { "epoch": 2.8757396449704142, "grad_norm": 15.090679655603102, "learning_rate": 4.033911040545453e-07, "logits/chosen": -1.0586270093917847, "logits/rejected": -1.1135869026184082, "logps/chosen": -39.405941009521484, "logps/rejected": -55.257102966308594, "loss": 0.0836, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5358131527900696, "rewards/margins": 4.630053520202637, "rewards/rejected": -5.165866374969482, "step": 243 }, { "epoch": 2.8875739644970415, "grad_norm": 15.379447226520636, "learning_rate": 4.0236223665082605e-07, "logits/chosen": -0.9784256219863892, "logits/rejected": -1.0398386716842651, "logps/chosen": -40.73570251464844, "logps/rejected": -48.330848693847656, "loss": 0.0896, "rewards/accuracies": 0.875, "rewards/chosen": -0.9628621935844421, "rewards/margins": 3.23721981048584, "rewards/rejected": -4.200081825256348, "step": 244 }, { "epoch": 2.899408284023669, "grad_norm": 16.74378459980107, "learning_rate": 4.0132924730176653e-07, "logits/chosen": -0.723244309425354, "logits/rejected": -0.7906000018119812, "logps/chosen": -33.3350944519043, "logps/rejected": -44.06977844238281, "loss": 0.1087, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1651308238506317, "rewards/margins": 4.324864864349365, "rewards/rejected": -4.489995956420898, "step": 245 }, { "epoch": 2.9112426035502956, "grad_norm": 16.171542896949546, "learning_rate": 4.0029216395343617e-07, "logits/chosen": -0.9244989156723022, "logits/rejected": -0.7894719839096069, "logps/chosen": -38.35075759887695, "logps/rejected": -63.83277893066406, "loss": 0.104, "rewards/accuracies": 1.0, "rewards/chosen": -0.6405995488166809, "rewards/margins": 5.280316352844238, "rewards/rejected": -5.9209160804748535, "step": 246 }, { "epoch": 2.9230769230769234, "grad_norm": 16.103173612778058, "learning_rate": 3.992510146626617e-07, "logits/chosen": -1.072364091873169, "logits/rejected": -1.219006061553955, "logps/chosen": -52.24778366088867, "logps/rejected": -54.28422546386719, "loss": 0.088, "rewards/accuracies": 1.0, "rewards/chosen": -0.5743333101272583, "rewards/margins": 4.490229606628418, "rewards/rejected": -5.064562797546387, "step": 247 }, { "epoch": 2.93491124260355, "grad_norm": 21.934919226614063, "learning_rate": 3.982058275962682e-07, "logits/chosen": -1.0643049478530884, "logits/rejected": -1.0713043212890625, "logps/chosen": -38.724098205566406, "logps/rejected": -49.14268112182617, "loss": 0.1283, "rewards/accuracies": 0.875, "rewards/chosen": 0.003525674343109131, "rewards/margins": 3.779031991958618, "rewards/rejected": -3.775506019592285, "step": 248 }, { "epoch": 2.9467455621301775, "grad_norm": 13.845420751689739, "learning_rate": 3.9715663103031706e-07, "logits/chosen": -0.8697773218154907, "logits/rejected": -1.0021997690200806, "logps/chosen": -48.516197204589844, "logps/rejected": -63.481204986572266, "loss": 0.0863, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2427257299423218, "rewards/margins": 5.817460060119629, "rewards/rejected": -7.06018590927124, "step": 249 }, { "epoch": 2.9585798816568047, "grad_norm": 15.20831572963414, "learning_rate": 3.9610345334934094e-07, "logits/chosen": -0.9560275673866272, "logits/rejected": -0.7608417272567749, "logps/chosen": -27.386804580688477, "logps/rejected": -56.76541519165039, "loss": 0.0957, "rewards/accuracies": 1.0, "rewards/chosen": 0.055940259248018265, "rewards/margins": 4.628420829772949, "rewards/rejected": -4.57248067855835, "step": 250 }, { "epoch": 2.970414201183432, "grad_norm": 13.509957666437309, "learning_rate": 3.950463230455761e-07, "logits/chosen": -0.9403877258300781, "logits/rejected": -1.0766938924789429, "logps/chosen": -46.44561004638672, "logps/rejected": -55.376487731933594, "loss": 0.0769, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6139943599700928, "rewards/margins": 5.4038591384887695, "rewards/rejected": -7.017853260040283, "step": 251 }, { "epoch": 2.9822485207100593, "grad_norm": 13.281356988917038, "learning_rate": 3.939852687181915e-07, "logits/chosen": -1.5572373867034912, "logits/rejected": -1.509653091430664, "logps/chosen": -38.27780532836914, "logps/rejected": -59.58174514770508, "loss": 0.0765, "rewards/accuracies": 0.9375, "rewards/chosen": -0.13283273577690125, "rewards/margins": 6.1197829246521, "rewards/rejected": -6.252615451812744, "step": 252 }, { "epoch": 2.994082840236686, "grad_norm": 13.363993336432463, "learning_rate": 3.9292031907251464e-07, "logits/chosen": -0.9404221773147583, "logits/rejected": -0.8659825921058655, "logps/chosen": -47.44232177734375, "logps/rejected": -72.00565338134766, "loss": 0.0827, "rewards/accuracies": 1.0, "rewards/chosen": -2.288888692855835, "rewards/margins": 6.671911239624023, "rewards/rejected": -8.960800170898438, "step": 253 }, { "epoch": 3.0059171597633134, "grad_norm": 13.191144507547312, "learning_rate": 3.9185150291925585e-07, "logits/chosen": -0.9490239024162292, "logits/rejected": -0.7950053215026855, "logps/chosen": -27.338720321655273, "logps/rejected": -54.10641860961914, "loss": 0.085, "rewards/accuracies": 1.0, "rewards/chosen": -0.2650642395019531, "rewards/margins": 4.751713275909424, "rewards/rejected": -5.016777992248535, "step": 254 }, { "epoch": 3.0177514792899407, "grad_norm": 10.096716674793884, "learning_rate": 3.9077884917372806e-07, "logits/chosen": -0.9493421912193298, "logits/rejected": -0.9330881834030151, "logps/chosen": -33.75062561035156, "logps/rejected": -58.77503967285156, "loss": 0.045, "rewards/accuracies": 1.0, "rewards/chosen": -0.49208763241767883, "rewards/margins": 5.006468296051025, "rewards/rejected": -5.498556137084961, "step": 255 }, { "epoch": 3.029585798816568, "grad_norm": 8.187374294888702, "learning_rate": 3.8970238685506486e-07, "logits/chosen": -1.0067384243011475, "logits/rejected": -1.0715603828430176, "logps/chosen": -29.130413055419922, "logps/rejected": -55.432456970214844, "loss": 0.0465, "rewards/accuracies": 1.0, "rewards/chosen": -0.37083467841148376, "rewards/margins": 6.350147247314453, "rewards/rejected": -6.720982551574707, "step": 256 }, { "epoch": 3.0414201183431953, "grad_norm": 10.400956778483433, "learning_rate": 3.8862214508543544e-07, "logits/chosen": -1.1537256240844727, "logits/rejected": -1.2295022010803223, "logps/chosen": -44.73936080932617, "logps/rejected": -56.7740478515625, "loss": 0.0558, "rewards/accuracies": 1.0, "rewards/chosen": -0.8702179789543152, "rewards/margins": 4.224546432495117, "rewards/rejected": -5.09476375579834, "step": 257 }, { "epoch": 3.0532544378698225, "grad_norm": 9.746601784268003, "learning_rate": 3.8753815308925685e-07, "logits/chosen": -0.9428281784057617, "logits/rejected": -0.7745850682258606, "logps/chosen": -42.13050842285156, "logps/rejected": -81.40808868408203, "loss": 0.0576, "rewards/accuracies": 1.0, "rewards/chosen": -0.15858441591262817, "rewards/margins": 5.615185260772705, "rewards/rejected": -5.773769378662109, "step": 258 }, { "epoch": 3.06508875739645, "grad_norm": 8.307502810968026, "learning_rate": 3.864504401924031e-07, "logits/chosen": -0.750975489616394, "logits/rejected": -0.8657329082489014, "logps/chosen": -42.026031494140625, "logps/rejected": -60.68046951293945, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": -0.10235822200775146, "rewards/margins": 5.683484077453613, "rewards/rejected": -5.7858428955078125, "step": 259 }, { "epoch": 3.076923076923077, "grad_norm": 10.229530562143605, "learning_rate": 3.8535903582141184e-07, "logits/chosen": -0.9478952288627625, "logits/rejected": -1.0307767391204834, "logps/chosen": -55.219512939453125, "logps/rejected": -65.23870849609375, "loss": 0.0592, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5834192633628845, "rewards/margins": 5.146862983703613, "rewards/rejected": -5.730282306671143, "step": 260 }, { "epoch": 3.088757396449704, "grad_norm": 11.360767151960536, "learning_rate": 3.8426396950268846e-07, "logits/chosen": -0.9507812261581421, "logits/rejected": -1.0435519218444824, "logps/chosen": -37.444190979003906, "logps/rejected": -53.09665298461914, "loss": 0.0683, "rewards/accuracies": 1.0, "rewards/chosen": -0.7893316745758057, "rewards/margins": 4.056205749511719, "rewards/rejected": -4.845537185668945, "step": 261 }, { "epoch": 3.100591715976331, "grad_norm": 10.428147481093854, "learning_rate": 3.8316527086170727e-07, "logits/chosen": -1.1136678457260132, "logits/rejected": -1.1166181564331055, "logps/chosen": -46.36027526855469, "logps/rejected": -61.403350830078125, "loss": 0.0644, "rewards/accuracies": 1.0, "rewards/chosen": -1.6686680316925049, "rewards/margins": 5.635904312133789, "rewards/rejected": -7.304572582244873, "step": 262 }, { "epoch": 3.1124260355029585, "grad_norm": 9.067198800478028, "learning_rate": 3.820629696222096e-07, "logits/chosen": -0.6134490370750427, "logits/rejected": -0.6816850900650024, "logps/chosen": -37.14327621459961, "logps/rejected": -45.768043518066406, "loss": 0.0516, "rewards/accuracies": 1.0, "rewards/chosen": -0.10417760908603668, "rewards/margins": 4.2560577392578125, "rewards/rejected": -4.36023473739624, "step": 263 }, { "epoch": 3.1242603550295858, "grad_norm": 9.142972342246532, "learning_rate": 3.809570956054003e-07, "logits/chosen": -1.110413908958435, "logits/rejected": -1.2050219774246216, "logps/chosen": -46.59968185424805, "logps/rejected": -67.91047668457031, "loss": 0.0521, "rewards/accuracies": 1.0, "rewards/chosen": -1.3769968748092651, "rewards/margins": 4.8837504386901855, "rewards/rejected": -6.260746955871582, "step": 264 }, { "epoch": 3.136094674556213, "grad_norm": 9.722445350426577, "learning_rate": 3.798476787291407e-07, "logits/chosen": -1.204038143157959, "logits/rejected": -1.2214388847351074, "logps/chosen": -55.631988525390625, "logps/rejected": -70.27022552490234, "loss": 0.0545, "rewards/accuracies": 1.0, "rewards/chosen": -2.43508243560791, "rewards/margins": 5.983229637145996, "rewards/rejected": -8.418312072753906, "step": 265 }, { "epoch": 3.1479289940828403, "grad_norm": 12.165954862754088, "learning_rate": 3.787347490071389e-07, "logits/chosen": -1.0962128639221191, "logits/rejected": -1.0446038246154785, "logps/chosen": -30.58979034423828, "logps/rejected": -57.98908233642578, "loss": 0.0751, "rewards/accuracies": 1.0, "rewards/chosen": -0.3746563792228699, "rewards/margins": 6.635012626647949, "rewards/rejected": -7.009669303894043, "step": 266 }, { "epoch": 3.1597633136094676, "grad_norm": 9.45931606123372, "learning_rate": 3.776183365481385e-07, "logits/chosen": -1.1556551456451416, "logits/rejected": -1.1371455192565918, "logps/chosen": -29.80742835998535, "logps/rejected": -65.14445495605469, "loss": 0.046, "rewards/accuracies": 1.0, "rewards/chosen": -0.65096515417099, "rewards/margins": 8.631277084350586, "rewards/rejected": -9.282241821289062, "step": 267 }, { "epoch": 3.171597633136095, "grad_norm": 8.216009521089312, "learning_rate": 3.764984715551031e-07, "logits/chosen": -0.7823802828788757, "logits/rejected": -0.8771790266036987, "logps/chosen": -33.291709899902344, "logps/rejected": -53.34422302246094, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": -0.4557185471057892, "rewards/margins": 5.85953426361084, "rewards/rejected": -6.315252780914307, "step": 268 }, { "epoch": 3.1834319526627217, "grad_norm": 8.44405243827575, "learning_rate": 3.753751843244003e-07, "logits/chosen": -0.6730700731277466, "logits/rejected": -0.7033834457397461, "logps/chosen": -38.24939727783203, "logps/rejected": -50.691505432128906, "loss": 0.0506, "rewards/accuracies": 1.0, "rewards/chosen": -0.48223841190338135, "rewards/margins": 5.0005269050598145, "rewards/rejected": -5.482766151428223, "step": 269 }, { "epoch": 3.195266272189349, "grad_norm": 12.678568405782729, "learning_rate": 3.7424850524498113e-07, "logits/chosen": -1.1234819889068604, "logits/rejected": -1.0680205821990967, "logps/chosen": -49.61648178100586, "logps/rejected": -62.6021728515625, "loss": 0.0669, "rewards/accuracies": 1.0, "rewards/chosen": -1.4030035734176636, "rewards/margins": 6.600266456604004, "rewards/rejected": -8.003270149230957, "step": 270 }, { "epoch": 3.2071005917159763, "grad_norm": 8.769925116184863, "learning_rate": 3.731184647975584e-07, "logits/chosen": -1.0641188621520996, "logits/rejected": -1.0786867141723633, "logps/chosen": -39.86817169189453, "logps/rejected": -59.44279479980469, "loss": 0.0539, "rewards/accuracies": 1.0, "rewards/chosen": -0.2523530125617981, "rewards/margins": 5.5049519538879395, "rewards/rejected": -5.757305145263672, "step": 271 }, { "epoch": 3.2189349112426036, "grad_norm": 9.128279564046098, "learning_rate": 3.7198509355378207e-07, "logits/chosen": -1.1525990962982178, "logits/rejected": -1.1050852537155151, "logps/chosen": -31.691631317138672, "logps/rejected": -58.040374755859375, "loss": 0.0547, "rewards/accuracies": 1.0, "rewards/chosen": -0.3830980956554413, "rewards/margins": 5.858831882476807, "rewards/rejected": -6.24193000793457, "step": 272 }, { "epoch": 3.230769230769231, "grad_norm": 5.945965238283508, "learning_rate": 3.7084842217541196e-07, "logits/chosen": -1.1103689670562744, "logits/rejected": -1.2655141353607178, "logps/chosen": -43.58381652832031, "logps/rejected": -63.32284927368164, "loss": 0.0271, "rewards/accuracies": 1.0, "rewards/chosen": -0.6699415445327759, "rewards/margins": 7.9212541580200195, "rewards/rejected": -8.591195106506348, "step": 273 }, { "epoch": 3.242603550295858, "grad_norm": 9.302647208034704, "learning_rate": 3.6970848141348855e-07, "logits/chosen": -1.2158900499343872, "logits/rejected": -1.2491604089736938, "logps/chosen": -43.763771057128906, "logps/rejected": -65.52889251708984, "loss": 0.0642, "rewards/accuracies": 1.0, "rewards/chosen": -0.5131170153617859, "rewards/margins": 5.668525218963623, "rewards/rejected": -6.181642532348633, "step": 274 }, { "epoch": 3.2544378698224854, "grad_norm": 8.636078483244923, "learning_rate": 3.685653021075006e-07, "logits/chosen": -1.3144724369049072, "logits/rejected": -1.1518325805664062, "logps/chosen": -32.579593658447266, "logps/rejected": -59.97221374511719, "loss": 0.0518, "rewards/accuracies": 1.0, "rewards/chosen": -0.165370911359787, "rewards/margins": 6.193992614746094, "rewards/rejected": -6.359362602233887, "step": 275 }, { "epoch": 3.2662721893491122, "grad_norm": 8.631432982295077, "learning_rate": 3.6741891518455146e-07, "logits/chosen": -1.0432945489883423, "logits/rejected": -0.9308419227600098, "logps/chosen": -38.913291931152344, "logps/rejected": -64.8463363647461, "loss": 0.0483, "rewards/accuracies": 1.0, "rewards/chosen": -1.2648208141326904, "rewards/margins": 7.041728496551514, "rewards/rejected": -8.306549072265625, "step": 276 }, { "epoch": 3.2781065088757395, "grad_norm": 8.627648180187752, "learning_rate": 3.6626935165852183e-07, "logits/chosen": -1.1467763185501099, "logits/rejected": -0.95106440782547, "logps/chosen": -45.197444915771484, "logps/rejected": -77.6880874633789, "loss": 0.0575, "rewards/accuracies": 1.0, "rewards/chosen": -0.7282447218894958, "rewards/margins": 6.463216304779053, "rewards/rejected": -7.191461086273193, "step": 277 }, { "epoch": 3.289940828402367, "grad_norm": 10.925286523452975, "learning_rate": 3.6511664262923094e-07, "logits/chosen": -1.0780082941055298, "logits/rejected": -1.060866355895996, "logps/chosen": -41.23103332519531, "logps/rejected": -64.39918518066406, "loss": 0.0655, "rewards/accuracies": 1.0, "rewards/chosen": -0.5627799034118652, "rewards/margins": 4.834949970245361, "rewards/rejected": -5.397729873657227, "step": 278 }, { "epoch": 3.301775147928994, "grad_norm": 7.9347499078081745, "learning_rate": 3.639608192815951e-07, "logits/chosen": -1.1390736103057861, "logits/rejected": -1.1505769491195679, "logps/chosen": -31.97028923034668, "logps/rejected": -52.920997619628906, "loss": 0.0435, "rewards/accuracies": 1.0, "rewards/chosen": 0.059164464473724365, "rewards/margins": 5.323459148406982, "rewards/rejected": -5.264294624328613, "step": 279 }, { "epoch": 3.3136094674556213, "grad_norm": 9.658317222064715, "learning_rate": 3.6280191288478435e-07, "logits/chosen": -0.3561999797821045, "logits/rejected": -0.4131832420825958, "logps/chosen": -49.74396514892578, "logps/rejected": -51.83927917480469, "loss": 0.0488, "rewards/accuracies": 1.0, "rewards/chosen": 0.05161542445421219, "rewards/margins": 3.9036967754364014, "rewards/rejected": -3.852081298828125, "step": 280 }, { "epoch": 3.3254437869822486, "grad_norm": 7.0333602841005565, "learning_rate": 3.61639954791376e-07, "logits/chosen": -0.7871007919311523, "logits/rejected": -0.7978352308273315, "logps/chosen": -37.140010833740234, "logps/rejected": -54.86417007446289, "loss": 0.0392, "rewards/accuracies": 1.0, "rewards/chosen": -0.12373484671115875, "rewards/margins": 5.170371055603027, "rewards/rejected": -5.2941060066223145, "step": 281 }, { "epoch": 3.337278106508876, "grad_norm": 7.845433796623778, "learning_rate": 3.604749764365069e-07, "logits/chosen": -0.9799227714538574, "logits/rejected": -0.933765709400177, "logps/chosen": -41.03892517089844, "logps/rejected": -58.41741943359375, "loss": 0.0466, "rewards/accuracies": 1.0, "rewards/chosen": -0.3890104293823242, "rewards/margins": 4.3942551612854, "rewards/rejected": -4.783265113830566, "step": 282 }, { "epoch": 3.3491124260355027, "grad_norm": 8.104393341578428, "learning_rate": 3.593070093370226e-07, "logits/chosen": -1.1032558679580688, "logits/rejected": -1.1244325637817383, "logps/chosen": -37.84381866455078, "logps/rejected": -54.38085174560547, "loss": 0.0493, "rewards/accuracies": 1.0, "rewards/chosen": 0.03373198211193085, "rewards/margins": 5.32047176361084, "rewards/rejected": -5.286739349365234, "step": 283 }, { "epoch": 3.36094674556213, "grad_norm": 7.636891980111439, "learning_rate": 3.5813608509062526e-07, "logits/chosen": -1.0172635316848755, "logits/rejected": -1.1440664529800415, "logps/chosen": -46.45831298828125, "logps/rejected": -58.084495544433594, "loss": 0.0364, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6164510250091553, "rewards/margins": 5.46311092376709, "rewards/rejected": -7.079561233520508, "step": 284 }, { "epoch": 3.3727810650887573, "grad_norm": 8.575228042301212, "learning_rate": 3.569622353750181e-07, "logits/chosen": -1.1266000270843506, "logits/rejected": -1.0841931104660034, "logps/chosen": -30.485980987548828, "logps/rejected": -66.34876251220703, "loss": 0.0429, "rewards/accuracies": 1.0, "rewards/chosen": 0.12574508786201477, "rewards/margins": 6.036714553833008, "rewards/rejected": -5.910969257354736, "step": 285 }, { "epoch": 3.3846153846153846, "grad_norm": 14.644268169698243, "learning_rate": 3.557854919470491e-07, "logits/chosen": -0.8823134899139404, "logits/rejected": -0.9364801645278931, "logps/chosen": -42.28858947753906, "logps/rejected": -47.119564056396484, "loss": 0.0808, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7562182545661926, "rewards/margins": 3.6561923027038574, "rewards/rejected": -4.412410736083984, "step": 286 }, { "epoch": 3.396449704142012, "grad_norm": 8.345961744927349, "learning_rate": 3.546058866418513e-07, "logits/chosen": -1.3047679662704468, "logits/rejected": -1.2904943227767944, "logps/chosen": -32.27897644042969, "logps/rejected": -55.80604553222656, "loss": 0.0401, "rewards/accuracies": 1.0, "rewards/chosen": -0.10520702600479126, "rewards/margins": 6.855145454406738, "rewards/rejected": -6.960352420806885, "step": 287 }, { "epoch": 3.408284023668639, "grad_norm": 6.8991491567748895, "learning_rate": 3.5342345137198206e-07, "logits/chosen": -1.0065886974334717, "logits/rejected": -0.9561058282852173, "logps/chosen": -36.960044860839844, "logps/rejected": -61.72477340698242, "loss": 0.0353, "rewards/accuracies": 1.0, "rewards/chosen": -1.1544219255447388, "rewards/margins": 6.217504978179932, "rewards/rejected": -7.371927261352539, "step": 288 }, { "epoch": 3.4201183431952664, "grad_norm": 10.230292373857246, "learning_rate": 3.5223821812655903e-07, "logits/chosen": -0.9817609190940857, "logits/rejected": -1.056479811668396, "logps/chosen": -42.391998291015625, "logps/rejected": -52.13397979736328, "loss": 0.0515, "rewards/accuracies": 1.0, "rewards/chosen": -1.1262445449829102, "rewards/margins": 5.622671604156494, "rewards/rejected": -6.748915672302246, "step": 289 }, { "epoch": 3.4319526627218933, "grad_norm": 11.408367741978596, "learning_rate": 3.510502189703954e-07, "logits/chosen": -0.8357728719711304, "logits/rejected": -0.7616229057312012, "logps/chosen": -42.24382781982422, "logps/rejected": -66.47443389892578, "loss": 0.0657, "rewards/accuracies": 1.0, "rewards/chosen": -0.8911622762680054, "rewards/margins": 6.7061848640441895, "rewards/rejected": -7.597347259521484, "step": 290 }, { "epoch": 3.4437869822485205, "grad_norm": 12.561936917149565, "learning_rate": 3.4985948604313237e-07, "logits/chosen": -0.8755354285240173, "logits/rejected": -0.8271730542182922, "logps/chosen": -31.485605239868164, "logps/rejected": -58.347190856933594, "loss": 0.0752, "rewards/accuracies": 0.9375, "rewards/chosen": 0.19215503334999084, "rewards/margins": 5.973523139953613, "rewards/rejected": -5.781367301940918, "step": 291 }, { "epoch": 3.455621301775148, "grad_norm": 11.955176243577917, "learning_rate": 3.486660515583691e-07, "logits/chosen": -1.228103518486023, "logits/rejected": -1.2567815780639648, "logps/chosen": -41.65220260620117, "logps/rejected": -58.665496826171875, "loss": 0.0598, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3302292823791504, "rewards/margins": 5.375921726226807, "rewards/rejected": -6.706151008605957, "step": 292 }, { "epoch": 3.467455621301775, "grad_norm": 10.442695814707806, "learning_rate": 3.474699478027918e-07, "logits/chosen": -1.121075987815857, "logits/rejected": -1.162339687347412, "logps/chosen": -40.622459411621094, "logps/rejected": -55.069026947021484, "loss": 0.0514, "rewards/accuracies": 1.0, "rewards/chosen": -0.2291250228881836, "rewards/margins": 5.634562015533447, "rewards/rejected": -5.863687515258789, "step": 293 }, { "epoch": 3.4792899408284024, "grad_norm": 9.212432774516737, "learning_rate": 3.4627120713529983e-07, "logits/chosen": -1.1274569034576416, "logits/rejected": -1.0957834720611572, "logps/chosen": -29.82750129699707, "logps/rejected": -60.58903503417969, "loss": 0.0459, "rewards/accuracies": 1.0, "rewards/chosen": -0.8590860366821289, "rewards/margins": 6.390963554382324, "rewards/rejected": -7.250050067901611, "step": 294 }, { "epoch": 3.4911242603550297, "grad_norm": 8.987438061891513, "learning_rate": 3.4506986198613077e-07, "logits/chosen": -1.041571855545044, "logits/rejected": -1.0537890195846558, "logps/chosen": -41.13783264160156, "logps/rejected": -72.97576904296875, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": -0.5465377569198608, "rewards/margins": 7.734911918640137, "rewards/rejected": -8.281450271606445, "step": 295 }, { "epoch": 3.502958579881657, "grad_norm": 7.117159631612635, "learning_rate": 3.438659448559825e-07, "logits/chosen": -1.280914306640625, "logits/rejected": -1.3170225620269775, "logps/chosen": -36.09680938720703, "logps/rejected": -57.332427978515625, "loss": 0.0397, "rewards/accuracies": 1.0, "rewards/chosen": -0.9314979910850525, "rewards/margins": 7.998897075653076, "rewards/rejected": -8.930395126342773, "step": 296 }, { "epoch": 3.5147928994082838, "grad_norm": 9.213072401647429, "learning_rate": 3.4265948831513434e-07, "logits/chosen": -1.0224130153656006, "logits/rejected": -1.0106931924819946, "logps/chosen": -51.425453186035156, "logps/rejected": -59.91952896118164, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": -0.6506232023239136, "rewards/margins": 6.213413715362549, "rewards/rejected": -6.864037036895752, "step": 297 }, { "epoch": 3.5266272189349115, "grad_norm": 8.34764646763322, "learning_rate": 3.414505250025659e-07, "logits/chosen": -0.5949307680130005, "logits/rejected": -0.6818762421607971, "logps/chosen": -31.321874618530273, "logps/rejected": -45.574928283691406, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": -0.6341886520385742, "rewards/margins": 4.197469234466553, "rewards/rejected": -4.831657886505127, "step": 298 }, { "epoch": 3.5384615384615383, "grad_norm": 9.531733325613454, "learning_rate": 3.402390876250737e-07, "logits/chosen": -0.9537699222564697, "logits/rejected": -0.8830502033233643, "logps/chosen": -43.555381774902344, "logps/rejected": -60.58173751831055, "loss": 0.0451, "rewards/accuracies": 1.0, "rewards/chosen": -1.3789546489715576, "rewards/margins": 5.775010585784912, "rewards/rejected": -7.153965473175049, "step": 299 }, { "epoch": 3.5502958579881656, "grad_norm": 9.379884768180517, "learning_rate": 3.390252089563867e-07, "logits/chosen": -1.3647562265396118, "logits/rejected": -1.2569482326507568, "logps/chosen": -32.759376525878906, "logps/rejected": -46.86608123779297, "loss": 0.0402, "rewards/accuracies": 0.9375, "rewards/chosen": -0.663446307182312, "rewards/margins": 4.781562805175781, "rewards/rejected": -5.445009231567383, "step": 300 }, { "epoch": 3.562130177514793, "grad_norm": 6.852898201829805, "learning_rate": 3.3780892183627974e-07, "logits/chosen": -1.2863893508911133, "logits/rejected": -1.256026029586792, "logps/chosen": -45.72820281982422, "logps/rejected": -76.36709594726562, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": -0.662516713142395, "rewards/margins": 7.760738372802734, "rewards/rejected": -8.423254013061523, "step": 301 }, { "epoch": 3.57396449704142, "grad_norm": 9.656007482244595, "learning_rate": 3.3659025916968475e-07, "logits/chosen": -1.1637496948242188, "logits/rejected": -1.0999524593353271, "logps/chosen": -40.303245544433594, "logps/rejected": -65.28178405761719, "loss": 0.048, "rewards/accuracies": 1.0, "rewards/chosen": -1.500770092010498, "rewards/margins": 6.479090690612793, "rewards/rejected": -7.979861259460449, "step": 302 }, { "epoch": 3.5857988165680474, "grad_norm": 6.56810361487223, "learning_rate": 3.353692539258006e-07, "logits/chosen": -1.1888104677200317, "logits/rejected": -1.1970776319503784, "logps/chosen": -55.63449478149414, "logps/rejected": -79.99554443359375, "loss": 0.0402, "rewards/accuracies": 1.0, "rewards/chosen": -1.9003673791885376, "rewards/margins": 7.262213706970215, "rewards/rejected": -9.162581443786621, "step": 303 }, { "epoch": 3.5976331360946747, "grad_norm": 10.490981950638373, "learning_rate": 3.3414593913720155e-07, "logits/chosen": -0.9099432229995728, "logits/rejected": -0.8601805567741394, "logps/chosen": -41.21411895751953, "logps/rejected": -65.25252532958984, "loss": 0.0569, "rewards/accuracies": 1.0, "rewards/chosen": -1.1893513202667236, "rewards/margins": 5.567013740539551, "rewards/rejected": -6.756364822387695, "step": 304 }, { "epoch": 3.609467455621302, "grad_norm": 6.610277331758854, "learning_rate": 3.329203478989431e-07, "logits/chosen": -1.0593793392181396, "logits/rejected": -1.0052438974380493, "logps/chosen": -36.38404083251953, "logps/rejected": -59.90925598144531, "loss": 0.0423, "rewards/accuracies": 1.0, "rewards/chosen": -1.4144278764724731, "rewards/margins": 5.430604457855225, "rewards/rejected": -6.845032215118408, "step": 305 }, { "epoch": 3.621301775147929, "grad_norm": 9.012951266000211, "learning_rate": 3.3169251336766697e-07, "logits/chosen": -1.0458168983459473, "logits/rejected": -1.012871265411377, "logps/chosen": -35.66429138183594, "logps/rejected": -57.909767150878906, "loss": 0.0446, "rewards/accuracies": 1.0, "rewards/chosen": -1.3210654258728027, "rewards/margins": 6.225252151489258, "rewards/rejected": -7.546317100524902, "step": 306 }, { "epoch": 3.633136094674556, "grad_norm": 10.588611564681699, "learning_rate": 3.3046246876070405e-07, "logits/chosen": -1.1131824254989624, "logits/rejected": -1.117433786392212, "logps/chosen": -41.028900146484375, "logps/rejected": -58.1406135559082, "loss": 0.0481, "rewards/accuracies": 1.0, "rewards/chosen": -0.07064366340637207, "rewards/margins": 6.420350551605225, "rewards/rejected": -6.490993976593018, "step": 307 }, { "epoch": 3.6449704142011834, "grad_norm": 10.357494650059015, "learning_rate": 3.2923024735517567e-07, "logits/chosen": -1.2510991096496582, "logits/rejected": -1.169487476348877, "logps/chosen": -49.32808303833008, "logps/rejected": -77.26386260986328, "loss": 0.0458, "rewards/accuracies": 1.0, "rewards/chosen": -1.1415975093841553, "rewards/margins": 8.103403091430664, "rewards/rejected": -9.245000839233398, "step": 308 }, { "epoch": 3.6568047337278107, "grad_norm": 10.469013609843655, "learning_rate": 3.279958824870934e-07, "logits/chosen": -0.921501100063324, "logits/rejected": -0.8510603308677673, "logps/chosen": -37.00920486450195, "logps/rejected": -50.785682678222656, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": 0.2592521011829376, "rewards/margins": 3.4533708095550537, "rewards/rejected": -3.1941187381744385, "step": 309 }, { "epoch": 3.668639053254438, "grad_norm": 8.143849115083505, "learning_rate": 3.2675940755045713e-07, "logits/chosen": -1.1272180080413818, "logits/rejected": -1.1566120386123657, "logps/chosen": -41.058563232421875, "logps/rejected": -59.50581741333008, "loss": 0.0535, "rewards/accuracies": 1.0, "rewards/chosen": -0.3426307439804077, "rewards/margins": 5.675926685333252, "rewards/rejected": -6.018557548522949, "step": 310 }, { "epoch": 3.6804733727810652, "grad_norm": 7.797821676664401, "learning_rate": 3.2552085599635167e-07, "logits/chosen": -1.2833009958267212, "logits/rejected": -1.1694531440734863, "logps/chosen": -32.42615509033203, "logps/rejected": -64.32304382324219, "loss": 0.0376, "rewards/accuracies": 1.0, "rewards/chosen": -1.2963796854019165, "rewards/margins": 6.591299533843994, "rewards/rejected": -7.887678623199463, "step": 311 }, { "epoch": 3.6923076923076925, "grad_norm": 6.89356963474552, "learning_rate": 3.242802613320418e-07, "logits/chosen": -1.393490195274353, "logits/rejected": -1.3847185373306274, "logps/chosen": -41.63416290283203, "logps/rejected": -70.63141632080078, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": -0.9141947031021118, "rewards/margins": 7.590588569641113, "rewards/rejected": -8.504783630371094, "step": 312 }, { "epoch": 3.7041420118343193, "grad_norm": 11.222653962104593, "learning_rate": 3.2303765712006585e-07, "logits/chosen": -1.1832377910614014, "logits/rejected": -1.1547398567199707, "logps/chosen": -40.68590545654297, "logps/rejected": -70.75530242919922, "loss": 0.0511, "rewards/accuracies": 1.0, "rewards/chosen": -0.7116194367408752, "rewards/margins": 5.869533538818359, "rewards/rejected": -6.58115291595459, "step": 313 }, { "epoch": 3.7159763313609466, "grad_norm": 6.708240807361137, "learning_rate": 3.217930769773275e-07, "logits/chosen": -1.2435672283172607, "logits/rejected": -1.035918951034546, "logps/chosen": -50.29835891723633, "logps/rejected": -74.53111267089844, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": -0.17033040523529053, "rewards/margins": 6.697113513946533, "rewards/rejected": -6.867444038391113, "step": 314 }, { "epoch": 3.727810650887574, "grad_norm": 9.42387101375972, "learning_rate": 3.2054655457418647e-07, "logits/chosen": -1.0515731573104858, "logits/rejected": -1.0346689224243164, "logps/chosen": -35.189369201660156, "logps/rejected": -55.9654655456543, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": -0.5117961168289185, "rewards/margins": 5.10178804397583, "rewards/rejected": -5.613584041595459, "step": 315 }, { "epoch": 3.739644970414201, "grad_norm": 6.516712675879623, "learning_rate": 3.1929812363354764e-07, "logits/chosen": -1.3077701330184937, "logits/rejected": -1.1945271492004395, "logps/chosen": -35.00279235839844, "logps/rejected": -54.25492477416992, "loss": 0.0313, "rewards/accuracies": 1.0, "rewards/chosen": -0.9486129283905029, "rewards/margins": 5.082883834838867, "rewards/rejected": -6.031496524810791, "step": 316 }, { "epoch": 3.7514792899408285, "grad_norm": 8.087941236249774, "learning_rate": 3.1804781792994867e-07, "logits/chosen": -1.1108276844024658, "logits/rejected": -1.2471861839294434, "logps/chosen": -41.833309173583984, "logps/rejected": -61.84939193725586, "loss": 0.0482, "rewards/accuracies": 1.0, "rewards/chosen": -1.742074728012085, "rewards/margins": 7.079897403717041, "rewards/rejected": -8.821971893310547, "step": 317 }, { "epoch": 3.7633136094674557, "grad_norm": 10.801590352605164, "learning_rate": 3.167956712886463e-07, "logits/chosen": -1.373018503189087, "logits/rejected": -1.1945630311965942, "logps/chosen": -30.055692672729492, "logps/rejected": -56.957427978515625, "loss": 0.053, "rewards/accuracies": 1.0, "rewards/chosen": -0.4738597273826599, "rewards/margins": 5.641894817352295, "rewards/rejected": -6.115754127502441, "step": 318 }, { "epoch": 3.775147928994083, "grad_norm": 10.24310089782514, "learning_rate": 3.155417175847011e-07, "logits/chosen": -1.0630590915679932, "logits/rejected": -1.0347498655319214, "logps/chosen": -32.52122497558594, "logps/rejected": -51.32162857055664, "loss": 0.0591, "rewards/accuracies": 1.0, "rewards/chosen": -0.5487163066864014, "rewards/margins": 4.911801815032959, "rewards/rejected": -5.460517883300781, "step": 319 }, { "epoch": 3.78698224852071, "grad_norm": 6.870366909127477, "learning_rate": 3.142859907420615e-07, "logits/chosen": -1.1960430145263672, "logits/rejected": -1.1303733587265015, "logps/chosen": -41.18871307373047, "logps/rejected": -58.43561553955078, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": -0.28152644634246826, "rewards/margins": 5.013343811035156, "rewards/rejected": -5.294870376586914, "step": 320 }, { "epoch": 3.798816568047337, "grad_norm": 8.050195637302277, "learning_rate": 3.1302852473264537e-07, "logits/chosen": -0.8659465312957764, "logits/rejected": -0.8467557430267334, "logps/chosen": -33.895164489746094, "logps/rejected": -50.80845642089844, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": 0.08278264105319977, "rewards/margins": 4.752477169036865, "rewards/rejected": -4.669694900512695, "step": 321 }, { "epoch": 3.8106508875739644, "grad_norm": 8.069576088623236, "learning_rate": 3.117693535754213e-07, "logits/chosen": -0.9457200765609741, "logits/rejected": -1.0279572010040283, "logps/chosen": -37.87699890136719, "logps/rejected": -58.80052185058594, "loss": 0.0452, "rewards/accuracies": 1.0, "rewards/chosen": -1.5841734409332275, "rewards/margins": 5.943948268890381, "rewards/rejected": -7.528121471405029, "step": 322 }, { "epoch": 3.8224852071005917, "grad_norm": 9.494312396980359, "learning_rate": 3.105085113354885e-07, "logits/chosen": -1.0865830183029175, "logits/rejected": -1.06438410282135, "logps/chosen": -34.925296783447266, "logps/rejected": -50.69199752807617, "loss": 0.0619, "rewards/accuracies": 1.0, "rewards/chosen": -0.6583223342895508, "rewards/margins": 5.800050735473633, "rewards/rejected": -6.458373069763184, "step": 323 }, { "epoch": 3.834319526627219, "grad_norm": 10.61028311721643, "learning_rate": 3.092460321231547e-07, "logits/chosen": -1.1025853157043457, "logits/rejected": -1.3029682636260986, "logps/chosen": -45.37340545654297, "logps/rejected": -57.43135452270508, "loss": 0.0482, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6712946891784668, "rewards/margins": 6.542860984802246, "rewards/rejected": -8.214155197143555, "step": 324 }, { "epoch": 3.8461538461538463, "grad_norm": 9.127446196094581, "learning_rate": 3.079819500930138e-07, "logits/chosen": -1.025221824645996, "logits/rejected": -1.021698236465454, "logps/chosen": -35.102596282958984, "logps/rejected": -62.06328582763672, "loss": 0.0451, "rewards/accuracies": 1.0, "rewards/chosen": -0.45298677682876587, "rewards/margins": 5.769662857055664, "rewards/rejected": -6.222650051116943, "step": 325 }, { "epoch": 3.8579881656804735, "grad_norm": 12.29389361531357, "learning_rate": 3.0671629944302164e-07, "logits/chosen": -1.0510854721069336, "logits/rejected": -0.951848566532135, "logps/chosen": -36.842681884765625, "logps/rejected": -58.03953552246094, "loss": 0.0686, "rewards/accuracies": 1.0, "rewards/chosen": -0.3583628535270691, "rewards/margins": 6.295431613922119, "rewards/rejected": -6.653794288635254, "step": 326 }, { "epoch": 3.8698224852071004, "grad_norm": 6.750607959887297, "learning_rate": 3.054491144135707e-07, "logits/chosen": -1.186820387840271, "logits/rejected": -1.2949779033660889, "logps/chosen": -38.38534164428711, "logps/rejected": -59.20850372314453, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": -0.5839760303497314, "rewards/margins": 7.092982292175293, "rewards/rejected": -7.6769585609436035, "step": 327 }, { "epoch": 3.8816568047337277, "grad_norm": 11.006787084671119, "learning_rate": 3.0418042928656415e-07, "logits/chosen": -0.9911553859710693, "logits/rejected": -1.0456457138061523, "logps/chosen": -46.48586654663086, "logps/rejected": -65.22984313964844, "loss": 0.0537, "rewards/accuracies": 1.0, "rewards/chosen": -0.38870108127593994, "rewards/margins": 6.2278571128845215, "rewards/rejected": -6.616558074951172, "step": 328 }, { "epoch": 3.893491124260355, "grad_norm": 10.159592627032264, "learning_rate": 3.029102783844879e-07, "logits/chosen": -0.9870089292526245, "logits/rejected": -1.0301330089569092, "logps/chosen": -43.52060317993164, "logps/rejected": -66.05502319335938, "loss": 0.0515, "rewards/accuracies": 1.0, "rewards/chosen": -1.344921350479126, "rewards/margins": 7.7191667556762695, "rewards/rejected": -9.064088821411133, "step": 329 }, { "epoch": 3.905325443786982, "grad_norm": 9.8965394155921, "learning_rate": 3.016386960694827e-07, "logits/chosen": -1.0783404111862183, "logits/rejected": -1.1657443046569824, "logps/chosen": -35.86804962158203, "logps/rejected": -49.75996017456055, "loss": 0.0476, "rewards/accuracies": 1.0, "rewards/chosen": -0.5152837038040161, "rewards/margins": 5.677330017089844, "rewards/rejected": -6.1926140785217285, "step": 330 }, { "epoch": 3.9171597633136095, "grad_norm": 7.647123014651004, "learning_rate": 3.003657167424139e-07, "logits/chosen": -1.116632103919983, "logits/rejected": -1.007010579109192, "logps/chosen": -27.30445671081543, "logps/rejected": -46.01488494873047, "loss": 0.0448, "rewards/accuracies": 1.0, "rewards/chosen": -0.475461483001709, "rewards/margins": 4.353217601776123, "rewards/rejected": -4.828679084777832, "step": 331 }, { "epoch": 3.9289940828402368, "grad_norm": 7.370007475987176, "learning_rate": 2.990913748419411e-07, "logits/chosen": -1.0164567232131958, "logits/rejected": -1.099671483039856, "logps/chosen": -42.57475662231445, "logps/rejected": -57.868221282958984, "loss": 0.0329, "rewards/accuracies": 1.0, "rewards/chosen": -0.8491787910461426, "rewards/margins": 5.017266750335693, "rewards/rejected": -5.866445541381836, "step": 332 }, { "epoch": 3.940828402366864, "grad_norm": 11.13700718226015, "learning_rate": 2.978157048435863e-07, "logits/chosen": -1.2751820087432861, "logits/rejected": -1.3596327304840088, "logps/chosen": -44.9207878112793, "logps/rejected": -69.44329833984375, "loss": 0.0585, "rewards/accuracies": 1.0, "rewards/chosen": -1.0238862037658691, "rewards/margins": 6.633727073669434, "rewards/rejected": -7.657613754272461, "step": 333 }, { "epoch": 3.952662721893491, "grad_norm": 9.53101226669104, "learning_rate": 2.9653874125880167e-07, "logits/chosen": -1.184804081916809, "logits/rejected": -1.2233521938323975, "logps/chosen": -33.34724807739258, "logps/rejected": -57.77789306640625, "loss": 0.041, "rewards/accuracies": 1.0, "rewards/chosen": -0.6291203498840332, "rewards/margins": 6.075474739074707, "rewards/rejected": -6.704594612121582, "step": 334 }, { "epoch": 3.9644970414201186, "grad_norm": 7.607623067226138, "learning_rate": 2.9526051863403517e-07, "logits/chosen": -0.9798343181610107, "logits/rejected": -1.0512641668319702, "logps/chosen": -33.80629348754883, "logps/rejected": -59.37635803222656, "loss": 0.0379, "rewards/accuracies": 1.0, "rewards/chosen": 0.18926861882209778, "rewards/margins": 6.627708435058594, "rewards/rejected": -6.438440322875977, "step": 335 }, { "epoch": 3.9763313609467454, "grad_norm": 8.807254950481648, "learning_rate": 2.9398107154979634e-07, "logits/chosen": -1.0378540754318237, "logits/rejected": -1.236304759979248, "logps/chosen": -53.999755859375, "logps/rejected": -71.97964477539062, "loss": 0.048, "rewards/accuracies": 1.0, "rewards/chosen": -2.30962872505188, "rewards/margins": 8.442242622375488, "rewards/rejected": -10.751871109008789, "step": 336 }, { "epoch": 3.9881656804733727, "grad_norm": 7.885596351737529, "learning_rate": 2.9270043461972097e-07, "logits/chosen": -1.066207766532898, "logits/rejected": -1.1475964784622192, "logps/chosen": -51.98231506347656, "logps/rejected": -71.3375244140625, "loss": 0.0408, "rewards/accuracies": 1.0, "rewards/chosen": -0.8781924247741699, "rewards/margins": 7.75531005859375, "rewards/rejected": -8.633502006530762, "step": 337 }, { "epoch": 4.0, "grad_norm": 9.210155965565953, "learning_rate": 2.9141864248963427e-07, "logits/chosen": -1.214263916015625, "logits/rejected": -1.1584827899932861, "logps/chosen": -45.34033966064453, "logps/rejected": -73.4906005859375, "loss": 0.0458, "rewards/accuracies": 1.0, "rewards/chosen": -1.079200029373169, "rewards/margins": 5.90239143371582, "rewards/rejected": -6.98159122467041, "step": 338 }, { "epoch": 4.011834319526627, "grad_norm": 6.508446991164881, "learning_rate": 2.9013572983661375e-07, "logits/chosen": -1.2331136465072632, "logits/rejected": -1.3492088317871094, "logps/chosen": -40.93061828613281, "logps/rejected": -59.18800354003906, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": -0.27319905161857605, "rewards/margins": 7.39341926574707, "rewards/rejected": -7.666618347167969, "step": 339 }, { "epoch": 4.023668639053255, "grad_norm": 7.338060837553236, "learning_rate": 2.8885173136805125e-07, "logits/chosen": -1.226414442062378, "logits/rejected": -1.2364081144332886, "logps/chosen": -43.55427551269531, "logps/rejected": -65.36073303222656, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": -1.8232107162475586, "rewards/margins": 7.595679759979248, "rewards/rejected": -9.418889999389648, "step": 340 }, { "epoch": 4.035502958579881, "grad_norm": 4.628023712795517, "learning_rate": 2.8756668182071357e-07, "logits/chosen": -1.180713176727295, "logits/rejected": -1.1071914434432983, "logps/chosen": -41.24338150024414, "logps/rejected": -62.83121109008789, "loss": 0.0232, "rewards/accuracies": 1.0, "rewards/chosen": -1.6221990585327148, "rewards/margins": 6.7731523513793945, "rewards/rejected": -8.39535140991211, "step": 341 }, { "epoch": 4.047337278106509, "grad_norm": 6.104842751233805, "learning_rate": 2.862806159598032e-07, "logits/chosen": -0.9698923826217651, "logits/rejected": -0.9841170310974121, "logps/chosen": -39.91763687133789, "logps/rejected": -61.13340759277344, "loss": 0.0302, "rewards/accuracies": 1.0, "rewards/chosen": -1.4580131769180298, "rewards/margins": 5.857044219970703, "rewards/rejected": -7.315057754516602, "step": 342 }, { "epoch": 4.059171597633136, "grad_norm": 6.103306838069507, "learning_rate": 2.8499356857801744e-07, "logits/chosen": -1.0496821403503418, "logits/rejected": -1.0485060214996338, "logps/chosen": -36.92170715332031, "logps/rejected": -52.14490509033203, "loss": 0.0283, "rewards/accuracies": 1.0, "rewards/chosen": -1.119423747062683, "rewards/margins": 5.038466453552246, "rewards/rejected": -6.1578898429870605, "step": 343 }, { "epoch": 4.071005917159764, "grad_norm": 5.23129093382033, "learning_rate": 2.837055744946072e-07, "logits/chosen": -1.3063690662384033, "logits/rejected": -1.1161246299743652, "logps/chosen": -37.174598693847656, "logps/rejected": -70.67867279052734, "loss": 0.0267, "rewards/accuracies": 1.0, "rewards/chosen": -1.7968075275421143, "rewards/margins": 8.80527114868164, "rewards/rejected": -10.602078437805176, "step": 344 }, { "epoch": 4.0828402366863905, "grad_norm": 5.119583740628856, "learning_rate": 2.8241666855443526e-07, "logits/chosen": -1.1465834379196167, "logits/rejected": -0.9921385049819946, "logps/chosen": -43.1416015625, "logps/rejected": -62.68724822998047, "loss": 0.0263, "rewards/accuracies": 1.0, "rewards/chosen": -0.9080947637557983, "rewards/margins": 6.91895866394043, "rewards/rejected": -7.827053070068359, "step": 345 }, { "epoch": 4.094674556213017, "grad_norm": 7.489855061260909, "learning_rate": 2.811268856270332e-07, "logits/chosen": -1.083118200302124, "logits/rejected": -1.002679467201233, "logps/chosen": -38.36323547363281, "logps/rejected": -67.2934799194336, "loss": 0.0394, "rewards/accuracies": 1.0, "rewards/chosen": -0.746131181716919, "rewards/margins": 6.556519508361816, "rewards/rejected": -7.302650451660156, "step": 346 }, { "epoch": 4.106508875739645, "grad_norm": 5.683245039375868, "learning_rate": 2.798362606056583e-07, "logits/chosen": -1.0683708190917969, "logits/rejected": -1.0461918115615845, "logps/chosen": -44.84065628051758, "logps/rejected": -65.26290130615234, "loss": 0.024, "rewards/accuracies": 1.0, "rewards/chosen": -1.4837861061096191, "rewards/margins": 6.347103118896484, "rewards/rejected": -7.8308892250061035, "step": 347 }, { "epoch": 4.118343195266272, "grad_norm": 6.990533138399409, "learning_rate": 2.7854482840634965e-07, "logits/chosen": -1.2399723529815674, "logits/rejected": -1.242792010307312, "logps/chosen": -35.35354995727539, "logps/rejected": -50.29901123046875, "loss": 0.0362, "rewards/accuracies": 1.0, "rewards/chosen": -0.814429759979248, "rewards/margins": 4.70147180557251, "rewards/rejected": -5.5159010887146, "step": 348 }, { "epoch": 4.1301775147929, "grad_norm": 6.319829009469765, "learning_rate": 2.772526239669831e-07, "logits/chosen": -1.2442165613174438, "logits/rejected": -1.2576425075531006, "logps/chosen": -41.020626068115234, "logps/rejected": -71.2080078125, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": -0.1832388937473297, "rewards/margins": 6.497980117797852, "rewards/rejected": -6.681219100952148, "step": 349 }, { "epoch": 4.1420118343195265, "grad_norm": 7.137478103836617, "learning_rate": 2.759596822463267e-07, "logits/chosen": -1.0815571546554565, "logits/rejected": -1.0727592706680298, "logps/chosen": -45.82701110839844, "logps/rejected": -74.0084228515625, "loss": 0.032, "rewards/accuracies": 1.0, "rewards/chosen": -1.5913039445877075, "rewards/margins": 7.610474586486816, "rewards/rejected": -9.201778411865234, "step": 350 }, { "epoch": 4.153846153846154, "grad_norm": 6.127060479740991, "learning_rate": 2.746660382230944e-07, "logits/chosen": -1.1690300703048706, "logits/rejected": -1.1895644664764404, "logps/chosen": -31.055179595947266, "logps/rejected": -52.40230178833008, "loss": 0.0282, "rewards/accuracies": 1.0, "rewards/chosen": -1.1996252536773682, "rewards/margins": 6.068203926086426, "rewards/rejected": -7.267828941345215, "step": 351 }, { "epoch": 4.165680473372781, "grad_norm": 6.941829346693308, "learning_rate": 2.73371726895e-07, "logits/chosen": -1.4027228355407715, "logits/rejected": -1.4094792604446411, "logps/chosen": -36.16400909423828, "logps/rejected": -54.41309356689453, "loss": 0.0423, "rewards/accuracies": 1.0, "rewards/chosen": -1.0032942295074463, "rewards/margins": 5.790349960327148, "rewards/rejected": -6.793643951416016, "step": 352 }, { "epoch": 4.177514792899408, "grad_norm": 5.664169120525594, "learning_rate": 2.7207678327781036e-07, "logits/chosen": -1.0156735181808472, "logits/rejected": -1.0846226215362549, "logps/chosen": -41.97883605957031, "logps/rejected": -58.83436584472656, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": -0.6752943396568298, "rewards/margins": 6.577206611633301, "rewards/rejected": -7.252501487731934, "step": 353 }, { "epoch": 4.189349112426036, "grad_norm": 4.278910321706378, "learning_rate": 2.7078124240439793e-07, "logits/chosen": -1.4016739130020142, "logits/rejected": -1.3881572484970093, "logps/chosen": -40.908294677734375, "logps/rejected": -81.16354370117188, "loss": 0.0216, "rewards/accuracies": 1.0, "rewards/chosen": -1.1897321939468384, "rewards/margins": 9.733206748962402, "rewards/rejected": -10.92293930053711, "step": 354 }, { "epoch": 4.201183431952662, "grad_norm": 5.662906607186885, "learning_rate": 2.6948513932379307e-07, "logits/chosen": -1.127701997756958, "logits/rejected": -1.2144131660461426, "logps/chosen": -40.303466796875, "logps/rejected": -53.539791107177734, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": -0.6169775724411011, "rewards/margins": 5.929780006408691, "rewards/rejected": -6.54675817489624, "step": 355 }, { "epoch": 4.21301775147929, "grad_norm": 5.869401589278453, "learning_rate": 2.68188509100236e-07, "logits/chosen": -1.1310701370239258, "logits/rejected": -0.9885028004646301, "logps/chosen": -43.84556579589844, "logps/rejected": -67.79801940917969, "loss": 0.0368, "rewards/accuracies": 1.0, "rewards/chosen": -0.5635640621185303, "rewards/margins": 6.635869979858398, "rewards/rejected": -7.199433326721191, "step": 356 }, { "epoch": 4.224852071005917, "grad_norm": 6.4938093501862335, "learning_rate": 2.668913868122279e-07, "logits/chosen": -1.145686388015747, "logits/rejected": -1.1048319339752197, "logps/chosen": -38.25225830078125, "logps/rejected": -84.31645965576172, "loss": 0.0317, "rewards/accuracies": 1.0, "rewards/chosen": -0.9592878222465515, "rewards/margins": 8.200289726257324, "rewards/rejected": -9.159577369689941, "step": 357 }, { "epoch": 4.236686390532545, "grad_norm": 5.3318941750637805, "learning_rate": 2.6559380755158206e-07, "logits/chosen": -1.306983470916748, "logits/rejected": -1.3083993196487427, "logps/chosen": -32.1518440246582, "logps/rejected": -57.46953582763672, "loss": 0.0212, "rewards/accuracies": 1.0, "rewards/chosen": -1.3908557891845703, "rewards/margins": 6.340449810028076, "rewards/rejected": -7.731306076049805, "step": 358 }, { "epoch": 4.2485207100591715, "grad_norm": 4.583272241534079, "learning_rate": 2.642958064224747e-07, "logits/chosen": -1.1370916366577148, "logits/rejected": -1.223602056503296, "logps/chosen": -45.049015045166016, "logps/rejected": -67.99398803710938, "loss": 0.0222, "rewards/accuracies": 1.0, "rewards/chosen": -0.5301861763000488, "rewards/margins": 7.891375541687012, "rewards/rejected": -8.421562194824219, "step": 359 }, { "epoch": 4.260355029585799, "grad_norm": 5.8869583103527505, "learning_rate": 2.629974185404951e-07, "logits/chosen": -1.142185926437378, "logits/rejected": -1.0663723945617676, "logps/chosen": -54.66786193847656, "logps/rejected": -72.61735534667969, "loss": 0.0292, "rewards/accuracies": 1.0, "rewards/chosen": -2.0223376750946045, "rewards/margins": 5.790317535400391, "rewards/rejected": -7.812655448913574, "step": 360 }, { "epoch": 4.272189349112426, "grad_norm": 5.434403797417726, "learning_rate": 2.616986790316952e-07, "logits/chosen": -1.084727168083191, "logits/rejected": -1.000258207321167, "logps/chosen": -41.456581115722656, "logps/rejected": -65.51860046386719, "loss": 0.0308, "rewards/accuracies": 1.0, "rewards/chosen": -0.9817674160003662, "rewards/margins": 6.707640647888184, "rewards/rejected": -7.689408302307129, "step": 361 }, { "epoch": 4.284023668639053, "grad_norm": 7.370310021015268, "learning_rate": 2.603996230316402e-07, "logits/chosen": -0.9062331318855286, "logits/rejected": -1.0064326524734497, "logps/chosen": -37.973838806152344, "logps/rejected": -57.57244873046875, "loss": 0.0387, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2495323419570923, "rewards/margins": 6.539309501647949, "rewards/rejected": -7.78884220123291, "step": 362 }, { "epoch": 4.295857988165681, "grad_norm": 4.712912974501213, "learning_rate": 2.5910028568445716e-07, "logits/chosen": -1.0892990827560425, "logits/rejected": -0.9764453172683716, "logps/chosen": -36.994937896728516, "logps/rejected": -60.98323059082031, "loss": 0.0238, "rewards/accuracies": 1.0, "rewards/chosen": -0.5305968523025513, "rewards/margins": 6.173823356628418, "rewards/rejected": -6.704420566558838, "step": 363 }, { "epoch": 4.3076923076923075, "grad_norm": 8.269082594101617, "learning_rate": 2.5780070214188474e-07, "logits/chosen": -1.2240197658538818, "logits/rejected": -1.2403483390808105, "logps/chosen": -44.62925720214844, "logps/rejected": -68.63853454589844, "loss": 0.0348, "rewards/accuracies": 1.0, "rewards/chosen": -1.3063997030258179, "rewards/margins": 9.024595260620117, "rewards/rejected": -10.330994606018066, "step": 364 }, { "epoch": 4.319526627218935, "grad_norm": 7.007032072350081, "learning_rate": 2.5650090756232226e-07, "logits/chosen": -0.9315057992935181, "logits/rejected": -0.9429672360420227, "logps/chosen": -41.07881546020508, "logps/rejected": -55.53737258911133, "loss": 0.0291, "rewards/accuracies": 1.0, "rewards/chosen": -1.0261346101760864, "rewards/margins": 5.170999526977539, "rewards/rejected": -6.197134017944336, "step": 365 }, { "epoch": 4.331360946745562, "grad_norm": 6.395251337450572, "learning_rate": 2.552009371098778e-07, "logits/chosen": -1.404557466506958, "logits/rejected": -1.3076186180114746, "logps/chosen": -31.61805534362793, "logps/rejected": -48.43040084838867, "loss": 0.0315, "rewards/accuracies": 1.0, "rewards/chosen": -0.18829268217086792, "rewards/margins": 5.8097405433654785, "rewards/rejected": -5.998033046722412, "step": 366 }, { "epoch": 4.34319526627219, "grad_norm": 3.8677234721220275, "learning_rate": 2.5390082595341816e-07, "logits/chosen": -1.1186158657073975, "logits/rejected": -1.1845828294754028, "logps/chosen": -30.134925842285156, "logps/rejected": -55.77180480957031, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": 0.2170889675617218, "rewards/margins": 6.981093883514404, "rewards/rejected": -6.764005661010742, "step": 367 }, { "epoch": 4.355029585798817, "grad_norm": 6.159683736078311, "learning_rate": 2.5260060926561604e-07, "logits/chosen": -1.056211233139038, "logits/rejected": -1.0722953081130981, "logps/chosen": -38.946746826171875, "logps/rejected": -69.67351531982422, "loss": 0.0319, "rewards/accuracies": 1.0, "rewards/chosen": -1.082647681236267, "rewards/margins": 6.603187561035156, "rewards/rejected": -7.685835361480713, "step": 368 }, { "epoch": 4.366863905325443, "grad_norm": 6.5279554127734745, "learning_rate": 2.5130032222199954e-07, "logits/chosen": -1.2777047157287598, "logits/rejected": -1.230380654335022, "logps/chosen": -44.25871658325195, "logps/rejected": -68.12265014648438, "loss": 0.0321, "rewards/accuracies": 1.0, "rewards/chosen": -1.8238017559051514, "rewards/margins": 7.426830291748047, "rewards/rejected": -9.250631332397461, "step": 369 }, { "epoch": 4.378698224852071, "grad_norm": 5.951315298701114, "learning_rate": 2.5e-07, "logits/chosen": -1.1897178888320923, "logits/rejected": -1.1760835647583008, "logps/chosen": -50.80073928833008, "logps/rejected": -85.21668243408203, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": -1.2594873905181885, "rewards/margins": 10.383180618286133, "rewards/rejected": -11.642667770385742, "step": 370 }, { "epoch": 4.390532544378698, "grad_norm": 6.15592831885902, "learning_rate": 2.4869967777800055e-07, "logits/chosen": -0.6888167858123779, "logits/rejected": -0.6404827833175659, "logps/chosen": -41.95843505859375, "logps/rejected": -54.23292922973633, "loss": 0.0292, "rewards/accuracies": 1.0, "rewards/chosen": -0.676899790763855, "rewards/margins": 4.1613569259643555, "rewards/rejected": -4.8382568359375, "step": 371 }, { "epoch": 4.402366863905326, "grad_norm": 6.891198413290914, "learning_rate": 2.4739939073438393e-07, "logits/chosen": -0.82053142786026, "logits/rejected": -0.8729037046432495, "logps/chosen": -42.464149475097656, "logps/rejected": -56.32053756713867, "loss": 0.0373, "rewards/accuracies": 1.0, "rewards/chosen": -1.1332708597183228, "rewards/margins": 5.776480674743652, "rewards/rejected": -6.909751892089844, "step": 372 }, { "epoch": 4.414201183431953, "grad_norm": 5.861758579748894, "learning_rate": 2.460991740465819e-07, "logits/chosen": -1.1168274879455566, "logits/rejected": -1.2398712635040283, "logps/chosen": -42.28007507324219, "logps/rejected": -57.90584945678711, "loss": 0.026, "rewards/accuracies": 1.0, "rewards/chosen": -1.5763722658157349, "rewards/margins": 5.462160110473633, "rewards/rejected": -7.038532733917236, "step": 373 }, { "epoch": 4.42603550295858, "grad_norm": 6.486927362087198, "learning_rate": 2.4479906289012216e-07, "logits/chosen": -1.2565290927886963, "logits/rejected": -1.167330026626587, "logps/chosen": -33.31645584106445, "logps/rejected": -67.92594909667969, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": -0.6366786956787109, "rewards/margins": 7.325068473815918, "rewards/rejected": -7.961746692657471, "step": 374 }, { "epoch": 4.437869822485207, "grad_norm": 7.945577732784187, "learning_rate": 2.434990924376778e-07, "logits/chosen": -1.3346067667007446, "logits/rejected": -1.3618366718292236, "logps/chosen": -40.330810546875, "logps/rejected": -75.98135375976562, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": -1.036665678024292, "rewards/margins": 8.026451110839844, "rewards/rejected": -9.063117027282715, "step": 375 }, { "epoch": 4.449704142011834, "grad_norm": 4.958938820398859, "learning_rate": 2.421992978581152e-07, "logits/chosen": -0.9798704981803894, "logits/rejected": -0.9934732913970947, "logps/chosen": -55.855712890625, "logps/rejected": -73.17900085449219, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": -2.960662841796875, "rewards/margins": 6.544261932373047, "rewards/rejected": -9.504924774169922, "step": 376 }, { "epoch": 4.461538461538462, "grad_norm": 7.067788432308369, "learning_rate": 2.4089971431554287e-07, "logits/chosen": -1.0426850318908691, "logits/rejected": -1.1020509004592896, "logps/chosen": -38.878639221191406, "logps/rejected": -50.67585754394531, "loss": 0.038, "rewards/accuracies": 1.0, "rewards/chosen": -0.268679678440094, "rewards/margins": 6.866507053375244, "rewards/rejected": -7.135187149047852, "step": 377 }, { "epoch": 4.4733727810650885, "grad_norm": 6.288836568067266, "learning_rate": 2.3960037696835987e-07, "logits/chosen": -1.304239273071289, "logits/rejected": -1.282547950744629, "logps/chosen": -39.218841552734375, "logps/rejected": -60.83658218383789, "loss": 0.0302, "rewards/accuracies": 1.0, "rewards/chosen": -1.095663070678711, "rewards/margins": 7.307448863983154, "rewards/rejected": -8.403112411499023, "step": 378 }, { "epoch": 4.485207100591716, "grad_norm": 6.123368205316238, "learning_rate": 2.3830132096830475e-07, "logits/chosen": -1.1514697074890137, "logits/rejected": -1.2038421630859375, "logps/chosen": -38.228721618652344, "logps/rejected": -63.91864776611328, "loss": 0.027, "rewards/accuracies": 1.0, "rewards/chosen": -0.9770803451538086, "rewards/margins": 7.351076602935791, "rewards/rejected": -8.328157424926758, "step": 379 }, { "epoch": 4.497041420118343, "grad_norm": 6.073409726703173, "learning_rate": 2.3700258145950493e-07, "logits/chosen": -1.1492507457733154, "logits/rejected": -1.1283336877822876, "logps/chosen": -34.70807647705078, "logps/rejected": -53.41638946533203, "loss": 0.027, "rewards/accuracies": 1.0, "rewards/chosen": -1.4615098237991333, "rewards/margins": 5.6219682693481445, "rewards/rejected": -7.083477973937988, "step": 380 }, { "epoch": 4.508875739644971, "grad_norm": 6.859789156730043, "learning_rate": 2.3570419357752518e-07, "logits/chosen": -1.1793534755706787, "logits/rejected": -1.1703987121582031, "logps/chosen": -35.941184997558594, "logps/rejected": -62.35126495361328, "loss": 0.0281, "rewards/accuracies": 1.0, "rewards/chosen": -1.987781286239624, "rewards/margins": 7.1669921875, "rewards/rejected": -9.154773712158203, "step": 381 }, { "epoch": 4.520710059171598, "grad_norm": 5.867928411886104, "learning_rate": 2.3440619244841794e-07, "logits/chosen": -1.33339262008667, "logits/rejected": -1.2492295503616333, "logps/chosen": -34.28276062011719, "logps/rejected": -59.37235641479492, "loss": 0.0294, "rewards/accuracies": 1.0, "rewards/chosen": -0.813497006893158, "rewards/margins": 6.642629623413086, "rewards/rejected": -7.456126689910889, "step": 382 }, { "epoch": 4.5325443786982245, "grad_norm": 7.114872273799716, "learning_rate": 2.3310861318777214e-07, "logits/chosen": -0.9742549657821655, "logits/rejected": -0.9942373037338257, "logps/chosen": -35.96149826049805, "logps/rejected": -58.944549560546875, "loss": 0.0363, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3453497886657715, "rewards/margins": 6.538041114807129, "rewards/rejected": -7.883390426635742, "step": 383 }, { "epoch": 4.544378698224852, "grad_norm": 6.454893576265482, "learning_rate": 2.3181149089976404e-07, "logits/chosen": -0.9551500082015991, "logits/rejected": -0.8459126949310303, "logps/chosen": -35.49050521850586, "logps/rejected": -59.616416931152344, "loss": 0.0252, "rewards/accuracies": 1.0, "rewards/chosen": -1.7028827667236328, "rewards/margins": 5.758362293243408, "rewards/rejected": -7.461245536804199, "step": 384 }, { "epoch": 4.556213017751479, "grad_norm": 4.604917778075366, "learning_rate": 2.30514860676207e-07, "logits/chosen": -0.9264880418777466, "logits/rejected": -0.8983692526817322, "logps/chosen": -37.952232360839844, "logps/rejected": -66.89598846435547, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": -1.458532452583313, "rewards/margins": 7.424208641052246, "rewards/rejected": -8.88274097442627, "step": 385 }, { "epoch": 4.568047337278107, "grad_norm": 5.972753790143483, "learning_rate": 2.2921875759560207e-07, "logits/chosen": -1.2438300848007202, "logits/rejected": -1.4885196685791016, "logps/chosen": -48.01713943481445, "logps/rejected": -50.086448669433594, "loss": 0.0317, "rewards/accuracies": 1.0, "rewards/chosen": -1.3077518939971924, "rewards/margins": 5.58108377456665, "rewards/rejected": -6.888835906982422, "step": 386 }, { "epoch": 4.579881656804734, "grad_norm": 5.6047301713387485, "learning_rate": 2.2792321672218967e-07, "logits/chosen": -1.1834698915481567, "logits/rejected": -1.1805692911148071, "logps/chosen": -37.24559020996094, "logps/rejected": -68.1692886352539, "loss": 0.024, "rewards/accuracies": 1.0, "rewards/chosen": -1.3298698663711548, "rewards/margins": 8.419723510742188, "rewards/rejected": -9.749593734741211, "step": 387 }, { "epoch": 4.591715976331361, "grad_norm": 7.525291987243171, "learning_rate": 2.2662827310499995e-07, "logits/chosen": -1.016861081123352, "logits/rejected": -1.143229603767395, "logps/chosen": -43.143497467041016, "logps/rejected": -63.837677001953125, "loss": 0.043, "rewards/accuracies": 1.0, "rewards/chosen": -0.9594184756278992, "rewards/margins": 6.90932559967041, "rewards/rejected": -7.868744850158691, "step": 388 }, { "epoch": 4.603550295857988, "grad_norm": 7.015133341827511, "learning_rate": 2.2533396177690562e-07, "logits/chosen": -1.2056879997253418, "logits/rejected": -1.3042309284210205, "logps/chosen": -46.522300720214844, "logps/rejected": -61.494346618652344, "loss": 0.0287, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9979819059371948, "rewards/margins": 6.070592880249023, "rewards/rejected": -7.068574905395508, "step": 389 }, { "epoch": 4.615384615384615, "grad_norm": 5.78008137850188, "learning_rate": 2.2404031775367332e-07, "logits/chosen": -1.2291995286941528, "logits/rejected": -1.314921259880066, "logps/chosen": -45.853309631347656, "logps/rejected": -55.867095947265625, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": -1.3121535778045654, "rewards/margins": 5.488016605377197, "rewards/rejected": -6.800169944763184, "step": 390 }, { "epoch": 4.627218934911243, "grad_norm": 6.252483858832655, "learning_rate": 2.227473760330169e-07, "logits/chosen": -1.239790439605713, "logits/rejected": -1.1441867351531982, "logps/chosen": -37.55449676513672, "logps/rejected": -68.65978240966797, "loss": 0.027, "rewards/accuracies": 1.0, "rewards/chosen": -1.0223238468170166, "rewards/margins": 8.488706588745117, "rewards/rejected": -9.511030197143555, "step": 391 }, { "epoch": 4.6390532544378695, "grad_norm": 5.571831888079325, "learning_rate": 2.2145517159365043e-07, "logits/chosen": -1.147141456604004, "logits/rejected": -1.0945308208465576, "logps/chosen": -40.555084228515625, "logps/rejected": -71.9697265625, "loss": 0.0237, "rewards/accuracies": 1.0, "rewards/chosen": -2.045267105102539, "rewards/margins": 7.398416519165039, "rewards/rejected": -9.443683624267578, "step": 392 }, { "epoch": 4.650887573964497, "grad_norm": 6.698497149981605, "learning_rate": 2.2016373939434166e-07, "logits/chosen": -1.2282354831695557, "logits/rejected": -1.2330670356750488, "logps/chosen": -41.47081756591797, "logps/rejected": -56.92009353637695, "loss": 0.0356, "rewards/accuracies": 1.0, "rewards/chosen": -0.8390868902206421, "rewards/margins": 5.762375831604004, "rewards/rejected": -6.601463317871094, "step": 393 }, { "epoch": 4.662721893491124, "grad_norm": 5.665101121799137, "learning_rate": 2.1887311437296684e-07, "logits/chosen": -1.3237837553024292, "logits/rejected": -1.4168846607208252, "logps/chosen": -43.45185470581055, "logps/rejected": -69.13044738769531, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": -1.4442262649536133, "rewards/margins": 8.553362846374512, "rewards/rejected": -9.997589111328125, "step": 394 }, { "epoch": 4.674556213017752, "grad_norm": 4.857938949164866, "learning_rate": 2.175833314455647e-07, "logits/chosen": -1.044704794883728, "logits/rejected": -1.047666072845459, "logps/chosen": -57.778099060058594, "logps/rejected": -86.46309661865234, "loss": 0.021, "rewards/accuracies": 1.0, "rewards/chosen": -1.1337398290634155, "rewards/margins": 7.24013614654541, "rewards/rejected": -8.373876571655273, "step": 395 }, { "epoch": 4.686390532544379, "grad_norm": 4.452615646822525, "learning_rate": 2.162944255053928e-07, "logits/chosen": -1.4428414106369019, "logits/rejected": -1.300477147102356, "logps/chosen": -43.00053405761719, "logps/rejected": -78.71292877197266, "loss": 0.0179, "rewards/accuracies": 1.0, "rewards/chosen": -1.2613164186477661, "rewards/margins": 7.334977149963379, "rewards/rejected": -8.596292495727539, "step": 396 }, { "epoch": 4.6982248520710055, "grad_norm": 4.714776633335646, "learning_rate": 2.1500643142198264e-07, "logits/chosen": -1.146743893623352, "logits/rejected": -1.1370177268981934, "logps/chosen": -37.42723846435547, "logps/rejected": -47.88990020751953, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": -1.5033150911331177, "rewards/margins": 4.86285400390625, "rewards/rejected": -6.366168975830078, "step": 397 }, { "epoch": 4.710059171597633, "grad_norm": 6.0816315897712725, "learning_rate": 2.137193840401968e-07, "logits/chosen": -1.2402558326721191, "logits/rejected": -1.2731809616088867, "logps/chosen": -44.44050598144531, "logps/rejected": -54.368404388427734, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": -0.6547656059265137, "rewards/margins": 5.59221887588501, "rewards/rejected": -6.246984481811523, "step": 398 }, { "epoch": 4.72189349112426, "grad_norm": 5.278736402112801, "learning_rate": 2.1243331817928643e-07, "logits/chosen": -1.2595813274383545, "logits/rejected": -1.263629674911499, "logps/chosen": -44.460487365722656, "logps/rejected": -73.69570922851562, "loss": 0.0192, "rewards/accuracies": 1.0, "rewards/chosen": -0.5480247735977173, "rewards/margins": 8.522817611694336, "rewards/rejected": -9.070842742919922, "step": 399 }, { "epoch": 4.733727810650888, "grad_norm": 7.040411325600633, "learning_rate": 2.1114826863194878e-07, "logits/chosen": -1.3222355842590332, "logits/rejected": -1.3484004735946655, "logps/chosen": -34.907257080078125, "logps/rejected": -60.81079864501953, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": -0.18377304077148438, "rewards/margins": 8.166070938110352, "rewards/rejected": -8.349843978881836, "step": 400 }, { "epoch": 4.745562130177515, "grad_norm": 5.317019025575069, "learning_rate": 2.0986427016338623e-07, "logits/chosen": -0.9710571765899658, "logits/rejected": -0.9310001730918884, "logps/chosen": -39.45679473876953, "logps/rejected": -56.64714050292969, "loss": 0.0241, "rewards/accuracies": 1.0, "rewards/chosen": -0.3554608225822449, "rewards/margins": 6.091450214385986, "rewards/rejected": -6.446911811828613, "step": 401 }, { "epoch": 4.757396449704142, "grad_norm": 6.760332535967884, "learning_rate": 2.0858135751036568e-07, "logits/chosen": -1.2228119373321533, "logits/rejected": -1.1839139461517334, "logps/chosen": -30.775981903076172, "logps/rejected": -47.54846954345703, "loss": 0.0313, "rewards/accuracies": 1.0, "rewards/chosen": 0.206694558262825, "rewards/margins": 5.442242622375488, "rewards/rejected": -5.23554801940918, "step": 402 }, { "epoch": 4.769230769230769, "grad_norm": 5.929464844555724, "learning_rate": 2.0729956538027904e-07, "logits/chosen": -1.2554802894592285, "logits/rejected": -1.1986385583877563, "logps/chosen": -29.697330474853516, "logps/rejected": -48.260414123535156, "loss": 0.0266, "rewards/accuracies": 1.0, "rewards/chosen": -0.7226145267486572, "rewards/margins": 5.005193710327148, "rewards/rejected": -5.727808475494385, "step": 403 }, { "epoch": 4.781065088757396, "grad_norm": 7.814308902771925, "learning_rate": 2.060189284502037e-07, "logits/chosen": -1.1625741720199585, "logits/rejected": -1.014739751815796, "logps/chosen": -44.1807861328125, "logps/rejected": -75.00509643554688, "loss": 0.0417, "rewards/accuracies": 1.0, "rewards/chosen": -2.093876600265503, "rewards/margins": 7.723724842071533, "rewards/rejected": -9.817602157592773, "step": 404 }, { "epoch": 4.792899408284024, "grad_norm": 5.110756280509404, "learning_rate": 2.0473948136596486e-07, "logits/chosen": -1.2573013305664062, "logits/rejected": -1.1042354106903076, "logps/chosen": -37.25166320800781, "logps/rejected": -62.03949737548828, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": -1.6967883110046387, "rewards/margins": 7.865792751312256, "rewards/rejected": -9.562580108642578, "step": 405 }, { "epoch": 4.804733727810651, "grad_norm": 6.7687604973263635, "learning_rate": 2.0346125874119838e-07, "logits/chosen": -1.3532781600952148, "logits/rejected": -1.336191177368164, "logps/chosen": -39.134239196777344, "logps/rejected": -60.64654541015625, "loss": 0.0287, "rewards/accuracies": 1.0, "rewards/chosen": -0.8724418878555298, "rewards/margins": 7.370305061340332, "rewards/rejected": -8.242748260498047, "step": 406 }, { "epoch": 4.816568047337278, "grad_norm": 5.980695574776132, "learning_rate": 2.0218429515641368e-07, "logits/chosen": -0.9892873167991638, "logits/rejected": -0.639751672744751, "logps/chosen": -29.87440299987793, "logps/rejected": -68.78720092773438, "loss": 0.0305, "rewards/accuracies": 1.0, "rewards/chosen": -0.17198152840137482, "rewards/margins": 8.037885665893555, "rewards/rejected": -8.209866523742676, "step": 407 }, { "epoch": 4.828402366863905, "grad_norm": 6.2455457286356815, "learning_rate": 2.0090862515805895e-07, "logits/chosen": -1.1471354961395264, "logits/rejected": -1.0667625665664673, "logps/chosen": -34.445716857910156, "logps/rejected": -61.40612030029297, "loss": 0.0287, "rewards/accuracies": 1.0, "rewards/chosen": -1.5051982402801514, "rewards/margins": 6.787008285522461, "rewards/rejected": -8.292206764221191, "step": 408 }, { "epoch": 4.840236686390533, "grad_norm": 6.051206947220886, "learning_rate": 1.9963428325758613e-07, "logits/chosen": -1.2832268476486206, "logits/rejected": -1.2529618740081787, "logps/chosen": -48.28749465942383, "logps/rejected": -68.13203430175781, "loss": 0.025, "rewards/accuracies": 1.0, "rewards/chosen": -1.748518466949463, "rewards/margins": 7.8393754959106445, "rewards/rejected": -9.58789348602295, "step": 409 }, { "epoch": 4.85207100591716, "grad_norm": 6.3012658749958765, "learning_rate": 1.983613039305173e-07, "logits/chosen": -1.134077787399292, "logits/rejected": -1.1521306037902832, "logps/chosen": -49.527854919433594, "logps/rejected": -67.16100311279297, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": -1.6244655847549438, "rewards/margins": 6.405965805053711, "rewards/rejected": -8.030430793762207, "step": 410 }, { "epoch": 4.8639053254437865, "grad_norm": 4.470631833677289, "learning_rate": 1.9708972161551213e-07, "logits/chosen": -1.132239580154419, "logits/rejected": -1.0069799423217773, "logps/chosen": -39.385215759277344, "logps/rejected": -58.923709869384766, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": -0.6855692863464355, "rewards/margins": 6.034358024597168, "rewards/rejected": -6.7199273109436035, "step": 411 }, { "epoch": 4.875739644970414, "grad_norm": 6.376861071339664, "learning_rate": 1.9581957071343588e-07, "logits/chosen": -0.9943496584892273, "logits/rejected": -0.9143755435943604, "logps/chosen": -48.05913543701172, "logps/rejected": -69.44911193847656, "loss": 0.0287, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8955947160720825, "rewards/margins": 6.447237968444824, "rewards/rejected": -8.342832565307617, "step": 412 }, { "epoch": 4.887573964497041, "grad_norm": 6.431311497276744, "learning_rate": 1.9455088558642932e-07, "logits/chosen": -1.2816779613494873, "logits/rejected": -1.1103105545043945, "logps/chosen": -28.044443130493164, "logps/rejected": -67.90215301513672, "loss": 0.0215, "rewards/accuracies": 1.0, "rewards/chosen": -1.3063042163848877, "rewards/margins": 9.634206771850586, "rewards/rejected": -10.940509796142578, "step": 413 }, { "epoch": 4.899408284023669, "grad_norm": 4.779737157402348, "learning_rate": 1.9328370055697832e-07, "logits/chosen": -1.1393996477127075, "logits/rejected": -1.108349323272705, "logps/chosen": -34.88286590576172, "logps/rejected": -57.27365493774414, "loss": 0.021, "rewards/accuracies": 1.0, "rewards/chosen": -1.0275461673736572, "rewards/margins": 6.514031410217285, "rewards/rejected": -7.541577339172363, "step": 414 }, { "epoch": 4.911242603550296, "grad_norm": 4.144080990787552, "learning_rate": 1.9201804990698616e-07, "logits/chosen": -1.0242797136306763, "logits/rejected": -1.0819731950759888, "logps/chosen": -62.003173828125, "logps/rejected": -75.39320373535156, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": -2.2295634746551514, "rewards/margins": 6.729301452636719, "rewards/rejected": -8.95886516571045, "step": 415 }, { "epoch": 4.923076923076923, "grad_norm": 7.28582515595878, "learning_rate": 1.907539678768453e-07, "logits/chosen": -1.2485833168029785, "logits/rejected": -1.2470812797546387, "logps/chosen": -33.02880859375, "logps/rejected": -60.55603790283203, "loss": 0.0274, "rewards/accuracies": 1.0, "rewards/chosen": -0.4287913739681244, "rewards/margins": 8.188567161560059, "rewards/rejected": -8.617358207702637, "step": 416 }, { "epoch": 4.93491124260355, "grad_norm": 5.63867046125462, "learning_rate": 1.8949148866451152e-07, "logits/chosen": -1.3377947807312012, "logits/rejected": -1.2713524103164673, "logps/chosen": -43.48049545288086, "logps/rejected": -66.24922943115234, "loss": 0.029, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6896387338638306, "rewards/margins": 7.300614356994629, "rewards/rejected": -8.990253448486328, "step": 417 }, { "epoch": 4.946745562130177, "grad_norm": 7.296218456889154, "learning_rate": 1.8823064642457876e-07, "logits/chosen": -1.2259938716888428, "logits/rejected": -1.05912184715271, "logps/chosen": -27.457061767578125, "logps/rejected": -54.369407653808594, "loss": 0.0318, "rewards/accuracies": 1.0, "rewards/chosen": -0.2313295304775238, "rewards/margins": 6.432798862457275, "rewards/rejected": -6.664128303527832, "step": 418 }, { "epoch": 4.958579881656805, "grad_norm": 6.2793049860116215, "learning_rate": 1.8697147526735466e-07, "logits/chosen": -1.1357454061508179, "logits/rejected": -1.172181487083435, "logps/chosen": -39.59063720703125, "logps/rejected": -70.14149475097656, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": -1.2256022691726685, "rewards/margins": 7.56755256652832, "rewards/rejected": -8.793155670166016, "step": 419 }, { "epoch": 4.970414201183432, "grad_norm": 5.690239912229541, "learning_rate": 1.8571400925793852e-07, "logits/chosen": -1.1824370622634888, "logits/rejected": -1.0679001808166504, "logps/chosen": -39.977928161621094, "logps/rejected": -63.33866500854492, "loss": 0.0291, "rewards/accuracies": 1.0, "rewards/chosen": -0.93265700340271, "rewards/margins": 7.5748395919799805, "rewards/rejected": -8.507495880126953, "step": 420 }, { "epoch": 4.982248520710059, "grad_norm": 6.961259730499676, "learning_rate": 1.844582824152988e-07, "logits/chosen": -1.2937397956848145, "logits/rejected": -1.4227337837219238, "logps/chosen": -47.419578552246094, "logps/rejected": -62.564239501953125, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": -0.698372483253479, "rewards/margins": 8.21216869354248, "rewards/rejected": -8.910541534423828, "step": 421 }, { "epoch": 4.994082840236686, "grad_norm": 7.002318301330378, "learning_rate": 1.8320432871135376e-07, "logits/chosen": -1.068471074104309, "logits/rejected": -0.9389731884002686, "logps/chosen": -39.13528823852539, "logps/rejected": -73.06786346435547, "loss": 0.0272, "rewards/accuracies": 1.0, "rewards/chosen": -1.0938169956207275, "rewards/margins": 7.050632476806641, "rewards/rejected": -8.144449234008789, "step": 422 }, { "epoch": 5.005917159763314, "grad_norm": 6.3621819487176205, "learning_rate": 1.8195218207005136e-07, "logits/chosen": -1.3071154356002808, "logits/rejected": -1.2660558223724365, "logps/chosen": -41.71501541137695, "logps/rejected": -58.077205657958984, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": -0.21319010853767395, "rewards/margins": 5.4651103019714355, "rewards/rejected": -5.678299903869629, "step": 423 }, { "epoch": 5.017751479289941, "grad_norm": 5.688784125844276, "learning_rate": 1.8070187636645237e-07, "logits/chosen": -1.1194195747375488, "logits/rejected": -1.1475024223327637, "logps/chosen": -39.484375, "logps/rejected": -53.20079803466797, "loss": 0.0301, "rewards/accuracies": 1.0, "rewards/chosen": -0.6331821084022522, "rewards/margins": 5.97371768951416, "rewards/rejected": -6.606899261474609, "step": 424 }, { "epoch": 5.029585798816568, "grad_norm": 4.08606565715224, "learning_rate": 1.7945344542581353e-07, "logits/chosen": -1.2313172817230225, "logits/rejected": -1.2285120487213135, "logps/chosen": -34.221275329589844, "logps/rejected": -71.1649398803711, "loss": 0.0187, "rewards/accuracies": 1.0, "rewards/chosen": -0.6170086860656738, "rewards/margins": 9.6792573928833, "rewards/rejected": -10.296265602111816, "step": 425 }, { "epoch": 5.041420118343195, "grad_norm": 4.101168582532201, "learning_rate": 1.782069230226725e-07, "logits/chosen": -1.331200122833252, "logits/rejected": -1.2716310024261475, "logps/chosen": -32.71428298950195, "logps/rejected": -57.31195068359375, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": 0.05249008536338806, "rewards/margins": 6.174403190612793, "rewards/rejected": -6.121913433074951, "step": 426 }, { "epoch": 5.053254437869822, "grad_norm": 3.885129904982064, "learning_rate": 1.7696234287993413e-07, "logits/chosen": -1.0245940685272217, "logits/rejected": -0.809888482093811, "logps/chosen": -26.134567260742188, "logps/rejected": -58.40096664428711, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": 0.018941545858979225, "rewards/margins": 6.3871893882751465, "rewards/rejected": -6.3682475090026855, "step": 427 }, { "epoch": 5.06508875739645, "grad_norm": 4.487578296923204, "learning_rate": 1.7571973866795813e-07, "logits/chosen": -1.1871979236602783, "logits/rejected": -1.1301292181015015, "logps/chosen": -47.12839889526367, "logps/rejected": -72.7445068359375, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": -2.4960553646087646, "rewards/margins": 8.162572860717773, "rewards/rejected": -10.658628463745117, "step": 428 }, { "epoch": 5.076923076923077, "grad_norm": 4.259353249136821, "learning_rate": 1.7447914400364833e-07, "logits/chosen": -1.3441885709762573, "logits/rejected": -1.4836678504943848, "logps/chosen": -56.2959098815918, "logps/rejected": -63.773292541503906, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": -1.281679391860962, "rewards/margins": 6.2985944747924805, "rewards/rejected": -7.580273628234863, "step": 429 }, { "epoch": 5.088757396449704, "grad_norm": 6.225232745781602, "learning_rate": 1.7324059244954292e-07, "logits/chosen": -1.179904580116272, "logits/rejected": -1.0788750648498535, "logps/chosen": -31.996553421020508, "logps/rejected": -57.39022445678711, "loss": 0.0317, "rewards/accuracies": 1.0, "rewards/chosen": -1.4775885343551636, "rewards/margins": 5.636683464050293, "rewards/rejected": -7.114272117614746, "step": 430 }, { "epoch": 5.100591715976331, "grad_norm": 5.489476203604287, "learning_rate": 1.720041175129066e-07, "logits/chosen": -1.0896636247634888, "logits/rejected": -0.9166202545166016, "logps/chosen": -37.66105651855469, "logps/rejected": -66.37045288085938, "loss": 0.0315, "rewards/accuracies": 1.0, "rewards/chosen": -1.2693698406219482, "rewards/margins": 6.162668228149414, "rewards/rejected": -7.432038307189941, "step": 431 }, { "epoch": 5.112426035502959, "grad_norm": 5.243666393695587, "learning_rate": 1.7076975264482433e-07, "logits/chosen": -1.1870818138122559, "logits/rejected": -1.1362037658691406, "logps/chosen": -36.95043182373047, "logps/rejected": -65.01616668701172, "loss": 0.025, "rewards/accuracies": 1.0, "rewards/chosen": -1.3586033582687378, "rewards/margins": 8.19964599609375, "rewards/rejected": -9.558250427246094, "step": 432 }, { "epoch": 5.124260355029586, "grad_norm": 5.459471747574669, "learning_rate": 1.6953753123929595e-07, "logits/chosen": -0.9043876528739929, "logits/rejected": -1.0556137561798096, "logps/chosen": -42.625064849853516, "logps/rejected": -65.89883422851562, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": -1.0012390613555908, "rewards/margins": 5.687243938446045, "rewards/rejected": -6.688483238220215, "step": 433 }, { "epoch": 5.136094674556213, "grad_norm": 4.0735562348358725, "learning_rate": 1.6830748663233303e-07, "logits/chosen": -1.276261806488037, "logits/rejected": -1.059903860092163, "logps/chosen": -37.19121551513672, "logps/rejected": -67.30182647705078, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": -0.8689858913421631, "rewards/margins": 7.131950378417969, "rewards/rejected": -8.000936508178711, "step": 434 }, { "epoch": 5.14792899408284, "grad_norm": 5.750153182234357, "learning_rate": 1.6707965210105687e-07, "logits/chosen": -1.0981320142745972, "logits/rejected": -1.1993542909622192, "logps/chosen": -45.06623077392578, "logps/rejected": -59.895782470703125, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": -1.1987268924713135, "rewards/margins": 7.3387932777404785, "rewards/rejected": -8.537520408630371, "step": 435 }, { "epoch": 5.159763313609467, "grad_norm": 4.092389515181816, "learning_rate": 1.6585406086279846e-07, "logits/chosen": -1.2612695693969727, "logits/rejected": -1.0451221466064453, "logps/chosen": -36.8587646484375, "logps/rejected": -79.8933334350586, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": -0.8900283575057983, "rewards/margins": 9.542950630187988, "rewards/rejected": -10.432977676391602, "step": 436 }, { "epoch": 5.171597633136095, "grad_norm": 5.775549033301964, "learning_rate": 1.6463074607419942e-07, "logits/chosen": -1.1313787698745728, "logits/rejected": -1.1185730695724487, "logps/chosen": -39.299861907958984, "logps/rejected": -55.985435485839844, "loss": 0.0253, "rewards/accuracies": 1.0, "rewards/chosen": -1.1983766555786133, "rewards/margins": 5.455132484436035, "rewards/rejected": -6.65350866317749, "step": 437 }, { "epoch": 5.183431952662722, "grad_norm": 4.591857419480486, "learning_rate": 1.6340974083031523e-07, "logits/chosen": -1.1867667436599731, "logits/rejected": -1.101377248764038, "logps/chosen": -37.22975158691406, "logps/rejected": -59.464149475097656, "loss": 0.0189, "rewards/accuracies": 1.0, "rewards/chosen": -0.3749149739742279, "rewards/margins": 7.300636291503906, "rewards/rejected": -7.675551414489746, "step": 438 }, { "epoch": 5.195266272189349, "grad_norm": 4.200678818481589, "learning_rate": 1.6219107816372024e-07, "logits/chosen": -1.307371973991394, "logits/rejected": -1.1307945251464844, "logps/chosen": -45.63922119140625, "logps/rejected": -96.90985107421875, "loss": 0.0213, "rewards/accuracies": 1.0, "rewards/chosen": -1.9187942743301392, "rewards/margins": 10.127294540405273, "rewards/rejected": -12.046089172363281, "step": 439 }, { "epoch": 5.207100591715976, "grad_norm": 4.736427790505553, "learning_rate": 1.6097479104361326e-07, "logits/chosen": -1.4351165294647217, "logits/rejected": -1.3055188655853271, "logps/chosen": -33.065948486328125, "logps/rejected": -62.625099182128906, "loss": 0.0207, "rewards/accuracies": 1.0, "rewards/chosen": -0.6623735427856445, "rewards/margins": 7.489253044128418, "rewards/rejected": -8.151626586914062, "step": 440 }, { "epoch": 5.218934911242604, "grad_norm": 5.4353100679782385, "learning_rate": 1.5976091237492634e-07, "logits/chosen": -1.1784343719482422, "logits/rejected": -1.1669944524765015, "logps/chosen": -43.70185852050781, "logps/rejected": -61.90728759765625, "loss": 0.0237, "rewards/accuracies": 1.0, "rewards/chosen": -0.9407563805580139, "rewards/margins": 5.61828088760376, "rewards/rejected": -6.559036731719971, "step": 441 }, { "epoch": 5.230769230769231, "grad_norm": 5.5234881107996, "learning_rate": 1.5854947499743413e-07, "logits/chosen": -1.3166191577911377, "logits/rejected": -1.2368249893188477, "logps/chosen": -42.619972229003906, "logps/rejected": -83.75607299804688, "loss": 0.0312, "rewards/accuracies": 1.0, "rewards/chosen": -1.4741311073303223, "rewards/margins": 8.95831298828125, "rewards/rejected": -10.432443618774414, "step": 442 }, { "epoch": 5.242603550295858, "grad_norm": 5.027875638098313, "learning_rate": 1.573405116848656e-07, "logits/chosen": -1.2968052625656128, "logits/rejected": -1.1763187646865845, "logps/chosen": -41.92218017578125, "logps/rejected": -65.44755554199219, "loss": 0.0218, "rewards/accuracies": 1.0, "rewards/chosen": -1.754935622215271, "rewards/margins": 5.887596607208252, "rewards/rejected": -7.6425323486328125, "step": 443 }, { "epoch": 5.254437869822485, "grad_norm": 4.9663450415072035, "learning_rate": 1.5613405514401757e-07, "logits/chosen": -1.1399455070495605, "logits/rejected": -1.21956205368042, "logps/chosen": -39.965789794921875, "logps/rejected": -55.96144485473633, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": -1.283247709274292, "rewards/margins": 6.501648902893066, "rewards/rejected": -7.7848968505859375, "step": 444 }, { "epoch": 5.266272189349112, "grad_norm": 5.858342159433277, "learning_rate": 1.5493013801386923e-07, "logits/chosen": -1.2090262174606323, "logits/rejected": -1.2344286441802979, "logps/chosen": -46.805938720703125, "logps/rejected": -75.00871276855469, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": -1.800337791442871, "rewards/margins": 8.601978302001953, "rewards/rejected": -10.402315139770508, "step": 445 }, { "epoch": 5.27810650887574, "grad_norm": 3.9282604177697107, "learning_rate": 1.537287928647002e-07, "logits/chosen": -1.1603989601135254, "logits/rejected": -1.1315605640411377, "logps/chosen": -56.917301177978516, "logps/rejected": -87.98289489746094, "loss": 0.0168, "rewards/accuracies": 1.0, "rewards/chosen": -2.3748254776000977, "rewards/margins": 9.121573448181152, "rewards/rejected": -11.49639892578125, "step": 446 }, { "epoch": 5.289940828402367, "grad_norm": 4.17827764711654, "learning_rate": 1.525300521972082e-07, "logits/chosen": -1.1303211450576782, "logits/rejected": -1.0561895370483398, "logps/chosen": -34.059898376464844, "logps/rejected": -58.381256103515625, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": -0.6126248836517334, "rewards/margins": 6.0588579177856445, "rewards/rejected": -6.671482086181641, "step": 447 }, { "epoch": 5.3017751479289945, "grad_norm": 5.0074680639809745, "learning_rate": 1.513339484416309e-07, "logits/chosen": -1.169161319732666, "logits/rejected": -1.327141284942627, "logps/chosen": -44.55299377441406, "logps/rejected": -59.21006774902344, "loss": 0.0225, "rewards/accuracies": 1.0, "rewards/chosen": -0.9135736227035522, "rewards/margins": 6.587090492248535, "rewards/rejected": -7.500664710998535, "step": 448 }, { "epoch": 5.313609467455621, "grad_norm": 5.786312787732582, "learning_rate": 1.5014051395686766e-07, "logits/chosen": -1.314427137374878, "logits/rejected": -1.2852325439453125, "logps/chosen": -48.46540832519531, "logps/rejected": -74.1810531616211, "loss": 0.026, "rewards/accuracies": 1.0, "rewards/chosen": -2.380225658416748, "rewards/margins": 7.730525016784668, "rewards/rejected": -10.110750198364258, "step": 449 }, { "epoch": 5.325443786982248, "grad_norm": 4.769640674846225, "learning_rate": 1.489497810296046e-07, "logits/chosen": -1.3386704921722412, "logits/rejected": -1.4422738552093506, "logps/chosen": -50.036563873291016, "logps/rejected": -78.25895690917969, "loss": 0.0179, "rewards/accuracies": 1.0, "rewards/chosen": -3.123276710510254, "rewards/margins": 8.760514259338379, "rewards/rejected": -11.883790969848633, "step": 450 }, { "epoch": 5.337278106508876, "grad_norm": 4.460218619596584, "learning_rate": 1.4776178187344105e-07, "logits/chosen": -0.9677804112434387, "logits/rejected": -1.0331177711486816, "logps/chosen": -59.2320442199707, "logps/rejected": -78.76850891113281, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": -3.6641674041748047, "rewards/margins": 6.445460319519043, "rewards/rejected": -10.109628677368164, "step": 451 }, { "epoch": 5.349112426035503, "grad_norm": 3.802978212761357, "learning_rate": 1.4657654862801797e-07, "logits/chosen": -1.4019286632537842, "logits/rejected": -1.3270961046218872, "logps/chosen": -50.94618606567383, "logps/rejected": -73.76014709472656, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": -2.508474826812744, "rewards/margins": 7.509592056274414, "rewards/rejected": -10.01806640625, "step": 452 }, { "epoch": 5.3609467455621305, "grad_norm": 5.615210502971369, "learning_rate": 1.4539411335814866e-07, "logits/chosen": -1.0637266635894775, "logits/rejected": -1.0000033378601074, "logps/chosen": -41.48188018798828, "logps/rejected": -63.91345977783203, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": -0.4643651247024536, "rewards/margins": 6.114117622375488, "rewards/rejected": -6.578482627868652, "step": 453 }, { "epoch": 5.372781065088757, "grad_norm": 5.198548513900638, "learning_rate": 1.4421450805295082e-07, "logits/chosen": -1.2260617017745972, "logits/rejected": -1.0831642150878906, "logps/chosen": -35.77007293701172, "logps/rejected": -64.84323120117188, "loss": 0.025, "rewards/accuracies": 1.0, "rewards/chosen": -0.987048864364624, "rewards/margins": 7.374147415161133, "rewards/rejected": -8.361196517944336, "step": 454 }, { "epoch": 5.384615384615385, "grad_norm": 4.832767981066584, "learning_rate": 1.4303776462498186e-07, "logits/chosen": -1.160849690437317, "logits/rejected": -1.2087219953536987, "logps/chosen": -40.54812240600586, "logps/rejected": -65.13203430175781, "loss": 0.0224, "rewards/accuracies": 1.0, "rewards/chosen": -2.0010249614715576, "rewards/margins": 8.726207733154297, "rewards/rejected": -10.727232933044434, "step": 455 }, { "epoch": 5.396449704142012, "grad_norm": 5.177069682037774, "learning_rate": 1.418639149093748e-07, "logits/chosen": -1.1638591289520264, "logits/rejected": -1.1559276580810547, "logps/chosen": -40.623634338378906, "logps/rejected": -68.79119873046875, "loss": 0.0258, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1391851902008057, "rewards/margins": 8.373973846435547, "rewards/rejected": -9.513158798217773, "step": 456 }, { "epoch": 5.408284023668639, "grad_norm": 4.365063167846994, "learning_rate": 1.406929906629774e-07, "logits/chosen": -1.0570735931396484, "logits/rejected": -1.124732255935669, "logps/chosen": -43.47991943359375, "logps/rejected": -74.1714096069336, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": -2.225761890411377, "rewards/margins": 8.236499786376953, "rewards/rejected": -10.462261199951172, "step": 457 }, { "epoch": 5.420118343195266, "grad_norm": 3.589902288780942, "learning_rate": 1.3952502356349323e-07, "logits/chosen": -1.3269389867782593, "logits/rejected": -1.233825922012329, "logps/chosen": -34.30451202392578, "logps/rejected": -52.173431396484375, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": -1.0334144830703735, "rewards/margins": 5.583642482757568, "rewards/rejected": -6.617057800292969, "step": 458 }, { "epoch": 5.431952662721893, "grad_norm": 4.55181870052482, "learning_rate": 1.38360045208624e-07, "logits/chosen": -1.2689670324325562, "logits/rejected": -1.2837406396865845, "logps/chosen": -40.163639068603516, "logps/rejected": -56.57490539550781, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": -0.6674104928970337, "rewards/margins": 6.7150068283081055, "rewards/rejected": -7.382417678833008, "step": 459 }, { "epoch": 5.443786982248521, "grad_norm": 5.077146204999947, "learning_rate": 1.371980871152157e-07, "logits/chosen": -1.154836893081665, "logits/rejected": -0.9715840816497803, "logps/chosen": -45.965538024902344, "logps/rejected": -83.77821350097656, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": -1.9867550134658813, "rewards/margins": 7.073174953460693, "rewards/rejected": -9.059929847717285, "step": 460 }, { "epoch": 5.455621301775148, "grad_norm": 5.101492700995248, "learning_rate": 1.3603918071840486e-07, "logits/chosen": -1.316448450088501, "logits/rejected": -1.2533689737319946, "logps/chosen": -42.93837356567383, "logps/rejected": -62.27436065673828, "loss": 0.0223, "rewards/accuracies": 1.0, "rewards/chosen": -1.6039561033248901, "rewards/margins": 8.3004732131958, "rewards/rejected": -9.90442943572998, "step": 461 }, { "epoch": 5.4674556213017755, "grad_norm": 3.581286281901749, "learning_rate": 1.3488335737076911e-07, "logits/chosen": -1.05559504032135, "logits/rejected": -1.045451283454895, "logps/chosen": -39.23100280761719, "logps/rejected": -68.82288360595703, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -1.3661481142044067, "rewards/margins": 7.688545227050781, "rewards/rejected": -9.054693222045898, "step": 462 }, { "epoch": 5.479289940828402, "grad_norm": 4.630228404526749, "learning_rate": 1.3373064834147817e-07, "logits/chosen": -1.1598727703094482, "logits/rejected": -1.264807939529419, "logps/chosen": -34.88840103149414, "logps/rejected": -62.14350891113281, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": -1.034010887145996, "rewards/margins": 8.205092430114746, "rewards/rejected": -9.239103317260742, "step": 463 }, { "epoch": 5.491124260355029, "grad_norm": 5.16650939084812, "learning_rate": 1.3258108481544847e-07, "logits/chosen": -1.1226065158843994, "logits/rejected": -1.1048481464385986, "logps/chosen": -43.532371520996094, "logps/rejected": -56.43935775756836, "loss": 0.023, "rewards/accuracies": 1.0, "rewards/chosen": -1.7918784618377686, "rewards/margins": 5.705506324768066, "rewards/rejected": -7.497385025024414, "step": 464 }, { "epoch": 5.502958579881657, "grad_norm": 3.8686941600847433, "learning_rate": 1.314346978924994e-07, "logits/chosen": -1.229318380355835, "logits/rejected": -1.2423676252365112, "logps/chosen": -39.24761199951172, "logps/rejected": -61.67021179199219, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -2.169360876083374, "rewards/margins": 7.238840103149414, "rewards/rejected": -9.408201217651367, "step": 465 }, { "epoch": 5.514792899408284, "grad_norm": 5.296943513917031, "learning_rate": 1.3029151858651143e-07, "logits/chosen": -1.3978345394134521, "logits/rejected": -1.3712904453277588, "logps/chosen": -33.012115478515625, "logps/rejected": -56.899269104003906, "loss": 0.0219, "rewards/accuracies": 1.0, "rewards/chosen": -1.2505531311035156, "rewards/margins": 6.717418670654297, "rewards/rejected": -7.9679718017578125, "step": 466 }, { "epoch": 5.5266272189349115, "grad_norm": 4.646575940746333, "learning_rate": 1.2915157782458802e-07, "logits/chosen": -1.171278715133667, "logits/rejected": -1.205727458000183, "logps/chosen": -47.129981994628906, "logps/rejected": -68.88152313232422, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": -1.6739788055419922, "rewards/margins": 7.506168842315674, "rewards/rejected": -9.180148124694824, "step": 467 }, { "epoch": 5.538461538461538, "grad_norm": 6.123565299064301, "learning_rate": 1.2801490644621788e-07, "logits/chosen": -1.1880264282226562, "logits/rejected": -1.1200838088989258, "logps/chosen": -38.7286491394043, "logps/rejected": -48.82095718383789, "loss": 0.0248, "rewards/accuracies": 1.0, "rewards/chosen": -0.8554790616035461, "rewards/margins": 4.912729740142822, "rewards/rejected": -5.7682085037231445, "step": 468 }, { "epoch": 5.550295857988166, "grad_norm": 4.461044775468357, "learning_rate": 1.268815352024416e-07, "logits/chosen": -1.3372142314910889, "logits/rejected": -1.2087198495864868, "logps/chosen": -39.35560607910156, "logps/rejected": -66.68772888183594, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": -2.420445203781128, "rewards/margins": 7.610683441162109, "rewards/rejected": -10.0311279296875, "step": 469 }, { "epoch": 5.562130177514793, "grad_norm": 3.4572482970403624, "learning_rate": 1.257514947550189e-07, "logits/chosen": -1.3124825954437256, "logits/rejected": -1.2549489736557007, "logps/chosen": -34.87763214111328, "logps/rejected": -71.9126968383789, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": -0.7214508652687073, "rewards/margins": 9.442049026489258, "rewards/rejected": -10.16349983215332, "step": 470 }, { "epoch": 5.57396449704142, "grad_norm": 4.197929133254421, "learning_rate": 1.2462481567559966e-07, "logits/chosen": -1.253973126411438, "logits/rejected": -1.0877537727355957, "logps/chosen": -39.30159378051758, "logps/rejected": -77.09483337402344, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": -0.9544857740402222, "rewards/margins": 6.852291107177734, "rewards/rejected": -7.806777000427246, "step": 471 }, { "epoch": 5.585798816568047, "grad_norm": 5.704469040817279, "learning_rate": 1.2350152844489688e-07, "logits/chosen": -1.2464970350265503, "logits/rejected": -1.2669868469238281, "logps/chosen": -34.1337890625, "logps/rejected": -61.517852783203125, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": -1.8590599298477173, "rewards/margins": 8.024746894836426, "rewards/rejected": -9.883807182312012, "step": 472 }, { "epoch": 5.597633136094674, "grad_norm": 3.345069840685552, "learning_rate": 1.2238166345186152e-07, "logits/chosen": -1.1105680465698242, "logits/rejected": -1.1130532026290894, "logps/chosen": -44.979270935058594, "logps/rejected": -61.89778137207031, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -1.2353321313858032, "rewards/margins": 6.803773880004883, "rewards/rejected": -8.039106369018555, "step": 473 }, { "epoch": 5.609467455621302, "grad_norm": 5.571254370649255, "learning_rate": 1.2126525099286108e-07, "logits/chosen": -1.5405449867248535, "logits/rejected": -1.5201728343963623, "logps/chosen": -35.71417236328125, "logps/rejected": -76.58299255371094, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": -1.5148788690567017, "rewards/margins": 8.865453720092773, "rewards/rejected": -10.380332946777344, "step": 474 }, { "epoch": 5.621301775147929, "grad_norm": 6.171366106305816, "learning_rate": 1.201523212708593e-07, "logits/chosen": -1.2863376140594482, "logits/rejected": -1.0735771656036377, "logps/chosen": -31.64728546142578, "logps/rejected": -67.95651245117188, "loss": 0.0294, "rewards/accuracies": 1.0, "rewards/chosen": -0.8103072047233582, "rewards/margins": 8.086475372314453, "rewards/rejected": -8.896783828735352, "step": 475 }, { "epoch": 5.633136094674557, "grad_norm": 5.562476478094527, "learning_rate": 1.1904290439459971e-07, "logits/chosen": -1.3088757991790771, "logits/rejected": -1.3743656873703003, "logps/chosen": -44.74784851074219, "logps/rejected": -75.913818359375, "loss": 0.0253, "rewards/accuracies": 1.0, "rewards/chosen": -1.7934825420379639, "rewards/margins": 8.956192016601562, "rewards/rejected": -10.749673843383789, "step": 476 }, { "epoch": 5.644970414201183, "grad_norm": 4.9137843056433494, "learning_rate": 1.1793703037779055e-07, "logits/chosen": -1.2263046503067017, "logits/rejected": -1.2299702167510986, "logps/chosen": -52.14278793334961, "logps/rejected": -83.45344543457031, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": -2.6467747688293457, "rewards/margins": 9.627193450927734, "rewards/rejected": -12.273969650268555, "step": 477 }, { "epoch": 5.65680473372781, "grad_norm": 4.209643150852624, "learning_rate": 1.1683472913829284e-07, "logits/chosen": -1.1544851064682007, "logits/rejected": -1.0651438236236572, "logps/chosen": -43.3776741027832, "logps/rejected": -73.96808624267578, "loss": 0.0192, "rewards/accuracies": 1.0, "rewards/chosen": -1.589303970336914, "rewards/margins": 7.712847709655762, "rewards/rejected": -9.302152633666992, "step": 478 }, { "epoch": 5.668639053254438, "grad_norm": 4.463200843657177, "learning_rate": 1.1573603049731153e-07, "logits/chosen": -1.2334978580474854, "logits/rejected": -1.2861613035202026, "logps/chosen": -46.0629997253418, "logps/rejected": -64.01387023925781, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": -0.542292058467865, "rewards/margins": 7.245844841003418, "rewards/rejected": -7.788136959075928, "step": 479 }, { "epoch": 5.680473372781065, "grad_norm": 4.733353453788434, "learning_rate": 1.146409641785882e-07, "logits/chosen": -1.3681819438934326, "logits/rejected": -1.1802641153335571, "logps/chosen": -49.00510025024414, "logps/rejected": -74.6863784790039, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/chosen": -0.9393169283866882, "rewards/margins": 8.013870239257812, "rewards/rejected": -8.953186988830566, "step": 480 }, { "epoch": 5.6923076923076925, "grad_norm": 4.192888321849314, "learning_rate": 1.1354955980759689e-07, "logits/chosen": -1.2294647693634033, "logits/rejected": -1.1312428712844849, "logps/chosen": -32.66830825805664, "logps/rejected": -62.29680252075195, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": -0.7126701474189758, "rewards/margins": 7.077181816101074, "rewards/rejected": -7.789851665496826, "step": 481 }, { "epoch": 5.704142011834319, "grad_norm": 4.6911466794721255, "learning_rate": 1.1246184691074314e-07, "logits/chosen": -1.097620964050293, "logits/rejected": -1.0427839756011963, "logps/chosen": -36.93230056762695, "logps/rejected": -56.27244567871094, "loss": 0.0224, "rewards/accuracies": 1.0, "rewards/chosen": -0.5761284232139587, "rewards/margins": 6.3100385665893555, "rewards/rejected": -6.886167049407959, "step": 482 }, { "epoch": 5.715976331360947, "grad_norm": 4.510848016909919, "learning_rate": 1.1137785491456453e-07, "logits/chosen": -1.135402798652649, "logits/rejected": -1.1884230375289917, "logps/chosen": -42.09026336669922, "logps/rejected": -65.27650451660156, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": -2.159618377685547, "rewards/margins": 7.496487617492676, "rewards/rejected": -9.656106948852539, "step": 483 }, { "epoch": 5.727810650887574, "grad_norm": 6.019538895167783, "learning_rate": 1.1029761314493518e-07, "logits/chosen": -1.4300427436828613, "logits/rejected": -1.311929702758789, "logps/chosen": -35.11430358886719, "logps/rejected": -61.214385986328125, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": -1.1705330610275269, "rewards/margins": 6.273542404174805, "rewards/rejected": -7.444075584411621, "step": 484 }, { "epoch": 5.739644970414201, "grad_norm": 4.484361080757079, "learning_rate": 1.0922115082627196e-07, "logits/chosen": -1.2917143106460571, "logits/rejected": -1.2676193714141846, "logps/chosen": -39.90666580200195, "logps/rejected": -63.61042404174805, "loss": 0.0201, "rewards/accuracies": 1.0, "rewards/chosen": -2.078958511352539, "rewards/margins": 7.296936511993408, "rewards/rejected": -9.375894546508789, "step": 485 }, { "epoch": 5.7514792899408285, "grad_norm": 3.7963083798265678, "learning_rate": 1.0814849708074414e-07, "logits/chosen": -1.2180938720703125, "logits/rejected": -1.2313947677612305, "logps/chosen": -38.8764762878418, "logps/rejected": -58.5401496887207, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": -0.2903863787651062, "rewards/margins": 6.708767890930176, "rewards/rejected": -6.999154090881348, "step": 486 }, { "epoch": 5.763313609467455, "grad_norm": 3.9184087371836953, "learning_rate": 1.070796809274853e-07, "logits/chosen": -1.410796880722046, "logits/rejected": -1.51241934299469, "logps/chosen": -39.3426513671875, "logps/rejected": -64.799560546875, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": -1.2968083620071411, "rewards/margins": 8.182130813598633, "rewards/rejected": -9.478939056396484, "step": 487 }, { "epoch": 5.775147928994083, "grad_norm": 4.695034688833399, "learning_rate": 1.0601473128180854e-07, "logits/chosen": -1.1525176763534546, "logits/rejected": -1.1728888750076294, "logps/chosen": -29.180187225341797, "logps/rejected": -47.42591094970703, "loss": 0.025, "rewards/accuracies": 1.0, "rewards/chosen": 0.2968808710575104, "rewards/margins": 5.646054267883301, "rewards/rejected": -5.349173545837402, "step": 488 }, { "epoch": 5.78698224852071, "grad_norm": 3.9629884838666793, "learning_rate": 1.0495367695442392e-07, "logits/chosen": -1.4741270542144775, "logits/rejected": -1.4874193668365479, "logps/chosen": -37.341251373291016, "logps/rejected": -64.96741485595703, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": -1.7409987449645996, "rewards/margins": 8.580930709838867, "rewards/rejected": -10.321928977966309, "step": 489 }, { "epoch": 5.798816568047338, "grad_norm": 4.579911167657576, "learning_rate": 1.0389654665065908e-07, "logits/chosen": -1.3020949363708496, "logits/rejected": -1.1568399667739868, "logps/chosen": -37.12276840209961, "logps/rejected": -74.50867462158203, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": -2.156496047973633, "rewards/margins": 8.242414474487305, "rewards/rejected": -10.398909568786621, "step": 490 }, { "epoch": 5.810650887573964, "grad_norm": 3.8373743576852046, "learning_rate": 1.0284336896968304e-07, "logits/chosen": -1.4103869199752808, "logits/rejected": -1.2831324338912964, "logps/chosen": -27.218812942504883, "logps/rejected": -64.12830352783203, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": -0.5227464437484741, "rewards/margins": 7.4249677658081055, "rewards/rejected": -7.947714328765869, "step": 491 }, { "epoch": 5.822485207100591, "grad_norm": 3.95946268366364, "learning_rate": 1.0179417240373182e-07, "logits/chosen": -1.0852227210998535, "logits/rejected": -1.0003684759140015, "logps/chosen": -34.86237716674805, "logps/rejected": -55.69224166870117, "loss": 0.0168, "rewards/accuracies": 1.0, "rewards/chosen": -1.195955753326416, "rewards/margins": 6.8462629318237305, "rewards/rejected": -8.042219161987305, "step": 492 }, { "epoch": 5.834319526627219, "grad_norm": 4.561320933520321, "learning_rate": 1.0074898533733833e-07, "logits/chosen": -1.3027982711791992, "logits/rejected": -1.272012710571289, "logps/chosen": -44.73501205444336, "logps/rejected": -74.4944076538086, "loss": 0.0233, "rewards/accuracies": 1.0, "rewards/chosen": -0.941219687461853, "rewards/margins": 7.836876392364502, "rewards/rejected": -8.778095245361328, "step": 493 }, { "epoch": 5.846153846153846, "grad_norm": 3.4880073627324113, "learning_rate": 9.970783604656383e-08, "logits/chosen": -1.1432762145996094, "logits/rejected": -1.182610034942627, "logps/chosen": -45.404197692871094, "logps/rejected": -68.64156341552734, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": -2.063544273376465, "rewards/margins": 7.183533668518066, "rewards/rejected": -9.247077941894531, "step": 494 }, { "epoch": 5.8579881656804735, "grad_norm": 4.263751098046147, "learning_rate": 9.867075269823353e-08, "logits/chosen": -1.2150077819824219, "logits/rejected": -1.156544804573059, "logps/chosen": -31.194244384765625, "logps/rejected": -62.26982498168945, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": -0.5434834957122803, "rewards/margins": 7.384872913360596, "rewards/rejected": -7.928356647491455, "step": 495 }, { "epoch": 5.8698224852071, "grad_norm": 3.5712261428305476, "learning_rate": 9.763776334917398e-08, "logits/chosen": -1.137619972229004, "logits/rejected": -1.1010360717773438, "logps/chosen": -42.5244140625, "logps/rejected": -64.01480865478516, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -1.5774519443511963, "rewards/margins": 7.529844284057617, "rewards/rejected": -9.107295989990234, "step": 496 }, { "epoch": 5.881656804733728, "grad_norm": 3.9520695580072838, "learning_rate": 9.660889594545469e-08, "logits/chosen": -1.3965240716934204, "logits/rejected": -1.2792103290557861, "logps/chosen": -35.29607391357422, "logps/rejected": -62.42978286743164, "loss": 0.0165, "rewards/accuracies": 1.0, "rewards/chosen": -1.475753664970398, "rewards/margins": 7.789628982543945, "rewards/rejected": -9.265382766723633, "step": 497 }, { "epoch": 5.893491124260355, "grad_norm": 2.9741347782819343, "learning_rate": 9.558417832163162e-08, "logits/chosen": -1.3339378833770752, "logits/rejected": -1.3494333028793335, "logps/chosen": -34.82639694213867, "logps/rejected": -60.99176788330078, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -0.567624568939209, "rewards/margins": 7.62171745300293, "rewards/rejected": -8.189342498779297, "step": 498 }, { "epoch": 5.905325443786982, "grad_norm": 3.571047214468529, "learning_rate": 9.456363819999419e-08, "logits/chosen": -1.3118352890014648, "logits/rejected": -1.2693629264831543, "logps/chosen": -41.321712493896484, "logps/rejected": -56.57762145996094, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": -0.858654797077179, "rewards/margins": 6.5985426902771, "rewards/rejected": -7.457197666168213, "step": 499 }, { "epoch": 5.9171597633136095, "grad_norm": 5.851871018530032, "learning_rate": 9.354730318981561e-08, "logits/chosen": -1.362858772277832, "logits/rejected": -1.2244151830673218, "logps/chosen": -42.49793243408203, "logps/rejected": -82.23344421386719, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": -2.179011344909668, "rewards/margins": 9.12831974029541, "rewards/rejected": -11.307331085205078, "step": 500 }, { "epoch": 5.928994082840236, "grad_norm": 5.383838413833312, "learning_rate": 9.25352007866054e-08, "logits/chosen": -1.2556869983673096, "logits/rejected": -1.443664789199829, "logps/chosen": -40.52064514160156, "logps/rejected": -56.383453369140625, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": -2.034975528717041, "rewards/margins": 6.7539753913879395, "rewards/rejected": -8.78895092010498, "step": 501 }, { "epoch": 5.940828402366864, "grad_norm": 5.05266297652339, "learning_rate": 9.15273583713663e-08, "logits/chosen": -1.4486913681030273, "logits/rejected": -1.5014913082122803, "logps/chosen": -48.460594177246094, "logps/rejected": -62.99694061279297, "loss": 0.0287, "rewards/accuracies": 1.0, "rewards/chosen": -1.5520663261413574, "rewards/margins": 5.633842468261719, "rewards/rejected": -7.185909271240234, "step": 502 }, { "epoch": 5.952662721893491, "grad_norm": 3.687156022372964, "learning_rate": 9.052380320985273e-08, "logits/chosen": -1.378886103630066, "logits/rejected": -1.4398740530014038, "logps/chosen": -45.16747283935547, "logps/rejected": -63.0939826965332, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": -1.6366732120513916, "rewards/margins": 7.194758892059326, "rewards/rejected": -8.83143138885498, "step": 503 }, { "epoch": 5.964497041420119, "grad_norm": 4.578124297729498, "learning_rate": 8.95245624518336e-08, "logits/chosen": -1.1423323154449463, "logits/rejected": -1.1454185247421265, "logps/chosen": -36.498924255371094, "logps/rejected": -51.14469528198242, "loss": 0.0201, "rewards/accuracies": 1.0, "rewards/chosen": -1.339404821395874, "rewards/margins": 5.312580585479736, "rewards/rejected": -6.651985168457031, "step": 504 }, { "epoch": 5.976331360946745, "grad_norm": 3.6118523674964518, "learning_rate": 8.85296631303579e-08, "logits/chosen": -1.2390810251235962, "logits/rejected": -1.1867014169692993, "logps/chosen": -32.07006072998047, "logps/rejected": -58.290775299072266, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -0.3951120972633362, "rewards/margins": 6.040757656097412, "rewards/rejected": -6.435870170593262, "step": 505 }, { "epoch": 5.988165680473373, "grad_norm": 4.13649603264138, "learning_rate": 8.753913216102285e-08, "logits/chosen": -1.404435634613037, "logits/rejected": -1.3154963254928589, "logps/chosen": -47.064483642578125, "logps/rejected": -74.79150390625, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": -3.253176212310791, "rewards/margins": 7.4068121910095215, "rewards/rejected": -10.659988403320312, "step": 506 }, { "epoch": 6.0, "grad_norm": 6.57455134237728, "learning_rate": 8.655299634124646e-08, "logits/chosen": -1.1584608554840088, "logits/rejected": -1.1112971305847168, "logps/chosen": -35.10588455200195, "logps/rejected": -55.12195587158203, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": -0.5993782877922058, "rewards/margins": 7.258352279663086, "rewards/rejected": -7.857730865478516, "step": 507 }, { "epoch": 6.011834319526627, "grad_norm": 5.072270968295128, "learning_rate": 8.557128234954189e-08, "logits/chosen": -1.2677009105682373, "logits/rejected": -1.2384376525878906, "logps/chosen": -26.764663696289062, "logps/rejected": -54.153812408447266, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": -0.4589073061943054, "rewards/margins": 7.230363845825195, "rewards/rejected": -7.689270973205566, "step": 508 }, { "epoch": 6.023668639053255, "grad_norm": 3.823553652256516, "learning_rate": 8.459401674479594e-08, "logits/chosen": -1.1779940128326416, "logits/rejected": -1.098586082458496, "logps/chosen": -37.93672180175781, "logps/rejected": -65.87274932861328, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -1.2578346729278564, "rewards/margins": 7.301403999328613, "rewards/rejected": -8.55923843383789, "step": 509 }, { "epoch": 6.035502958579881, "grad_norm": 3.2726696014561516, "learning_rate": 8.362122596555088e-08, "logits/chosen": -1.3603450059890747, "logits/rejected": -1.190643548965454, "logps/chosen": -31.422203063964844, "logps/rejected": -67.04301452636719, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": -0.9887487888336182, "rewards/margins": 9.028081893920898, "rewards/rejected": -10.016830444335938, "step": 510 }, { "epoch": 6.047337278106509, "grad_norm": 3.455246506585187, "learning_rate": 8.265293632928854e-08, "logits/chosen": -1.155658483505249, "logits/rejected": -1.0525039434432983, "logps/chosen": -38.18634796142578, "logps/rejected": -54.874427795410156, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -1.180433750152588, "rewards/margins": 5.439917087554932, "rewards/rejected": -6.6203508377075195, "step": 511 }, { "epoch": 6.059171597633136, "grad_norm": 3.350614417751688, "learning_rate": 8.16891740317189e-08, "logits/chosen": -1.2656223773956299, "logits/rejected": -1.3540056943893433, "logps/chosen": -35.06914138793945, "logps/rejected": -60.15445327758789, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": -1.5391483306884766, "rewards/margins": 8.150188446044922, "rewards/rejected": -9.689336776733398, "step": 512 }, { "epoch": 6.071005917159764, "grad_norm": 3.484209239680023, "learning_rate": 8.072996514607124e-08, "logits/chosen": -1.2974562644958496, "logits/rejected": -1.1921536922454834, "logps/chosen": -55.431785583496094, "logps/rejected": -79.14292907714844, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": -2.612431526184082, "rewards/margins": 9.743474960327148, "rewards/rejected": -12.355907440185547, "step": 513 }, { "epoch": 6.0828402366863905, "grad_norm": 4.446328914142042, "learning_rate": 7.977533562238838e-08, "logits/chosen": -1.277441143989563, "logits/rejected": -1.2345993518829346, "logps/chosen": -40.02715301513672, "logps/rejected": -76.5589599609375, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": -1.1496319770812988, "rewards/margins": 10.045013427734375, "rewards/rejected": -11.194644927978516, "step": 514 }, { "epoch": 6.094674556213017, "grad_norm": 3.4828911535899496, "learning_rate": 7.882531128682538e-08, "logits/chosen": -1.220058798789978, "logits/rejected": -1.2398343086242676, "logps/chosen": -48.92205810546875, "logps/rejected": -75.846923828125, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -2.44828462600708, "rewards/margins": 8.325919151306152, "rewards/rejected": -10.77420425415039, "step": 515 }, { "epoch": 6.106508875739645, "grad_norm": 3.974217510492179, "learning_rate": 7.787991784094999e-08, "logits/chosen": -1.1681749820709229, "logits/rejected": -1.2502659559249878, "logps/chosen": -40.98628616333008, "logps/rejected": -69.3888168334961, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": -2.623548746109009, "rewards/margins": 9.416035652160645, "rewards/rejected": -12.03958511352539, "step": 516 }, { "epoch": 6.118343195266272, "grad_norm": 3.087635521774272, "learning_rate": 7.693918086104825e-08, "logits/chosen": -1.2360180616378784, "logits/rejected": -1.1360399723052979, "logps/chosen": -37.70630645751953, "logps/rejected": -69.30216979980469, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -0.35728567838668823, "rewards/margins": 8.243356704711914, "rewards/rejected": -8.600642204284668, "step": 517 }, { "epoch": 6.1301775147929, "grad_norm": 4.697710852543094, "learning_rate": 7.60031257974316e-08, "logits/chosen": -1.4873653650283813, "logits/rejected": -1.365291953086853, "logps/chosen": -36.08180618286133, "logps/rejected": -69.22196960449219, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": -2.034714698791504, "rewards/margins": 7.524087905883789, "rewards/rejected": -9.558802604675293, "step": 518 }, { "epoch": 6.1420118343195265, "grad_norm": 4.048751031295707, "learning_rate": 7.507177797374927e-08, "logits/chosen": -1.3827829360961914, "logits/rejected": -1.363875389099121, "logps/chosen": -43.49116516113281, "logps/rejected": -78.3033447265625, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": -1.8128806352615356, "rewards/margins": 10.288780212402344, "rewards/rejected": -12.101661682128906, "step": 519 }, { "epoch": 6.153846153846154, "grad_norm": 3.0144143135282286, "learning_rate": 7.414516258630244e-08, "logits/chosen": -1.3672518730163574, "logits/rejected": -1.294494390487671, "logps/chosen": -28.87794303894043, "logps/rejected": -49.833717346191406, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -0.42372238636016846, "rewards/margins": 6.666773796081543, "rewards/rejected": -7.090496063232422, "step": 520 }, { "epoch": 6.165680473372781, "grad_norm": 5.7375168904982745, "learning_rate": 7.322330470336313e-08, "logits/chosen": -1.1266202926635742, "logits/rejected": -1.1897248029708862, "logps/chosen": -36.070762634277344, "logps/rejected": -67.00115203857422, "loss": 0.0237, "rewards/accuracies": 1.0, "rewards/chosen": -2.276731491088867, "rewards/margins": 8.913595199584961, "rewards/rejected": -11.190326690673828, "step": 521 }, { "epoch": 6.177514792899408, "grad_norm": 3.9341322070007227, "learning_rate": 7.230622926449564e-08, "logits/chosen": -1.2871345281600952, "logits/rejected": -1.3150429725646973, "logps/chosen": -32.48534393310547, "logps/rejected": -56.244903564453125, "loss": 0.0183, "rewards/accuracies": 1.0, "rewards/chosen": -1.2191728353500366, "rewards/margins": 7.289381980895996, "rewards/rejected": -8.508554458618164, "step": 522 }, { "epoch": 6.189349112426036, "grad_norm": 3.8384098316045776, "learning_rate": 7.139396107988193e-08, "logits/chosen": -1.0808664560317993, "logits/rejected": -1.051466464996338, "logps/chosen": -36.810298919677734, "logps/rejected": -50.8199462890625, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": 0.059516020119190216, "rewards/margins": 5.3354082107543945, "rewards/rejected": -5.27589225769043, "step": 523 }, { "epoch": 6.201183431952662, "grad_norm": 3.7617803402555965, "learning_rate": 7.048652482965078e-08, "logits/chosen": -1.5526072978973389, "logits/rejected": -1.4698734283447266, "logps/chosen": -35.63499450683594, "logps/rejected": -58.799835205078125, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": -1.1897969245910645, "rewards/margins": 6.380156517028809, "rewards/rejected": -7.569952964782715, "step": 524 }, { "epoch": 6.21301775147929, "grad_norm": 6.280571337534133, "learning_rate": 6.958394506320947e-08, "logits/chosen": -1.082352876663208, "logits/rejected": -0.9739059805870056, "logps/chosen": -59.19343566894531, "logps/rejected": -85.16358184814453, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": -3.377283811569214, "rewards/margins": 8.0195951461792, "rewards/rejected": -11.396878242492676, "step": 525 }, { "epoch": 6.224852071005917, "grad_norm": 3.6878257721370926, "learning_rate": 6.868624619858021e-08, "logits/chosen": -1.1773961782455444, "logits/rejected": -1.069741129875183, "logps/chosen": -35.47507095336914, "logps/rejected": -71.48643493652344, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": -0.48855772614479065, "rewards/margins": 9.38101863861084, "rewards/rejected": -9.869575500488281, "step": 526 }, { "epoch": 6.236686390532545, "grad_norm": 5.553600246448852, "learning_rate": 6.779345252173906e-08, "logits/chosen": -0.9772287011146545, "logits/rejected": -1.2621002197265625, "logps/chosen": -66.20740509033203, "logps/rejected": -61.06924819946289, "loss": 0.0241, "rewards/accuracies": 1.0, "rewards/chosen": -0.746315598487854, "rewards/margins": 5.9837775230407715, "rewards/rejected": -6.730093002319336, "step": 527 }, { "epoch": 6.2485207100591715, "grad_norm": 4.0981125572939785, "learning_rate": 6.690558818595943e-08, "logits/chosen": -1.3861331939697266, "logits/rejected": -1.297480821609497, "logps/chosen": -43.87446975708008, "logps/rejected": -63.237091064453125, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": -1.5427628755569458, "rewards/margins": 7.580643653869629, "rewards/rejected": -9.123406410217285, "step": 528 }, { "epoch": 6.260355029585799, "grad_norm": 3.147025133945202, "learning_rate": 6.602267721115806e-08, "logits/chosen": -1.410119891166687, "logits/rejected": -1.324759840965271, "logps/chosen": -42.89299011230469, "logps/rejected": -70.10383605957031, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -0.9887200593948364, "rewards/margins": 6.539437770843506, "rewards/rejected": -7.5281572341918945, "step": 529 }, { "epoch": 6.272189349112426, "grad_norm": 3.4923988387654714, "learning_rate": 6.514474348324581e-08, "logits/chosen": -1.1285738945007324, "logits/rejected": -1.0195645093917847, "logps/chosen": -39.756072998046875, "logps/rejected": -64.98381042480469, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -1.783644437789917, "rewards/margins": 7.37436056137085, "rewards/rejected": -9.158004760742188, "step": 530 }, { "epoch": 6.284023668639053, "grad_norm": 4.081843420506292, "learning_rate": 6.427181075348084e-08, "logits/chosen": -1.1205132007598877, "logits/rejected": -1.0184344053268433, "logps/chosen": -29.305950164794922, "logps/rejected": -60.268062591552734, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": -0.907474160194397, "rewards/margins": 6.502952575683594, "rewards/rejected": -7.410426139831543, "step": 531 }, { "epoch": 6.295857988165681, "grad_norm": 3.8839404909626585, "learning_rate": 6.340390263782655e-08, "logits/chosen": -1.3858798742294312, "logits/rejected": -1.2407383918762207, "logps/chosen": -41.1785888671875, "logps/rejected": -76.4423828125, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": -2.216499090194702, "rewards/margins": 8.766088485717773, "rewards/rejected": -10.982587814331055, "step": 532 }, { "epoch": 6.3076923076923075, "grad_norm": 4.4263228598532365, "learning_rate": 6.254104261631254e-08, "logits/chosen": -1.1712279319763184, "logits/rejected": -1.214058518409729, "logps/chosen": -34.68932342529297, "logps/rejected": -55.6457633972168, "loss": 0.0212, "rewards/accuracies": 1.0, "rewards/chosen": -1.2202101945877075, "rewards/margins": 6.376070022583008, "rewards/rejected": -7.596280574798584, "step": 533 }, { "epoch": 6.319526627218935, "grad_norm": 3.9131998116199918, "learning_rate": 6.168325403239913e-08, "logits/chosen": -1.3168898820877075, "logits/rejected": -1.4670816659927368, "logps/chosen": -45.90602111816406, "logps/rejected": -59.57246398925781, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": -1.582594633102417, "rewards/margins": 7.453899383544922, "rewards/rejected": -9.036494255065918, "step": 534 }, { "epoch": 6.331360946745562, "grad_norm": 4.764780460541576, "learning_rate": 6.08305600923463e-08, "logits/chosen": -1.1949267387390137, "logits/rejected": -1.23961341381073, "logps/chosen": -36.51057434082031, "logps/rejected": -72.36233520507812, "loss": 0.0216, "rewards/accuracies": 1.0, "rewards/chosen": -2.984785795211792, "rewards/margins": 8.723003387451172, "rewards/rejected": -11.707788467407227, "step": 535 }, { "epoch": 6.34319526627219, "grad_norm": 4.149947688223677, "learning_rate": 5.998298386458545e-08, "logits/chosen": -1.1838600635528564, "logits/rejected": -1.0856642723083496, "logps/chosen": -42.33216857910156, "logps/rejected": -66.8110122680664, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/chosen": -2.4710516929626465, "rewards/margins": 6.801319599151611, "rewards/rejected": -9.272371292114258, "step": 536 }, { "epoch": 6.355029585798817, "grad_norm": 4.1571001554854465, "learning_rate": 5.914054827909548e-08, "logits/chosen": -1.094924807548523, "logits/rejected": -1.1296252012252808, "logps/chosen": -46.14147186279297, "logps/rejected": -86.65098571777344, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -1.7291374206542969, "rewards/margins": 10.872190475463867, "rewards/rejected": -12.601327896118164, "step": 537 }, { "epoch": 6.366863905325443, "grad_norm": 4.127454820039479, "learning_rate": 5.830327612678265e-08, "logits/chosen": -1.187494158744812, "logits/rejected": -1.201699137687683, "logps/chosen": -43.62383270263672, "logps/rejected": -69.97433471679688, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": -1.9448773860931396, "rewards/margins": 9.503475189208984, "rewards/rejected": -11.448352813720703, "step": 538 }, { "epoch": 6.378698224852071, "grad_norm": 5.236200973476889, "learning_rate": 5.747119005886361e-08, "logits/chosen": -1.3696651458740234, "logits/rejected": -1.2970082759857178, "logps/chosen": -35.38874816894531, "logps/rejected": -63.076866149902344, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": -0.804175615310669, "rewards/margins": 7.7187957763671875, "rewards/rejected": -8.522972106933594, "step": 539 }, { "epoch": 6.390532544378698, "grad_norm": 3.4361194305765226, "learning_rate": 5.6644312586253044e-08, "logits/chosen": -1.3181718587875366, "logits/rejected": -1.4402356147766113, "logps/chosen": -47.9390754699707, "logps/rejected": -69.83265686035156, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": -1.2599098682403564, "rewards/margins": 6.955543518066406, "rewards/rejected": -8.215453147888184, "step": 540 }, { "epoch": 6.402366863905326, "grad_norm": 5.247988440802347, "learning_rate": 5.582266607895422e-08, "logits/chosen": -1.2013980150222778, "logits/rejected": -1.2277536392211914, "logps/chosen": -32.803062438964844, "logps/rejected": -51.46520233154297, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": -0.23354974389076233, "rewards/margins": 7.179046154022217, "rewards/rejected": -7.412596702575684, "step": 541 }, { "epoch": 6.414201183431953, "grad_norm": 4.074351591350812, "learning_rate": 5.5006272765454056e-08, "logits/chosen": -1.6633051633834839, "logits/rejected": -1.3203377723693848, "logps/chosen": -43.05129623413086, "logps/rejected": -89.4016342163086, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": -1.711348056793213, "rewards/margins": 10.217995643615723, "rewards/rejected": -11.929344177246094, "step": 542 }, { "epoch": 6.42603550295858, "grad_norm": 4.882941673599114, "learning_rate": 5.419515473212191e-08, "logits/chosen": -1.0465339422225952, "logits/rejected": -0.9034765958786011, "logps/chosen": -39.69237518310547, "logps/rejected": -64.68145751953125, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": -1.7466580867767334, "rewards/margins": 5.554471969604492, "rewards/rejected": -7.301130294799805, "step": 543 }, { "epoch": 6.437869822485207, "grad_norm": 3.1692395598442804, "learning_rate": 5.338933392261158e-08, "logits/chosen": -1.3215035200119019, "logits/rejected": -1.341964602470398, "logps/chosen": -40.1537971496582, "logps/rejected": -63.64967346191406, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": -2.646998405456543, "rewards/margins": 8.139799118041992, "rewards/rejected": -10.786796569824219, "step": 544 }, { "epoch": 6.449704142011834, "grad_norm": 5.949554370120341, "learning_rate": 5.258883213726828e-08, "logits/chosen": -1.194183588027954, "logits/rejected": -1.2076101303100586, "logps/chosen": -44.26350402832031, "logps/rejected": -64.31948852539062, "loss": 0.0322, "rewards/accuracies": 1.0, "rewards/chosen": -1.1567471027374268, "rewards/margins": 7.238821506500244, "rewards/rejected": -8.39556884765625, "step": 545 }, { "epoch": 6.461538461538462, "grad_norm": 3.7663787692141995, "learning_rate": 5.1793671032538206e-08, "logits/chosen": -1.4605358839035034, "logits/rejected": -1.4506992101669312, "logps/chosen": -35.8072395324707, "logps/rejected": -57.40998458862305, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": -1.3689743280410767, "rewards/margins": 7.345977783203125, "rewards/rejected": -8.714951515197754, "step": 546 }, { "epoch": 6.4733727810650885, "grad_norm": 2.9635523735572247, "learning_rate": 5.100387212038324e-08, "logits/chosen": -1.2727254629135132, "logits/rejected": -1.374696135520935, "logps/chosen": -39.98680114746094, "logps/rejected": -69.32289123535156, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -2.05542254447937, "rewards/margins": 8.780436515808105, "rewards/rejected": -10.835859298706055, "step": 547 }, { "epoch": 6.485207100591716, "grad_norm": 5.072194913958191, "learning_rate": 5.021945676769859e-08, "logits/chosen": -1.2008986473083496, "logits/rejected": -1.1709994077682495, "logps/chosen": -46.82018280029297, "logps/rejected": -68.16090393066406, "loss": 0.0225, "rewards/accuracies": 1.0, "rewards/chosen": -1.5476022958755493, "rewards/margins": 7.798556804656982, "rewards/rejected": -9.346158981323242, "step": 548 }, { "epoch": 6.497041420118343, "grad_norm": 4.114324411595974, "learning_rate": 4.9440446195734817e-08, "logits/chosen": -1.4089347124099731, "logits/rejected": -1.346145749092102, "logps/chosen": -31.944503784179688, "logps/rejected": -61.80097198486328, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": -1.2490514516830444, "rewards/margins": 8.163646697998047, "rewards/rejected": -9.412696838378906, "step": 549 }, { "epoch": 6.508875739644971, "grad_norm": 5.13609604269027, "learning_rate": 4.866686147952387e-08, "logits/chosen": -1.5707539319992065, "logits/rejected": -1.5316340923309326, "logps/chosen": -42.37779235839844, "logps/rejected": -67.51331329345703, "loss": 0.0238, "rewards/accuracies": 1.0, "rewards/chosen": -1.2908955812454224, "rewards/margins": 8.278606414794922, "rewards/rejected": -9.569501876831055, "step": 550 }, { "epoch": 6.520710059171598, "grad_norm": 5.255591810225031, "learning_rate": 4.789872354730873e-08, "logits/chosen": -1.5177299976348877, "logits/rejected": -1.4273862838745117, "logps/chosen": -32.705142974853516, "logps/rejected": -63.322547912597656, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": -0.9528019428253174, "rewards/margins": 7.921998023986816, "rewards/rejected": -8.874799728393555, "step": 551 }, { "epoch": 6.5325443786982245, "grad_norm": 3.6838956833536938, "learning_rate": 4.71360531799774e-08, "logits/chosen": -1.2748504877090454, "logits/rejected": -1.2340025901794434, "logps/chosen": -38.841758728027344, "logps/rejected": -68.60884094238281, "loss": 0.0168, "rewards/accuracies": 1.0, "rewards/chosen": -1.4062817096710205, "rewards/margins": 8.743720054626465, "rewards/rejected": -10.150002479553223, "step": 552 }, { "epoch": 6.544378698224852, "grad_norm": 3.742498897516179, "learning_rate": 4.637887101050053e-08, "logits/chosen": -1.331305742263794, "logits/rejected": -1.2110060453414917, "logps/chosen": -35.87771224975586, "logps/rejected": -66.48898315429688, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -0.5530174374580383, "rewards/margins": 7.692140579223633, "rewards/rejected": -8.245157241821289, "step": 553 }, { "epoch": 6.556213017751479, "grad_norm": 4.145756635966609, "learning_rate": 4.562719752337349e-08, "logits/chosen": -1.241339087486267, "logits/rejected": -1.238210916519165, "logps/chosen": -39.84772491455078, "logps/rejected": -65.00758361816406, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/chosen": -1.1539757251739502, "rewards/margins": 7.031815528869629, "rewards/rejected": -8.185791015625, "step": 554 }, { "epoch": 6.568047337278107, "grad_norm": 4.724411466421152, "learning_rate": 4.488105305406187e-08, "logits/chosen": -1.0200409889221191, "logits/rejected": -0.9774606823921204, "logps/chosen": -38.79374694824219, "logps/rejected": -61.03683090209961, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": -1.9451122283935547, "rewards/margins": 7.298187732696533, "rewards/rejected": -9.24329948425293, "step": 555 }, { "epoch": 6.579881656804734, "grad_norm": 4.840401053211273, "learning_rate": 4.4140457788451434e-08, "logits/chosen": -1.3930317163467407, "logits/rejected": -1.3531696796417236, "logps/chosen": -51.00494384765625, "logps/rejected": -69.20004272460938, "loss": 0.0199, "rewards/accuracies": 1.0, "rewards/chosen": -2.2470803260803223, "rewards/margins": 7.987146854400635, "rewards/rejected": -10.234228134155273, "step": 556 }, { "epoch": 6.591715976331361, "grad_norm": 4.058919938184551, "learning_rate": 4.340543176230232e-08, "logits/chosen": -1.1343650817871094, "logits/rejected": -1.149098515510559, "logps/chosen": -37.745506286621094, "logps/rejected": -62.56489562988281, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -1.3520774841308594, "rewards/margins": 7.742616176605225, "rewards/rejected": -9.094694137573242, "step": 557 }, { "epoch": 6.603550295857988, "grad_norm": 3.8922355564931865, "learning_rate": 4.267599486070647e-08, "logits/chosen": -1.1515233516693115, "logits/rejected": -1.1255171298980713, "logps/chosen": -33.81602478027344, "logps/rejected": -61.61820983886719, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": -1.054863691329956, "rewards/margins": 8.12367057800293, "rewards/rejected": -9.178534507751465, "step": 558 }, { "epoch": 6.615384615384615, "grad_norm": 2.9916154212804025, "learning_rate": 4.1952166817550176e-08, "logits/chosen": -1.3873323202133179, "logits/rejected": -1.270090937614441, "logps/chosen": -37.5373649597168, "logps/rejected": -70.85179138183594, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -1.6242471933364868, "rewards/margins": 9.16792106628418, "rewards/rejected": -10.792167663574219, "step": 559 }, { "epoch": 6.627218934911243, "grad_norm": 3.497280688298504, "learning_rate": 4.1233967214979764e-08, "logits/chosen": -1.2440836429595947, "logits/rejected": -1.182302713394165, "logps/chosen": -36.41077423095703, "logps/rejected": -67.35704803466797, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -0.9620167016983032, "rewards/margins": 7.311346054077148, "rewards/rejected": -8.27336311340332, "step": 560 }, { "epoch": 6.6390532544378695, "grad_norm": 5.110607066618349, "learning_rate": 4.05214154828723e-08, "logits/chosen": -1.1929218769073486, "logits/rejected": -1.1870311498641968, "logps/chosen": -41.97100830078125, "logps/rejected": -72.02296447753906, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": -1.625248670578003, "rewards/margins": 8.42895793914795, "rewards/rejected": -10.054206848144531, "step": 561 }, { "epoch": 6.650887573964497, "grad_norm": 4.697261168017771, "learning_rate": 3.9814530898309356e-08, "logits/chosen": -1.4104831218719482, "logits/rejected": -1.1492834091186523, "logps/chosen": -40.75250244140625, "logps/rejected": -86.0766372680664, "loss": 0.0237, "rewards/accuracies": 1.0, "rewards/chosen": -1.9909330606460571, "rewards/margins": 10.393043518066406, "rewards/rejected": -12.383977890014648, "step": 562 }, { "epoch": 6.662721893491124, "grad_norm": 3.86163495686982, "learning_rate": 3.9113332585056166e-08, "logits/chosen": -1.0711450576782227, "logits/rejected": -1.1439751386642456, "logps/chosen": -47.77074432373047, "logps/rejected": -57.20773696899414, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": -2.2142012119293213, "rewards/margins": 5.361945152282715, "rewards/rejected": -7.576145648956299, "step": 563 }, { "epoch": 6.674556213017752, "grad_norm": 3.142265753154405, "learning_rate": 3.8417839513043646e-08, "logits/chosen": -1.3805079460144043, "logits/rejected": -1.2833032608032227, "logps/chosen": -34.34812927246094, "logps/rejected": -61.53770446777344, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": -1.6832209825515747, "rewards/margins": 7.745155334472656, "rewards/rejected": -9.428376197814941, "step": 564 }, { "epoch": 6.686390532544379, "grad_norm": 4.537864993317601, "learning_rate": 3.7728070497855594e-08, "logits/chosen": -1.4104318618774414, "logits/rejected": -1.3599334955215454, "logps/chosen": -37.086971282958984, "logps/rejected": -67.63109588623047, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": -0.2368394136428833, "rewards/margins": 7.939535140991211, "rewards/rejected": -8.176374435424805, "step": 565 }, { "epoch": 6.6982248520710055, "grad_norm": 3.938202778272971, "learning_rate": 3.704404420021956e-08, "logits/chosen": -1.3482962846755981, "logits/rejected": -1.3277989625930786, "logps/chosen": -42.97172546386719, "logps/rejected": -66.13438415527344, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -2.497891902923584, "rewards/margins": 7.281721115112305, "rewards/rejected": -9.77961254119873, "step": 566 }, { "epoch": 6.710059171597633, "grad_norm": 4.295649983212432, "learning_rate": 3.636577912550187e-08, "logits/chosen": -1.3801615238189697, "logits/rejected": -1.3534774780273438, "logps/chosen": -48.245174407958984, "logps/rejected": -75.80772399902344, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": -2.976886510848999, "rewards/margins": 8.30002212524414, "rewards/rejected": -11.276908874511719, "step": 567 }, { "epoch": 6.72189349112426, "grad_norm": 3.3047241439197497, "learning_rate": 3.569329362320708e-08, "logits/chosen": -1.323474407196045, "logits/rejected": -1.2122368812561035, "logps/chosen": -38.685638427734375, "logps/rejected": -81.03096771240234, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -0.7437925934791565, "rewards/margins": 10.262330055236816, "rewards/rejected": -11.006122589111328, "step": 568 }, { "epoch": 6.733727810650888, "grad_norm": 3.1683374020519315, "learning_rate": 3.5026605886481736e-08, "logits/chosen": -1.4267765283584595, "logits/rejected": -1.4384936094284058, "logps/chosen": -40.923892974853516, "logps/rejected": -66.135009765625, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": -1.4209558963775635, "rewards/margins": 7.333900451660156, "rewards/rejected": -8.75485610961914, "step": 569 }, { "epoch": 6.745562130177515, "grad_norm": 4.91012737015821, "learning_rate": 3.436573395162179e-08, "logits/chosen": -1.4396589994430542, "logits/rejected": -1.3906140327453613, "logps/chosen": -34.09233856201172, "logps/rejected": -70.2711181640625, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": -1.1198707818984985, "rewards/margins": 8.739965438842773, "rewards/rejected": -9.85983657836914, "step": 570 }, { "epoch": 6.757396449704142, "grad_norm": 3.6007019387475703, "learning_rate": 3.371069569758511e-08, "logits/chosen": -1.251147747039795, "logits/rejected": -1.1890754699707031, "logps/chosen": -53.093597412109375, "logps/rejected": -83.10395050048828, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -1.4066200256347656, "rewards/margins": 8.366990089416504, "rewards/rejected": -9.773611068725586, "step": 571 }, { "epoch": 6.769230769230769, "grad_norm": 3.8258738774626986, "learning_rate": 3.306150884550732e-08, "logits/chosen": -1.2099252939224243, "logits/rejected": -1.2655442953109741, "logps/chosen": -43.43336486816406, "logps/rejected": -58.43470764160156, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": -2.2070910930633545, "rewards/margins": 6.868325710296631, "rewards/rejected": -9.075416564941406, "step": 572 }, { "epoch": 6.781065088757396, "grad_norm": 4.510756393854719, "learning_rate": 3.241819095822288e-08, "logits/chosen": -1.2797162532806396, "logits/rejected": -1.3578741550445557, "logps/chosen": -37.647857666015625, "logps/rejected": -63.263267517089844, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": 0.05109795928001404, "rewards/margins": 7.741544246673584, "rewards/rejected": -7.690445899963379, "step": 573 }, { "epoch": 6.792899408284024, "grad_norm": 3.963518873210086, "learning_rate": 3.17807594397895e-08, "logits/chosen": -1.1429615020751953, "logits/rejected": -1.1946382522583008, "logps/chosen": -34.58401870727539, "logps/rejected": -63.32178497314453, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": -1.431036114692688, "rewards/margins": 7.827291488647461, "rewards/rejected": -9.25832748413086, "step": 574 }, { "epoch": 6.804733727810651, "grad_norm": 3.969471879010277, "learning_rate": 3.114923153501747e-08, "logits/chosen": -1.2164993286132812, "logits/rejected": -1.1609983444213867, "logps/chosen": -31.950668334960938, "logps/rejected": -61.592769622802734, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": -1.7955553531646729, "rewards/margins": 7.445010185241699, "rewards/rejected": -9.24056625366211, "step": 575 }, { "epoch": 6.816568047337278, "grad_norm": 4.7113289076048, "learning_rate": 3.052362432900332e-08, "logits/chosen": -1.1342840194702148, "logits/rejected": -1.0492384433746338, "logps/chosen": -32.85274124145508, "logps/rejected": -72.11370086669922, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": -1.3644051551818848, "rewards/margins": 8.635282516479492, "rewards/rejected": -9.999687194824219, "step": 576 }, { "epoch": 6.828402366863905, "grad_norm": 3.7717255825498675, "learning_rate": 2.990395474666724e-08, "logits/chosen": -1.0524818897247314, "logits/rejected": -0.9995134472846985, "logps/chosen": -40.310935974121094, "logps/rejected": -60.963748931884766, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": -2.0798089504241943, "rewards/margins": 6.669445991516113, "rewards/rejected": -8.74925422668457, "step": 577 }, { "epoch": 6.840236686390533, "grad_norm": 4.294637735523631, "learning_rate": 2.9290239552295538e-08, "logits/chosen": -1.2513582706451416, "logits/rejected": -1.1278815269470215, "logps/chosen": -37.120304107666016, "logps/rejected": -60.05486297607422, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": -0.5432335734367371, "rewards/margins": 6.656371593475342, "rewards/rejected": -7.1996049880981445, "step": 578 }, { "epoch": 6.85207100591716, "grad_norm": 3.417783997535037, "learning_rate": 2.8682495349086816e-08, "logits/chosen": -1.1197659969329834, "logits/rejected": -1.3288072347640991, "logps/chosen": -47.85517883300781, "logps/rejected": -61.156795501708984, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -1.6630420684814453, "rewards/margins": 7.849754810333252, "rewards/rejected": -9.512797355651855, "step": 579 }, { "epoch": 6.8639053254437865, "grad_norm": 3.9888601265802626, "learning_rate": 2.8080738578703052e-08, "logits/chosen": -1.3530460596084595, "logits/rejected": -1.4569690227508545, "logps/chosen": -54.98475646972656, "logps/rejected": -76.0159912109375, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": -1.483710765838623, "rewards/margins": 9.008731842041016, "rewards/rejected": -10.49244213104248, "step": 580 }, { "epoch": 6.875739644970414, "grad_norm": 2.9654411597287273, "learning_rate": 2.748498552082465e-08, "logits/chosen": -1.2621766328811646, "logits/rejected": -1.3557482957839966, "logps/chosen": -41.09724044799805, "logps/rejected": -64.43111419677734, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -1.5274691581726074, "rewards/margins": 7.362679481506348, "rewards/rejected": -8.890148162841797, "step": 581 }, { "epoch": 6.887573964497041, "grad_norm": 3.6343192832198272, "learning_rate": 2.6895252292709974e-08, "logits/chosen": -1.2823108434677124, "logits/rejected": -1.2685920000076294, "logps/chosen": -47.48131561279297, "logps/rejected": -65.60801696777344, "loss": 0.0168, "rewards/accuracies": 1.0, "rewards/chosen": -1.3462042808532715, "rewards/margins": 6.321001052856445, "rewards/rejected": -7.667204856872559, "step": 582 }, { "epoch": 6.899408284023669, "grad_norm": 4.413434221844061, "learning_rate": 2.631155484875952e-08, "logits/chosen": -1.2258765697479248, "logits/rejected": -1.1181480884552002, "logps/chosen": -43.612274169921875, "logps/rejected": -72.54850769042969, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": -1.8452584743499756, "rewards/margins": 8.232542991638184, "rewards/rejected": -10.077801704406738, "step": 583 }, { "epoch": 6.911242603550296, "grad_norm": 3.1918002353766037, "learning_rate": 2.5733908980083984e-08, "logits/chosen": -1.3073086738586426, "logits/rejected": -1.2347233295440674, "logps/chosen": -40.125244140625, "logps/rejected": -69.67649841308594, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": -2.3976001739501953, "rewards/margins": 8.126445770263672, "rewards/rejected": -10.524045944213867, "step": 584 }, { "epoch": 6.923076923076923, "grad_norm": 3.672509775762771, "learning_rate": 2.5162330314077385e-08, "logits/chosen": -1.2911275625228882, "logits/rejected": -1.332797646522522, "logps/chosen": -60.97373580932617, "logps/rejected": -87.76260375976562, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": -4.335407733917236, "rewards/margins": 8.518815994262695, "rewards/rejected": -12.854223251342773, "step": 585 }, { "epoch": 6.93491124260355, "grad_norm": 2.4576523514031123, "learning_rate": 2.4596834313994037e-08, "logits/chosen": -1.215404748916626, "logits/rejected": -1.20293128490448, "logps/chosen": -31.431991577148438, "logps/rejected": -50.589115142822266, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -0.9661941528320312, "rewards/margins": 6.143428325653076, "rewards/rejected": -7.109622955322266, "step": 586 }, { "epoch": 6.946745562130177, "grad_norm": 4.1809106849701685, "learning_rate": 2.403743627853039e-08, "logits/chosen": -1.1825206279754639, "logits/rejected": -1.193249225616455, "logps/chosen": -36.14801788330078, "logps/rejected": -66.2503662109375, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": -1.1976075172424316, "rewards/margins": 9.384969711303711, "rewards/rejected": -10.582576751708984, "step": 587 }, { "epoch": 6.958579881656805, "grad_norm": 4.981046398430429, "learning_rate": 2.3484151341411018e-08, "logits/chosen": -1.0915954113006592, "logits/rejected": -1.0409646034240723, "logps/chosen": -41.00914001464844, "logps/rejected": -60.438446044921875, "loss": 0.0215, "rewards/accuracies": 1.0, "rewards/chosen": -0.23959925770759583, "rewards/margins": 7.114398956298828, "rewards/rejected": -7.353998184204102, "step": 588 }, { "epoch": 6.970414201183432, "grad_norm": 3.7291093865283065, "learning_rate": 2.2936994470979188e-08, "logits/chosen": -1.3922624588012695, "logits/rejected": -1.4559788703918457, "logps/chosen": -56.968406677246094, "logps/rejected": -65.14087677001953, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": -1.4613820314407349, "rewards/margins": 7.349767684936523, "rewards/rejected": -8.811149597167969, "step": 589 }, { "epoch": 6.982248520710059, "grad_norm": 4.305952133916051, "learning_rate": 2.23959804697921e-08, "logits/chosen": -1.133141040802002, "logits/rejected": -1.262980341911316, "logps/chosen": -45.40880584716797, "logps/rejected": -58.749446868896484, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": -2.105184316635132, "rewards/margins": 7.048664093017578, "rewards/rejected": -9.153848648071289, "step": 590 }, { "epoch": 6.994082840236686, "grad_norm": 4.189520921835817, "learning_rate": 2.1861123974220158e-08, "logits/chosen": -1.2279367446899414, "logits/rejected": -1.281379222869873, "logps/chosen": -45.95500183105469, "logps/rejected": -78.72282409667969, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -2.1575565338134766, "rewards/margins": 9.946080207824707, "rewards/rejected": -12.103636741638184, "step": 591 }, { "epoch": 7.005917159763314, "grad_norm": 3.8418221234553425, "learning_rate": 2.1332439454051277e-08, "logits/chosen": -1.03047513961792, "logits/rejected": -0.9760541915893555, "logps/chosen": -35.22906494140625, "logps/rejected": -65.05894470214844, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": -0.0646643340587616, "rewards/margins": 7.141489028930664, "rewards/rejected": -7.206153869628906, "step": 592 }, { "epoch": 7.017751479289941, "grad_norm": 4.37902481619465, "learning_rate": 2.080994121209914e-08, "logits/chosen": -1.2298972606658936, "logits/rejected": -1.2462295293807983, "logps/chosen": -52.52454376220703, "logps/rejected": -73.28079223632812, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": -2.6756205558776855, "rewards/margins": 7.350424766540527, "rewards/rejected": -10.026044845581055, "step": 593 }, { "epoch": 7.029585798816568, "grad_norm": 3.9841674822113564, "learning_rate": 2.029364338381656e-08, "logits/chosen": -1.2008388042449951, "logits/rejected": -1.2781215906143188, "logps/chosen": -46.399505615234375, "logps/rejected": -59.717586517333984, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": -2.2308552265167236, "rewards/margins": 6.461019515991211, "rewards/rejected": -8.691874504089355, "step": 594 }, { "epoch": 7.041420118343195, "grad_norm": 3.5149491748374957, "learning_rate": 1.9783559936912773e-08, "logits/chosen": -1.4331488609313965, "logits/rejected": -1.3519785404205322, "logps/chosen": -44.29254150390625, "logps/rejected": -78.45376586914062, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -1.327043056488037, "rewards/margins": 8.162921905517578, "rewards/rejected": -9.489965438842773, "step": 595 }, { "epoch": 7.053254437869822, "grad_norm": 3.509610689232701, "learning_rate": 1.9279704670975726e-08, "logits/chosen": -1.0991450548171997, "logits/rejected": -1.106555700302124, "logps/chosen": -48.36073303222656, "logps/rejected": -64.15289306640625, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -0.800105631351471, "rewards/margins": 6.824061393737793, "rewards/rejected": -7.624166965484619, "step": 596 }, { "epoch": 7.06508875739645, "grad_norm": 4.256443303454753, "learning_rate": 1.8782091217098728e-08, "logits/chosen": -1.4569830894470215, "logits/rejected": -1.4492859840393066, "logps/chosen": -38.11000061035156, "logps/rejected": -70.66535949707031, "loss": 0.0201, "rewards/accuracies": 1.0, "rewards/chosen": -2.615638017654419, "rewards/margins": 8.343268394470215, "rewards/rejected": -10.958906173706055, "step": 597 }, { "epoch": 7.076923076923077, "grad_norm": 3.1992906689262, "learning_rate": 1.829073303751172e-08, "logits/chosen": -1.296358585357666, "logits/rejected": -1.2894511222839355, "logps/chosen": -49.65617370605469, "logps/rejected": -63.472599029541016, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -3.235658884048462, "rewards/margins": 6.7473063468933105, "rewards/rejected": -9.982964515686035, "step": 598 }, { "epoch": 7.088757396449704, "grad_norm": 3.941759552775281, "learning_rate": 1.780564342521698e-08, "logits/chosen": -1.2821180820465088, "logits/rejected": -1.2152577638626099, "logps/chosen": -43.84495544433594, "logps/rejected": -70.52769470214844, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -1.0584795475006104, "rewards/margins": 7.640247344970703, "rewards/rejected": -8.69872760772705, "step": 599 }, { "epoch": 7.100591715976331, "grad_norm": 3.553599235582575, "learning_rate": 1.732683550362954e-08, "logits/chosen": -1.1523908376693726, "logits/rejected": -1.2564313411712646, "logps/chosen": -35.25387954711914, "logps/rejected": -55.94652557373047, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -0.38266000151634216, "rewards/margins": 6.977762222290039, "rewards/rejected": -7.360422134399414, "step": 600 }, { "epoch": 7.112426035502959, "grad_norm": 3.8288450725810717, "learning_rate": 1.6854322226222102e-08, "logits/chosen": -1.2984129190444946, "logits/rejected": -1.2461819648742676, "logps/chosen": -46.678077697753906, "logps/rejected": -75.3173828125, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": -2.282292604446411, "rewards/margins": 8.546947479248047, "rewards/rejected": -10.829240798950195, "step": 601 }, { "epoch": 7.124260355029586, "grad_norm": 4.219862245925515, "learning_rate": 1.6388116376174765e-08, "logits/chosen": -1.2928462028503418, "logits/rejected": -1.1129931211471558, "logps/chosen": -40.22349548339844, "logps/rejected": -70.40786743164062, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": -1.8234593868255615, "rewards/margins": 8.468381881713867, "rewards/rejected": -10.291842460632324, "step": 602 }, { "epoch": 7.136094674556213, "grad_norm": 3.8475070615621014, "learning_rate": 1.5928230566028932e-08, "logits/chosen": -1.4136502742767334, "logits/rejected": -1.2762510776519775, "logps/chosen": -35.088523864746094, "logps/rejected": -61.09199523925781, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": -1.4712510108947754, "rewards/margins": 7.1925201416015625, "rewards/rejected": -8.663771629333496, "step": 603 }, { "epoch": 7.14792899408284, "grad_norm": 3.206852988789503, "learning_rate": 1.5474677237346468e-08, "logits/chosen": -1.3350969552993774, "logits/rejected": -1.2451387643814087, "logps/chosen": -28.593502044677734, "logps/rejected": -62.2099494934082, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -0.741662859916687, "rewards/margins": 8.64741325378418, "rewards/rejected": -9.389076232910156, "step": 604 }, { "epoch": 7.159763313609467, "grad_norm": 3.844656802369703, "learning_rate": 1.5027468660372604e-08, "logits/chosen": -1.3713629245758057, "logits/rejected": -1.378098964691162, "logps/chosen": -47.545536041259766, "logps/rejected": -69.47958374023438, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": -1.9896124601364136, "rewards/margins": 8.07425308227539, "rewards/rejected": -10.063865661621094, "step": 605 }, { "epoch": 7.171597633136095, "grad_norm": 3.0241225907385165, "learning_rate": 1.4586616933704527e-08, "logits/chosen": -1.2971570491790771, "logits/rejected": -1.245847225189209, "logps/chosen": -38.302677154541016, "logps/rejected": -68.13219451904297, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -1.4747118949890137, "rewards/margins": 8.88740062713623, "rewards/rejected": -10.362112998962402, "step": 606 }, { "epoch": 7.183431952662722, "grad_norm": 5.654903481243723, "learning_rate": 1.4152133983963643e-08, "logits/chosen": -1.3624151945114136, "logits/rejected": -1.289166808128357, "logps/chosen": -36.01699447631836, "logps/rejected": -65.6912841796875, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": -1.6957451105117798, "rewards/margins": 7.106748580932617, "rewards/rejected": -8.80249309539795, "step": 607 }, { "epoch": 7.195266272189349, "grad_norm": 2.9794480072644394, "learning_rate": 1.372403156547311e-08, "logits/chosen": -1.2158567905426025, "logits/rejected": -1.2747105360031128, "logps/chosen": -34.01419448852539, "logps/rejected": -57.90458297729492, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": -0.9422094821929932, "rewards/margins": 7.069450378417969, "rewards/rejected": -8.011659622192383, "step": 608 }, { "epoch": 7.207100591715976, "grad_norm": 4.261973188621215, "learning_rate": 1.330232125993988e-08, "logits/chosen": -1.3627076148986816, "logits/rejected": -1.3639800548553467, "logps/chosen": -48.03813552856445, "logps/rejected": -64.04691314697266, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": -2.8824987411499023, "rewards/margins": 7.479591369628906, "rewards/rejected": -10.362090110778809, "step": 609 }, { "epoch": 7.218934911242604, "grad_norm": 4.803739657888739, "learning_rate": 1.2887014476141212e-08, "logits/chosen": -1.4838051795959473, "logits/rejected": -1.3311843872070312, "logps/chosen": -43.892982482910156, "logps/rejected": -81.17485046386719, "loss": 0.0189, "rewards/accuracies": 1.0, "rewards/chosen": -1.9665614366531372, "rewards/margins": 9.852771759033203, "rewards/rejected": -11.81933307647705, "step": 610 }, { "epoch": 7.230769230769231, "grad_norm": 4.0756670658237475, "learning_rate": 1.2478122449616212e-08, "logits/chosen": -1.1747201681137085, "logits/rejected": -1.2890151739120483, "logps/chosen": -56.25837707519531, "logps/rejected": -65.45867156982422, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": -2.2632644176483154, "rewards/margins": 7.053145885467529, "rewards/rejected": -9.316410064697266, "step": 611 }, { "epoch": 7.242603550295858, "grad_norm": 4.38053938987994, "learning_rate": 1.2075656242361732e-08, "logits/chosen": -1.0797568559646606, "logits/rejected": -1.0762195587158203, "logps/chosen": -33.57958221435547, "logps/rejected": -71.72637176513672, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": -2.2206077575683594, "rewards/margins": 10.526464462280273, "rewards/rejected": -12.74707317352295, "step": 612 }, { "epoch": 7.254437869822485, "grad_norm": 2.925185904249556, "learning_rate": 1.16796267425332e-08, "logits/chosen": -1.4559069871902466, "logits/rejected": -1.3925589323043823, "logps/chosen": -40.79030990600586, "logps/rejected": -67.69854736328125, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -2.6704390048980713, "rewards/margins": 7.342378616333008, "rewards/rejected": -10.0128173828125, "step": 613 }, { "epoch": 7.266272189349112, "grad_norm": 4.199453917170064, "learning_rate": 1.1290044664149873e-08, "logits/chosen": -1.323655128479004, "logits/rejected": -1.2902326583862305, "logps/chosen": -43.997676849365234, "logps/rejected": -70.29308319091797, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": -2.0053741931915283, "rewards/margins": 7.789745330810547, "rewards/rejected": -9.795119285583496, "step": 614 }, { "epoch": 7.27810650887574, "grad_norm": 2.9644130977906675, "learning_rate": 1.0906920546805253e-08, "logits/chosen": -1.331533670425415, "logits/rejected": -1.318424940109253, "logps/chosen": -41.30125045776367, "logps/rejected": -69.17133331298828, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -1.8695695400238037, "rewards/margins": 8.263007164001465, "rewards/rejected": -10.132576942443848, "step": 615 }, { "epoch": 7.289940828402367, "grad_norm": 2.9111429661875294, "learning_rate": 1.0530264755381824e-08, "logits/chosen": -1.5518194437026978, "logits/rejected": -1.2643704414367676, "logps/chosen": -25.555744171142578, "logps/rejected": -70.7457275390625, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": -0.22286051511764526, "rewards/margins": 10.125096321105957, "rewards/rejected": -10.347957611083984, "step": 616 }, { "epoch": 7.3017751479289945, "grad_norm": 3.222903401246695, "learning_rate": 1.0160087479770513e-08, "logits/chosen": -1.147964358329773, "logits/rejected": -1.1484017372131348, "logps/chosen": -36.42656326293945, "logps/rejected": -54.97199249267578, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": -1.2413222789764404, "rewards/margins": 6.4268999099731445, "rewards/rejected": -7.668221950531006, "step": 617 }, { "epoch": 7.313609467455621, "grad_norm": 4.567760348222854, "learning_rate": 9.796398734595284e-09, "logits/chosen": -1.261488437652588, "logits/rejected": -1.1865301132202148, "logps/chosen": -33.513763427734375, "logps/rejected": -80.79141235351562, "loss": 0.0239, "rewards/accuracies": 1.0, "rewards/chosen": -1.4354597330093384, "rewards/margins": 9.321052551269531, "rewards/rejected": -10.756511688232422, "step": 618 }, { "epoch": 7.325443786982248, "grad_norm": 3.241502517680726, "learning_rate": 9.439208358941907e-09, "logits/chosen": -1.1972136497497559, "logits/rejected": -1.074997901916504, "logps/chosen": -47.66037368774414, "logps/rejected": -74.0272445678711, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -2.2272040843963623, "rewards/margins": 8.553009033203125, "rewards/rejected": -10.78021240234375, "step": 619 }, { "epoch": 7.337278106508876, "grad_norm": 5.389252767817157, "learning_rate": 9.088526016092141e-09, "logits/chosen": -1.5381629467010498, "logits/rejected": -1.3760014772415161, "logps/chosen": -40.557369232177734, "logps/rejected": -73.04681396484375, "loss": 0.0224, "rewards/accuracies": 1.0, "rewards/chosen": -3.1969547271728516, "rewards/margins": 8.717408180236816, "rewards/rejected": -11.914361953735352, "step": 620 }, { "epoch": 7.349112426035503, "grad_norm": 4.492355419112391, "learning_rate": 8.744361193261912e-09, "logits/chosen": -1.2798972129821777, "logits/rejected": -1.2866759300231934, "logps/chosen": -46.820777893066406, "logps/rejected": -71.25297546386719, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": -1.2585588693618774, "rewards/margins": 8.543848037719727, "rewards/rejected": -9.802406311035156, "step": 621 }, { "epoch": 7.3609467455621305, "grad_norm": 2.590234218369342, "learning_rate": 8.40672320134489e-09, "logits/chosen": -1.3868498802185059, "logits/rejected": -1.3360377550125122, "logps/chosen": -32.945518493652344, "logps/rejected": -53.556705474853516, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -2.3330631256103516, "rewards/margins": 6.077226638793945, "rewards/rejected": -8.410289764404297, "step": 622 }, { "epoch": 7.372781065088757, "grad_norm": 4.999566857886415, "learning_rate": 8.075621174660625e-09, "logits/chosen": -1.1354734897613525, "logits/rejected": -1.1073994636535645, "logps/chosen": -46.984012603759766, "logps/rejected": -67.13949584960938, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": -2.1461362838745117, "rewards/margins": 7.404544353485107, "rewards/rejected": -9.550680160522461, "step": 623 }, { "epoch": 7.384615384615385, "grad_norm": 4.442848421246316, "learning_rate": 7.751064070707247e-09, "logits/chosen": -0.9358320236206055, "logits/rejected": -1.0718661546707153, "logps/chosen": -57.682579040527344, "logps/rejected": -67.44542694091797, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": -1.3646697998046875, "rewards/margins": 7.940317153930664, "rewards/rejected": -9.304986953735352, "step": 624 }, { "epoch": 7.396449704142012, "grad_norm": 3.6393760430903463, "learning_rate": 7.4330606699193055e-09, "logits/chosen": -1.3721582889556885, "logits/rejected": -1.194779634475708, "logps/chosen": -41.03765869140625, "logps/rejected": -67.66683959960938, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -2.100832939147949, "rewards/margins": 7.313269138336182, "rewards/rejected": -9.414101600646973, "step": 625 }, { "epoch": 7.408284023668639, "grad_norm": 2.8173995600402995, "learning_rate": 7.12161957543006e-09, "logits/chosen": -1.1632230281829834, "logits/rejected": -1.1450843811035156, "logps/chosen": -45.309226989746094, "logps/rejected": -67.70977783203125, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -1.5633859634399414, "rewards/margins": 7.859521865844727, "rewards/rejected": -9.422907829284668, "step": 626 }, { "epoch": 7.420118343195266, "grad_norm": 3.6516772499433054, "learning_rate": 6.816749212839007e-09, "logits/chosen": -1.2808470726013184, "logits/rejected": -1.3078573942184448, "logps/chosen": -44.84081268310547, "logps/rejected": -70.39693450927734, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": -2.452430009841919, "rewards/margins": 7.464086532592773, "rewards/rejected": -9.91651725769043, "step": 627 }, { "epoch": 7.431952662721893, "grad_norm": 3.889072407283795, "learning_rate": 6.518457829983559e-09, "logits/chosen": -1.185568928718567, "logits/rejected": -1.052988052368164, "logps/chosen": -41.724830627441406, "logps/rejected": -69.43457794189453, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -1.4893982410430908, "rewards/margins": 6.81561803817749, "rewards/rejected": -8.30501651763916, "step": 628 }, { "epoch": 7.443786982248521, "grad_norm": 2.887200349157875, "learning_rate": 6.226753496716253e-09, "logits/chosen": -1.2950429916381836, "logits/rejected": -1.252742052078247, "logps/chosen": -30.59502410888672, "logps/rejected": -50.878868103027344, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -0.0032743215560913086, "rewards/margins": 6.188243865966797, "rewards/rejected": -6.1915178298950195, "step": 629 }, { "epoch": 7.455621301775148, "grad_norm": 4.244222893379881, "learning_rate": 5.9416441046862555e-09, "logits/chosen": -1.301514983177185, "logits/rejected": -1.3513267040252686, "logps/chosen": -39.38859558105469, "logps/rejected": -67.55226135253906, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": -1.7557300329208374, "rewards/margins": 9.34812068939209, "rewards/rejected": -11.103851318359375, "step": 630 }, { "epoch": 7.4674556213017755, "grad_norm": 4.290461165134473, "learning_rate": 5.663137367125898e-09, "logits/chosen": -1.3903230428695679, "logits/rejected": -1.3901610374450684, "logps/chosen": -46.33057403564453, "logps/rejected": -58.841392517089844, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": -1.6279226541519165, "rewards/margins": 7.242958068847656, "rewards/rejected": -8.870880126953125, "step": 631 }, { "epoch": 7.479289940828402, "grad_norm": 3.439301636458819, "learning_rate": 5.3912408186420064e-09, "logits/chosen": -1.1362035274505615, "logits/rejected": -1.0982844829559326, "logps/chosen": -37.11709213256836, "logps/rejected": -61.59524154663086, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -1.149167537689209, "rewards/margins": 7.633286952972412, "rewards/rejected": -8.782454490661621, "step": 632 }, { "epoch": 7.491124260355029, "grad_norm": 4.470095918148292, "learning_rate": 5.12596181501207e-09, "logits/chosen": -1.2651134729385376, "logits/rejected": -1.3249282836914062, "logps/chosen": -30.522804260253906, "logps/rejected": -51.84233856201172, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": -0.16683726012706757, "rewards/margins": 7.101442813873291, "rewards/rejected": -7.268280029296875, "step": 633 }, { "epoch": 7.502958579881657, "grad_norm": 3.3611637574552025, "learning_rate": 4.867307532985227e-09, "logits/chosen": -1.364112377166748, "logits/rejected": -1.2576634883880615, "logps/chosen": -49.3748664855957, "logps/rejected": -83.98435974121094, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": -2.741516590118408, "rewards/margins": 9.956401824951172, "rewards/rejected": -12.697917938232422, "step": 634 }, { "epoch": 7.514792899408284, "grad_norm": 2.9574639791748036, "learning_rate": 4.615284970088173e-09, "logits/chosen": -1.1360864639282227, "logits/rejected": -1.1513174772262573, "logps/chosen": -36.475341796875, "logps/rejected": -70.95317077636719, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": -2.0532495975494385, "rewards/margins": 8.06146240234375, "rewards/rejected": -10.114712715148926, "step": 635 }, { "epoch": 7.5266272189349115, "grad_norm": 3.4730054907543497, "learning_rate": 4.369900944435734e-09, "logits/chosen": -1.369525671005249, "logits/rejected": -1.2865912914276123, "logps/chosen": -40.689971923828125, "logps/rejected": -69.06320190429688, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -2.378922700881958, "rewards/margins": 8.550222396850586, "rewards/rejected": -10.929145812988281, "step": 636 }, { "epoch": 7.538461538461538, "grad_norm": 4.223323554636038, "learning_rate": 4.131162094546531e-09, "logits/chosen": -1.3048131465911865, "logits/rejected": -1.3768178224563599, "logps/chosen": -57.49602508544922, "logps/rejected": -68.12883758544922, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -4.1675872802734375, "rewards/margins": 7.353907585144043, "rewards/rejected": -11.521493911743164, "step": 637 }, { "epoch": 7.550295857988166, "grad_norm": 3.593337678884347, "learning_rate": 3.899074879163244e-09, "logits/chosen": -0.9969456195831299, "logits/rejected": -1.053671956062317, "logps/chosen": -44.34745788574219, "logps/rejected": -60.61125183105469, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": -1.0102906227111816, "rewards/margins": 7.186006546020508, "rewards/rejected": -8.196297645568848, "step": 638 }, { "epoch": 7.562130177514793, "grad_norm": 4.313259960588259, "learning_rate": 3.6736455770781104e-09, "logits/chosen": -1.0693228244781494, "logits/rejected": -1.02896249294281, "logps/chosen": -38.95392608642578, "logps/rejected": -63.7398796081543, "loss": 0.0165, "rewards/accuracies": 1.0, "rewards/chosen": -1.6491146087646484, "rewards/margins": 5.766778945922852, "rewards/rejected": -7.4158935546875, "step": 639 }, { "epoch": 7.57396449704142, "grad_norm": 3.606040906516179, "learning_rate": 3.4548802869627804e-09, "logits/chosen": -1.2556226253509521, "logits/rejected": -1.1845530271530151, "logps/chosen": -42.52489471435547, "logps/rejected": -69.6248550415039, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -3.0042614936828613, "rewards/margins": 8.077693939208984, "rewards/rejected": -11.081954956054688, "step": 640 }, { "epoch": 7.585798816568047, "grad_norm": 3.105559539498645, "learning_rate": 3.2427849272035067e-09, "logits/chosen": -0.9435504674911499, "logits/rejected": -0.9759422540664673, "logps/chosen": -40.1533203125, "logps/rejected": -66.1624526977539, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -1.300183653831482, "rewards/margins": 7.85737943649292, "rewards/rejected": -9.157563209533691, "step": 641 }, { "epoch": 7.597633136094674, "grad_norm": 3.558021048910989, "learning_rate": 3.037365235741024e-09, "logits/chosen": -1.1166036128997803, "logits/rejected": -1.038915753364563, "logps/chosen": -46.17367935180664, "logps/rejected": -77.80493927001953, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -2.5460665225982666, "rewards/margins": 9.366143226623535, "rewards/rejected": -11.912210464477539, "step": 642 }, { "epoch": 7.609467455621302, "grad_norm": 3.2180872359588033, "learning_rate": 2.8386267699152256e-09, "logits/chosen": -1.32170832157135, "logits/rejected": -1.1445355415344238, "logps/chosen": -33.306297302246094, "logps/rejected": -64.81905364990234, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -1.5187739133834839, "rewards/margins": 7.3708953857421875, "rewards/rejected": -8.889669418334961, "step": 643 }, { "epoch": 7.621301775147929, "grad_norm": 4.516310865033977, "learning_rate": 2.6465749063149245e-09, "logits/chosen": -1.3887925148010254, "logits/rejected": -1.5802987813949585, "logps/chosen": -41.122772216796875, "logps/rejected": -60.960174560546875, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": -0.43830376863479614, "rewards/margins": 8.353026390075684, "rewards/rejected": -8.79133129119873, "step": 644 }, { "epoch": 7.633136094674557, "grad_norm": 3.522662435537469, "learning_rate": 2.461214840632331e-09, "logits/chosen": -1.2272520065307617, "logits/rejected": -1.2148579359054565, "logps/chosen": -40.47775650024414, "logps/rejected": -66.96976470947266, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": -0.9865707159042358, "rewards/margins": 8.534340858459473, "rewards/rejected": -9.520912170410156, "step": 645 }, { "epoch": 7.644970414201183, "grad_norm": 5.08495729529155, "learning_rate": 2.282551587522441e-09, "logits/chosen": -1.2286673784255981, "logits/rejected": -1.187401294708252, "logps/chosen": -38.408416748046875, "logps/rejected": -66.8298110961914, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": -2.147416591644287, "rewards/margins": 8.060477256774902, "rewards/rejected": -10.207894325256348, "step": 646 }, { "epoch": 7.65680473372781, "grad_norm": 3.2465540149343206, "learning_rate": 2.1105899804675363e-09, "logits/chosen": -1.167961597442627, "logits/rejected": -1.1866670846939087, "logps/chosen": -45.849952697753906, "logps/rejected": -69.32218170166016, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -1.9047467708587646, "rewards/margins": 8.320653915405273, "rewards/rejected": -10.225400924682617, "step": 647 }, { "epoch": 7.668639053254438, "grad_norm": 3.0376394347131352, "learning_rate": 1.9453346716462316e-09, "logits/chosen": -1.45213782787323, "logits/rejected": -1.4141486883163452, "logps/chosen": -36.55447006225586, "logps/rejected": -63.271644592285156, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -0.9351499080657959, "rewards/margins": 8.255223274230957, "rewards/rejected": -9.190373420715332, "step": 648 }, { "epoch": 7.680473372781065, "grad_norm": 4.323080858658896, "learning_rate": 1.7867901318077695e-09, "logits/chosen": -1.2777527570724487, "logits/rejected": -1.1896119117736816, "logps/chosen": -50.66539764404297, "logps/rejected": -77.70809936523438, "loss": 0.0179, "rewards/accuracies": 1.0, "rewards/chosen": -3.193788766860962, "rewards/margins": 8.507574081420898, "rewards/rejected": -11.701362609863281, "step": 649 }, { "epoch": 7.6923076923076925, "grad_norm": 5.437515225024147, "learning_rate": 1.6349606501509794e-09, "logits/chosen": -1.4479446411132812, "logits/rejected": -1.4918639659881592, "logps/chosen": -37.68550109863281, "logps/rejected": -60.039100646972656, "loss": 0.0225, "rewards/accuracies": 1.0, "rewards/chosen": -1.733797550201416, "rewards/margins": 7.636280059814453, "rewards/rejected": -9.370077133178711, "step": 650 }, { "epoch": 7.704142011834319, "grad_norm": 4.271376085946802, "learning_rate": 1.489850334208259e-09, "logits/chosen": -1.4708189964294434, "logits/rejected": -1.3335071802139282, "logps/chosen": -36.9536247253418, "logps/rejected": -68.65011596679688, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": -1.618829369544983, "rewards/margins": 7.648519515991211, "rewards/rejected": -9.267349243164062, "step": 651 }, { "epoch": 7.715976331360947, "grad_norm": 4.326704771148402, "learning_rate": 1.351463109734441e-09, "logits/chosen": -1.3178719282150269, "logits/rejected": -1.2290101051330566, "logps/chosen": -39.71061325073242, "logps/rejected": -64.45901489257812, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": -1.014697790145874, "rewards/margins": 7.242136478424072, "rewards/rejected": -8.256834030151367, "step": 652 }, { "epoch": 7.727810650887574, "grad_norm": 3.7356892625464018, "learning_rate": 1.2198027206006822e-09, "logits/chosen": -1.1409059762954712, "logits/rejected": -1.035496711730957, "logps/chosen": -44.18361282348633, "logps/rejected": -72.97080993652344, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -1.7735768556594849, "rewards/margins": 8.308616638183594, "rewards/rejected": -10.082193374633789, "step": 653 }, { "epoch": 7.739644970414201, "grad_norm": 3.888887440172467, "learning_rate": 1.0948727286930192e-09, "logits/chosen": -1.0706766843795776, "logits/rejected": -1.0967326164245605, "logps/chosen": -52.38780212402344, "logps/rejected": -74.34490966796875, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": -2.6245856285095215, "rewards/margins": 8.349431037902832, "rewards/rejected": -10.974016189575195, "step": 654 }, { "epoch": 7.7514792899408285, "grad_norm": 4.910196712545756, "learning_rate": 9.766765138160827e-10, "logits/chosen": -1.3771216869354248, "logits/rejected": -1.3673535585403442, "logps/chosen": -26.542402267456055, "logps/rejected": -53.72931671142578, "loss": 0.0224, "rewards/accuracies": 1.0, "rewards/chosen": -0.7858462929725647, "rewards/margins": 8.258723258972168, "rewards/rejected": -9.044569969177246, "step": 655 }, { "epoch": 7.763313609467455, "grad_norm": 3.6303736143642094, "learning_rate": 8.652172736017816e-10, "logits/chosen": -1.2556291818618774, "logits/rejected": -1.3194972276687622, "logps/chosen": -45.683570861816406, "logps/rejected": -59.34172821044922, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": -1.3988208770751953, "rewards/margins": 7.259479522705078, "rewards/rejected": -8.658300399780273, "step": 656 }, { "epoch": 7.775147928994083, "grad_norm": 4.139059122264169, "learning_rate": 7.604980234225122e-10, "logits/chosen": -1.1913983821868896, "logits/rejected": -1.2692341804504395, "logps/chosen": -34.41987228393555, "logps/rejected": -62.04487609863281, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": -0.5674678087234497, "rewards/margins": 9.807378768920898, "rewards/rejected": -10.374847412109375, "step": 657 }, { "epoch": 7.78698224852071, "grad_norm": 3.618597489407487, "learning_rate": 6.625215963098896e-10, "logits/chosen": -1.019304871559143, "logits/rejected": -1.0009970664978027, "logps/chosen": -41.301448822021484, "logps/rejected": -56.97692108154297, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -1.1815185546875, "rewards/margins": 5.260707855224609, "rewards/rejected": -6.442225933074951, "step": 658 }, { "epoch": 7.798816568047338, "grad_norm": 3.336765309826322, "learning_rate": 5.712906428778919e-10, "logits/chosen": -1.4606183767318726, "logits/rejected": -1.3886492252349854, "logps/chosen": -27.533708572387695, "logps/rejected": -57.605125427246094, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -1.1746628284454346, "rewards/margins": 7.080971717834473, "rewards/rejected": -8.255634307861328, "step": 659 }, { "epoch": 7.810650887573964, "grad_norm": 3.5093648749750477, "learning_rate": 4.868076312512515e-10, "logits/chosen": -1.306321144104004, "logits/rejected": -1.302777886390686, "logps/chosen": -35.620086669921875, "logps/rejected": -84.12001037597656, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": -1.335205078125, "rewards/margins": 9.759173393249512, "rewards/rejected": -11.094379425048828, "step": 660 }, { "epoch": 7.822485207100591, "grad_norm": 4.227789623534498, "learning_rate": 4.090748469986471e-10, "logits/chosen": -1.2036335468292236, "logits/rejected": -1.2865405082702637, "logps/chosen": -35.319541931152344, "logps/rejected": -55.186180114746094, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": 0.08307170867919922, "rewards/margins": 6.8585710525512695, "rewards/rejected": -6.77549934387207, "step": 661 }, { "epoch": 7.834319526627219, "grad_norm": 3.6041816520598307, "learning_rate": 3.3809439307086463e-10, "logits/chosen": -1.1416850090026855, "logits/rejected": -1.2989619970321655, "logps/chosen": -46.875675201416016, "logps/rejected": -77.26806640625, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": -2.14378023147583, "rewards/margins": 8.901089668273926, "rewards/rejected": -11.044870376586914, "step": 662 }, { "epoch": 7.846153846153846, "grad_norm": 4.065861079364059, "learning_rate": 2.7386818974395323e-10, "logits/chosen": -1.4301998615264893, "logits/rejected": -1.3408135175704956, "logps/chosen": -44.26204299926758, "logps/rejected": -67.56184387207031, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": -2.9295194149017334, "rewards/margins": 6.951984882354736, "rewards/rejected": -9.88150405883789, "step": 663 }, { "epoch": 7.8579881656804735, "grad_norm": 3.4612690772571497, "learning_rate": 2.1639797456723952e-10, "logits/chosen": -1.22926664352417, "logits/rejected": -1.2551913261413574, "logps/chosen": -35.541709899902344, "logps/rejected": -57.685508728027344, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -0.7379496693611145, "rewards/margins": 7.399726867675781, "rewards/rejected": -8.137676239013672, "step": 664 }, { "epoch": 7.8698224852071, "grad_norm": 3.9801474006233795, "learning_rate": 1.6568530231628185e-10, "logits/chosen": -1.4825738668441772, "logits/rejected": -1.4905058145523071, "logps/chosen": -37.85133361816406, "logps/rejected": -63.87050247192383, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": -1.4308565855026245, "rewards/margins": 8.55210018157959, "rewards/rejected": -9.982955932617188, "step": 665 }, { "epoch": 7.881656804733728, "grad_norm": 4.104029377641207, "learning_rate": 1.21731544950876e-10, "logits/chosen": -1.226098656654358, "logits/rejected": -1.371010422706604, "logps/chosen": -48.73766326904297, "logps/rejected": -67.00294494628906, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/chosen": -2.1735711097717285, "rewards/margins": 7.302567481994629, "rewards/rejected": -9.476139068603516, "step": 666 }, { "epoch": 7.893491124260355, "grad_norm": 4.0448161239485145, "learning_rate": 8.453789157794599e-11, "logits/chosen": -1.1803443431854248, "logits/rejected": -1.2014808654785156, "logps/chosen": -38.218624114990234, "logps/rejected": -63.20378112792969, "loss": 0.0165, "rewards/accuracies": 1.0, "rewards/chosen": -0.791195273399353, "rewards/margins": 8.358329772949219, "rewards/rejected": -9.14952564239502, "step": 667 }, { "epoch": 7.905325443786982, "grad_norm": 4.422193721862127, "learning_rate": 5.4105348419264394e-11, "logits/chosen": -1.3956677913665771, "logits/rejected": -1.1959235668182373, "logps/chosen": -37.42644119262695, "logps/rejected": -67.5661392211914, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": -0.9582293629646301, "rewards/margins": 8.263663291931152, "rewards/rejected": -9.221893310546875, "step": 668 }, { "epoch": 7.9171597633136095, "grad_norm": 2.7605966616715523, "learning_rate": 3.043473878436287e-11, "logits/chosen": -1.191653847694397, "logits/rejected": -1.153673768043518, "logps/chosen": -44.5345573425293, "logps/rejected": -70.56991577148438, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -0.7923838496208191, "rewards/margins": 8.033748626708984, "rewards/rejected": -8.826131820678711, "step": 669 }, { "epoch": 7.928994082840236, "grad_norm": 3.739721658753547, "learning_rate": 1.3526703048216682e-11, "logits/chosen": -1.5901668071746826, "logits/rejected": -1.452099323272705, "logps/chosen": -38.17021942138672, "logps/rejected": -63.99696350097656, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": -1.6206248998641968, "rewards/margins": 6.980554580688477, "rewards/rejected": -8.601179122924805, "step": 670 }, { "epoch": 7.940828402366864, "grad_norm": 4.73168147418833, "learning_rate": 3.3816986338142117e-12, "logits/chosen": -1.312976598739624, "logits/rejected": -1.323945164680481, "logps/chosen": -33.24012756347656, "logps/rejected": -62.29106140136719, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": -1.5721168518066406, "rewards/margins": 8.698356628417969, "rewards/rejected": -10.27047348022461, "step": 671 }, { "epoch": 7.952662721893491, "grad_norm": 4.043153951778064, "learning_rate": 0.0, "logits/chosen": -1.2264971733093262, "logits/rejected": -1.2341418266296387, "logps/chosen": -45.756378173828125, "logps/rejected": -61.860496520996094, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -2.606259822845459, "rewards/margins": 6.51239013671875, "rewards/rejected": -9.118650436401367, "step": 672 }, { "epoch": 7.952662721893491, "step": 672, "total_flos": 0.0, "train_loss": 0.13648563410000256, "train_runtime": 6735.2566, "train_samples_per_second": 12.822, "train_steps_per_second": 0.1 } ], "logging_steps": 1, "max_steps": 672, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 300, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }