{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3821, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00026171159382360636, "grad_norm": 12.705608949082016, "learning_rate": 1.3054830287206266e-09, "logits/chosen": -2.5182626247406006, "logits/rejected": -2.2643015384674072, "logps/chosen": -388.1086120605469, "logps/rejected": -98.67791748046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0026171159382360636, "grad_norm": 11.140808468314031, "learning_rate": 1.3054830287206264e-08, "logits/chosen": -2.58882999420166, "logits/rejected": -2.5899062156677246, "logps/chosen": -243.39637756347656, "logps/rejected": -202.82931518554688, "loss": 0.6931, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": 0.019962893798947334, "rewards/margins": 0.03951311111450195, "rewards/rejected": -0.01955021731555462, "step": 10 }, { "epoch": 0.005234231876472127, "grad_norm": 11.144652062744898, "learning_rate": 2.610966057441253e-08, "logits/chosen": -2.672178030014038, "logits/rejected": -2.5746469497680664, "logps/chosen": -230.84091186523438, "logps/rejected": -228.9805145263672, "loss": 0.6932, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.009132862091064453, "rewards/margins": -0.06280727684497833, "rewards/rejected": 0.07194013893604279, "step": 20 }, { "epoch": 0.007851347814708191, "grad_norm": 12.445247991485594, "learning_rate": 3.91644908616188e-08, "logits/chosen": -2.6407968997955322, "logits/rejected": -2.627769947052002, "logps/chosen": -324.41583251953125, "logps/rejected": -290.3148193359375, "loss": 0.6932, "rewards/accuracies": 0.375, "rewards/chosen": -0.03206072002649307, "rewards/margins": -0.08101508766412735, "rewards/rejected": 0.04895436763763428, "step": 30 }, { "epoch": 0.010468463752944255, "grad_norm": 12.09719640540224, "learning_rate": 5.221932114882506e-08, "logits/chosen": -2.681063175201416, "logits/rejected": -2.5851457118988037, "logps/chosen": -293.2417907714844, "logps/rejected": -274.8295593261719, "loss": 0.6932, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.017620420083403587, "rewards/margins": -0.15502414107322693, "rewards/rejected": 0.1374037265777588, "step": 40 }, { "epoch": 0.01308557969118032, "grad_norm": 11.599145099560147, "learning_rate": 6.527415143603133e-08, "logits/chosen": -2.767548084259033, "logits/rejected": -2.6930577754974365, "logps/chosen": -265.1833190917969, "logps/rejected": -234.08816528320312, "loss": 0.6931, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.22780108451843262, "rewards/margins": 0.1385040581226349, "rewards/rejected": 0.08929703384637833, "step": 50 }, { "epoch": 0.015702695629416383, "grad_norm": 11.965007728798458, "learning_rate": 7.83289817232376e-08, "logits/chosen": -2.7711918354034424, "logits/rejected": -2.669393301010132, "logps/chosen": -297.89581298828125, "logps/rejected": -215.51119995117188, "loss": 0.6929, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.2271616905927658, "rewards/margins": 0.12766209244728088, "rewards/rejected": 0.09949960559606552, "step": 60 }, { "epoch": 0.018319811567652448, "grad_norm": 11.022881833624007, "learning_rate": 9.138381201044386e-08, "logits/chosen": -2.879361867904663, "logits/rejected": -2.7757599353790283, "logps/chosen": -353.2497253417969, "logps/rejected": -311.20623779296875, "loss": 0.6928, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.38858160376548767, "rewards/margins": 0.23797640204429626, "rewards/rejected": 0.1506052017211914, "step": 70 }, { "epoch": 0.02093692750588851, "grad_norm": 11.840625592848136, "learning_rate": 1.0443864229765012e-07, "logits/chosen": -2.6824872493743896, "logits/rejected": -2.6427717208862305, "logps/chosen": -264.01666259765625, "logps/rejected": -247.1999053955078, "loss": 0.6928, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.4467671811580658, "rewards/margins": 0.07854221016168594, "rewards/rejected": 0.36822497844696045, "step": 80 }, { "epoch": 0.023554043444124574, "grad_norm": 9.022222501604846, "learning_rate": 1.174934725848564e-07, "logits/chosen": -2.6744754314422607, "logits/rejected": -2.594621181488037, "logps/chosen": -293.5453796386719, "logps/rejected": -247.10824584960938, "loss": 0.6919, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5915378332138062, "rewards/margins": 0.33240383863449097, "rewards/rejected": 0.2591339647769928, "step": 90 }, { "epoch": 0.02617115938236064, "grad_norm": 9.493186007745985, "learning_rate": 1.3054830287206266e-07, "logits/chosen": -2.6995768547058105, "logits/rejected": -2.698713779449463, "logps/chosen": -251.64797973632812, "logps/rejected": -263.9851379394531, "loss": 0.6915, "rewards/accuracies": 0.625, "rewards/chosen": 1.2346092462539673, "rewards/margins": 0.4688534736633301, "rewards/rejected": 0.7657557725906372, "step": 100 }, { "epoch": 0.028788275320596704, "grad_norm": 10.644035033683487, "learning_rate": 1.4360313315926893e-07, "logits/chosen": -2.7012295722961426, "logits/rejected": -2.6298720836639404, "logps/chosen": -323.7200012207031, "logps/rejected": -251.99365234375, "loss": 0.6907, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.4460163116455078, "rewards/margins": 0.7907851934432983, "rewards/rejected": 0.6552310585975647, "step": 110 }, { "epoch": 0.031405391258832765, "grad_norm": 12.593823193229781, "learning_rate": 1.566579634464752e-07, "logits/chosen": -2.8149755001068115, "logits/rejected": -2.693021535873413, "logps/chosen": -321.4913635253906, "logps/rejected": -211.90219116210938, "loss": 0.6885, "rewards/accuracies": 0.625, "rewards/chosen": 1.6065527200698853, "rewards/margins": 2.005385160446167, "rewards/rejected": -0.3988325595855713, "step": 120 }, { "epoch": 0.03402250719706883, "grad_norm": 10.349524391749604, "learning_rate": 1.6971279373368143e-07, "logits/chosen": -2.6855309009552, "logits/rejected": -2.6297318935394287, "logps/chosen": -277.7900390625, "logps/rejected": -257.3739929199219, "loss": 0.6881, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 1.389769434928894, "rewards/margins": 1.9867759943008423, "rewards/rejected": -0.5970064401626587, "step": 130 }, { "epoch": 0.036639623135304895, "grad_norm": 9.759987095180128, "learning_rate": 1.8276762402088773e-07, "logits/chosen": -2.6445629596710205, "logits/rejected": -2.583130121231079, "logps/chosen": -292.6871337890625, "logps/rejected": -262.7529296875, "loss": 0.6889, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.12567937374115, "rewards/margins": 2.6970019340515137, "rewards/rejected": -1.5713226795196533, "step": 140 }, { "epoch": 0.03925673907354096, "grad_norm": 12.31960701088485, "learning_rate": 1.95822454308094e-07, "logits/chosen": -2.7052559852600098, "logits/rejected": -2.662947177886963, "logps/chosen": -236.67056274414062, "logps/rejected": -259.0706787109375, "loss": 0.6863, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.3762800097465515, "rewards/margins": 1.9162286520004272, "rewards/rejected": -1.5399487018585205, "step": 150 }, { "epoch": 0.04187385501177702, "grad_norm": 11.677562237104198, "learning_rate": 2.0887728459530023e-07, "logits/chosen": -2.7215421199798584, "logits/rejected": -2.596179962158203, "logps/chosen": -308.93212890625, "logps/rejected": -228.4113311767578, "loss": 0.6826, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.3878512382507324, "rewards/margins": 6.691712379455566, "rewards/rejected": -4.303861618041992, "step": 160 }, { "epoch": 0.04449097095001309, "grad_norm": 10.693351476475623, "learning_rate": 2.2193211488250652e-07, "logits/chosen": -2.7206406593322754, "logits/rejected": -2.7074098587036133, "logps/chosen": -292.83819580078125, "logps/rejected": -281.1519775390625, "loss": 0.6824, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.645533800125122, "rewards/margins": 2.851980209350586, "rewards/rejected": -4.497513771057129, "step": 170 }, { "epoch": 0.04710808688824915, "grad_norm": 11.109776793504, "learning_rate": 2.349869451697128e-07, "logits/chosen": -2.625349521636963, "logits/rejected": -2.555849552154541, "logps/chosen": -295.85986328125, "logps/rejected": -283.39111328125, "loss": 0.6795, "rewards/accuracies": 0.625, "rewards/chosen": -4.437504768371582, "rewards/margins": 6.071015357971191, "rewards/rejected": -10.508520126342773, "step": 180 }, { "epoch": 0.04972520282648522, "grad_norm": 11.94197957738407, "learning_rate": 2.4804177545691903e-07, "logits/chosen": -2.57228684425354, "logits/rejected": -2.4936299324035645, "logps/chosen": -262.52410888671875, "logps/rejected": -211.85018920898438, "loss": 0.6775, "rewards/accuracies": 0.625, "rewards/chosen": -3.077005386352539, "rewards/margins": 6.298064231872559, "rewards/rejected": -9.375069618225098, "step": 190 }, { "epoch": 0.05234231876472128, "grad_norm": 12.061365139178056, "learning_rate": 2.610966057441253e-07, "logits/chosen": -2.567936658859253, "logits/rejected": -2.4781880378723145, "logps/chosen": -234.6671142578125, "logps/rejected": -221.07540893554688, "loss": 0.677, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -7.456887245178223, "rewards/margins": 0.09982886165380478, "rewards/rejected": -7.556715965270996, "step": 200 }, { "epoch": 0.05495943470295734, "grad_norm": 16.567960601555676, "learning_rate": 2.7415143603133156e-07, "logits/chosen": -2.749500036239624, "logits/rejected": -2.6309502124786377, "logps/chosen": -242.54910278320312, "logps/rejected": -182.82608032226562, "loss": 0.6795, "rewards/accuracies": 0.625, "rewards/chosen": -11.374174118041992, "rewards/margins": 6.828550815582275, "rewards/rejected": -18.20272445678711, "step": 210 }, { "epoch": 0.05757655064119341, "grad_norm": 19.24294269180777, "learning_rate": 2.8720626631853785e-07, "logits/chosen": -2.706005811691284, "logits/rejected": -2.698056936264038, "logps/chosen": -269.6457824707031, "logps/rejected": -219.22647094726562, "loss": 0.6726, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -11.772501945495605, "rewards/margins": 10.55135726928711, "rewards/rejected": -22.32386016845703, "step": 220 }, { "epoch": 0.06019366657942947, "grad_norm": 16.374157225946007, "learning_rate": 3.002610966057441e-07, "logits/chosen": -2.663548707962036, "logits/rejected": -2.6067066192626953, "logps/chosen": -344.29693603515625, "logps/rejected": -307.93218994140625, "loss": 0.6696, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5850511789321899, "rewards/margins": 10.634298324584961, "rewards/rejected": -10.049245834350586, "step": 230 }, { "epoch": 0.06281078251766553, "grad_norm": 16.386410614457457, "learning_rate": 3.133159268929504e-07, "logits/chosen": -2.657069683074951, "logits/rejected": -2.559688091278076, "logps/chosen": -230.8714599609375, "logps/rejected": -256.7811279296875, "loss": 0.6718, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 1.7591073513031006, "rewards/margins": 12.968846321105957, "rewards/rejected": -11.209738731384277, "step": 240 }, { "epoch": 0.06542789845590159, "grad_norm": 24.693819068780147, "learning_rate": 3.263707571801567e-07, "logits/chosen": -2.754833698272705, "logits/rejected": -2.6114320755004883, "logps/chosen": -286.67083740234375, "logps/rejected": -250.2057647705078, "loss": 0.6632, "rewards/accuracies": 0.625, "rewards/chosen": -10.266336441040039, "rewards/margins": 13.779983520507812, "rewards/rejected": -24.046321868896484, "step": 250 }, { "epoch": 0.06804501439413765, "grad_norm": 13.195073062662031, "learning_rate": 3.3942558746736286e-07, "logits/chosen": -2.57175350189209, "logits/rejected": -2.367932081222534, "logps/chosen": -289.5040588378906, "logps/rejected": -241.59048461914062, "loss": 0.6654, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -20.12925148010254, "rewards/margins": 20.86807632446289, "rewards/rejected": -40.9973258972168, "step": 260 }, { "epoch": 0.07066213033237373, "grad_norm": 22.100525575015013, "learning_rate": 3.5248041775456916e-07, "logits/chosen": -2.6396119594573975, "logits/rejected": -2.627289295196533, "logps/chosen": -342.06005859375, "logps/rejected": -355.39569091796875, "loss": 0.6569, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.9144287109375, "rewards/margins": 18.81797981262207, "rewards/rejected": -46.7324104309082, "step": 270 }, { "epoch": 0.07327924627060979, "grad_norm": 14.329830569747937, "learning_rate": 3.6553524804177545e-07, "logits/chosen": -2.6874194145202637, "logits/rejected": -2.605147361755371, "logps/chosen": -261.7181091308594, "logps/rejected": -301.5287780761719, "loss": 0.6654, "rewards/accuracies": 0.625, "rewards/chosen": -1.3762327432632446, "rewards/margins": 13.867558479309082, "rewards/rejected": -15.243791580200195, "step": 280 }, { "epoch": 0.07589636220884585, "grad_norm": 11.577264424915722, "learning_rate": 3.785900783289817e-07, "logits/chosen": -2.5518436431884766, "logits/rejected": -2.5594162940979004, "logps/chosen": -322.40875244140625, "logps/rejected": -293.8119201660156, "loss": 0.6646, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 5.861729621887207, "rewards/margins": 15.61244010925293, "rewards/rejected": -9.750709533691406, "step": 290 }, { "epoch": 0.07851347814708191, "grad_norm": 12.227671537308453, "learning_rate": 3.91644908616188e-07, "logits/chosen": -2.728774309158325, "logits/rejected": -2.69193696975708, "logps/chosen": -234.78945922851562, "logps/rejected": -249.3471221923828, "loss": 0.6746, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.2874860763549805, "rewards/margins": 10.886515617370605, "rewards/rejected": -13.174001693725586, "step": 300 }, { "epoch": 0.08113059408531798, "grad_norm": 10.865740609371638, "learning_rate": 4.046997389033943e-07, "logits/chosen": -2.755521297454834, "logits/rejected": -2.7999794483184814, "logps/chosen": -264.28765869140625, "logps/rejected": -330.6804504394531, "loss": 0.6515, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -9.909737586975098, "rewards/margins": 20.648042678833008, "rewards/rejected": -30.55777931213379, "step": 310 }, { "epoch": 0.08374771002355404, "grad_norm": 17.405613711098255, "learning_rate": 4.1775456919060046e-07, "logits/chosen": -2.6872105598449707, "logits/rejected": -2.545166015625, "logps/chosen": -249.74252319335938, "logps/rejected": -246.7162628173828, "loss": 0.6584, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -11.012734413146973, "rewards/margins": 24.713336944580078, "rewards/rejected": -35.72606658935547, "step": 320 }, { "epoch": 0.08636482596179011, "grad_norm": 16.876842691559403, "learning_rate": 4.3080939947780675e-07, "logits/chosen": -2.8166439533233643, "logits/rejected": -2.660918712615967, "logps/chosen": -333.4720153808594, "logps/rejected": -322.7220153808594, "loss": 0.6438, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -21.886640548706055, "rewards/margins": 36.4941520690918, "rewards/rejected": -58.38079833984375, "step": 330 }, { "epoch": 0.08898194190002617, "grad_norm": 20.055916219424066, "learning_rate": 4.4386422976501305e-07, "logits/chosen": -2.64066219329834, "logits/rejected": -2.6652839183807373, "logps/chosen": -297.4921875, "logps/rejected": -326.902587890625, "loss": 0.6477, "rewards/accuracies": 0.75, "rewards/chosen": -30.227951049804688, "rewards/margins": 36.16551971435547, "rewards/rejected": -66.39347839355469, "step": 340 }, { "epoch": 0.09159905783826224, "grad_norm": 18.03084139024805, "learning_rate": 4.569190600522193e-07, "logits/chosen": -2.5461313724517822, "logits/rejected": -2.3993871212005615, "logps/chosen": -333.36273193359375, "logps/rejected": -314.22796630859375, "loss": 0.661, "rewards/accuracies": 0.75, "rewards/chosen": -50.57813262939453, "rewards/margins": 34.174285888671875, "rewards/rejected": -84.7524185180664, "step": 350 }, { "epoch": 0.0942161737764983, "grad_norm": 21.82315344421899, "learning_rate": 4.699738903394256e-07, "logits/chosen": -2.620579242706299, "logits/rejected": -2.389979839324951, "logps/chosen": -332.0915832519531, "logps/rejected": -292.027099609375, "loss": 0.6535, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -47.5004997253418, "rewards/margins": 15.382966995239258, "rewards/rejected": -62.88347244262695, "step": 360 }, { "epoch": 0.09683328971473436, "grad_norm": 15.226791043551732, "learning_rate": 4.830287206266319e-07, "logits/chosen": -2.47334623336792, "logits/rejected": -2.339829921722412, "logps/chosen": -314.9557189941406, "logps/rejected": -340.21624755859375, "loss": 0.6471, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.457332611083984, "rewards/margins": 20.158397674560547, "rewards/rejected": -44.61573028564453, "step": 370 }, { "epoch": 0.09945040565297043, "grad_norm": 23.15807260624967, "learning_rate": 4.960835509138381e-07, "logits/chosen": -2.4241931438446045, "logits/rejected": -2.4154956340789795, "logps/chosen": -252.0162811279297, "logps/rejected": -317.27960205078125, "loss": 0.6577, "rewards/accuracies": 0.625, "rewards/chosen": -25.363056182861328, "rewards/margins": 27.764968872070312, "rewards/rejected": -53.128028869628906, "step": 380 }, { "epoch": 0.1020675215912065, "grad_norm": 14.367634528764981, "learning_rate": 4.999948856244767e-07, "logits/chosen": -2.4574198722839355, "logits/rejected": -2.225590705871582, "logps/chosen": -278.4183349609375, "logps/rejected": -297.3124084472656, "loss": 0.6508, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -23.85447120666504, "rewards/margins": 30.555782318115234, "rewards/rejected": -54.410255432128906, "step": 390 }, { "epoch": 0.10468463752944256, "grad_norm": 18.410363856260208, "learning_rate": 4.999698361256577e-07, "logits/chosen": -2.3174118995666504, "logits/rejected": -2.241575002670288, "logps/chosen": -293.21435546875, "logps/rejected": -377.48651123046875, "loss": 0.6646, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -21.861358642578125, "rewards/margins": 39.63842010498047, "rewards/rejected": -61.499778747558594, "step": 400 }, { "epoch": 0.10730175346767862, "grad_norm": 21.26097793554471, "learning_rate": 4.99923914217458e-07, "logits/chosen": -2.6152307987213135, "logits/rejected": -2.2710635662078857, "logps/chosen": -290.6064453125, "logps/rejected": -356.77655029296875, "loss": 0.6567, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -33.422264099121094, "rewards/margins": 62.78270721435547, "rewards/rejected": -96.20496368408203, "step": 410 }, { "epoch": 0.10991886940591468, "grad_norm": 23.417576747738657, "learning_rate": 4.99857123734344e-07, "logits/chosen": -2.6153998374938965, "logits/rejected": -2.670661449432373, "logps/chosen": -321.48553466796875, "logps/rejected": -420.5201110839844, "loss": 0.6529, "rewards/accuracies": 0.625, "rewards/chosen": -60.42144012451172, "rewards/margins": 39.77070236206055, "rewards/rejected": -100.1921615600586, "step": 420 }, { "epoch": 0.11253598534415074, "grad_norm": 32.63068698649212, "learning_rate": 4.997694702533016e-07, "logits/chosen": -2.52959942817688, "logits/rejected": -2.5021653175354004, "logps/chosen": -405.32867431640625, "logps/rejected": -397.52008056640625, "loss": 0.6494, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -50.44035720825195, "rewards/margins": 55.264503479003906, "rewards/rejected": -105.7048568725586, "step": 430 }, { "epoch": 0.11515310128238682, "grad_norm": 30.402667110459582, "learning_rate": 4.996609610933712e-07, "logits/chosen": -1.9839417934417725, "logits/rejected": -1.6886985301971436, "logps/chosen": -308.8043518066406, "logps/rejected": -316.9330749511719, "loss": 0.6746, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -92.59898376464844, "rewards/margins": 24.77956771850586, "rewards/rejected": -117.3785400390625, "step": 440 }, { "epoch": 0.11777021722062288, "grad_norm": 58.30863783887881, "learning_rate": 4.995316053150366e-07, "logits/chosen": -1.65292489528656, "logits/rejected": -1.3604259490966797, "logps/chosen": -400.2745361328125, "logps/rejected": -401.6268310546875, "loss": 0.6555, "rewards/accuracies": 0.75, "rewards/chosen": -98.50657653808594, "rewards/margins": 38.124855041503906, "rewards/rejected": -136.6314239501953, "step": 450 }, { "epoch": 0.12038733315885894, "grad_norm": 30.609564022713695, "learning_rate": 4.99381413719468e-07, "logits/chosen": -2.0271124839782715, "logits/rejected": -1.5985970497131348, "logps/chosen": -309.2818603515625, "logps/rejected": -336.6201171875, "loss": 0.652, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -56.03574752807617, "rewards/margins": 49.059043884277344, "rewards/rejected": -105.09478759765625, "step": 460 }, { "epoch": 0.123004449097095, "grad_norm": 25.115747814836954, "learning_rate": 4.992103988476205e-07, "logits/chosen": -1.7523378133773804, "logits/rejected": -1.3779999017715454, "logps/chosen": -346.60150146484375, "logps/rejected": -356.7440490722656, "loss": 0.6504, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -83.8178939819336, "rewards/margins": 50.205230712890625, "rewards/rejected": -134.0231170654297, "step": 470 }, { "epoch": 0.12562156503533106, "grad_norm": 30.622381148232613, "learning_rate": 4.990185749791864e-07, "logits/chosen": -1.7865359783172607, "logits/rejected": -1.0789787769317627, "logps/chosen": -356.77484130859375, "logps/rejected": -287.66925048828125, "loss": 0.6493, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -70.98980712890625, "rewards/margins": 35.558433532714844, "rewards/rejected": -106.5482406616211, "step": 480 }, { "epoch": 0.12823868097356714, "grad_norm": 24.535072604494577, "learning_rate": 4.988059581314039e-07, "logits/chosen": -1.7096989154815674, "logits/rejected": -1.8498868942260742, "logps/chosen": -335.67730712890625, "logps/rejected": -398.1258850097656, "loss": 0.6407, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -55.123626708984375, "rewards/margins": 46.74440383911133, "rewards/rejected": -101.86802673339844, "step": 490 }, { "epoch": 0.13085579691180318, "grad_norm": 91.50022744374398, "learning_rate": 4.985725660577184e-07, "logits/chosen": -1.8485355377197266, "logits/rejected": -1.5357762575149536, "logps/chosen": -317.83135986328125, "logps/rejected": -341.1956481933594, "loss": 0.6446, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -70.9972152709961, "rewards/margins": 18.12459373474121, "rewards/rejected": -89.1218032836914, "step": 500 }, { "epoch": 0.13347291285003926, "grad_norm": 18.189747979700574, "learning_rate": 4.983184182463008e-07, "logits/chosen": -2.1394052505493164, "logits/rejected": -2.0640697479248047, "logps/chosen": -420.4029846191406, "logps/rejected": -425.334228515625, "loss": 0.6465, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -73.95565032958984, "rewards/margins": 44.018211364746094, "rewards/rejected": -117.97386169433594, "step": 510 }, { "epoch": 0.1360900287882753, "grad_norm": 43.16282028958125, "learning_rate": 4.980435359184203e-07, "logits/chosen": -2.4100048542022705, "logits/rejected": -2.3749964237213135, "logps/chosen": -322.7026062011719, "logps/rejected": -347.69366455078125, "loss": 0.6603, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -54.79851531982422, "rewards/margins": 15.417330741882324, "rewards/rejected": -70.21583557128906, "step": 520 }, { "epoch": 0.13870714472651138, "grad_norm": 29.05298017877607, "learning_rate": 4.977479420266723e-07, "logits/chosen": -2.0409185886383057, "logits/rejected": -1.8864901065826416, "logps/chosen": -267.41387939453125, "logps/rejected": -322.8434143066406, "loss": 0.649, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -43.029293060302734, "rewards/margins": 38.46556854248047, "rewards/rejected": -81.49485778808594, "step": 530 }, { "epoch": 0.14132426066474746, "grad_norm": 31.793699019536305, "learning_rate": 4.974316612530614e-07, "logits/chosen": -1.8414909839630127, "logits/rejected": -1.3766376972198486, "logps/chosen": -335.9708557128906, "logps/rejected": -364.6490478515625, "loss": 0.6363, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -55.383262634277344, "rewards/margins": 62.95615768432617, "rewards/rejected": -118.33943176269531, "step": 540 }, { "epoch": 0.1439413766029835, "grad_norm": 44.57978704229952, "learning_rate": 4.970947200069415e-07, "logits/chosen": -1.5717941522598267, "logits/rejected": -0.947163462638855, "logps/chosen": -305.5942687988281, "logps/rejected": -333.8151550292969, "loss": 0.6415, "rewards/accuracies": 0.875, "rewards/chosen": -57.449851989746094, "rewards/margins": 67.77587890625, "rewards/rejected": -125.2257308959961, "step": 550 }, { "epoch": 0.14655849254121958, "grad_norm": 20.95587939868339, "learning_rate": 4.967371464228095e-07, "logits/chosen": -1.2510230541229248, "logits/rejected": -0.9176260232925415, "logps/chosen": -392.3152160644531, "logps/rejected": -415.76568603515625, "loss": 0.6309, "rewards/accuracies": 0.75, "rewards/chosen": -86.01702880859375, "rewards/margins": 61.00504684448242, "rewards/rejected": -147.02207946777344, "step": 560 }, { "epoch": 0.14917560847945563, "grad_norm": 21.753739466816693, "learning_rate": 4.963589703579569e-07, "logits/chosen": -1.6249122619628906, "logits/rejected": -1.1825764179229736, "logps/chosen": -324.12762451171875, "logps/rejected": -394.82171630859375, "loss": 0.6297, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -60.056861877441406, "rewards/margins": 62.51560592651367, "rewards/rejected": -122.57247161865234, "step": 570 }, { "epoch": 0.1517927244176917, "grad_norm": 29.488789126329046, "learning_rate": 4.959602233899761e-07, "logits/chosen": -1.5013480186462402, "logits/rejected": -0.5311890840530396, "logps/chosen": -343.14666748046875, "logps/rejected": -399.1929016113281, "loss": 0.6371, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -66.63851928710938, "rewards/margins": 59.16364669799805, "rewards/rejected": -125.80216217041016, "step": 580 }, { "epoch": 0.15440984035592778, "grad_norm": 19.7981594005066, "learning_rate": 4.955409388141243e-07, "logits/chosen": -1.3217118978500366, "logits/rejected": -0.3744869530200958, "logps/chosen": -431.08074951171875, "logps/rejected": -447.07708740234375, "loss": 0.6474, "rewards/accuracies": 0.75, "rewards/chosen": -82.43335723876953, "rewards/margins": 65.14683532714844, "rewards/rejected": -147.58018493652344, "step": 590 }, { "epoch": 0.15702695629416383, "grad_norm": 36.104735427867645, "learning_rate": 4.951011516405429e-07, "logits/chosen": -2.091003894805908, "logits/rejected": -1.4518930912017822, "logps/chosen": -366.91790771484375, "logps/rejected": -385.13629150390625, "loss": 0.6269, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -68.24613952636719, "rewards/margins": 54.08154296875, "rewards/rejected": -122.32768249511719, "step": 600 }, { "epoch": 0.1596440722323999, "grad_norm": 26.905150575209632, "learning_rate": 4.946408985913344e-07, "logits/chosen": -1.9128236770629883, "logits/rejected": -1.008561372756958, "logps/chosen": -354.6834716796875, "logps/rejected": -329.9129638671875, "loss": 0.6369, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -48.14597702026367, "rewards/margins": 71.0589370727539, "rewards/rejected": -119.20491027832031, "step": 610 }, { "epoch": 0.16226118817063595, "grad_norm": 34.58032776342021, "learning_rate": 4.941602180974958e-07, "logits/chosen": -1.3618040084838867, "logits/rejected": -0.6716572642326355, "logps/chosen": -314.2044982910156, "logps/rejected": -350.322021484375, "loss": 0.6578, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -67.95203399658203, "rewards/margins": 61.99140548706055, "rewards/rejected": -129.9434356689453, "step": 620 }, { "epoch": 0.16487830410887203, "grad_norm": 50.70972774376307, "learning_rate": 4.936591502957101e-07, "logits/chosen": -1.437552809715271, "logits/rejected": -1.3617020845413208, "logps/chosen": -226.73690795898438, "logps/rejected": -290.77044677734375, "loss": 0.6448, "rewards/accuracies": 0.625, "rewards/chosen": -49.09408950805664, "rewards/margins": 37.69247817993164, "rewards/rejected": -86.78656768798828, "step": 630 }, { "epoch": 0.16749542004710807, "grad_norm": 15.357094809679237, "learning_rate": 4.931377370249945e-07, "logits/chosen": -1.46652090549469, "logits/rejected": -0.865250289440155, "logps/chosen": -293.2486572265625, "logps/rejected": -327.9940490722656, "loss": 0.6425, "rewards/accuracies": 0.75, "rewards/chosen": -45.150516510009766, "rewards/margins": 41.07079315185547, "rewards/rejected": -86.2213134765625, "step": 640 }, { "epoch": 0.17011253598534415, "grad_norm": 24.168035949449166, "learning_rate": 4.925960218232072e-07, "logits/chosen": -0.6024962663650513, "logits/rejected": -0.08184865862131119, "logps/chosen": -361.3127746582031, "logps/rejected": -382.85845947265625, "loss": 0.6723, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -85.62727355957031, "rewards/margins": 49.480709075927734, "rewards/rejected": -135.10800170898438, "step": 650 }, { "epoch": 0.17272965192358022, "grad_norm": 27.513683012496575, "learning_rate": 4.920340499234116e-07, "logits/chosen": -0.9328662157058716, "logits/rejected": -0.03643731027841568, "logps/chosen": -390.78521728515625, "logps/rejected": -394.0958557128906, "loss": 0.6255, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -104.32292175292969, "rewards/margins": 56.33015823364258, "rewards/rejected": -160.653076171875, "step": 660 }, { "epoch": 0.17534676786181627, "grad_norm": 21.525597755133532, "learning_rate": 4.914518682500995e-07, "logits/chosen": -0.8200181722640991, "logits/rejected": -0.7997004985809326, "logps/chosen": -367.11407470703125, "logps/rejected": -425.20501708984375, "loss": 0.6432, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -83.42225646972656, "rewards/margins": 41.75618362426758, "rewards/rejected": -125.17845153808594, "step": 670 }, { "epoch": 0.17796388380005235, "grad_norm": 23.587490414758562, "learning_rate": 4.90849525415273e-07, "logits/chosen": -1.2536793947219849, "logits/rejected": -1.5548416376113892, "logps/chosen": -346.5150146484375, "logps/rejected": -420.87451171875, "loss": 0.6435, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -84.81087493896484, "rewards/margins": 32.74394989013672, "rewards/rejected": -117.5548324584961, "step": 680 }, { "epoch": 0.1805809997382884, "grad_norm": 16.140163273606795, "learning_rate": 4.902270717143858e-07, "logits/chosen": -1.2677083015441895, "logits/rejected": -0.6290455460548401, "logps/chosen": -392.08148193359375, "logps/rejected": -412.1546325683594, "loss": 0.6271, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -99.23873901367188, "rewards/margins": 40.86092758178711, "rewards/rejected": -140.0996856689453, "step": 690 }, { "epoch": 0.18319811567652447, "grad_norm": 17.066483148041247, "learning_rate": 4.895845591221426e-07, "logits/chosen": -1.3644059896469116, "logits/rejected": -0.4180236756801605, "logps/chosen": -331.05426025390625, "logps/rejected": -343.99755859375, "loss": 0.6204, "rewards/accuracies": 0.75, "rewards/chosen": -71.15898895263672, "rewards/margins": 59.32234573364258, "rewards/rejected": -130.48133850097656, "step": 700 }, { "epoch": 0.18581523161476055, "grad_norm": 22.437469152186377, "learning_rate": 4.8892204128816e-07, "logits/chosen": -1.5181987285614014, "logits/rejected": -0.2633799910545349, "logps/chosen": -363.61962890625, "logps/rejected": -313.43658447265625, "loss": 0.656, "rewards/accuracies": 0.625, "rewards/chosen": -85.1877670288086, "rewards/margins": 33.58545684814453, "rewards/rejected": -118.7732162475586, "step": 710 }, { "epoch": 0.1884323475529966, "grad_norm": 16.0905054896879, "learning_rate": 4.882395735324863e-07, "logits/chosen": -1.1565659046173096, "logits/rejected": -0.8029291033744812, "logps/chosen": -322.3810729980469, "logps/rejected": -373.6277770996094, "loss": 0.6278, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -55.12693405151367, "rewards/margins": 71.59522247314453, "rewards/rejected": -126.7221450805664, "step": 720 }, { "epoch": 0.19104946349123267, "grad_norm": 18.24845612413702, "learning_rate": 4.875372128409829e-07, "logits/chosen": -0.7359566688537598, "logits/rejected": -0.479675829410553, "logps/chosen": -234.16329956054688, "logps/rejected": -360.5873107910156, "loss": 0.6349, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -82.40586853027344, "rewards/margins": 79.42875671386719, "rewards/rejected": -161.8346405029297, "step": 730 }, { "epoch": 0.19366657942946872, "grad_norm": 31.57360611112559, "learning_rate": 4.868150178605653e-07, "logits/chosen": -1.8614606857299805, "logits/rejected": -0.5545592904090881, "logps/chosen": -468.30242919921875, "logps/rejected": -470.0863342285156, "loss": 0.6641, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -112.69195556640625, "rewards/margins": 80.2920150756836, "rewards/rejected": -192.9839630126953, "step": 740 }, { "epoch": 0.1962836953677048, "grad_norm": 44.06651767708028, "learning_rate": 4.860730488943068e-07, "logits/chosen": -0.7021804451942444, "logits/rejected": -0.19743548333644867, "logps/chosen": -330.96148681640625, "logps/rejected": -390.07098388671875, "loss": 0.6437, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -126.3841781616211, "rewards/margins": 59.72180938720703, "rewards/rejected": -186.10597229003906, "step": 750 }, { "epoch": 0.19890081130594087, "grad_norm": 24.455997791623204, "learning_rate": 4.853113678964021e-07, "logits/chosen": -0.9393317103385925, "logits/rejected": -0.32588958740234375, "logps/chosen": -487.88897705078125, "logps/rejected": -475.53875732421875, "loss": 0.6242, "rewards/accuracies": 0.75, "rewards/chosen": -111.6043701171875, "rewards/margins": 43.061561584472656, "rewards/rejected": -154.6659393310547, "step": 760 }, { "epoch": 0.20151792724417691, "grad_norm": 29.559259631889166, "learning_rate": 4.845300384669957e-07, "logits/chosen": -0.16026926040649414, "logits/rejected": 0.10852203518152237, "logps/chosen": -359.624267578125, "logps/rejected": -412.13970947265625, "loss": 0.6867, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -108.4910659790039, "rewards/margins": 43.94545364379883, "rewards/rejected": -152.4365234375, "step": 770 }, { "epoch": 0.204135043182413, "grad_norm": 38.08984952973831, "learning_rate": 4.8372912584687e-07, "logits/chosen": -1.0849199295043945, "logits/rejected": -0.6502343416213989, "logps/chosen": -363.4341735839844, "logps/rejected": -442.24053955078125, "loss": 0.6453, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -82.14627838134766, "rewards/margins": 59.62005615234375, "rewards/rejected": -141.76632690429688, "step": 780 }, { "epoch": 0.20675215912064904, "grad_norm": 17.40406336144934, "learning_rate": 4.829086969119983e-07, "logits/chosen": -0.8264859914779663, "logits/rejected": -0.45223379135131836, "logps/chosen": -364.9395446777344, "logps/rejected": -411.8614196777344, "loss": 0.6368, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -82.84831237792969, "rewards/margins": 35.919227600097656, "rewards/rejected": -118.76753997802734, "step": 790 }, { "epoch": 0.2093692750588851, "grad_norm": 24.74636016899087, "learning_rate": 4.820688201679605e-07, "logits/chosen": -0.6875916719436646, "logits/rejected": -0.07015343010425568, "logps/chosen": -395.1434326171875, "logps/rejected": -461.1236877441406, "loss": 0.6157, "rewards/accuracies": 0.75, "rewards/chosen": -94.22093200683594, "rewards/margins": 64.74181365966797, "rewards/rejected": -158.96275329589844, "step": 800 }, { "epoch": 0.21198639099712116, "grad_norm": 33.221118390849895, "learning_rate": 4.812095657442231e-07, "logits/chosen": -0.19572052359580994, "logits/rejected": 1.188407063484192, "logps/chosen": -333.38201904296875, "logps/rejected": -417.3399353027344, "loss": 0.6145, "rewards/accuracies": 0.875, "rewards/chosen": -74.15843200683594, "rewards/margins": 107.42332458496094, "rewards/rejected": -181.58175659179688, "step": 810 }, { "epoch": 0.21460350693535724, "grad_norm": 30.78718682507575, "learning_rate": 4.803310053882831e-07, "logits/chosen": 1.3916046619415283, "logits/rejected": 1.80306077003479, "logps/chosen": -333.6537780761719, "logps/rejected": -432.14404296875, "loss": 0.6247, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -104.1457290649414, "rewards/margins": 93.13396453857422, "rewards/rejected": -197.27969360351562, "step": 820 }, { "epoch": 0.2172206228735933, "grad_norm": 32.330687937711005, "learning_rate": 4.794332124596775e-07, "logits/chosen": -0.8797634243965149, "logits/rejected": 0.2293863296508789, "logps/chosen": -385.99273681640625, "logps/rejected": -422.72760009765625, "loss": 0.6722, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -76.32684326171875, "rewards/margins": 44.87958526611328, "rewards/rejected": -121.20643615722656, "step": 830 }, { "epoch": 0.21983773881182936, "grad_norm": 18.735973314129254, "learning_rate": 4.785162619238574e-07, "logits/chosen": -1.5916681289672852, "logits/rejected": 0.12047050148248672, "logps/chosen": -320.60382080078125, "logps/rejected": -326.4573974609375, "loss": 0.6174, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -49.556217193603516, "rewards/margins": 59.73494338989258, "rewards/rejected": -109.2911605834961, "step": 840 }, { "epoch": 0.22245485475006543, "grad_norm": 22.699909493169436, "learning_rate": 4.775802303459287e-07, "logits/chosen": -0.5288606286048889, "logits/rejected": 0.5344291925430298, "logps/chosen": -411.8746643066406, "logps/rejected": -471.15692138671875, "loss": 0.6233, "rewards/accuracies": 0.75, "rewards/chosen": -102.88179779052734, "rewards/margins": 61.04967498779297, "rewards/rejected": -163.9314727783203, "step": 850 }, { "epoch": 0.22507197068830148, "grad_norm": 34.24159539521559, "learning_rate": 4.766251958842589e-07, "logits/chosen": 0.23371069133281708, "logits/rejected": 0.967072606086731, "logps/chosen": -335.1881103515625, "logps/rejected": -376.53192138671875, "loss": 0.632, "rewards/accuracies": 0.625, "rewards/chosen": -99.16200256347656, "rewards/margins": 47.29999542236328, "rewards/rejected": -146.46200561523438, "step": 860 }, { "epoch": 0.22768908662653756, "grad_norm": 25.25556764754258, "learning_rate": 4.756512382839506e-07, "logits/chosen": -0.5754461884498596, "logits/rejected": 0.47906550765037537, "logps/chosen": -382.8406677246094, "logps/rejected": -452.028564453125, "loss": 0.6162, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -93.5912857055664, "rewards/margins": 62.332862854003906, "rewards/rejected": -155.9241485595703, "step": 870 }, { "epoch": 0.23030620256477363, "grad_norm": 37.027786250232936, "learning_rate": 4.746584388701831e-07, "logits/chosen": -0.11350803077220917, "logits/rejected": 0.7307132482528687, "logps/chosen": -340.4446716308594, "logps/rejected": -351.2494201660156, "loss": 0.6281, "rewards/accuracies": 0.75, "rewards/chosen": -87.96061706542969, "rewards/margins": 54.02471923828125, "rewards/rejected": -141.9853515625, "step": 880 }, { "epoch": 0.23292331850300968, "grad_norm": 20.793289603998193, "learning_rate": 4.736468805414218e-07, "logits/chosen": -0.33384501934051514, "logits/rejected": 0.36782529950141907, "logps/chosen": -350.48291015625, "logps/rejected": -425.75006103515625, "loss": 0.6383, "rewards/accuracies": 0.75, "rewards/chosen": -86.30848693847656, "rewards/margins": 63.4256706237793, "rewards/rejected": -149.73416137695312, "step": 890 }, { "epoch": 0.23554043444124576, "grad_norm": 29.395597470849673, "learning_rate": 4.7261664776249595e-07, "logits/chosen": -0.3997666537761688, "logits/rejected": -0.10973574221134186, "logps/chosen": -346.1295471191406, "logps/rejected": -398.69842529296875, "loss": 0.6225, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -99.20040893554688, "rewards/margins": 40.127742767333984, "rewards/rejected": -139.32815551757812, "step": 900 }, { "epoch": 0.2381575503794818, "grad_norm": 49.28741036089969, "learning_rate": 4.7156782655754624e-07, "logits/chosen": -0.03910301998257637, "logits/rejected": 0.24898621439933777, "logps/chosen": -387.42022705078125, "logps/rejected": -459.38262939453125, "loss": 0.6485, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -113.9815673828125, "rewards/margins": 61.421913146972656, "rewards/rejected": -175.40347290039062, "step": 910 }, { "epoch": 0.24077466631771788, "grad_norm": 20.641644935857332, "learning_rate": 4.705005045028414e-07, "logits/chosen": 0.2061987817287445, "logits/rejected": 0.5889095664024353, "logps/chosen": -405.1602478027344, "logps/rejected": -368.80426025390625, "loss": 0.6533, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -141.77035522460938, "rewards/margins": 24.687891006469727, "rewards/rejected": -166.45823669433594, "step": 920 }, { "epoch": 0.24339178225595393, "grad_norm": 42.863101044059015, "learning_rate": 4.694147707194659e-07, "logits/chosen": 0.5138073563575745, "logits/rejected": 1.4130924940109253, "logps/chosen": -334.7785339355469, "logps/rejected": -423.53521728515625, "loss": 0.62, "rewards/accuracies": 0.75, "rewards/chosen": -111.02415466308594, "rewards/margins": 68.88450622558594, "rewards/rejected": -179.90866088867188, "step": 930 }, { "epoch": 0.24600889819419, "grad_norm": 17.582236860339187, "learning_rate": 4.683107158658781e-07, "logits/chosen": 0.26948803663253784, "logits/rejected": 1.579023838043213, "logps/chosen": -443.610107421875, "logps/rejected": -463.4444274902344, "loss": 0.6302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -142.50537109375, "rewards/margins": 68.03652954101562, "rewards/rejected": -210.54190063476562, "step": 940 }, { "epoch": 0.24862601413242608, "grad_norm": 30.341277759240167, "learning_rate": 4.6718843213034066e-07, "logits/chosen": -0.16793589293956757, "logits/rejected": 0.207515150308609, "logps/chosen": -403.0389099121094, "logps/rejected": -482.0472106933594, "loss": 0.6609, "rewards/accuracies": 0.625, "rewards/chosen": -112.68156433105469, "rewards/margins": 50.542869567871094, "rewards/rejected": -163.22445678710938, "step": 950 }, { "epoch": 0.2512431300706621, "grad_norm": 18.41663797308415, "learning_rate": 4.660480132232224e-07, "logits/chosen": -0.8736503720283508, "logits/rejected": 0.033030878752470016, "logps/chosen": -317.91522216796875, "logps/rejected": -392.4979553222656, "loss": 0.6237, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -76.37789916992188, "rewards/margins": 52.58796310424805, "rewards/rejected": -128.9658660888672, "step": 960 }, { "epoch": 0.25386024600889817, "grad_norm": 23.17667895149893, "learning_rate": 4.64889554369174e-07, "logits/chosen": -0.007824582047760487, "logits/rejected": 0.285226970911026, "logps/chosen": -247.00167846679688, "logps/rejected": -351.49774169921875, "loss": 0.6313, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -67.98593139648438, "rewards/margins": 68.30857849121094, "rewards/rejected": -136.2945098876953, "step": 970 }, { "epoch": 0.2564773619471343, "grad_norm": 25.935374616820546, "learning_rate": 4.637131522991764e-07, "logits/chosen": -0.6176963448524475, "logits/rejected": -0.6724565029144287, "logps/chosen": -349.24371337890625, "logps/rejected": -418.0118103027344, "loss": 0.6512, "rewards/accuracies": 0.625, "rewards/chosen": -93.6009292602539, "rewards/margins": 18.03414535522461, "rewards/rejected": -111.63508605957031, "step": 980 }, { "epoch": 0.2590944778853703, "grad_norm": 60.752178743088585, "learning_rate": 4.6251890524246375e-07, "logits/chosen": 0.0293558482080698, "logits/rejected": 1.5651161670684814, "logps/chosen": -417.18255615234375, "logps/rejected": -387.0553283691406, "loss": 0.6242, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -90.68150329589844, "rewards/margins": 60.28722381591797, "rewards/rejected": -150.96873474121094, "step": 990 }, { "epoch": 0.26171159382360637, "grad_norm": 50.547676030801206, "learning_rate": 4.613069129183218e-07, "logits/chosen": 1.1105889081954956, "logits/rejected": 1.4824490547180176, "logps/chosen": -373.8270263671875, "logps/rejected": -459.0462951660156, "loss": 0.6817, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -129.76486206054688, "rewards/margins": 66.0315933227539, "rewards/rejected": -195.7964324951172, "step": 1000 }, { "epoch": 0.2643287097618425, "grad_norm": 31.19672071911541, "learning_rate": 4.6007727652776065e-07, "logits/chosen": 1.3123009204864502, "logits/rejected": 1.8814923763275146, "logps/chosen": -388.00616455078125, "logps/rejected": -473.76318359375, "loss": 0.6325, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -136.20584106445312, "rewards/margins": 68.31822204589844, "rewards/rejected": -204.5240478515625, "step": 1010 }, { "epoch": 0.2669458257000785, "grad_norm": 24.38940664386023, "learning_rate": 4.588300987450652e-07, "logits/chosen": 0.7850022315979004, "logits/rejected": 1.2041956186294556, "logps/chosen": -416.10125732421875, "logps/rejected": -472.38397216796875, "loss": 0.6221, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -122.00775146484375, "rewards/margins": 70.46678161621094, "rewards/rejected": -192.4745330810547, "step": 1020 }, { "epoch": 0.26956294163831457, "grad_norm": 32.57226417553651, "learning_rate": 4.5756548370922134e-07, "logits/chosen": 0.8883573412895203, "logits/rejected": 1.642805814743042, "logps/chosen": -326.7561950683594, "logps/rejected": -388.120849609375, "loss": 0.6242, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -104.36289978027344, "rewards/margins": 54.60386276245117, "rewards/rejected": -158.96676635742188, "step": 1030 }, { "epoch": 0.2721800575765506, "grad_norm": 46.81257101957739, "learning_rate": 4.5628353701522047e-07, "logits/chosen": -0.09301260858774185, "logits/rejected": 1.3069690465927124, "logps/chosen": -414.9811096191406, "logps/rejected": -459.68389892578125, "loss": 0.6488, "rewards/accuracies": 0.75, "rewards/chosen": -83.94697570800781, "rewards/margins": 63.28023147583008, "rewards/rejected": -147.22718811035156, "step": 1040 }, { "epoch": 0.2747971735147867, "grad_norm": 28.130062724101716, "learning_rate": 4.549843657052429e-07, "logits/chosen": -0.6466625332832336, "logits/rejected": 0.41721582412719727, "logps/chosen": -322.29290771484375, "logps/rejected": -438.45501708984375, "loss": 0.6264, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -61.9997444152832, "rewards/margins": 81.4615249633789, "rewards/rejected": -143.46127319335938, "step": 1050 }, { "epoch": 0.27741428945302277, "grad_norm": 40.68291107851666, "learning_rate": 4.5366807825971907e-07, "logits/chosen": -0.6572477221488953, "logits/rejected": 0.26327458024024963, "logps/chosen": -343.57025146484375, "logps/rejected": -353.9472351074219, "loss": 0.6314, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -78.26343536376953, "rewards/margins": 58.32725143432617, "rewards/rejected": -136.59066772460938, "step": 1060 }, { "epoch": 0.2800314053912588, "grad_norm": 18.424013249011935, "learning_rate": 4.5233478458827176e-07, "logits/chosen": -1.0701769590377808, "logits/rejected": -0.38552147150039673, "logps/chosen": -371.5882263183594, "logps/rejected": -432.673583984375, "loss": 0.6271, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -81.17890930175781, "rewards/margins": 78.04898834228516, "rewards/rejected": -159.22787475585938, "step": 1070 }, { "epoch": 0.2826485213294949, "grad_norm": 29.259874543341486, "learning_rate": 4.509845960205389e-07, "logits/chosen": -0.3320876955986023, "logits/rejected": 0.8465584516525269, "logps/chosen": -347.73455810546875, "logps/rejected": -393.71173095703125, "loss": 0.6556, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -81.61093139648438, "rewards/margins": 78.703369140625, "rewards/rejected": -160.31430053710938, "step": 1080 }, { "epoch": 0.28526563726773096, "grad_norm": 54.92918197903394, "learning_rate": 4.4961762529687736e-07, "logits/chosen": 1.3831452131271362, "logits/rejected": 2.0626230239868164, "logps/chosen": -394.3235168457031, "logps/rejected": -404.1439208984375, "loss": 0.653, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -137.50064086914062, "rewards/margins": 56.01093673706055, "rewards/rejected": -193.51158142089844, "step": 1090 }, { "epoch": 0.287882753205967, "grad_norm": 53.85836537057901, "learning_rate": 4.482339865589492e-07, "logits/chosen": 1.0399128198623657, "logits/rejected": 2.5773065090179443, "logps/chosen": -447.4933166503906, "logps/rejected": -476.53717041015625, "loss": 0.6421, "rewards/accuracies": 0.75, "rewards/chosen": -129.35110473632812, "rewards/margins": 97.5198745727539, "rewards/rejected": -226.8709716796875, "step": 1100 }, { "epoch": 0.2904998691442031, "grad_norm": 29.02990654409812, "learning_rate": 4.4683379534019076e-07, "logits/chosen": 1.8409522771835327, "logits/rejected": 2.082953929901123, "logps/chosen": -398.29193115234375, "logps/rejected": -438.3072204589844, "loss": 0.6664, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -144.30996704101562, "rewards/margins": 39.263938903808594, "rewards/rejected": -183.57391357421875, "step": 1110 }, { "epoch": 0.29311698508243916, "grad_norm": 17.300862424332053, "learning_rate": 4.4541716855616593e-07, "logits/chosen": 1.154475450515747, "logits/rejected": 1.1236993074417114, "logps/chosen": -351.427978515625, "logps/rejected": -408.2262878417969, "loss": 0.6382, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -128.45620727539062, "rewards/margins": 44.44871139526367, "rewards/rejected": -172.90492248535156, "step": 1120 }, { "epoch": 0.2957341010206752, "grad_norm": 31.459949619459707, "learning_rate": 4.4398422449480357e-07, "logits/chosen": 0.3457348048686981, "logits/rejected": 0.668042778968811, "logps/chosen": -417.75115966796875, "logps/rejected": -487.89794921875, "loss": 0.632, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -119.79643249511719, "rewards/margins": 72.1993408203125, "rewards/rejected": -191.99578857421875, "step": 1130 }, { "epoch": 0.29835121695891126, "grad_norm": 24.115738554276895, "learning_rate": 4.4253508280652036e-07, "logits/chosen": 0.2072986364364624, "logits/rejected": 2.0747737884521484, "logps/chosen": -435.58673095703125, "logps/rejected": -402.98089599609375, "loss": 0.6209, "rewards/accuracies": 0.875, "rewards/chosen": -113.14810943603516, "rewards/margins": 88.4394302368164, "rewards/rejected": -201.58753967285156, "step": 1140 }, { "epoch": 0.30096833289714736, "grad_norm": 38.44970369580517, "learning_rate": 4.410698644942302e-07, "logits/chosen": 1.2417165040969849, "logits/rejected": 2.5007147789001465, "logps/chosen": -380.4283447265625, "logps/rejected": -374.45361328125, "loss": 0.6236, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -100.56785583496094, "rewards/margins": 60.188575744628906, "rewards/rejected": -160.7564239501953, "step": 1150 }, { "epoch": 0.3035854488353834, "grad_norm": 23.20243908616475, "learning_rate": 4.3958869190324057e-07, "logits/chosen": 1.181898593902588, "logits/rejected": 1.9439668655395508, "logps/chosen": -388.64276123046875, "logps/rejected": -410.17242431640625, "loss": 0.6239, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -98.65071105957031, "rewards/margins": 53.087440490722656, "rewards/rejected": -151.73812866210938, "step": 1160 }, { "epoch": 0.30620256477361946, "grad_norm": 27.16772844182103, "learning_rate": 4.380916887110365e-07, "logits/chosen": 0.9662491679191589, "logits/rejected": 2.0781798362731934, "logps/chosen": -342.8768005371094, "logps/rejected": -418.36102294921875, "loss": 0.6404, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -107.91712951660156, "rewards/margins": 76.5067138671875, "rewards/rejected": -184.42385864257812, "step": 1170 }, { "epoch": 0.30881968071185556, "grad_norm": 23.646068288118425, "learning_rate": 4.3657897991695394e-07, "logits/chosen": 0.5617043375968933, "logits/rejected": 1.8490877151489258, "logps/chosen": -370.76507568359375, "logps/rejected": -392.9139099121094, "loss": 0.6494, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -107.6982421875, "rewards/margins": 75.37187957763672, "rewards/rejected": -183.0701446533203, "step": 1180 }, { "epoch": 0.3114367966500916, "grad_norm": 17.745565087194286, "learning_rate": 4.350506918317416e-07, "logits/chosen": 0.6922825574874878, "logits/rejected": 1.2396411895751953, "logps/chosen": -435.5896911621094, "logps/rejected": -460.70233154296875, "loss": 0.6229, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -120.14180755615234, "rewards/margins": 44.74254608154297, "rewards/rejected": -164.88436889648438, "step": 1190 }, { "epoch": 0.31405391258832765, "grad_norm": 41.0845785288509, "learning_rate": 4.335069520670149e-07, "logits/chosen": 1.7557027339935303, "logits/rejected": 2.1554341316223145, "logps/chosen": -435.21697998046875, "logps/rejected": -479.82550048828125, "loss": 0.6491, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -166.35964965820312, "rewards/margins": 48.9127311706543, "rewards/rejected": -215.2723846435547, "step": 1200 }, { "epoch": 0.3166710285265637, "grad_norm": 62.23468705652573, "learning_rate": 4.319478895245999e-07, "logits/chosen": 1.0253560543060303, "logits/rejected": 1.471136450767517, "logps/chosen": -379.4768981933594, "logps/rejected": -497.72113037109375, "loss": 0.5809, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -130.55929565429688, "rewards/margins": 83.6629638671875, "rewards/rejected": -214.22225952148438, "step": 1210 }, { "epoch": 0.3192881444647998, "grad_norm": 26.246391316141533, "learning_rate": 4.3037363438577036e-07, "logits/chosen": 0.5762010216712952, "logits/rejected": 1.9362785816192627, "logps/chosen": -396.3133544921875, "logps/rejected": -516.368408203125, "loss": 0.5856, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -113.91856384277344, "rewards/margins": 118.1727523803711, "rewards/rejected": -232.09133911132812, "step": 1220 }, { "epoch": 0.32190526040303585, "grad_norm": 49.15854501313321, "learning_rate": 4.2878431810037716e-07, "logits/chosen": 1.5246806144714355, "logits/rejected": 2.3451852798461914, "logps/chosen": -372.4715576171875, "logps/rejected": -474.74169921875, "loss": 0.6256, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -156.90863037109375, "rewards/margins": 109.8390121459961, "rewards/rejected": -266.74761962890625, "step": 1230 }, { "epoch": 0.3245223763412719, "grad_norm": 40.303888351983076, "learning_rate": 4.271800733758729e-07, "logits/chosen": 0.6723651885986328, "logits/rejected": 1.2088282108306885, "logps/chosen": -459.427490234375, "logps/rejected": -454.59136962890625, "loss": 0.6454, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -121.82243347167969, "rewards/margins": 57.75952911376953, "rewards/rejected": -179.58197021484375, "step": 1240 }, { "epoch": 0.327139492279508, "grad_norm": 25.566591095191676, "learning_rate": 4.255610341662304e-07, "logits/chosen": -0.0015428184997290373, "logits/rejected": 0.6781773567199707, "logps/chosen": -351.0030212402344, "logps/rejected": -389.29718017578125, "loss": 0.6228, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -99.91766357421875, "rewards/margins": 89.89549255371094, "rewards/rejected": -189.81317138671875, "step": 1250 }, { "epoch": 0.32975660821774405, "grad_norm": 76.72537095439779, "learning_rate": 4.2392733566075757e-07, "logits/chosen": -0.43047595024108887, "logits/rejected": 0.378257155418396, "logps/chosen": -412.3002014160156, "logps/rejected": -444.5391540527344, "loss": 0.6394, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -121.55326843261719, "rewards/margins": 38.64310073852539, "rewards/rejected": -160.19638061523438, "step": 1260 }, { "epoch": 0.3323737241559801, "grad_norm": 52.61567100724301, "learning_rate": 4.2227911427280973e-07, "logits/chosen": -0.7219769954681396, "logits/rejected": 0.48331594467163086, "logps/chosen": -427.62054443359375, "logps/rejected": -450.5807189941406, "loss": 0.6512, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -113.51541900634766, "rewards/margins": 68.88684844970703, "rewards/rejected": -182.4022674560547, "step": 1270 }, { "epoch": 0.33499084009421615, "grad_norm": 30.88397795823836, "learning_rate": 4.206165076283982e-07, "logits/chosen": -0.9375940561294556, "logits/rejected": 1.5283622741699219, "logps/chosen": -421.84320068359375, "logps/rejected": -440.0548400878906, "loss": 0.5959, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -116.27479553222656, "rewards/margins": 98.38531494140625, "rewards/rejected": -214.6601104736328, "step": 1280 }, { "epoch": 0.33760795603245225, "grad_norm": 57.576483810059, "learning_rate": 4.1893965455469946e-07, "logits/chosen": -0.68849116563797, "logits/rejected": 1.0034667253494263, "logps/chosen": -374.0795593261719, "logps/rejected": -387.61724853515625, "loss": 0.6091, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -99.40745544433594, "rewards/margins": 77.72341918945312, "rewards/rejected": -177.13088989257812, "step": 1290 }, { "epoch": 0.3402250719706883, "grad_norm": 31.72284091111631, "learning_rate": 4.172486950684626e-07, "logits/chosen": 0.16339489817619324, "logits/rejected": 1.6479781866073608, "logps/chosen": -434.8985900878906, "logps/rejected": -434.3905334472656, "loss": 0.662, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -159.21035766601562, "rewards/margins": 49.35946273803711, "rewards/rejected": -208.56982421875, "step": 1300 }, { "epoch": 0.34284218790892435, "grad_norm": 24.64306569773067, "learning_rate": 4.155437703643181e-07, "logits/chosen": -0.7670415639877319, "logits/rejected": -0.2981819808483124, "logps/chosen": -374.2372741699219, "logps/rejected": -417.8731384277344, "loss": 0.6229, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -111.1018295288086, "rewards/margins": 54.38102340698242, "rewards/rejected": -165.4828643798828, "step": 1310 }, { "epoch": 0.34545930384716045, "grad_norm": 26.087342646568175, "learning_rate": 4.138250228029881e-07, "logits/chosen": -0.5596447587013245, "logits/rejected": 0.2671245038509369, "logps/chosen": -410.1627502441406, "logps/rejected": -433.45953369140625, "loss": 0.6282, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -120.9754867553711, "rewards/margins": 55.230552673339844, "rewards/rejected": -176.20603942871094, "step": 1320 }, { "epoch": 0.3480764197853965, "grad_norm": 20.376442525686873, "learning_rate": 4.1209259589939935e-07, "logits/chosen": -0.38580647110939026, "logits/rejected": 0.4782138764858246, "logps/chosen": -339.5039367675781, "logps/rejected": -380.3269958496094, "loss": 0.6346, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -94.53594970703125, "rewards/margins": 66.47621154785156, "rewards/rejected": -161.01217651367188, "step": 1330 }, { "epoch": 0.35069353572363254, "grad_norm": 21.37216900838956, "learning_rate": 4.103466343106998e-07, "logits/chosen": -0.649118185043335, "logits/rejected": 0.28338345885276794, "logps/chosen": -398.08270263671875, "logps/rejected": -392.17437744140625, "loss": 0.644, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -98.56443786621094, "rewards/margins": 50.19805145263672, "rewards/rejected": -148.76248168945312, "step": 1340 }, { "epoch": 0.35331065166186865, "grad_norm": 28.767958733320434, "learning_rate": 4.085872838241796e-07, "logits/chosen": -0.13218168914318085, "logits/rejected": 0.9735630750656128, "logps/chosen": -355.2423095703125, "logps/rejected": -426.5146484375, "loss": 0.612, "rewards/accuracies": 0.75, "rewards/chosen": -110.88926696777344, "rewards/margins": 95.35261535644531, "rewards/rejected": -206.2418975830078, "step": 1350 }, { "epoch": 0.3559277676001047, "grad_norm": 28.419877906639005, "learning_rate": 4.06814691345098e-07, "logits/chosen": -0.24849538505077362, "logits/rejected": 1.1262340545654297, "logps/chosen": -443.79180908203125, "logps/rejected": -455.39990234375, "loss": 0.6039, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -112.5448226928711, "rewards/margins": 78.75032806396484, "rewards/rejected": -191.295166015625, "step": 1360 }, { "epoch": 0.35854488353834074, "grad_norm": 68.82439419877379, "learning_rate": 4.0502900488441707e-07, "logits/chosen": 0.19888067245483398, "logits/rejected": 1.122232437133789, "logps/chosen": -423.8077697753906, "logps/rejected": -449.03509521484375, "loss": 0.6206, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -138.73602294921875, "rewards/margins": 72.9966812133789, "rewards/rejected": -211.7327117919922, "step": 1370 }, { "epoch": 0.3611619994765768, "grad_norm": 15.580407976066924, "learning_rate": 4.032303735464422e-07, "logits/chosen": 0.5571808218955994, "logits/rejected": 1.5645115375518799, "logps/chosen": -443.7701110839844, "logps/rejected": -538.642333984375, "loss": 0.6282, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -156.4287567138672, "rewards/margins": 87.27613830566406, "rewards/rejected": -243.7049102783203, "step": 1380 }, { "epoch": 0.3637791154148129, "grad_norm": 25.711219077342538, "learning_rate": 4.014189475163726e-07, "logits/chosen": 0.42829257249832153, "logits/rejected": 2.6174747943878174, "logps/chosen": -480.5956115722656, "logps/rejected": -478.7676696777344, "loss": 0.6314, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -151.87530517578125, "rewards/margins": 86.75433349609375, "rewards/rejected": -238.629638671875, "step": 1390 }, { "epoch": 0.36639623135304894, "grad_norm": 19.353569261594092, "learning_rate": 3.995948780477605e-07, "logits/chosen": 0.7079881429672241, "logits/rejected": 2.395602226257324, "logps/chosen": -477.9998474121094, "logps/rejected": -517.8780517578125, "loss": 0.6215, "rewards/accuracies": 0.75, "rewards/chosen": -151.6291961669922, "rewards/margins": 93.9462890625, "rewards/rejected": -245.5754852294922, "step": 1400 }, { "epoch": 0.369013347291285, "grad_norm": 29.67895552774033, "learning_rate": 3.977583174498816e-07, "logits/chosen": 0.24756821990013123, "logits/rejected": 0.8279555439949036, "logps/chosen": -358.0359802246094, "logps/rejected": -396.7525939941406, "loss": 0.6623, "rewards/accuracies": 0.625, "rewards/chosen": -118.51368713378906, "rewards/margins": 39.870086669921875, "rewards/rejected": -158.38375854492188, "step": 1410 }, { "epoch": 0.3716304632295211, "grad_norm": 22.478816315434774, "learning_rate": 3.9590941907501717e-07, "logits/chosen": 0.16131027042865753, "logits/rejected": 1.7322285175323486, "logps/chosen": -397.6242370605469, "logps/rejected": -410.8728942871094, "loss": 0.6435, "rewards/accuracies": 0.75, "rewards/chosen": -137.3665008544922, "rewards/margins": 59.99677658081055, "rewards/rejected": -197.36328125, "step": 1420 }, { "epoch": 0.37424757916775714, "grad_norm": 25.09540882981143, "learning_rate": 3.9404833730564974e-07, "logits/chosen": 0.5373800992965698, "logits/rejected": 1.047599196434021, "logps/chosen": -433.1474609375, "logps/rejected": -441.2938537597656, "loss": 0.6193, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -142.21206665039062, "rewards/margins": 46.76278305053711, "rewards/rejected": -188.97482299804688, "step": 1430 }, { "epoch": 0.3768646951059932, "grad_norm": 15.852691574589016, "learning_rate": 3.9217522754157117e-07, "logits/chosen": -0.2401774376630783, "logits/rejected": 0.3488456606864929, "logps/chosen": -358.58544921875, "logps/rejected": -401.83233642578125, "loss": 0.5961, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -120.71138000488281, "rewards/margins": 69.83918762207031, "rewards/rejected": -190.55055236816406, "step": 1440 }, { "epoch": 0.37948181104422923, "grad_norm": 47.29678098178137, "learning_rate": 3.9029024618690785e-07, "logits/chosen": -0.9757177233695984, "logits/rejected": 0.08731711655855179, "logps/chosen": -320.13580322265625, "logps/rejected": -409.79010009765625, "loss": 0.6304, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -109.36761474609375, "rewards/margins": 76.31434631347656, "rewards/rejected": -185.68197631835938, "step": 1450 }, { "epoch": 0.38209892698246534, "grad_norm": 20.648940769058072, "learning_rate": 3.883935506370605e-07, "logits/chosen": -0.8205481767654419, "logits/rejected": 0.7159221172332764, "logps/chosen": -396.99591064453125, "logps/rejected": -503.263916015625, "loss": 0.6008, "rewards/accuracies": 0.875, "rewards/chosen": -131.62887573242188, "rewards/margins": 118.54338073730469, "rewards/rejected": -250.1722412109375, "step": 1460 }, { "epoch": 0.3847160429207014, "grad_norm": 34.965513352204376, "learning_rate": 3.864852992655616e-07, "logits/chosen": -0.4378627836704254, "logits/rejected": 1.2069480419158936, "logps/chosen": -369.70904541015625, "logps/rejected": -504.4092712402344, "loss": 0.6134, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -133.9385986328125, "rewards/margins": 106.6314468383789, "rewards/rejected": -240.57003784179688, "step": 1470 }, { "epoch": 0.38733315885893743, "grad_norm": 39.359286384644776, "learning_rate": 3.845656514108515e-07, "logits/chosen": -0.2930401861667633, "logits/rejected": 1.5885709524154663, "logps/chosen": -504.27288818359375, "logps/rejected": -531.5779418945312, "loss": 0.6244, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -164.9402313232422, "rewards/margins": 104.5406494140625, "rewards/rejected": -269.48089599609375, "step": 1480 }, { "epoch": 0.38995027479717354, "grad_norm": 133.50854420589513, "learning_rate": 3.8263476736297375e-07, "logits/chosen": -0.17435605823993683, "logits/rejected": 1.8973588943481445, "logps/chosen": -486.27569580078125, "logps/rejected": -542.3553466796875, "loss": 0.6385, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -172.06533813476562, "rewards/margins": 126.09920501708984, "rewards/rejected": -298.1645202636719, "step": 1490 }, { "epoch": 0.3925673907354096, "grad_norm": 51.73277823324011, "learning_rate": 3.8069280835019055e-07, "logits/chosen": 0.6358996033668518, "logits/rejected": 1.3472615480422974, "logps/chosen": -401.0054626464844, "logps/rejected": -480.7518005371094, "loss": 0.6402, "rewards/accuracies": 0.75, "rewards/chosen": -143.2891387939453, "rewards/margins": 92.3449478149414, "rewards/rejected": -235.6341094970703, "step": 1500 }, { "epoch": 0.39518450667364563, "grad_norm": 21.226979866730023, "learning_rate": 3.7873993652552073e-07, "logits/chosen": 1.3380610942840576, "logits/rejected": 2.3835110664367676, "logps/chosen": -434.25274658203125, "logps/rejected": -512.7623901367188, "loss": 0.6167, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -170.42994689941406, "rewards/margins": 104.90080261230469, "rewards/rejected": -275.3307189941406, "step": 1510 }, { "epoch": 0.39780162261188173, "grad_norm": 25.919917334233475, "learning_rate": 3.767763149531995e-07, "logits/chosen": 1.0969598293304443, "logits/rejected": 2.068394660949707, "logps/chosen": -389.23974609375, "logps/rejected": -449.2406311035156, "loss": 0.6114, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -158.19741821289062, "rewards/margins": 76.74111938476562, "rewards/rejected": -234.9385223388672, "step": 1520 }, { "epoch": 0.4004187385501178, "grad_norm": 16.290946941569523, "learning_rate": 3.7480210759506326e-07, "logits/chosen": 0.4282687306404114, "logits/rejected": 1.8052078485488892, "logps/chosen": -349.53875732421875, "logps/rejected": -432.3233337402344, "loss": 0.6152, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -122.47404479980469, "rewards/margins": 89.38887023925781, "rewards/rejected": -211.86288452148438, "step": 1530 }, { "epoch": 0.40303585448835383, "grad_norm": 22.714557755720946, "learning_rate": 3.728174792968582e-07, "logits/chosen": 0.4320814609527588, "logits/rejected": 0.4742346405982971, "logps/chosen": -325.5453186035156, "logps/rejected": -462.4740295410156, "loss": 0.6216, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -116.84428405761719, "rewards/margins": 77.38162994384766, "rewards/rejected": -194.2259063720703, "step": 1540 }, { "epoch": 0.4056529704265899, "grad_norm": 26.358328629224562, "learning_rate": 3.70822595774476e-07, "logits/chosen": -0.13673920929431915, "logits/rejected": 0.8279534578323364, "logps/chosen": -367.3028259277344, "logps/rejected": -376.034912109375, "loss": 0.6665, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -137.3899688720703, "rewards/margins": 37.6882209777832, "rewards/rejected": -175.0781707763672, "step": 1550 }, { "epoch": 0.408270086364826, "grad_norm": 28.476109770564317, "learning_rate": 3.688176236001168e-07, "logits/chosen": -0.4365410804748535, "logits/rejected": 0.026941100135445595, "logps/chosen": -361.25994873046875, "logps/rejected": -420.5032653808594, "loss": 0.6545, "rewards/accuracies": 0.625, "rewards/chosen": -106.12479400634766, "rewards/margins": 39.8696174621582, "rewards/rejected": -145.99441528320312, "step": 1560 }, { "epoch": 0.410887202303062, "grad_norm": 28.010241239383195, "learning_rate": 3.6680273018838016e-07, "logits/chosen": -1.1525790691375732, "logits/rejected": 0.04848083108663559, "logps/chosen": -332.90716552734375, "logps/rejected": -363.8558044433594, "loss": 0.6434, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -97.48597717285156, "rewards/margins": 61.830177307128906, "rewards/rejected": -159.31617736816406, "step": 1570 }, { "epoch": 0.4135043182412981, "grad_norm": 20.182263158112, "learning_rate": 3.6477808378228596e-07, "logits/chosen": -0.7257155776023865, "logits/rejected": 0.9191628694534302, "logps/chosen": -387.0204162597656, "logps/rejected": -464.8163146972656, "loss": 0.5897, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -95.88233947753906, "rewards/margins": 116.20806884765625, "rewards/rejected": -212.0903778076172, "step": 1580 }, { "epoch": 0.4161214341795342, "grad_norm": 23.991143684652364, "learning_rate": 3.6274385343922674e-07, "logits/chosen": -0.1758570373058319, "logits/rejected": 1.9301687479019165, "logps/chosen": -406.59161376953125, "logps/rejected": -472.0311584472656, "loss": 0.6064, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -98.99046325683594, "rewards/margins": 119.24066162109375, "rewards/rejected": -218.2311248779297, "step": 1590 }, { "epoch": 0.4187385501177702, "grad_norm": 28.972630342872332, "learning_rate": 3.6070020901685057e-07, "logits/chosen": 0.7765110731124878, "logits/rejected": 1.9538475275039673, "logps/chosen": -423.0254821777344, "logps/rejected": -460.91815185546875, "loss": 0.6559, "rewards/accuracies": 0.625, "rewards/chosen": -169.84713745117188, "rewards/margins": 64.01656341552734, "rewards/rejected": -233.8636932373047, "step": 1600 }, { "epoch": 0.4213556660560063, "grad_norm": 41.02288224605125, "learning_rate": 3.5864732115887863e-07, "logits/chosen": 0.2611493766307831, "logits/rejected": 2.171125888824463, "logps/chosen": -425.1031799316406, "logps/rejected": -486.8959045410156, "loss": 0.6102, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -140.05665588378906, "rewards/margins": 108.6607437133789, "rewards/rejected": -248.7174072265625, "step": 1610 }, { "epoch": 0.4239727819942423, "grad_norm": 25.694369532356813, "learning_rate": 3.565853612808562e-07, "logits/chosen": -0.3281291127204895, "logits/rejected": 1.471435785293579, "logps/chosen": -434.2025451660156, "logps/rejected": -423.0579528808594, "loss": 0.617, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -112.0459213256836, "rewards/margins": 81.47265625, "rewards/rejected": -193.51858520507812, "step": 1620 }, { "epoch": 0.4265898979324784, "grad_norm": 22.37578326192518, "learning_rate": 3.5451450155583984e-07, "logits/chosen": -0.8572834134101868, "logits/rejected": 0.5741704702377319, "logps/chosen": -437.5694274902344, "logps/rejected": -434.1827087402344, "loss": 0.6029, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -104.87544250488281, "rewards/margins": 64.55244445800781, "rewards/rejected": -169.42788696289062, "step": 1630 }, { "epoch": 0.42920701387071447, "grad_norm": 22.286497715425547, "learning_rate": 3.5243491490002055e-07, "logits/chosen": 0.08054514974355698, "logits/rejected": 0.3288571834564209, "logps/chosen": -337.58953857421875, "logps/rejected": -490.2518615722656, "loss": 0.6122, "rewards/accuracies": 0.75, "rewards/chosen": -115.36927795410156, "rewards/margins": 88.35755157470703, "rewards/rejected": -203.726806640625, "step": 1640 }, { "epoch": 0.4318241298089505, "grad_norm": 17.18902655970043, "learning_rate": 3.503467749582857e-07, "logits/chosen": -0.857958972454071, "logits/rejected": 0.0014857888454571366, "logps/chosen": -389.71380615234375, "logps/rejected": -466.3826599121094, "loss": 0.6358, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -107.03041076660156, "rewards/margins": 70.32191467285156, "rewards/rejected": -177.35232543945312, "step": 1650 }, { "epoch": 0.4344412457471866, "grad_norm": 102.49373012437187, "learning_rate": 3.482502560897194e-07, "logits/chosen": -0.3570778965950012, "logits/rejected": 0.19285421073436737, "logps/chosen": -311.11236572265625, "logps/rejected": -359.0173645019531, "loss": 0.6409, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -113.4951171875, "rewards/margins": 34.06425857543945, "rewards/rejected": -147.5593719482422, "step": 1660 }, { "epoch": 0.43705836168542267, "grad_norm": 30.752504031354707, "learning_rate": 3.4614553335304403e-07, "logits/chosen": -0.8707353472709656, "logits/rejected": -0.01903078332543373, "logps/chosen": -457.0826110839844, "logps/rejected": -451.1153259277344, "loss": 0.6616, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -121.56687927246094, "rewards/margins": 53.41261672973633, "rewards/rejected": -174.9794921875, "step": 1670 }, { "epoch": 0.4396754776236587, "grad_norm": 22.035717198899118, "learning_rate": 3.440327824920022e-07, "logits/chosen": -0.7067769765853882, "logits/rejected": 0.44209250807762146, "logps/chosen": -370.5931091308594, "logps/rejected": -435.90020751953125, "loss": 0.6212, "rewards/accuracies": 0.75, "rewards/chosen": -93.64749908447266, "rewards/margins": 78.0377197265625, "rewards/rejected": -171.68521118164062, "step": 1680 }, { "epoch": 0.44229259356189476, "grad_norm": 80.94342434820176, "learning_rate": 3.4191217992068287e-07, "logits/chosen": -0.795087456703186, "logits/rejected": -0.14704521000385284, "logps/chosen": -319.4860534667969, "logps/rejected": -388.4473571777344, "loss": 0.5818, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -114.21246337890625, "rewards/margins": 56.797027587890625, "rewards/rejected": -171.0094757080078, "step": 1690 }, { "epoch": 0.44490970950013087, "grad_norm": 30.56889810788254, "learning_rate": 3.3978390270879056e-07, "logits/chosen": -0.2528185248374939, "logits/rejected": 1.1413016319274902, "logps/chosen": -351.8018493652344, "logps/rejected": -426.44696044921875, "loss": 0.64, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -101.383056640625, "rewards/margins": 88.7364273071289, "rewards/rejected": -190.11947631835938, "step": 1700 }, { "epoch": 0.4475268254383669, "grad_norm": 36.49606260048181, "learning_rate": 3.376481285668599e-07, "logits/chosen": -0.07895590364933014, "logits/rejected": 1.1439310312271118, "logps/chosen": -411.49530029296875, "logps/rejected": -513.4984130859375, "loss": 0.6144, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -121.69512939453125, "rewards/margins": 86.84122467041016, "rewards/rejected": -208.53634643554688, "step": 1710 }, { "epoch": 0.45014394137660296, "grad_norm": 66.13535061783702, "learning_rate": 3.355050358314172e-07, "logits/chosen": 0.14188793301582336, "logits/rejected": 1.6301027536392212, "logps/chosen": -429.80035400390625, "logps/rejected": -522.5330810546875, "loss": 0.6426, "rewards/accuracies": 0.875, "rewards/chosen": -124.7596664428711, "rewards/margins": 104.18574523925781, "rewards/rejected": -228.9454345703125, "step": 1720 }, { "epoch": 0.45276105731483907, "grad_norm": 29.691570545902252, "learning_rate": 3.33354803450089e-07, "logits/chosen": 1.466657280921936, "logits/rejected": 1.8799492120742798, "logps/chosen": -456.29180908203125, "logps/rejected": -482.02435302734375, "loss": 0.6357, "rewards/accuracies": 0.75, "rewards/chosen": -154.3424072265625, "rewards/margins": 59.94362258911133, "rewards/rejected": -214.28604125976562, "step": 1730 }, { "epoch": 0.4553781732530751, "grad_norm": 46.404207705281095, "learning_rate": 3.311976109666605e-07, "logits/chosen": 0.9557245373725891, "logits/rejected": 2.0985617637634277, "logps/chosen": -417.13018798828125, "logps/rejected": -554.0835571289062, "loss": 0.6334, "rewards/accuracies": 0.875, "rewards/chosen": -138.3618621826172, "rewards/margins": 120.8489990234375, "rewards/rejected": -259.21087646484375, "step": 1740 }, { "epoch": 0.45799528919131116, "grad_norm": 31.419258030314534, "learning_rate": 3.2903363850608317e-07, "logits/chosen": 1.717154860496521, "logits/rejected": 2.31295108795166, "logps/chosen": -435.69207763671875, "logps/rejected": -515.6895751953125, "loss": 0.6113, "rewards/accuracies": 0.75, "rewards/chosen": -178.9014129638672, "rewards/margins": 81.10930633544922, "rewards/rejected": -260.0107116699219, "step": 1750 }, { "epoch": 0.46061240512954726, "grad_norm": 22.348281616345037, "learning_rate": 3.2686306675943477e-07, "logits/chosen": 1.0877479314804077, "logits/rejected": 3.3799686431884766, "logps/chosen": -437.88922119140625, "logps/rejected": -477.70269775390625, "loss": 0.6238, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -176.03854370117188, "rewards/margins": 105.66455078125, "rewards/rejected": -281.703125, "step": 1760 }, { "epoch": 0.4632295210677833, "grad_norm": 29.335315749809492, "learning_rate": 3.2468607696883145e-07, "logits/chosen": 1.5161404609680176, "logits/rejected": 2.1498818397521973, "logps/chosen": -442.09552001953125, "logps/rejected": -503.4599609375, "loss": 0.6141, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -176.09494018554688, "rewards/margins": 75.00154876708984, "rewards/rejected": -251.0964813232422, "step": 1770 }, { "epoch": 0.46584663700601936, "grad_norm": 44.758684680886404, "learning_rate": 3.2250285091229435e-07, "logits/chosen": 1.6801475286483765, "logits/rejected": 2.2319846153259277, "logps/chosen": -435.25421142578125, "logps/rejected": -482.51409912109375, "loss": 0.6287, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -183.05035400390625, "rewards/margins": 72.37165832519531, "rewards/rejected": -255.4219970703125, "step": 1780 }, { "epoch": 0.4684637529442554, "grad_norm": 31.858283788755106, "learning_rate": 3.2031357088857083e-07, "logits/chosen": 2.3322768211364746, "logits/rejected": 3.3507091999053955, "logps/chosen": -466.07000732421875, "logps/rejected": -508.0931701660156, "loss": 0.6111, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -181.24215698242188, "rewards/margins": 83.99452209472656, "rewards/rejected": -265.23663330078125, "step": 1790 }, { "epoch": 0.4710808688824915, "grad_norm": 29.74675313933934, "learning_rate": 3.1811841970191267e-07, "logits/chosen": 1.5117183923721313, "logits/rejected": 2.4991650581359863, "logps/chosen": -473.57568359375, "logps/rejected": -536.4353637695312, "loss": 0.6134, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -162.0258331298828, "rewards/margins": 80.58038330078125, "rewards/rejected": -242.6062469482422, "step": 1800 }, { "epoch": 0.47369798482072756, "grad_norm": 27.35977465905274, "learning_rate": 3.1591758064681257e-07, "logits/chosen": 0.7779955863952637, "logits/rejected": 2.2222166061401367, "logps/chosen": -376.4127502441406, "logps/rejected": -483.91790771484375, "loss": 0.6337, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -168.23818969726562, "rewards/margins": 109.63935852050781, "rewards/rejected": -277.8775634765625, "step": 1810 }, { "epoch": 0.4763151007589636, "grad_norm": 19.99968017850521, "learning_rate": 3.13711237492698e-07, "logits/chosen": 0.8206868171691895, "logits/rejected": 2.1102559566497803, "logps/chosen": -443.5704040527344, "logps/rejected": -483.1748046875, "loss": 0.6148, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -154.49142456054688, "rewards/margins": 85.27095794677734, "rewards/rejected": -239.7623748779297, "step": 1820 }, { "epoch": 0.4789322166971997, "grad_norm": 21.48288399086547, "learning_rate": 3.1149957446858767e-07, "logits/chosen": 0.4431106448173523, "logits/rejected": 1.139479398727417, "logps/chosen": -385.325927734375, "logps/rejected": -440.3143005371094, "loss": 0.5981, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -143.1088104248047, "rewards/margins": 81.86103057861328, "rewards/rejected": -224.9698486328125, "step": 1830 }, { "epoch": 0.48154933263543576, "grad_norm": 27.109010660723932, "learning_rate": 3.0928277624770736e-07, "logits/chosen": 0.6995161771774292, "logits/rejected": 1.9806559085845947, "logps/chosen": -373.56597900390625, "logps/rejected": -450.35137939453125, "loss": 0.613, "rewards/accuracies": 0.75, "rewards/chosen": -123.50726318359375, "rewards/margins": 83.88824462890625, "rewards/rejected": -207.3955078125, "step": 1840 }, { "epoch": 0.4841664485736718, "grad_norm": 22.180659701287773, "learning_rate": 3.0706102793207073e-07, "logits/chosen": 0.9623844027519226, "logits/rejected": 1.8174850940704346, "logps/chosen": -443.29730224609375, "logps/rejected": -453.452392578125, "loss": 0.6264, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -143.41941833496094, "rewards/margins": 51.09202194213867, "rewards/rejected": -194.51145935058594, "step": 1850 }, { "epoch": 0.48678356451190785, "grad_norm": 26.242087699994133, "learning_rate": 3.048345150370226e-07, "logits/chosen": 0.8996448516845703, "logits/rejected": 1.7648754119873047, "logps/chosen": -344.4842529296875, "logps/rejected": -445.62921142578125, "loss": 0.6158, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -114.01617431640625, "rewards/margins": 96.75332641601562, "rewards/rejected": -210.76950073242188, "step": 1860 }, { "epoch": 0.48940068045014395, "grad_norm": 24.019046468101173, "learning_rate": 3.0260342347574913e-07, "logits/chosen": 0.12005790323019028, "logits/rejected": 2.031740188598633, "logps/chosen": -410.7184143066406, "logps/rejected": -538.4361572265625, "loss": 0.6192, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -140.0440216064453, "rewards/margins": 124.01192474365234, "rewards/rejected": -264.05596923828125, "step": 1870 }, { "epoch": 0.49201779638838, "grad_norm": 23.465321376516627, "learning_rate": 3.0036793954375357e-07, "logits/chosen": -0.1856812685728073, "logits/rejected": 0.7818856239318848, "logps/chosen": -422.85186767578125, "logps/rejected": -462.67852783203125, "loss": 0.6195, "rewards/accuracies": 0.75, "rewards/chosen": -120.50177001953125, "rewards/margins": 73.6959228515625, "rewards/rejected": -194.19769287109375, "step": 1880 }, { "epoch": 0.49463491232661605, "grad_norm": 34.7409595895184, "learning_rate": 2.9812824990330085e-07, "logits/chosen": 0.8234826922416687, "logits/rejected": 1.6676380634307861, "logps/chosen": -393.53399658203125, "logps/rejected": -445.2818908691406, "loss": 0.6249, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -143.31112670898438, "rewards/margins": 62.34083938598633, "rewards/rejected": -205.65194702148438, "step": 1890 }, { "epoch": 0.49725202826485215, "grad_norm": 46.69831571470922, "learning_rate": 2.958845415678316e-07, "logits/chosen": 0.357837975025177, "logits/rejected": 1.3686195611953735, "logps/chosen": -350.6862487792969, "logps/rejected": -424.646728515625, "loss": 0.6191, "rewards/accuracies": 0.75, "rewards/chosen": -139.62425231933594, "rewards/margins": 73.19682312011719, "rewards/rejected": -212.8210906982422, "step": 1900 }, { "epoch": 0.4998691442030882, "grad_norm": 31.257813261511235, "learning_rate": 2.936370018863459e-07, "logits/chosen": 0.6837180852890015, "logits/rejected": 1.4936373233795166, "logps/chosen": -393.9849548339844, "logps/rejected": -457.9052734375, "loss": 0.629, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -145.4834747314453, "rewards/margins": 73.64918518066406, "rewards/rejected": -219.1326446533203, "step": 1910 }, { "epoch": 0.5024862601413242, "grad_norm": 20.96702719121057, "learning_rate": 2.913858185277605e-07, "logits/chosen": 0.0933571457862854, "logits/rejected": 1.821059226989746, "logps/chosen": -408.1490173339844, "logps/rejected": -472.2452697753906, "loss": 0.6138, "rewards/accuracies": 0.75, "rewards/chosen": -137.5945587158203, "rewards/margins": 102.65091705322266, "rewards/rejected": -240.24545288085938, "step": 1920 }, { "epoch": 0.5051033760795604, "grad_norm": 48.88683198411564, "learning_rate": 2.89131179465238e-07, "logits/chosen": 0.2965295910835266, "logits/rejected": 1.3621985912322998, "logps/chosen": -435.49591064453125, "logps/rejected": -500.6187438964844, "loss": 0.639, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -165.7919158935547, "rewards/margins": 73.27894592285156, "rewards/rejected": -239.07089233398438, "step": 1930 }, { "epoch": 0.5077204920177963, "grad_norm": 16.11236019830028, "learning_rate": 2.8687327296049125e-07, "logits/chosen": 0.9378612637519836, "logits/rejected": 2.076969623565674, "logps/chosen": -454.8284606933594, "logps/rejected": -496.15289306640625, "loss": 0.6137, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -160.6776123046875, "rewards/margins": 83.29139709472656, "rewards/rejected": -243.96902465820312, "step": 1940 }, { "epoch": 0.5103376079560324, "grad_norm": 31.755303999536356, "learning_rate": 2.846122875480637e-07, "logits/chosen": 1.4026004076004028, "logits/rejected": 2.3323285579681396, "logps/chosen": -454.24560546875, "logps/rejected": -539.9759521484375, "loss": 0.6102, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -189.9915008544922, "rewards/margins": 90.0781021118164, "rewards/rejected": -280.0696105957031, "step": 1950 }, { "epoch": 0.5129547238942685, "grad_norm": 18.193341038011287, "learning_rate": 2.8234841201958647e-07, "logits/chosen": 1.2526977062225342, "logits/rejected": 2.378753423690796, "logps/chosen": -448.1241149902344, "logps/rejected": -493.2060546875, "loss": 0.6047, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -148.1785888671875, "rewards/margins": 85.16536712646484, "rewards/rejected": -233.343994140625, "step": 1960 }, { "epoch": 0.5155718398325045, "grad_norm": 42.523096927869034, "learning_rate": 2.800818354080148e-07, "logits/chosen": 0.8163898587226868, "logits/rejected": 2.261420965194702, "logps/chosen": -426.493896484375, "logps/rejected": -436.91143798828125, "loss": 0.6468, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -162.7000732421875, "rewards/margins": 73.9859619140625, "rewards/rejected": -236.68606567382812, "step": 1970 }, { "epoch": 0.5181889557707406, "grad_norm": 28.67012335174378, "learning_rate": 2.778127469718435e-07, "logits/chosen": 0.3960002362728119, "logits/rejected": 1.7132211923599243, "logps/chosen": -446.56378173828125, "logps/rejected": -473.88348388671875, "loss": 0.6284, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -140.1986083984375, "rewards/margins": 91.8968505859375, "rewards/rejected": -232.095458984375, "step": 1980 }, { "epoch": 0.5208060717089767, "grad_norm": 46.10962534230826, "learning_rate": 2.755413361793039e-07, "logits/chosen": 0.23405644297599792, "logits/rejected": 1.4183191061019897, "logps/chosen": -392.7248840332031, "logps/rejected": -422.74835205078125, "loss": 0.6547, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -164.8948211669922, "rewards/margins": 51.055076599121094, "rewards/rejected": -215.9499053955078, "step": 1990 }, { "epoch": 0.5234231876472127, "grad_norm": 41.84711365480925, "learning_rate": 2.7326779269254356e-07, "logits/chosen": -0.1836518496274948, "logits/rejected": 0.5241420865058899, "logps/chosen": -464.1949157714844, "logps/rejected": -582.6480102539062, "loss": 0.6145, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -143.33895874023438, "rewards/margins": 63.12605667114258, "rewards/rejected": -206.4650421142578, "step": 2000 }, { "epoch": 0.5260403035854488, "grad_norm": 83.93444593272139, "learning_rate": 2.709923063517895e-07, "logits/chosen": 0.15282206237316132, "logits/rejected": 2.0228190422058105, "logps/chosen": -439.91326904296875, "logps/rejected": -409.6415100097656, "loss": 0.6282, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -131.47225952148438, "rewards/margins": 62.43913650512695, "rewards/rejected": -193.91139221191406, "step": 2010 }, { "epoch": 0.528657419523685, "grad_norm": 32.728531816402544, "learning_rate": 2.68715067159496e-07, "logits/chosen": 0.6938730478286743, "logits/rejected": 1.1131725311279297, "logps/chosen": -420.9944763183594, "logps/rejected": -477.00750732421875, "loss": 0.6323, "rewards/accuracies": 0.75, "rewards/chosen": -132.22010803222656, "rewards/margins": 75.46855163574219, "rewards/rejected": -207.6886444091797, "step": 2020 }, { "epoch": 0.5312745354619209, "grad_norm": 18.18975295132374, "learning_rate": 2.664362652644806e-07, "logits/chosen": 0.9111794233322144, "logits/rejected": 1.23908531665802, "logps/chosen": -419.4034118652344, "logps/rejected": -420.58935546875, "loss": 0.6008, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -156.475341796875, "rewards/margins": 43.2105712890625, "rewards/rejected": -199.68592834472656, "step": 2030 }, { "epoch": 0.533891651400157, "grad_norm": 99.13883885306136, "learning_rate": 2.6415609094604555e-07, "logits/chosen": 0.37541747093200684, "logits/rejected": 1.5170711278915405, "logps/chosen": -436.736328125, "logps/rejected": -561.4639892578125, "loss": 0.6181, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -134.65609741210938, "rewards/margins": 109.64598083496094, "rewards/rejected": -244.3020477294922, "step": 2040 }, { "epoch": 0.5365087673383931, "grad_norm": 24.440438938042078, "learning_rate": 2.618747345980904e-07, "logits/chosen": 0.6494592428207397, "logits/rejected": 2.528566598892212, "logps/chosen": -452.0528259277344, "logps/rejected": -499.9761657714844, "loss": 0.6047, "rewards/accuracies": 0.875, "rewards/chosen": -124.5580062866211, "rewards/margins": 98.02928161621094, "rewards/rejected": -222.5872802734375, "step": 2050 }, { "epoch": 0.5391258832766291, "grad_norm": 41.64782826080612, "learning_rate": 2.595923867132136e-07, "logits/chosen": 2.0148017406463623, "logits/rejected": 2.202955961227417, "logps/chosen": -395.0124206542969, "logps/rejected": -520.9989013671875, "loss": 0.5933, "rewards/accuracies": 0.75, "rewards/chosen": -150.31761169433594, "rewards/margins": 89.17308807373047, "rewards/rejected": -239.49072265625, "step": 2060 }, { "epoch": 0.5417429992148652, "grad_norm": 28.897098135028244, "learning_rate": 2.5730923786680667e-07, "logits/chosen": 0.9622161984443665, "logits/rejected": 2.1357316970825195, "logps/chosen": -442.57696533203125, "logps/rejected": -503.89483642578125, "loss": 0.596, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -129.53762817382812, "rewards/margins": 110.2646713256836, "rewards/rejected": -239.8023223876953, "step": 2070 }, { "epoch": 0.5443601151531012, "grad_norm": 37.61810979984936, "learning_rate": 2.5502547870114135e-07, "logits/chosen": 0.902941882610321, "logits/rejected": 1.3675785064697266, "logps/chosen": -324.65252685546875, "logps/rejected": -408.60992431640625, "loss": 0.6666, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -123.71934509277344, "rewards/margins": 79.65061950683594, "rewards/rejected": -203.3699951171875, "step": 2080 }, { "epoch": 0.5469772310913373, "grad_norm": 22.119217806890422, "learning_rate": 2.527412999094506e-07, "logits/chosen": 0.31701117753982544, "logits/rejected": 1.13541579246521, "logps/chosen": -423.2291564941406, "logps/rejected": -473.520263671875, "loss": 0.6529, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -144.65245056152344, "rewards/margins": 78.18202209472656, "rewards/rejected": -222.83447265625, "step": 2090 }, { "epoch": 0.5495943470295734, "grad_norm": 12.04734469718736, "learning_rate": 2.5045689222000636e-07, "logits/chosen": -0.7169623374938965, "logits/rejected": -0.41885489225387573, "logps/chosen": -376.8286437988281, "logps/rejected": -467.8668518066406, "loss": 0.6003, "rewards/accuracies": 0.875, "rewards/chosen": -99.94172668457031, "rewards/margins": 90.81985473632812, "rewards/rejected": -190.76156616210938, "step": 2100 }, { "epoch": 0.5522114629678094, "grad_norm": 45.74976103694579, "learning_rate": 2.481724463801933e-07, "logits/chosen": -0.09558014571666718, "logits/rejected": 1.1860709190368652, "logps/chosen": -360.15411376953125, "logps/rejected": -399.21685791015625, "loss": 0.6332, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -110.9947738647461, "rewards/margins": 68.16171264648438, "rewards/rejected": -179.15647888183594, "step": 2110 }, { "epoch": 0.5548285789060455, "grad_norm": 30.058542677557412, "learning_rate": 2.4588815314058154e-07, "logits/chosen": -0.4582470953464508, "logits/rejected": 1.3120964765548706, "logps/chosen": -421.02655029296875, "logps/rejected": -456.4002380371094, "loss": 0.6392, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -84.15247344970703, "rewards/margins": 106.6784439086914, "rewards/rejected": -190.83090209960938, "step": 2120 }, { "epoch": 0.5574456948442816, "grad_norm": 24.331335326894454, "learning_rate": 2.4360420323899917e-07, "logits/chosen": 0.15577450394630432, "logits/rejected": 1.3761570453643799, "logps/chosen": -350.9698791503906, "logps/rejected": -469.49090576171875, "loss": 0.5898, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -98.12188720703125, "rewards/margins": 96.63127136230469, "rewards/rejected": -194.75314331054688, "step": 2130 }, { "epoch": 0.5600628107825176, "grad_norm": 35.87275267194999, "learning_rate": 2.4132078738460583e-07, "logits/chosen": 0.9583314061164856, "logits/rejected": 1.5858229398727417, "logps/chosen": -399.7995910644531, "logps/rejected": -457.26116943359375, "loss": 0.6056, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -126.67093658447266, "rewards/margins": 60.547874450683594, "rewards/rejected": -187.21881103515625, "step": 2140 }, { "epoch": 0.5626799267207537, "grad_norm": 39.761293537479226, "learning_rate": 2.390380962419682e-07, "logits/chosen": 0.7930958867073059, "logits/rejected": 2.482306957244873, "logps/chosen": -334.648193359375, "logps/rejected": -446.80438232421875, "loss": 0.6153, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -103.2480239868164, "rewards/margins": 119.85728454589844, "rewards/rejected": -223.1052703857422, "step": 2150 }, { "epoch": 0.5652970426589898, "grad_norm": 31.853624067752015, "learning_rate": 2.3675632041513977e-07, "logits/chosen": 0.531602680683136, "logits/rejected": 1.801783561706543, "logps/chosen": -399.61419677734375, "logps/rejected": -508.2198791503906, "loss": 0.6153, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -125.03511810302734, "rewards/margins": 88.95806121826172, "rewards/rejected": -213.99319458007812, "step": 2160 }, { "epoch": 0.5679141585972258, "grad_norm": 33.621655259329046, "learning_rate": 2.344756504317453e-07, "logits/chosen": 0.4952963888645172, "logits/rejected": 2.3254141807556152, "logps/chosen": -363.47210693359375, "logps/rejected": -426.3758850097656, "loss": 0.6349, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -104.16854095458984, "rewards/margins": 107.5788803100586, "rewards/rejected": -211.7474365234375, "step": 2170 }, { "epoch": 0.5705312745354619, "grad_norm": 36.50906825096521, "learning_rate": 2.3219627672707237e-07, "logits/chosen": -0.048846520483493805, "logits/rejected": 1.8363500833511353, "logps/chosen": -395.2628479003906, "logps/rejected": -441.28546142578125, "loss": 0.6398, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -83.27033996582031, "rewards/margins": 108.64981842041016, "rewards/rejected": -191.920166015625, "step": 2180 }, { "epoch": 0.573148390473698, "grad_norm": 27.414128348892337, "learning_rate": 2.2991838962816918e-07, "logits/chosen": -0.003201258135959506, "logits/rejected": 1.6773014068603516, "logps/chosen": -340.6254577636719, "logps/rejected": -424.5982360839844, "loss": 0.5947, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -71.3757095336914, "rewards/margins": 119.04563903808594, "rewards/rejected": -190.42135620117188, "step": 2190 }, { "epoch": 0.575765506411934, "grad_norm": 20.12881929325743, "learning_rate": 2.2764217933795297e-07, "logits/chosen": 0.325003445148468, "logits/rejected": 2.2267208099365234, "logps/chosen": -318.68359375, "logps/rejected": -379.50787353515625, "loss": 0.5855, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -84.79862976074219, "rewards/margins": 93.99121856689453, "rewards/rejected": -178.7898406982422, "step": 2200 }, { "epoch": 0.5783826223501701, "grad_norm": 28.1067572997552, "learning_rate": 2.253678359193278e-07, "logits/chosen": 0.6280830502510071, "logits/rejected": 2.3489346504211426, "logps/chosen": -494.9659729003906, "logps/rejected": -529.9819946289062, "loss": 0.6309, "rewards/accuracies": 0.75, "rewards/chosen": -138.376220703125, "rewards/margins": 81.79132843017578, "rewards/rejected": -220.1675262451172, "step": 2210 }, { "epoch": 0.5809997382884062, "grad_norm": 19.494344171492532, "learning_rate": 2.230955492793149e-07, "logits/chosen": 1.8268368244171143, "logits/rejected": 2.58113956451416, "logps/chosen": -332.82220458984375, "logps/rejected": -433.9080505371094, "loss": 0.6259, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -130.2157745361328, "rewards/margins": 85.47044372558594, "rewards/rejected": -215.6862335205078, "step": 2220 }, { "epoch": 0.5836168542266422, "grad_norm": 51.63490469991745, "learning_rate": 2.2082550915319468e-07, "logits/chosen": 0.8697152137756348, "logits/rejected": 1.921521782875061, "logps/chosen": -420.7347106933594, "logps/rejected": -502.9576110839844, "loss": 0.6465, "rewards/accuracies": 0.75, "rewards/chosen": -123.50699615478516, "rewards/margins": 93.5182113647461, "rewards/rejected": -217.0252227783203, "step": 2230 }, { "epoch": 0.5862339701648783, "grad_norm": 32.248455012221434, "learning_rate": 2.1855790508866433e-07, "logits/chosen": 0.9172471761703491, "logits/rejected": 2.2043850421905518, "logps/chosen": -384.3970031738281, "logps/rejected": -411.32373046875, "loss": 0.6135, "rewards/accuracies": 0.75, "rewards/chosen": -109.10707092285156, "rewards/margins": 77.65843200683594, "rewards/rejected": -186.76548767089844, "step": 2240 }, { "epoch": 0.5888510861031143, "grad_norm": 34.745998393806474, "learning_rate": 2.162929264300107e-07, "logits/chosen": 1.7178211212158203, "logits/rejected": 1.943967580795288, "logps/chosen": -419.106689453125, "logps/rejected": -513.5217895507812, "loss": 0.619, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -154.78704833984375, "rewards/margins": 75.63910675048828, "rewards/rejected": -230.4261474609375, "step": 2250 }, { "epoch": 0.5914682020413504, "grad_norm": 39.28692190563846, "learning_rate": 2.1403076230230005e-07, "logits/chosen": 0.5459718108177185, "logits/rejected": 1.7720600366592407, "logps/chosen": -366.18939208984375, "logps/rejected": -477.49725341796875, "loss": 0.6097, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -132.69638061523438, "rewards/margins": 80.7729263305664, "rewards/rejected": -213.4693145751953, "step": 2260 }, { "epoch": 0.5940853179795865, "grad_norm": 35.79367820795886, "learning_rate": 2.1177160159558596e-07, "logits/chosen": 0.2836107313632965, "logits/rejected": 1.257423758506775, "logps/chosen": -405.7095947265625, "logps/rejected": -575.5140991210938, "loss": 0.5878, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -130.59938049316406, "rewards/margins": 115.3625717163086, "rewards/rejected": -245.9619598388672, "step": 2270 }, { "epoch": 0.5967024339178225, "grad_norm": 43.109152856420366, "learning_rate": 2.0951563294913734e-07, "logits/chosen": 0.8286579847335815, "logits/rejected": 1.5607203245162964, "logps/chosen": -433.5328674316406, "logps/rejected": -459.05615234375, "loss": 0.6721, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -121.0738296508789, "rewards/margins": 54.7202262878418, "rewards/rejected": -175.7940673828125, "step": 2280 }, { "epoch": 0.5993195498560586, "grad_norm": 44.27442075290467, "learning_rate": 2.072630447356869e-07, "logits/chosen": 1.2522351741790771, "logits/rejected": 2.194676160812378, "logps/chosen": -375.2573547363281, "logps/rejected": -454.4563903808594, "loss": 0.6213, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -133.56240844726562, "rewards/margins": 69.3228759765625, "rewards/rejected": -202.8852996826172, "step": 2290 }, { "epoch": 0.6019366657942947, "grad_norm": 35.873403321724766, "learning_rate": 2.0501402504570232e-07, "logits/chosen": 1.2167752981185913, "logits/rejected": 2.3980419635772705, "logps/chosen": -382.34857177734375, "logps/rejected": -458.5743713378906, "loss": 0.6282, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -136.52151489257812, "rewards/margins": 77.73162841796875, "rewards/rejected": -214.253173828125, "step": 2300 }, { "epoch": 0.6045537817325307, "grad_norm": 35.83497109520107, "learning_rate": 2.027687616716804e-07, "logits/chosen": 0.8540589213371277, "logits/rejected": 1.2983033657073975, "logps/chosen": -522.3756713867188, "logps/rejected": -499.83544921875, "loss": 0.6397, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -161.60308837890625, "rewards/margins": 24.720584869384766, "rewards/rejected": -186.3236541748047, "step": 2310 }, { "epoch": 0.6071708976707668, "grad_norm": 21.18793615252519, "learning_rate": 2.005274420924668e-07, "logits/chosen": 0.7288818359375, "logits/rejected": 2.1675353050231934, "logps/chosen": -509.79248046875, "logps/rejected": -634.5296020507812, "loss": 0.634, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -152.21360778808594, "rewards/margins": 104.65443420410156, "rewards/rejected": -256.8680114746094, "step": 2320 }, { "epoch": 0.6097880136090029, "grad_norm": 22.50228145741483, "learning_rate": 1.9829025345760121e-07, "logits/chosen": 0.6618310213088989, "logits/rejected": 2.1998677253723145, "logps/chosen": -411.01348876953125, "logps/rejected": -443.0804138183594, "loss": 0.5998, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -146.52345275878906, "rewards/margins": 60.154937744140625, "rewards/rejected": -206.6783905029297, "step": 2330 }, { "epoch": 0.6124051295472389, "grad_norm": 25.601857295410756, "learning_rate": 1.960573825716911e-07, "logits/chosen": 0.7051969766616821, "logits/rejected": 1.6920392513275146, "logps/chosen": -393.4776306152344, "logps/rejected": -550.2787475585938, "loss": 0.6275, "rewards/accuracies": 0.875, "rewards/chosen": -121.97821044921875, "rewards/margins": 120.07755279541016, "rewards/rejected": -242.0557403564453, "step": 2340 }, { "epoch": 0.615022245485475, "grad_norm": 19.213556153517377, "learning_rate": 1.9382901587881273e-07, "logits/chosen": 1.1434944868087769, "logits/rejected": 1.3964121341705322, "logps/chosen": -384.5157470703125, "logps/rejected": -480.2007751464844, "loss": 0.594, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -120.44758605957031, "rewards/margins": 86.69746398925781, "rewards/rejected": -207.1450653076172, "step": 2350 }, { "epoch": 0.6176393614237111, "grad_norm": 25.557737674180782, "learning_rate": 1.9160533944694364e-07, "logits/chosen": 0.3543923497200012, "logits/rejected": 1.8675868511199951, "logps/chosen": -421.5625, "logps/rejected": -434.99102783203125, "loss": 0.6164, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -150.8063507080078, "rewards/margins": 53.77281951904297, "rewards/rejected": -204.5791473388672, "step": 2360 }, { "epoch": 0.6202564773619471, "grad_norm": 158.85223594365854, "learning_rate": 1.8938653895242602e-07, "logits/chosen": 1.4026604890823364, "logits/rejected": 2.489152193069458, "logps/chosen": -349.5987854003906, "logps/rejected": -491.1888732910156, "loss": 0.605, "rewards/accuracies": 0.875, "rewards/chosen": -127.90867614746094, "rewards/margins": 113.81087493896484, "rewards/rejected": -241.7195587158203, "step": 2370 }, { "epoch": 0.6228735933001832, "grad_norm": 25.94527480965663, "learning_rate": 1.8717279966446264e-07, "logits/chosen": 0.9932606816291809, "logits/rejected": 1.2095084190368652, "logps/chosen": -425.6875, "logps/rejected": -495.3365173339844, "loss": 0.6252, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -143.76849365234375, "rewards/margins": 48.57000732421875, "rewards/rejected": -192.3385009765625, "step": 2380 }, { "epoch": 0.6254907092384192, "grad_norm": 35.86272517082607, "learning_rate": 1.8496430642964694e-07, "logits/chosen": 1.6592071056365967, "logits/rejected": 3.2019355297088623, "logps/chosen": -449.941650390625, "logps/rejected": -503.06689453125, "loss": 0.6355, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -154.45082092285156, "rewards/margins": 97.35575866699219, "rewards/rejected": -251.8065948486328, "step": 2390 }, { "epoch": 0.6281078251766553, "grad_norm": 33.36347925046869, "learning_rate": 1.8276124365652855e-07, "logits/chosen": 1.782403588294983, "logits/rejected": 2.6725101470947266, "logps/chosen": -414.8662109375, "logps/rejected": -513.3907470703125, "loss": 0.617, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -152.33621215820312, "rewards/margins": 84.60549926757812, "rewards/rejected": -236.9416961669922, "step": 2400 }, { "epoch": 0.6307249411148914, "grad_norm": 27.211887112083538, "learning_rate": 1.805637953002149e-07, "logits/chosen": 1.0229891538619995, "logits/rejected": 2.227069616317749, "logps/chosen": -412.5189514160156, "logps/rejected": -546.1232299804688, "loss": 0.6067, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -142.4264373779297, "rewards/margins": 105.69502258300781, "rewards/rejected": -248.12142944335938, "step": 2410 }, { "epoch": 0.6333420570531274, "grad_norm": 34.88818302440242, "learning_rate": 1.7837214484701153e-07, "logits/chosen": 1.1308300495147705, "logits/rejected": 1.0798611640930176, "logps/chosen": -440.8893127441406, "logps/rejected": -540.1508178710938, "loss": 0.6435, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -157.52186584472656, "rewards/margins": 75.15589904785156, "rewards/rejected": -232.677734375, "step": 2420 }, { "epoch": 0.6359591729913635, "grad_norm": 38.54161473588103, "learning_rate": 1.761864752991004e-07, "logits/chosen": 0.743876576423645, "logits/rejected": 2.1762986183166504, "logps/chosen": -427.7271423339844, "logps/rejected": -495.25909423828125, "loss": 0.6099, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -151.75076293945312, "rewards/margins": 84.32628631591797, "rewards/rejected": -236.0770263671875, "step": 2430 }, { "epoch": 0.6385762889295996, "grad_norm": 30.683794106965227, "learning_rate": 1.7400696915925995e-07, "logits/chosen": 0.9094554781913757, "logits/rejected": 2.102067708969116, "logps/chosen": -393.0418395996094, "logps/rejected": -448.5322265625, "loss": 0.6029, "rewards/accuracies": 0.875, "rewards/chosen": -150.30026245117188, "rewards/margins": 81.52594757080078, "rewards/rejected": -231.8262176513672, "step": 2440 }, { "epoch": 0.6411934048678356, "grad_norm": 120.87900499444974, "learning_rate": 1.718338084156254e-07, "logits/chosen": 0.7895253300666809, "logits/rejected": 1.4243078231811523, "logps/chosen": -478.90643310546875, "logps/rejected": -506.8853454589844, "loss": 0.6337, "rewards/accuracies": 0.75, "rewards/chosen": -147.2356414794922, "rewards/margins": 72.50621032714844, "rewards/rejected": -219.74185180664062, "step": 2450 }, { "epoch": 0.6438105208060717, "grad_norm": 43.85428932744377, "learning_rate": 1.696671745264937e-07, "logits/chosen": 0.8812819719314575, "logits/rejected": 2.2113723754882812, "logps/chosen": -512.0079345703125, "logps/rejected": -543.0887451171875, "loss": 0.6242, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -182.76455688476562, "rewards/margins": 80.2233657836914, "rewards/rejected": -262.9878845214844, "step": 2460 }, { "epoch": 0.6464276367443078, "grad_norm": 18.352614590491243, "learning_rate": 1.67507248405171e-07, "logits/chosen": 0.06380853801965714, "logits/rejected": 1.635575532913208, "logps/chosen": -411.78948974609375, "logps/rejected": -411.417724609375, "loss": 0.6258, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -135.79331970214844, "rewards/margins": 60.68121337890625, "rewards/rejected": -196.4745330810547, "step": 2470 }, { "epoch": 0.6490447526825438, "grad_norm": 27.292806167535627, "learning_rate": 1.6535421040486683e-07, "logits/chosen": 0.4601976275444031, "logits/rejected": 1.9244674444198608, "logps/chosen": -420.33978271484375, "logps/rejected": -493.8761291503906, "loss": 0.5955, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -144.494384765625, "rewards/margins": 96.97511291503906, "rewards/rejected": -241.469482421875, "step": 2480 }, { "epoch": 0.6516618686207799, "grad_norm": 44.574204509093036, "learning_rate": 1.6320824030363456e-07, "logits/chosen": 1.7913410663604736, "logits/rejected": 3.195528984069824, "logps/chosen": -437.32464599609375, "logps/rejected": -459.4554748535156, "loss": 0.6501, "rewards/accuracies": 0.625, "rewards/chosen": -181.6840362548828, "rewards/margins": 56.456878662109375, "rewards/rejected": -238.14089965820312, "step": 2490 }, { "epoch": 0.654278984559016, "grad_norm": 24.375749373782856, "learning_rate": 1.6106951728936024e-07, "logits/chosen": 0.45715102553367615, "logits/rejected": 1.6011425256729126, "logps/chosen": -395.44354248046875, "logps/rejected": -513.138916015625, "loss": 0.6131, "rewards/accuracies": 0.625, "rewards/chosen": -115.36625671386719, "rewards/margins": 79.81534576416016, "rewards/rejected": -195.18161010742188, "step": 2500 }, { "epoch": 0.656896100497252, "grad_norm": 21.478682367181154, "learning_rate": 1.5893821994479994e-07, "logits/chosen": 0.34590616822242737, "logits/rejected": 2.354616165161133, "logps/chosen": -446.23626708984375, "logps/rejected": -481.48394775390625, "loss": 0.6221, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -119.68449401855469, "rewards/margins": 107.2177963256836, "rewards/rejected": -226.9022979736328, "step": 2510 }, { "epoch": 0.6595132164354881, "grad_norm": 30.65322954729203, "learning_rate": 1.5681452623266867e-07, "logits/chosen": 1.2941632270812988, "logits/rejected": 2.3193411827087402, "logps/chosen": -414.8558044433594, "logps/rejected": -468.5655212402344, "loss": 0.6167, "rewards/accuracies": 0.625, "rewards/chosen": -155.61769104003906, "rewards/margins": 55.00055694580078, "rewards/rejected": -210.6182403564453, "step": 2520 }, { "epoch": 0.6621303323737242, "grad_norm": 49.039161022991905, "learning_rate": 1.546986134807801e-07, "logits/chosen": 1.6722854375839233, "logits/rejected": 2.5984983444213867, "logps/chosen": -361.4550476074219, "logps/rejected": -491.7296447753906, "loss": 0.6123, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -129.01124572753906, "rewards/margins": 128.69967651367188, "rewards/rejected": -257.7109069824219, "step": 2530 }, { "epoch": 0.6647474483119602, "grad_norm": 34.85133335462039, "learning_rate": 1.5259065836724034e-07, "logits/chosen": 1.0750457048416138, "logits/rejected": 1.3905805349349976, "logps/chosen": -355.76092529296875, "logps/rejected": -473.81561279296875, "loss": 0.6037, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -121.11027526855469, "rewards/margins": 98.64629364013672, "rewards/rejected": -219.756591796875, "step": 2540 }, { "epoch": 0.6673645642501963, "grad_norm": 18.55982939590747, "learning_rate": 1.5049083690569454e-07, "logits/chosen": 1.4674863815307617, "logits/rejected": 3.273242473602295, "logps/chosen": -380.1376953125, "logps/rejected": -481.0174865722656, "loss": 0.5796, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -130.4925079345703, "rewards/margins": 113.21549224853516, "rewards/rejected": -243.7080078125, "step": 2550 }, { "epoch": 0.6699816801884323, "grad_norm": 28.676002339732538, "learning_rate": 1.4839932443063056e-07, "logits/chosen": 0.23872914910316467, "logits/rejected": 1.4117581844329834, "logps/chosen": -429.30914306640625, "logps/rejected": -487.3106384277344, "loss": 0.6481, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -131.02191162109375, "rewards/margins": 91.13362121582031, "rewards/rejected": -222.15554809570312, "step": 2560 }, { "epoch": 0.6725987961266684, "grad_norm": 54.9929297824748, "learning_rate": 1.46316295582738e-07, "logits/chosen": 1.2317416667938232, "logits/rejected": 2.6543679237365723, "logps/chosen": -458.9169921875, "logps/rejected": -479.111083984375, "loss": 0.6409, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -157.216796875, "rewards/margins": 83.09622955322266, "rewards/rejected": -240.3130340576172, "step": 2570 }, { "epoch": 0.6752159120649045, "grad_norm": 23.433586682506476, "learning_rate": 1.4424192429432655e-07, "logits/chosen": 0.5045792460441589, "logits/rejected": 2.09639310836792, "logps/chosen": -445.75299072265625, "logps/rejected": -484.1240234375, "loss": 0.6155, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -135.1338653564453, "rewards/margins": 92.08915710449219, "rewards/rejected": -227.22299194335938, "step": 2580 }, { "epoch": 0.6778330280031405, "grad_norm": 23.583509411900305, "learning_rate": 1.4217638377480158e-07, "logits/chosen": 0.10572358220815659, "logits/rejected": 1.3806489706039429, "logps/chosen": -400.96270751953125, "logps/rejected": -460.14117431640625, "loss": 0.6009, "rewards/accuracies": 0.75, "rewards/chosen": -113.3682632446289, "rewards/margins": 80.95018005371094, "rewards/rejected": -194.31845092773438, "step": 2590 }, { "epoch": 0.6804501439413766, "grad_norm": 40.40953868443985, "learning_rate": 1.401198464962021e-07, "logits/chosen": 0.2016058713197708, "logits/rejected": 1.130928874015808, "logps/chosen": -361.8517150878906, "logps/rejected": -494.3812561035156, "loss": 0.6294, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -103.8023452758789, "rewards/margins": 101.51799011230469, "rewards/rejected": -205.32034301757812, "step": 2600 }, { "epoch": 0.6830672598796127, "grad_norm": 39.94792054480128, "learning_rate": 1.3807248417879894e-07, "logits/chosen": 0.6115899085998535, "logits/rejected": 1.6156005859375, "logps/chosen": -307.701904296875, "logps/rejected": -444.3831481933594, "loss": 0.6089, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -114.9823226928711, "rewards/margins": 100.7208023071289, "rewards/rejected": -215.703125, "step": 2610 }, { "epoch": 0.6856843758178487, "grad_norm": 36.696272333131326, "learning_rate": 1.3603446777675665e-07, "logits/chosen": -0.2377341240644455, "logits/rejected": 1.0026599168777466, "logps/chosen": -350.7626953125, "logps/rejected": -412.9532775878906, "loss": 0.6097, "rewards/accuracies": 0.75, "rewards/chosen": -105.99015045166016, "rewards/margins": 73.70530700683594, "rewards/rejected": -179.6954803466797, "step": 2620 }, { "epoch": 0.6883014917560848, "grad_norm": 39.398684962283816, "learning_rate": 1.3400596746385814e-07, "logits/chosen": 0.5835520029067993, "logits/rejected": 1.5241050720214844, "logps/chosen": -417.64990234375, "logps/rejected": -531.836669921875, "loss": 0.6309, "rewards/accuracies": 0.75, "rewards/chosen": -144.4230499267578, "rewards/margins": 75.8773193359375, "rewards/rejected": -220.3003692626953, "step": 2630 }, { "epoch": 0.6909186076943209, "grad_norm": 44.431457896844876, "learning_rate": 1.3198715261929586e-07, "logits/chosen": 0.36666786670684814, "logits/rejected": 1.6828514337539673, "logps/chosen": -456.6180114746094, "logps/rejected": -464.532958984375, "loss": 0.636, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -151.42739868164062, "rewards/margins": 59.905517578125, "rewards/rejected": -211.3329315185547, "step": 2640 }, { "epoch": 0.6935357236325569, "grad_norm": 25.17979333945541, "learning_rate": 1.299781918135282e-07, "logits/chosen": 0.24016058444976807, "logits/rejected": 1.928510069847107, "logps/chosen": -420.1914978027344, "logps/rejected": -453.20892333984375, "loss": 0.6323, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -144.0685577392578, "rewards/margins": 97.08206176757812, "rewards/rejected": -241.15060424804688, "step": 2650 }, { "epoch": 0.696152839570793, "grad_norm": 31.657621753900987, "learning_rate": 1.279792527942045e-07, "logits/chosen": -0.0180420633405447, "logits/rejected": 2.3416850566864014, "logps/chosen": -406.87652587890625, "logps/rejected": -449.5647888183594, "loss": 0.6203, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -133.4667510986328, "rewards/margins": 90.72755432128906, "rewards/rejected": -224.19430541992188, "step": 2660 }, { "epoch": 0.6987699555090291, "grad_norm": 49.26258187961619, "learning_rate": 1.259905024721576e-07, "logits/chosen": 0.3727002739906311, "logits/rejected": 2.0285167694091797, "logps/chosen": -398.2438659667969, "logps/rejected": -459.0472106933594, "loss": 0.6157, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -122.77784729003906, "rewards/margins": 113.05708312988281, "rewards/rejected": -235.83493041992188, "step": 2670 }, { "epoch": 0.7013870714472651, "grad_norm": 83.37635034432546, "learning_rate": 1.2401210690746703e-07, "logits/chosen": 0.1346236914396286, "logits/rejected": 1.083974838256836, "logps/chosen": -421.3680725097656, "logps/rejected": -470.74627685546875, "loss": 0.6079, "rewards/accuracies": 0.75, "rewards/chosen": -139.5249481201172, "rewards/margins": 49.866912841796875, "rewards/rejected": -189.39187622070312, "step": 2680 }, { "epoch": 0.7040041873855012, "grad_norm": 37.05947965910841, "learning_rate": 1.2204423129559305e-07, "logits/chosen": 0.9880054593086243, "logits/rejected": 2.4434711933135986, "logps/chosen": -341.87005615234375, "logps/rejected": -409.3817443847656, "loss": 0.635, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -123.16813659667969, "rewards/margins": 77.82776641845703, "rewards/rejected": -200.9958953857422, "step": 2690 }, { "epoch": 0.7066213033237373, "grad_norm": 41.84013190369576, "learning_rate": 1.2008703995358299e-07, "logits/chosen": 2.119760274887085, "logits/rejected": 2.1831414699554443, "logps/chosen": -363.73211669921875, "logps/rejected": -506.10137939453125, "loss": 0.6087, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -136.8468475341797, "rewards/margins": 97.38895416259766, "rewards/rejected": -234.23580932617188, "step": 2700 }, { "epoch": 0.7092384192619733, "grad_norm": 31.86581150130641, "learning_rate": 1.1814069630635068e-07, "logits/chosen": 0.14392466843128204, "logits/rejected": 1.7527008056640625, "logps/chosen": -454.44549560546875, "logps/rejected": -468.8114318847656, "loss": 0.6283, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -157.68173217773438, "rewards/margins": 61.41668701171875, "rewards/rejected": -219.09841918945312, "step": 2710 }, { "epoch": 0.7118555352002094, "grad_norm": 18.081063280078112, "learning_rate": 1.1620536287303051e-07, "logits/chosen": 1.341851830482483, "logits/rejected": 2.280597448348999, "logps/chosen": -310.7570495605469, "logps/rejected": -436.3260192871094, "loss": 0.6282, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -132.96395874023438, "rewards/margins": 78.2131118774414, "rewards/rejected": -211.1770782470703, "step": 2720 }, { "epoch": 0.7144726511384454, "grad_norm": 39.45453883471653, "learning_rate": 1.1428120125340716e-07, "logits/chosen": 0.7364514470100403, "logits/rejected": 2.3780157566070557, "logps/chosen": -415.115966796875, "logps/rejected": -506.86932373046875, "loss": 0.6397, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -133.8503875732422, "rewards/margins": 97.6797866821289, "rewards/rejected": -231.5301971435547, "step": 2730 }, { "epoch": 0.7170897670766815, "grad_norm": 28.535318320593547, "learning_rate": 1.123683721144223e-07, "logits/chosen": 0.23405185341835022, "logits/rejected": 1.1598809957504272, "logps/chosen": -408.23394775390625, "logps/rejected": -488.3985290527344, "loss": 0.6089, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -129.1389923095703, "rewards/margins": 65.66856384277344, "rewards/rejected": -194.80755615234375, "step": 2740 }, { "epoch": 0.7197068830149176, "grad_norm": 57.03110684643374, "learning_rate": 1.1046703517675845e-07, "logits/chosen": 0.005275917239487171, "logits/rejected": 1.8837101459503174, "logps/chosen": -454.74847412109375, "logps/rejected": -430.2967834472656, "loss": 0.6373, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -123.52076721191406, "rewards/margins": 77.50233459472656, "rewards/rejected": -201.02310180664062, "step": 2750 }, { "epoch": 0.7223239989531536, "grad_norm": 16.31469315081151, "learning_rate": 1.085773492015028e-07, "logits/chosen": -0.029287749901413918, "logits/rejected": 0.9784728288650513, "logps/chosen": -424.54364013671875, "logps/rejected": -489.2088928222656, "loss": 0.6089, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -115.7852783203125, "rewards/margins": 62.044944763183594, "rewards/rejected": -177.83023071289062, "step": 2760 }, { "epoch": 0.7249411148913897, "grad_norm": 27.954300283078403, "learning_rate": 1.0669947197689033e-07, "logits/chosen": 0.25381985306739807, "logits/rejected": 1.3894766569137573, "logps/chosen": -375.91339111328125, "logps/rejected": -428.8695373535156, "loss": 0.597, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -127.7160415649414, "rewards/margins": 75.71189880371094, "rewards/rejected": -203.42791748046875, "step": 2770 }, { "epoch": 0.7275582308296258, "grad_norm": 32.24312345073874, "learning_rate": 1.048335603051291e-07, "logits/chosen": 0.8968909382820129, "logits/rejected": 1.4077928066253662, "logps/chosen": -442.5194396972656, "logps/rejected": -486.0824279785156, "loss": 0.6251, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -123.14749908447266, "rewards/margins": 78.89585876464844, "rewards/rejected": -202.04335021972656, "step": 2780 }, { "epoch": 0.7301753467678618, "grad_norm": 36.23128514010418, "learning_rate": 1.0297976998930663e-07, "logits/chosen": 0.7249239683151245, "logits/rejected": 2.3154051303863525, "logps/chosen": -425.0227966308594, "logps/rejected": -466.5647888183594, "loss": 0.6061, "rewards/accuracies": 0.75, "rewards/chosen": -108.95176696777344, "rewards/margins": 75.4349365234375, "rewards/rejected": -184.38671875, "step": 2790 }, { "epoch": 0.7327924627060979, "grad_norm": 35.159964513238755, "learning_rate": 1.0113825582038077e-07, "logits/chosen": 0.4708864688873291, "logits/rejected": 3.2727150917053223, "logps/chosen": -410.4869689941406, "logps/rejected": -432.30523681640625, "loss": 0.6079, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -114.40809631347656, "rewards/margins": 107.46346282958984, "rewards/rejected": -221.87158203125, "step": 2800 }, { "epoch": 0.735409578644334, "grad_norm": 71.25724626710895, "learning_rate": 9.930917156425475e-08, "logits/chosen": 0.4397871494293213, "logits/rejected": 2.1435513496398926, "logps/chosen": -392.5745849609375, "logps/rejected": -425.1902770996094, "loss": 0.6393, "rewards/accuracies": 0.75, "rewards/chosen": -96.77259826660156, "rewards/margins": 102.71319580078125, "rewards/rejected": -199.4857940673828, "step": 2810 }, { "epoch": 0.73802669458257, "grad_norm": 44.71807626624307, "learning_rate": 9.749266994893754e-08, "logits/chosen": 0.3410795032978058, "logits/rejected": 1.3199317455291748, "logps/chosen": -358.898681640625, "logps/rejected": -448.3935546875, "loss": 0.616, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -127.18278503417969, "rewards/margins": 72.87733459472656, "rewards/rejected": -200.0601043701172, "step": 2820 }, { "epoch": 0.7406438105208061, "grad_norm": 30.761613537091247, "learning_rate": 9.568890265179128e-08, "logits/chosen": 0.4254278540611267, "logits/rejected": 1.124775767326355, "logps/chosen": -352.58270263671875, "logps/rejected": -433.5701599121094, "loss": 0.6495, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -125.1341781616211, "rewards/margins": 84.12373352050781, "rewards/rejected": -209.2578887939453, "step": 2830 }, { "epoch": 0.7432609264590422, "grad_norm": 49.88197491305349, "learning_rate": 9.389802028686616e-08, "logits/chosen": -0.025458145886659622, "logits/rejected": 1.2856556177139282, "logps/chosen": -383.7842712402344, "logps/rejected": -466.41998291015625, "loss": 0.6271, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -114.00410461425781, "rewards/margins": 86.70926666259766, "rewards/rejected": -200.71334838867188, "step": 2840 }, { "epoch": 0.7458780423972782, "grad_norm": 27.585109058465484, "learning_rate": 9.212017239232426e-08, "logits/chosen": 0.7934707403182983, "logits/rejected": 1.2405365705490112, "logps/chosen": -366.0028991699219, "logps/rejected": -449.9388732910156, "loss": 0.6256, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -117.50825500488281, "rewards/margins": 56.368919372558594, "rewards/rejected": -173.87716674804688, "step": 2850 }, { "epoch": 0.7484951583355143, "grad_norm": 19.174565551603006, "learning_rate": 9.035550741795328e-08, "logits/chosen": 0.19867125153541565, "logits/rejected": 1.6759397983551025, "logps/chosen": -413.1884765625, "logps/rejected": -487.4052734375, "loss": 0.6097, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -103.04798889160156, "rewards/margins": 111.28741455078125, "rewards/rejected": -214.3354034423828, "step": 2860 }, { "epoch": 0.7511122742737504, "grad_norm": 16.501011315290544, "learning_rate": 8.860417271277065e-08, "logits/chosen": 0.44556722044944763, "logits/rejected": 1.7269704341888428, "logps/chosen": -358.22796630859375, "logps/rejected": -463.13092041015625, "loss": 0.6076, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -114.99739837646484, "rewards/margins": 107.62504577636719, "rewards/rejected": -222.62246704101562, "step": 2870 }, { "epoch": 0.7537293902119864, "grad_norm": 16.630600643438857, "learning_rate": 8.686631451272029e-08, "logits/chosen": 0.36867132782936096, "logits/rejected": 1.5242793560028076, "logps/chosen": -487.36016845703125, "logps/rejected": -469.58074951171875, "loss": 0.6173, "rewards/accuracies": 0.75, "rewards/chosen": -132.63406372070312, "rewards/margins": 62.797386169433594, "rewards/rejected": -195.43142700195312, "step": 2880 }, { "epoch": 0.7563465061502225, "grad_norm": 19.17394858355595, "learning_rate": 8.514207792846168e-08, "logits/chosen": 0.8502219319343567, "logits/rejected": 1.6160399913787842, "logps/chosen": -342.61981201171875, "logps/rejected": -443.2942810058594, "loss": 0.602, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -99.26615905761719, "rewards/margins": 106.50407409667969, "rewards/rejected": -205.7702178955078, "step": 2890 }, { "epoch": 0.7589636220884585, "grad_norm": 26.890168448182035, "learning_rate": 8.343160693325355e-08, "logits/chosen": 1.1203569173812866, "logits/rejected": 2.6382992267608643, "logps/chosen": -440.35699462890625, "logps/rejected": -502.62176513671875, "loss": 0.6176, "rewards/accuracies": 0.75, "rewards/chosen": -144.5258026123047, "rewards/margins": 112.56309509277344, "rewards/rejected": -257.0888977050781, "step": 2900 }, { "epoch": 0.7615807380266946, "grad_norm": 43.572658787963654, "learning_rate": 8.173504435093173e-08, "logits/chosen": 0.8563163876533508, "logits/rejected": 3.0953686237335205, "logps/chosen": -408.85089111328125, "logps/rejected": -474.3164978027344, "loss": 0.6352, "rewards/accuracies": 0.75, "rewards/chosen": -127.74039459228516, "rewards/margins": 110.32560729980469, "rewards/rejected": -238.0659942626953, "step": 2910 }, { "epoch": 0.7641978539649307, "grad_norm": 25.30166750340952, "learning_rate": 8.005253184398359e-08, "logits/chosen": 1.089434266090393, "logits/rejected": 2.4163076877593994, "logps/chosen": -365.9088439941406, "logps/rejected": -480.734130859375, "loss": 0.6251, "rewards/accuracies": 0.75, "rewards/chosen": -120.29222106933594, "rewards/margins": 89.46209716796875, "rewards/rejected": -209.7543182373047, "step": 2920 }, { "epoch": 0.7668149699031667, "grad_norm": 34.78404791298794, "learning_rate": 7.838420990171926e-08, "logits/chosen": 0.16231641173362732, "logits/rejected": 1.6783320903778076, "logps/chosen": -398.6322021484375, "logps/rejected": -470.6861267089844, "loss": 0.6007, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -111.038330078125, "rewards/margins": 101.57747650146484, "rewards/rejected": -212.6157989501953, "step": 2930 }, { "epoch": 0.7694320858414028, "grad_norm": 14.809008382614701, "learning_rate": 7.673021782854083e-08, "logits/chosen": -0.15028102695941925, "logits/rejected": 1.6816718578338623, "logps/chosen": -429.3408203125, "logps/rejected": -436.5340270996094, "loss": 0.5864, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -116.80708312988281, "rewards/margins": 84.45670318603516, "rewards/rejected": -201.26376342773438, "step": 2940 }, { "epoch": 0.7720492017796389, "grad_norm": 21.97790850293821, "learning_rate": 7.509069373231039e-08, "logits/chosen": 0.1503334790468216, "logits/rejected": 1.3877406120300293, "logps/chosen": -426.85418701171875, "logps/rejected": -458.5160217285156, "loss": 0.612, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -109.48735046386719, "rewards/margins": 87.34893798828125, "rewards/rejected": -196.83627319335938, "step": 2950 }, { "epoch": 0.7746663177178749, "grad_norm": 58.57085668506599, "learning_rate": 7.346577451281821e-08, "logits/chosen": 0.13314999639987946, "logits/rejected": 1.4621391296386719, "logps/chosen": -433.54974365234375, "logps/rejected": -509.16864013671875, "loss": 0.6009, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -123.75052642822266, "rewards/margins": 90.73429870605469, "rewards/rejected": -214.4848175048828, "step": 2960 }, { "epoch": 0.777283433656111, "grad_norm": 23.872007670066296, "learning_rate": 7.185559585035136e-08, "logits/chosen": -0.1507561057806015, "logits/rejected": 1.7900638580322266, "logps/chosen": -398.67156982421875, "logps/rejected": -442.6659240722656, "loss": 0.6187, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -98.9404525756836, "rewards/margins": 82.65269470214844, "rewards/rejected": -181.5931396484375, "step": 2970 }, { "epoch": 0.7799005495943471, "grad_norm": 44.43635867539923, "learning_rate": 7.026029219436502e-08, "logits/chosen": -0.27151232957839966, "logits/rejected": 1.824271559715271, "logps/chosen": -402.84942626953125, "logps/rejected": -433.583984375, "loss": 0.5892, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -112.6480941772461, "rewards/margins": 101.14834594726562, "rewards/rejected": -213.7964324951172, "step": 2980 }, { "epoch": 0.7825176655325831, "grad_norm": 39.07899805263987, "learning_rate": 6.867999675225522e-08, "logits/chosen": 0.15538421273231506, "logits/rejected": 0.8754119873046875, "logps/chosen": -396.12249755859375, "logps/rejected": -429.5633239746094, "loss": 0.6509, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -110.3372573852539, "rewards/margins": 73.39195251464844, "rewards/rejected": -183.7292022705078, "step": 2990 }, { "epoch": 0.7851347814708192, "grad_norm": 25.62485658793096, "learning_rate": 6.711484147823662e-08, "logits/chosen": 0.48910078406333923, "logits/rejected": 1.69696843624115, "logps/chosen": -332.47314453125, "logps/rejected": -342.8687744140625, "loss": 0.6306, "rewards/accuracies": 0.75, "rewards/chosen": -110.85118103027344, "rewards/margins": 58.67308807373047, "rewards/rejected": -169.52426147460938, "step": 3000 }, { "epoch": 0.7877518974090553, "grad_norm": 37.83197091357618, "learning_rate": 6.556495706232412e-08, "logits/chosen": 0.1738726645708084, "logits/rejected": 0.9801591038703918, "logps/chosen": -419.72479248046875, "logps/rejected": -499.14410400390625, "loss": 0.6207, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -119.627685546875, "rewards/margins": 98.64978790283203, "rewards/rejected": -218.2774658203125, "step": 3010 }, { "epoch": 0.7903690133472913, "grad_norm": 29.292991665537237, "learning_rate": 6.403047291942057e-08, "logits/chosen": 0.2022714614868164, "logits/rejected": 0.7730263471603394, "logps/chosen": -394.0956726074219, "logps/rejected": -425.01104736328125, "loss": 0.643, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -106.90836334228516, "rewards/margins": 35.84281921386719, "rewards/rejected": -142.75119018554688, "step": 3020 }, { "epoch": 0.7929861292855274, "grad_norm": 28.64177935728824, "learning_rate": 6.251151717851021e-08, "logits/chosen": -0.10760748386383057, "logits/rejected": 1.3448083400726318, "logps/chosen": -371.3802185058594, "logps/rejected": -411.80682373046875, "loss": 0.6036, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -106.61136627197266, "rewards/margins": 80.564208984375, "rewards/rejected": -187.17555236816406, "step": 3030 }, { "epoch": 0.7956032452237635, "grad_norm": 46.31822111959148, "learning_rate": 6.100821667196041e-08, "logits/chosen": 0.24122682213783264, "logits/rejected": 0.4868651330471039, "logps/chosen": -392.434814453125, "logps/rejected": -435.9532165527344, "loss": 0.6202, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -101.78141021728516, "rewards/margins": 51.99553680419922, "rewards/rejected": -153.77696228027344, "step": 3040 }, { "epoch": 0.7982203611619995, "grad_norm": 20.18830972334164, "learning_rate": 5.952069692493061e-08, "logits/chosen": 0.18420252203941345, "logits/rejected": 1.016076683998108, "logps/chosen": -332.6474304199219, "logps/rejected": -428.02618408203125, "loss": 0.6216, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -83.14781188964844, "rewards/margins": 99.45719909667969, "rewards/rejected": -182.60501098632812, "step": 3050 }, { "epoch": 0.8008374771002356, "grad_norm": 32.9312271398435, "learning_rate": 5.8049082144891794e-08, "logits/chosen": -0.34789204597473145, "logits/rejected": 1.1649951934814453, "logps/chosen": -350.571044921875, "logps/rejected": -384.31982421875, "loss": 0.5948, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -100.79808044433594, "rewards/margins": 72.1947250366211, "rewards/rejected": -172.9927978515625, "step": 3060 }, { "epoch": 0.8034545930384716, "grad_norm": 24.538460956530265, "learning_rate": 5.659349521125459e-08, "logits/chosen": 0.20586685836315155, "logits/rejected": 1.3391412496566772, "logps/chosen": -398.23126220703125, "logps/rejected": -544.0811767578125, "loss": 0.6167, "rewards/accuracies": 0.75, "rewards/chosen": -118.44612884521484, "rewards/margins": 88.53105163574219, "rewards/rejected": -206.97720336914062, "step": 3070 }, { "epoch": 0.8060717089767077, "grad_norm": 18.607731049230058, "learning_rate": 5.5154057665109e-08, "logits/chosen": -0.3509600758552551, "logits/rejected": 0.9203709363937378, "logps/chosen": -419.15960693359375, "logps/rejected": -474.97119140625, "loss": 0.6016, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -99.57820892333984, "rewards/margins": 82.64437103271484, "rewards/rejected": -182.22256469726562, "step": 3080 }, { "epoch": 0.8086888249149438, "grad_norm": 34.143748417302646, "learning_rate": 5.3730889699075853e-08, "logits/chosen": -0.47578373551368713, "logits/rejected": 0.4792146682739258, "logps/chosen": -436.461669921875, "logps/rejected": -462.1795349121094, "loss": 0.6033, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -106.6905517578125, "rewards/margins": 73.09249114990234, "rewards/rejected": -179.78305053710938, "step": 3090 }, { "epoch": 0.8113059408531798, "grad_norm": 38.11095562004462, "learning_rate": 5.2324110147270893e-08, "logits/chosen": 0.583615243434906, "logits/rejected": 1.4006626605987549, "logps/chosen": -347.5642395019531, "logps/rejected": -463.29962158203125, "loss": 0.6141, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -113.69773864746094, "rewards/margins": 85.3753662109375, "rewards/rejected": -199.0731201171875, "step": 3100 }, { "epoch": 0.8139230567914159, "grad_norm": 44.946656962405264, "learning_rate": 5.0933836475381795e-08, "logits/chosen": -0.39866572618484497, "logits/rejected": 2.506138801574707, "logps/chosen": -429.55694580078125, "logps/rejected": -445.2572326660156, "loss": 0.6195, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -100.6939926147461, "rewards/margins": 108.70045471191406, "rewards/rejected": -209.39443969726562, "step": 3110 }, { "epoch": 0.816540172729652, "grad_norm": 28.151760961027662, "learning_rate": 4.956018477086005e-08, "logits/chosen": -0.3393104076385498, "logits/rejected": 0.8863036036491394, "logps/chosen": -425.35443115234375, "logps/rejected": -503.589111328125, "loss": 0.6068, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -127.26164245605469, "rewards/margins": 89.95985412597656, "rewards/rejected": -217.2214813232422, "step": 3120 }, { "epoch": 0.819157288667888, "grad_norm": 17.270672002448382, "learning_rate": 4.820326973322763e-08, "logits/chosen": 0.06319572031497955, "logits/rejected": 1.5195165872573853, "logps/chosen": -352.53643798828125, "logps/rejected": -366.3856506347656, "loss": 0.6126, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -124.1020278930664, "rewards/margins": 65.88347625732422, "rewards/rejected": -189.98550415039062, "step": 3130 }, { "epoch": 0.821774404606124, "grad_norm": 22.56622768926043, "learning_rate": 4.686320466449981e-08, "logits/chosen": -0.03642309829592705, "logits/rejected": 0.8057546615600586, "logps/chosen": -409.79571533203125, "logps/rejected": -441.3692321777344, "loss": 0.6214, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -135.99322509765625, "rewards/margins": 58.7821159362793, "rewards/rejected": -194.7753448486328, "step": 3140 }, { "epoch": 0.8243915205443602, "grad_norm": 22.244625038969573, "learning_rate": 4.554010145972417e-08, "logits/chosen": -0.41408711671829224, "logits/rejected": 0.9231753349304199, "logps/chosen": -449.20281982421875, "logps/rejected": -483.89593505859375, "loss": 0.6128, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -104.51177978515625, "rewards/margins": 86.42676544189453, "rewards/rejected": -190.9385223388672, "step": 3150 }, { "epoch": 0.8270086364825961, "grad_norm": 30.60778101586563, "learning_rate": 4.423407059763745e-08, "logits/chosen": -0.17581668496131897, "logits/rejected": 0.6096712350845337, "logps/chosen": -415.3594665527344, "logps/rejected": -438.58526611328125, "loss": 0.6221, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -95.22805786132812, "rewards/margins": 79.04048156738281, "rewards/rejected": -174.26852416992188, "step": 3160 }, { "epoch": 0.8296257524208323, "grad_norm": 29.18479634749517, "learning_rate": 4.294522113144078e-08, "logits/chosen": 0.7951468229293823, "logits/rejected": 1.7467435598373413, "logps/chosen": -419.86871337890625, "logps/rejected": -456.6111755371094, "loss": 0.6103, "rewards/accuracies": 0.75, "rewards/chosen": -145.0784149169922, "rewards/margins": 73.13197326660156, "rewards/rejected": -218.2103729248047, "step": 3170 }, { "epoch": 0.8322428683590684, "grad_norm": 133.54621745583137, "learning_rate": 4.1673660679693804e-08, "logits/chosen": -0.258161723613739, "logits/rejected": 0.2669413089752197, "logps/chosen": -401.003173828125, "logps/rejected": -459.4292907714844, "loss": 0.6436, "rewards/accuracies": 0.75, "rewards/chosen": -109.79402923583984, "rewards/margins": 53.30994415283203, "rewards/rejected": -163.10397338867188, "step": 3180 }, { "epoch": 0.8348599842973043, "grad_norm": 22.45159047211893, "learning_rate": 4.041949541732825e-08, "logits/chosen": 0.8274353742599487, "logits/rejected": 1.4107186794281006, "logps/chosen": -396.6387634277344, "logps/rejected": -528.96484375, "loss": 0.6218, "rewards/accuracies": 0.75, "rewards/chosen": -139.893310546875, "rewards/margins": 150.52256774902344, "rewards/rejected": -290.4158630371094, "step": 3190 }, { "epoch": 0.8374771002355405, "grad_norm": 85.9004333178433, "learning_rate": 3.9182830066782605e-08, "logits/chosen": -0.225154310464859, "logits/rejected": 0.8962351679801941, "logps/chosen": -411.37213134765625, "logps/rejected": -457.3607482910156, "loss": 0.605, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -114.58937072753906, "rewards/margins": 89.79297637939453, "rewards/rejected": -204.38232421875, "step": 3200 }, { "epoch": 0.8400942161737766, "grad_norm": 54.26136466947174, "learning_rate": 3.79637678892577e-08, "logits/chosen": 0.8109161257743835, "logits/rejected": 1.667267084121704, "logps/chosen": -436.07281494140625, "logps/rejected": -462.241943359375, "loss": 0.6251, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -144.87509155273438, "rewards/margins": 60.97523880004883, "rewards/rejected": -205.85031127929688, "step": 3210 }, { "epoch": 0.8427113321120125, "grad_norm": 20.724578281603087, "learning_rate": 3.6762410676094645e-08, "logits/chosen": 0.34443768858909607, "logits/rejected": 1.82961905002594, "logps/chosen": -458.07513427734375, "logps/rejected": -504.356201171875, "loss": 0.5864, "rewards/accuracies": 0.75, "rewards/chosen": -123.5264663696289, "rewards/margins": 89.05165100097656, "rewards/rejected": -212.578125, "step": 3220 }, { "epoch": 0.8453284480502486, "grad_norm": 40.41533481576333, "learning_rate": 3.557885874027497e-08, "logits/chosen": 0.2875576913356781, "logits/rejected": 1.515847086906433, "logps/chosen": -420.4130859375, "logps/rejected": -446.76922607421875, "loss": 0.6213, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -118.218994140625, "rewards/margins": 75.82159423828125, "rewards/rejected": -194.04058837890625, "step": 3230 }, { "epoch": 0.8479455639884846, "grad_norm": 33.34083742420766, "learning_rate": 3.441321090804469e-08, "logits/chosen": -0.43460145592689514, "logits/rejected": 1.3918662071228027, "logps/chosen": -406.4483642578125, "logps/rejected": -525.8848876953125, "loss": 0.5975, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -110.67840576171875, "rewards/margins": 94.2422866821289, "rewards/rejected": -204.92068481445312, "step": 3240 }, { "epoch": 0.8505626799267207, "grad_norm": 17.25663786210711, "learning_rate": 3.326556451066234e-08, "logits/chosen": -0.41477689146995544, "logits/rejected": 1.7711904048919678, "logps/chosen": -430.942138671875, "logps/rejected": -448.9490661621094, "loss": 0.6206, "rewards/accuracies": 0.75, "rewards/chosen": -124.95573425292969, "rewards/margins": 84.45895385742188, "rewards/rejected": -209.41470336914062, "step": 3250 }, { "epoch": 0.8531797958649568, "grad_norm": 33.06045690226845, "learning_rate": 3.2136015376271946e-08, "logits/chosen": 0.573313295841217, "logits/rejected": 2.416731595993042, "logps/chosen": -350.485595703125, "logps/rejected": -356.3022155761719, "loss": 0.5999, "rewards/accuracies": 0.625, "rewards/chosen": -122.4647216796875, "rewards/margins": 62.025794982910156, "rewards/rejected": -184.49050903320312, "step": 3260 }, { "epoch": 0.8557969118031928, "grad_norm": 33.26261425090372, "learning_rate": 3.102465782190106e-08, "logits/chosen": 1.0045592784881592, "logits/rejected": 1.1752761602401733, "logps/chosen": -355.07769775390625, "logps/rejected": -607.1895751953125, "loss": 0.6018, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -141.97750854492188, "rewards/margins": 80.67582702636719, "rewards/rejected": -222.65335083007812, "step": 3270 }, { "epoch": 0.8584140277414289, "grad_norm": 25.329125812833063, "learning_rate": 2.993158464558565e-08, "logits/chosen": 0.6127084493637085, "logits/rejected": 0.8664273023605347, "logps/chosen": -393.71392822265625, "logps/rejected": -506.6620178222656, "loss": 0.6253, "rewards/accuracies": 0.75, "rewards/chosen": -131.31350708007812, "rewards/margins": 82.22903442382812, "rewards/rejected": -213.5425262451172, "step": 3280 }, { "epoch": 0.861031143679665, "grad_norm": 18.25047969135475, "learning_rate": 2.8856887118621358e-08, "logits/chosen": 0.11902396380901337, "logits/rejected": 1.4481818675994873, "logps/chosen": -437.8792419433594, "logps/rejected": -560.4588012695312, "loss": 0.5944, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -111.86763763427734, "rewards/margins": 117.12901306152344, "rewards/rejected": -228.9966583251953, "step": 3290 }, { "epoch": 0.863648259617901, "grad_norm": 26.6063313450085, "learning_rate": 2.7800654977942482e-08, "logits/chosen": 0.6087424159049988, "logits/rejected": 0.7114389538764954, "logps/chosen": -371.51397705078125, "logps/rejected": -431.497802734375, "loss": 0.6332, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -125.68513488769531, "rewards/margins": 42.292999267578125, "rewards/rejected": -167.97811889648438, "step": 3300 }, { "epoch": 0.8662653755561371, "grad_norm": 38.897936775137644, "learning_rate": 2.676297641862879e-08, "logits/chosen": -0.13244356215000153, "logits/rejected": 0.7202237844467163, "logps/chosen": -447.2171325683594, "logps/rejected": -462.3369140625, "loss": 0.6265, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -115.806396484375, "rewards/margins": 49.385658264160156, "rewards/rejected": -165.19203186035156, "step": 3310 }, { "epoch": 0.8688824914943732, "grad_norm": 22.24134018200084, "learning_rate": 2.5743938086541352e-08, "logits/chosen": 0.31034788489341736, "logits/rejected": 1.7712135314941406, "logps/chosen": -361.48370361328125, "logps/rejected": -408.6527404785156, "loss": 0.6042, "rewards/accuracies": 0.875, "rewards/chosen": -123.82491302490234, "rewards/margins": 101.67516326904297, "rewards/rejected": -225.50009155273438, "step": 3320 }, { "epoch": 0.8714996074326092, "grad_norm": 76.72292034254704, "learning_rate": 2.474362507108757e-08, "logits/chosen": -0.2383597195148468, "logits/rejected": 1.3851786851882935, "logps/chosen": -359.73858642578125, "logps/rejected": -491.29266357421875, "loss": 0.6068, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -94.1988525390625, "rewards/margins": 134.3372344970703, "rewards/rejected": -228.5360870361328, "step": 3330 }, { "epoch": 0.8741167233708453, "grad_norm": 48.10790913779111, "learning_rate": 2.3762120898116495e-08, "logits/chosen": 0.07395198196172714, "logits/rejected": 1.8523792028427124, "logps/chosen": -409.458984375, "logps/rejected": -479.45465087890625, "loss": 0.6149, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -129.4833984375, "rewards/margins": 103.62904357910156, "rewards/rejected": -233.1124267578125, "step": 3340 }, { "epoch": 0.8767338393090814, "grad_norm": 35.52309379757307, "learning_rate": 2.2799507522944044e-08, "logits/chosen": -0.05852848291397095, "logits/rejected": 1.087908387184143, "logps/chosen": -470.86181640625, "logps/rejected": -509.21435546875, "loss": 0.6248, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -129.98536682128906, "rewards/margins": 83.6360855102539, "rewards/rejected": -213.6214599609375, "step": 3350 }, { "epoch": 0.8793509552473174, "grad_norm": 30.69116981637674, "learning_rate": 2.1855865323510054e-08, "logits/chosen": -0.04171602800488472, "logits/rejected": 1.6160989999771118, "logps/chosen": -417.27166748046875, "logps/rejected": -514.6968994140625, "loss": 0.6023, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -129.82444763183594, "rewards/margins": 108.20865631103516, "rewards/rejected": -238.0331573486328, "step": 3360 }, { "epoch": 0.8819680711855535, "grad_norm": 18.432172416615423, "learning_rate": 2.0931273093666573e-08, "logits/chosen": -0.05673789232969284, "logits/rejected": 1.1431539058685303, "logps/chosen": -387.72882080078125, "logps/rejected": -360.6510009765625, "loss": 0.6221, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -122.041748046875, "rewards/margins": 46.56056594848633, "rewards/rejected": -168.60232543945312, "step": 3370 }, { "epoch": 0.8845851871237895, "grad_norm": 36.97449002155012, "learning_rate": 2.002580803659873e-08, "logits/chosen": 0.47802838683128357, "logits/rejected": 1.560175895690918, "logps/chosen": -367.7857971191406, "logps/rejected": -479.096435546875, "loss": 0.6068, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -131.46559143066406, "rewards/margins": 93.1476821899414, "rewards/rejected": -224.61325073242188, "step": 3380 }, { "epoch": 0.8872023030620256, "grad_norm": 41.84440019806065, "learning_rate": 1.9139545758378256e-08, "logits/chosen": -0.31822529435157776, "logits/rejected": 2.1346435546875, "logps/chosen": -434.1947326660156, "logps/rejected": -516.7833251953125, "loss": 0.6006, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -107.50236511230469, "rewards/margins": 126.99251556396484, "rewards/rejected": -234.49490356445312, "step": 3390 }, { "epoch": 0.8898194190002617, "grad_norm": 37.403239081733155, "learning_rate": 1.8272560261650277e-08, "logits/chosen": 0.23876461386680603, "logits/rejected": 2.3519275188446045, "logps/chosen": -370.8079528808594, "logps/rejected": -457.3379821777344, "loss": 0.6089, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -125.28385925292969, "rewards/margins": 101.42870330810547, "rewards/rejected": -226.7125701904297, "step": 3400 }, { "epoch": 0.8924365349384977, "grad_norm": 67.36541321581835, "learning_rate": 1.742492393945427e-08, "logits/chosen": 0.07925743609666824, "logits/rejected": 1.592063069343567, "logps/chosen": -428.7183532714844, "logps/rejected": -457.3017578125, "loss": 0.6062, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -125.50584411621094, "rewards/margins": 84.96311950683594, "rewards/rejected": -210.4689483642578, "step": 3410 }, { "epoch": 0.8950536508767338, "grad_norm": 20.44206177273988, "learning_rate": 1.6596707569179302e-08, "logits/chosen": -0.014702320098876953, "logits/rejected": 1.5920372009277344, "logps/chosen": -409.2933654785156, "logps/rejected": -514.6095581054688, "loss": 0.5756, "rewards/accuracies": 0.875, "rewards/chosen": -115.48329162597656, "rewards/margins": 133.8512725830078, "rewards/rejected": -249.3345489501953, "step": 3420 }, { "epoch": 0.8976707668149699, "grad_norm": 28.55538975432989, "learning_rate": 1.5787980306653848e-08, "logits/chosen": -0.006506321020424366, "logits/rejected": 2.24983549118042, "logps/chosen": -427.6556091308594, "logps/rejected": -487.06060791015625, "loss": 0.5991, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -133.6538848876953, "rewards/margins": 104.88616943359375, "rewards/rejected": -238.54006958007812, "step": 3430 }, { "epoch": 0.9002878827532059, "grad_norm": 24.356643588279173, "learning_rate": 1.499880968037165e-08, "logits/chosen": 0.4594038426876068, "logits/rejected": 2.023451805114746, "logps/chosen": -435.5736389160156, "logps/rejected": -524.2735595703125, "loss": 0.6262, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -140.4995574951172, "rewards/margins": 81.06678771972656, "rewards/rejected": -221.56637573242188, "step": 3440 }, { "epoch": 0.902904998691442, "grad_norm": 20.783729767212474, "learning_rate": 1.4229261585852803e-08, "logits/chosen": 0.43992137908935547, "logits/rejected": 2.0338668823242188, "logps/chosen": -427.1771545410156, "logps/rejected": -489.74688720703125, "loss": 0.6145, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -127.97509765625, "rewards/margins": 101.35646057128906, "rewards/rejected": -229.3315887451172, "step": 3450 }, { "epoch": 0.9055221146296781, "grad_norm": 15.969187300278293, "learning_rate": 1.3479400280141883e-08, "logits/chosen": 0.6696885228157043, "logits/rejected": 1.1811548471450806, "logps/chosen": -424.75689697265625, "logps/rejected": -527.1947021484375, "loss": 0.5997, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -132.3514404296875, "rewards/margins": 60.17137908935547, "rewards/rejected": -192.52281188964844, "step": 3460 }, { "epoch": 0.9081392305679141, "grad_norm": 47.729403494478504, "learning_rate": 1.2749288376442042e-08, "logits/chosen": -0.05798863619565964, "logits/rejected": 0.5691540837287903, "logps/chosen": -359.86822509765625, "logps/rejected": -430.391357421875, "loss": 0.6333, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -104.583984375, "rewards/margins": 69.30809020996094, "rewards/rejected": -173.89205932617188, "step": 3470 }, { "epoch": 0.9107563465061502, "grad_norm": 30.620975249350334, "learning_rate": 1.2038986838887127e-08, "logits/chosen": 0.2651011645793915, "logits/rejected": 1.6772241592407227, "logps/chosen": -347.3929443359375, "logps/rejected": -412.2870178222656, "loss": 0.6092, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -113.95643615722656, "rewards/margins": 88.0910415649414, "rewards/rejected": -202.0474853515625, "step": 3480 }, { "epoch": 0.9133734624443863, "grad_norm": 53.121108927701215, "learning_rate": 1.1348554977451131e-08, "logits/chosen": 0.2223062962293625, "logits/rejected": 1.1075026988983154, "logps/chosen": -435.0687561035156, "logps/rejected": -574.1682739257812, "loss": 0.5968, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -120.94419860839844, "rewards/margins": 113.83845520019531, "rewards/rejected": -234.7826385498047, "step": 3490 }, { "epoch": 0.9159905783826223, "grad_norm": 18.84746498341957, "learning_rate": 1.06780504429958e-08, "logits/chosen": 0.6739507913589478, "logits/rejected": 0.42967432737350464, "logps/chosen": -349.63128662109375, "logps/rejected": -455.380859375, "loss": 0.6419, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -125.32344818115234, "rewards/margins": 75.88658905029297, "rewards/rejected": -201.21005249023438, "step": 3500 }, { "epoch": 0.9186076943208584, "grad_norm": 36.78114307261151, "learning_rate": 1.0027529222456754e-08, "logits/chosen": 0.017544955015182495, "logits/rejected": 0.7979161143302917, "logps/chosen": -395.4692687988281, "logps/rejected": -522.6412963867188, "loss": 0.6098, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -127.53913879394531, "rewards/margins": 100.97230529785156, "rewards/rejected": -228.51144409179688, "step": 3510 }, { "epoch": 0.9212248102590945, "grad_norm": 32.458861273146255, "learning_rate": 9.397045634168766e-09, "logits/chosen": -0.1015222892165184, "logits/rejected": 1.0661077499389648, "logps/chosen": -444.6565856933594, "logps/rejected": -434.8824768066406, "loss": 0.6052, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -134.20489501953125, "rewards/margins": 66.43836975097656, "rewards/rejected": -200.6432647705078, "step": 3520 }, { "epoch": 0.9238419261973305, "grad_norm": 42.81956388214368, "learning_rate": 8.78665232332998e-09, "logits/chosen": 0.6678240299224854, "logits/rejected": 2.1921708583831787, "logps/chosen": -428.3387145996094, "logps/rejected": -553.4244995117188, "loss": 0.5909, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -119.02767181396484, "rewards/margins": 116.71366119384766, "rewards/rejected": -235.74130249023438, "step": 3530 }, { "epoch": 0.9264590421355666, "grad_norm": 34.21637872966215, "learning_rate": 8.196400257606206e-09, "logits/chosen": 0.47500696778297424, "logits/rejected": 1.3353134393692017, "logps/chosen": -376.7292785644531, "logps/rejected": -447.9065856933594, "loss": 0.6407, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -136.65072631835938, "rewards/margins": 82.5105972290039, "rewards/rejected": -219.1613311767578, "step": 3540 }, { "epoch": 0.9290761580738026, "grad_norm": 53.978193486224676, "learning_rate": 7.626338722875075e-09, "logits/chosen": -0.10515992343425751, "logits/rejected": 1.1506468057632446, "logps/chosen": -461.34552001953125, "logps/rejected": -540.5074462890625, "loss": 0.6092, "rewards/accuracies": 0.75, "rewards/chosen": -160.37290954589844, "rewards/margins": 87.07331848144531, "rewards/rejected": -247.4462432861328, "step": 3550 }, { "epoch": 0.9316932740120387, "grad_norm": 16.239388562963665, "learning_rate": 7.0765153191106875e-09, "logits/chosen": 1.0949875116348267, "logits/rejected": 0.985447883605957, "logps/chosen": -387.4863586425781, "logps/rejected": -427.345703125, "loss": 0.6532, "rewards/accuracies": 0.75, "rewards/chosen": -128.79241943359375, "rewards/margins": 64.76528930664062, "rewards/rejected": -193.55770874023438, "step": 3560 }, { "epoch": 0.9343103899502748, "grad_norm": 25.74478178481144, "learning_rate": 6.54697595640899e-09, "logits/chosen": 0.4125773310661316, "logits/rejected": 1.1157901287078857, "logps/chosen": -324.0262451171875, "logps/rejected": -398.330078125, "loss": 0.5769, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -101.90724182128906, "rewards/margins": 86.06889343261719, "rewards/rejected": -187.97613525390625, "step": 3570 }, { "epoch": 0.9369275058885108, "grad_norm": 32.83789705445322, "learning_rate": 6.037764851154425e-09, "logits/chosen": -0.2145831137895584, "logits/rejected": 1.321026086807251, "logps/chosen": -455.0643615722656, "logps/rejected": -509.22119140625, "loss": 0.5892, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -125.3864517211914, "rewards/margins": 87.76890563964844, "rewards/rejected": -213.1553497314453, "step": 3580 }, { "epoch": 0.9395446218267469, "grad_norm": 32.398022325965066, "learning_rate": 5.548924522327747e-09, "logits/chosen": -0.08072943985462189, "logits/rejected": 1.052816390991211, "logps/chosen": -424.6456604003906, "logps/rejected": -463.2438049316406, "loss": 0.6267, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -117.72221374511719, "rewards/margins": 101.6100845336914, "rewards/rejected": -219.3323211669922, "step": 3590 }, { "epoch": 0.942161737764983, "grad_norm": 43.737861260512695, "learning_rate": 5.080495787955691e-09, "logits/chosen": 0.2663705050945282, "logits/rejected": 1.5858561992645264, "logps/chosen": -440.0337829589844, "logps/rejected": -425.4478454589844, "loss": 0.6225, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -116.02880859375, "rewards/margins": 71.27129364013672, "rewards/rejected": -187.3000946044922, "step": 3600 }, { "epoch": 0.944778853703219, "grad_norm": 19.943029188551513, "learning_rate": 4.632517761702814e-09, "logits/chosen": 0.42500191926956177, "logits/rejected": 1.425812005996704, "logps/chosen": -473.26641845703125, "logps/rejected": -651.4464111328125, "loss": 0.6189, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -147.15560913085938, "rewards/margins": 129.23179626464844, "rewards/rejected": -276.387451171875, "step": 3610 }, { "epoch": 0.9473959696414551, "grad_norm": 24.74762269805631, "learning_rate": 4.205027849605358e-09, "logits/chosen": 0.15811410546302795, "logits/rejected": 1.1120755672454834, "logps/chosen": -396.139404296875, "logps/rejected": -448.7173767089844, "loss": 0.6211, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -135.4348602294922, "rewards/margins": 68.15974426269531, "rewards/rejected": -203.59461975097656, "step": 3620 }, { "epoch": 0.9500130855796912, "grad_norm": 22.40191130638278, "learning_rate": 3.798061746947995e-09, "logits/chosen": 0.37473368644714355, "logits/rejected": 1.1352459192276, "logps/chosen": -445.27044677734375, "logps/rejected": -539.070556640625, "loss": 0.615, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -136.21102905273438, "rewards/margins": 94.1988754272461, "rewards/rejected": -230.409912109375, "step": 3630 }, { "epoch": 0.9526302015179272, "grad_norm": 28.695280469787797, "learning_rate": 3.411653435283157e-09, "logits/chosen": -0.19782522320747375, "logits/rejected": 1.6401824951171875, "logps/chosen": -430.9485778808594, "logps/rejected": -515.1265869140625, "loss": 0.6242, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -125.76761627197266, "rewards/margins": 113.26475524902344, "rewards/rejected": -239.0323486328125, "step": 3640 }, { "epoch": 0.9552473174561633, "grad_norm": 33.000510950766326, "learning_rate": 3.0458351795936698e-09, "logits/chosen": 0.09500176459550858, "logits/rejected": 1.7043838500976562, "logps/chosen": -439.33831787109375, "logps/rejected": -497.22515869140625, "loss": 0.5922, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -136.70016479492188, "rewards/margins": 84.02696228027344, "rewards/rejected": -220.7271270751953, "step": 3650 }, { "epoch": 0.9578644333943994, "grad_norm": 27.216858014189555, "learning_rate": 2.700637525598598e-09, "logits/chosen": 0.1739320009946823, "logits/rejected": 1.9256073236465454, "logps/chosen": -415.5773010253906, "logps/rejected": -464.6927185058594, "loss": 0.6142, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -122.09500885009766, "rewards/margins": 85.67970275878906, "rewards/rejected": -207.7747039794922, "step": 3660 }, { "epoch": 0.9604815493326354, "grad_norm": 41.15234660865958, "learning_rate": 2.3760892972027324e-09, "logits/chosen": 0.7606725096702576, "logits/rejected": 2.247407913208008, "logps/chosen": -464.6285095214844, "logps/rejected": -494.5741271972656, "loss": 0.6657, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -143.5747528076172, "rewards/margins": 92.42018127441406, "rewards/rejected": -235.9949188232422, "step": 3670 }, { "epoch": 0.9630986652708715, "grad_norm": 37.89432017028242, "learning_rate": 2.0722175940897645e-09, "logits/chosen": -0.2540181279182434, "logits/rejected": 1.4863791465759277, "logps/chosen": -487.421875, "logps/rejected": -469.3511657714844, "loss": 0.6164, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -140.024169921875, "rewards/margins": 66.67285919189453, "rewards/rejected": -206.69699096679688, "step": 3680 }, { "epoch": 0.9657157812091076, "grad_norm": 28.841623740215717, "learning_rate": 1.7890477894593748e-09, "logits/chosen": 0.5983523726463318, "logits/rejected": 2.3860888481140137, "logps/chosen": -413.36090087890625, "logps/rejected": -475.72998046875, "loss": 0.6261, "rewards/accuracies": 0.75, "rewards/chosen": -129.737548828125, "rewards/margins": 118.51426696777344, "rewards/rejected": -248.25180053710938, "step": 3690 }, { "epoch": 0.9683328971473436, "grad_norm": 30.85848190499018, "learning_rate": 1.5266035279088708e-09, "logits/chosen": 0.642953634262085, "logits/rejected": 1.7377115488052368, "logps/chosen": -435.58172607421875, "logps/rejected": -495.81524658203125, "loss": 0.6194, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -122.56893157958984, "rewards/margins": 103.9132308959961, "rewards/rejected": -226.48214721679688, "step": 3700 }, { "epoch": 0.9709500130855797, "grad_norm": 38.33508936295693, "learning_rate": 1.2849067234584621e-09, "logits/chosen": 0.22355318069458008, "logits/rejected": 1.7621004581451416, "logps/chosen": -331.8682556152344, "logps/rejected": -411.5686950683594, "loss": 0.6057, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -128.43795776367188, "rewards/margins": 97.2995376586914, "rewards/rejected": -225.7374725341797, "step": 3710 }, { "epoch": 0.9735671290238157, "grad_norm": 32.865483577563424, "learning_rate": 1.0639775577218625e-09, "logits/chosen": -0.21633200347423553, "logits/rejected": 1.0267454385757446, "logps/chosen": -437.278564453125, "logps/rejected": -542.7936401367188, "loss": 0.5941, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -127.906494140625, "rewards/margins": 112.18253326416016, "rewards/rejected": -240.08901977539062, "step": 3720 }, { "epoch": 0.9761842449620518, "grad_norm": 55.69172200477488, "learning_rate": 8.638344782207485e-10, "logits/chosen": 0.11773265898227692, "logits/rejected": 1.5340635776519775, "logps/chosen": -337.8634948730469, "logps/rejected": -417.99383544921875, "loss": 0.5949, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -123.92936706542969, "rewards/margins": 97.03419494628906, "rewards/rejected": -220.9635772705078, "step": 3730 }, { "epoch": 0.9788013609002879, "grad_norm": 29.33864063438161, "learning_rate": 6.844941968447149e-10, "logits/chosen": 0.4394141137599945, "logits/rejected": 0.8948599696159363, "logps/chosen": -397.30047607421875, "logps/rejected": -521.3082885742188, "loss": 0.6459, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -135.5926513671875, "rewards/margins": 71.82649993896484, "rewards/rejected": -207.4191436767578, "step": 3740 }, { "epoch": 0.9814184768385239, "grad_norm": 32.254807951743736, "learning_rate": 5.25971688455612e-10, "logits/chosen": 0.3935531675815582, "logits/rejected": 1.233229398727417, "logps/chosen": -420.3367614746094, "logps/rejected": -451.939697265625, "loss": 0.602, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -139.55093383789062, "rewards/margins": 70.234130859375, "rewards/rejected": -209.78506469726562, "step": 3750 }, { "epoch": 0.98403559277676, "grad_norm": 16.725344955385015, "learning_rate": 3.882801896372967e-10, "logits/chosen": -0.2951185405254364, "logits/rejected": 1.3272792100906372, "logps/chosen": -458.79693603515625, "logps/rejected": -501.19525146484375, "loss": 0.5835, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -117.83502197265625, "rewards/margins": 97.3798828125, "rewards/rejected": -215.2148895263672, "step": 3760 }, { "epoch": 0.9866527087149961, "grad_norm": 23.047469492542017, "learning_rate": 2.714311975902661e-10, "logits/chosen": 0.4829816222190857, "logits/rejected": 0.9257513284683228, "logps/chosen": -404.45098876953125, "logps/rejected": -491.4329528808594, "loss": 0.6195, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -126.78916931152344, "rewards/margins": 86.20039367675781, "rewards/rejected": -212.9895477294922, "step": 3770 }, { "epoch": 0.9892698246532321, "grad_norm": 42.77852100209512, "learning_rate": 1.754344691717591e-10, "logits/chosen": 0.8735979199409485, "logits/rejected": 1.6277217864990234, "logps/chosen": -358.1724548339844, "logps/rejected": -410.5965881347656, "loss": 0.6026, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -123.83882141113281, "rewards/margins": 71.39359283447266, "rewards/rejected": -195.23239135742188, "step": 3780 }, { "epoch": 0.9918869405914682, "grad_norm": 48.71547389714528, "learning_rate": 1.0029802008096333e-10, "logits/chosen": -0.3842397928237915, "logits/rejected": 1.9089739322662354, "logps/chosen": -412.76605224609375, "logps/rejected": -472.7958068847656, "loss": 0.5939, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -129.0559539794922, "rewards/margins": 96.40486907958984, "rewards/rejected": -225.46084594726562, "step": 3790 }, { "epoch": 0.9945040565297043, "grad_norm": 35.70591610414197, "learning_rate": 4.602812418974533e-11, "logits/chosen": 0.3812098503112793, "logits/rejected": 2.2290968894958496, "logps/chosen": -390.21673583984375, "logps/rejected": -426.26788330078125, "loss": 0.6217, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -138.75518798828125, "rewards/margins": 76.2783203125, "rewards/rejected": -215.0334930419922, "step": 3800 }, { "epoch": 0.9971211724679403, "grad_norm": 34.935749182467326, "learning_rate": 1.2629313018819309e-11, "logits/chosen": 0.44964084029197693, "logits/rejected": 1.517663598060608, "logps/chosen": -444.845703125, "logps/rejected": -547.6998901367188, "loss": 0.6326, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -141.57296752929688, "rewards/margins": 83.2319107055664, "rewards/rejected": -224.8048858642578, "step": 3810 }, { "epoch": 0.9997382884061764, "grad_norm": 19.681913314378978, "learning_rate": 1.0437535929996855e-13, "logits/chosen": 0.43567976355552673, "logits/rejected": 2.002690315246582, "logps/chosen": -437.58770751953125, "logps/rejected": -477.990966796875, "loss": 0.6177, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -148.7041778564453, "rewards/margins": 90.53707122802734, "rewards/rejected": -239.2412567138672, "step": 3820 }, { "epoch": 1.0, "step": 3821, "total_flos": 0.0, "train_loss": 0.6291336198965001, "train_runtime": 16881.5542, "train_samples_per_second": 3.621, "train_steps_per_second": 0.226 } ], "logging_steps": 10, "max_steps": 3821, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }