{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990680335507922, "eval_steps": 500, "global_step": 670, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014911463187325257, "grad_norm": 131.6420531261408, "learning_rate": 2.941176470588235e-08, "logits/chosen": 1.860435962677002, "logits/rejected": 2.0789663791656494, "logps/chosen": -1.3781263828277588, "logps/rejected": -1.480776309967041, "loss": 4.9287, "nll_loss": 1.3781262636184692, "rewards/accuracies": 0.625, "rewards/chosen": -13.78126335144043, "rewards/margins": 1.0264991521835327, "rewards/rejected": -14.80776309967041, "step": 1 }, { "epoch": 0.0029822926374650513, "grad_norm": 103.40519385579142, "learning_rate": 5.88235294117647e-08, "logits/chosen": 3.1205239295959473, "logits/rejected": 2.894362211227417, "logps/chosen": -0.8937948346138, "logps/rejected": -1.622621774673462, "loss": 3.7891, "nll_loss": 0.8937948346138, "rewards/accuracies": 1.0, "rewards/chosen": -8.937948226928711, "rewards/margins": 7.28826904296875, "rewards/rejected": -16.226219177246094, "step": 2 }, { "epoch": 0.004473438956197577, "grad_norm": 134.5616836750816, "learning_rate": 8.823529411764706e-08, "logits/chosen": 3.2682483196258545, "logits/rejected": 3.209494113922119, "logps/chosen": -0.8724318742752075, "logps/rejected": -1.301709532737732, "loss": 5.7572, "nll_loss": 0.8724318742752075, "rewards/accuracies": 0.875, "rewards/chosen": -8.724318504333496, "rewards/margins": 4.292776584625244, "rewards/rejected": -13.017093658447266, "step": 3 }, { "epoch": 0.005964585274930103, "grad_norm": 60.52582088050329, "learning_rate": 1.176470588235294e-07, "logits/chosen": 3.037123918533325, "logits/rejected": 3.441870927810669, "logps/chosen": -1.0890567302703857, "logps/rejected": -1.773290753364563, "loss": 3.3092, "nll_loss": 1.0890568494796753, "rewards/accuracies": 0.875, "rewards/chosen": -10.890567779541016, "rewards/margins": 6.842340469360352, "rewards/rejected": -17.732908248901367, "step": 4 }, { "epoch": 0.007455731593662628, "grad_norm": 299.54504667805105, "learning_rate": 1.4705882352941175e-07, "logits/chosen": 1.6548478603363037, "logits/rejected": 1.8445292711257935, "logps/chosen": -0.7503100037574768, "logps/rejected": -1.4727164506912231, "loss": 3.4786, "nll_loss": 0.7503100633621216, "rewards/accuracies": 0.875, "rewards/chosen": -7.503100395202637, "rewards/margins": 7.224064826965332, "rewards/rejected": -14.727165222167969, "step": 5 }, { "epoch": 0.008946877912395153, "grad_norm": 325.1467810043402, "learning_rate": 1.764705882352941e-07, "logits/chosen": 1.5452815294265747, "logits/rejected": 1.7093256711959839, "logps/chosen": -0.9170266389846802, "logps/rejected": -1.4084343910217285, "loss": 7.1976, "nll_loss": 0.9170266389846802, "rewards/accuracies": 0.75, "rewards/chosen": -9.170266151428223, "rewards/margins": 4.914077281951904, "rewards/rejected": -14.084342956542969, "step": 6 }, { "epoch": 0.01043802423112768, "grad_norm": 77.8378923449896, "learning_rate": 2.0588235294117645e-07, "logits/chosen": 1.6482737064361572, "logits/rejected": 2.253169059753418, "logps/chosen": -0.9095208644866943, "logps/rejected": -1.4046701192855835, "loss": 3.2881, "nll_loss": 0.9095209240913391, "rewards/accuracies": 1.0, "rewards/chosen": -9.095209121704102, "rewards/margins": 4.951492786407471, "rewards/rejected": -14.046701431274414, "step": 7 }, { "epoch": 0.011929170549860205, "grad_norm": 211.71750009391528, "learning_rate": 2.352941176470588e-07, "logits/chosen": 2.1762900352478027, "logits/rejected": 1.8468989133834839, "logps/chosen": -0.9643529057502747, "logps/rejected": -1.0850187540054321, "loss": 4.9795, "nll_loss": 0.9643529057502747, "rewards/accuracies": 0.625, "rewards/chosen": -9.643528938293457, "rewards/margins": 1.2066583633422852, "rewards/rejected": -10.850187301635742, "step": 8 }, { "epoch": 0.01342031686859273, "grad_norm": 89.30493536935535, "learning_rate": 2.6470588235294114e-07, "logits/chosen": 2.073772430419922, "logits/rejected": 2.0000739097595215, "logps/chosen": -1.0350722074508667, "logps/rejected": -1.282395839691162, "loss": 3.8779, "nll_loss": 1.0350722074508667, "rewards/accuracies": 0.625, "rewards/chosen": -10.35072135925293, "rewards/margins": 2.4732351303100586, "rewards/rejected": -12.823957443237305, "step": 9 }, { "epoch": 0.014911463187325256, "grad_norm": 448.01455210884984, "learning_rate": 2.941176470588235e-07, "logits/chosen": 2.8771941661834717, "logits/rejected": 2.724689483642578, "logps/chosen": -0.9448939561843872, "logps/rejected": -2.669597625732422, "loss": 5.7577, "nll_loss": 0.9448938369750977, "rewards/accuracies": 0.75, "rewards/chosen": -9.448939323425293, "rewards/margins": 17.24703598022461, "rewards/rejected": -26.69597625732422, "step": 10 }, { "epoch": 0.01640260950605778, "grad_norm": 52.029727877884525, "learning_rate": 3.2352941176470586e-07, "logits/chosen": 2.3021833896636963, "logits/rejected": 2.522644281387329, "logps/chosen": -0.7436463832855225, "logps/rejected": -0.9411755800247192, "loss": 3.1168, "nll_loss": 0.7436463832855225, "rewards/accuracies": 0.875, "rewards/chosen": -7.436463832855225, "rewards/margins": 1.9752916097640991, "rewards/rejected": -9.411755561828613, "step": 11 }, { "epoch": 0.017893755824790306, "grad_norm": 82.59162493133492, "learning_rate": 3.529411764705882e-07, "logits/chosen": 1.856669306755066, "logits/rejected": 2.4997129440307617, "logps/chosen": -0.7935373187065125, "logps/rejected": -1.27372145652771, "loss": 2.2473, "nll_loss": 0.7935372591018677, "rewards/accuracies": 1.0, "rewards/chosen": -7.935372829437256, "rewards/margins": 4.801842212677002, "rewards/rejected": -12.737215042114258, "step": 12 }, { "epoch": 0.01938490214352283, "grad_norm": 374.9421894888616, "learning_rate": 3.8235294117647053e-07, "logits/chosen": 2.017672061920166, "logits/rejected": 2.787160873413086, "logps/chosen": -0.9906985759735107, "logps/rejected": -1.5864818096160889, "loss": 6.4503, "nll_loss": 0.9906984567642212, "rewards/accuracies": 0.875, "rewards/chosen": -9.90698528289795, "rewards/margins": 5.9578328132629395, "rewards/rejected": -15.86481761932373, "step": 13 }, { "epoch": 0.02087604846225536, "grad_norm": 43.83168786420967, "learning_rate": 4.117647058823529e-07, "logits/chosen": 2.2166688442230225, "logits/rejected": 2.6085205078125, "logps/chosen": -0.7322432994842529, "logps/rejected": -1.4798029661178589, "loss": 3.4529, "nll_loss": 0.7322432398796082, "rewards/accuracies": 0.875, "rewards/chosen": -7.322432994842529, "rewards/margins": 7.475597381591797, "rewards/rejected": -14.798028945922852, "step": 14 }, { "epoch": 0.022367194780987885, "grad_norm": 44.227628229595126, "learning_rate": 4.4117647058823526e-07, "logits/chosen": 3.3875794410705566, "logits/rejected": 3.3296778202056885, "logps/chosen": -1.0347462892532349, "logps/rejected": -3.000295877456665, "loss": 2.4859, "nll_loss": 1.0347462892532349, "rewards/accuracies": 0.875, "rewards/chosen": -10.34746265411377, "rewards/margins": 19.65549087524414, "rewards/rejected": -30.002954483032227, "step": 15 }, { "epoch": 0.02385834109972041, "grad_norm": 325.8346322242626, "learning_rate": 4.705882352941176e-07, "logits/chosen": 3.1263082027435303, "logits/rejected": 3.1564300060272217, "logps/chosen": -0.895110011100769, "logps/rejected": -1.4188649654388428, "loss": 6.0086, "nll_loss": 0.895110011100769, "rewards/accuracies": 0.875, "rewards/chosen": -8.951099395751953, "rewards/margins": 5.237548828125, "rewards/rejected": -14.188648223876953, "step": 16 }, { "epoch": 0.025349487418452936, "grad_norm": 587.6736588822685, "learning_rate": 5e-07, "logits/chosen": 1.43706214427948, "logits/rejected": 1.386991262435913, "logps/chosen": -1.0890893936157227, "logps/rejected": -2.1828713417053223, "loss": 4.7301, "nll_loss": 1.0890893936157227, "rewards/accuracies": 0.75, "rewards/chosen": -10.890893936157227, "rewards/margins": 10.93781852722168, "rewards/rejected": -21.828712463378906, "step": 17 }, { "epoch": 0.02684063373718546, "grad_norm": 48.89890995560396, "learning_rate": 5.294117647058823e-07, "logits/chosen": 2.488058090209961, "logits/rejected": 2.8513152599334717, "logps/chosen": -0.8674882650375366, "logps/rejected": -2.094695568084717, "loss": 3.4653, "nll_loss": 0.8674882650375366, "rewards/accuracies": 0.875, "rewards/chosen": -8.674882888793945, "rewards/margins": 12.272073745727539, "rewards/rejected": -20.946956634521484, "step": 18 }, { "epoch": 0.028331780055917986, "grad_norm": 70.05145049092668, "learning_rate": 5.588235294117647e-07, "logits/chosen": 1.3571797609329224, "logits/rejected": 1.412168025970459, "logps/chosen": -0.5889593958854675, "logps/rejected": -0.9319955110549927, "loss": 3.0801, "nll_loss": 0.5889593362808228, "rewards/accuracies": 0.875, "rewards/chosen": -5.889594078063965, "rewards/margins": 3.430360794067383, "rewards/rejected": -9.319953918457031, "step": 19 }, { "epoch": 0.02982292637465051, "grad_norm": 451.3908655974561, "learning_rate": 5.88235294117647e-07, "logits/chosen": 2.2031502723693848, "logits/rejected": 2.382431983947754, "logps/chosen": -1.0798367261886597, "logps/rejected": -1.9304254055023193, "loss": 7.9661, "nll_loss": 1.0798367261886597, "rewards/accuracies": 0.75, "rewards/chosen": -10.79836654663086, "rewards/margins": 8.505887985229492, "rewards/rejected": -19.30425453186035, "step": 20 }, { "epoch": 0.03131407269338304, "grad_norm": 52.30093423337889, "learning_rate": 6.176470588235294e-07, "logits/chosen": 2.417792797088623, "logits/rejected": 2.771883487701416, "logps/chosen": -1.1893094778060913, "logps/rejected": -1.3052068948745728, "loss": 3.2391, "nll_loss": 1.1893094778060913, "rewards/accuracies": 0.625, "rewards/chosen": -11.893095016479492, "rewards/margins": 1.1589728593826294, "rewards/rejected": -13.052067756652832, "step": 21 }, { "epoch": 0.03280521901211556, "grad_norm": 308.86340481320207, "learning_rate": 6.470588235294117e-07, "logits/chosen": 2.9378466606140137, "logits/rejected": 2.937854528427124, "logps/chosen": -3.167470693588257, "logps/rejected": -1.3397622108459473, "loss": 6.9591, "nll_loss": 3.167470693588257, "rewards/accuracies": 0.5, "rewards/chosen": -31.67470932006836, "rewards/margins": -18.277084350585938, "rewards/rejected": -13.397623062133789, "step": 22 }, { "epoch": 0.03429636533084809, "grad_norm": 98.96765350390677, "learning_rate": 6.764705882352941e-07, "logits/chosen": 2.7766013145446777, "logits/rejected": 2.894148111343384, "logps/chosen": -1.078629970550537, "logps/rejected": -2.440201997756958, "loss": 3.5107, "nll_loss": 1.0786300897598267, "rewards/accuracies": 0.75, "rewards/chosen": -10.786300659179688, "rewards/margins": 13.61571979522705, "rewards/rejected": -24.402023315429688, "step": 23 }, { "epoch": 0.03578751164958061, "grad_norm": 396.6066334256457, "learning_rate": 7.058823529411765e-07, "logits/chosen": 2.2593963146209717, "logits/rejected": 2.1909866333007812, "logps/chosen": -0.9177834987640381, "logps/rejected": -1.28047513961792, "loss": 5.0049, "nll_loss": 0.9177834987640381, "rewards/accuracies": 0.75, "rewards/chosen": -9.177834510803223, "rewards/margins": 3.6269168853759766, "rewards/rejected": -12.804752349853516, "step": 24 }, { "epoch": 0.03727865796831314, "grad_norm": 408.09262397804815, "learning_rate": 7.352941176470589e-07, "logits/chosen": 1.8168671131134033, "logits/rejected": 2.147278070449829, "logps/chosen": -0.9170699119567871, "logps/rejected": -1.5485506057739258, "loss": 6.9351, "nll_loss": 0.9170699119567871, "rewards/accuracies": 0.625, "rewards/chosen": -9.170699119567871, "rewards/margins": 6.314807415008545, "rewards/rejected": -15.485507011413574, "step": 25 }, { "epoch": 0.03876980428704566, "grad_norm": 48.476267730529635, "learning_rate": 7.647058823529411e-07, "logits/chosen": 2.122601270675659, "logits/rejected": 2.130781888961792, "logps/chosen": -0.8701527714729309, "logps/rejected": -4.8353047370910645, "loss": 2.8474, "nll_loss": 0.8701527714729309, "rewards/accuracies": 0.875, "rewards/chosen": -8.701526641845703, "rewards/margins": 39.651519775390625, "rewards/rejected": -48.353050231933594, "step": 26 }, { "epoch": 0.040260950605778195, "grad_norm": 60.6168649217365, "learning_rate": 7.941176470588235e-07, "logits/chosen": 2.607726812362671, "logits/rejected": 2.7943997383117676, "logps/chosen": -0.9930699467658997, "logps/rejected": -2.903250217437744, "loss": 1.8576, "nll_loss": 0.9930700063705444, "rewards/accuracies": 1.0, "rewards/chosen": -9.930700302124023, "rewards/margins": 19.101806640625, "rewards/rejected": -29.03250503540039, "step": 27 }, { "epoch": 0.04175209692451072, "grad_norm": 323.1578672926767, "learning_rate": 8.235294117647058e-07, "logits/chosen": 1.7018678188323975, "logits/rejected": 2.3391165733337402, "logps/chosen": -0.7163273096084595, "logps/rejected": -1.9551293849945068, "loss": 4.318, "nll_loss": 0.7163272500038147, "rewards/accuracies": 1.0, "rewards/chosen": -7.163272857666016, "rewards/margins": 12.388022422790527, "rewards/rejected": -19.55129623413086, "step": 28 }, { "epoch": 0.043243243243243246, "grad_norm": 648.8657305986615, "learning_rate": 8.529411764705882e-07, "logits/chosen": 2.8854246139526367, "logits/rejected": 2.9412496089935303, "logps/chosen": -0.7824481129646301, "logps/rejected": -3.6611781120300293, "loss": 7.4867, "nll_loss": 0.7824481129646301, "rewards/accuracies": 1.0, "rewards/chosen": -7.824481010437012, "rewards/margins": 28.787302017211914, "rewards/rejected": -36.611785888671875, "step": 29 }, { "epoch": 0.04473438956197577, "grad_norm": 201.44323353397405, "learning_rate": 8.823529411764705e-07, "logits/chosen": 2.1763527393341064, "logits/rejected": 2.4524543285369873, "logps/chosen": -0.9366781115531921, "logps/rejected": -1.4384242296218872, "loss": 6.2664, "nll_loss": 0.9366780519485474, "rewards/accuracies": 0.75, "rewards/chosen": -9.366782188415527, "rewards/margins": 5.017460346221924, "rewards/rejected": -14.384241104125977, "step": 30 }, { "epoch": 0.046225535880708296, "grad_norm": 58.18949616340279, "learning_rate": 9.117647058823529e-07, "logits/chosen": 1.7160942554473877, "logits/rejected": 1.9639906883239746, "logps/chosen": -0.7702720761299133, "logps/rejected": -1.9634369611740112, "loss": 2.8173, "nll_loss": 0.7702720761299133, "rewards/accuracies": 1.0, "rewards/chosen": -7.702720642089844, "rewards/margins": 11.931649208068848, "rewards/rejected": -19.634368896484375, "step": 31 }, { "epoch": 0.04771668219944082, "grad_norm": 53.2747982513516, "learning_rate": 9.411764705882352e-07, "logits/chosen": 1.9022067785263062, "logits/rejected": 2.0756680965423584, "logps/chosen": -0.7626714706420898, "logps/rejected": -1.3213756084442139, "loss": 3.8545, "nll_loss": 0.7626714706420898, "rewards/accuracies": 1.0, "rewards/chosen": -7.62671422958374, "rewards/margins": 5.587039947509766, "rewards/rejected": -13.213754653930664, "step": 32 }, { "epoch": 0.04920782851817335, "grad_norm": 123.13299849078159, "learning_rate": 9.705882352941176e-07, "logits/chosen": 2.2254092693328857, "logits/rejected": 2.159079074859619, "logps/chosen": -0.6334936618804932, "logps/rejected": -1.3932712078094482, "loss": 2.7757, "nll_loss": 0.6334936022758484, "rewards/accuracies": 0.875, "rewards/chosen": -6.334936618804932, "rewards/margins": 7.597774505615234, "rewards/rejected": -13.932710647583008, "step": 33 }, { "epoch": 0.05069897483690587, "grad_norm": 514.3589700810505, "learning_rate": 1e-06, "logits/chosen": 2.442638874053955, "logits/rejected": 2.389988422393799, "logps/chosen": -0.9179913997650146, "logps/rejected": -2.180492877960205, "loss": 5.3914, "nll_loss": 0.9179913997650146, "rewards/accuracies": 0.625, "rewards/chosen": -9.179913520812988, "rewards/margins": 12.625017166137695, "rewards/rejected": -21.804927825927734, "step": 34 }, { "epoch": 0.0521901211556384, "grad_norm": 351.36582028421606, "learning_rate": 9.999939000729715e-07, "logits/chosen": 3.020676374435425, "logits/rejected": 2.6687097549438477, "logps/chosen": -1.1960551738739014, "logps/rejected": -1.4366477727890015, "loss": 5.0978, "nll_loss": 1.1960551738739014, "rewards/accuracies": 0.25, "rewards/chosen": -11.960551261901855, "rewards/margins": 2.4059267044067383, "rewards/rejected": -14.366477966308594, "step": 35 }, { "epoch": 0.05368126747437092, "grad_norm": 33.27397343776287, "learning_rate": 9.999756004407228e-07, "logits/chosen": 2.4607324600219727, "logits/rejected": 2.1372451782226562, "logps/chosen": -0.7419140338897705, "logps/rejected": -1.294709324836731, "loss": 2.6714, "nll_loss": 0.7419140338897705, "rewards/accuracies": 0.625, "rewards/chosen": -7.419139862060547, "rewards/margins": 5.527953147888184, "rewards/rejected": -12.94709300994873, "step": 36 }, { "epoch": 0.05517241379310345, "grad_norm": 481.47428057363646, "learning_rate": 9.999451015497595e-07, "logits/chosen": 1.1678117513656616, "logits/rejected": 1.1830850839614868, "logps/chosen": -0.8966320753097534, "logps/rejected": -1.7006372213363647, "loss": 7.4877, "nll_loss": 0.8966320157051086, "rewards/accuracies": 0.75, "rewards/chosen": -8.966320991516113, "rewards/margins": 8.04005241394043, "rewards/rejected": -17.00637435913086, "step": 37 }, { "epoch": 0.05666356011183597, "grad_norm": 93.58445404338936, "learning_rate": 9.999024041442455e-07, "logits/chosen": 2.5838398933410645, "logits/rejected": 2.9061648845672607, "logps/chosen": -1.2142046689987183, "logps/rejected": -1.276504397392273, "loss": 4.1691, "nll_loss": 1.2142047882080078, "rewards/accuracies": 0.625, "rewards/chosen": -12.142046928405762, "rewards/margins": 0.6229971051216125, "rewards/rejected": -12.765044212341309, "step": 38 }, { "epoch": 0.0581547064305685, "grad_norm": 40.82279691083783, "learning_rate": 9.998475092659849e-07, "logits/chosen": 1.7935913801193237, "logits/rejected": 2.12984299659729, "logps/chosen": -0.8291258215904236, "logps/rejected": -1.4954478740692139, "loss": 2.2373, "nll_loss": 0.8291257619857788, "rewards/accuracies": 0.875, "rewards/chosen": -8.291257858276367, "rewards/margins": 6.66322135925293, "rewards/rejected": -14.954479217529297, "step": 39 }, { "epoch": 0.05964585274930102, "grad_norm": 53.81500379033792, "learning_rate": 9.99780418254397e-07, "logits/chosen": 1.685670018196106, "logits/rejected": 1.92978036403656, "logps/chosen": -0.5348352789878845, "logps/rejected": -0.830666720867157, "loss": 3.8797, "nll_loss": 0.5348352789878845, "rewards/accuracies": 1.0, "rewards/chosen": -5.348352909088135, "rewards/margins": 2.9583141803741455, "rewards/rejected": -8.30666732788086, "step": 40 }, { "epoch": 0.06113699906803355, "grad_norm": 32.317751201915385, "learning_rate": 9.99701132746483e-07, "logits/chosen": 1.0136024951934814, "logits/rejected": 1.234411597251892, "logps/chosen": -0.6656249761581421, "logps/rejected": -1.2858957052230835, "loss": 2.9056, "nll_loss": 0.6656249165534973, "rewards/accuracies": 0.75, "rewards/chosen": -6.656249046325684, "rewards/margins": 6.202707290649414, "rewards/rejected": -12.858956336975098, "step": 41 }, { "epoch": 0.06262814538676607, "grad_norm": 37.920752742703854, "learning_rate": 9.996096546767859e-07, "logits/chosen": 1.643129825592041, "logits/rejected": 1.9104418754577637, "logps/chosen": -0.8929121494293213, "logps/rejected": -3.583488702774048, "loss": 2.6743, "nll_loss": 0.8929121494293213, "rewards/accuracies": 1.0, "rewards/chosen": -8.929121017456055, "rewards/margins": 26.905765533447266, "rewards/rejected": -35.83489227294922, "step": 42 }, { "epoch": 0.0641192917054986, "grad_norm": 220.01417151470528, "learning_rate": 9.995059862773438e-07, "logits/chosen": 2.98364520072937, "logits/rejected": 3.2048516273498535, "logps/chosen": -0.9894806146621704, "logps/rejected": -1.338758945465088, "loss": 4.432, "nll_loss": 0.9894806742668152, "rewards/accuracies": 0.625, "rewards/chosen": -9.894806861877441, "rewards/margins": 3.4927821159362793, "rewards/rejected": -13.387588500976562, "step": 43 }, { "epoch": 0.06561043802423112, "grad_norm": 129.84779630155947, "learning_rate": 9.993901300776358e-07, "logits/chosen": 2.7791249752044678, "logits/rejected": 2.397531509399414, "logps/chosen": -0.9826548099517822, "logps/rejected": -1.3197441101074219, "loss": 3.2615, "nll_loss": 0.982654869556427, "rewards/accuracies": 0.75, "rewards/chosen": -9.826549530029297, "rewards/margins": 3.3708925247192383, "rewards/rejected": -13.197441101074219, "step": 44 }, { "epoch": 0.06710158434296365, "grad_norm": 39.440699831745064, "learning_rate": 9.99262088904519e-07, "logits/chosen": 2.2303521633148193, "logits/rejected": 2.4076573848724365, "logps/chosen": -0.8233699202537537, "logps/rejected": -2.383819818496704, "loss": 2.8246, "nll_loss": 0.8233699202537537, "rewards/accuracies": 1.0, "rewards/chosen": -8.233698844909668, "rewards/margins": 15.604498863220215, "rewards/rejected": -23.838197708129883, "step": 45 }, { "epoch": 0.06859273066169617, "grad_norm": 61.5149468898277, "learning_rate": 9.991218658821608e-07, "logits/chosen": 1.7433161735534668, "logits/rejected": 1.4342732429504395, "logps/chosen": -0.9544675946235657, "logps/rejected": -1.520774245262146, "loss": 3.262, "nll_loss": 0.9544676542282104, "rewards/accuracies": 0.375, "rewards/chosen": -9.544675827026367, "rewards/margins": 5.6630659103393555, "rewards/rejected": -15.207741737365723, "step": 46 }, { "epoch": 0.0700838769804287, "grad_norm": 46.130144503376954, "learning_rate": 9.989694644319617e-07, "logits/chosen": 2.038182497024536, "logits/rejected": 1.2625888586044312, "logps/chosen": -1.2647312879562378, "logps/rejected": -4.061408042907715, "loss": 3.1697, "nll_loss": 1.2647314071655273, "rewards/accuracies": 0.625, "rewards/chosen": -12.64731216430664, "rewards/margins": 27.96677017211914, "rewards/rejected": -40.61408996582031, "step": 47 }, { "epoch": 0.07157502329916123, "grad_norm": 59.34170299379562, "learning_rate": 9.988048882724732e-07, "logits/chosen": 2.8762378692626953, "logits/rejected": 2.991826295852661, "logps/chosen": -0.9993945360183716, "logps/rejected": -1.4654295444488525, "loss": 3.4738, "nll_loss": 0.999394416809082, "rewards/accuracies": 0.75, "rewards/chosen": -9.993945121765137, "rewards/margins": 4.6603498458862305, "rewards/rejected": -14.654294967651367, "step": 48 }, { "epoch": 0.07306616961789375, "grad_norm": 102.94683203629245, "learning_rate": 9.98628141419305e-07, "logits/chosen": 2.6454639434814453, "logits/rejected": 2.2062056064605713, "logps/chosen": -0.9753769636154175, "logps/rejected": -1.7307155132293701, "loss": 3.7492, "nll_loss": 0.9753769636154175, "rewards/accuracies": 0.75, "rewards/chosen": -9.753769874572754, "rewards/margins": 7.553386211395264, "rewards/rejected": -17.30715560913086, "step": 49 }, { "epoch": 0.07455731593662628, "grad_norm": 100.46810904952117, "learning_rate": 9.98439228185029e-07, "logits/chosen": 1.7063933610916138, "logits/rejected": 2.1597023010253906, "logps/chosen": -0.791731595993042, "logps/rejected": -1.4047749042510986, "loss": 2.9333, "nll_loss": 0.7917314767837524, "rewards/accuracies": 0.75, "rewards/chosen": -7.917316436767578, "rewards/margins": 6.13043212890625, "rewards/rejected": -14.047748565673828, "step": 50 }, { "epoch": 0.0760484622553588, "grad_norm": 47.125555335318396, "learning_rate": 9.982381531790732e-07, "logits/chosen": 2.3178887367248535, "logits/rejected": 2.353886842727661, "logps/chosen": -0.9494035840034485, "logps/rejected": -3.0622718334198, "loss": 2.6833, "nll_loss": 0.9494035840034485, "rewards/accuracies": 0.625, "rewards/chosen": -9.494034767150879, "rewards/margins": 21.12868309020996, "rewards/rejected": -30.622718811035156, "step": 51 }, { "epoch": 0.07753960857409133, "grad_norm": 27.564523433846276, "learning_rate": 9.980249213076084e-07, "logits/chosen": 2.2474324703216553, "logits/rejected": 2.361985206604004, "logps/chosen": -1.0604387521743774, "logps/rejected": -1.6388243436813354, "loss": 2.8513, "nll_loss": 1.0604385137557983, "rewards/accuracies": 0.875, "rewards/chosen": -10.604386329650879, "rewards/margins": 5.783858776092529, "rewards/rejected": -16.38824462890625, "step": 52 }, { "epoch": 0.07903075489282387, "grad_norm": 47.86126354521992, "learning_rate": 9.977995377734306e-07, "logits/chosen": 2.8253977298736572, "logits/rejected": 2.822557210922241, "logps/chosen": -0.9413918852806091, "logps/rejected": -1.1508779525756836, "loss": 3.4441, "nll_loss": 0.9413918256759644, "rewards/accuracies": 0.75, "rewards/chosen": -9.413918495178223, "rewards/margins": 2.0948617458343506, "rewards/rejected": -11.508780479431152, "step": 53 }, { "epoch": 0.08052190121155639, "grad_norm": 75.25519513038266, "learning_rate": 9.97562008075832e-07, "logits/chosen": 2.55771541595459, "logits/rejected": 2.859192371368408, "logps/chosen": -1.1559171676635742, "logps/rejected": -1.4593706130981445, "loss": 2.9522, "nll_loss": 1.1559171676635742, "rewards/accuracies": 0.875, "rewards/chosen": -11.559171676635742, "rewards/margins": 3.034536361694336, "rewards/rejected": -14.593707084655762, "step": 54 }, { "epoch": 0.08201304753028892, "grad_norm": 198.24815027946113, "learning_rate": 9.97312338010468e-07, "logits/chosen": 1.073845386505127, "logits/rejected": 0.7879983186721802, "logps/chosen": -0.825156033039093, "logps/rejected": -1.5017766952514648, "loss": 3.7043, "nll_loss": 0.8251559734344482, "rewards/accuracies": 0.875, "rewards/chosen": -8.25156021118164, "rewards/margins": 6.766207695007324, "rewards/rejected": -15.017766952514648, "step": 55 }, { "epoch": 0.08350419384902144, "grad_norm": 104.23207197532834, "learning_rate": 9.970505336692153e-07, "logits/chosen": 1.5102717876434326, "logits/rejected": 1.6282591819763184, "logps/chosen": -0.8507825136184692, "logps/rejected": -1.504204273223877, "loss": 2.2505, "nll_loss": 0.850782573223114, "rewards/accuracies": 0.875, "rewards/chosen": -8.50782585144043, "rewards/margins": 6.534218788146973, "rewards/rejected": -15.042045593261719, "step": 56 }, { "epoch": 0.08499534016775397, "grad_norm": 71.0325316202818, "learning_rate": 9.96776601440023e-07, "logits/chosen": 1.9039965867996216, "logits/rejected": 2.3082637786865234, "logps/chosen": -1.021749496459961, "logps/rejected": -1.833884596824646, "loss": 2.796, "nll_loss": 1.021749496459961, "rewards/accuracies": 0.875, "rewards/chosen": -10.21749496459961, "rewards/margins": 8.121349334716797, "rewards/rejected": -18.338844299316406, "step": 57 }, { "epoch": 0.08648648648648649, "grad_norm": 36.642253230458486, "learning_rate": 9.964905480067584e-07, "logits/chosen": 1.909174919128418, "logits/rejected": 2.0987725257873535, "logps/chosen": -0.8566492199897766, "logps/rejected": -1.418030023574829, "loss": 3.3146, "nll_loss": 0.8566492795944214, "rewards/accuracies": 0.75, "rewards/chosen": -8.566493034362793, "rewards/margins": 5.6138081550598145, "rewards/rejected": -14.18030071258545, "step": 58 }, { "epoch": 0.08797763280521902, "grad_norm": 61.18925563704483, "learning_rate": 9.96192380349041e-07, "logits/chosen": 0.8996385335922241, "logits/rejected": 0.7567811012268066, "logps/chosen": -0.71656334400177, "logps/rejected": -2.003258466720581, "loss": 2.4045, "nll_loss": 0.71656334400177, "rewards/accuracies": 0.875, "rewards/chosen": -7.165633201599121, "rewards/margins": 12.866951942443848, "rewards/rejected": -20.03258514404297, "step": 59 }, { "epoch": 0.08946877912395154, "grad_norm": 71.5541363297922, "learning_rate": 9.958821057420752e-07, "logits/chosen": 3.4364728927612305, "logits/rejected": 2.817261219024658, "logps/chosen": -1.2784249782562256, "logps/rejected": -1.459514856338501, "loss": 3.4693, "nll_loss": 1.2784249782562256, "rewards/accuracies": 0.625, "rewards/chosen": -12.784249305725098, "rewards/margins": 1.8109009265899658, "rewards/rejected": -14.595149993896484, "step": 60 }, { "epoch": 0.09095992544268407, "grad_norm": 39.528470047855876, "learning_rate": 9.955597317564703e-07, "logits/chosen": 2.21244215965271, "logits/rejected": 2.1904029846191406, "logps/chosen": -0.8074604868888855, "logps/rejected": -1.7122533321380615, "loss": 3.0047, "nll_loss": 0.8074605464935303, "rewards/accuracies": 0.875, "rewards/chosen": -8.074604034423828, "rewards/margins": 9.047929763793945, "rewards/rejected": -17.122533798217773, "step": 61 }, { "epoch": 0.09245107176141659, "grad_norm": 78.73411061437659, "learning_rate": 9.952252662580579e-07, "logits/chosen": 2.613969326019287, "logits/rejected": 2.271270513534546, "logps/chosen": -0.8780649900436401, "logps/rejected": -1.5495920181274414, "loss": 2.8028, "nll_loss": 0.8780649900436401, "rewards/accuracies": 0.75, "rewards/chosen": -8.780649185180664, "rewards/margins": 6.715270519256592, "rewards/rejected": -15.49592113494873, "step": 62 }, { "epoch": 0.09394221808014912, "grad_norm": 76.89341482591242, "learning_rate": 9.948787174076981e-07, "logits/chosen": 2.1724722385406494, "logits/rejected": 2.4555037021636963, "logps/chosen": -1.1089688539505005, "logps/rejected": -1.4283640384674072, "loss": 2.669, "nll_loss": 1.1089688539505005, "rewards/accuracies": 0.625, "rewards/chosen": -11.08968734741211, "rewards/margins": 3.193953037261963, "rewards/rejected": -14.28364086151123, "step": 63 }, { "epoch": 0.09543336439888164, "grad_norm": 43.861657538915026, "learning_rate": 9.94520093661082e-07, "logits/chosen": 1.7243608236312866, "logits/rejected": 2.012627601623535, "logps/chosen": -1.3577117919921875, "logps/rejected": -1.9268991947174072, "loss": 3.2175, "nll_loss": 1.3577117919921875, "rewards/accuracies": 0.875, "rewards/chosen": -13.577116966247559, "rewards/margins": 5.691873550415039, "rewards/rejected": -19.268991470336914, "step": 64 }, { "epoch": 0.09692451071761417, "grad_norm": 87.80753078470336, "learning_rate": 9.941494037685243e-07, "logits/chosen": 1.7657297849655151, "logits/rejected": 1.7299094200134277, "logps/chosen": -0.83369380235672, "logps/rejected": -1.3283755779266357, "loss": 3.6815, "nll_loss": 0.83369380235672, "rewards/accuracies": 0.875, "rewards/chosen": -8.336938858032227, "rewards/margins": 4.946816444396973, "rewards/rejected": -13.283754348754883, "step": 65 }, { "epoch": 0.0984156570363467, "grad_norm": 37.44360560489009, "learning_rate": 9.9376665677475e-07, "logits/chosen": 2.8668999671936035, "logits/rejected": 2.786085844039917, "logps/chosen": -0.8269742131233215, "logps/rejected": -1.4122259616851807, "loss": 3.2433, "nll_loss": 0.8269742131233215, "rewards/accuracies": 0.75, "rewards/chosen": -8.269742965698242, "rewards/margins": 5.852518081665039, "rewards/rejected": -14.122259140014648, "step": 66 }, { "epoch": 0.09990680335507922, "grad_norm": 75.06063398922664, "learning_rate": 9.933718620186744e-07, "logits/chosen": 2.2985568046569824, "logits/rejected": 2.2154862880706787, "logps/chosen": -0.9047732949256897, "logps/rejected": -2.214324712753296, "loss": 2.4105, "nll_loss": 0.9047732353210449, "rewards/accuracies": 0.875, "rewards/chosen": -9.04773235321045, "rewards/margins": 13.095513343811035, "rewards/rejected": -22.143245697021484, "step": 67 }, { "epoch": 0.10139794967381174, "grad_norm": 40.87038075692728, "learning_rate": 9.929650291331739e-07, "logits/chosen": 2.396892547607422, "logits/rejected": 2.3966078758239746, "logps/chosen": -0.9977419376373291, "logps/rejected": -3.083773374557495, "loss": 2.5921, "nll_loss": 0.9977419376373291, "rewards/accuracies": 0.75, "rewards/chosen": -9.977418899536133, "rewards/margins": 20.86031723022461, "rewards/rejected": -30.837732315063477, "step": 68 }, { "epoch": 0.10288909599254427, "grad_norm": 65.73975701903078, "learning_rate": 9.925461680448525e-07, "logits/chosen": 2.516754150390625, "logits/rejected": 2.5938100814819336, "logps/chosen": -0.8624992370605469, "logps/rejected": -1.3350712060928345, "loss": 2.9874, "nll_loss": 0.8624992966651917, "rewards/accuracies": 0.875, "rewards/chosen": -8.624992370605469, "rewards/margins": 4.725719451904297, "rewards/rejected": -13.350711822509766, "step": 69 }, { "epoch": 0.1043802423112768, "grad_norm": 40.804175507450815, "learning_rate": 9.921152889737984e-07, "logits/chosen": 2.944434404373169, "logits/rejected": 3.123687744140625, "logps/chosen": -1.1898823976516724, "logps/rejected": -2.2927870750427246, "loss": 2.6063, "nll_loss": 1.1898823976516724, "rewards/accuracies": 0.875, "rewards/chosen": -11.898823738098145, "rewards/margins": 11.029047966003418, "rewards/rejected": -22.927871704101562, "step": 70 }, { "epoch": 0.10587138863000932, "grad_norm": 43.42248899213977, "learning_rate": 9.916724024333349e-07, "logits/chosen": 1.6870605945587158, "logits/rejected": 2.1503679752349854, "logps/chosen": -0.8132368326187134, "logps/rejected": -2.27059268951416, "loss": 2.3886, "nll_loss": 0.8132367730140686, "rewards/accuracies": 0.75, "rewards/chosen": -8.132368087768555, "rewards/margins": 14.57356071472168, "rewards/rejected": -22.705928802490234, "step": 71 }, { "epoch": 0.10736253494874184, "grad_norm": 43.24943223571458, "learning_rate": 9.912175192297647e-07, "logits/chosen": 2.581845283508301, "logits/rejected": 2.648552894592285, "logps/chosen": -1.249975562095642, "logps/rejected": -3.19691801071167, "loss": 3.2368, "nll_loss": 1.249975562095642, "rewards/accuracies": 0.75, "rewards/chosen": -12.499755859375, "rewards/margins": 19.469423294067383, "rewards/rejected": -31.969181060791016, "step": 72 }, { "epoch": 0.10885368126747437, "grad_norm": 86.67815176112049, "learning_rate": 9.90750650462105e-07, "logits/chosen": 2.0326600074768066, "logits/rejected": 1.7139393091201782, "logps/chosen": -0.8729643821716309, "logps/rejected": -2.466681718826294, "loss": 3.1837, "nll_loss": 0.8729644417762756, "rewards/accuracies": 0.75, "rewards/chosen": -8.729644775390625, "rewards/margins": 15.937172889709473, "rewards/rejected": -24.66681671142578, "step": 73 }, { "epoch": 0.1103448275862069, "grad_norm": 85.54815320638102, "learning_rate": 9.902718075218176e-07, "logits/chosen": 2.492297649383545, "logits/rejected": 2.0416009426116943, "logps/chosen": -0.8708876371383667, "logps/rejected": -3.123574733734131, "loss": 2.7176, "nll_loss": 0.8708876371383667, "rewards/accuracies": 1.0, "rewards/chosen": -8.708876609802246, "rewards/margins": 22.526870727539062, "rewards/rejected": -31.235746383666992, "step": 74 }, { "epoch": 0.11183597390493942, "grad_norm": 175.12315126357532, "learning_rate": 9.8978100209253e-07, "logits/chosen": 2.0305941104888916, "logits/rejected": 2.296379566192627, "logps/chosen": -0.9669104814529419, "logps/rejected": -2.073458194732666, "loss": 3.1995, "nll_loss": 0.9669104814529419, "rewards/accuracies": 1.0, "rewards/chosen": -9.669103622436523, "rewards/margins": 11.065479278564453, "rewards/rejected": -20.73458480834961, "step": 75 }, { "epoch": 0.11332712022367195, "grad_norm": 46.256680217091585, "learning_rate": 9.89278246149752e-07, "logits/chosen": 3.132906913757324, "logits/rejected": 3.1125741004943848, "logps/chosen": -0.998193621635437, "logps/rejected": -1.794511079788208, "loss": 2.4239, "nll_loss": 0.9981937408447266, "rewards/accuracies": 0.75, "rewards/chosen": -9.981935501098633, "rewards/margins": 7.963172912597656, "rewards/rejected": -17.945110321044922, "step": 76 }, { "epoch": 0.11481826654240447, "grad_norm": 96.89440127224765, "learning_rate": 9.887635519605815e-07, "logits/chosen": 1.6895731687545776, "logits/rejected": 2.142848014831543, "logps/chosen": -1.096168875694275, "logps/rejected": -1.9435949325561523, "loss": 3.3364, "nll_loss": 1.096168875694275, "rewards/accuracies": 0.875, "rewards/chosen": -10.961688995361328, "rewards/margins": 8.474259376525879, "rewards/rejected": -19.435949325561523, "step": 77 }, { "epoch": 0.116309412861137, "grad_norm": 44.44278994953829, "learning_rate": 9.882369320834068e-07, "logits/chosen": 2.2599291801452637, "logits/rejected": 1.5205597877502441, "logps/chosen": -0.9418594837188721, "logps/rejected": -1.2209088802337646, "loss": 2.9725, "nll_loss": 0.9418594837188721, "rewards/accuracies": 0.875, "rewards/chosen": -9.418594360351562, "rewards/margins": 2.7904937267303467, "rewards/rejected": -12.209087371826172, "step": 78 }, { "epoch": 0.11780055917986952, "grad_norm": 107.08151395089075, "learning_rate": 9.876983993675989e-07, "logits/chosen": 2.659163236618042, "logits/rejected": 2.7966227531433105, "logps/chosen": -0.8889027833938599, "logps/rejected": -1.249839425086975, "loss": 3.018, "nll_loss": 0.8889028429985046, "rewards/accuracies": 0.625, "rewards/chosen": -8.88902759552002, "rewards/margins": 3.609365940093994, "rewards/rejected": -12.498394966125488, "step": 79 }, { "epoch": 0.11929170549860205, "grad_norm": 22.34809477010669, "learning_rate": 9.871479669531988e-07, "logits/chosen": 2.099395513534546, "logits/rejected": 2.030531406402588, "logps/chosen": -0.9590896368026733, "logps/rejected": -1.314590334892273, "loss": 2.559, "nll_loss": 0.9590896368026733, "rewards/accuracies": 0.75, "rewards/chosen": -9.590896606445312, "rewards/margins": 3.555006742477417, "rewards/rejected": -13.145903587341309, "step": 80 }, { "epoch": 0.12078285181733457, "grad_norm": 50.15274973664094, "learning_rate": 9.865856482705972e-07, "logits/chosen": 3.312157154083252, "logits/rejected": 3.159273386001587, "logps/chosen": -1.3754456043243408, "logps/rejected": -2.9079883098602295, "loss": 2.8137, "nll_loss": 1.3754454851150513, "rewards/accuracies": 0.875, "rewards/chosen": -13.75445556640625, "rewards/margins": 15.325429916381836, "rewards/rejected": -29.079885482788086, "step": 81 }, { "epoch": 0.1222739981360671, "grad_norm": 43.46345081289328, "learning_rate": 9.860114570402054e-07, "logits/chosen": 2.0022189617156982, "logits/rejected": 1.6277714967727661, "logps/chosen": -1.0034466981887817, "logps/rejected": -2.133042335510254, "loss": 3.1481, "nll_loss": 1.0034466981887817, "rewards/accuracies": 0.75, "rewards/chosen": -10.034467697143555, "rewards/margins": 11.295957565307617, "rewards/rejected": -21.330425262451172, "step": 82 }, { "epoch": 0.12376514445479962, "grad_norm": 58.38128560109899, "learning_rate": 9.85425407272122e-07, "logits/chosen": 2.490449905395508, "logits/rejected": 2.7972893714904785, "logps/chosen": -1.2935540676116943, "logps/rejected": -2.6080098152160645, "loss": 2.5246, "nll_loss": 1.2935540676116943, "rewards/accuracies": 0.75, "rewards/chosen": -12.935540199279785, "rewards/margins": 13.14455795288086, "rewards/rejected": -26.08009910583496, "step": 83 }, { "epoch": 0.12525629077353215, "grad_norm": 178.3311223173836, "learning_rate": 9.8482751326579e-07, "logits/chosen": 1.821961760520935, "logits/rejected": 2.0110504627227783, "logps/chosen": -0.774723470211029, "logps/rejected": -2.1148810386657715, "loss": 2.3433, "nll_loss": 0.774723470211029, "rewards/accuracies": 0.875, "rewards/chosen": -7.747235298156738, "rewards/margins": 13.401576042175293, "rewards/rejected": -21.1488094329834, "step": 84 }, { "epoch": 0.1267474370922647, "grad_norm": 51.77425803563619, "learning_rate": 9.842177896096493e-07, "logits/chosen": 2.007399082183838, "logits/rejected": 2.0587520599365234, "logps/chosen": -0.9214052557945251, "logps/rejected": -2.382662534713745, "loss": 2.3806, "nll_loss": 0.9214051961898804, "rewards/accuracies": 0.75, "rewards/chosen": -9.214052200317383, "rewards/margins": 14.612571716308594, "rewards/rejected": -23.82662582397461, "step": 85 }, { "epoch": 0.1282385834109972, "grad_norm": 136.05280033355075, "learning_rate": 9.835962511807785e-07, "logits/chosen": 2.5353775024414062, "logits/rejected": 2.7205231189727783, "logps/chosen": -0.8614702224731445, "logps/rejected": -1.641845703125, "loss": 3.0362, "nll_loss": 0.8614702820777893, "rewards/accuracies": 0.875, "rewards/chosen": -8.614703178405762, "rewards/margins": 7.8037543296813965, "rewards/rejected": -16.41845703125, "step": 86 }, { "epoch": 0.12972972972972974, "grad_norm": 30.05683499780679, "learning_rate": 9.82962913144534e-07, "logits/chosen": 2.634047746658325, "logits/rejected": 2.6122305393218994, "logps/chosen": -0.9235073328018188, "logps/rejected": -1.4619425535202026, "loss": 2.434, "nll_loss": 0.9235073328018188, "rewards/accuracies": 0.75, "rewards/chosen": -9.235074043273926, "rewards/margins": 5.384352684020996, "rewards/rejected": -14.619426727294922, "step": 87 }, { "epoch": 0.13122087604846225, "grad_norm": 50.14650415819595, "learning_rate": 9.823177909541793e-07, "logits/chosen": 2.015709638595581, "logits/rejected": 1.9986441135406494, "logps/chosen": -0.8479939699172974, "logps/rejected": -1.2945144176483154, "loss": 2.7979, "nll_loss": 0.8479939699172974, "rewards/accuracies": 0.625, "rewards/chosen": -8.479939460754395, "rewards/margins": 4.46520471572876, "rewards/rejected": -12.945144653320312, "step": 88 }, { "epoch": 0.1327120223671948, "grad_norm": 52.862158002098944, "learning_rate": 9.816609003505072e-07, "logits/chosen": 3.347060203552246, "logits/rejected": 3.233002185821533, "logps/chosen": -1.1397117376327515, "logps/rejected": -1.4866154193878174, "loss": 2.6573, "nll_loss": 1.1397117376327515, "rewards/accuracies": 0.625, "rewards/chosen": -11.397117614746094, "rewards/margins": 3.4690380096435547, "rewards/rejected": -14.866154670715332, "step": 89 }, { "epoch": 0.1342031686859273, "grad_norm": 48.28420889494256, "learning_rate": 9.809922573614569e-07, "logits/chosen": 2.897589921951294, "logits/rejected": 2.594212532043457, "logps/chosen": -1.155824899673462, "logps/rejected": -1.7766919136047363, "loss": 2.6429, "nll_loss": 1.1558247804641724, "rewards/accuracies": 1.0, "rewards/chosen": -11.558248519897461, "rewards/margins": 6.208670616149902, "rewards/rejected": -17.76692008972168, "step": 90 }, { "epoch": 0.13569431500465984, "grad_norm": 44.09837920772329, "learning_rate": 9.80311878301722e-07, "logits/chosen": 2.5982978343963623, "logits/rejected": 2.4047904014587402, "logps/chosen": -1.1400195360183716, "logps/rejected": -1.6405586004257202, "loss": 2.7758, "nll_loss": 1.1400195360183716, "rewards/accuracies": 0.75, "rewards/chosen": -11.400195121765137, "rewards/margins": 5.005392074584961, "rewards/rejected": -16.40558624267578, "step": 91 }, { "epoch": 0.13718546132339235, "grad_norm": 35.373221853287035, "learning_rate": 9.796197797723532e-07, "logits/chosen": 1.8639134168624878, "logits/rejected": 1.8991543054580688, "logps/chosen": -0.9351028203964233, "logps/rejected": -2.294522523880005, "loss": 2.4379, "nll_loss": 0.9351028800010681, "rewards/accuracies": 0.625, "rewards/chosen": -9.351028442382812, "rewards/margins": 13.594196319580078, "rewards/rejected": -22.94522476196289, "step": 92 }, { "epoch": 0.1386766076421249, "grad_norm": 139.38985986452, "learning_rate": 9.789159786603522e-07, "logits/chosen": 1.8421365022659302, "logits/rejected": 2.1121838092803955, "logps/chosen": -0.7293067574501038, "logps/rejected": -1.8387272357940674, "loss": 3.2568, "nll_loss": 0.729306697845459, "rewards/accuracies": 0.875, "rewards/chosen": -7.293067932128906, "rewards/margins": 11.094205856323242, "rewards/rejected": -18.387271881103516, "step": 93 }, { "epoch": 0.1401677539608574, "grad_norm": 66.73824264608913, "learning_rate": 9.78200492138261e-07, "logits/chosen": 2.5073771476745605, "logits/rejected": 2.620734453201294, "logps/chosen": -1.0046569108963013, "logps/rejected": -2.5618834495544434, "loss": 2.5804, "nll_loss": 1.0046569108963013, "rewards/accuracies": 0.875, "rewards/chosen": -10.046568870544434, "rewards/margins": 15.57226848602295, "rewards/rejected": -25.61883544921875, "step": 94 }, { "epoch": 0.14165890027958994, "grad_norm": 60.19100070878113, "learning_rate": 9.774733376637421e-07, "logits/chosen": 2.4931178092956543, "logits/rejected": 2.3655691146850586, "logps/chosen": -0.9876638054847717, "logps/rejected": -1.5048593282699585, "loss": 3.3979, "nll_loss": 0.9876636266708374, "rewards/accuracies": 0.75, "rewards/chosen": -9.87663745880127, "rewards/margins": 5.1719560623168945, "rewards/rejected": -15.048593521118164, "step": 95 }, { "epoch": 0.14315004659832245, "grad_norm": 93.58142504504913, "learning_rate": 9.76734532979152e-07, "logits/chosen": 1.8486552238464355, "logits/rejected": 2.0405972003936768, "logps/chosen": -0.7252814769744873, "logps/rejected": -1.593367099761963, "loss": 1.4345, "nll_loss": 0.7252814769744873, "rewards/accuracies": 1.0, "rewards/chosen": -7.252814292907715, "rewards/margins": 8.680856704711914, "rewards/rejected": -15.933670997619629, "step": 96 }, { "epoch": 0.144641192917055, "grad_norm": 103.20755248724876, "learning_rate": 9.759840961111097e-07, "logits/chosen": 2.693235397338867, "logits/rejected": 2.6906933784484863, "logps/chosen": -0.8279076218605042, "logps/rejected": -1.965848445892334, "loss": 1.9372, "nll_loss": 0.8279076218605042, "rewards/accuracies": 1.0, "rewards/chosen": -8.27907657623291, "rewards/margins": 11.37940788269043, "rewards/rejected": -19.658483505249023, "step": 97 }, { "epoch": 0.1461323392357875, "grad_norm": 88.41840643452079, "learning_rate": 9.752220453700554e-07, "logits/chosen": 2.36478328704834, "logits/rejected": 2.2650041580200195, "logps/chosen": -0.9894752502441406, "logps/rejected": -1.866193413734436, "loss": 2.8837, "nll_loss": 0.9894753694534302, "rewards/accuracies": 1.0, "rewards/chosen": -9.894752502441406, "rewards/margins": 8.767179489135742, "rewards/rejected": -18.66193199157715, "step": 98 }, { "epoch": 0.14762348555452004, "grad_norm": 86.18987498941965, "learning_rate": 9.744483993498052e-07, "logits/chosen": 1.8514745235443115, "logits/rejected": 2.3874871730804443, "logps/chosen": -0.9877501726150513, "logps/rejected": -2.1680359840393066, "loss": 2.5626, "nll_loss": 0.987750232219696, "rewards/accuracies": 0.875, "rewards/chosen": -9.87750244140625, "rewards/margins": 11.802858352661133, "rewards/rejected": -21.68035888671875, "step": 99 }, { "epoch": 0.14911463187325255, "grad_norm": 63.73675084474543, "learning_rate": 9.736631769270957e-07, "logits/chosen": 2.287912368774414, "logits/rejected": 2.1225051879882812, "logps/chosen": -0.9378949403762817, "logps/rejected": -1.3148002624511719, "loss": 2.2336, "nll_loss": 0.9378949403762817, "rewards/accuracies": 0.625, "rewards/chosen": -9.378949165344238, "rewards/margins": 3.769054412841797, "rewards/rejected": -13.148003578186035, "step": 100 }, { "epoch": 0.1506057781919851, "grad_norm": 106.232957474972, "learning_rate": 9.72866397261125e-07, "logits/chosen": 1.9837908744812012, "logits/rejected": 2.2438063621520996, "logps/chosen": -0.9606898427009583, "logps/rejected": -1.9379208087921143, "loss": 2.9115, "nll_loss": 0.960689902305603, "rewards/accuracies": 1.0, "rewards/chosen": -9.60689926147461, "rewards/margins": 9.772309303283691, "rewards/rejected": -19.37920570373535, "step": 101 }, { "epoch": 0.1520969245107176, "grad_norm": 38.68435341536512, "learning_rate": 9.720580797930844e-07, "logits/chosen": 2.5899598598480225, "logits/rejected": 2.651418685913086, "logps/chosen": -0.9334679841995239, "logps/rejected": -1.3885936737060547, "loss": 2.4316, "nll_loss": 0.9334679841995239, "rewards/accuracies": 0.75, "rewards/chosen": -9.334678649902344, "rewards/margins": 4.551258087158203, "rewards/rejected": -13.885936737060547, "step": 102 }, { "epoch": 0.15358807082945014, "grad_norm": 68.61885464106034, "learning_rate": 9.712382442456844e-07, "logits/chosen": 2.4430806636810303, "logits/rejected": 2.515192985534668, "logps/chosen": -1.4942792654037476, "logps/rejected": -1.9828426837921143, "loss": 2.8353, "nll_loss": 1.4942792654037476, "rewards/accuracies": 0.75, "rewards/chosen": -14.942790985107422, "rewards/margins": 4.885636329650879, "rewards/rejected": -19.828428268432617, "step": 103 }, { "epoch": 0.15507921714818265, "grad_norm": 33.861836411287534, "learning_rate": 9.704069106226727e-07, "logits/chosen": 0.8605862855911255, "logits/rejected": 0.6118173599243164, "logps/chosen": -0.7482974529266357, "logps/rejected": -1.6876130104064941, "loss": 2.861, "nll_loss": 0.7482973337173462, "rewards/accuracies": 0.875, "rewards/chosen": -7.482974052429199, "rewards/margins": 9.393156051635742, "rewards/rejected": -16.876129150390625, "step": 104 }, { "epoch": 0.1565703634669152, "grad_norm": 31.203010053391782, "learning_rate": 9.695640992083471e-07, "logits/chosen": 2.2685840129852295, "logits/rejected": 2.6703624725341797, "logps/chosen": -1.1785123348236084, "logps/rejected": -1.6574785709381104, "loss": 2.4841, "nll_loss": 1.1785123348236084, "rewards/accuracies": 0.875, "rewards/chosen": -11.785123825073242, "rewards/margins": 4.789661407470703, "rewards/rejected": -16.574787139892578, "step": 105 }, { "epoch": 0.15806150978564773, "grad_norm": 70.20320807885953, "learning_rate": 9.687098305670604e-07, "logits/chosen": 1.7209910154342651, "logits/rejected": 2.1076905727386475, "logps/chosen": -0.9404205083847046, "logps/rejected": -1.0514883995056152, "loss": 2.579, "nll_loss": 0.9404205083847046, "rewards/accuracies": 0.625, "rewards/chosen": -9.404204368591309, "rewards/margins": 1.1106791496276855, "rewards/rejected": -10.514884948730469, "step": 106 }, { "epoch": 0.15955265610438024, "grad_norm": 48.677672121591414, "learning_rate": 9.678441255427179e-07, "logits/chosen": 2.7841222286224365, "logits/rejected": 2.7015132904052734, "logps/chosen": -1.0275628566741943, "logps/rejected": -2.475385904312134, "loss": 2.7087, "nll_loss": 1.0275628566741943, "rewards/accuracies": 0.875, "rewards/chosen": -10.275627136230469, "rewards/margins": 14.478230476379395, "rewards/rejected": -24.753860473632812, "step": 107 }, { "epoch": 0.16104380242311278, "grad_norm": 36.30484378835504, "learning_rate": 9.669670052582693e-07, "logits/chosen": 0.9016488790512085, "logits/rejected": 0.7409514784812927, "logps/chosen": -0.7989850640296936, "logps/rejected": -2.0533151626586914, "loss": 2.3533, "nll_loss": 0.7989850640296936, "rewards/accuracies": 0.875, "rewards/chosen": -7.9898505210876465, "rewards/margins": 12.543301582336426, "rewards/rejected": -20.533153533935547, "step": 108 }, { "epoch": 0.1625349487418453, "grad_norm": 90.07954865659732, "learning_rate": 9.66078491115194e-07, "logits/chosen": 1.88589608669281, "logits/rejected": 1.5732561349868774, "logps/chosen": -1.0318375825881958, "logps/rejected": -1.31565523147583, "loss": 2.8364, "nll_loss": 1.0318375825881958, "rewards/accuracies": 0.75, "rewards/chosen": -10.318375587463379, "rewards/margins": 2.838176727294922, "rewards/rejected": -13.156554222106934, "step": 109 }, { "epoch": 0.16402609506057783, "grad_norm": 93.22012698020275, "learning_rate": 9.651786047929772e-07, "logits/chosen": 1.7846213579177856, "logits/rejected": 2.1781160831451416, "logps/chosen": -0.945646345615387, "logps/rejected": -1.7606041431427002, "loss": 2.0061, "nll_loss": 0.945646345615387, "rewards/accuracies": 1.0, "rewards/chosen": -9.456462860107422, "rewards/margins": 8.149579048156738, "rewards/rejected": -17.606042861938477, "step": 110 }, { "epoch": 0.16551724137931034, "grad_norm": 161.0227005909337, "learning_rate": 9.642673682485828e-07, "logits/chosen": 1.544893741607666, "logits/rejected": 1.7132470607757568, "logps/chosen": -0.7085322737693787, "logps/rejected": -2.0612924098968506, "loss": 3.0642, "nll_loss": 0.7085322737693787, "rewards/accuracies": 1.0, "rewards/chosen": -7.085322380065918, "rewards/margins": 13.52760124206543, "rewards/rejected": -20.612924575805664, "step": 111 }, { "epoch": 0.16700838769804288, "grad_norm": 86.38909626151015, "learning_rate": 9.633448037159166e-07, "logits/chosen": 1.8809916973114014, "logits/rejected": 1.7090436220169067, "logps/chosen": -0.9259803891181946, "logps/rejected": -2.25235915184021, "loss": 2.541, "nll_loss": 0.9259804487228394, "rewards/accuracies": 0.875, "rewards/chosen": -9.259803771972656, "rewards/margins": 13.263786315917969, "rewards/rejected": -22.523590087890625, "step": 112 }, { "epoch": 0.1684995340167754, "grad_norm": 40.13582518270976, "learning_rate": 9.624109337052837e-07, "logits/chosen": 2.2280895709991455, "logits/rejected": 2.281741142272949, "logps/chosen": -0.8172340393066406, "logps/rejected": -1.4414923191070557, "loss": 1.9405, "nll_loss": 0.8172340393066406, "rewards/accuracies": 0.875, "rewards/chosen": -8.172340393066406, "rewards/margins": 6.24258279800415, "rewards/rejected": -14.414921760559082, "step": 113 }, { "epoch": 0.16999068033550793, "grad_norm": 77.81905751277964, "learning_rate": 9.6146578100284e-07, "logits/chosen": 1.5080493688583374, "logits/rejected": 1.7396886348724365, "logps/chosen": -0.8940697312355042, "logps/rejected": -1.4884928464889526, "loss": 2.6538, "nll_loss": 0.8940697908401489, "rewards/accuracies": 0.875, "rewards/chosen": -8.94069766998291, "rewards/margins": 5.944231986999512, "rewards/rejected": -14.884929656982422, "step": 114 }, { "epoch": 0.17148182665424044, "grad_norm": 40.75714524289799, "learning_rate": 9.605093686700353e-07, "logits/chosen": 1.5538138151168823, "logits/rejected": 1.5067213773727417, "logps/chosen": -0.7099705338478088, "logps/rejected": -1.2894647121429443, "loss": 2.1382, "nll_loss": 0.7099704742431641, "rewards/accuracies": 0.875, "rewards/chosen": -7.099704742431641, "rewards/margins": 5.794943332672119, "rewards/rejected": -12.894647598266602, "step": 115 }, { "epoch": 0.17297297297297298, "grad_norm": 39.36609870883376, "learning_rate": 9.595417200430515e-07, "logits/chosen": 1.8860256671905518, "logits/rejected": 2.049478054046631, "logps/chosen": -0.7657876014709473, "logps/rejected": -2.3453309535980225, "loss": 2.6743, "nll_loss": 0.7657876014709473, "rewards/accuracies": 0.875, "rewards/chosen": -7.657876014709473, "rewards/margins": 15.795431137084961, "rewards/rejected": -23.453306198120117, "step": 116 }, { "epoch": 0.1744641192917055, "grad_norm": 55.96194893684382, "learning_rate": 9.585628587322328e-07, "logits/chosen": 1.9524157047271729, "logits/rejected": 1.797798991203308, "logps/chosen": -1.1159601211547852, "logps/rejected": -1.6060389280319214, "loss": 3.3782, "nll_loss": 1.1159600019454956, "rewards/accuracies": 0.625, "rewards/chosen": -11.159601211547852, "rewards/margins": 4.9007887840271, "rewards/rejected": -16.06039047241211, "step": 117 }, { "epoch": 0.17595526561043803, "grad_norm": 41.101151414498915, "learning_rate": 9.575728086215091e-07, "logits/chosen": 1.20786714553833, "logits/rejected": 1.257465124130249, "logps/chosen": -1.0939463376998901, "logps/rejected": -1.5144948959350586, "loss": 2.4688, "nll_loss": 1.0939463376998901, "rewards/accuracies": 0.75, "rewards/chosen": -10.93946361541748, "rewards/margins": 4.20548677444458, "rewards/rejected": -15.144948959350586, "step": 118 }, { "epoch": 0.17744641192917054, "grad_norm": 35.493847514643186, "learning_rate": 9.565715938678145e-07, "logits/chosen": 2.0380167961120605, "logits/rejected": 2.095473527908325, "logps/chosen": -0.7855863571166992, "logps/rejected": -1.56964910030365, "loss": 2.261, "nll_loss": 0.7855863571166992, "rewards/accuracies": 1.0, "rewards/chosen": -7.855864524841309, "rewards/margins": 7.840627670288086, "rewards/rejected": -15.696491241455078, "step": 119 }, { "epoch": 0.17893755824790308, "grad_norm": 57.163210188312476, "learning_rate": 9.555592389004966e-07, "logits/chosen": 2.790797710418701, "logits/rejected": 1.9768295288085938, "logps/chosen": -1.2539488077163696, "logps/rejected": -3.159113883972168, "loss": 1.935, "nll_loss": 1.2539489269256592, "rewards/accuracies": 1.0, "rewards/chosen": -12.539488792419434, "rewards/margins": 19.051651000976562, "rewards/rejected": -31.591140747070312, "step": 120 }, { "epoch": 0.1804287045666356, "grad_norm": 79.11517785089488, "learning_rate": 9.54535768420721e-07, "logits/chosen": 2.376458168029785, "logits/rejected": 2.3259100914001465, "logps/chosen": -1.0639259815216064, "logps/rejected": -2.951509952545166, "loss": 2.1214, "nll_loss": 1.0639259815216064, "rewards/accuracies": 0.75, "rewards/chosen": -10.639259338378906, "rewards/margins": 18.875837326049805, "rewards/rejected": -29.51509666442871, "step": 121 }, { "epoch": 0.18191985088536813, "grad_norm": 161.6550146595864, "learning_rate": 9.535012074008686e-07, "logits/chosen": 2.5242788791656494, "logits/rejected": 2.7374792098999023, "logps/chosen": -1.1811103820800781, "logps/rejected": -1.8748825788497925, "loss": 3.2394, "nll_loss": 1.1811102628707886, "rewards/accuracies": 0.75, "rewards/chosen": -11.811103820800781, "rewards/margins": 6.937723159790039, "rewards/rejected": -18.74882698059082, "step": 122 }, { "epoch": 0.18341099720410065, "grad_norm": 68.86205628275955, "learning_rate": 9.524555810839266e-07, "logits/chosen": 1.5950171947479248, "logits/rejected": 1.648951768875122, "logps/chosen": -1.2424349784851074, "logps/rejected": -2.0438003540039062, "loss": 2.3309, "nll_loss": 1.2424349784851074, "rewards/accuracies": 1.0, "rewards/chosen": -12.424349784851074, "rewards/margins": 8.013655662536621, "rewards/rejected": -20.438003540039062, "step": 123 }, { "epoch": 0.18490214352283318, "grad_norm": 66.32682008465957, "learning_rate": 9.513989149828717e-07, "logits/chosen": 0.9972255229949951, "logits/rejected": 2.005828619003296, "logps/chosen": -0.8773561120033264, "logps/rejected": -1.890647530555725, "loss": 2.728, "nll_loss": 0.8773560523986816, "rewards/accuracies": 0.875, "rewards/chosen": -8.773560523986816, "rewards/margins": 10.132913589477539, "rewards/rejected": -18.906475067138672, "step": 124 }, { "epoch": 0.1863932898415657, "grad_norm": 45.06477535770408, "learning_rate": 9.503312348800485e-07, "logits/chosen": 2.1520888805389404, "logits/rejected": 2.2559258937835693, "logps/chosen": -1.0796295404434204, "logps/rejected": -2.325199842453003, "loss": 2.3406, "nll_loss": 1.0796295404434204, "rewards/accuracies": 0.875, "rewards/chosen": -10.796295166015625, "rewards/margins": 12.455702781677246, "rewards/rejected": -23.251996994018555, "step": 125 }, { "epoch": 0.18788443616029823, "grad_norm": 76.37524825807895, "learning_rate": 9.492525668265399e-07, "logits/chosen": 2.432969093322754, "logits/rejected": 2.440464496612549, "logps/chosen": -0.8307008743286133, "logps/rejected": -1.703589916229248, "loss": 2.1258, "nll_loss": 0.8307008147239685, "rewards/accuracies": 0.875, "rewards/chosen": -8.307008743286133, "rewards/margins": 8.728890419006348, "rewards/rejected": -17.035898208618164, "step": 126 }, { "epoch": 0.18937558247903075, "grad_norm": 106.49269707447871, "learning_rate": 9.481629371415313e-07, "logits/chosen": 2.3593316078186035, "logits/rejected": 2.433490753173828, "logps/chosen": -0.9862598180770874, "logps/rejected": -1.6820356845855713, "loss": 2.0768, "nll_loss": 0.9862598180770874, "rewards/accuracies": 0.75, "rewards/chosen": -9.862598419189453, "rewards/margins": 6.957759380340576, "rewards/rejected": -16.820358276367188, "step": 127 }, { "epoch": 0.19086672879776329, "grad_norm": 119.96635126252004, "learning_rate": 9.470623724116692e-07, "logits/chosen": 1.5657812356948853, "logits/rejected": 2.199969530105591, "logps/chosen": -0.754319965839386, "logps/rejected": -1.9645689725875854, "loss": 2.2599, "nll_loss": 0.754319965839386, "rewards/accuracies": 0.875, "rewards/chosen": -7.543199062347412, "rewards/margins": 12.102490425109863, "rewards/rejected": -19.645689010620117, "step": 128 }, { "epoch": 0.1923578751164958, "grad_norm": 27.764965593166103, "learning_rate": 9.459508994904117e-07, "logits/chosen": 1.3861720561981201, "logits/rejected": 1.846404790878296, "logps/chosen": -0.7986534833908081, "logps/rejected": -1.7672803401947021, "loss": 1.6964, "nll_loss": 0.7986533641815186, "rewards/accuracies": 0.875, "rewards/chosen": -7.986534118652344, "rewards/margins": 9.686269760131836, "rewards/rejected": -17.67280387878418, "step": 129 }, { "epoch": 0.19384902143522834, "grad_norm": 51.67532936315, "learning_rate": 9.448285454973737e-07, "logits/chosen": 1.0668275356292725, "logits/rejected": 1.1291849613189697, "logps/chosen": -0.7701537609100342, "logps/rejected": -2.3240244388580322, "loss": 2.1264, "nll_loss": 0.7701537013053894, "rewards/accuracies": 0.875, "rewards/chosen": -7.7015380859375, "rewards/margins": 15.53870677947998, "rewards/rejected": -23.240245819091797, "step": 130 }, { "epoch": 0.19534016775396085, "grad_norm": 93.9771989772975, "learning_rate": 9.436953378176648e-07, "logits/chosen": 1.8385086059570312, "logits/rejected": 1.6848714351654053, "logps/chosen": -0.7946656942367554, "logps/rejected": -2.1132001876831055, "loss": 2.6497, "nll_loss": 0.7946656942367554, "rewards/accuracies": 0.875, "rewards/chosen": -7.946656703948975, "rewards/margins": 13.185344696044922, "rewards/rejected": -21.132001876831055, "step": 131 }, { "epoch": 0.1968313140726934, "grad_norm": 61.220728520327334, "learning_rate": 9.425513041012219e-07, "logits/chosen": 1.8041369915008545, "logits/rejected": 2.2424862384796143, "logps/chosen": -0.901249885559082, "logps/rejected": -2.4530515670776367, "loss": 2.2458, "nll_loss": 0.9012499451637268, "rewards/accuracies": 1.0, "rewards/chosen": -9.01249885559082, "rewards/margins": 15.518016815185547, "rewards/rejected": -24.530517578125, "step": 132 }, { "epoch": 0.1983224603914259, "grad_norm": 61.87691602410989, "learning_rate": 9.413964722621337e-07, "logits/chosen": 1.863405704498291, "logits/rejected": 1.7748808860778809, "logps/chosen": -0.8791462779045105, "logps/rejected": -1.6963310241699219, "loss": 2.7868, "nll_loss": 0.8791462779045105, "rewards/accuracies": 0.875, "rewards/chosen": -8.791460990905762, "rewards/margins": 8.17184829711914, "rewards/rejected": -16.96331024169922, "step": 133 }, { "epoch": 0.19981360671015844, "grad_norm": 60.53050864085932, "learning_rate": 9.402308704779598e-07, "logits/chosen": 2.4300382137298584, "logits/rejected": 2.6305856704711914, "logps/chosen": -1.0600104331970215, "logps/rejected": -1.7147343158721924, "loss": 2.2858, "nll_loss": 1.060010552406311, "rewards/accuracies": 0.875, "rewards/chosen": -10.600104331970215, "rewards/margins": 6.547240257263184, "rewards/rejected": -17.147342681884766, "step": 134 }, { "epoch": 0.20130475302889095, "grad_norm": 91.17746842885462, "learning_rate": 9.390545271890437e-07, "logits/chosen": 2.2293026447296143, "logits/rejected": 2.311601161956787, "logps/chosen": -0.912305474281311, "logps/rejected": -1.4468194246292114, "loss": 2.9031, "nll_loss": 0.912305474281311, "rewards/accuracies": 0.75, "rewards/chosen": -9.123055458068848, "rewards/margins": 5.3451385498046875, "rewards/rejected": -14.468194007873535, "step": 135 }, { "epoch": 0.2027958993476235, "grad_norm": 37.83215752166125, "learning_rate": 9.378674710978183e-07, "logits/chosen": 2.585904359817505, "logits/rejected": 2.4605495929718018, "logps/chosen": -1.084753394126892, "logps/rejected": -1.461085319519043, "loss": 2.405, "nll_loss": 1.084753394126892, "rewards/accuracies": 0.625, "rewards/chosen": -10.847535133361816, "rewards/margins": 3.7633185386657715, "rewards/rejected": -14.610852241516113, "step": 136 }, { "epoch": 0.204287045666356, "grad_norm": 109.12252125543667, "learning_rate": 9.366697311681057e-07, "logits/chosen": 2.2206501960754395, "logits/rejected": 1.6608248949050903, "logps/chosen": -0.8300536274909973, "logps/rejected": -1.4110511541366577, "loss": 2.315, "nll_loss": 0.8300537467002869, "rewards/accuracies": 0.875, "rewards/chosen": -8.300537109375, "rewards/margins": 5.8099751472473145, "rewards/rejected": -14.110511779785156, "step": 137 }, { "epoch": 0.20577819198508854, "grad_norm": 33.452774130013616, "learning_rate": 9.354613366244106e-07, "logits/chosen": 1.9580575227737427, "logits/rejected": 2.0505290031433105, "logps/chosen": -0.8890818953514099, "logps/rejected": -1.7066799402236938, "loss": 2.4832, "nll_loss": 0.8890818953514099, "rewards/accuracies": 0.875, "rewards/chosen": -8.890819549560547, "rewards/margins": 8.175981521606445, "rewards/rejected": -17.06679916381836, "step": 138 }, { "epoch": 0.20726933830382105, "grad_norm": 63.238180246927655, "learning_rate": 9.342423169512071e-07, "logits/chosen": 1.7694742679595947, "logits/rejected": 1.70145845413208, "logps/chosen": -1.2566676139831543, "logps/rejected": -1.3205902576446533, "loss": 3.0226, "nll_loss": 1.2566676139831543, "rewards/accuracies": 0.625, "rewards/chosen": -12.566676139831543, "rewards/margins": 0.6392264366149902, "rewards/rejected": -13.205903053283691, "step": 139 }, { "epoch": 0.2087604846225536, "grad_norm": 243.72222558177097, "learning_rate": 9.330127018922193e-07, "logits/chosen": 2.3753468990325928, "logits/rejected": 1.9688851833343506, "logps/chosen": -1.1052511930465698, "logps/rejected": -2.9732117652893066, "loss": 3.8775, "nll_loss": 1.1052511930465698, "rewards/accuracies": 1.0, "rewards/chosen": -11.052511215209961, "rewards/margins": 18.67960548400879, "rewards/rejected": -29.73211669921875, "step": 140 }, { "epoch": 0.21025163094128613, "grad_norm": 85.72150014359558, "learning_rate": 9.317725214496959e-07, "logits/chosen": 1.732937216758728, "logits/rejected": 1.5855662822723389, "logps/chosen": -0.9783846139907837, "logps/rejected": -2.407928943634033, "loss": 3.3785, "nll_loss": 0.9783846139907837, "rewards/accuracies": 0.875, "rewards/chosen": -9.783845901489258, "rewards/margins": 14.295442581176758, "rewards/rejected": -24.07929039001465, "step": 141 }, { "epoch": 0.21174277726001864, "grad_norm": 76.65538553179297, "learning_rate": 9.305218058836776e-07, "logits/chosen": 2.091996669769287, "logits/rejected": 2.419956684112549, "logps/chosen": -0.8971315622329712, "logps/rejected": -1.9470024108886719, "loss": 2.8238, "nll_loss": 0.897131621837616, "rewards/accuracies": 0.75, "rewards/chosen": -8.97131633758545, "rewards/margins": 10.498709678649902, "rewards/rejected": -19.47002601623535, "step": 142 }, { "epoch": 0.21323392357875118, "grad_norm": 27.340783406736662, "learning_rate": 9.292605857112594e-07, "logits/chosen": 2.6284255981445312, "logits/rejected": 2.7718348503112793, "logps/chosen": -1.089355230331421, "logps/rejected": -1.6409265995025635, "loss": 1.8538, "nll_loss": 1.089355230331421, "rewards/accuracies": 0.875, "rewards/chosen": -10.893552780151367, "rewards/margins": 5.515713214874268, "rewards/rejected": -16.409265518188477, "step": 143 }, { "epoch": 0.2147250698974837, "grad_norm": 58.23543041434007, "learning_rate": 9.279888917058451e-07, "logits/chosen": 2.169576406478882, "logits/rejected": 2.6294941902160645, "logps/chosen": -1.123268723487854, "logps/rejected": -2.1478488445281982, "loss": 2.3351, "nll_loss": 1.123268723487854, "rewards/accuracies": 0.75, "rewards/chosen": -11.232686996459961, "rewards/margins": 10.245802879333496, "rewards/rejected": -21.478490829467773, "step": 144 }, { "epoch": 0.21621621621621623, "grad_norm": 101.4240161071547, "learning_rate": 9.267067548963974e-07, "logits/chosen": 1.8942837715148926, "logits/rejected": 2.2905285358428955, "logps/chosen": -1.0335276126861572, "logps/rejected": -2.0474987030029297, "loss": 3.0506, "nll_loss": 1.0335276126861572, "rewards/accuracies": 0.625, "rewards/chosen": -10.335275650024414, "rewards/margins": 10.13970947265625, "rewards/rejected": -20.474985122680664, "step": 145 }, { "epoch": 0.21770736253494874, "grad_norm": 164.47407171109947, "learning_rate": 9.2541420656668e-07, "logits/chosen": 2.1074047088623047, "logits/rejected": 2.3331003189086914, "logps/chosen": -0.8820992112159729, "logps/rejected": -1.657348394393921, "loss": 3.3369, "nll_loss": 0.8820992112159729, "rewards/accuracies": 0.875, "rewards/chosen": -8.820991516113281, "rewards/margins": 7.752492427825928, "rewards/rejected": -16.573484420776367, "step": 146 }, { "epoch": 0.21919850885368128, "grad_norm": 86.30863915055022, "learning_rate": 9.241112782544951e-07, "logits/chosen": 1.4786248207092285, "logits/rejected": 1.909964680671692, "logps/chosen": -1.0103991031646729, "logps/rejected": -2.397908926010132, "loss": 2.5863, "nll_loss": 1.0103992223739624, "rewards/accuracies": 0.75, "rewards/chosen": -10.10399055480957, "rewards/margins": 13.875099182128906, "rewards/rejected": -23.97909164428711, "step": 147 }, { "epoch": 0.2206896551724138, "grad_norm": 77.89270460808487, "learning_rate": 9.22798001750913e-07, "logits/chosen": 1.9113011360168457, "logits/rejected": 1.9127446413040161, "logps/chosen": -1.0667080879211426, "logps/rejected": -2.3992092609405518, "loss": 2.0689, "nll_loss": 1.0667080879211426, "rewards/accuracies": 0.75, "rewards/chosen": -10.667080879211426, "rewards/margins": 13.325010299682617, "rewards/rejected": -23.99209213256836, "step": 148 }, { "epoch": 0.22218080149114633, "grad_norm": 30.33940939627606, "learning_rate": 9.214744090994973e-07, "logits/chosen": 1.8804258108139038, "logits/rejected": 1.7602787017822266, "logps/chosen": -0.7917377352714539, "logps/rejected": -1.8364883661270142, "loss": 1.9412, "nll_loss": 0.7917377948760986, "rewards/accuracies": 0.875, "rewards/chosen": -7.91737699508667, "rewards/margins": 10.447505950927734, "rewards/rejected": -18.364883422851562, "step": 149 }, { "epoch": 0.22367194780987884, "grad_norm": 26.804555421773678, "learning_rate": 9.20140532595522e-07, "logits/chosen": 2.0434508323669434, "logits/rejected": 2.001006841659546, "logps/chosen": -0.8837192058563232, "logps/rejected": -2.5913500785827637, "loss": 2.1711, "nll_loss": 0.8837192058563232, "rewards/accuracies": 1.0, "rewards/chosen": -8.83719253540039, "rewards/margins": 17.07630729675293, "rewards/rejected": -25.913501739501953, "step": 150 }, { "epoch": 0.22516309412861138, "grad_norm": 56.03717385018212, "learning_rate": 9.18796404785185e-07, "logits/chosen": 1.435347318649292, "logits/rejected": 1.2728326320648193, "logps/chosen": -1.0722644329071045, "logps/rejected": -1.2300254106521606, "loss": 3.2513, "nll_loss": 1.072264552116394, "rewards/accuracies": 0.75, "rewards/chosen": -10.722643852233887, "rewards/margins": 1.5776088237762451, "rewards/rejected": -12.300253868103027, "step": 151 }, { "epoch": 0.2266542404473439, "grad_norm": 33.26179033555356, "learning_rate": 9.174420584648122e-07, "logits/chosen": 1.7228583097457886, "logits/rejected": 1.9539413452148438, "logps/chosen": -0.999208927154541, "logps/rejected": -2.489034652709961, "loss": 2.3513, "nll_loss": 0.9992088675498962, "rewards/accuracies": 0.75, "rewards/chosen": -9.99208927154541, "rewards/margins": 14.8982572555542, "rewards/rejected": -24.89034652709961, "step": 152 }, { "epoch": 0.22814538676607643, "grad_norm": 272.4402787764153, "learning_rate": 9.160775266800582e-07, "logits/chosen": 1.1529669761657715, "logits/rejected": 1.5829405784606934, "logps/chosen": -0.8998862504959106, "logps/rejected": -1.6488455533981323, "loss": 3.4049, "nll_loss": 0.8998862504959106, "rewards/accuracies": 0.75, "rewards/chosen": -8.998862266540527, "rewards/margins": 7.489593505859375, "rewards/rejected": -16.48845672607422, "step": 153 }, { "epoch": 0.22963653308480894, "grad_norm": 100.54421248357787, "learning_rate": 9.147028427251009e-07, "logits/chosen": 1.2050219774246216, "logits/rejected": 1.6227113008499146, "logps/chosen": -0.6130104064941406, "logps/rejected": -1.9992823600769043, "loss": 2.1776, "nll_loss": 0.6130104064941406, "rewards/accuracies": 1.0, "rewards/chosen": -6.130103588104248, "rewards/margins": 13.862720489501953, "rewards/rejected": -19.99282455444336, "step": 154 }, { "epoch": 0.23112767940354148, "grad_norm": 102.84523025559395, "learning_rate": 9.13318040141827e-07, "logits/chosen": 1.593552589416504, "logits/rejected": 1.550580620765686, "logps/chosen": -0.819354236125946, "logps/rejected": -1.9264180660247803, "loss": 2.7251, "nll_loss": 0.8193541169166565, "rewards/accuracies": 1.0, "rewards/chosen": -8.19354248046875, "rewards/margins": 11.070638656616211, "rewards/rejected": -19.264179229736328, "step": 155 }, { "epoch": 0.232618825722274, "grad_norm": 79.66896245906493, "learning_rate": 9.119231527190158e-07, "logits/chosen": 1.7341417074203491, "logits/rejected": 1.8601384162902832, "logps/chosen": -0.9296043515205383, "logps/rejected": -1.9038300514221191, "loss": 2.6692, "nll_loss": 0.9296042919158936, "rewards/accuracies": 0.875, "rewards/chosen": -9.296043395996094, "rewards/margins": 9.742258071899414, "rewards/rejected": -19.038301467895508, "step": 156 }, { "epoch": 0.23410997204100653, "grad_norm": 61.386657530764126, "learning_rate": 9.105182144915129e-07, "logits/chosen": 1.5324293375015259, "logits/rejected": 1.4971590042114258, "logps/chosen": -1.050441861152649, "logps/rejected": -2.616759777069092, "loss": 2.0364, "nll_loss": 1.0504419803619385, "rewards/accuracies": 1.0, "rewards/chosen": -10.504420280456543, "rewards/margins": 15.663180351257324, "rewards/rejected": -26.167598724365234, "step": 157 }, { "epoch": 0.23560111835973904, "grad_norm": 31.59053647001086, "learning_rate": 9.091032597394012e-07, "logits/chosen": 1.8084895610809326, "logits/rejected": 1.6436604261398315, "logps/chosen": -1.0257327556610107, "logps/rejected": -1.6942673921585083, "loss": 2.1228, "nll_loss": 1.0257327556610107, "rewards/accuracies": 0.875, "rewards/chosen": -10.257328033447266, "rewards/margins": 6.6853461265563965, "rewards/rejected": -16.94267463684082, "step": 158 }, { "epoch": 0.23709226467847158, "grad_norm": 99.7544542434043, "learning_rate": 9.076783229871634e-07, "logits/chosen": 1.39780855178833, "logits/rejected": 1.226241111755371, "logps/chosen": -1.3455320596694946, "logps/rejected": -2.6359424591064453, "loss": 2.157, "nll_loss": 1.3455320596694946, "rewards/accuracies": 0.875, "rewards/chosen": -13.455320358276367, "rewards/margins": 12.904104232788086, "rewards/rejected": -26.359424591064453, "step": 159 }, { "epoch": 0.2385834109972041, "grad_norm": 65.81757474306599, "learning_rate": 9.062434390028407e-07, "logits/chosen": 2.117692232131958, "logits/rejected": 2.501654624938965, "logps/chosen": -1.0490020513534546, "logps/rejected": -2.2091197967529297, "loss": 2.1304, "nll_loss": 1.0490020513534546, "rewards/accuracies": 0.875, "rewards/chosen": -10.490020751953125, "rewards/margins": 11.601176261901855, "rewards/rejected": -22.091196060180664, "step": 160 }, { "epoch": 0.24007455731593663, "grad_norm": 60.48173676184412, "learning_rate": 9.04798642797183e-07, "logits/chosen": 2.0881049633026123, "logits/rejected": 2.2217414379119873, "logps/chosen": -1.2211053371429443, "logps/rejected": -2.209552764892578, "loss": 2.9786, "nll_loss": 1.2211053371429443, "rewards/accuracies": 0.625, "rewards/chosen": -12.211054801940918, "rewards/margins": 9.884474754333496, "rewards/rejected": -22.09552764892578, "step": 161 }, { "epoch": 0.24156570363466914, "grad_norm": 63.745333924082374, "learning_rate": 9.033439696227965e-07, "logits/chosen": 1.9421632289886475, "logits/rejected": 2.154196262359619, "logps/chosen": -0.9993748068809509, "logps/rejected": -1.5965303182601929, "loss": 1.8593, "nll_loss": 0.9993748068809509, "rewards/accuracies": 0.875, "rewards/chosen": -9.993748664855957, "rewards/margins": 5.971555709838867, "rewards/rejected": -15.965304374694824, "step": 162 }, { "epoch": 0.24305684995340168, "grad_norm": 84.14922727194168, "learning_rate": 9.018794549732817e-07, "logits/chosen": 1.134750485420227, "logits/rejected": 1.1239275932312012, "logps/chosen": -0.850483238697052, "logps/rejected": -2.0938446521759033, "loss": 1.8786, "nll_loss": 0.850483238697052, "rewards/accuracies": 1.0, "rewards/chosen": -8.50483226776123, "rewards/margins": 12.433614730834961, "rewards/rejected": -20.938446044921875, "step": 163 }, { "epoch": 0.2445479962721342, "grad_norm": 37.840829532481365, "learning_rate": 9.004051345823688e-07, "logits/chosen": 2.2150847911834717, "logits/rejected": 2.0349390506744385, "logps/chosen": -0.9850893020629883, "logps/rejected": -2.0254619121551514, "loss": 1.7743, "nll_loss": 0.9850894212722778, "rewards/accuracies": 0.625, "rewards/chosen": -9.850893020629883, "rewards/margins": 10.40372371673584, "rewards/rejected": -20.25461769104004, "step": 164 }, { "epoch": 0.24603914259086673, "grad_norm": 25.344809556394146, "learning_rate": 8.989210444230449e-07, "logits/chosen": 1.4311250448226929, "logits/rejected": 1.9220129251480103, "logps/chosen": -0.9075038433074951, "logps/rejected": -3.189478635787964, "loss": 1.9771, "nll_loss": 0.9075037240982056, "rewards/accuracies": 0.875, "rewards/chosen": -9.075037956237793, "rewards/margins": 22.819747924804688, "rewards/rejected": -31.89478302001953, "step": 165 }, { "epoch": 0.24753028890959924, "grad_norm": 111.42978654994121, "learning_rate": 8.974272207066767e-07, "logits/chosen": 1.41557776927948, "logits/rejected": 1.558440089225769, "logps/chosen": -1.0430129766464233, "logps/rejected": -1.9753856658935547, "loss": 2.8302, "nll_loss": 1.0430129766464233, "rewards/accuracies": 0.75, "rewards/chosen": -10.430130958557129, "rewards/margins": 9.323726654052734, "rewards/rejected": -19.753856658935547, "step": 166 }, { "epoch": 0.24902143522833178, "grad_norm": 108.79421551199331, "learning_rate": 8.959236998821266e-07, "logits/chosen": 1.4906010627746582, "logits/rejected": 1.4089393615722656, "logps/chosen": -1.2226719856262207, "logps/rejected": -2.027992010116577, "loss": 2.139, "nll_loss": 1.2226719856262207, "rewards/accuracies": 0.875, "rewards/chosen": -12.226719856262207, "rewards/margins": 8.05319881439209, "rewards/rejected": -20.279918670654297, "step": 167 }, { "epoch": 0.2505125815470643, "grad_norm": 124.37069299295982, "learning_rate": 8.944105186348645e-07, "logits/chosen": 2.3309714794158936, "logits/rejected": 2.2218449115753174, "logps/chosen": -0.8178737163543701, "logps/rejected": -1.4837485551834106, "loss": 1.8927, "nll_loss": 0.8178737163543701, "rewards/accuracies": 0.875, "rewards/chosen": -8.178736686706543, "rewards/margins": 6.658747673034668, "rewards/rejected": -14.837486267089844, "step": 168 }, { "epoch": 0.25200372786579683, "grad_norm": 49.07982484344473, "learning_rate": 8.928877138860706e-07, "logits/chosen": 2.5806398391723633, "logits/rejected": 2.6362061500549316, "logps/chosen": -1.1920853853225708, "logps/rejected": -1.460356593132019, "loss": 2.77, "nll_loss": 1.1920853853225708, "rewards/accuracies": 0.625, "rewards/chosen": -11.920853614807129, "rewards/margins": 2.6827123165130615, "rewards/rejected": -14.603567123413086, "step": 169 }, { "epoch": 0.2534948741845294, "grad_norm": 57.895474258549186, "learning_rate": 8.913553227917365e-07, "logits/chosen": 1.51754891872406, "logits/rejected": 1.5921522378921509, "logps/chosen": -1.219523310661316, "logps/rejected": -2.685457944869995, "loss": 1.996, "nll_loss": 1.219523310661316, "rewards/accuracies": 0.875, "rewards/chosen": -12.195232391357422, "rewards/margins": 14.659347534179688, "rewards/rejected": -26.85457992553711, "step": 170 }, { "epoch": 0.25498602050326186, "grad_norm": 50.35318504402622, "learning_rate": 8.898133827417577e-07, "logits/chosen": 1.492168664932251, "logits/rejected": 1.3493200540542603, "logps/chosen": -0.917851984500885, "logps/rejected": -1.3204575777053833, "loss": 2.4887, "nll_loss": 0.917851984500885, "rewards/accuracies": 0.75, "rewards/chosen": -9.178519248962402, "rewards/margins": 4.026056289672852, "rewards/rejected": -13.204574584960938, "step": 171 }, { "epoch": 0.2564771668219944, "grad_norm": 123.00885834314533, "learning_rate": 8.882619313590212e-07, "logits/chosen": 2.1078224182128906, "logits/rejected": 1.8869191408157349, "logps/chosen": -1.1606742143630981, "logps/rejected": -1.7969461679458618, "loss": 2.2829, "nll_loss": 1.1606740951538086, "rewards/accuracies": 0.625, "rewards/chosen": -11.606741905212402, "rewards/margins": 6.36271858215332, "rewards/rejected": -17.96946144104004, "step": 172 }, { "epoch": 0.25796831314072693, "grad_norm": 87.34749076746039, "learning_rate": 8.867010064984879e-07, "logits/chosen": 1.897976279258728, "logits/rejected": 1.5353047847747803, "logps/chosen": -1.3330408334732056, "logps/rejected": -2.919948101043701, "loss": 2.1636, "nll_loss": 1.3330408334732056, "rewards/accuracies": 0.875, "rewards/chosen": -13.33040714263916, "rewards/margins": 15.869071006774902, "rewards/rejected": -29.199480056762695, "step": 173 }, { "epoch": 0.2594594594594595, "grad_norm": 90.01313232214396, "learning_rate": 8.851306462462688e-07, "logits/chosen": 1.750410556793213, "logits/rejected": 1.5374597311019897, "logps/chosen": -1.1078706979751587, "logps/rejected": -2.269902229309082, "loss": 2.1, "nll_loss": 1.1078706979751587, "rewards/accuracies": 0.75, "rewards/chosen": -11.078706741333008, "rewards/margins": 11.620316505432129, "rewards/rejected": -22.69902229309082, "step": 174 }, { "epoch": 0.26095060577819196, "grad_norm": 76.91976422919461, "learning_rate": 8.835508889186956e-07, "logits/chosen": 1.9853813648223877, "logits/rejected": 2.1543500423431396, "logps/chosen": -1.3369462490081787, "logps/rejected": -1.8868308067321777, "loss": 2.8227, "nll_loss": 1.3369462490081787, "rewards/accuracies": 0.625, "rewards/chosen": -13.369462013244629, "rewards/margins": 5.498845100402832, "rewards/rejected": -18.86830711364746, "step": 175 }, { "epoch": 0.2624417520969245, "grad_norm": 55.13369487383775, "learning_rate": 8.819617730613862e-07, "logits/chosen": 1.2609195709228516, "logits/rejected": 1.3708417415618896, "logps/chosen": -1.2357456684112549, "logps/rejected": -2.014441728591919, "loss": 2.5874, "nll_loss": 1.2357456684112549, "rewards/accuracies": 0.875, "rewards/chosen": -12.35745620727539, "rewards/margins": 7.786960601806641, "rewards/rejected": -20.144418716430664, "step": 176 }, { "epoch": 0.26393289841565704, "grad_norm": 76.22181618820312, "learning_rate": 8.803633374483035e-07, "logits/chosen": 1.3541204929351807, "logits/rejected": 1.1606299877166748, "logps/chosen": -0.838552713394165, "logps/rejected": -2.6401939392089844, "loss": 1.0984, "nll_loss": 0.838552713394165, "rewards/accuracies": 1.0, "rewards/chosen": -8.385526657104492, "rewards/margins": 18.01641273498535, "rewards/rejected": -26.40193748474121, "step": 177 }, { "epoch": 0.2654240447343896, "grad_norm": 64.42198540747405, "learning_rate": 8.7875562108081e-07, "logits/chosen": 0.4180157482624054, "logits/rejected": 0.13569970428943634, "logps/chosen": -0.8393802046775818, "logps/rejected": -2.059353828430176, "loss": 2.2279, "nll_loss": 0.8393802642822266, "rewards/accuracies": 1.0, "rewards/chosen": -8.39380168914795, "rewards/margins": 12.199737548828125, "rewards/rejected": -20.59354019165039, "step": 178 }, { "epoch": 0.2669151910531221, "grad_norm": 65.33071806962461, "learning_rate": 8.771386631867157e-07, "logits/chosen": 2.04560923576355, "logits/rejected": 2.5156006813049316, "logps/chosen": -0.8595657348632812, "logps/rejected": -1.5896443128585815, "loss": 2.4389, "nll_loss": 0.8595657348632812, "rewards/accuracies": 0.75, "rewards/chosen": -8.595657348632812, "rewards/margins": 7.300787925720215, "rewards/rejected": -15.896444320678711, "step": 179 }, { "epoch": 0.2684063373718546, "grad_norm": 56.757214632206704, "learning_rate": 8.755125032193214e-07, "logits/chosen": 2.0090067386627197, "logits/rejected": 1.9684314727783203, "logps/chosen": -1.0932655334472656, "logps/rejected": -1.6011093854904175, "loss": 2.2746, "nll_loss": 1.0932655334472656, "rewards/accuracies": 0.5, "rewards/chosen": -10.932655334472656, "rewards/margins": 5.078439235687256, "rewards/rejected": -16.01109504699707, "step": 180 }, { "epoch": 0.26989748369058714, "grad_norm": 57.80690014465061, "learning_rate": 8.738771808564555e-07, "logits/chosen": 1.186753749847412, "logits/rejected": 1.113171935081482, "logps/chosen": -0.7738239765167236, "logps/rejected": -3.2991132736206055, "loss": 2.059, "nll_loss": 0.7738240361213684, "rewards/accuracies": 1.0, "rewards/chosen": -7.738240718841553, "rewards/margins": 25.25288963317871, "rewards/rejected": -32.99113082885742, "step": 181 }, { "epoch": 0.2713886300093197, "grad_norm": 100.06676793231738, "learning_rate": 8.722327359995063e-07, "logits/chosen": 1.3211489915847778, "logits/rejected": 1.5033185482025146, "logps/chosen": -0.6754621267318726, "logps/rejected": -4.53987979888916, "loss": 1.9102, "nll_loss": 0.6754621267318726, "rewards/accuracies": 1.0, "rewards/chosen": -6.7546210289001465, "rewards/margins": 38.6441764831543, "rewards/rejected": -45.398799896240234, "step": 182 }, { "epoch": 0.2728797763280522, "grad_norm": 75.50764406211981, "learning_rate": 8.705792087724484e-07, "logits/chosen": 2.040464401245117, "logits/rejected": 2.061976909637451, "logps/chosen": -0.8497739434242249, "logps/rejected": -1.8095765113830566, "loss": 2.4733, "nll_loss": 0.8497739434242249, "rewards/accuracies": 0.75, "rewards/chosen": -8.497739791870117, "rewards/margins": 9.598026275634766, "rewards/rejected": -18.095766067504883, "step": 183 }, { "epoch": 0.2743709226467847, "grad_norm": 100.4492023921661, "learning_rate": 8.689166395208636e-07, "logits/chosen": 1.2215818166732788, "logits/rejected": 1.020734190940857, "logps/chosen": -0.8507159948348999, "logps/rejected": -2.4126875400543213, "loss": 2.3262, "nll_loss": 0.8507159948348999, "rewards/accuracies": 1.0, "rewards/chosen": -8.507159233093262, "rewards/margins": 15.619712829589844, "rewards/rejected": -24.126873016357422, "step": 184 }, { "epoch": 0.27586206896551724, "grad_norm": 180.94476563933884, "learning_rate": 8.672450688109563e-07, "logits/chosen": 2.0847837924957275, "logits/rejected": 2.215726613998413, "logps/chosen": -1.1926288604736328, "logps/rejected": -1.9761557579040527, "loss": 1.5812, "nll_loss": 1.1926288604736328, "rewards/accuracies": 0.875, "rewards/chosen": -11.926289558410645, "rewards/margins": 7.835270881652832, "rewards/rejected": -19.761560440063477, "step": 185 }, { "epoch": 0.2773532152842498, "grad_norm": 82.03593596024302, "learning_rate": 8.655645374285636e-07, "logits/chosen": 2.012111186981201, "logits/rejected": 2.043102264404297, "logps/chosen": -1.3665342330932617, "logps/rejected": -2.8659491539001465, "loss": 2.409, "nll_loss": 1.3665341138839722, "rewards/accuracies": 0.875, "rewards/chosen": -13.665343284606934, "rewards/margins": 14.994148254394531, "rewards/rejected": -28.65949249267578, "step": 186 }, { "epoch": 0.2788443616029823, "grad_norm": 79.23210472745285, "learning_rate": 8.638750863781612e-07, "logits/chosen": 1.6436148881912231, "logits/rejected": 2.1180176734924316, "logps/chosen": -0.9192164540290833, "logps/rejected": -1.9463462829589844, "loss": 3.5592, "nll_loss": 0.9192165732383728, "rewards/accuracies": 1.0, "rewards/chosen": -9.192164421081543, "rewards/margins": 10.271299362182617, "rewards/rejected": -19.463462829589844, "step": 187 }, { "epoch": 0.2803355079217148, "grad_norm": 26.47667976709633, "learning_rate": 8.621767568818612e-07, "logits/chosen": 0.6154603958129883, "logits/rejected": 0.6709473133087158, "logps/chosen": -0.9064656496047974, "logps/rejected": -2.3845648765563965, "loss": 1.662, "nll_loss": 0.9064657092094421, "rewards/accuracies": 0.875, "rewards/chosen": -9.064657211303711, "rewards/margins": 14.780990600585938, "rewards/rejected": -23.84564781188965, "step": 188 }, { "epoch": 0.28182665424044734, "grad_norm": 101.4282206916804, "learning_rate": 8.604695903784079e-07, "logits/chosen": 0.42705780267715454, "logits/rejected": 0.9927361607551575, "logps/chosen": -0.9533852934837341, "logps/rejected": -2.294440746307373, "loss": 1.7952, "nll_loss": 0.9533852338790894, "rewards/accuracies": 0.875, "rewards/chosen": -9.533852577209473, "rewards/margins": 13.410554885864258, "rewards/rejected": -22.944406509399414, "step": 189 }, { "epoch": 0.2833178005591799, "grad_norm": 78.40187913745498, "learning_rate": 8.587536285221655e-07, "logits/chosen": 1.032325267791748, "logits/rejected": 1.2811267375946045, "logps/chosen": -0.676677942276001, "logps/rejected": -1.7353984117507935, "loss": 2.2404, "nll_loss": 0.6766780018806458, "rewards/accuracies": 0.875, "rewards/chosen": -6.766780853271484, "rewards/margins": 10.587203979492188, "rewards/rejected": -17.353984832763672, "step": 190 }, { "epoch": 0.2848089468779124, "grad_norm": 45.624929829577816, "learning_rate": 8.570289131821024e-07, "logits/chosen": 1.9201204776763916, "logits/rejected": 2.1733713150024414, "logps/chosen": -1.3790674209594727, "logps/rejected": -2.599050521850586, "loss": 2.4758, "nll_loss": 1.3790674209594727, "rewards/accuracies": 0.875, "rewards/chosen": -13.790674209594727, "rewards/margins": 12.19983196258545, "rewards/rejected": -25.990507125854492, "step": 191 }, { "epoch": 0.2863000931966449, "grad_norm": 92.43704748026303, "learning_rate": 8.552954864407697e-07, "logits/chosen": 1.352588415145874, "logits/rejected": 1.5578041076660156, "logps/chosen": -0.861832857131958, "logps/rejected": -1.6976982355117798, "loss": 2.9659, "nll_loss": 0.861832857131958, "rewards/accuracies": 0.75, "rewards/chosen": -8.618329048156738, "rewards/margins": 8.358652114868164, "rewards/rejected": -16.97698211669922, "step": 192 }, { "epoch": 0.28779123951537744, "grad_norm": 51.642031676003405, "learning_rate": 8.535533905932737e-07, "logits/chosen": 1.9250706434249878, "logits/rejected": 2.259392023086548, "logps/chosen": -1.432119369506836, "logps/rejected": -2.4200923442840576, "loss": 2.3788, "nll_loss": 1.432119369506836, "rewards/accuracies": 0.625, "rewards/chosen": -14.321192741394043, "rewards/margins": 9.879730224609375, "rewards/rejected": -24.200923919677734, "step": 193 }, { "epoch": 0.28928238583411, "grad_norm": 57.855469319584095, "learning_rate": 8.518026681462447e-07, "logits/chosen": 0.6635885834693909, "logits/rejected": 0.5963816046714783, "logps/chosen": -0.943480908870697, "logps/rejected": -1.6345998048782349, "loss": 2.6174, "nll_loss": 0.9434809684753418, "rewards/accuracies": 0.625, "rewards/chosen": -9.434808731079102, "rewards/margins": 6.911189079284668, "rewards/rejected": -16.345998764038086, "step": 194 }, { "epoch": 0.2907735321528425, "grad_norm": 252.85548256416274, "learning_rate": 8.500433618167992e-07, "logits/chosen": 1.7494518756866455, "logits/rejected": 1.9818410873413086, "logps/chosen": -1.2326170206069946, "logps/rejected": -2.1006078720092773, "loss": 2.0705, "nll_loss": 1.2326171398162842, "rewards/accuracies": 0.875, "rewards/chosen": -12.326169967651367, "rewards/margins": 8.679910659790039, "rewards/rejected": -21.00608253479004, "step": 195 }, { "epoch": 0.292264678471575, "grad_norm": 44.05241942856872, "learning_rate": 8.482755145314985e-07, "logits/chosen": 2.0492377281188965, "logits/rejected": 2.0539886951446533, "logps/chosen": -1.2818440198898315, "logps/rejected": -3.063652753829956, "loss": 2.1861, "nll_loss": 1.281843900680542, "rewards/accuracies": 0.875, "rewards/chosen": -12.818438529968262, "rewards/margins": 17.818090438842773, "rewards/rejected": -30.63652992248535, "step": 196 }, { "epoch": 0.29375582479030754, "grad_norm": 135.40620235088878, "learning_rate": 8.464991694253e-07, "logits/chosen": 1.953153133392334, "logits/rejected": 1.931667685508728, "logps/chosen": -1.0608192682266235, "logps/rejected": -2.6127781867980957, "loss": 3.1345, "nll_loss": 1.060819387435913, "rewards/accuracies": 1.0, "rewards/chosen": -10.60819149017334, "rewards/margins": 15.519588470458984, "rewards/rejected": -26.12778091430664, "step": 197 }, { "epoch": 0.2952469711090401, "grad_norm": 63.912791933616546, "learning_rate": 8.447143698405059e-07, "logits/chosen": 1.5860276222229004, "logits/rejected": 1.4829771518707275, "logps/chosen": -1.001695156097412, "logps/rejected": -1.9609891176223755, "loss": 2.4446, "nll_loss": 1.001695156097412, "rewards/accuracies": 0.75, "rewards/chosen": -10.016950607299805, "rewards/margins": 9.592939376831055, "rewards/rejected": -19.60988998413086, "step": 198 }, { "epoch": 0.2967381174277726, "grad_norm": 185.48348945131482, "learning_rate": 8.429211593257052e-07, "logits/chosen": 2.0849227905273438, "logits/rejected": 2.0906858444213867, "logps/chosen": -1.3137246370315552, "logps/rejected": -2.2139267921447754, "loss": 4.0443, "nll_loss": 1.3137246370315552, "rewards/accuracies": 0.625, "rewards/chosen": -13.137248039245605, "rewards/margins": 9.002019882202148, "rewards/rejected": -22.139266967773438, "step": 199 }, { "epoch": 0.2982292637465051, "grad_norm": 39.150692300992226, "learning_rate": 8.41119581634711e-07, "logits/chosen": 2.822572946548462, "logits/rejected": 2.72379469871521, "logps/chosen": -1.3302345275878906, "logps/rejected": -2.1766927242279053, "loss": 1.6867, "nll_loss": 1.3302347660064697, "rewards/accuracies": 0.875, "rewards/chosen": -13.302345275878906, "rewards/margins": 8.464580535888672, "rewards/rejected": -21.766925811767578, "step": 200 }, { "epoch": 0.29972041006523764, "grad_norm": 72.1674290233523, "learning_rate": 8.393096807254931e-07, "logits/chosen": 1.5731306076049805, "logits/rejected": 1.7772272825241089, "logps/chosen": -0.8957565426826477, "logps/rejected": -1.3099387884140015, "loss": 2.6164, "nll_loss": 0.8957564830780029, "rewards/accuracies": 0.625, "rewards/chosen": -8.957565307617188, "rewards/margins": 4.141822814941406, "rewards/rejected": -13.099388122558594, "step": 201 }, { "epoch": 0.3012115563839702, "grad_norm": 44.80807964315525, "learning_rate": 8.374915007591052e-07, "logits/chosen": 1.3576855659484863, "logits/rejected": 1.9479650259017944, "logps/chosen": -1.4070185422897339, "logps/rejected": -2.286010265350342, "loss": 1.6984, "nll_loss": 1.4070186614990234, "rewards/accuracies": 0.75, "rewards/chosen": -14.070186614990234, "rewards/margins": 8.78991413116455, "rewards/rejected": -22.86009979248047, "step": 202 }, { "epoch": 0.3027027027027027, "grad_norm": 36.876274463209846, "learning_rate": 8.356650860986081e-07, "logits/chosen": 0.7637724280357361, "logits/rejected": 0.8034124970436096, "logps/chosen": -1.2442846298217773, "logps/rejected": -2.19854474067688, "loss": 2.6261, "nll_loss": 1.244284749031067, "rewards/accuracies": 0.75, "rewards/chosen": -12.44284725189209, "rewards/margins": 9.542598724365234, "rewards/rejected": -21.985445022583008, "step": 203 }, { "epoch": 0.3041938490214352, "grad_norm": 67.41135362002561, "learning_rate": 8.338304813079864e-07, "logits/chosen": 1.3719521760940552, "logits/rejected": 1.5598344802856445, "logps/chosen": -1.2198079824447632, "logps/rejected": -1.4364782571792603, "loss": 2.5946, "nll_loss": 1.2198078632354736, "rewards/accuracies": 0.75, "rewards/chosen": -12.198080062866211, "rewards/margins": 2.1667017936706543, "rewards/rejected": -14.364782333374023, "step": 204 }, { "epoch": 0.30568499534016774, "grad_norm": 44.739442068978555, "learning_rate": 8.319877311510612e-07, "logits/chosen": 2.3804569244384766, "logits/rejected": 2.126559257507324, "logps/chosen": -0.942017674446106, "logps/rejected": -1.4884874820709229, "loss": 2.0004, "nll_loss": 0.9420175552368164, "rewards/accuracies": 0.75, "rewards/chosen": -9.42017650604248, "rewards/margins": 5.464698314666748, "rewards/rejected": -14.88487434387207, "step": 205 }, { "epoch": 0.3071761416589003, "grad_norm": 56.20523602273714, "learning_rate": 8.301368805903986e-07, "logits/chosen": 1.1620374917984009, "logits/rejected": 1.2464760541915894, "logps/chosen": -1.3252227306365967, "logps/rejected": -2.126899003982544, "loss": 2.0718, "nll_loss": 1.3252228498458862, "rewards/accuracies": 0.875, "rewards/chosen": -13.252228736877441, "rewards/margins": 8.016761779785156, "rewards/rejected": -21.26898956298828, "step": 206 }, { "epoch": 0.3086672879776328, "grad_norm": 53.096782115478575, "learning_rate": 8.282779747862121e-07, "logits/chosen": 0.9776243567466736, "logits/rejected": 1.0243405103683472, "logps/chosen": -0.9001644849777222, "logps/rejected": -2.3409206867218018, "loss": 1.6615, "nll_loss": 0.9001644849777222, "rewards/accuracies": 0.875, "rewards/chosen": -9.0016450881958, "rewards/margins": 14.407562255859375, "rewards/rejected": -23.409208297729492, "step": 207 }, { "epoch": 0.3101584342963653, "grad_norm": 104.49663966984495, "learning_rate": 8.264110590952607e-07, "logits/chosen": 1.2132370471954346, "logits/rejected": 1.3872023820877075, "logps/chosen": -0.9424193501472473, "logps/rejected": -1.9429956674575806, "loss": 2.2228, "nll_loss": 0.9424192905426025, "rewards/accuracies": 1.0, "rewards/chosen": -9.424193382263184, "rewards/margins": 10.005763053894043, "rewards/rejected": -19.429956436157227, "step": 208 }, { "epoch": 0.31164958061509784, "grad_norm": 46.95221324695929, "learning_rate": 8.245361790697425e-07, "logits/chosen": 1.2756640911102295, "logits/rejected": 1.76149320602417, "logps/chosen": -1.1788560152053833, "logps/rejected": -2.207589864730835, "loss": 1.8589, "nll_loss": 1.1788560152053833, "rewards/accuracies": 0.875, "rewards/chosen": -11.788559913635254, "rewards/margins": 10.287336349487305, "rewards/rejected": -22.075897216796875, "step": 209 }, { "epoch": 0.3131407269338304, "grad_norm": 62.19023946080797, "learning_rate": 8.226533804561826e-07, "logits/chosen": 1.648912787437439, "logits/rejected": 1.7056903839111328, "logps/chosen": -1.1090244054794312, "logps/rejected": -1.929490566253662, "loss": 2.3942, "nll_loss": 1.1090245246887207, "rewards/accuracies": 0.75, "rewards/chosen": -11.09024429321289, "rewards/margins": 8.204662322998047, "rewards/rejected": -19.294906616210938, "step": 210 }, { "epoch": 0.3146318732525629, "grad_norm": 39.42181196509823, "learning_rate": 8.207627091943177e-07, "logits/chosen": 2.0703964233398438, "logits/rejected": 2.2637991905212402, "logps/chosen": -1.0555453300476074, "logps/rejected": -1.8380850553512573, "loss": 1.8003, "nll_loss": 1.0555452108383179, "rewards/accuracies": 0.75, "rewards/chosen": -10.555453300476074, "rewards/margins": 7.8253960609436035, "rewards/rejected": -18.38085174560547, "step": 211 }, { "epoch": 0.31612301957129546, "grad_norm": 54.07476460664365, "learning_rate": 8.188642114159746e-07, "logits/chosen": 1.6686798334121704, "logits/rejected": 1.6440613269805908, "logps/chosen": -1.2204382419586182, "logps/rejected": -1.252202033996582, "loss": 2.4999, "nll_loss": 1.2204382419586182, "rewards/accuracies": 0.5, "rewards/chosen": -12.204381942749023, "rewards/margins": 0.31763792037963867, "rewards/rejected": -12.52202033996582, "step": 212 }, { "epoch": 0.31761416589002794, "grad_norm": 108.18112958443436, "learning_rate": 8.169579334439452e-07, "logits/chosen": 1.827782154083252, "logits/rejected": 2.365203857421875, "logps/chosen": -0.9755070805549622, "logps/rejected": -2.1315362453460693, "loss": 1.8175, "nll_loss": 0.9755070805549622, "rewards/accuracies": 0.75, "rewards/chosen": -9.755070686340332, "rewards/margins": 11.56029224395752, "rewards/rejected": -21.31536293029785, "step": 213 }, { "epoch": 0.3191053122087605, "grad_norm": 139.8642654166928, "learning_rate": 8.150439217908556e-07, "logits/chosen": 1.1471678018569946, "logits/rejected": 1.8254046440124512, "logps/chosen": -1.2145636081695557, "logps/rejected": -2.123124122619629, "loss": 2.0822, "nll_loss": 1.2145636081695557, "rewards/accuracies": 0.875, "rewards/chosen": -12.145635604858398, "rewards/margins": 9.085602760314941, "rewards/rejected": -21.231239318847656, "step": 214 }, { "epoch": 0.320596458527493, "grad_norm": 56.175736770434895, "learning_rate": 8.131222231580313e-07, "logits/chosen": 1.517196536064148, "logits/rejected": 1.686833381652832, "logps/chosen": -0.9890428781509399, "logps/rejected": -2.2462382316589355, "loss": 1.8137, "nll_loss": 0.9890428185462952, "rewards/accuracies": 0.875, "rewards/chosen": -9.89042854309082, "rewards/margins": 12.571954727172852, "rewards/rejected": -22.46238136291504, "step": 215 }, { "epoch": 0.32208760484622556, "grad_norm": 115.21812937694328, "learning_rate": 8.111928844343578e-07, "logits/chosen": 1.493722915649414, "logits/rejected": 1.4394516944885254, "logps/chosen": -0.7523890733718872, "logps/rejected": -2.181933641433716, "loss": 2.421, "nll_loss": 0.752389132976532, "rewards/accuracies": 1.0, "rewards/chosen": -7.523890018463135, "rewards/margins": 14.295445442199707, "rewards/rejected": -21.819337844848633, "step": 216 }, { "epoch": 0.32357875116495805, "grad_norm": 33.129629195390294, "learning_rate": 8.092559526951374e-07, "logits/chosen": 2.051661729812622, "logits/rejected": 2.359527349472046, "logps/chosen": -1.0224583148956299, "logps/rejected": -2.7590489387512207, "loss": 1.7781, "nll_loss": 1.0224583148956299, "rewards/accuracies": 0.625, "rewards/chosen": -10.224583625793457, "rewards/margins": 17.36590576171875, "rewards/rejected": -27.590490341186523, "step": 217 }, { "epoch": 0.3250698974836906, "grad_norm": 49.585571265516876, "learning_rate": 8.073114752009387e-07, "logits/chosen": 0.7824015617370605, "logits/rejected": 0.8636319637298584, "logps/chosen": -1.0705488920211792, "logps/rejected": -1.5145641565322876, "loss": 2.0008, "nll_loss": 1.0705488920211792, "rewards/accuracies": 0.75, "rewards/chosen": -10.705488204956055, "rewards/margins": 4.440152645111084, "rewards/rejected": -15.14564037322998, "step": 218 }, { "epoch": 0.3265610438024231, "grad_norm": 78.79188629369138, "learning_rate": 8.053594993964452e-07, "logits/chosen": 0.21954438090324402, "logits/rejected": 0.705274760723114, "logps/chosen": -1.100860595703125, "logps/rejected": -2.3908841609954834, "loss": 1.687, "nll_loss": 1.100860595703125, "rewards/accuracies": 0.75, "rewards/chosen": -11.00860595703125, "rewards/margins": 12.90023422241211, "rewards/rejected": -23.90884017944336, "step": 219 }, { "epoch": 0.32805219012115566, "grad_norm": 188.92539787768402, "learning_rate": 8.034000729092967e-07, "logits/chosen": 1.0372661352157593, "logits/rejected": 1.5985794067382812, "logps/chosen": -0.9695538282394409, "logps/rejected": -2.930168628692627, "loss": 2.9507, "nll_loss": 0.9695538282394409, "rewards/accuracies": 0.75, "rewards/chosen": -9.695537567138672, "rewards/margins": 19.60614585876465, "rewards/rejected": -29.301685333251953, "step": 220 }, { "epoch": 0.32954333643988815, "grad_norm": 94.92786446919038, "learning_rate": 8.014332435489275e-07, "logits/chosen": 1.1338255405426025, "logits/rejected": 1.1260263919830322, "logps/chosen": -1.053155779838562, "logps/rejected": -2.1766085624694824, "loss": 2.4868, "nll_loss": 1.053155779838562, "rewards/accuracies": 0.875, "rewards/chosen": -10.531557083129883, "rewards/margins": 11.234529495239258, "rewards/rejected": -21.76608657836914, "step": 221 }, { "epoch": 0.3310344827586207, "grad_norm": 60.85719317292509, "learning_rate": 7.994590593054e-07, "logits/chosen": 1.7554562091827393, "logits/rejected": 1.8874410390853882, "logps/chosen": -1.1214702129364014, "logps/rejected": -1.3658537864685059, "loss": 2.3518, "nll_loss": 1.1214702129364014, "rewards/accuracies": 0.625, "rewards/chosen": -11.214702606201172, "rewards/margins": 2.4438343048095703, "rewards/rejected": -13.658536911010742, "step": 222 }, { "epoch": 0.3325256290773532, "grad_norm": 87.43032722800943, "learning_rate": 7.974775683482337e-07, "logits/chosen": 0.4400970935821533, "logits/rejected": 0.8527993559837341, "logps/chosen": -0.97307950258255, "logps/rejected": -1.959856390953064, "loss": 1.2906, "nll_loss": 0.9730795621871948, "rewards/accuracies": 1.0, "rewards/chosen": -9.730794906616211, "rewards/margins": 9.867768287658691, "rewards/rejected": -19.59856414794922, "step": 223 }, { "epoch": 0.33401677539608576, "grad_norm": 98.45314305769313, "learning_rate": 7.954888190252291e-07, "logits/chosen": 1.3810157775878906, "logits/rejected": 0.5474785566329956, "logps/chosen": -1.309791922569275, "logps/rejected": -2.2794816493988037, "loss": 2.1317, "nll_loss": 1.309791922569275, "rewards/accuracies": 0.75, "rewards/chosen": -13.097918510437012, "rewards/margins": 9.696897506713867, "rewards/rejected": -22.794815063476562, "step": 224 }, { "epoch": 0.33550792171481825, "grad_norm": 35.120986400927954, "learning_rate": 7.934928598612895e-07, "logits/chosen": 1.025597095489502, "logits/rejected": 1.417877435684204, "logps/chosen": -1.4049136638641357, "logps/rejected": -3.658029317855835, "loss": 1.7105, "nll_loss": 1.4049136638641357, "rewards/accuracies": 0.875, "rewards/chosen": -14.049138069152832, "rewards/margins": 22.53115463256836, "rewards/rejected": -36.580291748046875, "step": 225 }, { "epoch": 0.3369990680335508, "grad_norm": 81.14812842799493, "learning_rate": 7.91489739557236e-07, "logits/chosen": 0.44910019636154175, "logits/rejected": 0.7166433334350586, "logps/chosen": -1.2824578285217285, "logps/rejected": -2.48989200592041, "loss": 2.5273, "nll_loss": 1.2824578285217285, "rewards/accuracies": 0.75, "rewards/chosen": -12.824578285217285, "rewards/margins": 12.074341773986816, "rewards/rejected": -24.8989200592041, "step": 226 }, { "epoch": 0.3384902143522833, "grad_norm": 39.55482261548164, "learning_rate": 7.894795069886191e-07, "logits/chosen": 0.6071157455444336, "logits/rejected": 1.4952878952026367, "logps/chosen": -0.8678624629974365, "logps/rejected": -1.9564037322998047, "loss": 2.135, "nll_loss": 0.8678624629974365, "rewards/accuracies": 1.0, "rewards/chosen": -8.678624153137207, "rewards/margins": 10.885414123535156, "rewards/rejected": -19.56403923034668, "step": 227 }, { "epoch": 0.33998136067101586, "grad_norm": 47.63663070972851, "learning_rate": 7.874622112045268e-07, "logits/chosen": 2.552133321762085, "logits/rejected": 2.604572296142578, "logps/chosen": -1.0520918369293213, "logps/rejected": -1.1822909116744995, "loss": 2.2303, "nll_loss": 1.0520917177200317, "rewards/accuracies": 0.375, "rewards/chosen": -10.520917892456055, "rewards/margins": 1.3019917011260986, "rewards/rejected": -11.82291030883789, "step": 228 }, { "epoch": 0.34147250698974835, "grad_norm": 147.2009343892087, "learning_rate": 7.854379014263876e-07, "logits/chosen": 1.5173033475875854, "logits/rejected": 1.780479073524475, "logps/chosen": -1.1946444511413574, "logps/rejected": -2.0428507328033447, "loss": 2.6133, "nll_loss": 1.1946444511413574, "rewards/accuracies": 0.875, "rewards/chosen": -11.946443557739258, "rewards/margins": 8.482065200805664, "rewards/rejected": -20.428508758544922, "step": 229 }, { "epoch": 0.3429636533084809, "grad_norm": 68.3722920532662, "learning_rate": 7.834066270467689e-07, "logits/chosen": 2.3850624561309814, "logits/rejected": 2.422222137451172, "logps/chosen": -1.7031844854354858, "logps/rejected": -2.2421417236328125, "loss": 2.4329, "nll_loss": 1.7031843662261963, "rewards/accuracies": 0.75, "rewards/chosen": -17.031845092773438, "rewards/margins": 5.3895721435546875, "rewards/rejected": -22.421419143676758, "step": 230 }, { "epoch": 0.3444547996272134, "grad_norm": 263.55673959692865, "learning_rate": 7.813684376281729e-07, "logits/chosen": 2.1059210300445557, "logits/rejected": 2.218613862991333, "logps/chosen": -1.3147671222686768, "logps/rejected": -2.3974826335906982, "loss": 2.335, "nll_loss": 1.3147673606872559, "rewards/accuracies": 0.625, "rewards/chosen": -13.147672653198242, "rewards/margins": 10.827152252197266, "rewards/rejected": -23.974824905395508, "step": 231 }, { "epoch": 0.34594594594594597, "grad_norm": 103.1795350178365, "learning_rate": 7.793233829018262e-07, "logits/chosen": 2.139861583709717, "logits/rejected": 2.9385435581207275, "logps/chosen": -1.1145626306533813, "logps/rejected": -2.2230255603790283, "loss": 2.1556, "nll_loss": 1.1145626306533813, "rewards/accuracies": 0.875, "rewards/chosen": -11.14562702178955, "rewards/margins": 11.08462905883789, "rewards/rejected": -22.230255126953125, "step": 232 }, { "epoch": 0.34743709226467845, "grad_norm": 60.679522428681075, "learning_rate": 7.772715127664676e-07, "logits/chosen": 1.7296453714370728, "logits/rejected": 1.732195496559143, "logps/chosen": -1.1248666048049927, "logps/rejected": -2.177182197570801, "loss": 2.0203, "nll_loss": 1.1248664855957031, "rewards/accuracies": 1.0, "rewards/chosen": -11.248664855957031, "rewards/margins": 10.52315616607666, "rewards/rejected": -21.771820068359375, "step": 233 }, { "epoch": 0.348928238583411, "grad_norm": 99.81297830742463, "learning_rate": 7.752128772871292e-07, "logits/chosen": 0.9700308442115784, "logits/rejected": 1.4567511081695557, "logps/chosen": -1.3183832168579102, "logps/rejected": -2.960069417953491, "loss": 2.8233, "nll_loss": 1.3183832168579102, "rewards/accuracies": 0.625, "rewards/chosen": -13.183832168579102, "rewards/margins": 16.416860580444336, "rewards/rejected": -29.600690841674805, "step": 234 }, { "epoch": 0.3504193849021435, "grad_norm": 96.12368102104242, "learning_rate": 7.731475266939158e-07, "logits/chosen": 1.5304874181747437, "logits/rejected": 0.8800415396690369, "logps/chosen": -0.9192510843276978, "logps/rejected": -2.304888963699341, "loss": 2.0082, "nll_loss": 0.9192511439323425, "rewards/accuracies": 1.0, "rewards/chosen": -9.192510604858398, "rewards/margins": 13.856379508972168, "rewards/rejected": -23.04888916015625, "step": 235 }, { "epoch": 0.35191053122087607, "grad_norm": 44.76418279154148, "learning_rate": 7.710755113807793e-07, "logits/chosen": 1.342360496520996, "logits/rejected": 1.1625380516052246, "logps/chosen": -0.9219427108764648, "logps/rejected": -2.1155216693878174, "loss": 2.2282, "nll_loss": 0.9219425916671753, "rewards/accuracies": 1.0, "rewards/chosen": -9.219427108764648, "rewards/margins": 11.935790061950684, "rewards/rejected": -21.155216217041016, "step": 236 }, { "epoch": 0.35340167753960855, "grad_norm": 52.81015880959803, "learning_rate": 7.689968819042882e-07, "logits/chosen": 1.2148919105529785, "logits/rejected": 1.383809208869934, "logps/chosen": -1.007696270942688, "logps/rejected": -2.194352626800537, "loss": 1.9773, "nll_loss": 1.007696270942688, "rewards/accuracies": 1.0, "rewards/chosen": -10.0769624710083, "rewards/margins": 11.866562843322754, "rewards/rejected": -21.943525314331055, "step": 237 }, { "epoch": 0.3548928238583411, "grad_norm": 28.186399074668028, "learning_rate": 7.669116889823954e-07, "logits/chosen": 0.8694225549697876, "logits/rejected": 1.2528514862060547, "logps/chosen": -1.0248780250549316, "logps/rejected": -2.271354913711548, "loss": 1.7403, "nll_loss": 1.0248781442642212, "rewards/accuracies": 0.875, "rewards/chosen": -10.248781204223633, "rewards/margins": 12.464767456054688, "rewards/rejected": -22.713546752929688, "step": 238 }, { "epoch": 0.35638397017707363, "grad_norm": 100.15322759268103, "learning_rate": 7.648199834931992e-07, "logits/chosen": 0.634692907333374, "logits/rejected": 1.1396393775939941, "logps/chosen": -1.1581244468688965, "logps/rejected": -5.170987129211426, "loss": 1.8909, "nll_loss": 1.158124566078186, "rewards/accuracies": 0.75, "rewards/chosen": -11.581245422363281, "rewards/margins": 40.12862777709961, "rewards/rejected": -51.709869384765625, "step": 239 }, { "epoch": 0.35787511649580617, "grad_norm": 300.44269554022554, "learning_rate": 7.62721816473703e-07, "logits/chosen": 2.2034199237823486, "logits/rejected": 2.4888501167297363, "logps/chosen": -1.7597814798355103, "logps/rejected": -2.8892478942871094, "loss": 2.6985, "nll_loss": 1.7597814798355103, "rewards/accuracies": 0.75, "rewards/chosen": -17.59781265258789, "rewards/margins": 11.29466438293457, "rewards/rejected": -28.892480850219727, "step": 240 }, { "epoch": 0.35936626281453865, "grad_norm": 84.61056841792454, "learning_rate": 7.606172391185699e-07, "logits/chosen": 0.9357536435127258, "logits/rejected": 1.495705485343933, "logps/chosen": -0.7502763867378235, "logps/rejected": -2.4145474433898926, "loss": 1.6242, "nll_loss": 0.7502763867378235, "rewards/accuracies": 0.75, "rewards/chosen": -7.502763748168945, "rewards/margins": 16.64270782470703, "rewards/rejected": -24.14547348022461, "step": 241 }, { "epoch": 0.3608574091332712, "grad_norm": 30.01462048225669, "learning_rate": 7.58506302778873e-07, "logits/chosen": 1.139107584953308, "logits/rejected": 1.5516914129257202, "logps/chosen": -0.9279584884643555, "logps/rejected": -2.4357917308807373, "loss": 1.9693, "nll_loss": 0.927958607673645, "rewards/accuracies": 0.75, "rewards/chosen": -9.279584884643555, "rewards/margins": 15.07833480834961, "rewards/rejected": -24.357919692993164, "step": 242 }, { "epoch": 0.36234855545200373, "grad_norm": 49.87160158067065, "learning_rate": 7.563890589608426e-07, "logits/chosen": 0.7547241449356079, "logits/rejected": 1.018823266029358, "logps/chosen": -0.6867074966430664, "logps/rejected": -2.4492809772491455, "loss": 1.627, "nll_loss": 0.6867074966430664, "rewards/accuracies": 1.0, "rewards/chosen": -6.867075443267822, "rewards/margins": 17.625732421875, "rewards/rejected": -24.492809295654297, "step": 243 }, { "epoch": 0.36383970177073627, "grad_norm": 79.18909399401014, "learning_rate": 7.542655593246103e-07, "logits/chosen": 2.3590550422668457, "logits/rejected": 2.4431886672973633, "logps/chosen": -1.115024447441101, "logps/rejected": -2.00443172454834, "loss": 2.179, "nll_loss": 1.1150243282318115, "rewards/accuracies": 0.875, "rewards/chosen": -11.150243759155273, "rewards/margins": 8.894072532653809, "rewards/rejected": -20.04431915283203, "step": 244 }, { "epoch": 0.36533084808946875, "grad_norm": 76.09626746873882, "learning_rate": 7.521358556829469e-07, "logits/chosen": 2.7354531288146973, "logits/rejected": 2.9832749366760254, "logps/chosen": -1.0923782587051392, "logps/rejected": -2.1333441734313965, "loss": 2.8052, "nll_loss": 1.0923781394958496, "rewards/accuracies": 0.75, "rewards/chosen": -10.923782348632812, "rewards/margins": 10.409658432006836, "rewards/rejected": -21.33344078063965, "step": 245 }, { "epoch": 0.3668219944082013, "grad_norm": 51.58831925276444, "learning_rate": 7.5e-07, "logits/chosen": 0.9712691307067871, "logits/rejected": 2.3313350677490234, "logps/chosen": -1.1223640441894531, "logps/rejected": -3.0363516807556152, "loss": 1.9817, "nll_loss": 1.1223642826080322, "rewards/accuracies": 0.875, "rewards/chosen": -11.223642349243164, "rewards/margins": 19.139873504638672, "rewards/rejected": -30.36351203918457, "step": 246 }, { "epoch": 0.36831314072693383, "grad_norm": 72.27660793379593, "learning_rate": 7.478580443900246e-07, "logits/chosen": 1.3063576221466064, "logits/rejected": 1.4398159980773926, "logps/chosen": -1.2569646835327148, "logps/rejected": -2.022681951522827, "loss": 2.9092, "nll_loss": 1.2569645643234253, "rewards/accuracies": 0.625, "rewards/chosen": -12.569645881652832, "rewards/margins": 7.657173156738281, "rewards/rejected": -20.226818084716797, "step": 247 }, { "epoch": 0.36980428704566637, "grad_norm": 69.93348511338252, "learning_rate": 7.457100411161127e-07, "logits/chosen": 1.797006368637085, "logits/rejected": 1.4189502000808716, "logps/chosen": -0.7586382031440735, "logps/rejected": -1.9290591478347778, "loss": 2.2318, "nll_loss": 0.7586381435394287, "rewards/accuracies": 1.0, "rewards/chosen": -7.5863823890686035, "rewards/margins": 11.70421028137207, "rewards/rejected": -19.290592193603516, "step": 248 }, { "epoch": 0.3712954333643989, "grad_norm": 31.343399697142512, "learning_rate": 7.435560425889168e-07, "logits/chosen": 0.6109998822212219, "logits/rejected": 0.8668403625488281, "logps/chosen": -0.9712101221084595, "logps/rejected": -2.0562825202941895, "loss": 1.4127, "nll_loss": 0.9712100625038147, "rewards/accuracies": 1.0, "rewards/chosen": -9.712100982666016, "rewards/margins": 10.850724220275879, "rewards/rejected": -20.562824249267578, "step": 249 }, { "epoch": 0.3727865796831314, "grad_norm": 59.38203502577897, "learning_rate": 7.413961013653725e-07, "logits/chosen": 1.6231962442398071, "logits/rejected": 1.9316778182983398, "logps/chosen": -0.8256513476371765, "logps/rejected": -1.3191328048706055, "loss": 2.3647, "nll_loss": 0.8256514072418213, "rewards/accuracies": 0.75, "rewards/chosen": -8.256513595581055, "rewards/margins": 4.934813976287842, "rewards/rejected": -13.191327095031738, "step": 250 }, { "epoch": 0.37427772600186393, "grad_norm": 39.973759798819756, "learning_rate": 7.39230270147415e-07, "logits/chosen": 1.6450904607772827, "logits/rejected": 1.9193065166473389, "logps/chosen": -1.230831265449524, "logps/rejected": -2.295335054397583, "loss": 2.7592, "nll_loss": 1.230831265449524, "rewards/accuracies": 0.75, "rewards/chosen": -12.308311462402344, "rewards/margins": 10.645038604736328, "rewards/rejected": -22.953350067138672, "step": 251 }, { "epoch": 0.37576887232059647, "grad_norm": 67.97110261391505, "learning_rate": 7.370586017806941e-07, "logits/chosen": 2.092968463897705, "logits/rejected": 2.5929911136627197, "logps/chosen": -0.9929396510124207, "logps/rejected": -4.780749797821045, "loss": 2.546, "nll_loss": 0.9929396510124207, "rewards/accuracies": 0.875, "rewards/chosen": -9.929396629333496, "rewards/margins": 37.87810134887695, "rewards/rejected": -47.8074951171875, "step": 252 }, { "epoch": 0.377260018639329, "grad_norm": 37.78048177829846, "learning_rate": 7.348811492532839e-07, "logits/chosen": 1.4211300611495972, "logits/rejected": 1.9608873128890991, "logps/chosen": -1.005782961845398, "logps/rejected": -1.8782293796539307, "loss": 2.26, "nll_loss": 1.005782961845398, "rewards/accuracies": 0.75, "rewards/chosen": -10.057829856872559, "rewards/margins": 8.724465370178223, "rewards/rejected": -18.78229522705078, "step": 253 }, { "epoch": 0.3787511649580615, "grad_norm": 99.38104653847641, "learning_rate": 7.326979656943905e-07, "logits/chosen": 2.002239227294922, "logits/rejected": 2.198601722717285, "logps/chosen": -1.406559944152832, "logps/rejected": -2.0653531551361084, "loss": 2.4987, "nll_loss": 1.4065600633621216, "rewards/accuracies": 0.75, "rewards/chosen": -14.06559944152832, "rewards/margins": 6.587931156158447, "rewards/rejected": -20.653532028198242, "step": 254 }, { "epoch": 0.38024231127679403, "grad_norm": 60.106932026337496, "learning_rate": 7.305091043730557e-07, "logits/chosen": 2.4402213096618652, "logits/rejected": 2.591193675994873, "logps/chosen": -1.0940014123916626, "logps/rejected": -1.9400519132614136, "loss": 1.4045, "nll_loss": 1.0940014123916626, "rewards/accuracies": 1.0, "rewards/chosen": -10.940013885498047, "rewards/margins": 8.460504531860352, "rewards/rejected": -19.40052032470703, "step": 255 }, { "epoch": 0.38173345759552657, "grad_norm": 122.9602377946072, "learning_rate": 7.283146186968565e-07, "logits/chosen": 2.069875478744507, "logits/rejected": 2.4555001258850098, "logps/chosen": -1.2773873805999756, "logps/rejected": -3.3619141578674316, "loss": 1.7268, "nll_loss": 1.277387261390686, "rewards/accuracies": 0.875, "rewards/chosen": -12.773874282836914, "rewards/margins": 20.845266342163086, "rewards/rejected": -33.619140625, "step": 256 }, { "epoch": 0.3832246039142591, "grad_norm": 52.28679278358108, "learning_rate": 7.261145622106032e-07, "logits/chosen": 0.6450112462043762, "logits/rejected": 0.8822189569473267, "logps/chosen": -1.0086485147476196, "logps/rejected": -2.0962486267089844, "loss": 2.059, "nll_loss": 1.0086486339569092, "rewards/accuracies": 0.625, "rewards/chosen": -10.086485862731934, "rewards/margins": 10.87600040435791, "rewards/rejected": -20.962486267089844, "step": 257 }, { "epoch": 0.3847157502329916, "grad_norm": 77.09312372429817, "learning_rate": 7.239089885950316e-07, "logits/chosen": 1.278009057044983, "logits/rejected": 1.4298620223999023, "logps/chosen": -0.978814423084259, "logps/rejected": -2.0365781784057617, "loss": 2.7127, "nll_loss": 0.978814423084259, "rewards/accuracies": 0.875, "rewards/chosen": -9.7881441116333, "rewards/margins": 10.57763671875, "rewards/rejected": -20.36578369140625, "step": 258 }, { "epoch": 0.38620689655172413, "grad_norm": 55.887201759176676, "learning_rate": 7.216979516654943e-07, "logits/chosen": 1.4817012548446655, "logits/rejected": 1.557416319847107, "logps/chosen": -0.8139100074768066, "logps/rejected": -2.3971123695373535, "loss": 1.4846, "nll_loss": 0.8139100670814514, "rewards/accuracies": 0.875, "rewards/chosen": -8.139101028442383, "rewards/margins": 15.832023620605469, "rewards/rejected": -23.97112464904785, "step": 259 }, { "epoch": 0.38769804287045667, "grad_norm": 88.12892241472093, "learning_rate": 7.19481505370647e-07, "logits/chosen": 2.003685712814331, "logits/rejected": 1.614970326423645, "logps/chosen": -1.1081788539886475, "logps/rejected": -2.8416566848754883, "loss": 1.35, "nll_loss": 1.108178973197937, "rewards/accuracies": 1.0, "rewards/chosen": -11.081789016723633, "rewards/margins": 17.334775924682617, "rewards/rejected": -28.41656494140625, "step": 260 }, { "epoch": 0.3891891891891892, "grad_norm": 99.391692996486, "learning_rate": 7.172597037911322e-07, "logits/chosen": 0.43143507838249207, "logits/rejected": 0.20857657492160797, "logps/chosen": -1.3084512948989868, "logps/rejected": -3.066157579421997, "loss": 2.148, "nll_loss": 1.3084512948989868, "rewards/accuracies": 0.75, "rewards/chosen": -13.084512710571289, "rewards/margins": 17.577064514160156, "rewards/rejected": -30.661577224731445, "step": 261 }, { "epoch": 0.3906803355079217, "grad_norm": 97.93389248759458, "learning_rate": 7.150326011382603e-07, "logits/chosen": 0.6526418924331665, "logits/rejected": 1.205736517906189, "logps/chosen": -0.9462600946426392, "logps/rejected": -2.2558352947235107, "loss": 1.8139, "nll_loss": 0.9462600946426392, "rewards/accuracies": 0.875, "rewards/chosen": -9.462600708007812, "rewards/margins": 13.09575080871582, "rewards/rejected": -22.558353424072266, "step": 262 }, { "epoch": 0.39217148182665423, "grad_norm": 53.98492289271101, "learning_rate": 7.128002517526856e-07, "logits/chosen": 0.9504834413528442, "logits/rejected": 0.7977679967880249, "logps/chosen": -0.8659110069274902, "logps/rejected": -1.5257205963134766, "loss": 2.0547, "nll_loss": 0.8659110069274902, "rewards/accuracies": 1.0, "rewards/chosen": -8.659110069274902, "rewards/margins": 6.598095893859863, "rewards/rejected": -15.257207870483398, "step": 263 }, { "epoch": 0.3936626281453868, "grad_norm": 53.028756273355576, "learning_rate": 7.105627101030815e-07, "logits/chosen": 1.035836935043335, "logits/rejected": 1.1337536573410034, "logps/chosen": -0.9609843492507935, "logps/rejected": -2.066173791885376, "loss": 2.0787, "nll_loss": 0.9609844088554382, "rewards/accuracies": 1.0, "rewards/chosen": -9.609843254089355, "rewards/margins": 11.051895141601562, "rewards/rejected": -20.6617374420166, "step": 264 }, { "epoch": 0.3951537744641193, "grad_norm": 70.03179421250289, "learning_rate": 7.083200307848115e-07, "logits/chosen": 2.2480337619781494, "logits/rejected": 2.4453728199005127, "logps/chosen": -1.1059800386428833, "logps/rejected": -2.8152523040771484, "loss": 1.693, "nll_loss": 1.1059800386428833, "rewards/accuracies": 1.0, "rewards/chosen": -11.059800148010254, "rewards/margins": 17.092721939086914, "rewards/rejected": -28.152523040771484, "step": 265 }, { "epoch": 0.3966449207828518, "grad_norm": 65.05167358830555, "learning_rate": 7.06072268518596e-07, "logits/chosen": 1.054613471031189, "logits/rejected": 1.4466171264648438, "logps/chosen": -1.3642425537109375, "logps/rejected": -1.3795865774154663, "loss": 1.9083, "nll_loss": 1.364242434501648, "rewards/accuracies": 0.625, "rewards/chosen": -13.642424583435059, "rewards/margins": 0.15344035625457764, "rewards/rejected": -13.795865058898926, "step": 266 }, { "epoch": 0.39813606710158433, "grad_norm": 115.63895045724882, "learning_rate": 7.038194781491785e-07, "logits/chosen": 1.7098942995071411, "logits/rejected": 2.203080892562866, "logps/chosen": -1.1696326732635498, "logps/rejected": -2.4187185764312744, "loss": 2.6433, "nll_loss": 1.1696325540542603, "rewards/accuracies": 0.875, "rewards/chosen": -11.69632625579834, "rewards/margins": 12.490857124328613, "rewards/rejected": -24.18718147277832, "step": 267 }, { "epoch": 0.3996272134203169, "grad_norm": 67.04895942633188, "learning_rate": 7.015617146439861e-07, "logits/chosen": 1.6581642627716064, "logits/rejected": 1.669770359992981, "logps/chosen": -1.2857623100280762, "logps/rejected": -2.7904677391052246, "loss": 1.8274, "nll_loss": 1.2857623100280762, "rewards/accuracies": 1.0, "rewards/chosen": -12.857623100280762, "rewards/margins": 15.047052383422852, "rewards/rejected": -27.904674530029297, "step": 268 }, { "epoch": 0.4011183597390494, "grad_norm": 91.36427605731544, "learning_rate": 6.992990330917896e-07, "logits/chosen": 0.932449996471405, "logits/rejected": 1.0148789882659912, "logps/chosen": -0.8048295974731445, "logps/rejected": -2.076087474822998, "loss": 2.27, "nll_loss": 0.8048295974731445, "rewards/accuracies": 0.875, "rewards/chosen": -8.048295974731445, "rewards/margins": 12.712578773498535, "rewards/rejected": -20.760875701904297, "step": 269 }, { "epoch": 0.4026095060577819, "grad_norm": 41.60383549655561, "learning_rate": 6.970314887013585e-07, "logits/chosen": 1.052612066268921, "logits/rejected": 1.0198369026184082, "logps/chosen": -0.9521026611328125, "logps/rejected": -3.0467047691345215, "loss": 1.3033, "nll_loss": 0.9521026611328125, "rewards/accuracies": 1.0, "rewards/chosen": -9.521026611328125, "rewards/margins": 20.946022033691406, "rewards/rejected": -30.46704864501953, "step": 270 }, { "epoch": 0.40410065237651444, "grad_norm": 37.2054556810954, "learning_rate": 6.947591368001137e-07, "logits/chosen": 1.8026392459869385, "logits/rejected": 1.524202585220337, "logps/chosen": -1.2173160314559937, "logps/rejected": -3.043490409851074, "loss": 2.0298, "nll_loss": 1.2173161506652832, "rewards/accuracies": 1.0, "rewards/chosen": -12.173162460327148, "rewards/margins": 18.261743545532227, "rewards/rejected": -30.434906005859375, "step": 271 }, { "epoch": 0.405591798695247, "grad_norm": 40.791654458384144, "learning_rate": 6.924820328327785e-07, "logits/chosen": 0.8878618478775024, "logits/rejected": 0.8154944181442261, "logps/chosen": -1.2944376468658447, "logps/rejected": -1.636922001838684, "loss": 2.34, "nll_loss": 1.2944377660751343, "rewards/accuracies": 0.625, "rewards/chosen": -12.944376945495605, "rewards/margins": 3.4248437881469727, "rewards/rejected": -16.369220733642578, "step": 272 }, { "epoch": 0.4070829450139795, "grad_norm": 87.21918058543856, "learning_rate": 6.902002323600251e-07, "logits/chosen": 1.4907180070877075, "logits/rejected": 1.9472870826721191, "logps/chosen": -1.0754882097244263, "logps/rejected": -3.0603857040405273, "loss": 2.6077, "nll_loss": 1.0754882097244263, "rewards/accuracies": 0.875, "rewards/chosen": -10.754881858825684, "rewards/margins": 19.848976135253906, "rewards/rejected": -30.60385513305664, "step": 273 }, { "epoch": 0.408574091332712, "grad_norm": 54.02250862515129, "learning_rate": 6.87913791057119e-07, "logits/chosen": 1.0437284708023071, "logits/rejected": 1.4450937509536743, "logps/chosen": -1.167959451675415, "logps/rejected": -2.6578826904296875, "loss": 1.453, "nll_loss": 1.1679595708847046, "rewards/accuracies": 1.0, "rewards/chosen": -11.679594039916992, "rewards/margins": 14.89923095703125, "rewards/rejected": -26.578826904296875, "step": 274 }, { "epoch": 0.41006523765144454, "grad_norm": 75.6400790003386, "learning_rate": 6.856227647125607e-07, "logits/chosen": 0.8017367720603943, "logits/rejected": 1.474280834197998, "logps/chosen": -1.1050833463668823, "logps/rejected": -1.9618220329284668, "loss": 2.6651, "nll_loss": 1.1050833463668823, "rewards/accuracies": 0.75, "rewards/chosen": -11.050833702087402, "rewards/margins": 8.56738567352295, "rewards/rejected": -19.61821937561035, "step": 275 }, { "epoch": 0.4115563839701771, "grad_norm": 30.388626869364693, "learning_rate": 6.83327209226724e-07, "logits/chosen": 1.4652159214019775, "logits/rejected": 1.2830939292907715, "logps/chosen": -0.8472062349319458, "logps/rejected": -1.9603122472763062, "loss": 1.5787, "nll_loss": 0.8472062945365906, "rewards/accuracies": 1.0, "rewards/chosen": -8.472063064575195, "rewards/margins": 11.131060600280762, "rewards/rejected": -19.60312271118164, "step": 276 }, { "epoch": 0.4130475302889096, "grad_norm": 39.216919718833466, "learning_rate": 6.81027180610493e-07, "logits/chosen": 1.3558590412139893, "logits/rejected": 1.70036780834198, "logps/chosen": -1.2558890581130981, "logps/rejected": -2.966331958770752, "loss": 2.0647, "nll_loss": 1.2558890581130981, "rewards/accuracies": 0.875, "rewards/chosen": -12.558890342712402, "rewards/margins": 17.104429244995117, "rewards/rejected": -29.66331672668457, "step": 277 }, { "epoch": 0.4145386766076421, "grad_norm": 58.789675286827766, "learning_rate": 6.787227349838946e-07, "logits/chosen": 0.9922891855239868, "logits/rejected": 1.273958683013916, "logps/chosen": -1.1365571022033691, "logps/rejected": -1.5940685272216797, "loss": 2.0423, "nll_loss": 1.1365571022033691, "rewards/accuracies": 0.75, "rewards/chosen": -11.365571022033691, "rewards/margins": 4.575113773345947, "rewards/rejected": -15.940685272216797, "step": 278 }, { "epoch": 0.41602982292637464, "grad_norm": 127.19344910907691, "learning_rate": 6.764139285747291e-07, "logits/chosen": 0.5256351232528687, "logits/rejected": 0.7551167011260986, "logps/chosen": -0.8241851329803467, "logps/rejected": -1.77034592628479, "loss": 1.9205, "nll_loss": 0.8241850733757019, "rewards/accuracies": 0.75, "rewards/chosen": -8.241851806640625, "rewards/margins": 9.461607933044434, "rewards/rejected": -17.703458786010742, "step": 279 }, { "epoch": 0.4175209692451072, "grad_norm": 52.297853670482816, "learning_rate": 6.741008177171993e-07, "logits/chosen": 0.844875693321228, "logits/rejected": 0.7142741084098816, "logps/chosen": -1.081954836845398, "logps/rejected": -1.9283109903335571, "loss": 1.703, "nll_loss": 1.081954836845398, "rewards/accuracies": 0.875, "rewards/chosen": -10.819547653198242, "rewards/margins": 8.463563919067383, "rewards/rejected": -19.283111572265625, "step": 280 }, { "epoch": 0.4190121155638397, "grad_norm": 41.30444921997634, "learning_rate": 6.717834588505349e-07, "logits/chosen": 0.35103124380111694, "logits/rejected": 0.7616575956344604, "logps/chosen": -1.1927512884140015, "logps/rejected": -2.2596030235290527, "loss": 1.7112, "nll_loss": 1.192751407623291, "rewards/accuracies": 1.0, "rewards/chosen": -11.927513122558594, "rewards/margins": 10.668517112731934, "rewards/rejected": -22.596031188964844, "step": 281 }, { "epoch": 0.42050326188257225, "grad_norm": 55.48858060729688, "learning_rate": 6.694619085176159e-07, "logits/chosen": 1.6717503070831299, "logits/rejected": 1.8445909023284912, "logps/chosen": -1.1160863637924194, "logps/rejected": -1.5369952917099, "loss": 1.5962, "nll_loss": 1.1160863637924194, "rewards/accuracies": 0.75, "rewards/chosen": -11.16086483001709, "rewards/margins": 4.2090888023376465, "rewards/rejected": -15.369952201843262, "step": 282 }, { "epoch": 0.42199440820130474, "grad_norm": 57.240320712688224, "learning_rate": 6.671362233635925e-07, "logits/chosen": 2.156038522720337, "logits/rejected": 2.2529256343841553, "logps/chosen": -1.3289601802825928, "logps/rejected": -2.3087949752807617, "loss": 2.1229, "nll_loss": 1.3289601802825928, "rewards/accuracies": 0.625, "rewards/chosen": -13.289603233337402, "rewards/margins": 9.798346519470215, "rewards/rejected": -23.087949752807617, "step": 283 }, { "epoch": 0.4234855545200373, "grad_norm": 74.80923783541773, "learning_rate": 6.64806460134504e-07, "logits/chosen": 2.358487129211426, "logits/rejected": 2.6080195903778076, "logps/chosen": -1.087075114250183, "logps/rejected": -3.3152096271514893, "loss": 1.7529, "nll_loss": 1.087075114250183, "rewards/accuracies": 0.75, "rewards/chosen": -10.870750427246094, "rewards/margins": 22.281347274780273, "rewards/rejected": -33.152099609375, "step": 284 }, { "epoch": 0.4249767008387698, "grad_norm": 27.565727625360296, "learning_rate": 6.624726756758927e-07, "logits/chosen": 1.5838301181793213, "logits/rejected": 1.3991775512695312, "logps/chosen": -1.7774771451950073, "logps/rejected": -2.480024814605713, "loss": 1.3078, "nll_loss": 1.7774772644042969, "rewards/accuracies": 0.75, "rewards/chosen": -17.77477264404297, "rewards/margins": 7.025475025177002, "rewards/rejected": -24.800247192382812, "step": 285 }, { "epoch": 0.42646784715750236, "grad_norm": 154.74744398137636, "learning_rate": 6.601349269314187e-07, "logits/chosen": 1.8415088653564453, "logits/rejected": 1.869084358215332, "logps/chosen": -1.4172766208648682, "logps/rejected": -2.5456197261810303, "loss": 2.3037, "nll_loss": 1.4172766208648682, "rewards/accuracies": 0.875, "rewards/chosen": -14.172767639160156, "rewards/margins": 11.283432006835938, "rewards/rejected": -25.45619773864746, "step": 286 }, { "epoch": 0.42795899347623484, "grad_norm": 33.654264848772605, "learning_rate": 6.577932709414689e-07, "logits/chosen": 0.06796303391456604, "logits/rejected": 0.024106621742248535, "logps/chosen": -1.2476297616958618, "logps/rejected": -2.5938475131988525, "loss": 1.9129, "nll_loss": 1.2476297616958618, "rewards/accuracies": 0.75, "rewards/chosen": -12.476297378540039, "rewards/margins": 13.462179183959961, "rewards/rejected": -25.938474655151367, "step": 287 }, { "epoch": 0.4294501397949674, "grad_norm": 61.274908342374445, "learning_rate": 6.554477648417655e-07, "logits/chosen": 1.084038496017456, "logits/rejected": 0.7969342470169067, "logps/chosen": -0.8771347999572754, "logps/rejected": -1.9453235864639282, "loss": 2.2445, "nll_loss": 0.8771347999572754, "rewards/accuracies": 1.0, "rewards/chosen": -8.771347999572754, "rewards/margins": 10.68188762664795, "rewards/rejected": -19.453235626220703, "step": 288 }, { "epoch": 0.4309412861136999, "grad_norm": 33.92864029856102, "learning_rate": 6.530984658619733e-07, "logits/chosen": 0.6454198956489563, "logits/rejected": 0.6794722080230713, "logps/chosen": -0.6796860694885254, "logps/rejected": -2.1921310424804688, "loss": 1.4368, "nll_loss": 0.6796860098838806, "rewards/accuracies": 1.0, "rewards/chosen": -6.796860694885254, "rewards/margins": 15.124448776245117, "rewards/rejected": -21.921310424804688, "step": 289 }, { "epoch": 0.43243243243243246, "grad_norm": 102.89530504035014, "learning_rate": 6.507454313243015e-07, "logits/chosen": 0.5887856483459473, "logits/rejected": 0.8982241749763489, "logps/chosen": -1.2342543601989746, "logps/rejected": -3.1363353729248047, "loss": 1.4046, "nll_loss": 1.2342543601989746, "rewards/accuracies": 1.0, "rewards/chosen": -12.342544555664062, "rewards/margins": 19.020811080932617, "rewards/rejected": -31.363353729248047, "step": 290 }, { "epoch": 0.43392357875116494, "grad_norm": 49.68076761184129, "learning_rate": 6.483887186421058e-07, "logits/chosen": 0.8749809265136719, "logits/rejected": 0.6832801103591919, "logps/chosen": -0.9976881742477417, "logps/rejected": -1.4613938331604004, "loss": 2.2379, "nll_loss": 0.9976882338523865, "rewards/accuracies": 0.75, "rewards/chosen": -9.976881980895996, "rewards/margins": 4.63705587387085, "rewards/rejected": -14.613937377929688, "step": 291 }, { "epoch": 0.4354147250698975, "grad_norm": 39.12656003901273, "learning_rate": 6.460283853184879e-07, "logits/chosen": 1.0188599824905396, "logits/rejected": 1.0585620403289795, "logps/chosen": -0.8566121459007263, "logps/rejected": -2.6598193645477295, "loss": 2.3505, "nll_loss": 0.8566122055053711, "rewards/accuracies": 0.875, "rewards/chosen": -8.566122055053711, "rewards/margins": 18.032073974609375, "rewards/rejected": -26.598194122314453, "step": 292 }, { "epoch": 0.43690587138863, "grad_norm": 123.87555388902, "learning_rate": 6.436644889448919e-07, "logits/chosen": 2.1357312202453613, "logits/rejected": 2.082622766494751, "logps/chosen": -1.5212548971176147, "logps/rejected": -2.3258817195892334, "loss": 2.7392, "nll_loss": 1.5212547779083252, "rewards/accuracies": 0.625, "rewards/chosen": -15.21254825592041, "rewards/margins": 8.046268463134766, "rewards/rejected": -23.25881576538086, "step": 293 }, { "epoch": 0.43839701770736256, "grad_norm": 44.15702706121298, "learning_rate": 6.412970871996995e-07, "logits/chosen": 2.1622915267944336, "logits/rejected": 2.064239263534546, "logps/chosen": -1.0755740404129028, "logps/rejected": -3.1556620597839355, "loss": 1.7613, "nll_loss": 1.0755740404129028, "rewards/accuracies": 0.75, "rewards/chosen": -10.755739212036133, "rewards/margins": 20.800884246826172, "rewards/rejected": -31.556623458862305, "step": 294 }, { "epoch": 0.43988816402609504, "grad_norm": 63.85906467207744, "learning_rate": 6.389262378468219e-07, "logits/chosen": 0.337302565574646, "logits/rejected": 0.3216743469238281, "logps/chosen": -1.0648306608200073, "logps/rejected": -1.7549852132797241, "loss": 2.2155, "nll_loss": 1.0648306608200073, "rewards/accuracies": 0.875, "rewards/chosen": -10.648306846618652, "rewards/margins": 6.901545524597168, "rewards/rejected": -17.54985237121582, "step": 295 }, { "epoch": 0.4413793103448276, "grad_norm": 63.02949408369288, "learning_rate": 6.365519987342915e-07, "logits/chosen": 1.4290008544921875, "logits/rejected": 2.151444435119629, "logps/chosen": -1.3096098899841309, "logps/rejected": -3.7374768257141113, "loss": 2.3925, "nll_loss": 1.3096097707748413, "rewards/accuracies": 0.75, "rewards/chosen": -13.096098899841309, "rewards/margins": 24.278675079345703, "rewards/rejected": -37.37477111816406, "step": 296 }, { "epoch": 0.4428704566635601, "grad_norm": 103.14840083443293, "learning_rate": 6.341744277928499e-07, "logits/chosen": -0.02471376582980156, "logits/rejected": 0.1734105348587036, "logps/chosen": -1.2056688070297241, "logps/rejected": -1.9607056379318237, "loss": 2.3567, "nll_loss": 1.2056688070297241, "rewards/accuracies": 0.625, "rewards/chosen": -12.056687355041504, "rewards/margins": 7.550368785858154, "rewards/rejected": -19.6070556640625, "step": 297 }, { "epoch": 0.44436160298229266, "grad_norm": 60.70508187697226, "learning_rate": 6.317935830345338e-07, "logits/chosen": 1.4392168521881104, "logits/rejected": 1.5099271535873413, "logps/chosen": -1.2347393035888672, "logps/rejected": -1.8257811069488525, "loss": 2.9378, "nll_loss": 1.2347395420074463, "rewards/accuracies": 1.0, "rewards/chosen": -12.347393989562988, "rewards/margins": 5.910416126251221, "rewards/rejected": -18.257810592651367, "step": 298 }, { "epoch": 0.44585274930102514, "grad_norm": 196.96211764289563, "learning_rate": 6.294095225512604e-07, "logits/chosen": 0.2620046138763428, "logits/rejected": 0.18626247346401215, "logps/chosen": -1.016269564628601, "logps/rejected": -2.0889391899108887, "loss": 1.6622, "nll_loss": 1.016269564628601, "rewards/accuracies": 0.875, "rewards/chosen": -10.162696838378906, "rewards/margins": 10.72669792175293, "rewards/rejected": -20.889392852783203, "step": 299 }, { "epoch": 0.4473438956197577, "grad_norm": 57.732026030259775, "learning_rate": 6.270223045134095e-07, "logits/chosen": 1.0570734739303589, "logits/rejected": 1.2986547946929932, "logps/chosen": -1.048396348953247, "logps/rejected": -2.0673375129699707, "loss": 2.7017, "nll_loss": 1.048396348953247, "rewards/accuracies": 0.625, "rewards/chosen": -10.483963966369629, "rewards/margins": 10.189414978027344, "rewards/rejected": -20.67337989807129, "step": 300 }, { "epoch": 0.4488350419384902, "grad_norm": 33.69710577119018, "learning_rate": 6.246319871684047e-07, "logits/chosen": 0.7944878339767456, "logits/rejected": 1.3021888732910156, "logps/chosen": -1.0949335098266602, "logps/rejected": -3.1596837043762207, "loss": 1.4899, "nll_loss": 1.0949335098266602, "rewards/accuracies": 0.875, "rewards/chosen": -10.949335098266602, "rewards/margins": 20.64750099182129, "rewards/rejected": -31.59683609008789, "step": 301 }, { "epoch": 0.45032618825722276, "grad_norm": 49.16607074744664, "learning_rate": 6.222386288392914e-07, "logits/chosen": 1.3668736219406128, "logits/rejected": 1.658337116241455, "logps/chosen": -0.9744982123374939, "logps/rejected": -2.594623327255249, "loss": 2.4226, "nll_loss": 0.9744983315467834, "rewards/accuracies": 0.875, "rewards/chosen": -9.74498176574707, "rewards/margins": 16.201250076293945, "rewards/rejected": -25.946231842041016, "step": 302 }, { "epoch": 0.45181733457595524, "grad_norm": 67.63665525502225, "learning_rate": 6.19842287923314e-07, "logits/chosen": 0.8895013332366943, "logits/rejected": 0.7559343576431274, "logps/chosen": -0.8952937126159668, "logps/rejected": -3.781191110610962, "loss": 1.7441, "nll_loss": 0.8952935934066772, "rewards/accuracies": 0.875, "rewards/chosen": -8.952936172485352, "rewards/margins": 28.858976364135742, "rewards/rejected": -37.81190872192383, "step": 303 }, { "epoch": 0.4533084808946878, "grad_norm": 84.23259859109938, "learning_rate": 6.174430228904919e-07, "logits/chosen": 0.8855783343315125, "logits/rejected": 0.9021013975143433, "logps/chosen": -0.9263310432434082, "logps/rejected": -2.732335329055786, "loss": 1.809, "nll_loss": 0.9263309836387634, "rewards/accuracies": 0.875, "rewards/chosen": -9.263310432434082, "rewards/margins": 18.060047149658203, "rewards/rejected": -27.32335662841797, "step": 304 }, { "epoch": 0.4547996272134203, "grad_norm": 95.50345921648152, "learning_rate": 6.150408922821911e-07, "logits/chosen": 0.833683431148529, "logits/rejected": 0.7572767734527588, "logps/chosen": -0.8070122003555298, "logps/rejected": -1.7145060300827026, "loss": 2.7335, "nll_loss": 0.8070122599601746, "rewards/accuracies": 1.0, "rewards/chosen": -8.070121765136719, "rewards/margins": 9.07493782043457, "rewards/rejected": -17.14505958557129, "step": 305 }, { "epoch": 0.45629077353215286, "grad_norm": 47.04220627181392, "learning_rate": 6.126359547096974e-07, "logits/chosen": 0.17184323072433472, "logits/rejected": 0.23235543072223663, "logps/chosen": -0.9182405471801758, "logps/rejected": -2.4804115295410156, "loss": 1.858, "nll_loss": 0.9182405471801758, "rewards/accuracies": 1.0, "rewards/chosen": -9.182406425476074, "rewards/margins": 15.621709823608398, "rewards/rejected": -24.804115295410156, "step": 306 }, { "epoch": 0.45778191985088534, "grad_norm": 89.25209929634599, "learning_rate": 6.102282688527859e-07, "logits/chosen": 1.0285824537277222, "logits/rejected": 0.3567197620868683, "logps/chosen": -1.3488802909851074, "logps/rejected": -2.2083587646484375, "loss": 2.0896, "nll_loss": 1.3488802909851074, "rewards/accuracies": 0.875, "rewards/chosen": -13.488801956176758, "rewards/margins": 8.594785690307617, "rewards/rejected": -22.083589553833008, "step": 307 }, { "epoch": 0.4592730661696179, "grad_norm": 47.26521425033619, "learning_rate": 6.078178934582885e-07, "logits/chosen": 0.8061944246292114, "logits/rejected": 1.3660072088241577, "logps/chosen": -1.383995771408081, "logps/rejected": -3.055185317993164, "loss": 2.1618, "nll_loss": 1.383995771408081, "rewards/accuracies": 0.875, "rewards/chosen": -13.839958190917969, "rewards/margins": 16.711894989013672, "rewards/rejected": -30.55185317993164, "step": 308 }, { "epoch": 0.4607642124883504, "grad_norm": 44.101062459563245, "learning_rate": 6.054048873386612e-07, "logits/chosen": 1.240386962890625, "logits/rejected": 1.2306119203567505, "logps/chosen": -1.1784520149230957, "logps/rejected": -3.2531423568725586, "loss": 1.1368, "nll_loss": 1.1784520149230957, "rewards/accuracies": 0.875, "rewards/chosen": -11.78451919555664, "rewards/margins": 20.746902465820312, "rewards/rejected": -32.53142547607422, "step": 309 }, { "epoch": 0.46225535880708296, "grad_norm": 147.43759399932563, "learning_rate": 6.029893093705491e-07, "logits/chosen": 0.9249796271324158, "logits/rejected": 0.8558982610702515, "logps/chosen": -1.0307352542877197, "logps/rejected": -2.5594286918640137, "loss": 2.2389, "nll_loss": 1.0307352542877197, "rewards/accuracies": 0.875, "rewards/chosen": -10.307353019714355, "rewards/margins": 15.286933898925781, "rewards/rejected": -25.594287872314453, "step": 310 }, { "epoch": 0.46374650512581544, "grad_norm": 56.960912871190224, "learning_rate": 6.005712184933497e-07, "logits/chosen": 1.5723234415054321, "logits/rejected": 2.2183189392089844, "logps/chosen": -1.2421478033065796, "logps/rejected": -2.286712408065796, "loss": 1.4243, "nll_loss": 1.2421478033065796, "rewards/accuracies": 1.0, "rewards/chosen": -12.421477317810059, "rewards/margins": 10.445649147033691, "rewards/rejected": -22.86712646484375, "step": 311 }, { "epoch": 0.465237651444548, "grad_norm": 45.47868786092567, "learning_rate": 5.981506737077743e-07, "logits/chosen": 1.2714695930480957, "logits/rejected": 1.4802864789962769, "logps/chosen": -0.8951252698898315, "logps/rejected": -2.4004831314086914, "loss": 2.2213, "nll_loss": 0.8951252698898315, "rewards/accuracies": 1.0, "rewards/chosen": -8.951252937316895, "rewards/margins": 15.053577423095703, "rewards/rejected": -24.00482940673828, "step": 312 }, { "epoch": 0.4667287977632805, "grad_norm": 40.21728414149163, "learning_rate": 5.957277340744094e-07, "logits/chosen": 0.9764309525489807, "logits/rejected": 0.9029860496520996, "logps/chosen": -1.137838363647461, "logps/rejected": -1.8395271301269531, "loss": 2.2428, "nll_loss": 1.137838363647461, "rewards/accuracies": 1.0, "rewards/chosen": -11.37838363647461, "rewards/margins": 7.016888618469238, "rewards/rejected": -18.39527130126953, "step": 313 }, { "epoch": 0.46821994408201306, "grad_norm": 185.20061412223453, "learning_rate": 5.933024587122745e-07, "logits/chosen": 1.2819750308990479, "logits/rejected": 0.6815961003303528, "logps/chosen": -1.3975375890731812, "logps/rejected": -2.166456699371338, "loss": 1.5696, "nll_loss": 1.3975378274917603, "rewards/accuracies": 0.875, "rewards/chosen": -13.97537612915039, "rewards/margins": 7.689189910888672, "rewards/rejected": -21.664566040039062, "step": 314 }, { "epoch": 0.46971109040074555, "grad_norm": 73.99447495932938, "learning_rate": 5.908749067973809e-07, "logits/chosen": 1.1808252334594727, "logits/rejected": 1.8961601257324219, "logps/chosen": -1.1919835805892944, "logps/rejected": -1.9030534029006958, "loss": 2.9395, "nll_loss": 1.1919835805892944, "rewards/accuracies": 0.75, "rewards/chosen": -11.919836044311523, "rewards/margins": 7.11069917678833, "rewards/rejected": -19.030534744262695, "step": 315 }, { "epoch": 0.4712022367194781, "grad_norm": 41.91892232926434, "learning_rate": 5.884451375612865e-07, "logits/chosen": 0.43265220522880554, "logits/rejected": 0.669975221157074, "logps/chosen": -1.172359585762024, "logps/rejected": -2.715709686279297, "loss": 2.1396, "nll_loss": 1.172359585762024, "rewards/accuracies": 0.625, "rewards/chosen": -11.723597526550293, "rewards/margins": 15.43349838256836, "rewards/rejected": -27.157094955444336, "step": 316 }, { "epoch": 0.4726933830382106, "grad_norm": 67.47008277225329, "learning_rate": 5.860132102896515e-07, "logits/chosen": 1.0299711227416992, "logits/rejected": 1.513453483581543, "logps/chosen": -1.4263699054718018, "logps/rejected": -3.114851236343384, "loss": 2.4576, "nll_loss": 1.4263699054718018, "rewards/accuracies": 1.0, "rewards/chosen": -14.26369857788086, "rewards/margins": 16.884815216064453, "rewards/rejected": -31.148513793945312, "step": 317 }, { "epoch": 0.47418452935694316, "grad_norm": 106.55390409463216, "learning_rate": 5.835791843207916e-07, "logits/chosen": 0.5274050235748291, "logits/rejected": 1.4777504205703735, "logps/chosen": -1.3521366119384766, "logps/rejected": -5.964361667633057, "loss": 2.3519, "nll_loss": 1.3521366119384766, "rewards/accuracies": 0.875, "rewards/chosen": -13.521367073059082, "rewards/margins": 46.122249603271484, "rewards/rejected": -59.64361572265625, "step": 318 }, { "epoch": 0.4756756756756757, "grad_norm": 52.47560817081044, "learning_rate": 5.8114311904423e-07, "logits/chosen": 0.8356415033340454, "logits/rejected": 1.1021226644515991, "logps/chosen": -1.2969446182250977, "logps/rejected": -2.987818956375122, "loss": 2.0389, "nll_loss": 1.296944499015808, "rewards/accuracies": 0.875, "rewards/chosen": -12.96944808959961, "rewards/margins": 16.908740997314453, "rewards/rejected": -29.878189086914062, "step": 319 }, { "epoch": 0.4771668219944082, "grad_norm": 44.58696731411979, "learning_rate": 5.787050738992481e-07, "logits/chosen": 0.8062883615493774, "logits/rejected": 1.2388557195663452, "logps/chosen": -0.9122974276542664, "logps/rejected": -3.280360221862793, "loss": 2.3128, "nll_loss": 0.9122973680496216, "rewards/accuracies": 1.0, "rewards/chosen": -9.122974395751953, "rewards/margins": 23.680631637573242, "rewards/rejected": -32.80360412597656, "step": 320 }, { "epoch": 0.4786579683131407, "grad_norm": 47.624989755164464, "learning_rate": 5.762651083734362e-07, "logits/chosen": 1.5454519987106323, "logits/rejected": 1.7682874202728271, "logps/chosen": -1.2734403610229492, "logps/rejected": -2.9317433834075928, "loss": 1.8263, "nll_loss": 1.2734405994415283, "rewards/accuracies": 0.75, "rewards/chosen": -12.734405517578125, "rewards/margins": 16.583026885986328, "rewards/rejected": -29.317432403564453, "step": 321 }, { "epoch": 0.48014911463187326, "grad_norm": 62.34214654651794, "learning_rate": 5.738232820012407e-07, "logits/chosen": 1.1003084182739258, "logits/rejected": 1.1037921905517578, "logps/chosen": -1.0845376253128052, "logps/rejected": -2.8120718002319336, "loss": 2.0593, "nll_loss": 1.0845376253128052, "rewards/accuracies": 0.875, "rewards/chosen": -10.845376968383789, "rewards/margins": 17.275341033935547, "rewards/rejected": -28.120716094970703, "step": 322 }, { "epoch": 0.4816402609506058, "grad_norm": 36.0212815292935, "learning_rate": 5.713796543625122e-07, "logits/chosen": 1.975404143333435, "logits/rejected": 1.3179008960723877, "logps/chosen": -1.2851284742355347, "logps/rejected": -2.0051522254943848, "loss": 1.4676, "nll_loss": 1.2851283550262451, "rewards/accuracies": 0.75, "rewards/chosen": -12.85128402709961, "rewards/margins": 7.200236797332764, "rewards/rejected": -20.05152130126953, "step": 323 }, { "epoch": 0.4831314072693383, "grad_norm": 87.18844716839915, "learning_rate": 5.689342850810522e-07, "logits/chosen": 1.2656850814819336, "logits/rejected": 1.8935495615005493, "logps/chosen": -1.0454943180084229, "logps/rejected": -3.235139846801758, "loss": 2.5662, "nll_loss": 1.0454943180084229, "rewards/accuracies": 0.75, "rewards/chosen": -10.454943656921387, "rewards/margins": 21.896453857421875, "rewards/rejected": -32.35139846801758, "step": 324 }, { "epoch": 0.4846225535880708, "grad_norm": 84.9837736807849, "learning_rate": 5.664872338231571e-07, "logits/chosen": 1.088675856590271, "logits/rejected": 1.2341116666793823, "logps/chosen": -1.3650341033935547, "logps/rejected": -2.4809322357177734, "loss": 2.9029, "nll_loss": 1.3650341033935547, "rewards/accuracies": 0.75, "rewards/chosen": -13.650341987609863, "rewards/margins": 11.158980369567871, "rewards/rejected": -24.809322357177734, "step": 325 }, { "epoch": 0.48611369990680336, "grad_norm": 60.076091304902064, "learning_rate": 5.640385602961634e-07, "logits/chosen": 0.5524032115936279, "logits/rejected": 0.778762936592102, "logps/chosen": -1.2645853757858276, "logps/rejected": -4.169212341308594, "loss": 1.5392, "nll_loss": 1.264585256576538, "rewards/accuracies": 0.875, "rewards/chosen": -12.645853042602539, "rewards/margins": 29.0462703704834, "rewards/rejected": -41.69211959838867, "step": 326 }, { "epoch": 0.4876048462255359, "grad_norm": 45.87994623387452, "learning_rate": 5.615883242469905e-07, "logits/chosen": 0.7510640621185303, "logits/rejected": 0.7140066623687744, "logps/chosen": -1.442211627960205, "logps/rejected": -2.5686025619506836, "loss": 2.3796, "nll_loss": 1.4422115087509155, "rewards/accuracies": 0.875, "rewards/chosen": -14.422115325927734, "rewards/margins": 11.263908386230469, "rewards/rejected": -25.686025619506836, "step": 327 }, { "epoch": 0.4890959925442684, "grad_norm": 63.29029676996501, "learning_rate": 5.591365854606829e-07, "logits/chosen": 2.1231472492218018, "logits/rejected": 2.1009750366210938, "logps/chosen": -0.9882567524909973, "logps/rejected": -1.7455428838729858, "loss": 2.2137, "nll_loss": 0.9882567524909973, "rewards/accuracies": 0.75, "rewards/chosen": -9.882567405700684, "rewards/margins": 7.572861671447754, "rewards/rejected": -17.455429077148438, "step": 328 }, { "epoch": 0.4905871388630009, "grad_norm": 304.2743901322164, "learning_rate": 5.566834037589511e-07, "logits/chosen": 2.063725709915161, "logits/rejected": 2.0055620670318604, "logps/chosen": -1.1700128316879272, "logps/rejected": -2.869445323944092, "loss": 2.0797, "nll_loss": 1.1700127124786377, "rewards/accuracies": 0.875, "rewards/chosen": -11.700127601623535, "rewards/margins": 16.994325637817383, "rewards/rejected": -28.694454193115234, "step": 329 }, { "epoch": 0.49207828518173347, "grad_norm": 45.53013522811574, "learning_rate": 5.542288389987128e-07, "logits/chosen": 0.43271785974502563, "logits/rejected": 1.0451266765594482, "logps/chosen": -0.6082872748374939, "logps/rejected": -2.1641836166381836, "loss": 2.4139, "nll_loss": 0.6082872152328491, "rewards/accuracies": 1.0, "rewards/chosen": -6.08287239074707, "rewards/margins": 15.558965682983398, "rewards/rejected": -21.6418399810791, "step": 330 }, { "epoch": 0.493569431500466, "grad_norm": 43.98817680932417, "learning_rate": 5.517729510706315e-07, "logits/chosen": 1.076559066772461, "logits/rejected": 1.4049584865570068, "logps/chosen": -1.6192044019699097, "logps/rejected": -2.2700164318084717, "loss": 2.5434, "nll_loss": 1.6192045211791992, "rewards/accuracies": 0.625, "rewards/chosen": -16.192045211791992, "rewards/margins": 6.508121013641357, "rewards/rejected": -22.700164794921875, "step": 331 }, { "epoch": 0.4950605778191985, "grad_norm": 55.50922181112932, "learning_rate": 5.493157998976559e-07, "logits/chosen": 1.8846677541732788, "logits/rejected": 2.510406017303467, "logps/chosen": -1.385533094406128, "logps/rejected": -3.4347152709960938, "loss": 1.1139, "nll_loss": 1.385533094406128, "rewards/accuracies": 0.875, "rewards/chosen": -13.855329513549805, "rewards/margins": 20.491819381713867, "rewards/rejected": -34.34715270996094, "step": 332 }, { "epoch": 0.496551724137931, "grad_norm": 86.0791136849411, "learning_rate": 5.468574454335574e-07, "logits/chosen": 1.4486123323440552, "logits/rejected": 1.5563056468963623, "logps/chosen": -1.481582760810852, "logps/rejected": -7.358170509338379, "loss": 2.5366, "nll_loss": 1.4815826416015625, "rewards/accuracies": 0.75, "rewards/chosen": -14.815827369689941, "rewards/margins": 58.765869140625, "rewards/rejected": -73.58169555664062, "step": 333 }, { "epoch": 0.49804287045666357, "grad_norm": 275.8773072828461, "learning_rate": 5.443979476614674e-07, "logits/chosen": 0.7218424081802368, "logits/rejected": 1.135964274406433, "logps/chosen": -1.110957145690918, "logps/rejected": -3.2192254066467285, "loss": 2.6062, "nll_loss": 1.1109572649002075, "rewards/accuracies": 1.0, "rewards/chosen": -11.10957145690918, "rewards/margins": 21.082679748535156, "rewards/rejected": -32.19225311279297, "step": 334 }, { "epoch": 0.4995340167753961, "grad_norm": 70.54195359782109, "learning_rate": 5.419373665924136e-07, "logits/chosen": 1.0993138551712036, "logits/rejected": 1.5738513469696045, "logps/chosen": -1.3446593284606934, "logps/rejected": -2.769869327545166, "loss": 1.7894, "nll_loss": 1.3446592092514038, "rewards/accuracies": 0.875, "rewards/chosen": -13.446593284606934, "rewards/margins": 14.252100944519043, "rewards/rejected": -27.698692321777344, "step": 335 }, { "epoch": 0.5010251630941286, "grad_norm": 67.12861397994634, "learning_rate": 5.394757622638559e-07, "logits/chosen": 1.9011707305908203, "logits/rejected": 2.1747989654541016, "logps/chosen": -1.1282011270523071, "logps/rejected": -1.4653615951538086, "loss": 2.6, "nll_loss": 1.1282011270523071, "rewards/accuracies": 0.75, "rewards/chosen": -11.282011985778809, "rewards/margins": 3.3716037273406982, "rewards/rejected": -14.653615951538086, "step": 336 }, { "epoch": 0.5025163094128612, "grad_norm": 51.2055086895284, "learning_rate": 5.370131947382214e-07, "logits/chosen": 1.7519341707229614, "logits/rejected": 1.9194046258926392, "logps/chosen": -1.4238516092300415, "logps/rejected": -3.2271335124969482, "loss": 1.6988, "nll_loss": 1.423851490020752, "rewards/accuracies": 1.0, "rewards/chosen": -14.238516807556152, "rewards/margins": 18.032821655273438, "rewards/rejected": -32.27133560180664, "step": 337 }, { "epoch": 0.5040074557315937, "grad_norm": 325.67543178955356, "learning_rate": 5.34549724101439e-07, "logits/chosen": 1.4702489376068115, "logits/rejected": 1.1438722610473633, "logps/chosen": -1.0996648073196411, "logps/rejected": -1.7886468172073364, "loss": 1.822, "nll_loss": 1.0996649265289307, "rewards/accuracies": 0.875, "rewards/chosen": -10.996648788452148, "rewards/margins": 6.889819145202637, "rewards/rejected": -17.8864688873291, "step": 338 }, { "epoch": 0.5054986020503262, "grad_norm": 280.19664588162436, "learning_rate": 5.32085410461473e-07, "logits/chosen": 1.8264939785003662, "logits/rejected": 1.9685840606689453, "logps/chosen": -1.5246539115905762, "logps/rejected": -2.096618413925171, "loss": 3.1637, "nll_loss": 1.5246539115905762, "rewards/accuracies": 0.875, "rewards/chosen": -15.246540069580078, "rewards/margins": 5.719644546508789, "rewards/rejected": -20.966184616088867, "step": 339 }, { "epoch": 0.5069897483690587, "grad_norm": 76.58759054404746, "learning_rate": 5.296203139468571e-07, "logits/chosen": 0.6917173266410828, "logits/rejected": 1.098940134048462, "logps/chosen": -1.4546515941619873, "logps/rejected": -3.8073456287384033, "loss": 2.1558, "nll_loss": 1.4546515941619873, "rewards/accuracies": 0.625, "rewards/chosen": -14.546515464782715, "rewards/margins": 23.526941299438477, "rewards/rejected": -38.073455810546875, "step": 340 }, { "epoch": 0.5084808946877912, "grad_norm": 104.9310864210115, "learning_rate": 5.271544947052266e-07, "logits/chosen": 1.0520612001419067, "logits/rejected": 1.0490987300872803, "logps/chosen": -1.0005167722702026, "logps/rejected": -1.7621972560882568, "loss": 2.0453, "nll_loss": 1.000516653060913, "rewards/accuracies": 0.75, "rewards/chosen": -10.005167961120605, "rewards/margins": 7.616805553436279, "rewards/rejected": -17.621973037719727, "step": 341 }, { "epoch": 0.5099720410065237, "grad_norm": 53.97625383827924, "learning_rate": 5.246880129018515e-07, "logits/chosen": 2.2423133850097656, "logits/rejected": 2.537630558013916, "logps/chosen": -1.5444772243499756, "logps/rejected": -2.222283363342285, "loss": 1.6948, "nll_loss": 1.5444772243499756, "rewards/accuracies": 0.625, "rewards/chosen": -15.444771766662598, "rewards/margins": 6.778061389923096, "rewards/rejected": -22.22283363342285, "step": 342 }, { "epoch": 0.5114631873252563, "grad_norm": 112.33294253398827, "learning_rate": 5.222209287181676e-07, "logits/chosen": 0.6487884521484375, "logits/rejected": 0.6867713332176208, "logps/chosen": -1.3492956161499023, "logps/rejected": -2.9653830528259277, "loss": 2.5893, "nll_loss": 1.3492956161499023, "rewards/accuracies": 0.875, "rewards/chosen": -13.492956161499023, "rewards/margins": 16.160873413085938, "rewards/rejected": -29.653831481933594, "step": 343 }, { "epoch": 0.5129543336439888, "grad_norm": 107.60256343611687, "learning_rate": 5.197533023503089e-07, "logits/chosen": 0.4045717716217041, "logits/rejected": 0.7231236696243286, "logps/chosen": -0.9761478900909424, "logps/rejected": -2.7345666885375977, "loss": 2.312, "nll_loss": 0.9761478900909424, "rewards/accuracies": 0.875, "rewards/chosen": -9.761479377746582, "rewards/margins": 17.584186553955078, "rewards/rejected": -27.345664978027344, "step": 344 }, { "epoch": 0.5144454799627214, "grad_norm": 156.63547061998838, "learning_rate": 5.172851940076387e-07, "logits/chosen": 0.7327659130096436, "logits/rejected": 0.9826483726501465, "logps/chosen": -1.4324758052825928, "logps/rejected": -2.6677839756011963, "loss": 2.8925, "nll_loss": 1.4324758052825928, "rewards/accuracies": 0.875, "rewards/chosen": -14.324756622314453, "rewards/margins": 12.353079795837402, "rewards/rejected": -26.677839279174805, "step": 345 }, { "epoch": 0.5159366262814539, "grad_norm": 129.0147341187787, "learning_rate": 5.148166639112799e-07, "logits/chosen": 0.9722475409507751, "logits/rejected": 0.9801483154296875, "logps/chosen": -1.194012999534607, "logps/rejected": -2.0182580947875977, "loss": 2.5496, "nll_loss": 1.1940131187438965, "rewards/accuracies": 0.625, "rewards/chosen": -11.940130233764648, "rewards/margins": 8.242452621459961, "rewards/rejected": -20.18258285522461, "step": 346 }, { "epoch": 0.5174277726001864, "grad_norm": 41.688227996168365, "learning_rate": 5.123477722926464e-07, "logits/chosen": 1.3067666292190552, "logits/rejected": 2.00052809715271, "logps/chosen": -1.4092761278152466, "logps/rejected": -3.3266148567199707, "loss": 2.2764, "nll_loss": 1.4092758893966675, "rewards/accuracies": 0.875, "rewards/chosen": -14.092761993408203, "rewards/margins": 19.17338752746582, "rewards/rejected": -33.266151428222656, "step": 347 }, { "epoch": 0.518918918918919, "grad_norm": 40.993761190079624, "learning_rate": 5.098785793919732e-07, "logits/chosen": 1.0896856784820557, "logits/rejected": 1.4323949813842773, "logps/chosen": -1.5167326927185059, "logps/rejected": -2.3191676139831543, "loss": 2.6912, "nll_loss": 1.5167325735092163, "rewards/accuracies": 0.75, "rewards/chosen": -15.167327880859375, "rewards/margins": 8.024349212646484, "rewards/rejected": -23.19167709350586, "step": 348 }, { "epoch": 0.5204100652376514, "grad_norm": 25.50126534337896, "learning_rate": 5.074091454568463e-07, "logits/chosen": 1.669668436050415, "logits/rejected": 2.283931255340576, "logps/chosen": -1.0836777687072754, "logps/rejected": -2.233471632003784, "loss": 1.4233, "nll_loss": 1.0836777687072754, "rewards/accuracies": 0.875, "rewards/chosen": -10.836777687072754, "rewards/margins": 11.497940063476562, "rewards/rejected": -22.334716796875, "step": 349 }, { "epoch": 0.5219012115563839, "grad_norm": 50.976717957780835, "learning_rate": 5.049395307407328e-07, "logits/chosen": 0.9089032411575317, "logits/rejected": 0.991895854473114, "logps/chosen": -1.0036165714263916, "logps/rejected": -1.5741000175476074, "loss": 1.6741, "nll_loss": 1.0036165714263916, "rewards/accuracies": 0.625, "rewards/chosen": -10.036165237426758, "rewards/margins": 5.704835414886475, "rewards/rejected": -15.741000175476074, "step": 350 }, { "epoch": 0.5233923578751165, "grad_norm": 56.51327778810148, "learning_rate": 5.024697955015111e-07, "logits/chosen": 1.0180648565292358, "logits/rejected": 1.2536265850067139, "logps/chosen": -1.1623241901397705, "logps/rejected": -1.6183818578720093, "loss": 2.5333, "nll_loss": 1.162324070930481, "rewards/accuracies": 0.625, "rewards/chosen": -11.623241424560547, "rewards/margins": 4.560578346252441, "rewards/rejected": -16.183818817138672, "step": 351 }, { "epoch": 0.524883504193849, "grad_norm": 49.29582711152062, "learning_rate": 5e-07, "logits/chosen": 0.0769793838262558, "logits/rejected": 1.3817670345306396, "logps/chosen": -1.8828178644180298, "logps/rejected": -3.151564121246338, "loss": 2.696, "nll_loss": 1.8828177452087402, "rewards/accuracies": 0.75, "rewards/chosen": -18.828176498413086, "rewards/margins": 12.687463760375977, "rewards/rejected": -31.515642166137695, "step": 352 }, { "epoch": 0.5263746505125816, "grad_norm": 127.45507991470484, "learning_rate": 4.975302044984888e-07, "logits/chosen": 1.6833350658416748, "logits/rejected": 1.887645959854126, "logps/chosen": -0.7672575116157532, "logps/rejected": -1.969806432723999, "loss": 3.2517, "nll_loss": 0.7672575116157532, "rewards/accuracies": 0.875, "rewards/chosen": -7.6725754737854, "rewards/margins": 12.025486946105957, "rewards/rejected": -19.698062896728516, "step": 353 }, { "epoch": 0.5278657968313141, "grad_norm": 43.674234447941004, "learning_rate": 4.950604692592673e-07, "logits/chosen": 1.0634629726409912, "logits/rejected": 1.2298520803451538, "logps/chosen": -1.300775170326233, "logps/rejected": -2.43308424949646, "loss": 1.3793, "nll_loss": 1.300775170326233, "rewards/accuracies": 1.0, "rewards/chosen": -13.007752418518066, "rewards/margins": 11.323091506958008, "rewards/rejected": -24.330841064453125, "step": 354 }, { "epoch": 0.5293569431500466, "grad_norm": 52.989957684512696, "learning_rate": 4.925908545431537e-07, "logits/chosen": 1.6194576025009155, "logits/rejected": 2.248783588409424, "logps/chosen": -1.4075263738632202, "logps/rejected": -2.3023507595062256, "loss": 1.6829, "nll_loss": 1.4075263738632202, "rewards/accuracies": 1.0, "rewards/chosen": -14.075263977050781, "rewards/margins": 8.94824504852295, "rewards/rejected": -23.023508071899414, "step": 355 }, { "epoch": 0.5308480894687791, "grad_norm": 73.36737835982937, "learning_rate": 4.901214206080268e-07, "logits/chosen": 1.199830174446106, "logits/rejected": 1.2315750122070312, "logps/chosen": -1.0550888776779175, "logps/rejected": -2.9333043098449707, "loss": 1.3172, "nll_loss": 1.055088758468628, "rewards/accuracies": 0.875, "rewards/chosen": -10.550888061523438, "rewards/margins": 18.782155990600586, "rewards/rejected": -29.333044052124023, "step": 356 }, { "epoch": 0.5323392357875116, "grad_norm": 122.00666595480716, "learning_rate": 4.876522277073534e-07, "logits/chosen": 0.8202993869781494, "logits/rejected": 1.3249539136886597, "logps/chosen": -1.5087026357650757, "logps/rejected": -2.674197196960449, "loss": 2.4716, "nll_loss": 1.5087026357650757, "rewards/accuracies": 1.0, "rewards/chosen": -15.087026596069336, "rewards/margins": 11.654947280883789, "rewards/rejected": -26.741973876953125, "step": 357 }, { "epoch": 0.5338303821062442, "grad_norm": 246.76773763488845, "learning_rate": 4.851833360887201e-07, "logits/chosen": 0.8156963586807251, "logits/rejected": 1.3810663223266602, "logps/chosen": -1.6010940074920654, "logps/rejected": -2.600064754486084, "loss": 2.0726, "nll_loss": 1.6010942459106445, "rewards/accuracies": 0.75, "rewards/chosen": -16.010940551757812, "rewards/margins": 9.989707946777344, "rewards/rejected": -26.00065040588379, "step": 358 }, { "epoch": 0.5353215284249767, "grad_norm": 35.91044102378384, "learning_rate": 4.827148059923613e-07, "logits/chosen": 0.42476320266723633, "logits/rejected": 0.9761238694190979, "logps/chosen": -1.138462781906128, "logps/rejected": -2.1033501625061035, "loss": 1.8047, "nll_loss": 1.138462781906128, "rewards/accuracies": 1.0, "rewards/chosen": -11.384627342224121, "rewards/margins": 9.648874282836914, "rewards/rejected": -21.03350067138672, "step": 359 }, { "epoch": 0.5368126747437092, "grad_norm": 95.29465790699732, "learning_rate": 4.802466976496911e-07, "logits/chosen": 0.7101988196372986, "logits/rejected": 0.693535566329956, "logps/chosen": -1.7004222869873047, "logps/rejected": -4.368194103240967, "loss": 1.8352, "nll_loss": 1.7004221677780151, "rewards/accuracies": 0.75, "rewards/chosen": -17.004222869873047, "rewards/margins": 26.677719116210938, "rewards/rejected": -43.681941986083984, "step": 360 }, { "epoch": 0.5383038210624418, "grad_norm": 76.17525593551915, "learning_rate": 4.777790712818323e-07, "logits/chosen": 1.3881099224090576, "logits/rejected": 1.3718510866165161, "logps/chosen": -1.3567442893981934, "logps/rejected": -1.6875207424163818, "loss": 2.2402, "nll_loss": 1.356744408607483, "rewards/accuracies": 0.75, "rewards/chosen": -13.56744384765625, "rewards/margins": 3.3077638149261475, "rewards/rejected": -16.875205993652344, "step": 361 }, { "epoch": 0.5397949673811743, "grad_norm": 89.06386872504564, "learning_rate": 4.753119870981485e-07, "logits/chosen": 1.8389939069747925, "logits/rejected": 1.5736289024353027, "logps/chosen": -1.0070418119430542, "logps/rejected": -2.262326240539551, "loss": 2.3572, "nll_loss": 1.0070418119430542, "rewards/accuracies": 0.875, "rewards/chosen": -10.070418357849121, "rewards/margins": 12.55284309387207, "rewards/rejected": -22.623260498046875, "step": 362 }, { "epoch": 0.5412861136999068, "grad_norm": 36.98114107146971, "learning_rate": 4.728455052947732e-07, "logits/chosen": 0.567602813243866, "logits/rejected": 1.1926748752593994, "logps/chosen": -1.1497440338134766, "logps/rejected": -3.3005409240722656, "loss": 1.661, "nll_loss": 1.1497440338134766, "rewards/accuracies": 0.875, "rewards/chosen": -11.497440338134766, "rewards/margins": 21.50796890258789, "rewards/rejected": -33.005409240722656, "step": 363 }, { "epoch": 0.5427772600186394, "grad_norm": 57.16827451171126, "learning_rate": 4.703796860531429e-07, "logits/chosen": 1.8566818237304688, "logits/rejected": 1.505059838294983, "logps/chosen": -1.3204902410507202, "logps/rejected": -2.035156726837158, "loss": 1.8646, "nll_loss": 1.3204902410507202, "rewards/accuracies": 0.875, "rewards/chosen": -13.204903602600098, "rewards/margins": 7.146665096282959, "rewards/rejected": -20.351566314697266, "step": 364 }, { "epoch": 0.5442684063373718, "grad_norm": 62.435237140372806, "learning_rate": 4.679145895385269e-07, "logits/chosen": 0.8371939659118652, "logits/rejected": 1.0040203332901, "logps/chosen": -1.4417243003845215, "logps/rejected": -2.8484854698181152, "loss": 1.8472, "nll_loss": 1.4417245388031006, "rewards/accuracies": 0.75, "rewards/chosen": -14.417244911193848, "rewards/margins": 14.067611694335938, "rewards/rejected": -28.48485565185547, "step": 365 }, { "epoch": 0.5457595526561044, "grad_norm": 115.67435377041399, "learning_rate": 4.6545027589856105e-07, "logits/chosen": 1.0038310289382935, "logits/rejected": 1.2312933206558228, "logps/chosen": -0.9938388466835022, "logps/rejected": -2.520301342010498, "loss": 1.9517, "nll_loss": 0.9938388466835022, "rewards/accuracies": 1.0, "rewards/chosen": -9.93838882446289, "rewards/margins": 15.264625549316406, "rewards/rejected": -25.203014373779297, "step": 366 }, { "epoch": 0.5472506989748369, "grad_norm": 59.543682048827165, "learning_rate": 4.6298680526177855e-07, "logits/chosen": 1.9276726245880127, "logits/rejected": 1.532571792602539, "logps/chosen": -1.876322865486145, "logps/rejected": -2.3194832801818848, "loss": 2.5512, "nll_loss": 1.876322865486145, "rewards/accuracies": 0.5, "rewards/chosen": -18.763229370117188, "rewards/margins": 4.43160343170166, "rewards/rejected": -23.194833755493164, "step": 367 }, { "epoch": 0.5487418452935694, "grad_norm": 35.52641208148878, "learning_rate": 4.60524237736144e-07, "logits/chosen": 1.6724348068237305, "logits/rejected": 1.3174070119857788, "logps/chosen": -1.13054358959198, "logps/rejected": -2.897552490234375, "loss": 2.0898, "nll_loss": 1.1305434703826904, "rewards/accuracies": 1.0, "rewards/chosen": -11.305435180664062, "rewards/margins": 17.670085906982422, "rewards/rejected": -28.975521087646484, "step": 368 }, { "epoch": 0.550232991612302, "grad_norm": 39.92830775665279, "learning_rate": 4.5806263340758636e-07, "logits/chosen": 2.316521167755127, "logits/rejected": 2.480522871017456, "logps/chosen": -1.6596410274505615, "logps/rejected": -2.200403928756714, "loss": 2.3369, "nll_loss": 1.6596410274505615, "rewards/accuracies": 0.5, "rewards/chosen": -16.596410751342773, "rewards/margins": 5.407629013061523, "rewards/rejected": -22.004037857055664, "step": 369 }, { "epoch": 0.5517241379310345, "grad_norm": 60.985171266996204, "learning_rate": 4.556020523385326e-07, "logits/chosen": 1.0311435461044312, "logits/rejected": 1.2425912618637085, "logps/chosen": -1.3279130458831787, "logps/rejected": -2.388808012008667, "loss": 2.5089, "nll_loss": 1.3279130458831787, "rewards/accuracies": 0.875, "rewards/chosen": -13.279131889343262, "rewards/margins": 10.6089506149292, "rewards/rejected": -23.888080596923828, "step": 370 }, { "epoch": 0.553215284249767, "grad_norm": 59.15727767457908, "learning_rate": 4.531425545664425e-07, "logits/chosen": 0.4395480751991272, "logits/rejected": 0.9258232712745667, "logps/chosen": -0.7140366435050964, "logps/rejected": -3.106039524078369, "loss": 2.1409, "nll_loss": 0.7140365839004517, "rewards/accuracies": 0.875, "rewards/chosen": -7.140366554260254, "rewards/margins": 23.92003059387207, "rewards/rejected": -31.060396194458008, "step": 371 }, { "epoch": 0.5547064305684996, "grad_norm": 113.41652725050223, "learning_rate": 4.5068420010234413e-07, "logits/chosen": 1.692410945892334, "logits/rejected": 1.9226614236831665, "logps/chosen": -1.5776925086975098, "logps/rejected": -2.640813112258911, "loss": 2.5214, "nll_loss": 1.5776923894882202, "rewards/accuracies": 0.75, "rewards/chosen": -15.776925086975098, "rewards/margins": 10.631204605102539, "rewards/rejected": -26.408130645751953, "step": 372 }, { "epoch": 0.556197576887232, "grad_norm": 42.52993255604326, "learning_rate": 4.482270489293685e-07, "logits/chosen": 1.9083011150360107, "logits/rejected": 2.202073574066162, "logps/chosen": -1.2748469114303589, "logps/rejected": -3.1168975830078125, "loss": 1.9004, "nll_loss": 1.2748469114303589, "rewards/accuracies": 1.0, "rewards/chosen": -12.748468399047852, "rewards/margins": 18.420507431030273, "rewards/rejected": -31.168975830078125, "step": 373 }, { "epoch": 0.5576887232059646, "grad_norm": 45.99191086143298, "learning_rate": 4.457711610012873e-07, "logits/chosen": 1.3463859558105469, "logits/rejected": 2.0031678676605225, "logps/chosen": -1.2941997051239014, "logps/rejected": -2.7461254596710205, "loss": 2.0979, "nll_loss": 1.2941997051239014, "rewards/accuracies": 0.875, "rewards/chosen": -12.941995620727539, "rewards/margins": 14.519258499145508, "rewards/rejected": -27.461254119873047, "step": 374 }, { "epoch": 0.5591798695246971, "grad_norm": 65.97353088513727, "learning_rate": 4.4331659624104876e-07, "logits/chosen": 1.1032230854034424, "logits/rejected": 0.8625024557113647, "logps/chosen": -1.2884620428085327, "logps/rejected": -2.2827272415161133, "loss": 2.5707, "nll_loss": 1.2884619235992432, "rewards/accuracies": 0.75, "rewards/chosen": -12.88461971282959, "rewards/margins": 9.94265365600586, "rewards/rejected": -22.827274322509766, "step": 375 }, { "epoch": 0.5606710158434296, "grad_norm": 63.093501055358686, "learning_rate": 4.4086341453931714e-07, "logits/chosen": 0.72505784034729, "logits/rejected": 1.518155813217163, "logps/chosen": -1.6150670051574707, "logps/rejected": -2.603109359741211, "loss": 3.561, "nll_loss": 1.6150668859481812, "rewards/accuracies": 0.625, "rewards/chosen": -16.15066909790039, "rewards/margins": 9.880425453186035, "rewards/rejected": -26.03109359741211, "step": 376 }, { "epoch": 0.5621621621621622, "grad_norm": 45.300199929904046, "learning_rate": 4.3841167575300933e-07, "logits/chosen": 0.9565849900245667, "logits/rejected": 1.066584587097168, "logps/chosen": -1.4316563606262207, "logps/rejected": -1.8951045274734497, "loss": 1.5506, "nll_loss": 1.4316564798355103, "rewards/accuracies": 0.875, "rewards/chosen": -14.316564559936523, "rewards/margins": 4.634482383728027, "rewards/rejected": -18.951045989990234, "step": 377 }, { "epoch": 0.5636533084808947, "grad_norm": 67.19431471811166, "learning_rate": 4.359614397038366e-07, "logits/chosen": 1.5544148683547974, "logits/rejected": 1.700253963470459, "logps/chosen": -1.516951560974121, "logps/rejected": -1.8979204893112183, "loss": 2.6884, "nll_loss": 1.516951560974121, "rewards/accuracies": 0.875, "rewards/chosen": -15.169515609741211, "rewards/margins": 3.809689521789551, "rewards/rejected": -18.979204177856445, "step": 378 }, { "epoch": 0.5651444547996272, "grad_norm": 50.18842054638841, "learning_rate": 4.3351276617684285e-07, "logits/chosen": 1.82659113407135, "logits/rejected": 2.0349600315093994, "logps/chosen": -1.2060390710830688, "logps/rejected": -2.184781551361084, "loss": 1.1838, "nll_loss": 1.2060391902923584, "rewards/accuracies": 0.875, "rewards/chosen": -12.060392379760742, "rewards/margins": 9.787421226501465, "rewards/rejected": -21.84781265258789, "step": 379 }, { "epoch": 0.5666356011183598, "grad_norm": 100.82150396227959, "learning_rate": 4.310657149189478e-07, "logits/chosen": 0.8955983519554138, "logits/rejected": 1.8611811399459839, "logps/chosen": -1.1403526067733765, "logps/rejected": -2.4114925861358643, "loss": 2.1743, "nll_loss": 1.1403526067733765, "rewards/accuracies": 1.0, "rewards/chosen": -11.403526306152344, "rewards/margins": 12.71139907836914, "rewards/rejected": -24.114925384521484, "step": 380 }, { "epoch": 0.5681267474370922, "grad_norm": 49.37574167209818, "learning_rate": 4.2862034563748765e-07, "logits/chosen": 1.445024847984314, "logits/rejected": 1.0723494291305542, "logps/chosen": -1.2697113752365112, "logps/rejected": -2.8943827152252197, "loss": 1.8205, "nll_loss": 1.2697113752365112, "rewards/accuracies": 0.75, "rewards/chosen": -12.697113990783691, "rewards/margins": 16.246713638305664, "rewards/rejected": -28.94382667541504, "step": 381 }, { "epoch": 0.5696178937558248, "grad_norm": 71.69580673521774, "learning_rate": 4.2617671799875944e-07, "logits/chosen": 1.3698145151138306, "logits/rejected": 1.2572070360183716, "logps/chosen": -1.0628803968429565, "logps/rejected": -1.848346471786499, "loss": 2.5224, "nll_loss": 1.0628806352615356, "rewards/accuracies": 0.875, "rewards/chosen": -10.628803253173828, "rewards/margins": 7.854660511016846, "rewards/rejected": -18.48346710205078, "step": 382 }, { "epoch": 0.5711090400745573, "grad_norm": 49.57487875570872, "learning_rate": 4.237348916265637e-07, "logits/chosen": 0.9029265642166138, "logits/rejected": 1.0498796701431274, "logps/chosen": -0.828253984451294, "logps/rejected": -2.1390380859375, "loss": 1.2875, "nll_loss": 0.8282539248466492, "rewards/accuracies": 1.0, "rewards/chosen": -8.282539367675781, "rewards/margins": 13.107840538024902, "rewards/rejected": -21.390378952026367, "step": 383 }, { "epoch": 0.5726001863932898, "grad_norm": 47.192019525125666, "learning_rate": 4.2129492610075183e-07, "logits/chosen": 1.824859619140625, "logits/rejected": 1.5693551301956177, "logps/chosen": -1.3151293992996216, "logps/rejected": -1.891690731048584, "loss": 2.0956, "nll_loss": 1.3151293992996216, "rewards/accuracies": 0.625, "rewards/chosen": -13.151294708251953, "rewards/margins": 5.765612602233887, "rewards/rejected": -18.916908264160156, "step": 384 }, { "epoch": 0.5740913327120224, "grad_norm": 68.62274734978858, "learning_rate": 4.1885688095577e-07, "logits/chosen": 0.8779973387718201, "logits/rejected": 1.3293192386627197, "logps/chosen": -0.8747768402099609, "logps/rejected": -2.0907740592956543, "loss": 1.6596, "nll_loss": 0.8747768402099609, "rewards/accuracies": 1.0, "rewards/chosen": -8.74776840209961, "rewards/margins": 12.15997314453125, "rewards/rejected": -20.90774154663086, "step": 385 }, { "epoch": 0.5755824790307549, "grad_norm": 27.63739304721906, "learning_rate": 4.164208156792084e-07, "logits/chosen": 0.765286922454834, "logits/rejected": 1.0338056087493896, "logps/chosen": -0.9056370258331299, "logps/rejected": -1.8033547401428223, "loss": 2.1928, "nll_loss": 0.9056369066238403, "rewards/accuracies": 0.75, "rewards/chosen": -9.05636978149414, "rewards/margins": 8.977177619934082, "rewards/rejected": -18.033546447753906, "step": 386 }, { "epoch": 0.5770736253494875, "grad_norm": 36.881961688683596, "learning_rate": 4.139867897103484e-07, "logits/chosen": 0.45405313372612, "logits/rejected": 0.6241415739059448, "logps/chosen": -1.109076976776123, "logps/rejected": -2.5582451820373535, "loss": 1.6022, "nll_loss": 1.109076976776123, "rewards/accuracies": 1.0, "rewards/chosen": -11.090768814086914, "rewards/margins": 14.491682052612305, "rewards/rejected": -25.58245086669922, "step": 387 }, { "epoch": 0.57856477166822, "grad_norm": 55.64832356660189, "learning_rate": 4.1155486243871363e-07, "logits/chosen": 1.1602576971054077, "logits/rejected": 1.5388338565826416, "logps/chosen": -1.3016027212142944, "logps/rejected": -2.1892929077148438, "loss": 2.5378, "nll_loss": 1.3016026020050049, "rewards/accuracies": 0.875, "rewards/chosen": -13.016027450561523, "rewards/margins": 8.876900672912598, "rewards/rejected": -21.892927169799805, "step": 388 }, { "epoch": 0.5800559179869524, "grad_norm": 35.55429599551597, "learning_rate": 4.091250932026191e-07, "logits/chosen": 0.9565999507904053, "logits/rejected": 1.5665593147277832, "logps/chosen": -1.0186738967895508, "logps/rejected": -2.1096742153167725, "loss": 1.8503, "nll_loss": 1.0186740159988403, "rewards/accuracies": 0.875, "rewards/chosen": -10.186738967895508, "rewards/margins": 10.910001754760742, "rewards/rejected": -21.09674072265625, "step": 389 }, { "epoch": 0.581547064305685, "grad_norm": 69.06322291465483, "learning_rate": 4.066975412877255e-07, "logits/chosen": 0.36940881609916687, "logits/rejected": 0.5106832385063171, "logps/chosen": -1.763047695159912, "logps/rejected": -2.7701592445373535, "loss": 1.7896, "nll_loss": 1.763047695159912, "rewards/accuracies": 0.875, "rewards/chosen": -17.630477905273438, "rewards/margins": 10.071113586425781, "rewards/rejected": -27.701589584350586, "step": 390 }, { "epoch": 0.5830382106244175, "grad_norm": 54.823494430910216, "learning_rate": 4.042722659255906e-07, "logits/chosen": 1.4899789094924927, "logits/rejected": 1.5993945598602295, "logps/chosen": -1.0938667058944702, "logps/rejected": -2.578829050064087, "loss": 2.1689, "nll_loss": 1.0938668251037598, "rewards/accuracies": 1.0, "rewards/chosen": -10.938667297363281, "rewards/margins": 14.849624633789062, "rewards/rejected": -25.788291931152344, "step": 391 }, { "epoch": 0.58452935694315, "grad_norm": 59.87078190391284, "learning_rate": 4.0184932629222574e-07, "logits/chosen": 1.8408669233322144, "logits/rejected": 1.8936665058135986, "logps/chosen": -0.9872679114341736, "logps/rejected": -5.953817367553711, "loss": 2.0431, "nll_loss": 0.9872679710388184, "rewards/accuracies": 0.875, "rewards/chosen": -9.87267780303955, "rewards/margins": 49.665489196777344, "rewards/rejected": -59.538169860839844, "step": 392 }, { "epoch": 0.5860205032618826, "grad_norm": 99.10506654477443, "learning_rate": 3.9942878150665027e-07, "logits/chosen": 1.2618341445922852, "logits/rejected": 1.1964116096496582, "logps/chosen": -1.6831088066101074, "logps/rejected": -2.680295467376709, "loss": 2.4551, "nll_loss": 1.683108925819397, "rewards/accuracies": 0.625, "rewards/chosen": -16.83108901977539, "rewards/margins": 9.971864700317383, "rewards/rejected": -26.802955627441406, "step": 393 }, { "epoch": 0.5875116495806151, "grad_norm": 69.95462863715613, "learning_rate": 3.970106906294509e-07, "logits/chosen": 1.3020118474960327, "logits/rejected": 0.9566605091094971, "logps/chosen": -1.1724271774291992, "logps/rejected": -1.895683765411377, "loss": 2.4444, "nll_loss": 1.1724271774291992, "rewards/accuracies": 0.75, "rewards/chosen": -11.72426986694336, "rewards/margins": 7.232565879821777, "rewards/rejected": -18.956836700439453, "step": 394 }, { "epoch": 0.5890027958993477, "grad_norm": 72.28018046630525, "learning_rate": 3.945951126613387e-07, "logits/chosen": 1.7904846668243408, "logits/rejected": 1.971991777420044, "logps/chosen": -1.2904613018035889, "logps/rejected": -2.2912511825561523, "loss": 1.9116, "nll_loss": 1.2904613018035889, "rewards/accuracies": 0.75, "rewards/chosen": -12.904610633850098, "rewards/margins": 10.007899284362793, "rewards/rejected": -22.91250991821289, "step": 395 }, { "epoch": 0.5904939422180802, "grad_norm": 64.6520711455083, "learning_rate": 3.921821065417116e-07, "logits/chosen": 0.5514054298400879, "logits/rejected": 0.4734039306640625, "logps/chosen": -1.5220355987548828, "logps/rejected": -2.2492659091949463, "loss": 2.0462, "nll_loss": 1.522035837173462, "rewards/accuracies": 0.875, "rewards/chosen": -15.220356941223145, "rewards/margins": 7.272302150726318, "rewards/rejected": -22.492658615112305, "step": 396 }, { "epoch": 0.5919850885368126, "grad_norm": 74.35865053377688, "learning_rate": 3.89771731147214e-07, "logits/chosen": 0.790753960609436, "logits/rejected": 0.9726923108100891, "logps/chosen": -1.4724862575531006, "logps/rejected": -1.5546221733093262, "loss": 2.1503, "nll_loss": 1.4724860191345215, "rewards/accuracies": 0.5, "rewards/chosen": -14.724863052368164, "rewards/margins": 0.8213585615158081, "rewards/rejected": -15.546221733093262, "step": 397 }, { "epoch": 0.5934762348555452, "grad_norm": 54.58234848469142, "learning_rate": 3.8736404529030255e-07, "logits/chosen": 0.807062029838562, "logits/rejected": 1.2913286685943604, "logps/chosen": -1.2444322109222412, "logps/rejected": -2.2048096656799316, "loss": 1.4848, "nll_loss": 1.2444320917129517, "rewards/accuracies": 0.75, "rewards/chosen": -12.44432258605957, "rewards/margins": 9.60377311706543, "rewards/rejected": -22.048095703125, "step": 398 }, { "epoch": 0.5949673811742777, "grad_norm": 88.92242648972613, "learning_rate": 3.8495910771780893e-07, "logits/chosen": 1.281713843345642, "logits/rejected": 1.7675105333328247, "logps/chosen": -1.7222932577133179, "logps/rejected": -2.6808714866638184, "loss": 3.014, "nll_loss": 1.7222931385040283, "rewards/accuracies": 0.625, "rewards/chosen": -17.222932815551758, "rewards/margins": 9.585779190063477, "rewards/rejected": -26.808712005615234, "step": 399 }, { "epoch": 0.5964585274930102, "grad_norm": 43.757364321920235, "learning_rate": 3.825569771095082e-07, "logits/chosen": 1.114524483680725, "logits/rejected": 1.375151515007019, "logps/chosen": -1.072527527809143, "logps/rejected": -3.058666467666626, "loss": 1.7625, "nll_loss": 1.0725274085998535, "rewards/accuracies": 0.875, "rewards/chosen": -10.725275993347168, "rewards/margins": 19.861387252807617, "rewards/rejected": -30.5866641998291, "step": 400 }, { "epoch": 0.5979496738117428, "grad_norm": 53.13995705660573, "learning_rate": 3.801577120766859e-07, "logits/chosen": 1.2310510873794556, "logits/rejected": 1.5479485988616943, "logps/chosen": -1.18758225440979, "logps/rejected": -2.707247018814087, "loss": 2.4506, "nll_loss": 1.18758225440979, "rewards/accuracies": 1.0, "rewards/chosen": -11.875821113586426, "rewards/margins": 15.196648597717285, "rewards/rejected": -27.072471618652344, "step": 401 }, { "epoch": 0.5994408201304753, "grad_norm": 59.186371030115765, "learning_rate": 3.777613711607087e-07, "logits/chosen": 0.6153742074966431, "logits/rejected": 1.1276042461395264, "logps/chosen": -1.373226523399353, "logps/rejected": -2.5880839824676514, "loss": 1.3769, "nll_loss": 1.373226523399353, "rewards/accuracies": 0.875, "rewards/chosen": -13.732264518737793, "rewards/margins": 12.148576736450195, "rewards/rejected": -25.880840301513672, "step": 402 }, { "epoch": 0.6009319664492079, "grad_norm": 34.158229059058655, "learning_rate": 3.753680128315952e-07, "logits/chosen": -0.09757497161626816, "logits/rejected": 0.2057550698518753, "logps/chosen": -1.454146385192871, "logps/rejected": -3.5339365005493164, "loss": 1.6317, "nll_loss": 1.454146385192871, "rewards/accuracies": 1.0, "rewards/chosen": -14.541465759277344, "rewards/margins": 20.797897338867188, "rewards/rejected": -35.33936309814453, "step": 403 }, { "epoch": 0.6024231127679404, "grad_norm": 43.757133641560635, "learning_rate": 3.7297769548659046e-07, "logits/chosen": 1.29662024974823, "logits/rejected": 1.9946157932281494, "logps/chosen": -1.143658995628357, "logps/rejected": -2.533445119857788, "loss": 2.3373, "nll_loss": 1.143658995628357, "rewards/accuracies": 0.875, "rewards/chosen": -11.436590194702148, "rewards/margins": 13.897862434387207, "rewards/rejected": -25.334453582763672, "step": 404 }, { "epoch": 0.6039142590866728, "grad_norm": 68.84761989778333, "learning_rate": 3.7059047744873955e-07, "logits/chosen": 0.7742293477058411, "logits/rejected": 0.8148295879364014, "logps/chosen": -0.9669423699378967, "logps/rejected": -2.176255702972412, "loss": 2.4883, "nll_loss": 0.9669424295425415, "rewards/accuracies": 0.75, "rewards/chosen": -9.669424057006836, "rewards/margins": 12.093133926391602, "rewards/rejected": -21.762556076049805, "step": 405 }, { "epoch": 0.6054054054054054, "grad_norm": 72.63496095280603, "learning_rate": 3.6820641696546627e-07, "logits/chosen": 1.5729310512542725, "logits/rejected": 1.7845934629440308, "logps/chosen": -1.5630098581314087, "logps/rejected": -3.274157762527466, "loss": 1.5419, "nll_loss": 1.5630098581314087, "rewards/accuracies": 1.0, "rewards/chosen": -15.63010025024414, "rewards/margins": 17.111478805541992, "rewards/rejected": -32.741580963134766, "step": 406 }, { "epoch": 0.6068965517241379, "grad_norm": 149.93946543476252, "learning_rate": 3.6582557220714997e-07, "logits/chosen": 0.264228880405426, "logits/rejected": 1.0309929847717285, "logps/chosen": -1.0704864263534546, "logps/rejected": -3.3166284561157227, "loss": 1.7656, "nll_loss": 1.0704864263534546, "rewards/accuracies": 0.875, "rewards/chosen": -10.704864501953125, "rewards/margins": 22.461421966552734, "rewards/rejected": -33.166290283203125, "step": 407 }, { "epoch": 0.6083876980428704, "grad_norm": 70.07278408398058, "learning_rate": 3.634480012657084e-07, "logits/chosen": 1.3579182624816895, "logits/rejected": 1.3562121391296387, "logps/chosen": -1.4525789022445679, "logps/rejected": -2.9784724712371826, "loss": 1.872, "nll_loss": 1.4525790214538574, "rewards/accuracies": 0.875, "rewards/chosen": -14.525790214538574, "rewards/margins": 15.25893783569336, "rewards/rejected": -29.784727096557617, "step": 408 }, { "epoch": 0.609878844361603, "grad_norm": 215.56266336124844, "learning_rate": 3.610737621531781e-07, "logits/chosen": 1.726470947265625, "logits/rejected": 2.071410655975342, "logps/chosen": -1.3145530223846436, "logps/rejected": -2.4200377464294434, "loss": 1.8869, "nll_loss": 1.314553141593933, "rewards/accuracies": 0.875, "rewards/chosen": -13.145530700683594, "rewards/margins": 11.05484676361084, "rewards/rejected": -24.20037841796875, "step": 409 }, { "epoch": 0.6113699906803355, "grad_norm": 48.49475915636012, "learning_rate": 3.587029128003006e-07, "logits/chosen": 0.8173800110816956, "logits/rejected": 0.923591673374176, "logps/chosen": -1.4306600093841553, "logps/rejected": -3.322990894317627, "loss": 2.6237, "nll_loss": 1.4306602478027344, "rewards/accuracies": 0.875, "rewards/chosen": -14.306600570678711, "rewards/margins": 18.92330551147461, "rewards/rejected": -33.22990798950195, "step": 410 }, { "epoch": 0.6128611369990681, "grad_norm": 54.00669496137651, "learning_rate": 3.5633551105510806e-07, "logits/chosen": 0.9530036449432373, "logits/rejected": 0.9924337863922119, "logps/chosen": -1.5625369548797607, "logps/rejected": -2.6102585792541504, "loss": 2.5014, "nll_loss": 1.5625371932983398, "rewards/accuracies": 0.5, "rewards/chosen": -15.625370025634766, "rewards/margins": 10.477216720581055, "rewards/rejected": -26.102584838867188, "step": 411 }, { "epoch": 0.6143522833178006, "grad_norm": 45.648147784415954, "learning_rate": 3.5397161468151214e-07, "logits/chosen": 1.569969654083252, "logits/rejected": 1.1461650133132935, "logps/chosen": -1.364423155784607, "logps/rejected": -2.5526044368743896, "loss": 1.6183, "nll_loss": 1.3644229173660278, "rewards/accuracies": 0.625, "rewards/chosen": -13.644231796264648, "rewards/margins": 11.881815910339355, "rewards/rejected": -25.526046752929688, "step": 412 }, { "epoch": 0.615843429636533, "grad_norm": 61.258417482495204, "learning_rate": 3.516112813578941e-07, "logits/chosen": 0.2635102868080139, "logits/rejected": 0.8463178277015686, "logps/chosen": -1.4131048917770386, "logps/rejected": -3.087088108062744, "loss": 2.2833, "nll_loss": 1.4131051301956177, "rewards/accuracies": 0.875, "rewards/chosen": -14.131050109863281, "rewards/margins": 16.73983383178711, "rewards/rejected": -30.87088394165039, "step": 413 }, { "epoch": 0.6173345759552656, "grad_norm": 60.87896365725072, "learning_rate": 3.492545686756986e-07, "logits/chosen": 1.755131483078003, "logits/rejected": 2.0312671661376953, "logps/chosen": -1.026936411857605, "logps/rejected": -2.6670114994049072, "loss": 2.4555, "nll_loss": 1.0269362926483154, "rewards/accuracies": 1.0, "rewards/chosen": -10.269363403320312, "rewards/margins": 16.400753021240234, "rewards/rejected": -26.670114517211914, "step": 414 }, { "epoch": 0.6188257222739981, "grad_norm": 73.6020982510389, "learning_rate": 3.4690153413802653e-07, "logits/chosen": 1.4790246486663818, "logits/rejected": 1.977297306060791, "logps/chosen": -1.4076223373413086, "logps/rejected": -2.840873956680298, "loss": 2.2488, "nll_loss": 1.4076223373413086, "rewards/accuracies": 0.75, "rewards/chosen": -14.076223373413086, "rewards/margins": 14.332515716552734, "rewards/rejected": -28.408740997314453, "step": 415 }, { "epoch": 0.6203168685927306, "grad_norm": 65.13663123971088, "learning_rate": 3.445522351582344e-07, "logits/chosen": 1.1710360050201416, "logits/rejected": 1.0605045557022095, "logps/chosen": -1.2275431156158447, "logps/rejected": -2.114682912826538, "loss": 2.9264, "nll_loss": 1.2275429964065552, "rewards/accuracies": 0.75, "rewards/chosen": -12.275430679321289, "rewards/margins": 8.871397018432617, "rewards/rejected": -21.146827697753906, "step": 416 }, { "epoch": 0.6218080149114632, "grad_norm": 52.427745005888106, "learning_rate": 3.4220672905853107e-07, "logits/chosen": 1.3692402839660645, "logits/rejected": 1.266655445098877, "logps/chosen": -1.1131441593170166, "logps/rejected": -2.0882177352905273, "loss": 1.8158, "nll_loss": 1.1131441593170166, "rewards/accuracies": 0.75, "rewards/chosen": -11.131441116333008, "rewards/margins": 9.75073528289795, "rewards/rejected": -20.882179260253906, "step": 417 }, { "epoch": 0.6232991612301957, "grad_norm": 170.39584633177057, "learning_rate": 3.3986507306858125e-07, "logits/chosen": 0.5034229755401611, "logits/rejected": 0.8567270636558533, "logps/chosen": -1.0039225816726685, "logps/rejected": -2.926978588104248, "loss": 2.5382, "nll_loss": 1.003922462463379, "rewards/accuracies": 1.0, "rewards/chosen": -10.039225578308105, "rewards/margins": 19.230560302734375, "rewards/rejected": -29.269784927368164, "step": 418 }, { "epoch": 0.6247903075489283, "grad_norm": 56.76544732171944, "learning_rate": 3.375273243241071e-07, "logits/chosen": 1.012601613998413, "logits/rejected": 1.2233651876449585, "logps/chosen": -0.9432302117347717, "logps/rejected": -3.0564088821411133, "loss": 1.8637, "nll_loss": 0.943230152130127, "rewards/accuracies": 1.0, "rewards/chosen": -9.43230152130127, "rewards/margins": 21.13178825378418, "rewards/rejected": -30.5640869140625, "step": 419 }, { "epoch": 0.6262814538676608, "grad_norm": 50.03299372561917, "learning_rate": 3.3519353986549604e-07, "logits/chosen": 1.4403176307678223, "logits/rejected": 2.0863468647003174, "logps/chosen": -1.264074444770813, "logps/rejected": -3.074423313140869, "loss": 1.5339, "nll_loss": 1.264074444770813, "rewards/accuracies": 0.875, "rewards/chosen": -12.640746116638184, "rewards/margins": 18.10348892211914, "rewards/rejected": -30.744232177734375, "step": 420 }, { "epoch": 0.6277726001863932, "grad_norm": 83.85556547470075, "learning_rate": 3.328637766364075e-07, "logits/chosen": 0.2867899537086487, "logits/rejected": 0.5098231434822083, "logps/chosen": -1.5081285238265991, "logps/rejected": -2.784846544265747, "loss": 2.5833, "nll_loss": 1.5081287622451782, "rewards/accuracies": 0.875, "rewards/chosen": -15.081287384033203, "rewards/margins": 12.76717758178711, "rewards/rejected": -27.848464965820312, "step": 421 }, { "epoch": 0.6292637465051258, "grad_norm": 91.27366980862237, "learning_rate": 3.305380914823842e-07, "logits/chosen": 1.0160527229309082, "logits/rejected": 0.8778313994407654, "logps/chosen": -1.4328757524490356, "logps/rejected": -1.9611214399337769, "loss": 3.2579, "nll_loss": 1.4328757524490356, "rewards/accuracies": 0.625, "rewards/chosen": -14.32875919342041, "rewards/margins": 5.282455921173096, "rewards/rejected": -19.6112117767334, "step": 422 }, { "epoch": 0.6307548928238583, "grad_norm": 42.63298100880217, "learning_rate": 3.2821654114946496e-07, "logits/chosen": 1.3331093788146973, "logits/rejected": 1.788509488105774, "logps/chosen": -1.002078890800476, "logps/rejected": -1.5614389181137085, "loss": 1.9875, "nll_loss": 1.0020790100097656, "rewards/accuracies": 0.75, "rewards/chosen": -10.020788192749023, "rewards/margins": 5.593601226806641, "rewards/rejected": -15.614389419555664, "step": 423 }, { "epoch": 0.6322460391425909, "grad_norm": 59.79902227784364, "learning_rate": 3.2589918228280066e-07, "logits/chosen": 2.0965588092803955, "logits/rejected": 2.404791831970215, "logps/chosen": -1.8881897926330566, "logps/rejected": -3.009868621826172, "loss": 1.9466, "nll_loss": 1.8881897926330566, "rewards/accuracies": 0.75, "rewards/chosen": -18.88189697265625, "rewards/margins": 11.216791152954102, "rewards/rejected": -30.09868812561035, "step": 424 }, { "epoch": 0.6337371854613234, "grad_norm": 38.641026026310826, "learning_rate": 3.235860714252708e-07, "logits/chosen": 0.49497750401496887, "logits/rejected": 0.8354448676109314, "logps/chosen": -0.9587732553482056, "logps/rejected": -2.164642572402954, "loss": 1.4651, "nll_loss": 0.9587733149528503, "rewards/accuracies": 1.0, "rewards/chosen": -9.587732315063477, "rewards/margins": 12.058693885803223, "rewards/rejected": -21.646427154541016, "step": 425 }, { "epoch": 0.6352283317800559, "grad_norm": 49.29895906205073, "learning_rate": 3.2127726501610554e-07, "logits/chosen": 0.8333710432052612, "logits/rejected": 0.6832611560821533, "logps/chosen": -1.228257179260254, "logps/rejected": -2.4580414295196533, "loss": 1.424, "nll_loss": 1.2282572984695435, "rewards/accuracies": 0.875, "rewards/chosen": -12.282571792602539, "rewards/margins": 12.297839164733887, "rewards/rejected": -24.580411911010742, "step": 426 }, { "epoch": 0.6367194780987885, "grad_norm": 59.09457192361289, "learning_rate": 3.189728193895069e-07, "logits/chosen": 1.2447106838226318, "logits/rejected": 1.6795345544815063, "logps/chosen": -1.2835984230041504, "logps/rejected": -2.071232795715332, "loss": 3.1679, "nll_loss": 1.2835984230041504, "rewards/accuracies": 0.75, "rewards/chosen": -12.835984230041504, "rewards/margins": 7.876343727111816, "rewards/rejected": -20.712326049804688, "step": 427 }, { "epoch": 0.638210624417521, "grad_norm": 55.472676645539266, "learning_rate": 3.1667279077327596e-07, "logits/chosen": 1.4515255689620972, "logits/rejected": 1.4992138147354126, "logps/chosen": -1.2138221263885498, "logps/rejected": -2.1761789321899414, "loss": 2.4694, "nll_loss": 1.2138221263885498, "rewards/accuracies": 1.0, "rewards/chosen": -12.138221740722656, "rewards/margins": 9.62356948852539, "rewards/rejected": -21.761789321899414, "step": 428 }, { "epoch": 0.6397017707362534, "grad_norm": 48.72155621735601, "learning_rate": 3.143772352874393e-07, "logits/chosen": 1.3959624767303467, "logits/rejected": 1.6266913414001465, "logps/chosen": -1.3503375053405762, "logps/rejected": -2.3042397499084473, "loss": 2.1414, "nll_loss": 1.3503373861312866, "rewards/accuracies": 0.75, "rewards/chosen": -13.503375053405762, "rewards/margins": 9.539024353027344, "rewards/rejected": -23.04239845275879, "step": 429 }, { "epoch": 0.641192917054986, "grad_norm": 35.66908163280164, "learning_rate": 3.12086208942881e-07, "logits/chosen": 0.44496166706085205, "logits/rejected": 1.2202012538909912, "logps/chosen": -1.1374258995056152, "logps/rejected": -3.3101794719696045, "loss": 1.208, "nll_loss": 1.1374258995056152, "rewards/accuracies": 0.875, "rewards/chosen": -11.374258995056152, "rewards/margins": 21.727537155151367, "rewards/rejected": -33.10179901123047, "step": 430 }, { "epoch": 0.6426840633737185, "grad_norm": 30.10695795802461, "learning_rate": 3.0979976763997483e-07, "logits/chosen": 1.0399227142333984, "logits/rejected": 0.6588888764381409, "logps/chosen": -1.250860571861267, "logps/rejected": -2.8929429054260254, "loss": 1.8239, "nll_loss": 1.250860333442688, "rewards/accuracies": 0.875, "rewards/chosen": -12.5086030960083, "rewards/margins": 16.420822143554688, "rewards/rejected": -28.929426193237305, "step": 431 }, { "epoch": 0.6441752096924511, "grad_norm": 47.98314828937602, "learning_rate": 3.0751796716722154e-07, "logits/chosen": 1.1352158784866333, "logits/rejected": 1.410915493965149, "logps/chosen": -1.0950578451156616, "logps/rejected": -2.431433916091919, "loss": 1.5171, "nll_loss": 1.0950578451156616, "rewards/accuracies": 1.0, "rewards/chosen": -10.950577735900879, "rewards/margins": 13.363761901855469, "rewards/rejected": -24.314340591430664, "step": 432 }, { "epoch": 0.6456663560111836, "grad_norm": 35.63586624987481, "learning_rate": 3.052408631998863e-07, "logits/chosen": 1.6566545963287354, "logits/rejected": 1.4458212852478027, "logps/chosen": -1.2611005306243896, "logps/rejected": -2.0897819995880127, "loss": 1.4812, "nll_loss": 1.2611005306243896, "rewards/accuracies": 0.625, "rewards/chosen": -12.611005783081055, "rewards/margins": 8.286813735961914, "rewards/rejected": -20.89781951904297, "step": 433 }, { "epoch": 0.6471575023299161, "grad_norm": 46.07313177405188, "learning_rate": 3.0296851129864165e-07, "logits/chosen": 1.6016666889190674, "logits/rejected": 1.2090141773223877, "logps/chosen": -1.4727472066879272, "logps/rejected": -2.661288261413574, "loss": 1.8318, "nll_loss": 1.4727472066879272, "rewards/accuracies": 1.0, "rewards/chosen": -14.727469444274902, "rewards/margins": 11.885411262512207, "rewards/rejected": -26.612884521484375, "step": 434 }, { "epoch": 0.6486486486486487, "grad_norm": 74.1761189064138, "learning_rate": 3.007009669082103e-07, "logits/chosen": 1.45290207862854, "logits/rejected": 1.8482708930969238, "logps/chosen": -1.1907374858856201, "logps/rejected": -2.357393741607666, "loss": 2.126, "nll_loss": 1.1907377243041992, "rewards/accuracies": 0.75, "rewards/chosen": -11.90737533569336, "rewards/margins": 11.6665620803833, "rewards/rejected": -23.573938369750977, "step": 435 }, { "epoch": 0.6501397949673812, "grad_norm": 64.14650524937251, "learning_rate": 2.9843828535601397e-07, "logits/chosen": 1.1652297973632812, "logits/rejected": 0.9601278305053711, "logps/chosen": -1.6680173873901367, "logps/rejected": -3.202648878097534, "loss": 2.3454, "nll_loss": 1.6680173873901367, "rewards/accuracies": 0.5, "rewards/chosen": -16.680173873901367, "rewards/margins": 15.34631633758545, "rewards/rejected": -32.0264892578125, "step": 436 }, { "epoch": 0.6516309412861137, "grad_norm": 53.104136065409264, "learning_rate": 2.9618052185082155e-07, "logits/chosen": 0.9669073224067688, "logits/rejected": 0.7368067502975464, "logps/chosen": -1.1525717973709106, "logps/rejected": -2.254432439804077, "loss": 2.2193, "nll_loss": 1.1525717973709106, "rewards/accuracies": 0.875, "rewards/chosen": -11.525718688964844, "rewards/margins": 11.018606185913086, "rewards/rejected": -22.54432487487793, "step": 437 }, { "epoch": 0.6531220876048462, "grad_norm": 73.70829574207959, "learning_rate": 2.9392773148140404e-07, "logits/chosen": 2.054089307785034, "logits/rejected": 2.129491090774536, "logps/chosen": -1.5031163692474365, "logps/rejected": -3.7904915809631348, "loss": 1.9853, "nll_loss": 1.5031163692474365, "rewards/accuracies": 0.875, "rewards/chosen": -15.031164169311523, "rewards/margins": 22.873748779296875, "rewards/rejected": -37.904911041259766, "step": 438 }, { "epoch": 0.6546132339235787, "grad_norm": 39.8178746230072, "learning_rate": 2.916799692151884e-07, "logits/chosen": 1.1287901401519775, "logits/rejected": 0.6743125319480896, "logps/chosen": -1.3089931011199951, "logps/rejected": -2.152021646499634, "loss": 1.6465, "nll_loss": 1.3089929819107056, "rewards/accuracies": 0.875, "rewards/chosen": -13.089929580688477, "rewards/margins": 8.43028450012207, "rewards/rejected": -21.52021598815918, "step": 439 }, { "epoch": 0.6561043802423113, "grad_norm": 44.707402411865324, "learning_rate": 2.8943728989691857e-07, "logits/chosen": 1.9802640676498413, "logits/rejected": 1.882559061050415, "logps/chosen": -1.4435269832611084, "logps/rejected": -2.479424238204956, "loss": 2.0328, "nll_loss": 1.4435269832611084, "rewards/accuracies": 0.875, "rewards/chosen": -14.435270309448242, "rewards/margins": 10.35897445678711, "rewards/rejected": -24.79424476623535, "step": 440 }, { "epoch": 0.6575955265610438, "grad_norm": 52.793943718785414, "learning_rate": 2.871997482473144e-07, "logits/chosen": 1.3309818506240845, "logits/rejected": 1.430246114730835, "logps/chosen": -1.0432627201080322, "logps/rejected": -2.574901580810547, "loss": 1.7849, "nll_loss": 1.0432627201080322, "rewards/accuracies": 0.875, "rewards/chosen": -10.43262767791748, "rewards/margins": 15.316386222839355, "rewards/rejected": -25.74901580810547, "step": 441 }, { "epoch": 0.6590866728797763, "grad_norm": 40.87763052696959, "learning_rate": 2.849673988617399e-07, "logits/chosen": 1.6311396360397339, "logits/rejected": 0.8815900683403015, "logps/chosen": -0.9415394067764282, "logps/rejected": -1.797757625579834, "loss": 1.2666, "nll_loss": 0.9415394067764282, "rewards/accuracies": 1.0, "rewards/chosen": -9.41539478302002, "rewards/margins": 8.56218147277832, "rewards/rejected": -17.977575302124023, "step": 442 }, { "epoch": 0.6605778191985089, "grad_norm": 41.007404689910715, "learning_rate": 2.827402962088677e-07, "logits/chosen": 1.7913353443145752, "logits/rejected": 2.1356258392333984, "logps/chosen": -1.023738980293274, "logps/rejected": -3.2478387355804443, "loss": 1.739, "nll_loss": 1.0237390995025635, "rewards/accuracies": 0.875, "rewards/chosen": -10.237390518188477, "rewards/margins": 22.240999221801758, "rewards/rejected": -32.478389739990234, "step": 443 }, { "epoch": 0.6620689655172414, "grad_norm": 55.087459504774436, "learning_rate": 2.8051849462935317e-07, "logits/chosen": 2.3499698638916016, "logits/rejected": 2.2420759201049805, "logps/chosen": -1.4096755981445312, "logps/rejected": -1.641516923904419, "loss": 2.7251, "nll_loss": 1.4096755981445312, "rewards/accuracies": 0.625, "rewards/chosen": -14.096755981445312, "rewards/margins": 2.3184118270874023, "rewards/rejected": -16.4151668548584, "step": 444 }, { "epoch": 0.6635601118359739, "grad_norm": 33.19261803739409, "learning_rate": 2.783020483345057e-07, "logits/chosen": 0.9375135898590088, "logits/rejected": 0.7836179733276367, "logps/chosen": -1.0821305513381958, "logps/rejected": -3.1024487018585205, "loss": 2.0048, "nll_loss": 1.0821305513381958, "rewards/accuracies": 0.75, "rewards/chosen": -10.821305274963379, "rewards/margins": 20.20318031311035, "rewards/rejected": -31.024486541748047, "step": 445 }, { "epoch": 0.6650512581547064, "grad_norm": 46.66424288671262, "learning_rate": 2.760910114049686e-07, "logits/chosen": 0.45911887288093567, "logits/rejected": 0.7530557513237, "logps/chosen": -1.196040153503418, "logps/rejected": -2.101353883743286, "loss": 2.4426, "nll_loss": 1.1960399150848389, "rewards/accuracies": 0.875, "rewards/chosen": -11.96040153503418, "rewards/margins": 9.05313777923584, "rewards/rejected": -21.013540267944336, "step": 446 }, { "epoch": 0.6665424044734389, "grad_norm": 68.75236713436564, "learning_rate": 2.738854377893969e-07, "logits/chosen": 1.666689157485962, "logits/rejected": 2.0555055141448975, "logps/chosen": -1.335540533065796, "logps/rejected": -2.899733781814575, "loss": 2.2997, "nll_loss": 1.3355404138565063, "rewards/accuracies": 0.875, "rewards/chosen": -13.3554048538208, "rewards/margins": 15.641935348510742, "rewards/rejected": -28.997339248657227, "step": 447 }, { "epoch": 0.6680335507921715, "grad_norm": 54.42062109615498, "learning_rate": 2.7168538130314345e-07, "logits/chosen": 0.376677930355072, "logits/rejected": 0.675430178642273, "logps/chosen": -0.9450101256370544, "logps/rejected": -2.8028371334075928, "loss": 1.9346, "nll_loss": 0.9450101256370544, "rewards/accuracies": 1.0, "rewards/chosen": -9.450100898742676, "rewards/margins": 18.578269958496094, "rewards/rejected": -28.028369903564453, "step": 448 }, { "epoch": 0.669524697110904, "grad_norm": 71.23646436175397, "learning_rate": 2.6949089562694433e-07, "logits/chosen": 0.9078048467636108, "logits/rejected": 1.0864940881729126, "logps/chosen": -1.1694973707199097, "logps/rejected": -3.034714698791504, "loss": 2.4166, "nll_loss": 1.1694972515106201, "rewards/accuracies": 1.0, "rewards/chosen": -11.694973945617676, "rewards/margins": 18.652170181274414, "rewards/rejected": -30.347145080566406, "step": 449 }, { "epoch": 0.6710158434296365, "grad_norm": 49.91139078148586, "learning_rate": 2.673020343056094e-07, "logits/chosen": 0.33078503608703613, "logits/rejected": 0.979889988899231, "logps/chosen": -1.1470595598220825, "logps/rejected": -2.078333854675293, "loss": 1.6967, "nll_loss": 1.147059440612793, "rewards/accuracies": 0.875, "rewards/chosen": -11.47059440612793, "rewards/margins": 9.312743186950684, "rewards/rejected": -20.78333854675293, "step": 450 }, { "epoch": 0.6725069897483691, "grad_norm": 43.26217547679494, "learning_rate": 2.651188507467161e-07, "logits/chosen": 1.538424015045166, "logits/rejected": 1.7342791557312012, "logps/chosen": -1.0186197757720947, "logps/rejected": -1.8571202754974365, "loss": 2.0133, "nll_loss": 1.0186196565628052, "rewards/accuracies": 0.75, "rewards/chosen": -10.186197280883789, "rewards/margins": 8.385005950927734, "rewards/rejected": -18.57120132446289, "step": 451 }, { "epoch": 0.6739981360671016, "grad_norm": 71.85168571075707, "learning_rate": 2.629413982193059e-07, "logits/chosen": 1.079938530921936, "logits/rejected": 1.236559271812439, "logps/chosen": -1.266383409500122, "logps/rejected": -1.8653570413589478, "loss": 2.9246, "nll_loss": 1.266383409500122, "rewards/accuracies": 0.75, "rewards/chosen": -12.663833618164062, "rewards/margins": 5.989736080169678, "rewards/rejected": -18.653568267822266, "step": 452 }, { "epoch": 0.6754892823858341, "grad_norm": 129.63565634881414, "learning_rate": 2.60769729852585e-07, "logits/chosen": -0.3111036419868469, "logits/rejected": 0.7170066833496094, "logps/chosen": -0.6509460806846619, "logps/rejected": -3.1134836673736572, "loss": 2.6189, "nll_loss": 0.6509461402893066, "rewards/accuracies": 1.0, "rewards/chosen": -6.509461879730225, "rewards/margins": 24.625377655029297, "rewards/rejected": -31.134841918945312, "step": 453 }, { "epoch": 0.6769804287045667, "grad_norm": 41.446021715687706, "learning_rate": 2.5860389863462763e-07, "logits/chosen": 0.4215829372406006, "logits/rejected": 1.3934569358825684, "logps/chosen": -1.2299526929855347, "logps/rejected": -2.8610968589782715, "loss": 1.6905, "nll_loss": 1.2299526929855347, "rewards/accuracies": 1.0, "rewards/chosen": -12.29952621459961, "rewards/margins": 16.31144142150879, "rewards/rejected": -28.61096954345703, "step": 454 }, { "epoch": 0.6784715750232991, "grad_norm": 60.40564921235719, "learning_rate": 2.564439574110833e-07, "logits/chosen": 1.331271767616272, "logits/rejected": 2.258624792098999, "logps/chosen": -1.3361079692840576, "logps/rejected": -2.4887638092041016, "loss": 2.1492, "nll_loss": 1.3361079692840576, "rewards/accuracies": 0.875, "rewards/chosen": -13.361080169677734, "rewards/margins": 11.526557922363281, "rewards/rejected": -24.887638092041016, "step": 455 }, { "epoch": 0.6799627213420317, "grad_norm": 213.62780772955298, "learning_rate": 2.542899588838875e-07, "logits/chosen": 1.3909834623336792, "logits/rejected": 1.3308240175247192, "logps/chosen": -1.6496881246566772, "logps/rejected": -2.274871826171875, "loss": 2.1291, "nll_loss": 1.6496882438659668, "rewards/accuracies": 0.75, "rewards/chosen": -16.49688148498535, "rewards/margins": 6.251836776733398, "rewards/rejected": -22.748720169067383, "step": 456 }, { "epoch": 0.6814538676607642, "grad_norm": 113.79819976533673, "learning_rate": 2.521419556099754e-07, "logits/chosen": 0.2305797040462494, "logits/rejected": 0.02278699167072773, "logps/chosen": -1.1409465074539185, "logps/rejected": -2.5157318115234375, "loss": 2.0458, "nll_loss": 1.140946626663208, "rewards/accuracies": 0.875, "rewards/chosen": -11.409464836120605, "rewards/margins": 13.747852325439453, "rewards/rejected": -25.157318115234375, "step": 457 }, { "epoch": 0.6829450139794967, "grad_norm": 53.05205087447335, "learning_rate": 2.500000000000001e-07, "logits/chosen": 1.3694684505462646, "logits/rejected": 1.2448177337646484, "logps/chosen": -0.9474362134933472, "logps/rejected": -2.576544761657715, "loss": 2.4503, "nll_loss": 0.9474362134933472, "rewards/accuracies": 1.0, "rewards/chosen": -9.474361419677734, "rewards/margins": 16.291086196899414, "rewards/rejected": -25.76544761657715, "step": 458 }, { "epoch": 0.6844361602982293, "grad_norm": 94.38074467414543, "learning_rate": 2.47864144317053e-07, "logits/chosen": 0.28457310795783997, "logits/rejected": 0.5626116991043091, "logps/chosen": -1.023999571800232, "logps/rejected": -4.407548427581787, "loss": 2.6478, "nll_loss": 1.023999571800232, "rewards/accuracies": 0.875, "rewards/chosen": -10.239995956420898, "rewards/margins": 33.835487365722656, "rewards/rejected": -44.07548522949219, "step": 459 }, { "epoch": 0.6859273066169618, "grad_norm": 155.12455111533777, "learning_rate": 2.4573444067538985e-07, "logits/chosen": 1.1949751377105713, "logits/rejected": 1.0052484273910522, "logps/chosen": -1.0581481456756592, "logps/rejected": -2.1276328563690186, "loss": 3.0956, "nll_loss": 1.0581481456756592, "rewards/accuracies": 0.75, "rewards/chosen": -10.581480979919434, "rewards/margins": 10.694849014282227, "rewards/rejected": -21.27633285522461, "step": 460 }, { "epoch": 0.6874184529356944, "grad_norm": 33.62384733106699, "learning_rate": 2.4361094103915724e-07, "logits/chosen": 0.48369476199150085, "logits/rejected": 0.27421849966049194, "logps/chosen": -1.0688875913619995, "logps/rejected": -1.9155361652374268, "loss": 1.6311, "nll_loss": 1.06888747215271, "rewards/accuracies": 0.75, "rewards/chosen": -10.688875198364258, "rewards/margins": 8.466485977172852, "rewards/rejected": -19.15536117553711, "step": 461 }, { "epoch": 0.6889095992544269, "grad_norm": 49.979637593529766, "learning_rate": 2.4149369722112715e-07, "logits/chosen": 0.8130873441696167, "logits/rejected": 1.4969799518585205, "logps/chosen": -1.1139277219772339, "logps/rejected": -2.584320545196533, "loss": 1.8583, "nll_loss": 1.1139277219772339, "rewards/accuracies": 1.0, "rewards/chosen": -11.139278411865234, "rewards/margins": 14.703924179077148, "rewards/rejected": -25.843204498291016, "step": 462 }, { "epoch": 0.6904007455731593, "grad_norm": 85.69499562338711, "learning_rate": 2.3938276088143e-07, "logits/chosen": 1.6853172779083252, "logits/rejected": 1.7209492921829224, "logps/chosen": -1.050376296043396, "logps/rejected": -2.0380706787109375, "loss": 2.5335, "nll_loss": 1.050376296043396, "rewards/accuracies": 0.75, "rewards/chosen": -10.503764152526855, "rewards/margins": 9.876941680908203, "rewards/rejected": -20.380706787109375, "step": 463 }, { "epoch": 0.6918918918918919, "grad_norm": 98.42041645310131, "learning_rate": 2.3727818352629708e-07, "logits/chosen": 1.3016146421432495, "logits/rejected": 1.5215528011322021, "logps/chosen": -1.1616820096969604, "logps/rejected": -1.9143564701080322, "loss": 2.4056, "nll_loss": 1.161681890487671, "rewards/accuracies": 0.875, "rewards/chosen": -11.616820335388184, "rewards/margins": 7.526744842529297, "rewards/rejected": -19.143566131591797, "step": 464 }, { "epoch": 0.6933830382106244, "grad_norm": 67.84600762969895, "learning_rate": 2.351800165068008e-07, "logits/chosen": 2.1544480323791504, "logits/rejected": 2.2394189834594727, "logps/chosen": -0.8722681999206543, "logps/rejected": -3.095642328262329, "loss": 3.2354, "nll_loss": 0.8722682595252991, "rewards/accuracies": 1.0, "rewards/chosen": -8.72268295288086, "rewards/margins": 22.23373794555664, "rewards/rejected": -30.9564208984375, "step": 465 }, { "epoch": 0.6948741845293569, "grad_norm": 42.045527132294, "learning_rate": 2.3308831101760483e-07, "logits/chosen": 1.230238914489746, "logits/rejected": 1.6758296489715576, "logps/chosen": -1.7337133884429932, "logps/rejected": -2.593168258666992, "loss": 1.672, "nll_loss": 1.7337136268615723, "rewards/accuracies": 0.875, "rewards/chosen": -17.337133407592773, "rewards/margins": 8.594550132751465, "rewards/rejected": -25.931682586669922, "step": 466 }, { "epoch": 0.6963653308480895, "grad_norm": 43.5637245965348, "learning_rate": 2.310031180957117e-07, "logits/chosen": 0.024984115734696388, "logits/rejected": 0.7004745006561279, "logps/chosen": -0.9020689725875854, "logps/rejected": -2.2720069885253906, "loss": 1.8545, "nll_loss": 0.9020689725875854, "rewards/accuracies": 0.875, "rewards/chosen": -9.020689010620117, "rewards/margins": 13.699378967285156, "rewards/rejected": -22.720067977905273, "step": 467 }, { "epoch": 0.697856477166822, "grad_norm": 511.0890454343964, "learning_rate": 2.289244886192207e-07, "logits/chosen": 0.222794309258461, "logits/rejected": 0.4778207540512085, "logps/chosen": -2.1019086837768555, "logps/rejected": -2.340156316757202, "loss": 3.0353, "nll_loss": 2.1019086837768555, "rewards/accuracies": 0.875, "rewards/chosen": -21.01908302307129, "rewards/margins": 2.3824777603149414, "rewards/rejected": -23.401561737060547, "step": 468 }, { "epoch": 0.6993476234855546, "grad_norm": 38.06885262051829, "learning_rate": 2.2685247330608414e-07, "logits/chosen": 0.7382382154464722, "logits/rejected": 0.8615285158157349, "logps/chosen": -1.1256132125854492, "logps/rejected": -2.2803823947906494, "loss": 1.5148, "nll_loss": 1.1256132125854492, "rewards/accuracies": 0.875, "rewards/chosen": -11.256132125854492, "rewards/margins": 11.547691345214844, "rewards/rejected": -22.803823471069336, "step": 469 }, { "epoch": 0.700838769804287, "grad_norm": 123.78505103720795, "learning_rate": 2.2478712271287087e-07, "logits/chosen": -0.260221004486084, "logits/rejected": 0.08943118155002594, "logps/chosen": -1.3078944683074951, "logps/rejected": -2.9328818321228027, "loss": 2.8933, "nll_loss": 1.3078944683074951, "rewards/accuracies": 0.75, "rewards/chosen": -13.078944206237793, "rewards/margins": 16.24987030029297, "rewards/rejected": -29.32881736755371, "step": 470 }, { "epoch": 0.7023299161230195, "grad_norm": 51.30693741373788, "learning_rate": 2.227284872335325e-07, "logits/chosen": 0.8327363729476929, "logits/rejected": 1.1221492290496826, "logps/chosen": -1.0217963457107544, "logps/rejected": -4.047585487365723, "loss": 1.6299, "nll_loss": 1.0217963457107544, "rewards/accuracies": 0.875, "rewards/chosen": -10.217963218688965, "rewards/margins": 30.257888793945312, "rewards/rejected": -40.475852966308594, "step": 471 }, { "epoch": 0.7038210624417521, "grad_norm": 28.86346583951883, "learning_rate": 2.2067661709817382e-07, "logits/chosen": -0.10334792733192444, "logits/rejected": 0.23174366354942322, "logps/chosen": -0.8516181707382202, "logps/rejected": -2.4010567665100098, "loss": 1.774, "nll_loss": 0.8516180515289307, "rewards/accuracies": 1.0, "rewards/chosen": -8.516180992126465, "rewards/margins": 15.49438762664795, "rewards/rejected": -24.010568618774414, "step": 472 }, { "epoch": 0.7053122087604846, "grad_norm": 53.7095788706672, "learning_rate": 2.1863156237182724e-07, "logits/chosen": 0.23709386587142944, "logits/rejected": 0.6802022457122803, "logps/chosen": -1.0881783962249756, "logps/rejected": -2.5452468395233154, "loss": 2.0664, "nll_loss": 1.0881783962249756, "rewards/accuracies": 1.0, "rewards/chosen": -10.881784439086914, "rewards/margins": 14.570684432983398, "rewards/rejected": -25.452468872070312, "step": 473 }, { "epoch": 0.7068033550792171, "grad_norm": 107.01228096480615, "learning_rate": 2.1659337295323114e-07, "logits/chosen": 2.4457285404205322, "logits/rejected": 2.192387104034424, "logps/chosen": -1.1308457851409912, "logps/rejected": -2.3631060123443604, "loss": 1.9536, "nll_loss": 1.1308457851409912, "rewards/accuracies": 0.625, "rewards/chosen": -11.30845832824707, "rewards/margins": 12.322601318359375, "rewards/rejected": -23.631059646606445, "step": 474 }, { "epoch": 0.7082945013979497, "grad_norm": 49.156409055564126, "learning_rate": 2.1456209857361246e-07, "logits/chosen": 0.9657554626464844, "logits/rejected": 1.1160908937454224, "logps/chosen": -1.2342641353607178, "logps/rejected": -2.2354118824005127, "loss": 2.0943, "nll_loss": 1.2342641353607178, "rewards/accuracies": 0.875, "rewards/chosen": -12.342641830444336, "rewards/margins": 10.011476516723633, "rewards/rejected": -22.354116439819336, "step": 475 }, { "epoch": 0.7097856477166822, "grad_norm": 50.310764029228174, "learning_rate": 2.1253778879547317e-07, "logits/chosen": 0.5516436696052551, "logits/rejected": 1.0273289680480957, "logps/chosen": -1.400739073753357, "logps/rejected": -5.508915901184082, "loss": 1.5569, "nll_loss": 1.400739073753357, "rewards/accuracies": 0.875, "rewards/chosen": -14.007391929626465, "rewards/margins": 41.08176803588867, "rewards/rejected": -55.08916091918945, "step": 476 }, { "epoch": 0.7112767940354148, "grad_norm": 60.52879609068727, "learning_rate": 2.1052049301138092e-07, "logits/chosen": 0.475372850894928, "logits/rejected": 0.21769298613071442, "logps/chosen": -1.2100934982299805, "logps/rejected": -2.600706100463867, "loss": 1.5172, "nll_loss": 1.2100934982299805, "rewards/accuracies": 1.0, "rewards/chosen": -12.100934982299805, "rewards/margins": 13.906126976013184, "rewards/rejected": -26.007061004638672, "step": 477 }, { "epoch": 0.7127679403541473, "grad_norm": 77.34557138399435, "learning_rate": 2.0851026044276405e-07, "logits/chosen": 0.7670489549636841, "logits/rejected": 0.7285082340240479, "logps/chosen": -1.1550674438476562, "logps/rejected": -2.071988344192505, "loss": 1.6234, "nll_loss": 1.1550673246383667, "rewards/accuracies": 0.75, "rewards/chosen": -11.550674438476562, "rewards/margins": 9.169209480285645, "rewards/rejected": -20.71988296508789, "step": 478 }, { "epoch": 0.7142590866728797, "grad_norm": 36.927537934249365, "learning_rate": 2.0650714013871045e-07, "logits/chosen": 1.0101845264434814, "logits/rejected": 1.308388113975525, "logps/chosen": -1.1056818962097168, "logps/rejected": -2.300252676010132, "loss": 1.7636, "nll_loss": 1.1056818962097168, "rewards/accuracies": 1.0, "rewards/chosen": -11.056818008422852, "rewards/margins": 11.945709228515625, "rewards/rejected": -23.00252914428711, "step": 479 }, { "epoch": 0.7157502329916123, "grad_norm": 62.772752564819186, "learning_rate": 2.0451118097477093e-07, "logits/chosen": 1.5709446668624878, "logits/rejected": 1.7383079528808594, "logps/chosen": -1.1260969638824463, "logps/rejected": -2.729841470718384, "loss": 1.903, "nll_loss": 1.1260970830917358, "rewards/accuracies": 1.0, "rewards/chosen": -11.260971069335938, "rewards/margins": 16.037445068359375, "rewards/rejected": -27.298416137695312, "step": 480 }, { "epoch": 0.7172413793103448, "grad_norm": 32.97119318254156, "learning_rate": 2.025224316517663e-07, "logits/chosen": 0.7222650051116943, "logits/rejected": 1.2684485912322998, "logps/chosen": -1.0870883464813232, "logps/rejected": -2.2934865951538086, "loss": 1.6303, "nll_loss": 1.0870884656906128, "rewards/accuracies": 0.875, "rewards/chosen": -10.87088394165039, "rewards/margins": 12.063982009887695, "rewards/rejected": -22.934865951538086, "step": 481 }, { "epoch": 0.7187325256290773, "grad_norm": 93.56185034910865, "learning_rate": 2.005409406946e-07, "logits/chosen": 0.7665479779243469, "logits/rejected": 0.9628517627716064, "logps/chosen": -1.3418065309524536, "logps/rejected": -2.322467803955078, "loss": 2.4619, "nll_loss": 1.3418065309524536, "rewards/accuracies": 0.875, "rewards/chosen": -13.41806411743164, "rewards/margins": 9.80661392211914, "rewards/rejected": -23.224679946899414, "step": 482 }, { "epoch": 0.7202236719478099, "grad_norm": 83.63211918908515, "learning_rate": 1.985667564510724e-07, "logits/chosen": 1.6486438512802124, "logits/rejected": 1.6210968494415283, "logps/chosen": -1.4717875719070435, "logps/rejected": -1.9454703330993652, "loss": 2.1969, "nll_loss": 1.471787691116333, "rewards/accuracies": 0.625, "rewards/chosen": -14.717876434326172, "rewards/margins": 4.736826419830322, "rewards/rejected": -19.45470428466797, "step": 483 }, { "epoch": 0.7217148182665424, "grad_norm": 44.77487482055843, "learning_rate": 1.9659992709070344e-07, "logits/chosen": 1.3622450828552246, "logits/rejected": 1.1589062213897705, "logps/chosen": -0.8679985404014587, "logps/rejected": -2.2884294986724854, "loss": 2.0511, "nll_loss": 0.8679985404014587, "rewards/accuracies": 0.875, "rewards/chosen": -8.679985046386719, "rewards/margins": 14.204309463500977, "rewards/rejected": -22.884296417236328, "step": 484 }, { "epoch": 0.723205964585275, "grad_norm": 59.93427920155752, "learning_rate": 1.946405006035548e-07, "logits/chosen": 0.12034586071968079, "logits/rejected": 0.5588881969451904, "logps/chosen": -1.1300389766693115, "logps/rejected": -3.5124731063842773, "loss": 1.6296, "nll_loss": 1.130039095878601, "rewards/accuracies": 0.875, "rewards/chosen": -11.300390243530273, "rewards/margins": 23.8243408203125, "rewards/rejected": -35.124732971191406, "step": 485 }, { "epoch": 0.7246971109040075, "grad_norm": 46.34699445302191, "learning_rate": 1.9268852479906145e-07, "logits/chosen": 2.0305707454681396, "logits/rejected": 2.339308261871338, "logps/chosen": -1.8795857429504395, "logps/rejected": -2.736845016479492, "loss": 2.0155, "nll_loss": 1.8795857429504395, "rewards/accuracies": 0.75, "rewards/chosen": -18.795856475830078, "rewards/margins": 8.572591781616211, "rewards/rejected": -27.368450164794922, "step": 486 }, { "epoch": 0.7261882572227399, "grad_norm": 51.974864556365205, "learning_rate": 1.907440473048626e-07, "logits/chosen": 0.6879156827926636, "logits/rejected": 0.8359827995300293, "logps/chosen": -1.199193000793457, "logps/rejected": -1.9355554580688477, "loss": 1.8027, "nll_loss": 1.199193000793457, "rewards/accuracies": 0.875, "rewards/chosen": -11.99193000793457, "rewards/margins": 7.363624572753906, "rewards/rejected": -19.355554580688477, "step": 487 }, { "epoch": 0.7276794035414725, "grad_norm": 44.0225250357595, "learning_rate": 1.8880711556564212e-07, "logits/chosen": 0.9353025555610657, "logits/rejected": 0.5232765078544617, "logps/chosen": -1.4775171279907227, "logps/rejected": -2.885024070739746, "loss": 1.8494, "nll_loss": 1.4775171279907227, "rewards/accuracies": 0.875, "rewards/chosen": -14.775171279907227, "rewards/margins": 14.075069427490234, "rewards/rejected": -28.85024070739746, "step": 488 }, { "epoch": 0.729170549860205, "grad_norm": 50.69197433851532, "learning_rate": 1.8687777684196882e-07, "logits/chosen": 1.100256323814392, "logits/rejected": 1.2217504978179932, "logps/chosen": -1.3356492519378662, "logps/rejected": -2.1796789169311523, "loss": 2.6281, "nll_loss": 1.3356491327285767, "rewards/accuracies": 0.875, "rewards/chosen": -13.356492042541504, "rewards/margins": 8.440298080444336, "rewards/rejected": -21.796791076660156, "step": 489 }, { "epoch": 0.7306616961789375, "grad_norm": 78.26649672369982, "learning_rate": 1.849560782091445e-07, "logits/chosen": 0.6997460126876831, "logits/rejected": 0.25134438276290894, "logps/chosen": -1.0516213178634644, "logps/rejected": -1.7665810585021973, "loss": 1.7613, "nll_loss": 1.0516211986541748, "rewards/accuracies": 0.875, "rewards/chosen": -10.516212463378906, "rewards/margins": 7.149598598480225, "rewards/rejected": -17.66581153869629, "step": 490 }, { "epoch": 0.7321528424976701, "grad_norm": 50.189099136000976, "learning_rate": 1.8304206655605474e-07, "logits/chosen": 0.45600277185440063, "logits/rejected": 1.139525055885315, "logps/chosen": -1.9147918224334717, "logps/rejected": -2.3654191493988037, "loss": 2.2826, "nll_loss": 1.9147917032241821, "rewards/accuracies": 0.5, "rewards/chosen": -19.147918701171875, "rewards/margins": 4.5062737464904785, "rewards/rejected": -23.654191970825195, "step": 491 }, { "epoch": 0.7336439888164026, "grad_norm": 41.38891098119506, "learning_rate": 1.811357885840254e-07, "logits/chosen": 1.0754787921905518, "logits/rejected": 0.987869381904602, "logps/chosen": -1.3695212602615356, "logps/rejected": -2.920154571533203, "loss": 1.5894, "nll_loss": 1.369521141052246, "rewards/accuracies": 1.0, "rewards/chosen": -13.695213317871094, "rewards/margins": 15.506331443786621, "rewards/rejected": -29.20154571533203, "step": 492 }, { "epoch": 0.7351351351351352, "grad_norm": 81.00793367937105, "learning_rate": 1.7923729080568239e-07, "logits/chosen": 1.4710057973861694, "logits/rejected": 1.6370110511779785, "logps/chosen": -1.0914305448532104, "logps/rejected": -1.9285211563110352, "loss": 1.7259, "nll_loss": 1.091430425643921, "rewards/accuracies": 0.875, "rewards/chosen": -10.914305686950684, "rewards/margins": 8.370904922485352, "rewards/rejected": -19.28521156311035, "step": 493 }, { "epoch": 0.7366262814538677, "grad_norm": 37.78420798456383, "learning_rate": 1.7734661954381752e-07, "logits/chosen": 0.871479332447052, "logits/rejected": 0.8258588314056396, "logps/chosen": -1.3462225198745728, "logps/rejected": -2.5515835285186768, "loss": 1.8386, "nll_loss": 1.3462225198745728, "rewards/accuracies": 0.75, "rewards/chosen": -13.462225914001465, "rewards/margins": 12.053609848022461, "rewards/rejected": -25.51583480834961, "step": 494 }, { "epoch": 0.7381174277726001, "grad_norm": 131.99066573478086, "learning_rate": 1.7546382093025758e-07, "logits/chosen": -0.19732213020324707, "logits/rejected": -0.07462179660797119, "logps/chosen": -1.0856541395187378, "logps/rejected": -2.3993844985961914, "loss": 1.978, "nll_loss": 1.0856541395187378, "rewards/accuracies": 1.0, "rewards/chosen": -10.85654067993164, "rewards/margins": 13.137301445007324, "rewards/rejected": -23.99384307861328, "step": 495 }, { "epoch": 0.7396085740913327, "grad_norm": 75.4429379923374, "learning_rate": 1.7358894090473924e-07, "logits/chosen": 1.1260228157043457, "logits/rejected": 1.5973632335662842, "logps/chosen": -1.3908843994140625, "logps/rejected": -2.6310038566589355, "loss": 3.0098, "nll_loss": 1.3908843994140625, "rewards/accuracies": 1.0, "rewards/chosen": -13.908843994140625, "rewards/margins": 12.40119457244873, "rewards/rejected": -26.310039520263672, "step": 496 }, { "epoch": 0.7410997204100652, "grad_norm": 57.59245498721595, "learning_rate": 1.7172202521378793e-07, "logits/chosen": 1.5798145532608032, "logits/rejected": 1.379747986793518, "logps/chosen": -1.0907506942749023, "logps/rejected": -2.8836097717285156, "loss": 1.5375, "nll_loss": 1.0907506942749023, "rewards/accuracies": 0.875, "rewards/chosen": -10.907506942749023, "rewards/margins": 17.9285888671875, "rewards/rejected": -28.836095809936523, "step": 497 }, { "epoch": 0.7425908667287978, "grad_norm": 56.25519043444952, "learning_rate": 1.6986311940960147e-07, "logits/chosen": 1.462363362312317, "logits/rejected": 1.563219666481018, "logps/chosen": -1.2027587890625, "logps/rejected": -1.8627557754516602, "loss": 2.5616, "nll_loss": 1.2027586698532104, "rewards/accuracies": 0.75, "rewards/chosen": -12.027586936950684, "rewards/margins": 6.599970817565918, "rewards/rejected": -18.6275577545166, "step": 498 }, { "epoch": 0.7440820130475303, "grad_norm": 58.6146470332597, "learning_rate": 1.6801226884893893e-07, "logits/chosen": 0.9026723504066467, "logits/rejected": 1.0392895936965942, "logps/chosen": -1.2807862758636475, "logps/rejected": -2.035679578781128, "loss": 2.2686, "nll_loss": 1.280786395072937, "rewards/accuracies": 0.625, "rewards/chosen": -12.807863235473633, "rewards/margins": 7.548933982849121, "rewards/rejected": -20.35679817199707, "step": 499 }, { "epoch": 0.7455731593662628, "grad_norm": 127.17666337209936, "learning_rate": 1.6616951869201378e-07, "logits/chosen": 1.4067152738571167, "logits/rejected": 1.5482068061828613, "logps/chosen": -1.6144959926605225, "logps/rejected": -2.988914966583252, "loss": 2.2455, "nll_loss": 1.6144959926605225, "rewards/accuracies": 0.625, "rewards/chosen": -16.144960403442383, "rewards/margins": 13.744190216064453, "rewards/rejected": -29.88915252685547, "step": 500 }, { "epoch": 0.7470643056849954, "grad_norm": 22.65024245941002, "learning_rate": 1.6433491390139176e-07, "logits/chosen": 1.386856198310852, "logits/rejected": 0.9873509407043457, "logps/chosen": -0.8607431650161743, "logps/rejected": -1.7035075426101685, "loss": 0.846, "nll_loss": 0.8607431054115295, "rewards/accuracies": 0.875, "rewards/chosen": -8.607431411743164, "rewards/margins": 8.427644729614258, "rewards/rejected": -17.035076141357422, "step": 501 }, { "epoch": 0.7485554520037279, "grad_norm": 97.32340827582183, "learning_rate": 1.6250849924089482e-07, "logits/chosen": 0.7162159085273743, "logits/rejected": 1.4583920240402222, "logps/chosen": -1.1494529247283936, "logps/rejected": -1.890822410583496, "loss": 2.3423, "nll_loss": 1.1494529247283936, "rewards/accuracies": 1.0, "rewards/chosen": -11.49453067779541, "rewards/margins": 7.413692951202393, "rewards/rejected": -18.90822410583496, "step": 502 }, { "epoch": 0.7500465983224603, "grad_norm": 66.08646716231478, "learning_rate": 1.6069031927450692e-07, "logits/chosen": 2.2476208209991455, "logits/rejected": 2.9070322513580322, "logps/chosen": -1.1560183763504028, "logps/rejected": -3.0961015224456787, "loss": 2.566, "nll_loss": 1.1560183763504028, "rewards/accuracies": 1.0, "rewards/chosen": -11.56018352508545, "rewards/margins": 19.40083122253418, "rewards/rejected": -30.961013793945312, "step": 503 }, { "epoch": 0.7515377446411929, "grad_norm": 37.34765381376621, "learning_rate": 1.5888041836528914e-07, "logits/chosen": 0.9423756003379822, "logits/rejected": 0.8755822777748108, "logps/chosen": -1.1919398307800293, "logps/rejected": -2.717829942703247, "loss": 2.0692, "nll_loss": 1.1919398307800293, "rewards/accuracies": 0.875, "rewards/chosen": -11.919397354125977, "rewards/margins": 15.25890064239502, "rewards/rejected": -27.178298950195312, "step": 504 }, { "epoch": 0.7530288909599254, "grad_norm": 43.39433506785265, "learning_rate": 1.5707884067429471e-07, "logits/chosen": 0.8235660195350647, "logits/rejected": 0.9912365078926086, "logps/chosen": -1.7123976945877075, "logps/rejected": -3.390589475631714, "loss": 1.8105, "nll_loss": 1.7123976945877075, "rewards/accuracies": 0.75, "rewards/chosen": -17.123977661132812, "rewards/margins": 16.781919479370117, "rewards/rejected": -33.9058952331543, "step": 505 }, { "epoch": 0.754520037278658, "grad_norm": 47.056197370329635, "learning_rate": 1.552856301594942e-07, "logits/chosen": -0.24026452004909515, "logits/rejected": 0.23408591747283936, "logps/chosen": -1.2406717538833618, "logps/rejected": -3.218183994293213, "loss": 2.5921, "nll_loss": 1.2406718730926514, "rewards/accuracies": 0.875, "rewards/chosen": -12.406718254089355, "rewards/margins": 19.775121688842773, "rewards/rejected": -32.18183898925781, "step": 506 }, { "epoch": 0.7560111835973905, "grad_norm": 121.73397969203478, "learning_rate": 1.5350083057469998e-07, "logits/chosen": 1.063665747642517, "logits/rejected": 1.3006550073623657, "logps/chosen": -0.9579554200172424, "logps/rejected": -3.6276798248291016, "loss": 2.4975, "nll_loss": 0.9579554200172424, "rewards/accuracies": 0.875, "rewards/chosen": -9.579554557800293, "rewards/margins": 26.697248458862305, "rewards/rejected": -36.27680206298828, "step": 507 }, { "epoch": 0.757502329916123, "grad_norm": 74.82208165860814, "learning_rate": 1.5172448546850163e-07, "logits/chosen": 1.4153053760528564, "logits/rejected": 1.931809902191162, "logps/chosen": -1.1757323741912842, "logps/rejected": -3.0061824321746826, "loss": 2.2088, "nll_loss": 1.1757322549819946, "rewards/accuracies": 0.875, "rewards/chosen": -11.75732421875, "rewards/margins": 18.30449867248535, "rewards/rejected": -30.061824798583984, "step": 508 }, { "epoch": 0.7589934762348556, "grad_norm": 107.8466687249151, "learning_rate": 1.4995663818320071e-07, "logits/chosen": 1.366182804107666, "logits/rejected": 1.5222809314727783, "logps/chosen": -1.8494032621383667, "logps/rejected": -2.9303297996520996, "loss": 2.3564, "nll_loss": 1.8494032621383667, "rewards/accuracies": 0.875, "rewards/chosen": -18.494033813476562, "rewards/margins": 10.809263229370117, "rewards/rejected": -29.303298950195312, "step": 509 }, { "epoch": 0.7604846225535881, "grad_norm": 61.16767852788904, "learning_rate": 1.4819733185375531e-07, "logits/chosen": 1.347992181777954, "logits/rejected": 1.6785510778427124, "logps/chosen": -0.7743812799453735, "logps/rejected": -2.0755648612976074, "loss": 2.1959, "nll_loss": 0.7743812799453735, "rewards/accuracies": 0.875, "rewards/chosen": -7.743812561035156, "rewards/margins": 13.011836051940918, "rewards/rejected": -20.75564956665039, "step": 510 }, { "epoch": 0.7619757688723205, "grad_norm": 58.20432865668072, "learning_rate": 1.4644660940672627e-07, "logits/chosen": 0.6616173982620239, "logits/rejected": 0.8984595537185669, "logps/chosen": -1.6654181480407715, "logps/rejected": -3.81215238571167, "loss": 2.0691, "nll_loss": 1.665418267250061, "rewards/accuracies": 0.875, "rewards/chosen": -16.6541805267334, "rewards/margins": 21.467344284057617, "rewards/rejected": -38.12152862548828, "step": 511 }, { "epoch": 0.7634669151910531, "grad_norm": 52.55922504686432, "learning_rate": 1.4470451355923024e-07, "logits/chosen": 0.6327857971191406, "logits/rejected": 0.07023850828409195, "logps/chosen": -0.8465753793716431, "logps/rejected": -2.0434751510620117, "loss": 2.1946, "nll_loss": 0.8465753793716431, "rewards/accuracies": 1.0, "rewards/chosen": -8.465753555297852, "rewards/margins": 11.968996047973633, "rewards/rejected": -20.434751510620117, "step": 512 }, { "epoch": 0.7649580615097856, "grad_norm": 49.18096490201257, "learning_rate": 1.4297108681789749e-07, "logits/chosen": 1.6971843242645264, "logits/rejected": 1.4284838438034058, "logps/chosen": -1.6984788179397583, "logps/rejected": -1.7207996845245361, "loss": 2.1438, "nll_loss": 1.6984788179397583, "rewards/accuracies": 0.375, "rewards/chosen": -16.98478889465332, "rewards/margins": 0.22320926189422607, "rewards/rejected": -17.207996368408203, "step": 513 }, { "epoch": 0.7664492078285182, "grad_norm": 63.55350906873402, "learning_rate": 1.412463714778343e-07, "logits/chosen": 1.1489533185958862, "logits/rejected": 1.6662311553955078, "logps/chosen": -1.5277798175811768, "logps/rejected": -2.280330181121826, "loss": 1.2412, "nll_loss": 1.5277798175811768, "rewards/accuracies": 0.5, "rewards/chosen": -15.27779769897461, "rewards/margins": 7.525506019592285, "rewards/rejected": -22.803302764892578, "step": 514 }, { "epoch": 0.7679403541472507, "grad_norm": 57.944172276092736, "learning_rate": 1.3953040962159207e-07, "logits/chosen": 1.8766756057739258, "logits/rejected": 1.922022819519043, "logps/chosen": -1.529194712638855, "logps/rejected": -2.722909450531006, "loss": 2.0906, "nll_loss": 1.5291945934295654, "rewards/accuracies": 0.875, "rewards/chosen": -15.291946411132812, "rewards/margins": 11.937149047851562, "rewards/rejected": -27.229095458984375, "step": 515 }, { "epoch": 0.7694315004659832, "grad_norm": 51.39188096538205, "learning_rate": 1.3782324311813858e-07, "logits/chosen": 1.4132533073425293, "logits/rejected": 0.8430649042129517, "logps/chosen": -1.2825404405593872, "logps/rejected": -2.1167383193969727, "loss": 1.9975, "nll_loss": 1.2825404405593872, "rewards/accuracies": 0.625, "rewards/chosen": -12.82540512084961, "rewards/margins": 8.3419771194458, "rewards/rejected": -21.167383193969727, "step": 516 }, { "epoch": 0.7709226467847158, "grad_norm": 59.228716606988534, "learning_rate": 1.3612491362183887e-07, "logits/chosen": 0.7483983635902405, "logits/rejected": 0.7808694243431091, "logps/chosen": -1.6511446237564087, "logps/rejected": -2.752027750015259, "loss": 2.5048, "nll_loss": 1.6511446237564087, "rewards/accuracies": 0.75, "rewards/chosen": -16.511444091796875, "rewards/margins": 11.008831024169922, "rewards/rejected": -27.52027702331543, "step": 517 }, { "epoch": 0.7724137931034483, "grad_norm": 66.07522781310549, "learning_rate": 1.3443546257143623e-07, "logits/chosen": 0.7868104577064514, "logits/rejected": 0.8091244101524353, "logps/chosen": -0.9046810269355774, "logps/rejected": -5.082855224609375, "loss": 1.9796, "nll_loss": 0.9046810269355774, "rewards/accuracies": 1.0, "rewards/chosen": -9.046810150146484, "rewards/margins": 41.78173828125, "rewards/rejected": -50.82854461669922, "step": 518 }, { "epoch": 0.7739049394221807, "grad_norm": 63.93450684685768, "learning_rate": 1.3275493118904385e-07, "logits/chosen": 0.5779823660850525, "logits/rejected": 0.955024003982544, "logps/chosen": -1.513691782951355, "logps/rejected": -2.9146623611450195, "loss": 1.7256, "nll_loss": 1.513691782951355, "rewards/accuracies": 0.875, "rewards/chosen": -15.136917114257812, "rewards/margins": 14.009706497192383, "rewards/rejected": -29.146621704101562, "step": 519 }, { "epoch": 0.7753960857409133, "grad_norm": 91.77706933219835, "learning_rate": 1.3108336047913633e-07, "logits/chosen": 1.4290512800216675, "logits/rejected": 1.507631540298462, "logps/chosen": -1.4324660301208496, "logps/rejected": -3.8189008235931396, "loss": 2.546, "nll_loss": 1.4324660301208496, "rewards/accuracies": 1.0, "rewards/chosen": -14.324661254882812, "rewards/margins": 23.86434555053711, "rewards/rejected": -38.18900680541992, "step": 520 }, { "epoch": 0.7768872320596458, "grad_norm": 106.59145513895703, "learning_rate": 1.2942079122755163e-07, "logits/chosen": 0.4324963688850403, "logits/rejected": 0.6091005802154541, "logps/chosen": -1.304144263267517, "logps/rejected": -2.2639334201812744, "loss": 2.1878, "nll_loss": 1.3041443824768066, "rewards/accuracies": 0.75, "rewards/chosen": -13.041441917419434, "rewards/margins": 9.597891807556152, "rewards/rejected": -22.63933563232422, "step": 521 }, { "epoch": 0.7783783783783784, "grad_norm": 62.71439391098405, "learning_rate": 1.277672640004936e-07, "logits/chosen": 1.8813598155975342, "logits/rejected": 2.191922187805176, "logps/chosen": -0.9609942436218262, "logps/rejected": -2.417336940765381, "loss": 1.5024, "nll_loss": 0.9609941840171814, "rewards/accuracies": 0.875, "rewards/chosen": -9.609942436218262, "rewards/margins": 14.563425064086914, "rewards/rejected": -24.173368453979492, "step": 522 }, { "epoch": 0.7798695246971109, "grad_norm": 46.30568847741028, "learning_rate": 1.261228191435445e-07, "logits/chosen": 0.815243661403656, "logits/rejected": 1.5941507816314697, "logps/chosen": -1.1938700675964355, "logps/rejected": -4.132516860961914, "loss": 1.6403, "nll_loss": 1.1938700675964355, "rewards/accuracies": 1.0, "rewards/chosen": -11.938700675964355, "rewards/margins": 29.386470794677734, "rewards/rejected": -41.325172424316406, "step": 523 }, { "epoch": 0.7813606710158434, "grad_norm": 58.473201038518226, "learning_rate": 1.2448749678067855e-07, "logits/chosen": 0.14041255414485931, "logits/rejected": -0.08654403686523438, "logps/chosen": -0.8794372081756592, "logps/rejected": -2.187777519226074, "loss": 2.5804, "nll_loss": 0.8794372081756592, "rewards/accuracies": 1.0, "rewards/chosen": -8.79437255859375, "rewards/margins": 13.083402633666992, "rewards/rejected": -21.877775192260742, "step": 524 }, { "epoch": 0.782851817334576, "grad_norm": 48.892135342518635, "learning_rate": 1.228613368132842e-07, "logits/chosen": 0.5651916861534119, "logits/rejected": 0.992500901222229, "logps/chosen": -1.1246737241744995, "logps/rejected": -2.251676321029663, "loss": 2.0336, "nll_loss": 1.1246737241744995, "rewards/accuracies": 0.5, "rewards/chosen": -11.24673843383789, "rewards/margins": 11.270025253295898, "rewards/rejected": -22.51676368713379, "step": 525 }, { "epoch": 0.7843429636533085, "grad_norm": 58.22641776546797, "learning_rate": 1.2124437891918993e-07, "logits/chosen": 0.8821541666984558, "logits/rejected": 1.1546046733856201, "logps/chosen": -1.0839579105377197, "logps/rejected": -2.731253147125244, "loss": 2.2118, "nll_loss": 1.0839580297470093, "rewards/accuracies": 1.0, "rewards/chosen": -10.839579582214355, "rewards/margins": 16.47295379638672, "rewards/rejected": -27.312532424926758, "step": 526 }, { "epoch": 0.7858341099720411, "grad_norm": 136.31863333056407, "learning_rate": 1.1963666255169645e-07, "logits/chosen": 2.6750237941741943, "logits/rejected": 2.6834774017333984, "logps/chosen": -1.1975797414779663, "logps/rejected": -3.423466444015503, "loss": 2.3932, "nll_loss": 1.1975797414779663, "rewards/accuracies": 0.75, "rewards/chosen": -11.975796699523926, "rewards/margins": 22.258865356445312, "rewards/rejected": -34.23466110229492, "step": 527 }, { "epoch": 0.7873252562907735, "grad_norm": 68.2415641801262, "learning_rate": 1.1803822693861377e-07, "logits/chosen": 0.39841076731681824, "logits/rejected": 0.6484661102294922, "logps/chosen": -0.8886002898216248, "logps/rejected": -2.301114082336426, "loss": 1.5966, "nll_loss": 0.8886002898216248, "rewards/accuracies": 0.875, "rewards/chosen": -8.886002540588379, "rewards/margins": 14.125136375427246, "rewards/rejected": -23.011140823364258, "step": 528 }, { "epoch": 0.788816402609506, "grad_norm": 51.51857479336736, "learning_rate": 1.1644911108130434e-07, "logits/chosen": -0.4365030825138092, "logits/rejected": -0.8091474175453186, "logps/chosen": -1.275532603263855, "logps/rejected": -2.5970406532287598, "loss": 1.5767, "nll_loss": 1.275532603263855, "rewards/accuracies": 0.875, "rewards/chosen": -12.755326271057129, "rewards/margins": 13.215079307556152, "rewards/rejected": -25.97040557861328, "step": 529 }, { "epoch": 0.7903075489282386, "grad_norm": 80.27794159826863, "learning_rate": 1.1486935375373124e-07, "logits/chosen": 1.818937063217163, "logits/rejected": 2.0481789112091064, "logps/chosen": -1.1100715398788452, "logps/rejected": -1.631326675415039, "loss": 1.4015, "nll_loss": 1.1100716590881348, "rewards/accuracies": 0.875, "rewards/chosen": -11.100716590881348, "rewards/margins": 5.212551593780518, "rewards/rejected": -16.313268661499023, "step": 530 }, { "epoch": 0.7917986952469711, "grad_norm": 76.1622733301883, "learning_rate": 1.1329899350151212e-07, "logits/chosen": 0.8246088624000549, "logits/rejected": 1.6595807075500488, "logps/chosen": -1.6400606632232666, "logps/rejected": -3.3556313514709473, "loss": 2.8103, "nll_loss": 1.6400606632232666, "rewards/accuracies": 0.875, "rewards/chosen": -16.400606155395508, "rewards/margins": 17.15570831298828, "rewards/rejected": -33.55632019042969, "step": 531 }, { "epoch": 0.7932898415657036, "grad_norm": 69.79747788603979, "learning_rate": 1.1173806864097884e-07, "logits/chosen": 0.603188693523407, "logits/rejected": 1.2856109142303467, "logps/chosen": -1.375902533531189, "logps/rejected": -2.999077081680298, "loss": 2.2676, "nll_loss": 1.3759024143218994, "rewards/accuracies": 1.0, "rewards/chosen": -13.759024620056152, "rewards/margins": 16.23174476623535, "rewards/rejected": -29.99077033996582, "step": 532 }, { "epoch": 0.7947809878844362, "grad_norm": 107.54460434268915, "learning_rate": 1.101866172582423e-07, "logits/chosen": -0.07562939822673798, "logits/rejected": 0.8102388381958008, "logps/chosen": -1.3312407732009888, "logps/rejected": -3.5981345176696777, "loss": 2.4654, "nll_loss": 1.3312406539916992, "rewards/accuracies": 0.875, "rewards/chosen": -13.312407493591309, "rewards/margins": 22.66893768310547, "rewards/rejected": -35.981346130371094, "step": 533 }, { "epoch": 0.7962721342031687, "grad_norm": 37.98279471066673, "learning_rate": 1.0864467720826343e-07, "logits/chosen": 2.0333845615386963, "logits/rejected": 1.3615199327468872, "logps/chosen": -0.9712334275245667, "logps/rejected": -2.4473283290863037, "loss": 1.73, "nll_loss": 0.9712334275245667, "rewards/accuracies": 1.0, "rewards/chosen": -9.712334632873535, "rewards/margins": 14.76095199584961, "rewards/rejected": -24.473285675048828, "step": 534 }, { "epoch": 0.7977632805219013, "grad_norm": 70.00952769733418, "learning_rate": 1.0711228611392936e-07, "logits/chosen": 1.8075847625732422, "logits/rejected": 2.0040998458862305, "logps/chosen": -1.8060308694839478, "logps/rejected": -2.689011573791504, "loss": 2.6005, "nll_loss": 1.8060306310653687, "rewards/accuracies": 0.875, "rewards/chosen": -18.0603084564209, "rewards/margins": 8.82980728149414, "rewards/rejected": -26.890113830566406, "step": 535 }, { "epoch": 0.7992544268406337, "grad_norm": 33.600323761305795, "learning_rate": 1.0558948136513534e-07, "logits/chosen": 1.3355344533920288, "logits/rejected": 1.2286067008972168, "logps/chosen": -1.0286279916763306, "logps/rejected": -1.7476571798324585, "loss": 1.7296, "nll_loss": 1.0286281108856201, "rewards/accuracies": 0.625, "rewards/chosen": -10.28628158569336, "rewards/margins": 7.190290451049805, "rewards/rejected": -17.476572036743164, "step": 536 }, { "epoch": 0.8007455731593662, "grad_norm": 39.83559136796347, "learning_rate": 1.0407630011787328e-07, "logits/chosen": 1.4175457954406738, "logits/rejected": 1.360532283782959, "logps/chosen": -1.1332502365112305, "logps/rejected": -4.271042346954346, "loss": 1.7862, "nll_loss": 1.1332502365112305, "rewards/accuracies": 1.0, "rewards/chosen": -11.332502365112305, "rewards/margins": 31.37792205810547, "rewards/rejected": -42.71042251586914, "step": 537 }, { "epoch": 0.8022367194780988, "grad_norm": 49.229915814260536, "learning_rate": 1.0257277929332331e-07, "logits/chosen": 0.09370435774326324, "logits/rejected": 0.3487294912338257, "logps/chosen": -1.0228612422943115, "logps/rejected": -2.6441450119018555, "loss": 1.7603, "nll_loss": 1.0228612422943115, "rewards/accuracies": 0.75, "rewards/chosen": -10.22861099243164, "rewards/margins": 16.212839126586914, "rewards/rejected": -26.441448211669922, "step": 538 }, { "epoch": 0.8037278657968313, "grad_norm": 43.20291464295552, "learning_rate": 1.0107895557695523e-07, "logits/chosen": 1.0065107345581055, "logits/rejected": 0.9046751856803894, "logps/chosen": -1.1206310987472534, "logps/rejected": -2.8322832584381104, "loss": 2.025, "nll_loss": 1.1206310987472534, "rewards/accuracies": 1.0, "rewards/chosen": -11.20631217956543, "rewards/margins": 17.116519927978516, "rewards/rejected": -28.322832107543945, "step": 539 }, { "epoch": 0.8052190121155638, "grad_norm": 87.85547545961866, "learning_rate": 9.959486541763118e-08, "logits/chosen": 0.5322209596633911, "logits/rejected": 1.0989861488342285, "logps/chosen": -1.4311912059783936, "logps/rejected": -8.2313871383667, "loss": 1.784, "nll_loss": 1.4311912059783936, "rewards/accuracies": 0.875, "rewards/chosen": -14.311910629272461, "rewards/margins": 68.00196838378906, "rewards/rejected": -82.3138656616211, "step": 540 }, { "epoch": 0.8067101584342964, "grad_norm": 35.18616471342836, "learning_rate": 9.812054502671834e-08, "logits/chosen": 1.0977833271026611, "logits/rejected": 1.6604937314987183, "logps/chosen": -1.293630838394165, "logps/rejected": -2.6169798374176025, "loss": 2.5025, "nll_loss": 1.2936309576034546, "rewards/accuracies": 0.75, "rewards/chosen": -12.936308860778809, "rewards/margins": 13.233489990234375, "rewards/rejected": -26.1697998046875, "step": 541 }, { "epoch": 0.8082013047530289, "grad_norm": 100.07499413913182, "learning_rate": 9.66560303772035e-08, "logits/chosen": 0.806122899055481, "logits/rejected": 1.437886118888855, "logps/chosen": -1.526921272277832, "logps/rejected": -1.9441941976547241, "loss": 2.3626, "nll_loss": 1.526921272277832, "rewards/accuracies": 0.5, "rewards/chosen": -15.269213676452637, "rewards/margins": 4.172728538513184, "rewards/rejected": -19.44194221496582, "step": 542 }, { "epoch": 0.8096924510717615, "grad_norm": 85.39766336575707, "learning_rate": 9.520135720281691e-08, "logits/chosen": 1.511439323425293, "logits/rejected": 1.9532711505889893, "logps/chosen": -1.13179612159729, "logps/rejected": -2.3110647201538086, "loss": 2.6602, "nll_loss": 1.1317960023880005, "rewards/accuracies": 1.0, "rewards/chosen": -11.317960739135742, "rewards/margins": 11.79268741607666, "rewards/rejected": -23.110647201538086, "step": 543 }, { "epoch": 0.811183597390494, "grad_norm": 103.46761333299857, "learning_rate": 9.375656099715934e-08, "logits/chosen": 1.2593380212783813, "logits/rejected": 1.5915199518203735, "logps/chosen": -1.6882553100585938, "logps/rejected": -2.8256163597106934, "loss": 1.8447, "nll_loss": 1.6882551908493042, "rewards/accuracies": 0.75, "rewards/chosen": -16.882553100585938, "rewards/margins": 11.373611450195312, "rewards/rejected": -28.256162643432617, "step": 544 }, { "epoch": 0.8126747437092264, "grad_norm": 55.77866402643874, "learning_rate": 9.23216770128365e-08, "logits/chosen": 1.0406886339187622, "logits/rejected": 0.8361440300941467, "logps/chosen": -1.4198107719421387, "logps/rejected": -1.9521433115005493, "loss": 2.1216, "nll_loss": 1.4198107719421387, "rewards/accuracies": 0.875, "rewards/chosen": -14.19810676574707, "rewards/margins": 5.323326587677002, "rewards/rejected": -19.521432876586914, "step": 545 }, { "epoch": 0.814165890027959, "grad_norm": 63.57895472466941, "learning_rate": 9.08967402605988e-08, "logits/chosen": 0.09123604744672775, "logits/rejected": -0.03876099735498428, "logps/chosen": -0.7894692420959473, "logps/rejected": -1.5298676490783691, "loss": 2.2162, "nll_loss": 0.7894692420959473, "rewards/accuracies": 0.875, "rewards/chosen": -7.894692420959473, "rewards/margins": 7.403984069824219, "rewards/rejected": -15.298676490783691, "step": 546 }, { "epoch": 0.8156570363466915, "grad_norm": 54.349500942594744, "learning_rate": 8.9481785508487e-08, "logits/chosen": 1.6667102575302124, "logits/rejected": 2.203434944152832, "logps/chosen": -1.2966077327728271, "logps/rejected": -3.049083948135376, "loss": 2.4018, "nll_loss": 1.2966077327728271, "rewards/accuracies": 0.875, "rewards/chosen": -12.966076850891113, "rewards/margins": 17.52476692199707, "rewards/rejected": -30.490842819213867, "step": 547 }, { "epoch": 0.817148182665424, "grad_norm": 54.9356955967697, "learning_rate": 8.807684728098419e-08, "logits/chosen": 1.4617446660995483, "logits/rejected": 2.135504961013794, "logps/chosen": -1.8497101068496704, "logps/rejected": -3.3187100887298584, "loss": 2.004, "nll_loss": 1.8497099876403809, "rewards/accuracies": 0.875, "rewards/chosen": -18.497100830078125, "rewards/margins": 14.689998626708984, "rewards/rejected": -33.187103271484375, "step": 548 }, { "epoch": 0.8186393289841566, "grad_norm": 86.27572672988894, "learning_rate": 8.668195985817289e-08, "logits/chosen": 1.0450414419174194, "logits/rejected": 1.7045292854309082, "logps/chosen": -1.3397154808044434, "logps/rejected": -2.9649925231933594, "loss": 2.3038, "nll_loss": 1.3397154808044434, "rewards/accuracies": 1.0, "rewards/chosen": -13.397154808044434, "rewards/margins": 16.252769470214844, "rewards/rejected": -29.649925231933594, "step": 549 }, { "epoch": 0.8201304753028891, "grad_norm": 56.54653825260291, "learning_rate": 8.529715727489912e-08, "logits/chosen": 0.3464043438434601, "logits/rejected": 0.28210508823394775, "logps/chosen": -1.1573177576065063, "logps/rejected": -2.432623863220215, "loss": 1.177, "nll_loss": 1.1573176383972168, "rewards/accuracies": 0.875, "rewards/chosen": -11.5731782913208, "rewards/margins": 12.753060340881348, "rewards/rejected": -24.32623863220215, "step": 550 }, { "epoch": 0.8216216216216217, "grad_norm": 62.44280686277024, "learning_rate": 8.392247331994173e-08, "logits/chosen": 1.1887741088867188, "logits/rejected": 1.7529797554016113, "logps/chosen": -0.8768868446350098, "logps/rejected": -2.667144775390625, "loss": 2.5923, "nll_loss": 0.8768867254257202, "rewards/accuracies": 0.625, "rewards/chosen": -8.768867492675781, "rewards/margins": 17.90258026123047, "rewards/rejected": -26.671445846557617, "step": 551 }, { "epoch": 0.8231127679403542, "grad_norm": 45.94550332790867, "learning_rate": 8.255794153518798e-08, "logits/chosen": 0.9571793079376221, "logits/rejected": 1.1607004404067993, "logps/chosen": -1.3052154779434204, "logps/rejected": -3.089205265045166, "loss": 2.5004, "nll_loss": 1.3052154779434204, "rewards/accuracies": 1.0, "rewards/chosen": -13.052156448364258, "rewards/margins": 17.83989715576172, "rewards/rejected": -30.892053604125977, "step": 552 }, { "epoch": 0.8246039142590866, "grad_norm": 124.92358818048629, "learning_rate": 8.120359521481501e-08, "logits/chosen": 0.836337149143219, "logits/rejected": 1.1571295261383057, "logps/chosen": -1.6450185775756836, "logps/rejected": -2.187589645385742, "loss": 2.6545, "nll_loss": 1.6450185775756836, "rewards/accuracies": 0.875, "rewards/chosen": -16.450185775756836, "rewards/margins": 5.425711154937744, "rewards/rejected": -21.875896453857422, "step": 553 }, { "epoch": 0.8260950605778192, "grad_norm": 44.49060840808658, "learning_rate": 7.985946740447791e-08, "logits/chosen": -0.2631034255027771, "logits/rejected": 0.145988330245018, "logps/chosen": -1.6813266277313232, "logps/rejected": -2.6556098461151123, "loss": 2.0102, "nll_loss": 1.6813266277313232, "rewards/accuracies": 0.625, "rewards/chosen": -16.81326675415039, "rewards/margins": 9.742830276489258, "rewards/rejected": -26.55609703063965, "step": 554 }, { "epoch": 0.8275862068965517, "grad_norm": 37.85218664393302, "learning_rate": 7.852559090050276e-08, "logits/chosen": 1.5902546644210815, "logits/rejected": 1.614829421043396, "logps/chosen": -1.2583115100860596, "logps/rejected": -2.5400302410125732, "loss": 1.4264, "nll_loss": 1.25831139087677, "rewards/accuracies": 0.875, "rewards/chosen": -12.583114624023438, "rewards/margins": 12.817188262939453, "rewards/rejected": -25.40030288696289, "step": 555 }, { "epoch": 0.8290773532152842, "grad_norm": 49.26407932935538, "learning_rate": 7.720199824908691e-08, "logits/chosen": 1.361399531364441, "logits/rejected": 1.409611463546753, "logps/chosen": -1.4241909980773926, "logps/rejected": -3.1376218795776367, "loss": 0.9169, "nll_loss": 1.424190878868103, "rewards/accuracies": 0.875, "rewards/chosen": -14.24190902709961, "rewards/margins": 17.13431167602539, "rewards/rejected": -31.376222610473633, "step": 556 }, { "epoch": 0.8305684995340168, "grad_norm": 31.238322178644815, "learning_rate": 7.588872174550498e-08, "logits/chosen": 0.7303067445755005, "logits/rejected": 1.2701952457427979, "logps/chosen": -1.468266248703003, "logps/rejected": -2.4732413291931152, "loss": 1.712, "nll_loss": 1.4682661294937134, "rewards/accuracies": 0.875, "rewards/chosen": -14.682662010192871, "rewards/margins": 10.049752235412598, "rewards/rejected": -24.73241424560547, "step": 557 }, { "epoch": 0.8320596458527493, "grad_norm": 38.209656730980115, "learning_rate": 7.458579343331995e-08, "logits/chosen": 1.2878832817077637, "logits/rejected": 1.2985243797302246, "logps/chosen": -1.9119257926940918, "logps/rejected": -3.6364402770996094, "loss": 1.634, "nll_loss": 1.911926031112671, "rewards/accuracies": 0.875, "rewards/chosen": -19.119258880615234, "rewards/margins": 17.24514389038086, "rewards/rejected": -36.364402770996094, "step": 558 }, { "epoch": 0.8335507921714819, "grad_norm": 66.1367570680353, "learning_rate": 7.329324510360269e-08, "logits/chosen": 0.8806648254394531, "logits/rejected": 1.2991594076156616, "logps/chosen": -1.3400239944458008, "logps/rejected": -2.963407516479492, "loss": 1.9285, "nll_loss": 1.3400241136550903, "rewards/accuracies": 1.0, "rewards/chosen": -13.400240898132324, "rewards/margins": 16.233837127685547, "rewards/rejected": -29.634078979492188, "step": 559 }, { "epoch": 0.8350419384902144, "grad_norm": 298.9857034157112, "learning_rate": 7.20111082941548e-08, "logits/chosen": 1.279545783996582, "logits/rejected": 1.127055048942566, "logps/chosen": -1.0575612783432007, "logps/rejected": -1.7720974683761597, "loss": 3.8, "nll_loss": 1.0575611591339111, "rewards/accuracies": 0.875, "rewards/chosen": -10.57561206817627, "rewards/margins": 7.145363807678223, "rewards/rejected": -17.720977783203125, "step": 560 }, { "epoch": 0.8365330848089468, "grad_norm": 41.34376803126447, "learning_rate": 7.073941428874064e-08, "logits/chosen": 1.465010643005371, "logits/rejected": 1.6831226348876953, "logps/chosen": -1.0001267194747925, "logps/rejected": -2.4494314193725586, "loss": 2.1194, "nll_loss": 1.0001269578933716, "rewards/accuracies": 0.875, "rewards/chosen": -10.00126838684082, "rewards/margins": 14.493046760559082, "rewards/rejected": -24.49431610107422, "step": 561 }, { "epoch": 0.8380242311276794, "grad_norm": 64.56056727469941, "learning_rate": 6.947819411632222e-08, "logits/chosen": 1.2089592218399048, "logits/rejected": 1.150889277458191, "logps/chosen": -1.1490910053253174, "logps/rejected": -2.86586594581604, "loss": 1.8347, "nll_loss": 1.1490910053253174, "rewards/accuracies": 0.875, "rewards/chosen": -11.490909576416016, "rewards/margins": 17.167747497558594, "rewards/rejected": -28.65865707397461, "step": 562 }, { "epoch": 0.8395153774464119, "grad_norm": 44.61651161600099, "learning_rate": 6.822747855030414e-08, "logits/chosen": 0.7507335543632507, "logits/rejected": 0.36784982681274414, "logps/chosen": -1.301298975944519, "logps/rejected": -2.050633430480957, "loss": 2.4966, "nll_loss": 1.3012988567352295, "rewards/accuracies": 0.875, "rewards/chosen": -13.012989044189453, "rewards/margins": 7.493346214294434, "rewards/rejected": -20.506336212158203, "step": 563 }, { "epoch": 0.8410065237651445, "grad_norm": 59.12945591715341, "learning_rate": 6.698729810778064e-08, "logits/chosen": 1.094009518623352, "logits/rejected": 1.172784686088562, "logps/chosen": -1.4129308462142944, "logps/rejected": -2.2906455993652344, "loss": 2.0096, "nll_loss": 1.412930965423584, "rewards/accuracies": 0.875, "rewards/chosen": -14.129308700561523, "rewards/margins": 8.777146339416504, "rewards/rejected": -22.906455993652344, "step": 564 }, { "epoch": 0.842497670083877, "grad_norm": 96.69133779250849, "learning_rate": 6.575768304879292e-08, "logits/chosen": 1.5224504470825195, "logits/rejected": 0.9588276147842407, "logps/chosen": -1.3630435466766357, "logps/rejected": -1.9835121631622314, "loss": 2.4146, "nll_loss": 1.3630435466766357, "rewards/accuracies": 0.75, "rewards/chosen": -13.630434036254883, "rewards/margins": 6.204684257507324, "rewards/rejected": -19.835119247436523, "step": 565 }, { "epoch": 0.8439888164026095, "grad_norm": 42.53739974720539, "learning_rate": 6.453866337558939e-08, "logits/chosen": 0.6651906371116638, "logits/rejected": 0.856502890586853, "logps/chosen": -1.211168646812439, "logps/rejected": -2.185823440551758, "loss": 2.0492, "nll_loss": 1.2111684083938599, "rewards/accuracies": 0.875, "rewards/chosen": -12.111686706542969, "rewards/margins": 9.746549606323242, "rewards/rejected": -21.858234405517578, "step": 566 }, { "epoch": 0.8454799627213421, "grad_norm": 54.55584240804619, "learning_rate": 6.333026883189424e-08, "logits/chosen": 1.4041742086410522, "logits/rejected": 1.2528077363967896, "logps/chosen": -1.6688252687454224, "logps/rejected": -3.5840983390808105, "loss": 1.9207, "nll_loss": 1.668825387954712, "rewards/accuracies": 1.0, "rewards/chosen": -16.688251495361328, "rewards/margins": 19.152727127075195, "rewards/rejected": -35.840980529785156, "step": 567 }, { "epoch": 0.8469711090400746, "grad_norm": 38.24669997132275, "learning_rate": 6.213252890218162e-08, "logits/chosen": 1.8699870109558105, "logits/rejected": 1.9853416681289673, "logps/chosen": -1.3241424560546875, "logps/rejected": -1.7929151058197021, "loss": 2.0296, "nll_loss": 1.324142575263977, "rewards/accuracies": 0.625, "rewards/chosen": -13.241425514221191, "rewards/margins": 4.687725067138672, "rewards/rejected": -17.929149627685547, "step": 568 }, { "epoch": 0.848462255358807, "grad_norm": 50.15694631335723, "learning_rate": 6.094547281095619e-08, "logits/chosen": 1.642357349395752, "logits/rejected": 1.6782177686691284, "logps/chosen": -1.40309476852417, "logps/rejected": -3.434246778488159, "loss": 1.5013, "nll_loss": 1.4030948877334595, "rewards/accuracies": 1.0, "rewards/chosen": -14.0309476852417, "rewards/margins": 20.311519622802734, "rewards/rejected": -34.342464447021484, "step": 569 }, { "epoch": 0.8499534016775396, "grad_norm": 71.49518736065667, "learning_rate": 5.976912952204016e-08, "logits/chosen": 1.4287097454071045, "logits/rejected": 1.656920313835144, "logps/chosen": -1.3490947484970093, "logps/rejected": -2.7284812927246094, "loss": 2.129, "nll_loss": 1.3490948677062988, "rewards/accuracies": 1.0, "rewards/chosen": -13.490946769714355, "rewards/margins": 13.793864250183105, "rewards/rejected": -27.284812927246094, "step": 570 }, { "epoch": 0.8514445479962721, "grad_norm": 32.25588380634021, "learning_rate": 5.8603527737866307e-08, "logits/chosen": 0.8476590514183044, "logits/rejected": 0.6341254711151123, "logps/chosen": -1.073239803314209, "logps/rejected": -3.493922710418701, "loss": 1.265, "nll_loss": 1.0732399225234985, "rewards/accuracies": 0.875, "rewards/chosen": -10.73239803314209, "rewards/margins": 24.206829071044922, "rewards/rejected": -34.93922805786133, "step": 571 }, { "epoch": 0.8529356943150047, "grad_norm": 53.60751501091694, "learning_rate": 5.7448695898778097e-08, "logits/chosen": 1.249593734741211, "logits/rejected": 1.2364052534103394, "logps/chosen": -1.1790369749069214, "logps/rejected": -6.050329208374023, "loss": 1.3328, "nll_loss": 1.1790369749069214, "rewards/accuracies": 0.875, "rewards/chosen": -11.790369987487793, "rewards/margins": 48.71292495727539, "rewards/rejected": -60.5032958984375, "step": 572 }, { "epoch": 0.8544268406337372, "grad_norm": 59.88563770735995, "learning_rate": 5.63046621823352e-08, "logits/chosen": 0.17915304005146027, "logits/rejected": 0.8957693576812744, "logps/chosen": -1.2315869331359863, "logps/rejected": -2.526779890060425, "loss": 1.785, "nll_loss": 1.2315869331359863, "rewards/accuracies": 0.625, "rewards/chosen": -12.315869331359863, "rewards/margins": 12.951930046081543, "rewards/rejected": -25.267799377441406, "step": 573 }, { "epoch": 0.8559179869524697, "grad_norm": 74.06734466100485, "learning_rate": 5.517145450262639e-08, "logits/chosen": 0.946685791015625, "logits/rejected": 0.8377312421798706, "logps/chosen": -0.8227486610412598, "logps/rejected": -4.0714111328125, "loss": 2.0619, "nll_loss": 0.8227487206459045, "rewards/accuracies": 0.875, "rewards/chosen": -8.227487564086914, "rewards/margins": 32.48662185668945, "rewards/rejected": -40.714107513427734, "step": 574 }, { "epoch": 0.8574091332712023, "grad_norm": 79.67704275085624, "learning_rate": 5.404910050958833e-08, "logits/chosen": 0.5169818997383118, "logits/rejected": 0.3395709991455078, "logps/chosen": -1.0307010412216187, "logps/rejected": -2.207871437072754, "loss": 2.3389, "nll_loss": 1.0307011604309082, "rewards/accuracies": 0.875, "rewards/chosen": -10.30700969696045, "rewards/margins": 11.771703720092773, "rewards/rejected": -22.07871437072754, "step": 575 }, { "epoch": 0.8589002795899348, "grad_norm": 55.25972369360697, "learning_rate": 5.29376275883307e-08, "logits/chosen": 1.3334778547286987, "logits/rejected": 1.3576440811157227, "logps/chosen": -1.3050498962402344, "logps/rejected": -2.214493751525879, "loss": 1.5946, "nll_loss": 1.305050015449524, "rewards/accuracies": 1.0, "rewards/chosen": -13.050498962402344, "rewards/margins": 9.094437599182129, "rewards/rejected": -22.144935607910156, "step": 576 }, { "epoch": 0.8603914259086672, "grad_norm": 66.70966380503094, "learning_rate": 5.183706285846873e-08, "logits/chosen": 1.4294792413711548, "logits/rejected": 1.2827816009521484, "logps/chosen": -1.2769086360931396, "logps/rejected": -2.087651491165161, "loss": 1.557, "nll_loss": 1.2769086360931396, "rewards/accuracies": 0.75, "rewards/chosen": -12.769086837768555, "rewards/margins": 8.107427597045898, "rewards/rejected": -20.876514434814453, "step": 577 }, { "epoch": 0.8618825722273998, "grad_norm": 98.46543978254562, "learning_rate": 5.0747433173460086e-08, "logits/chosen": 0.6869046688079834, "logits/rejected": 0.4922327697277069, "logps/chosen": -1.5533781051635742, "logps/rejected": -2.522840738296509, "loss": 2.2821, "nll_loss": 1.5533778667449951, "rewards/accuracies": 0.625, "rewards/chosen": -15.53377914428711, "rewards/margins": 9.694629669189453, "rewards/rejected": -25.228408813476562, "step": 578 }, { "epoch": 0.8633737185461323, "grad_norm": 81.29000068555673, "learning_rate": 4.966876511995149e-08, "logits/chosen": 1.6201094388961792, "logits/rejected": 2.0438897609710693, "logps/chosen": -1.243577480316162, "logps/rejected": -2.3286705017089844, "loss": 1.0166, "nll_loss": 1.243577480316162, "rewards/accuracies": 1.0, "rewards/chosen": -12.435773849487305, "rewards/margins": 10.850931167602539, "rewards/rejected": -23.286705017089844, "step": 579 }, { "epoch": 0.8648648648648649, "grad_norm": 40.303681390891704, "learning_rate": 4.860108501712823e-08, "logits/chosen": 0.5733177661895752, "logits/rejected": 0.7400199174880981, "logps/chosen": -1.0819733142852783, "logps/rejected": -2.0159566402435303, "loss": 1.8131, "nll_loss": 1.0819733142852783, "rewards/accuracies": 0.75, "rewards/chosen": -10.819732666015625, "rewards/margins": 9.339835166931152, "rewards/rejected": -20.159568786621094, "step": 580 }, { "epoch": 0.8663560111835974, "grad_norm": 47.867505185407104, "learning_rate": 4.754441891607347e-08, "logits/chosen": 0.6819798946380615, "logits/rejected": 1.1605446338653564, "logps/chosen": -1.33807373046875, "logps/rejected": -3.4481253623962402, "loss": 1.5911, "nll_loss": 1.338073492050171, "rewards/accuracies": 0.875, "rewards/chosen": -13.380736351013184, "rewards/margins": 21.100513458251953, "rewards/rejected": -34.48125076293945, "step": 581 }, { "epoch": 0.8678471575023299, "grad_norm": 39.98802770089898, "learning_rate": 4.649879259913136e-08, "logits/chosen": 0.2189992517232895, "logits/rejected": 0.5796717405319214, "logps/chosen": -1.145107388496399, "logps/rejected": -2.4502243995666504, "loss": 2.1808, "nll_loss": 1.145107388496399, "rewards/accuracies": 1.0, "rewards/chosen": -11.451074600219727, "rewards/margins": 13.051168441772461, "rewards/rejected": -24.502243041992188, "step": 582 }, { "epoch": 0.8693383038210625, "grad_norm": 61.294343400740814, "learning_rate": 4.54642315792792e-08, "logits/chosen": 0.9891291856765747, "logits/rejected": 0.7852681875228882, "logps/chosen": -1.4123867750167847, "logps/rejected": -2.1884164810180664, "loss": 2.0489, "nll_loss": 1.4123867750167847, "rewards/accuracies": 0.625, "rewards/chosen": -14.123867988586426, "rewards/margins": 7.7602972984313965, "rewards/rejected": -21.884164810180664, "step": 583 }, { "epoch": 0.870829450139795, "grad_norm": 131.9277214916375, "learning_rate": 4.4440761099503456e-08, "logits/chosen": 0.7502199411392212, "logits/rejected": 1.2810550928115845, "logps/chosen": -1.5025020837783813, "logps/rejected": -3.7406561374664307, "loss": 2.0273, "nll_loss": 1.5025020837783813, "rewards/accuracies": 0.875, "rewards/chosen": -15.025020599365234, "rewards/margins": 22.38153839111328, "rewards/rejected": -37.40655517578125, "step": 584 }, { "epoch": 0.8723205964585274, "grad_norm": 102.54915589860299, "learning_rate": 4.342840613218546e-08, "logits/chosen": 0.70381760597229, "logits/rejected": 0.3464395999908447, "logps/chosen": -1.022120714187622, "logps/rejected": -2.0207552909851074, "loss": 1.713, "nll_loss": 1.0221205949783325, "rewards/accuracies": 1.0, "rewards/chosen": -10.221206665039062, "rewards/margins": 9.986349105834961, "rewards/rejected": -20.207555770874023, "step": 585 }, { "epoch": 0.87381174277726, "grad_norm": 172.5828711123485, "learning_rate": 4.242719137849077e-08, "logits/chosen": 0.3403228521347046, "logits/rejected": 0.8271859884262085, "logps/chosen": -1.2993659973144531, "logps/rejected": -3.0384278297424316, "loss": 2.1327, "nll_loss": 1.2993658781051636, "rewards/accuracies": 1.0, "rewards/chosen": -12.993660926818848, "rewards/margins": 17.3906192779541, "rewards/rejected": -30.384279251098633, "step": 586 }, { "epoch": 0.8753028890959925, "grad_norm": 41.59906874697002, "learning_rate": 4.143714126776715e-08, "logits/chosen": 0.8385416865348816, "logits/rejected": 0.5051766633987427, "logps/chosen": -1.114525318145752, "logps/rejected": -2.0112345218658447, "loss": 2.0028, "nll_loss": 1.114525318145752, "rewards/accuracies": 0.875, "rewards/chosen": -11.145252227783203, "rewards/margins": 8.967092514038086, "rewards/rejected": -20.112346649169922, "step": 587 }, { "epoch": 0.8767940354147251, "grad_norm": 44.70675834424241, "learning_rate": 4.045827995694834e-08, "logits/chosen": 0.10920746624469757, "logits/rejected": 0.29233425855636597, "logps/chosen": -1.331228494644165, "logps/rejected": -2.7238054275512695, "loss": 1.9673, "nll_loss": 1.3312286138534546, "rewards/accuracies": 0.875, "rewards/chosen": -13.312284469604492, "rewards/margins": 13.92576789855957, "rewards/rejected": -27.238056182861328, "step": 588 }, { "epoch": 0.8782851817334576, "grad_norm": 70.80189184290701, "learning_rate": 3.9490631329964554e-08, "logits/chosen": 1.0415701866149902, "logits/rejected": 1.1610839366912842, "logps/chosen": -1.3345025777816772, "logps/rejected": -2.035282850265503, "loss": 2.3617, "nll_loss": 1.3345026969909668, "rewards/accuracies": 0.5, "rewards/chosen": -13.345026016235352, "rewards/margins": 7.0078020095825195, "rewards/rejected": -20.352828979492188, "step": 589 }, { "epoch": 0.8797763280521901, "grad_norm": 97.95046250851314, "learning_rate": 3.853421899715992e-08, "logits/chosen": 0.18456491827964783, "logits/rejected": 0.5277552008628845, "logps/chosen": -1.9759291410446167, "logps/rejected": -2.498504161834717, "loss": 3.3488, "nll_loss": 1.9759293794631958, "rewards/accuracies": 0.875, "rewards/chosen": -19.759288787841797, "rewards/margins": 5.2257513999938965, "rewards/rejected": -24.985044479370117, "step": 590 }, { "epoch": 0.8812674743709227, "grad_norm": 41.890934746252, "learning_rate": 3.758906629471614e-08, "logits/chosen": 0.46019110083580017, "logits/rejected": 0.36783766746520996, "logps/chosen": -0.7036344408988953, "logps/rejected": -2.221921920776367, "loss": 1.2762, "nll_loss": 0.70363450050354, "rewards/accuracies": 1.0, "rewards/chosen": -7.036343574523926, "rewards/margins": 15.182877540588379, "rewards/rejected": -22.219223022460938, "step": 591 }, { "epoch": 0.8827586206896552, "grad_norm": 123.64391132718144, "learning_rate": 3.665519628408331e-08, "logits/chosen": 0.2561766505241394, "logits/rejected": 0.9669838547706604, "logps/chosen": -0.9937634468078613, "logps/rejected": -3.312620162963867, "loss": 2.3651, "nll_loss": 0.9937633872032166, "rewards/accuracies": 1.0, "rewards/chosen": -9.937634468078613, "rewards/margins": 23.188566207885742, "rewards/rejected": -33.126197814941406, "step": 592 }, { "epoch": 0.8842497670083876, "grad_norm": 41.395426479146714, "learning_rate": 3.5732631751417054e-08, "logits/chosen": 1.6422443389892578, "logits/rejected": 1.427198052406311, "logps/chosen": -1.3927075862884521, "logps/rejected": -2.9072377681732178, "loss": 2.1218, "nll_loss": 1.3927075862884521, "rewards/accuracies": 1.0, "rewards/chosen": -13.92707633972168, "rewards/margins": 15.14530086517334, "rewards/rejected": -29.072378158569336, "step": 593 }, { "epoch": 0.8857409133271202, "grad_norm": 51.132575782347416, "learning_rate": 3.482139520702276e-08, "logits/chosen": 0.4521043300628662, "logits/rejected": 0.9026426076889038, "logps/chosen": -0.948829174041748, "logps/rejected": -2.1322081089019775, "loss": 2.4032, "nll_loss": 0.9488292336463928, "rewards/accuracies": 0.875, "rewards/chosen": -9.48829174041748, "rewards/margins": 11.833791732788086, "rewards/rejected": -21.32208251953125, "step": 594 }, { "epoch": 0.8872320596458527, "grad_norm": 50.644399265019686, "learning_rate": 3.39215088848061e-08, "logits/chosen": 0.24500666558742523, "logits/rejected": 0.5586891174316406, "logps/chosen": -1.1638610363006592, "logps/rejected": -2.017951726913452, "loss": 2.2166, "nll_loss": 1.1638609170913696, "rewards/accuracies": 0.875, "rewards/chosen": -11.638608932495117, "rewards/margins": 8.54090690612793, "rewards/rejected": -20.179515838623047, "step": 595 }, { "epoch": 0.8887232059645853, "grad_norm": 36.11216268799047, "learning_rate": 3.303299474173066e-08, "logits/chosen": 0.936069667339325, "logits/rejected": 1.1829633712768555, "logps/chosen": -1.165700912475586, "logps/rejected": -2.8161368370056152, "loss": 1.9274, "nll_loss": 1.1657007932662964, "rewards/accuracies": 1.0, "rewards/chosen": -11.657008171081543, "rewards/margins": 16.50436019897461, "rewards/rejected": -28.161367416381836, "step": 596 }, { "epoch": 0.8902143522833178, "grad_norm": 59.19934715323104, "learning_rate": 3.2155874457282185e-08, "logits/chosen": 0.3062177896499634, "logits/rejected": 0.34192609786987305, "logps/chosen": -0.7291023135185242, "logps/rejected": -3.108163356781006, "loss": 2.2441, "nll_loss": 0.7291023135185242, "rewards/accuracies": 1.0, "rewards/chosen": -7.291023254394531, "rewards/margins": 23.790611267089844, "rewards/rejected": -31.081634521484375, "step": 597 }, { "epoch": 0.8917054986020503, "grad_norm": 42.851544166935966, "learning_rate": 3.129016943293955e-08, "logits/chosen": 0.6586010456085205, "logits/rejected": 0.4904717206954956, "logps/chosen": -1.0113714933395386, "logps/rejected": -1.9497562646865845, "loss": 1.3841, "nll_loss": 1.011371374130249, "rewards/accuracies": 0.875, "rewards/chosen": -10.113714218139648, "rewards/margins": 9.383848190307617, "rewards/rejected": -19.497562408447266, "step": 598 }, { "epoch": 0.8931966449207829, "grad_norm": 56.49355049536372, "learning_rate": 3.043590079165281e-08, "logits/chosen": 0.10095011442899704, "logits/rejected": 0.2807294726371765, "logps/chosen": -1.176227331161499, "logps/rejected": -2.48679518699646, "loss": 1.7267, "nll_loss": 1.176227331161499, "rewards/accuracies": 0.875, "rewards/chosen": -11.762274742126465, "rewards/margins": 13.105676651000977, "rewards/rejected": -24.867952346801758, "step": 599 }, { "epoch": 0.8946877912395154, "grad_norm": 102.1023178737433, "learning_rate": 2.9593089377327242e-08, "logits/chosen": 1.2022994756698608, "logits/rejected": 1.4467451572418213, "logps/chosen": -0.6037580966949463, "logps/rejected": -2.4274349212646484, "loss": 2.5207, "nll_loss": 0.6037580966949463, "rewards/accuracies": 1.0, "rewards/chosen": -6.037580966949463, "rewards/margins": 18.236770629882812, "rewards/rejected": -24.274349212646484, "step": 600 }, { "epoch": 0.896178937558248, "grad_norm": 70.55892559060052, "learning_rate": 2.8761755754315663e-08, "logits/chosen": 1.6827621459960938, "logits/rejected": 1.490635871887207, "logps/chosen": -1.2804337739944458, "logps/rejected": -2.2912468910217285, "loss": 1.8508, "nll_loss": 1.2804336547851562, "rewards/accuracies": 1.0, "rewards/chosen": -12.804336547851562, "rewards/margins": 10.108132362365723, "rewards/rejected": -22.9124698638916, "step": 601 }, { "epoch": 0.8976700838769804, "grad_norm": 91.97458303860651, "learning_rate": 2.7941920206915436e-08, "logits/chosen": 1.8210875988006592, "logits/rejected": 2.193286418914795, "logps/chosen": -1.0207717418670654, "logps/rejected": -2.7276382446289062, "loss": 2.1616, "nll_loss": 1.0207717418670654, "rewards/accuracies": 1.0, "rewards/chosen": -10.207717895507812, "rewards/margins": 17.06866455078125, "rewards/rejected": -27.276384353637695, "step": 602 }, { "epoch": 0.8991612301957129, "grad_norm": 320.06858059071607, "learning_rate": 2.7133602738874995e-08, "logits/chosen": 0.7025165557861328, "logits/rejected": 0.778084397315979, "logps/chosen": -1.287645697593689, "logps/rejected": -4.223053932189941, "loss": 2.0365, "nll_loss": 1.2876458168029785, "rewards/accuracies": 1.0, "rewards/chosen": -12.876458168029785, "rewards/margins": 29.354087829589844, "rewards/rejected": -42.23053741455078, "step": 603 }, { "epoch": 0.9006523765144455, "grad_norm": 99.66976447018196, "learning_rate": 2.63368230729043e-08, "logits/chosen": 1.1805634498596191, "logits/rejected": 0.9838704466819763, "logps/chosen": -1.0174989700317383, "logps/rejected": -2.3494932651519775, "loss": 1.7424, "nll_loss": 1.0174988508224487, "rewards/accuracies": 0.875, "rewards/chosen": -10.174989700317383, "rewards/margins": 13.319944381713867, "rewards/rejected": -23.49493408203125, "step": 604 }, { "epoch": 0.902143522833178, "grad_norm": 74.51528491716375, "learning_rate": 2.5551600650194906e-08, "logits/chosen": 0.9548903703689575, "logits/rejected": 1.0899478197097778, "logps/chosen": -1.306660532951355, "logps/rejected": -2.837043046951294, "loss": 2.1283, "nll_loss": 1.306660532951355, "rewards/accuracies": 0.625, "rewards/chosen": -13.066605567932129, "rewards/margins": 15.303826332092285, "rewards/rejected": -28.37042999267578, "step": 605 }, { "epoch": 0.9036346691519105, "grad_norm": 51.993252521508545, "learning_rate": 2.4777954629944475e-08, "logits/chosen": 0.8407946825027466, "logits/rejected": 0.9599366784095764, "logps/chosen": -1.164607286453247, "logps/rejected": -2.8004937171936035, "loss": 1.8249, "nll_loss": 1.164607286453247, "rewards/accuracies": 1.0, "rewards/chosen": -11.646072387695312, "rewards/margins": 16.358863830566406, "rewards/rejected": -28.00493812561035, "step": 606 }, { "epoch": 0.9051258154706431, "grad_norm": 62.129272211552866, "learning_rate": 2.4015903888890242e-08, "logits/chosen": 1.026720643043518, "logits/rejected": 1.730891227722168, "logps/chosen": -1.6653600931167603, "logps/rejected": -3.1316123008728027, "loss": 1.8785, "nll_loss": 1.6653600931167603, "rewards/accuracies": 1.0, "rewards/chosen": -16.65359878540039, "rewards/margins": 14.662520408630371, "rewards/rejected": -31.316120147705078, "step": 607 }, { "epoch": 0.9066169617893756, "grad_norm": 64.74965431651496, "learning_rate": 2.3265467020847863e-08, "logits/chosen": 0.856157124042511, "logits/rejected": 1.233034372329712, "logps/chosen": -0.943716824054718, "logps/rejected": -1.8875681161880493, "loss": 1.3686, "nll_loss": 0.943716824054718, "rewards/accuracies": 0.875, "rewards/chosen": -9.437169075012207, "rewards/margins": 9.43851375579834, "rewards/rejected": -18.875682830810547, "step": 608 }, { "epoch": 0.9081081081081082, "grad_norm": 78.35000696177752, "learning_rate": 2.2526662336257828e-08, "logits/chosen": 1.7188822031021118, "logits/rejected": 2.068807601928711, "logps/chosen": -1.2436611652374268, "logps/rejected": -5.058825492858887, "loss": 2.7311, "nll_loss": 1.2436611652374268, "rewards/accuracies": 0.875, "rewards/chosen": -12.436612129211426, "rewards/margins": 38.15163803100586, "rewards/rejected": -50.58824920654297, "step": 609 }, { "epoch": 0.9095992544268406, "grad_norm": 51.12329553486554, "learning_rate": 2.1799507861738788e-08, "logits/chosen": 1.5251386165618896, "logits/rejected": 0.9365883469581604, "logps/chosen": -0.9077203273773193, "logps/rejected": -2.4619970321655273, "loss": 1.861, "nll_loss": 0.9077203273773193, "rewards/accuracies": 0.875, "rewards/chosen": -9.077203750610352, "rewards/margins": 15.542766571044922, "rewards/rejected": -24.61996841430664, "step": 610 }, { "epoch": 0.9110904007455731, "grad_norm": 61.222416516366145, "learning_rate": 2.1084021339647707e-08, "logits/chosen": 0.5966976881027222, "logits/rejected": 0.9337188601493835, "logps/chosen": -1.2190940380096436, "logps/rejected": -3.3994054794311523, "loss": 2.4781, "nll_loss": 1.219094157218933, "rewards/accuracies": 0.75, "rewards/chosen": -12.190940856933594, "rewards/margins": 21.803115844726562, "rewards/rejected": -33.994056701660156, "step": 611 }, { "epoch": 0.9125815470643057, "grad_norm": 49.41778458085236, "learning_rate": 2.038022022764685e-08, "logits/chosen": 1.6303707361221313, "logits/rejected": 1.7657480239868164, "logps/chosen": -1.535523533821106, "logps/rejected": -3.0815024375915527, "loss": 2.1474, "nll_loss": 1.5355234146118164, "rewards/accuracies": 0.875, "rewards/chosen": -15.355234146118164, "rewards/margins": 15.459792137145996, "rewards/rejected": -30.815027236938477, "step": 612 }, { "epoch": 0.9140726933830382, "grad_norm": 61.05392998371482, "learning_rate": 1.9688121698277993e-08, "logits/chosen": 1.0834482908248901, "logits/rejected": 1.8084291219711304, "logps/chosen": -1.3302435874938965, "logps/rejected": -3.0064194202423096, "loss": 1.4181, "nll_loss": 1.330243706703186, "rewards/accuracies": 1.0, "rewards/chosen": -13.302435874938965, "rewards/margins": 16.761760711669922, "rewards/rejected": -30.06419563293457, "step": 613 }, { "epoch": 0.9155638397017707, "grad_norm": 34.21776098764438, "learning_rate": 1.90077426385431e-08, "logits/chosen": 0.7245126962661743, "logits/rejected": 0.22384954988956451, "logps/chosen": -1.3143211603164673, "logps/rejected": -1.861713171005249, "loss": 1.8045, "nll_loss": 1.3143210411071777, "rewards/accuracies": 0.75, "rewards/chosen": -13.143211364746094, "rewards/margins": 5.4739227294921875, "rewards/rejected": -18.61713218688965, "step": 614 }, { "epoch": 0.9170549860205033, "grad_norm": 42.5324206312152, "learning_rate": 1.8339099649492762e-08, "logits/chosen": 0.3494272232055664, "logits/rejected": 0.9441218376159668, "logps/chosen": -1.5409001111984253, "logps/rejected": -3.023975372314453, "loss": 2.1119, "nll_loss": 1.5409001111984253, "rewards/accuracies": 0.75, "rewards/chosen": -15.409000396728516, "rewards/margins": 14.83074951171875, "rewards/rejected": -30.239749908447266, "step": 615 }, { "epoch": 0.9185461323392358, "grad_norm": 43.06626427761403, "learning_rate": 1.7682209045820684e-08, "logits/chosen": 0.9627107977867126, "logits/rejected": 1.0145137310028076, "logps/chosen": -0.9481117725372314, "logps/rejected": -2.1838059425354004, "loss": 2.1317, "nll_loss": 0.9481117725372314, "rewards/accuracies": 1.0, "rewards/chosen": -9.481118202209473, "rewards/margins": 12.356941223144531, "rewards/rejected": -21.838058471679688, "step": 616 }, { "epoch": 0.9200372786579684, "grad_norm": 178.43839248334547, "learning_rate": 1.7037086855465898e-08, "logits/chosen": 0.6895330548286438, "logits/rejected": 0.6384507417678833, "logps/chosen": -1.1706783771514893, "logps/rejected": -1.7648053169250488, "loss": 2.2996, "nll_loss": 1.1706783771514893, "rewards/accuracies": 0.75, "rewards/chosen": -11.706781387329102, "rewards/margins": 5.94127082824707, "rewards/rejected": -17.648052215576172, "step": 617 }, { "epoch": 0.9215284249767008, "grad_norm": 66.09270295798773, "learning_rate": 1.6403748819221462e-08, "logits/chosen": 0.9936915040016174, "logits/rejected": 1.0178120136260986, "logps/chosen": -1.1915603876113892, "logps/rejected": -1.6295900344848633, "loss": 2.3494, "nll_loss": 1.1915605068206787, "rewards/accuracies": 0.75, "rewards/chosen": -11.915602684020996, "rewards/margins": 4.38029670715332, "rewards/rejected": -16.295902252197266, "step": 618 }, { "epoch": 0.9230195712954333, "grad_norm": 72.23670774806365, "learning_rate": 1.5782210390350713e-08, "logits/chosen": 0.9415168762207031, "logits/rejected": 0.9538595080375671, "logps/chosen": -1.3151698112487793, "logps/rejected": -2.0880651473999023, "loss": 1.289, "nll_loss": 1.3151699304580688, "rewards/accuracies": 0.75, "rewards/chosen": -13.15169906616211, "rewards/margins": 7.728951930999756, "rewards/rejected": -20.880651473999023, "step": 619 }, { "epoch": 0.9245107176141659, "grad_norm": 517.5186319293753, "learning_rate": 1.5172486734209788e-08, "logits/chosen": 1.1598143577575684, "logits/rejected": 1.4311909675598145, "logps/chosen": -1.2884063720703125, "logps/rejected": -2.6638433933258057, "loss": 3.2189, "nll_loss": 1.2884063720703125, "rewards/accuracies": 0.75, "rewards/chosen": -12.884064674377441, "rewards/margins": 13.75437068939209, "rewards/rejected": -26.63843536376953, "step": 620 }, { "epoch": 0.9260018639328984, "grad_norm": 48.93138506507298, "learning_rate": 1.4574592727878088e-08, "logits/chosen": 1.112391471862793, "logits/rejected": 1.7636058330535889, "logps/chosen": -1.2476575374603271, "logps/rejected": -2.744623899459839, "loss": 1.6136, "nll_loss": 1.2476575374603271, "rewards/accuracies": 0.875, "rewards/chosen": -12.476574897766113, "rewards/margins": 14.9696626663208, "rewards/rejected": -27.446237564086914, "step": 621 }, { "epoch": 0.9274930102516309, "grad_norm": 69.13203966426812, "learning_rate": 1.3988542959794625e-08, "logits/chosen": 1.5924474000930786, "logits/rejected": 1.2295907735824585, "logps/chosen": -1.1291077136993408, "logps/rejected": -1.6911044120788574, "loss": 2.1184, "nll_loss": 1.1291077136993408, "rewards/accuracies": 0.625, "rewards/chosen": -11.291078567504883, "rewards/margins": 5.619965076446533, "rewards/rejected": -16.911041259765625, "step": 622 }, { "epoch": 0.9289841565703635, "grad_norm": 53.60782937532031, "learning_rate": 1.3414351729402863e-08, "logits/chosen": 0.5311175584793091, "logits/rejected": 0.696600079536438, "logps/chosen": -1.6392128467559814, "logps/rejected": -2.6737866401672363, "loss": 1.3688, "nll_loss": 1.6392128467559814, "rewards/accuracies": 0.875, "rewards/chosen": -16.39212989807129, "rewards/margins": 10.34573745727539, "rewards/rejected": -26.737865447998047, "step": 623 }, { "epoch": 0.930475302889096, "grad_norm": 66.6061457553233, "learning_rate": 1.2852033046801104e-08, "logits/chosen": 1.6591801643371582, "logits/rejected": 1.7488484382629395, "logps/chosen": -1.3164441585540771, "logps/rejected": -3.4187371730804443, "loss": 2.2486, "nll_loss": 1.3164441585540771, "rewards/accuracies": 1.0, "rewards/chosen": -13.164440155029297, "rewards/margins": 21.022930145263672, "rewards/rejected": -34.18737030029297, "step": 624 }, { "epoch": 0.9319664492078286, "grad_norm": 76.24305169308032, "learning_rate": 1.230160063240121e-08, "logits/chosen": 0.9626173973083496, "logits/rejected": 0.7872528433799744, "logps/chosen": -1.2172050476074219, "logps/rejected": -2.30886173248291, "loss": 1.8232, "nll_loss": 1.2172050476074219, "rewards/accuracies": 0.875, "rewards/chosen": -12.172050476074219, "rewards/margins": 10.916565895080566, "rewards/rejected": -23.0886173248291, "step": 625 }, { "epoch": 0.933457595526561, "grad_norm": 65.28519911178785, "learning_rate": 1.176306791659326e-08, "logits/chosen": 0.561115026473999, "logits/rejected": 0.9786922931671143, "logps/chosen": -1.3929322957992554, "logps/rejected": -3.053676128387451, "loss": 2.4585, "nll_loss": 1.3929322957992554, "rewards/accuracies": 0.75, "rewards/chosen": -13.9293212890625, "rewards/margins": 16.607440948486328, "rewards/rejected": -30.536762237548828, "step": 626 }, { "epoch": 0.9349487418452935, "grad_norm": 71.50712565768815, "learning_rate": 1.1236448039418423e-08, "logits/chosen": 0.45714130997657776, "logits/rejected": 0.8796458840370178, "logps/chosen": -1.5751104354858398, "logps/rejected": -4.103085994720459, "loss": 1.451, "nll_loss": 1.5751101970672607, "rewards/accuracies": 0.875, "rewards/chosen": -15.751103401184082, "rewards/margins": 25.279754638671875, "rewards/rejected": -41.030860900878906, "step": 627 }, { "epoch": 0.9364398881640261, "grad_norm": 61.241804371004626, "learning_rate": 1.0721753850247984e-08, "logits/chosen": 0.9310768842697144, "logits/rejected": 0.5946216583251953, "logps/chosen": -1.1604797840118408, "logps/rejected": -2.7220888137817383, "loss": 2.3107, "nll_loss": 1.1604797840118408, "rewards/accuracies": 0.75, "rewards/chosen": -11.604798316955566, "rewards/margins": 15.6160888671875, "rewards/rejected": -27.22088623046875, "step": 628 }, { "epoch": 0.9379310344827586, "grad_norm": 121.50896184894009, "learning_rate": 1.021899790746994e-08, "logits/chosen": 0.08384272456169128, "logits/rejected": 0.2588069438934326, "logps/chosen": -0.8513846397399902, "logps/rejected": -2.030687093734741, "loss": 2.3264, "nll_loss": 0.8513847589492798, "rewards/accuracies": 0.875, "rewards/chosen": -8.513846397399902, "rewards/margins": 11.793025016784668, "rewards/rejected": -20.306873321533203, "step": 629 }, { "epoch": 0.9394221808014911, "grad_norm": 35.10139854500652, "learning_rate": 9.728192478182573e-09, "logits/chosen": 1.4342637062072754, "logits/rejected": 1.5611820220947266, "logps/chosen": -1.205596685409546, "logps/rejected": -2.7099761962890625, "loss": 1.568, "nll_loss": 1.205596685409546, "rewards/accuracies": 0.875, "rewards/chosen": -12.055967330932617, "rewards/margins": 15.043792724609375, "rewards/rejected": -27.099760055541992, "step": 630 }, { "epoch": 0.9409133271202237, "grad_norm": 29.967230346920555, "learning_rate": 9.249349537894968e-09, "logits/chosen": 0.3210725784301758, "logits/rejected": 0.6524243354797363, "logps/chosen": -0.7849727272987366, "logps/rejected": -2.915769338607788, "loss": 1.5934, "nll_loss": 0.784972608089447, "rewards/accuracies": 1.0, "rewards/chosen": -7.849726676940918, "rewards/margins": 21.307964324951172, "rewards/rejected": -29.15769386291504, "step": 631 }, { "epoch": 0.9424044734389562, "grad_norm": 141.74255777816967, "learning_rate": 8.782480770235246e-09, "logits/chosen": 0.3069119453430176, "logits/rejected": 0.485944926738739, "logps/chosen": -1.1904934644699097, "logps/rejected": -3.739229440689087, "loss": 2.0276, "nll_loss": 1.1904934644699097, "rewards/accuracies": 1.0, "rewards/chosen": -11.904936790466309, "rewards/margins": 25.487361907958984, "rewards/rejected": -37.392295837402344, "step": 632 }, { "epoch": 0.9438956197576888, "grad_norm": 60.527308659160475, "learning_rate": 8.327597566665013e-09, "logits/chosen": 1.219635009765625, "logits/rejected": 1.583566427230835, "logps/chosen": -1.4690210819244385, "logps/rejected": -2.254682779312134, "loss": 2.6834, "nll_loss": 1.4690210819244385, "rewards/accuracies": 0.75, "rewards/chosen": -14.690210342407227, "rewards/margins": 7.856616973876953, "rewards/rejected": -22.54682731628418, "step": 633 }, { "epoch": 0.9453867660764212, "grad_norm": 56.10853338090398, "learning_rate": 7.884711026201584e-09, "logits/chosen": 1.9719793796539307, "logits/rejected": 2.1581757068634033, "logps/chosen": -1.436596393585205, "logps/rejected": -2.7122642993927, "loss": 1.6313, "nll_loss": 1.436596393585205, "rewards/accuracies": 1.0, "rewards/chosen": -14.36596393585205, "rewards/margins": 12.75667667388916, "rewards/rejected": -27.122638702392578, "step": 634 }, { "epoch": 0.9468779123951537, "grad_norm": 37.272532661154244, "learning_rate": 7.453831955147428e-09, "logits/chosen": 0.12578025460243225, "logits/rejected": -0.2529383897781372, "logps/chosen": -1.0211150646209717, "logps/rejected": -2.2397446632385254, "loss": 1.4805, "nll_loss": 1.0211150646209717, "rewards/accuracies": 1.0, "rewards/chosen": -10.211149215698242, "rewards/margins": 12.186296463012695, "rewards/rejected": -22.397445678710938, "step": 635 }, { "epoch": 0.9483690587138863, "grad_norm": 56.88066251632034, "learning_rate": 7.034970866825973e-09, "logits/chosen": 0.7423490285873413, "logits/rejected": 0.7912289500236511, "logps/chosen": -1.2308918237686157, "logps/rejected": -1.750805139541626, "loss": 1.4598, "nll_loss": 1.2308918237686157, "rewards/accuracies": 0.875, "rewards/chosen": -12.308917999267578, "rewards/margins": 5.19913387298584, "rewards/rejected": -17.508052825927734, "step": 636 }, { "epoch": 0.9498602050326188, "grad_norm": 113.66211780898051, "learning_rate": 6.62813798132561e-09, "logits/chosen": 1.0367634296417236, "logits/rejected": 0.7382550835609436, "logps/chosen": -1.2447309494018555, "logps/rejected": -1.7634875774383545, "loss": 3.0594, "nll_loss": 1.2447309494018555, "rewards/accuracies": 0.75, "rewards/chosen": -12.447310447692871, "rewards/margins": 5.187565326690674, "rewards/rejected": -17.634876251220703, "step": 637 }, { "epoch": 0.9513513513513514, "grad_norm": 63.164427816842846, "learning_rate": 6.233343225249932e-09, "logits/chosen": 1.2581459283828735, "logits/rejected": 1.399833083152771, "logps/chosen": -0.9300872683525085, "logps/rejected": -1.7020875215530396, "loss": 2.3141, "nll_loss": 0.9300872683525085, "rewards/accuracies": 0.875, "rewards/chosen": -9.300872802734375, "rewards/margins": 7.7200026512146, "rewards/rejected": -17.0208740234375, "step": 638 }, { "epoch": 0.9528424976700839, "grad_norm": 116.17240382010048, "learning_rate": 5.850596231475768e-09, "logits/chosen": 1.2323070764541626, "logits/rejected": 1.4736135005950928, "logps/chosen": -1.9990520477294922, "logps/rejected": -2.0188441276550293, "loss": 2.3635, "nll_loss": 1.9990520477294922, "rewards/accuracies": 0.5, "rewards/chosen": -19.990522384643555, "rewards/margins": 0.19792091846466064, "rewards/rejected": -20.18844223022461, "step": 639 }, { "epoch": 0.9543336439888164, "grad_norm": 148.9927846719541, "learning_rate": 5.4799063389179834e-09, "logits/chosen": 0.46052801609039307, "logits/rejected": 0.3371131420135498, "logps/chosen": -1.3033381700515747, "logps/rejected": -2.339939832687378, "loss": 2.2474, "nll_loss": 1.3033380508422852, "rewards/accuracies": 0.625, "rewards/chosen": -13.033382415771484, "rewards/margins": 10.36601448059082, "rewards/rejected": -23.399396896362305, "step": 640 }, { "epoch": 0.955824790307549, "grad_norm": 48.0116134291069, "learning_rate": 5.1212825923019345e-09, "logits/chosen": 0.8156238794326782, "logits/rejected": 1.123835563659668, "logps/chosen": -1.2078592777252197, "logps/rejected": -4.5481648445129395, "loss": 1.7785, "nll_loss": 1.2078593969345093, "rewards/accuracies": 1.0, "rewards/chosen": -12.078592300415039, "rewards/margins": 33.40306091308594, "rewards/rejected": -45.481651306152344, "step": 641 }, { "epoch": 0.9573159366262814, "grad_norm": 40.00712354284295, "learning_rate": 4.7747337419422054e-09, "logits/chosen": 0.6781156063079834, "logits/rejected": 1.3144218921661377, "logps/chosen": -1.5253450870513916, "logps/rejected": -3.2332403659820557, "loss": 2.1828, "nll_loss": 1.5253452062606812, "rewards/accuracies": 1.0, "rewards/chosen": -15.253450393676758, "rewards/margins": 17.078954696655273, "rewards/rejected": -32.33240509033203, "step": 642 }, { "epoch": 0.9588070829450139, "grad_norm": 80.79198208372269, "learning_rate": 4.440268243529666e-09, "logits/chosen": 0.9073221683502197, "logits/rejected": 0.5689750909805298, "logps/chosen": -0.7339849472045898, "logps/rejected": -1.8821582794189453, "loss": 1.6134, "nll_loss": 0.7339848279953003, "rewards/accuracies": 0.75, "rewards/chosen": -7.33984899520874, "rewards/margins": 11.481732368469238, "rewards/rejected": -18.82158088684082, "step": 643 }, { "epoch": 0.9602982292637465, "grad_norm": 90.62829032832231, "learning_rate": 4.117894257924803e-09, "logits/chosen": 0.46462368965148926, "logits/rejected": 0.5671396255493164, "logps/chosen": -1.1450039148330688, "logps/rejected": -2.218526601791382, "loss": 2.9791, "nll_loss": 1.1450039148330688, "rewards/accuracies": 1.0, "rewards/chosen": -11.450039863586426, "rewards/margins": 10.735227584838867, "rewards/rejected": -22.185266494750977, "step": 644 }, { "epoch": 0.961789375582479, "grad_norm": 53.71551572686904, "learning_rate": 3.807619650958827e-09, "logits/chosen": 0.6184532642364502, "logits/rejected": 0.8686254024505615, "logps/chosen": -1.2457393407821655, "logps/rejected": -3.1474404335021973, "loss": 1.738, "nll_loss": 1.2457393407821655, "rewards/accuracies": 1.0, "rewards/chosen": -12.457393646240234, "rewards/margins": 19.01700782775879, "rewards/rejected": -31.474403381347656, "step": 645 }, { "epoch": 0.9632805219012116, "grad_norm": 53.90284021390508, "learning_rate": 3.509451993241541e-09, "logits/chosen": 0.43453526496887207, "logits/rejected": 0.8017415404319763, "logps/chosen": -1.391627311706543, "logps/rejected": -2.1751885414123535, "loss": 1.8167, "nll_loss": 1.3916271924972534, "rewards/accuracies": 0.625, "rewards/chosen": -13.916272163391113, "rewards/margins": 7.835611343383789, "rewards/rejected": -21.75188446044922, "step": 646 }, { "epoch": 0.9647716682199441, "grad_norm": 52.50485960299144, "learning_rate": 3.22339855997672e-09, "logits/chosen": 0.2412007749080658, "logits/rejected": 0.03859227895736694, "logps/chosen": -1.3658912181854248, "logps/rejected": -2.9418210983276367, "loss": 1.9701, "nll_loss": 1.3658912181854248, "rewards/accuracies": 1.0, "rewards/chosen": -13.65891170501709, "rewards/margins": 15.759300231933594, "rewards/rejected": -29.418210983276367, "step": 647 }, { "epoch": 0.9662628145386766, "grad_norm": 34.81335059745858, "learning_rate": 2.9494663307847443e-09, "logits/chosen": 1.0914424657821655, "logits/rejected": 1.1791269779205322, "logps/chosen": -1.3588266372680664, "logps/rejected": -2.738715887069702, "loss": 1.9192, "nll_loss": 1.3588268756866455, "rewards/accuracies": 1.0, "rewards/chosen": -13.588268280029297, "rewards/margins": 13.798890113830566, "rewards/rejected": -27.38715934753418, "step": 648 }, { "epoch": 0.9677539608574092, "grad_norm": 69.14758369873154, "learning_rate": 2.687661989531964e-09, "logits/chosen": 0.7465240359306335, "logits/rejected": 1.6419605016708374, "logps/chosen": -1.1785346269607544, "logps/rejected": -2.6736412048339844, "loss": 2.6972, "nll_loss": 1.1785345077514648, "rewards/accuracies": 1.0, "rewards/chosen": -11.785346984863281, "rewards/margins": 14.951066970825195, "rewards/rejected": -26.736412048339844, "step": 649 }, { "epoch": 0.9692451071761417, "grad_norm": 55.38444741315943, "learning_rate": 2.437991924167937e-09, "logits/chosen": 1.2680692672729492, "logits/rejected": 1.549578070640564, "logps/chosen": -0.9802628755569458, "logps/rejected": -2.6409151554107666, "loss": 1.8713, "nll_loss": 0.9802627563476562, "rewards/accuracies": 1.0, "rewards/chosen": -9.802628517150879, "rewards/margins": 16.606521606445312, "rewards/rejected": -26.40915298461914, "step": 650 }, { "epoch": 0.9707362534948741, "grad_norm": 43.159724090253455, "learning_rate": 2.2004622265693882e-09, "logits/chosen": 1.2227438688278198, "logits/rejected": 1.2243788242340088, "logps/chosen": -1.2036027908325195, "logps/rejected": -2.386033296585083, "loss": 1.9318, "nll_loss": 1.2036027908325195, "rewards/accuracies": 0.875, "rewards/chosen": -12.036026000976562, "rewards/margins": 11.82430648803711, "rewards/rejected": -23.860336303710938, "step": 651 }, { "epoch": 0.9722273998136067, "grad_norm": 56.44120590824913, "learning_rate": 1.975078692391552e-09, "logits/chosen": 0.7070285081863403, "logits/rejected": 0.6214600801467896, "logps/chosen": -1.1754833459854126, "logps/rejected": -4.710268497467041, "loss": 1.9201, "nll_loss": 1.175483226776123, "rewards/accuracies": 1.0, "rewards/chosen": -11.75483226776123, "rewards/margins": 35.34785461425781, "rewards/rejected": -47.102684020996094, "step": 652 }, { "epoch": 0.9737185461323392, "grad_norm": 51.00916882401214, "learning_rate": 1.7618468209268933e-09, "logits/chosen": 1.83562433719635, "logits/rejected": 1.3902753591537476, "logps/chosen": -1.349332332611084, "logps/rejected": -5.075465202331543, "loss": 2.4647, "nll_loss": 1.349332332611084, "rewards/accuracies": 0.875, "rewards/chosen": -13.493322372436523, "rewards/margins": 37.261329650878906, "rewards/rejected": -50.75465774536133, "step": 653 }, { "epoch": 0.9752096924510718, "grad_norm": 173.61554484377376, "learning_rate": 1.5607718149708848e-09, "logits/chosen": 1.0174025297164917, "logits/rejected": 1.0746914148330688, "logps/chosen": -1.3610864877700806, "logps/rejected": -3.2821123600006104, "loss": 3.1357, "nll_loss": 1.361086368560791, "rewards/accuracies": 0.75, "rewards/chosen": -13.610864639282227, "rewards/margins": 19.21026039123535, "rewards/rejected": -32.821128845214844, "step": 654 }, { "epoch": 0.9767008387698043, "grad_norm": 73.2726059205274, "learning_rate": 1.37185858069494e-09, "logits/chosen": 1.0790843963623047, "logits/rejected": 0.8912894129753113, "logps/chosen": -1.2312716245651245, "logps/rejected": -2.1072559356689453, "loss": 1.3087, "nll_loss": 1.2312716245651245, "rewards/accuracies": 0.75, "rewards/chosen": -12.31271743774414, "rewards/margins": 8.759840965270996, "rewards/rejected": -21.07255744934082, "step": 655 }, { "epoch": 0.9781919850885368, "grad_norm": 37.979537721749395, "learning_rate": 1.195111727526843e-09, "logits/chosen": 0.765521764755249, "logits/rejected": 1.9364503622055054, "logps/chosen": -0.838382363319397, "logps/rejected": -4.206418514251709, "loss": 1.2937, "nll_loss": 0.838382363319397, "rewards/accuracies": 1.0, "rewards/chosen": -8.38382339477539, "rewards/margins": 33.68035888671875, "rewards/rejected": -42.064186096191406, "step": 656 }, { "epoch": 0.9796831314072694, "grad_norm": 60.05396788153073, "learning_rate": 1.0305355680382266e-09, "logits/chosen": 1.1509252786636353, "logits/rejected": 1.104540467262268, "logps/chosen": -1.3822535276412964, "logps/rejected": -3.314253807067871, "loss": 1.632, "nll_loss": 1.3822535276412964, "rewards/accuracies": 1.0, "rewards/chosen": -13.822535514831543, "rewards/margins": 19.32000160217285, "rewards/rejected": -33.14253616333008, "step": 657 }, { "epoch": 0.9811742777260019, "grad_norm": 60.548920303826634, "learning_rate": 8.781341178393242e-10, "logits/chosen": 1.27897047996521, "logits/rejected": 1.580756664276123, "logps/chosen": -1.4538989067077637, "logps/rejected": -3.935889720916748, "loss": 2.3341, "nll_loss": 1.4538989067077637, "rewards/accuracies": 0.875, "rewards/chosen": -14.538990020751953, "rewards/margins": 24.81990623474121, "rewards/rejected": -39.35889434814453, "step": 658 }, { "epoch": 0.9826654240447343, "grad_norm": 45.86956571349362, "learning_rate": 7.379110954810475e-10, "logits/chosen": 0.560336172580719, "logits/rejected": 0.9452404379844666, "logps/chosen": -1.4137957096099854, "logps/rejected": -3.3868355751037598, "loss": 1.798, "nll_loss": 1.4137957096099854, "rewards/accuracies": 0.75, "rewards/chosen": -14.137956619262695, "rewards/margins": 19.730396270751953, "rewards/rejected": -33.86835479736328, "step": 659 }, { "epoch": 0.9841565703634669, "grad_norm": 57.46987230573217, "learning_rate": 6.098699223641701e-10, "logits/chosen": 0.6086639761924744, "logits/rejected": 0.858557939529419, "logps/chosen": -1.8596971035003662, "logps/rejected": -3.0429792404174805, "loss": 2.201, "nll_loss": 1.8596974611282349, "rewards/accuracies": 0.75, "rewards/chosen": -18.596973419189453, "rewards/margins": 11.832818031311035, "rewards/rejected": -30.429792404174805, "step": 660 }, { "epoch": 0.9856477166821994, "grad_norm": 62.5753873150776, "learning_rate": 4.940137226560615e-10, "logits/chosen": 0.6363529562950134, "logits/rejected": 1.5825424194335938, "logps/chosen": -1.7450621128082275, "logps/rejected": -2.7526190280914307, "loss": 2.2655, "nll_loss": 1.7450621128082275, "rewards/accuracies": 0.75, "rewards/chosen": -17.450620651245117, "rewards/margins": 10.075567245483398, "rewards/rejected": -27.52618980407715, "step": 661 }, { "epoch": 0.987138863000932, "grad_norm": 47.832905093901545, "learning_rate": 3.903453232140808e-10, "logits/chosen": 1.244388222694397, "logits/rejected": 2.0133092403411865, "logps/chosen": -1.0886551141738892, "logps/rejected": -2.5148251056671143, "loss": 1.9311, "nll_loss": 1.0886549949645996, "rewards/accuracies": 0.875, "rewards/chosen": -10.886550903320312, "rewards/margins": 14.261701583862305, "rewards/rejected": -25.148252487182617, "step": 662 }, { "epoch": 0.9886300093196645, "grad_norm": 41.41579902475401, "learning_rate": 2.988672535169656e-10, "logits/chosen": 1.6264417171478271, "logits/rejected": 1.6430389881134033, "logps/chosen": -1.0195305347442627, "logps/rejected": -2.44887638092041, "loss": 1.1391, "nll_loss": 1.0195305347442627, "rewards/accuracies": 1.0, "rewards/chosen": -10.195306777954102, "rewards/margins": 14.293455123901367, "rewards/rejected": -24.48876190185547, "step": 663 }, { "epoch": 0.990121155638397, "grad_norm": 61.20192914397783, "learning_rate": 2.1958174560282594e-10, "logits/chosen": 0.9436533451080322, "logits/rejected": 1.2115904092788696, "logps/chosen": -1.481966495513916, "logps/rejected": -3.0835845470428467, "loss": 1.5986, "nll_loss": 1.481966495513916, "rewards/accuracies": 0.75, "rewards/chosen": -14.819665908813477, "rewards/margins": 16.01618003845215, "rewards/rejected": -30.835845947265625, "step": 664 }, { "epoch": 0.9916123019571296, "grad_norm": 79.46932255527533, "learning_rate": 1.5249073401502055e-10, "logits/chosen": 1.7353107929229736, "logits/rejected": 2.1959099769592285, "logps/chosen": -1.3309359550476074, "logps/rejected": -3.29795503616333, "loss": 1.6526, "nll_loss": 1.330936074256897, "rewards/accuracies": 0.875, "rewards/chosen": -13.309359550476074, "rewards/margins": 19.670188903808594, "rewards/rejected": -32.979549407958984, "step": 665 }, { "epoch": 0.993103448275862, "grad_norm": 41.19926284687701, "learning_rate": 9.759585575458417e-11, "logits/chosen": 0.7111138701438904, "logits/rejected": 0.504228949546814, "logps/chosen": -1.7196786403656006, "logps/rejected": -2.507728099822998, "loss": 1.775, "nll_loss": 1.7196786403656006, "rewards/accuracies": 0.625, "rewards/chosen": -17.19678497314453, "rewards/margins": 7.880496025085449, "rewards/rejected": -25.077281951904297, "step": 666 }, { "epoch": 0.9945945945945946, "grad_norm": 165.31221246030398, "learning_rate": 5.4898450240536964e-11, "logits/chosen": 1.0977013111114502, "logits/rejected": 1.3191360235214233, "logps/chosen": -1.375827431678772, "logps/rejected": -2.653599262237549, "loss": 2.869, "nll_loss": 1.3758275508880615, "rewards/accuracies": 0.625, "rewards/chosen": -13.75827407836914, "rewards/margins": 12.777718544006348, "rewards/rejected": -26.535995483398438, "step": 667 }, { "epoch": 0.9960857409133271, "grad_norm": 50.15883035033017, "learning_rate": 2.4399559277132885e-11, "logits/chosen": 1.5955461263656616, "logits/rejected": 1.0452600717544556, "logps/chosen": -1.4784938097000122, "logps/rejected": -2.4211621284484863, "loss": 1.0848, "nll_loss": 1.4784936904907227, "rewards/accuracies": 0.875, "rewards/chosen": -14.784937858581543, "rewards/margins": 9.426685333251953, "rewards/rejected": -24.21162223815918, "step": 668 }, { "epoch": 0.9975768872320596, "grad_norm": 39.829485947617805, "learning_rate": 6.099927028380136e-12, "logits/chosen": 1.5119438171386719, "logits/rejected": 1.5995426177978516, "logps/chosen": -1.1745887994766235, "logps/rejected": -2.149003267288208, "loss": 1.478, "nll_loss": 1.174588918685913, "rewards/accuracies": 0.75, "rewards/chosen": -11.745888710021973, "rewards/margins": 9.744142532348633, "rewards/rejected": -21.490032196044922, "step": 669 }, { "epoch": 0.9990680335507922, "grad_norm": 35.06242587573981, "learning_rate": 0.0, "logits/chosen": 1.1288516521453857, "logits/rejected": 1.465525507926941, "logps/chosen": -1.4203619956970215, "logps/rejected": -5.043858528137207, "loss": 2.0247, "nll_loss": 1.420362114906311, "rewards/accuracies": 0.875, "rewards/chosen": -14.203620910644531, "rewards/margins": 36.234962463378906, "rewards/rejected": -50.4385871887207, "step": 670 }, { "epoch": 0.9990680335507922, "step": 670, "total_flos": 0.0, "train_loss": 2.3420828120032353, "train_runtime": 14192.8209, "train_samples_per_second": 3.024, "train_steps_per_second": 0.047 } ], "logging_steps": 1, "max_steps": 670, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }