{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994389377220871, "eval_steps": 134, "global_step": 1336, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007480830372171312, "grad_norm": 59.974050584778176, "learning_rate": 1.2195121951219512e-08, "logps/chosen": -96.5219955444336, "logps/rejected": -105.89979553222656, "loss": 0.6931, "losses/dpo": 0.6931471824645996, "losses/sft": 1.0480493307113647, "losses/total": 0.6931471824645996, "ref_logps/chosen": -96.5219955444336, "ref_logps/rejected": -105.89979553222656, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0014961660744342623, "grad_norm": 50.97061327832686, "learning_rate": 2.4390243902439023e-08, "logps/chosen": -88.67269897460938, "logps/rejected": -97.79839324951172, "loss": 0.6931, "losses/dpo": 0.6931471824645996, "losses/sft": 0.8721723556518555, "losses/total": 0.6931471824645996, "ref_logps/chosen": -88.67269897460938, "ref_logps/rejected": -97.79839324951172, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0022442491116513932, "grad_norm": 51.34404612924222, "learning_rate": 3.658536585365853e-08, "logps/chosen": -82.60709381103516, "logps/rejected": -91.552490234375, "loss": 0.6877, "losses/dpo": 0.6847645044326782, "losses/sft": 0.7619496583938599, "losses/total": 0.6847645044326782, "ref_logps/chosen": -82.68243408203125, "ref_logps/rejected": -91.51325988769531, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0075337765738368034, "rewards/margins": 0.01145703811198473, "rewards/rejected": -0.003923260606825352, "step": 3 }, { "epoch": 0.0029923321488685246, "grad_norm": 55.65705030530943, "learning_rate": 4.878048780487805e-08, "logps/chosen": -108.95577239990234, "logps/rejected": -110.39165496826172, "loss": 0.6964, "losses/dpo": 0.7135478854179382, "losses/sft": 1.1664021015167236, "losses/total": 0.7135478854179382, "ref_logps/chosen": -108.83744812011719, "ref_logps/rejected": -110.33306121826172, "rewards/accuracies": 0.46875, "rewards/chosen": -0.011832120828330517, "rewards/margins": -0.005972540006041527, "rewards/rejected": -0.00585958082228899, "step": 4 }, { "epoch": 0.0037404151860856555, "grad_norm": 55.90046801021424, "learning_rate": 6.097560975609756e-08, "logps/chosen": -96.78623962402344, "logps/rejected": -100.73726654052734, "loss": 0.6987, "losses/dpo": 0.6758191585540771, "losses/sft": 0.6152290105819702, "losses/total": 0.6758191585540771, "ref_logps/chosen": -96.76331329345703, "ref_logps/rejected": -100.81658935546875, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0022934856824576855, "rewards/margins": -0.010226046666502953, "rewards/rejected": 0.007932561449706554, "step": 5 }, { "epoch": 0.0044884982233027865, "grad_norm": 66.30510570492672, "learning_rate": 7.317073170731706e-08, "logps/chosen": -83.2293701171875, "logps/rejected": -87.60787963867188, "loss": 0.6956, "losses/dpo": 0.7265294194221497, "losses/sft": 1.0552270412445068, "losses/total": 0.7265294194221497, "ref_logps/chosen": -83.2189712524414, "ref_logps/rejected": -87.63996887207031, "rewards/accuracies": 0.375, "rewards/chosen": -0.0010399851016700268, "rewards/margins": -0.004248150624334812, "rewards/rejected": 0.0032081662211567163, "step": 6 }, { "epoch": 0.005236581260519918, "grad_norm": 61.026766892186195, "learning_rate": 8.536585365853659e-08, "logps/chosen": -76.70850372314453, "logps/rejected": -84.92127990722656, "loss": 0.6904, "losses/dpo": 0.6828237175941467, "losses/sft": 0.5018908381462097, "losses/total": 0.6828237175941467, "ref_logps/chosen": -76.69213104248047, "ref_logps/rejected": -84.84483337402344, "rewards/accuracies": 0.5, "rewards/chosen": -0.0016372414538636804, "rewards/margins": 0.006007486954331398, "rewards/rejected": -0.0076447282917797565, "step": 7 }, { "epoch": 0.005984664297737049, "grad_norm": 53.568933125094894, "learning_rate": 9.75609756097561e-08, "logps/chosen": -88.31718444824219, "logps/rejected": -91.24917602539062, "loss": 0.6953, "losses/dpo": 0.6982625722885132, "losses/sft": 0.835720419883728, "losses/total": 0.6982625722885132, "ref_logps/chosen": -88.28501892089844, "ref_logps/rejected": -91.25572967529297, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0032170102931559086, "rewards/margins": -0.003872552188113332, "rewards/rejected": 0.0006555425934493542, "step": 8 }, { "epoch": 0.00673274733495418, "grad_norm": 63.7463823975001, "learning_rate": 1.097560975609756e-07, "logps/chosen": -84.27828979492188, "logps/rejected": -93.98930358886719, "loss": 0.688, "losses/dpo": 0.6819353103637695, "losses/sft": 0.8514418005943298, "losses/total": 0.6819353103637695, "ref_logps/chosen": -84.30104064941406, "ref_logps/rejected": -93.904296875, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0022748024202883244, "rewards/margins": 0.010774126276373863, "rewards/rejected": -0.008499324321746826, "step": 9 }, { "epoch": 0.007480830372171311, "grad_norm": 70.09916529355763, "learning_rate": 1.219512195121951e-07, "logps/chosen": -104.21539306640625, "logps/rejected": -112.92938232421875, "loss": 0.6903, "losses/dpo": 0.6834984421730042, "losses/sft": 1.0034970045089722, "losses/total": 0.6834984421730042, "ref_logps/chosen": -104.26239776611328, "ref_logps/rejected": -112.91433715820312, "rewards/accuracies": 0.5625, "rewards/chosen": 0.004701527766883373, "rewards/margins": 0.006205395795404911, "rewards/rejected": -0.0015038668643683195, "step": 10 }, { "epoch": 0.008228913409388442, "grad_norm": 48.982005279508336, "learning_rate": 1.3414634146341465e-07, "logps/chosen": -86.68479919433594, "logps/rejected": -98.31290435791016, "loss": 0.6885, "losses/dpo": 0.6822197437286377, "losses/sft": 0.6507266163825989, "losses/total": 0.6822197437286377, "ref_logps/chosen": -86.75291442871094, "ref_logps/rejected": -98.28250122070312, "rewards/accuracies": 0.625, "rewards/chosen": 0.006812172941863537, "rewards/margins": 0.009852271527051926, "rewards/rejected": -0.00304009928368032, "step": 11 }, { "epoch": 0.008976996446605573, "grad_norm": 44.878427379921064, "learning_rate": 1.4634146341463413e-07, "logps/chosen": -76.83417510986328, "logps/rejected": -78.97032165527344, "loss": 0.696, "losses/dpo": 0.6876137256622314, "losses/sft": 0.578190267086029, "losses/total": 0.6876137256622314, "ref_logps/chosen": -76.81863403320312, "ref_logps/rejected": -79.00775146484375, "rewards/accuracies": 0.375, "rewards/chosen": -0.0015541142784059048, "rewards/margins": -0.005296960938721895, "rewards/rejected": 0.0037428471259772778, "step": 12 }, { "epoch": 0.009725079483822705, "grad_norm": 53.51957731256799, "learning_rate": 1.5853658536585366e-07, "logps/chosen": -106.34982299804688, "logps/rejected": -114.72909545898438, "loss": 0.6923, "losses/dpo": 0.691738486289978, "losses/sft": 0.9301700592041016, "losses/total": 0.691738486289978, "ref_logps/chosen": -106.34454345703125, "ref_logps/rejected": -114.7013931274414, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0005280462792143226, "rewards/margins": 0.0022426038049161434, "rewards/rejected": -0.002770650666207075, "step": 13 }, { "epoch": 0.010473162521039836, "grad_norm": 49.2781756015982, "learning_rate": 1.7073170731707317e-07, "logps/chosen": -87.08540344238281, "logps/rejected": -96.45118713378906, "loss": 0.6946, "losses/dpo": 0.6899164319038391, "losses/sft": 0.48029160499572754, "losses/total": 0.6899164319038391, "ref_logps/chosen": -87.04013061523438, "ref_logps/rejected": -96.43143463134766, "rewards/accuracies": 0.4375, "rewards/chosen": -0.004526582546532154, "rewards/margins": -0.002550914417952299, "rewards/rejected": -0.0019756671972572803, "step": 14 }, { "epoch": 0.011221245558256966, "grad_norm": 83.2821807294014, "learning_rate": 1.8292682926829268e-07, "logps/chosen": -84.44003295898438, "logps/rejected": -92.03987121582031, "loss": 0.695, "losses/dpo": 0.6700431108474731, "losses/sft": 0.477091908454895, "losses/total": 0.6700431108474731, "ref_logps/chosen": -84.43235778808594, "ref_logps/rejected": -92.06175231933594, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0007672780193388462, "rewards/margins": -0.0029550669714808464, "rewards/rejected": 0.0021877880208194256, "step": 15 }, { "epoch": 0.011969328595474098, "grad_norm": 57.54071830651204, "learning_rate": 1.951219512195122e-07, "logps/chosen": -90.35257720947266, "logps/rejected": -97.41439819335938, "loss": 0.6878, "losses/dpo": 0.6848957538604736, "losses/sft": 0.42037433385849, "losses/total": 0.6848957538604736, "ref_logps/chosen": -90.40846252441406, "ref_logps/rejected": -97.35786437988281, "rewards/accuracies": 0.5625, "rewards/chosen": 0.005588349886238575, "rewards/margins": 0.011242459528148174, "rewards/rejected": -0.005654109176248312, "step": 16 }, { "epoch": 0.012717411632691229, "grad_norm": 51.507084035213516, "learning_rate": 2.073170731707317e-07, "logps/chosen": -91.31112670898438, "logps/rejected": -100.69586944580078, "loss": 0.68, "losses/dpo": 0.6908072233200073, "losses/sft": 0.7803502082824707, "losses/total": 0.6908072233200073, "ref_logps/chosen": -91.40369415283203, "ref_logps/rejected": -100.51895141601562, "rewards/accuracies": 0.8125, "rewards/chosen": 0.009256690740585327, "rewards/margins": 0.026947977021336555, "rewards/rejected": -0.01769128441810608, "step": 17 }, { "epoch": 0.01346549466990836, "grad_norm": 78.4783212285233, "learning_rate": 2.195121951219512e-07, "logps/chosen": -81.65106201171875, "logps/rejected": -95.31147766113281, "loss": 0.697, "losses/dpo": 0.6907550096511841, "losses/sft": 0.6824198365211487, "losses/total": 0.6907550096511841, "ref_logps/chosen": -81.60857391357422, "ref_logps/rejected": -95.34159851074219, "rewards/accuracies": 0.4375, "rewards/chosen": -0.004249152727425098, "rewards/margins": -0.00726162176579237, "rewards/rejected": 0.0030124697368592024, "step": 18 }, { "epoch": 0.014213577707125492, "grad_norm": 60.714003443662854, "learning_rate": 2.3170731707317074e-07, "logps/chosen": -97.74163055419922, "logps/rejected": -106.2554931640625, "loss": 0.6924, "losses/dpo": 0.6738797426223755, "losses/sft": 1.2954185009002686, "losses/total": 0.6738797426223755, "ref_logps/chosen": -97.8375244140625, "ref_logps/rejected": -106.32772827148438, "rewards/accuracies": 0.46875, "rewards/chosen": 0.009589070454239845, "rewards/margins": 0.0023669907823204994, "rewards/rejected": 0.007222080137580633, "step": 19 }, { "epoch": 0.014961660744342622, "grad_norm": 62.96834410733016, "learning_rate": 2.439024390243902e-07, "logps/chosen": -61.521541595458984, "logps/rejected": -72.29498291015625, "loss": 0.6907, "losses/dpo": 0.7008021473884583, "losses/sft": 0.2065332680940628, "losses/total": 0.7008021473884583, "ref_logps/chosen": -61.53730392456055, "ref_logps/rejected": -72.2581558227539, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0015764713753014803, "rewards/margins": 0.005259665660560131, "rewards/rejected": -0.003683194750919938, "step": 20 }, { "epoch": 0.015709743781559753, "grad_norm": 50.04066687460498, "learning_rate": 2.5609756097560976e-07, "logps/chosen": -83.53723907470703, "logps/rejected": -94.33818054199219, "loss": 0.6956, "losses/dpo": 0.6744053363800049, "losses/sft": 0.7949776649475098, "losses/total": 0.6744053363800049, "ref_logps/chosen": -83.52857208251953, "ref_logps/rejected": -94.37078094482422, "rewards/accuracies": 0.4375, "rewards/chosen": -0.000866475747898221, "rewards/margins": -0.004126338288187981, "rewards/rejected": 0.0032598613761365414, "step": 21 }, { "epoch": 0.016457826818776885, "grad_norm": 62.90105448992747, "learning_rate": 2.682926829268293e-07, "logps/chosen": -75.12887573242188, "logps/rejected": -82.03578186035156, "loss": 0.6864, "losses/dpo": 0.6711546182632446, "losses/sft": 1.231006145477295, "losses/total": 0.6711546182632446, "ref_logps/chosen": -75.14749145507812, "ref_logps/rejected": -81.91493225097656, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0018609343096613884, "rewards/margins": 0.01394536904990673, "rewards/rejected": -0.012084433808922768, "step": 22 }, { "epoch": 0.017205909855994014, "grad_norm": 62.143024738756004, "learning_rate": 2.8048780487804877e-07, "logps/chosen": -106.08462524414062, "logps/rejected": -114.9342269897461, "loss": 0.6883, "losses/dpo": 0.713842511177063, "losses/sft": 0.579557478427887, "losses/total": 0.713842511177063, "ref_logps/chosen": -106.23292541503906, "ref_logps/rejected": -114.97693634033203, "rewards/accuracies": 0.5625, "rewards/chosen": 0.014829244464635849, "rewards/margins": 0.010558316484093666, "rewards/rejected": 0.0042709289118647575, "step": 23 }, { "epoch": 0.017953992893211146, "grad_norm": 45.04089833444425, "learning_rate": 2.9268292682926825e-07, "logps/chosen": -73.27450561523438, "logps/rejected": -75.62657165527344, "loss": 0.6937, "losses/dpo": 0.6925753951072693, "losses/sft": 0.3869549632072449, "losses/total": 0.6925753951072693, "ref_logps/chosen": -73.19186401367188, "ref_logps/rejected": -75.54731750488281, "rewards/accuracies": 0.5625, "rewards/chosen": -0.008264392614364624, "rewards/margins": -0.0003392565995454788, "rewards/rejected": -0.007925136014819145, "step": 24 }, { "epoch": 0.018702075930428278, "grad_norm": 108.66984631463723, "learning_rate": 3.048780487804878e-07, "logps/chosen": -98.27293395996094, "logps/rejected": -98.73696899414062, "loss": 0.6938, "losses/dpo": 0.6884900331497192, "losses/sft": 0.6735142469406128, "losses/total": 0.6884900331497192, "ref_logps/chosen": -98.21116638183594, "ref_logps/rejected": -98.68425750732422, "rewards/accuracies": 0.5625, "rewards/chosen": -0.006176230497658253, "rewards/margins": -0.0009045484475791454, "rewards/rejected": -0.005271682515740395, "step": 25 }, { "epoch": 0.01945015896764541, "grad_norm": 50.8873017268112, "learning_rate": 3.170731707317073e-07, "logps/chosen": -91.29349517822266, "logps/rejected": -89.82010650634766, "loss": 0.6934, "losses/dpo": 0.6929163336753845, "losses/sft": 0.6067396402359009, "losses/total": 0.6929163336753845, "ref_logps/chosen": -91.32647705078125, "ref_logps/rejected": -89.85245513916016, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0032984840217977762, "rewards/margins": 6.379315163940191e-05, "rewards/rejected": 0.003234690520912409, "step": 26 }, { "epoch": 0.02019824200486254, "grad_norm": 48.65607536152276, "learning_rate": 3.292682926829268e-07, "logps/chosen": -82.84942626953125, "logps/rejected": -81.38693237304688, "loss": 0.6978, "losses/dpo": 0.7131488919258118, "losses/sft": 0.7464495897293091, "losses/total": 0.7131488919258118, "ref_logps/chosen": -82.77721405029297, "ref_logps/rejected": -81.40044403076172, "rewards/accuracies": 0.4375, "rewards/chosen": -0.007220965810120106, "rewards/margins": -0.008571673184633255, "rewards/rejected": 0.0013507064431905746, "step": 27 }, { "epoch": 0.02094632504207967, "grad_norm": 57.138903138418506, "learning_rate": 3.4146341463414634e-07, "logps/chosen": -92.43415069580078, "logps/rejected": -97.15518188476562, "loss": 0.6973, "losses/dpo": 0.713499128818512, "losses/sft": 1.2389531135559082, "losses/total": 0.713499128818512, "ref_logps/chosen": -92.41498565673828, "ref_logps/rejected": -97.21293640136719, "rewards/accuracies": 0.5, "rewards/chosen": -0.0019167419523000717, "rewards/margins": -0.007691984996199608, "rewards/rejected": 0.0057752421125769615, "step": 28 }, { "epoch": 0.021694408079296804, "grad_norm": 72.27582318243923, "learning_rate": 3.536585365853658e-07, "logps/chosen": -97.90177917480469, "logps/rejected": -100.27867889404297, "loss": 0.6951, "losses/dpo": 0.7139302492141724, "losses/sft": 0.845596432685852, "losses/total": 0.7139302492141724, "ref_logps/chosen": -97.76292419433594, "ref_logps/rejected": -100.17295837402344, "rewards/accuracies": 0.46875, "rewards/chosen": -0.013885889202356339, "rewards/margins": -0.003312957938760519, "rewards/rejected": -0.010572931729257107, "step": 29 }, { "epoch": 0.022442491116513932, "grad_norm": 51.96774863576113, "learning_rate": 3.6585365853658536e-07, "logps/chosen": -75.03881072998047, "logps/rejected": -85.0166015625, "loss": 0.683, "losses/dpo": 0.6717315912246704, "losses/sft": 0.5275499224662781, "losses/total": 0.6717315912246704, "ref_logps/chosen": -75.07958984375, "ref_logps/rejected": -84.84619140625, "rewards/accuracies": 0.59375, "rewards/chosen": 0.004077172838151455, "rewards/margins": 0.02111792005598545, "rewards/rejected": -0.01704074814915657, "step": 30 }, { "epoch": 0.023190574153731065, "grad_norm": 62.371976637718035, "learning_rate": 3.7804878048780484e-07, "logps/chosen": -98.24893188476562, "logps/rejected": -104.1169662475586, "loss": 0.6945, "losses/dpo": 0.7032510638237, "losses/sft": 0.22414442896842957, "losses/total": 0.7032510638237, "ref_logps/chosen": -98.17474365234375, "ref_logps/rejected": -104.06255340576172, "rewards/accuracies": 0.5625, "rewards/chosen": -0.007419183850288391, "rewards/margins": -0.0019785314798355103, "rewards/rejected": -0.005440652370452881, "step": 31 }, { "epoch": 0.023938657190948197, "grad_norm": 49.73091575104649, "learning_rate": 3.902439024390244e-07, "logps/chosen": -81.25952911376953, "logps/rejected": -81.69898986816406, "loss": 0.6894, "losses/dpo": 0.6745765805244446, "losses/sft": 0.8292353749275208, "losses/total": 0.6745765805244446, "ref_logps/chosen": -81.31452178955078, "ref_logps/rejected": -81.67354583740234, "rewards/accuracies": 0.5625, "rewards/chosen": 0.005499035120010376, "rewards/margins": 0.008043794892728329, "rewards/rejected": -0.0025447607040405273, "step": 32 }, { "epoch": 0.024686740228165326, "grad_norm": 53.865023828860785, "learning_rate": 4.024390243902439e-07, "logps/chosen": -90.13691711425781, "logps/rejected": -92.54141235351562, "loss": 0.6944, "losses/dpo": 0.6830124258995056, "losses/sft": 0.6868780255317688, "losses/total": 0.6830124258995056, "ref_logps/chosen": -90.0866470336914, "ref_logps/rejected": -92.50923156738281, "rewards/accuracies": 0.5, "rewards/chosen": -0.005026575643569231, "rewards/margins": -0.0018092188984155655, "rewards/rejected": -0.003217357210814953, "step": 33 }, { "epoch": 0.025434823265382458, "grad_norm": 49.069453044398365, "learning_rate": 4.146341463414634e-07, "logps/chosen": -70.74198150634766, "logps/rejected": -81.5191650390625, "loss": 0.6962, "losses/dpo": 0.6928682327270508, "losses/sft": 0.5342616438865662, "losses/total": 0.6928682327270508, "ref_logps/chosen": -70.68021392822266, "ref_logps/rejected": -81.51214599609375, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0061765192076563835, "rewards/margins": -0.00547447707504034, "rewards/rejected": -0.0007020427146926522, "step": 34 }, { "epoch": 0.02618290630259959, "grad_norm": 50.594457233803816, "learning_rate": 4.268292682926829e-07, "logps/chosen": -52.091697692871094, "logps/rejected": -58.93048095703125, "loss": 0.6879, "losses/dpo": 0.6909371614456177, "losses/sft": 0.15952838957309723, "losses/total": 0.6909371614456177, "ref_logps/chosen": -52.152000427246094, "ref_logps/rejected": -58.881011962890625, "rewards/accuracies": 0.625, "rewards/chosen": 0.006030449643731117, "rewards/margins": 0.01097719743847847, "rewards/rejected": -0.004946747329086065, "step": 35 }, { "epoch": 0.02693098933981672, "grad_norm": 62.35636981403165, "learning_rate": 4.390243902439024e-07, "logps/chosen": -97.5057144165039, "logps/rejected": -99.71139526367188, "loss": 0.6942, "losses/dpo": 0.6876667737960815, "losses/sft": 1.0390082597732544, "losses/total": 0.6876667737960815, "ref_logps/chosen": -97.43600463867188, "ref_logps/rejected": -99.65267181396484, "rewards/accuracies": 0.5, "rewards/chosen": -0.0069717480801045895, "rewards/margins": -0.0010996116325259209, "rewards/rejected": -0.005872136913239956, "step": 36 }, { "epoch": 0.02767907237703385, "grad_norm": 69.89842236911912, "learning_rate": 4.5121951219512194e-07, "logps/chosen": -85.88082122802734, "logps/rejected": -97.86152648925781, "loss": 0.6899, "losses/dpo": 0.7072774171829224, "losses/sft": 0.6150345802307129, "losses/total": 0.7072774171829224, "ref_logps/chosen": -85.84672546386719, "ref_logps/rejected": -97.75698852539062, "rewards/accuracies": 0.53125, "rewards/chosen": -0.003409826662391424, "rewards/margins": 0.007044881582260132, "rewards/rejected": -0.010454708710312843, "step": 37 }, { "epoch": 0.028427155414250983, "grad_norm": 68.09297575385716, "learning_rate": 4.634146341463415e-07, "logps/chosen": -102.5638427734375, "logps/rejected": -97.7885513305664, "loss": 0.6935, "losses/dpo": 0.6917954683303833, "losses/sft": 1.6796200275421143, "losses/total": 0.6917954683303833, "ref_logps/chosen": -102.47378540039062, "ref_logps/rejected": -97.69910430908203, "rewards/accuracies": 0.53125, "rewards/chosen": -0.009004959836602211, "rewards/margins": -6.030488293617964e-05, "rewards/rejected": -0.00894465483725071, "step": 38 }, { "epoch": 0.029175238451468112, "grad_norm": 74.09542519144998, "learning_rate": 4.756097560975609e-07, "logps/chosen": -88.935546875, "logps/rejected": -96.40318298339844, "loss": 0.6957, "losses/dpo": 0.7104014754295349, "losses/sft": 0.7669236660003662, "losses/total": 0.7104014754295349, "ref_logps/chosen": -88.85249328613281, "ref_logps/rejected": -96.3647689819336, "rewards/accuracies": 0.46875, "rewards/chosen": -0.008305096998810768, "rewards/margins": -0.004462673794478178, "rewards/rejected": -0.003842422040179372, "step": 39 }, { "epoch": 0.029923321488685244, "grad_norm": 90.67944839668186, "learning_rate": 4.878048780487804e-07, "logps/chosen": -84.25971984863281, "logps/rejected": -95.01719665527344, "loss": 0.6824, "losses/dpo": 0.6790875196456909, "losses/sft": 0.8468064069747925, "losses/total": 0.6790875196456909, "ref_logps/chosen": -84.3022232055664, "ref_logps/rejected": -94.84003448486328, "rewards/accuracies": 0.8125, "rewards/chosen": 0.004250587895512581, "rewards/margins": 0.021966345608234406, "rewards/rejected": -0.017715759575366974, "step": 40 }, { "epoch": 0.030671404525902377, "grad_norm": 57.11476896725347, "learning_rate": 5e-07, "logps/chosen": -97.73909759521484, "logps/rejected": -96.327392578125, "loss": 0.6964, "losses/dpo": 0.7078738808631897, "losses/sft": 0.5666255950927734, "losses/total": 0.7078738808631897, "ref_logps/chosen": -97.61463165283203, "ref_logps/rejected": -96.26091003417969, "rewards/accuracies": 0.53125, "rewards/chosen": -0.012445923872292042, "rewards/margins": -0.0057981060817837715, "rewards/rejected": -0.006647817324846983, "step": 41 }, { "epoch": 0.031419487563119505, "grad_norm": 58.986868932686484, "learning_rate": 4.999992643520848e-07, "logps/chosen": -92.3856201171875, "logps/rejected": -103.89016723632812, "loss": 0.6949, "losses/dpo": 0.6837512850761414, "losses/sft": 1.1926568746566772, "losses/total": 0.6837512850761414, "ref_logps/chosen": -92.24179077148438, "ref_logps/rejected": -103.77666473388672, "rewards/accuracies": 0.4375, "rewards/chosen": -0.01438295841217041, "rewards/margins": -0.0030325944535434246, "rewards/rejected": -0.011350364424288273, "step": 42 }, { "epoch": 0.032167570600336634, "grad_norm": 68.19228307799769, "learning_rate": 4.999970574126684e-07, "logps/chosen": -83.64216613769531, "logps/rejected": -82.7593994140625, "loss": 0.6899, "losses/dpo": 0.6962037682533264, "losses/sft": 0.6758909225463867, "losses/total": 0.6962037682533264, "ref_logps/chosen": -83.5783462524414, "ref_logps/rejected": -82.62516784667969, "rewards/accuracies": 0.625, "rewards/chosen": -0.006382448133081198, "rewards/margins": 0.0070412661880254745, "rewards/rejected": -0.013423713855445385, "step": 43 }, { "epoch": 0.03291565363755377, "grad_norm": 64.28075754795778, "learning_rate": 4.999933791947391e-07, "logps/chosen": -92.13870239257812, "logps/rejected": -101.4595947265625, "loss": 0.6907, "losses/dpo": 0.698017954826355, "losses/sft": 0.34049952030181885, "losses/total": 0.698017954826355, "ref_logps/chosen": -92.05088806152344, "ref_logps/rejected": -101.31784057617188, "rewards/accuracies": 0.6875, "rewards/chosen": -0.00878117699176073, "rewards/margins": 0.0053946636617183685, "rewards/rejected": -0.014175841584801674, "step": 44 }, { "epoch": 0.0336637366747709, "grad_norm": 84.27569164579617, "learning_rate": 4.999882297199441e-07, "logps/chosen": -102.0521240234375, "logps/rejected": -103.25550842285156, "loss": 0.7008, "losses/dpo": 0.683803915977478, "losses/sft": 1.0301213264465332, "losses/total": 0.683803915977478, "ref_logps/chosen": -101.824462890625, "ref_logps/rejected": -103.17308807373047, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0227656289935112, "rewards/margins": -0.014524010010063648, "rewards/rejected": -0.008241619914770126, "step": 45 }, { "epoch": 0.03441181971198803, "grad_norm": 103.16928677372175, "learning_rate": 4.999816090185887e-07, "logps/chosen": -92.38207244873047, "logps/rejected": -106.74534606933594, "loss": 0.6919, "losses/dpo": 0.7002678513526917, "losses/sft": 1.0810365676879883, "losses/total": 0.7002678513526917, "ref_logps/chosen": -92.17179870605469, "ref_logps/rejected": -106.50029754638672, "rewards/accuracies": 0.5, "rewards/chosen": -0.02102803625166416, "rewards/margins": 0.0034769480116665363, "rewards/rejected": -0.02450498379766941, "step": 46 }, { "epoch": 0.03515990274920516, "grad_norm": 73.30517094003568, "learning_rate": 4.999735171296372e-07, "logps/chosen": -89.39765930175781, "logps/rejected": -87.70747375488281, "loss": 0.6896, "losses/dpo": 0.6841588616371155, "losses/sft": 1.128448724746704, "losses/total": 0.6841588616371155, "ref_logps/chosen": -89.29096984863281, "ref_logps/rejected": -87.5239028930664, "rewards/accuracies": 0.5625, "rewards/chosen": -0.010668843984603882, "rewards/margins": 0.00768826249986887, "rewards/rejected": -0.018357107415795326, "step": 47 }, { "epoch": 0.03590798578642229, "grad_norm": 60.049199643228874, "learning_rate": 4.999639541007116e-07, "logps/chosen": -70.51936340332031, "logps/rejected": -74.22663879394531, "loss": 0.6921, "losses/dpo": 0.703823983669281, "losses/sft": 0.7254171967506409, "losses/total": 0.703823983669281, "ref_logps/chosen": -70.37786102294922, "ref_logps/rejected": -74.0577621459961, "rewards/accuracies": 0.53125, "rewards/chosen": -0.014149850234389305, "rewards/margins": 0.002737160073593259, "rewards/rejected": -0.016887009143829346, "step": 48 }, { "epoch": 0.03665606882363942, "grad_norm": 78.46158806898525, "learning_rate": 4.999529199880923e-07, "logps/chosen": -97.47354125976562, "logps/rejected": -104.1199722290039, "loss": 0.6899, "losses/dpo": 0.6808444857597351, "losses/sft": 0.7124015688896179, "losses/total": 0.6808444857597351, "ref_logps/chosen": -97.28414916992188, "ref_logps/rejected": -103.85783386230469, "rewards/accuracies": 0.65625, "rewards/chosen": -0.018938392400741577, "rewards/margins": 0.0072755212895572186, "rewards/rejected": -0.026213916018605232, "step": 49 }, { "epoch": 0.037404151860856556, "grad_norm": 54.676997038306276, "learning_rate": 4.999404148567169e-07, "logps/chosen": -74.20542907714844, "logps/rejected": -80.48365783691406, "loss": 0.6943, "losses/dpo": 0.696114182472229, "losses/sft": 0.410934716463089, "losses/total": 0.696114182472229, "ref_logps/chosen": -74.03597259521484, "ref_logps/rejected": -80.32927703857422, "rewards/accuracies": 0.53125, "rewards/chosen": -0.016946006566286087, "rewards/margins": -0.0015072498936206102, "rewards/rejected": -0.01543875690549612, "step": 50 }, { "epoch": 0.038152234898073685, "grad_norm": 77.51000847948441, "learning_rate": 4.999264387801805e-07, "logps/chosen": -92.22940063476562, "logps/rejected": -100.2743911743164, "loss": 0.6886, "losses/dpo": 0.693362832069397, "losses/sft": 0.5452644228935242, "losses/total": 0.693362832069397, "ref_logps/chosen": -91.9690933227539, "ref_logps/rejected": -99.90816497802734, "rewards/accuracies": 0.5625, "rewards/chosen": -0.026029760017991066, "rewards/margins": 0.010592987760901451, "rewards/rejected": -0.03662274777889252, "step": 51 }, { "epoch": 0.03890031793529082, "grad_norm": 50.71475709815057, "learning_rate": 4.999109918407349e-07, "logps/chosen": -92.55049133300781, "logps/rejected": -107.47657775878906, "loss": 0.6962, "losses/dpo": 0.6893306970596313, "losses/sft": 0.8368744850158691, "losses/total": 0.6893306970596313, "ref_logps/chosen": -92.26712799072266, "ref_logps/rejected": -107.24708557128906, "rewards/accuracies": 0.5625, "rewards/chosen": -0.028336942195892334, "rewards/margins": -0.005388069897890091, "rewards/rejected": -0.022948872298002243, "step": 52 }, { "epoch": 0.03964840097250795, "grad_norm": 66.7611389295681, "learning_rate": 4.99894074129288e-07, "logps/chosen": -109.7945556640625, "logps/rejected": -117.54583740234375, "loss": 0.694, "losses/dpo": 0.7079731822013855, "losses/sft": 1.4070452451705933, "losses/total": 0.7079731822013855, "ref_logps/chosen": -109.56817626953125, "ref_logps/rejected": -117.32384490966797, "rewards/accuracies": 0.5625, "rewards/chosen": -0.022638272494077682, "rewards/margins": -0.0004387493245303631, "rewards/rejected": -0.022199522703886032, "step": 53 }, { "epoch": 0.04039648400972508, "grad_norm": 75.39706881008834, "learning_rate": 4.998756857454039e-07, "logps/chosen": -88.1456298828125, "logps/rejected": -114.71649169921875, "loss": 0.6815, "losses/dpo": 0.6876960396766663, "losses/sft": 0.8659732937812805, "losses/total": 0.6876960396766663, "ref_logps/chosen": -87.99959564208984, "ref_logps/rejected": -114.32479095458984, "rewards/accuracies": 0.59375, "rewards/chosen": -0.014603697694838047, "rewards/margins": 0.024566419422626495, "rewards/rejected": -0.03917011618614197, "step": 54 }, { "epoch": 0.041144567046942214, "grad_norm": 56.550691853683354, "learning_rate": 4.998558267973013e-07, "logps/chosen": -97.3410415649414, "logps/rejected": -101.69986724853516, "loss": 0.6906, "losses/dpo": 0.7019035220146179, "losses/sft": 0.5660150051116943, "losses/total": 0.7019035220146179, "ref_logps/chosen": -97.09984588623047, "ref_logps/rejected": -101.39913940429688, "rewards/accuracies": 0.53125, "rewards/chosen": -0.024120014160871506, "rewards/margins": 0.005952533334493637, "rewards/rejected": -0.030072549358010292, "step": 55 }, { "epoch": 0.04189265008415934, "grad_norm": 59.33852573598695, "learning_rate": 4.99834497401854e-07, "logps/chosen": -96.47964477539062, "logps/rejected": -97.14228057861328, "loss": 0.691, "losses/dpo": 0.6689102649688721, "losses/sft": 1.0339840650558472, "losses/total": 0.6689102649688721, "ref_logps/chosen": -96.18141174316406, "ref_logps/rejected": -96.79532623291016, "rewards/accuracies": 0.5, "rewards/chosen": -0.02982240915298462, "rewards/margins": 0.004873292520642281, "rewards/rejected": -0.03469569608569145, "step": 56 }, { "epoch": 0.04264073312137647, "grad_norm": 65.99850009890046, "learning_rate": 4.998116976845892e-07, "logps/chosen": -86.54103088378906, "logps/rejected": -93.3443832397461, "loss": 0.6954, "losses/dpo": 0.7138208746910095, "losses/sft": 0.4773041307926178, "losses/total": 0.7138208746910095, "ref_logps/chosen": -86.21240234375, "ref_logps/rejected": -93.05413055419922, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0328633077442646, "rewards/margins": -0.003838244127109647, "rewards/rejected": -0.029025062918663025, "step": 57 }, { "epoch": 0.04338881615859361, "grad_norm": 56.58801557282007, "learning_rate": 4.997874277796877e-07, "logps/chosen": -78.94558715820312, "logps/rejected": -84.19448852539062, "loss": 0.6972, "losses/dpo": 0.6957101821899414, "losses/sft": 0.4586362838745117, "losses/total": 0.6957101821899414, "ref_logps/chosen": -78.654541015625, "ref_logps/rejected": -83.97624969482422, "rewards/accuracies": 0.46875, "rewards/chosen": -0.029105044901371002, "rewards/margins": -0.007280820980668068, "rewards/rejected": -0.021824222058057785, "step": 58 }, { "epoch": 0.044136899195810736, "grad_norm": 54.552973786300896, "learning_rate": 4.997616878299821e-07, "logps/chosen": -101.95695495605469, "logps/rejected": -99.25685119628906, "loss": 0.6923, "losses/dpo": 0.7016672492027283, "losses/sft": 0.8859966397285461, "losses/total": 0.7016672492027283, "ref_logps/chosen": -101.65580749511719, "ref_logps/rejected": -98.9271240234375, "rewards/accuracies": 0.5625, "rewards/chosen": -0.030114391818642616, "rewards/margins": 0.002858144696801901, "rewards/rejected": -0.032972536981105804, "step": 59 }, { "epoch": 0.044884982233027865, "grad_norm": 68.53947600445728, "learning_rate": 4.997344779869566e-07, "logps/chosen": -111.48204040527344, "logps/rejected": -117.95887756347656, "loss": 0.6908, "losses/dpo": 0.6947684288024902, "losses/sft": 1.272596001625061, "losses/total": 0.6947684288024902, "ref_logps/chosen": -111.0802001953125, "ref_logps/rejected": -117.50035858154297, "rewards/accuracies": 0.53125, "rewards/chosen": -0.040184661746025085, "rewards/margins": 0.0056667206808924675, "rewards/rejected": -0.04585137963294983, "step": 60 }, { "epoch": 0.045633065270245, "grad_norm": 64.85035134413697, "learning_rate": 4.997057984107465e-07, "logps/chosen": -94.10345458984375, "logps/rejected": -94.69508361816406, "loss": 0.6889, "losses/dpo": 0.7077723145484924, "losses/sft": 2.0163440704345703, "losses/total": 0.7077723145484924, "ref_logps/chosen": -93.87616729736328, "ref_logps/rejected": -94.37178039550781, "rewards/accuracies": 0.5, "rewards/chosen": -0.022728189826011658, "rewards/margins": 0.009601864032447338, "rewards/rejected": -0.03233005851507187, "step": 61 }, { "epoch": 0.04638114830746213, "grad_norm": 57.033962103749666, "learning_rate": 4.996756492701362e-07, "logps/chosen": -75.93788146972656, "logps/rejected": -83.37911987304688, "loss": 0.683, "losses/dpo": 0.6577162146568298, "losses/sft": 1.2258440256118774, "losses/total": 0.6577162146568298, "ref_logps/chosen": -75.79696655273438, "ref_logps/rejected": -83.02735137939453, "rewards/accuracies": 0.625, "rewards/chosen": -0.014091501012444496, "rewards/margins": 0.021085266023874283, "rewards/rejected": -0.03517676889896393, "step": 62 }, { "epoch": 0.04712923134467926, "grad_norm": 64.88655149192442, "learning_rate": 4.996440307425587e-07, "logps/chosen": -97.60345458984375, "logps/rejected": -94.80174255371094, "loss": 0.6878, "losses/dpo": 0.6976509690284729, "losses/sft": 1.2811416387557983, "losses/total": 0.6976509690284729, "ref_logps/chosen": -97.32852172851562, "ref_logps/rejected": -94.41084289550781, "rewards/accuracies": 0.625, "rewards/chosen": -0.02749297395348549, "rewards/margins": 0.011596133932471275, "rewards/rejected": -0.039089106023311615, "step": 63 }, { "epoch": 0.047877314381896394, "grad_norm": 48.343044875069346, "learning_rate": 4.996109430140952e-07, "logps/chosen": -85.5622329711914, "logps/rejected": -100.84979248046875, "loss": 0.6991, "losses/dpo": 0.7026593685150146, "losses/sft": 0.92555171251297, "losses/total": 0.7026593685150146, "ref_logps/chosen": -85.15542602539062, "ref_logps/rejected": -100.55113220214844, "rewards/accuracies": 0.375, "rewards/chosen": -0.040680475533008575, "rewards/margins": -0.010813570581376553, "rewards/rejected": -0.029866904020309448, "step": 64 }, { "epoch": 0.04862539741911352, "grad_norm": 87.15478593178592, "learning_rate": 4.995763862794729e-07, "logps/chosen": -78.98066711425781, "logps/rejected": -82.49763488769531, "loss": 0.6959, "losses/dpo": 0.6794412136077881, "losses/sft": 0.8385435938835144, "losses/total": 0.6794412136077881, "ref_logps/chosen": -78.65986633300781, "ref_logps/rejected": -82.22047424316406, "rewards/accuracies": 0.4375, "rewards/chosen": -0.032080601900815964, "rewards/margins": -0.00436450494453311, "rewards/rejected": -0.027716096490621567, "step": 65 }, { "epoch": 0.04937348045633065, "grad_norm": 57.92067147366268, "learning_rate": 4.995403607420643e-07, "logps/chosen": -87.43882751464844, "logps/rejected": -91.94143676757812, "loss": 0.6985, "losses/dpo": 0.6896036267280579, "losses/sft": 1.0612897872924805, "losses/total": 0.6896036267280579, "ref_logps/chosen": -87.10023498535156, "ref_logps/rejected": -91.69383239746094, "rewards/accuracies": 0.375, "rewards/chosen": -0.033859796822071075, "rewards/margins": -0.009100079536437988, "rewards/rejected": -0.02475971169769764, "step": 66 }, { "epoch": 0.05012156349354779, "grad_norm": 49.188425251693666, "learning_rate": 4.995028666138866e-07, "logps/chosen": -93.0034408569336, "logps/rejected": -91.25230407714844, "loss": 0.6804, "losses/dpo": 0.6929285526275635, "losses/sft": 0.7129292488098145, "losses/total": 0.6929285526275635, "ref_logps/chosen": -92.73042297363281, "ref_logps/rejected": -90.71182250976562, "rewards/accuracies": 0.71875, "rewards/chosen": -0.027302134782075882, "rewards/margins": 0.0267474465072155, "rewards/rejected": -0.05404958128929138, "step": 67 }, { "epoch": 0.050869646530764916, "grad_norm": 47.48055633323434, "learning_rate": 4.994639041155993e-07, "logps/chosen": -88.83892822265625, "logps/rejected": -100.600341796875, "loss": 0.6855, "losses/dpo": 0.6678017377853394, "losses/sft": 0.9797214269638062, "losses/total": 0.6678017377853394, "ref_logps/chosen": -88.58214569091797, "ref_logps/rejected": -100.17912292480469, "rewards/accuracies": 0.71875, "rewards/chosen": -0.025678418576717377, "rewards/margins": 0.016443094238638878, "rewards/rejected": -0.042121514678001404, "step": 68 }, { "epoch": 0.051617729567982044, "grad_norm": 56.23986352081886, "learning_rate": 4.994234734765043e-07, "logps/chosen": -88.33454895019531, "logps/rejected": -94.43159484863281, "loss": 0.6832, "losses/dpo": 0.6898762583732605, "losses/sft": 1.0205817222595215, "losses/total": 0.6898762583732605, "ref_logps/chosen": -88.02110290527344, "ref_logps/rejected": -93.9108657836914, "rewards/accuracies": 0.65625, "rewards/chosen": -0.031344104558229446, "rewards/margins": 0.02072853595018387, "rewards/rejected": -0.05207264423370361, "step": 69 }, { "epoch": 0.05236581260519918, "grad_norm": 65.99786463086318, "learning_rate": 4.993815749345429e-07, "logps/chosen": -80.23670959472656, "logps/rejected": -87.66299438476562, "loss": 0.6909, "losses/dpo": 0.698851466178894, "losses/sft": 0.19171610474586487, "losses/total": 0.698851466178894, "ref_logps/chosen": -79.87513732910156, "ref_logps/rejected": -87.2525405883789, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03615737706422806, "rewards/margins": 0.004887028597295284, "rewards/rejected": -0.04104440659284592, "step": 70 }, { "epoch": 0.05311389564241631, "grad_norm": 75.7823031448648, "learning_rate": 4.993382087362959e-07, "logps/chosen": -100.08606719970703, "logps/rejected": -106.31314849853516, "loss": 0.6869, "losses/dpo": 0.6713548302650452, "losses/sft": 0.665778636932373, "losses/total": 0.6713548302650452, "ref_logps/chosen": -99.5086441040039, "ref_logps/rejected": -105.598388671875, "rewards/accuracies": 0.5, "rewards/chosen": -0.05774197727441788, "rewards/margins": 0.013735083863139153, "rewards/rejected": -0.07147706300020218, "step": 71 }, { "epoch": 0.05386197867963344, "grad_norm": 67.40288316350006, "learning_rate": 4.992933751369812e-07, "logps/chosen": -77.14959716796875, "logps/rejected": -78.16893005371094, "loss": 0.6826, "losses/dpo": 0.7096676826477051, "losses/sft": 0.6441861987113953, "losses/total": 0.7096676826477051, "ref_logps/chosen": -76.869873046875, "ref_logps/rejected": -77.66357421875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.027971874922513962, "rewards/margins": 0.02256390079855919, "rewards/rejected": -0.05053577572107315, "step": 72 }, { "epoch": 0.05461006171685057, "grad_norm": 75.67512765148375, "learning_rate": 4.99247074400453e-07, "logps/chosen": -66.48274993896484, "logps/rejected": -73.71775817871094, "loss": 0.6906, "losses/dpo": 0.6768147349357605, "losses/sft": 0.8260210752487183, "losses/total": 0.6768147349357605, "ref_logps/chosen": -66.20339965820312, "ref_logps/rejected": -73.38069915771484, "rewards/accuracies": 0.53125, "rewards/chosen": -0.027934866026043892, "rewards/margins": 0.00577153405174613, "rewards/rejected": -0.03370640054345131, "step": 73 }, { "epoch": 0.0553581447540677, "grad_norm": 62.50572535421763, "learning_rate": 4.991993067991995e-07, "logps/chosen": -85.81665802001953, "logps/rejected": -94.92948913574219, "loss": 0.6873, "losses/dpo": 0.7113600969314575, "losses/sft": 0.6777662038803101, "losses/total": 0.7113600969314575, "ref_logps/chosen": -85.32127380371094, "ref_logps/rejected": -94.29959106445312, "rewards/accuracies": 0.625, "rewards/chosen": -0.04953800141811371, "rewards/margins": 0.013450901955366135, "rewards/rejected": -0.06298890709877014, "step": 74 }, { "epoch": 0.05610622779128483, "grad_norm": 70.81936652619041, "learning_rate": 4.991500726143415e-07, "logps/chosen": -94.44608306884766, "logps/rejected": -91.65453338623047, "loss": 0.6928, "losses/dpo": 0.7131880521774292, "losses/sft": 0.8783124685287476, "losses/total": 0.7131880521774292, "ref_logps/chosen": -93.83908081054688, "ref_logps/rejected": -91.02897644042969, "rewards/accuracies": 0.53125, "rewards/chosen": -0.06070011854171753, "rewards/margins": 0.00185573217459023, "rewards/rejected": -0.06255584955215454, "step": 75 }, { "epoch": 0.05685431082850197, "grad_norm": 51.28045352407927, "learning_rate": 4.990993721356315e-07, "logps/chosen": -74.62962341308594, "logps/rejected": -87.84396362304688, "loss": 0.6876, "losses/dpo": 0.6730806231498718, "losses/sft": 0.5774669647216797, "losses/total": 0.6730806231498718, "ref_logps/chosen": -74.23600006103516, "ref_logps/rejected": -87.32610321044922, "rewards/accuracies": 0.59375, "rewards/chosen": -0.03936201333999634, "rewards/margins": 0.012424267828464508, "rewards/rejected": -0.051786281168460846, "step": 76 }, { "epoch": 0.057602393865719095, "grad_norm": 73.38404894371651, "learning_rate": 4.990472056614512e-07, "logps/chosen": -107.91146850585938, "logps/rejected": -99.65370178222656, "loss": 0.6956, "losses/dpo": 0.6743743419647217, "losses/sft": 0.6503850817680359, "losses/total": 0.6743743419647217, "ref_logps/chosen": -107.31517028808594, "ref_logps/rejected": -99.08279418945312, "rewards/accuracies": 0.5, "rewards/chosen": -0.059629812836647034, "rewards/margins": -0.0025398200377821922, "rewards/rejected": -0.05708999186754227, "step": 77 }, { "epoch": 0.058350476902936224, "grad_norm": 64.76811726264836, "learning_rate": 4.989935734988097e-07, "logps/chosen": -111.61673736572266, "logps/rejected": -116.91382598876953, "loss": 0.6964, "losses/dpo": 0.7121404409408569, "losses/sft": 1.1390495300292969, "losses/total": 0.7121404409408569, "ref_logps/chosen": -110.87055206298828, "ref_logps/rejected": -116.21685791015625, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07461907714605331, "rewards/margins": -0.004921893123537302, "rewards/rejected": -0.06969718635082245, "step": 78 }, { "epoch": 0.05909855994015336, "grad_norm": 56.33700245226397, "learning_rate": 4.989384759633421e-07, "logps/chosen": -75.70103454589844, "logps/rejected": -78.37271118164062, "loss": 0.6858, "losses/dpo": 0.6977310180664062, "losses/sft": 0.12895165383815765, "losses/total": 0.6977310180664062, "ref_logps/chosen": -75.32069396972656, "ref_logps/rejected": -77.83572387695312, "rewards/accuracies": 0.59375, "rewards/chosen": -0.03803490102291107, "rewards/margins": 0.015663912519812584, "rewards/rejected": -0.053698815405368805, "step": 79 }, { "epoch": 0.05984664297737049, "grad_norm": 63.322037591326826, "learning_rate": 4.988819133793076e-07, "logps/chosen": -88.87350463867188, "logps/rejected": -96.76344299316406, "loss": 0.6844, "losses/dpo": 0.666938066482544, "losses/sft": 0.6361566781997681, "losses/total": 0.666938066482544, "ref_logps/chosen": -88.37337493896484, "ref_logps/rejected": -96.07521057128906, "rewards/accuracies": 0.5625, "rewards/chosen": -0.05001307278871536, "rewards/margins": 0.01881011575460434, "rewards/rejected": -0.0688231885433197, "step": 80 }, { "epoch": 0.06059472601458762, "grad_norm": 66.7902240212652, "learning_rate": 4.988238860795872e-07, "logps/chosen": -91.89390563964844, "logps/rejected": -108.00592041015625, "loss": 0.6818, "losses/dpo": 0.6747066974639893, "losses/sft": 1.0494171380996704, "losses/total": 0.6747066974639893, "ref_logps/chosen": -91.37062072753906, "ref_logps/rejected": -107.23461151123047, "rewards/accuracies": 0.5625, "rewards/chosen": -0.05232900381088257, "rewards/margins": 0.024800993502140045, "rewards/rejected": -0.07713000476360321, "step": 81 }, { "epoch": 0.06134280905180475, "grad_norm": 62.57921513478238, "learning_rate": 4.987643944056824e-07, "logps/chosen": -96.8008041381836, "logps/rejected": -105.08617401123047, "loss": 0.6841, "losses/dpo": 0.670839250087738, "losses/sft": 0.6522030830383301, "losses/total": 0.670839250087738, "ref_logps/chosen": -96.32463073730469, "ref_logps/rejected": -104.41156005859375, "rewards/accuracies": 0.59375, "rewards/chosen": -0.04761672019958496, "rewards/margins": 0.019844714552164078, "rewards/rejected": -0.06746143102645874, "step": 82 }, { "epoch": 0.06209089208902188, "grad_norm": 59.90539992738723, "learning_rate": 4.987034387077125e-07, "logps/chosen": -108.28489685058594, "logps/rejected": -126.1627426147461, "loss": 0.6731, "losses/dpo": 0.6596886515617371, "losses/sft": 0.920014500617981, "losses/total": 0.6596886515617371, "ref_logps/chosen": -107.74777221679688, "ref_logps/rejected": -125.20220184326172, "rewards/accuracies": 0.6875, "rewards/chosen": -0.05371224135160446, "rewards/margins": 0.04234126955270767, "rewards/rejected": -0.09605351090431213, "step": 83 }, { "epoch": 0.06283897512623901, "grad_norm": 81.97272428592007, "learning_rate": 4.98641019344413e-07, "logps/chosen": -98.74504089355469, "logps/rejected": -118.4371566772461, "loss": 0.677, "losses/dpo": 0.6500980854034424, "losses/sft": 1.059922695159912, "losses/total": 0.6500980854034424, "ref_logps/chosen": -98.16727447509766, "ref_logps/rejected": -117.5202407836914, "rewards/accuracies": 0.6875, "rewards/chosen": -0.057776156812906265, "rewards/margins": 0.03391571715474129, "rewards/rejected": -0.09169187396764755, "step": 84 }, { "epoch": 0.06358705816345614, "grad_norm": 78.52467796553555, "learning_rate": 4.985771366831332e-07, "logps/chosen": -74.18081665039062, "logps/rejected": -88.36016082763672, "loss": 0.6682, "losses/dpo": 0.6582998037338257, "losses/sft": 1.0659692287445068, "losses/total": 0.6582998037338257, "ref_logps/chosen": -73.88983154296875, "ref_logps/rejected": -87.5443115234375, "rewards/accuracies": 0.75, "rewards/chosen": -0.029097914695739746, "rewards/margins": 0.052486494183540344, "rewards/rejected": -0.08158441632986069, "step": 85 }, { "epoch": 0.06433514120067327, "grad_norm": 72.80380390939733, "learning_rate": 4.985117910998344e-07, "logps/chosen": -66.82148742675781, "logps/rejected": -64.92509460449219, "loss": 0.6981, "losses/dpo": 0.6858033537864685, "losses/sft": 0.7866793870925903, "losses/total": 0.6858033537864685, "ref_logps/chosen": -66.33202362060547, "ref_logps/rejected": -64.52741241455078, "rewards/accuracies": 0.4375, "rewards/chosen": -0.048946887254714966, "rewards/margins": -0.009178864769637585, "rewards/rejected": -0.039768025279045105, "step": 86 }, { "epoch": 0.06508322423789041, "grad_norm": 65.705312170395, "learning_rate": 4.984449829790873e-07, "logps/chosen": -95.60189056396484, "logps/rejected": -101.1171646118164, "loss": 0.6876, "losses/dpo": 0.6801273822784424, "losses/sft": 1.283775806427002, "losses/total": 0.6801273822784424, "ref_logps/chosen": -94.94813537597656, "ref_logps/rejected": -100.33403778076172, "rewards/accuracies": 0.4375, "rewards/chosen": -0.06537526845932007, "rewards/margins": 0.012937350198626518, "rewards/rejected": -0.07831262052059174, "step": 87 }, { "epoch": 0.06583130727510754, "grad_norm": 66.84256392715615, "learning_rate": 4.983767127140698e-07, "logps/chosen": -98.46894836425781, "logps/rejected": -111.51509857177734, "loss": 0.6779, "losses/dpo": 0.6488788723945618, "losses/sft": 0.9031060934066772, "losses/total": 0.6488788723945618, "ref_logps/chosen": -97.87342071533203, "ref_logps/rejected": -110.58041381835938, "rewards/accuracies": 0.625, "rewards/chosen": -0.0595526359975338, "rewards/margins": 0.03391527757048607, "rewards/rejected": -0.09346791356801987, "step": 88 }, { "epoch": 0.06657939031232467, "grad_norm": 71.87589471674775, "learning_rate": 4.983069807065651e-07, "logps/chosen": -102.52367401123047, "logps/rejected": -101.11985778808594, "loss": 0.6985, "losses/dpo": 0.6811867356300354, "losses/sft": 0.5975565314292908, "losses/total": 0.6811867356300354, "ref_logps/chosen": -101.83409118652344, "ref_logps/rejected": -100.51335144042969, "rewards/accuracies": 0.5, "rewards/chosen": -0.06895739585161209, "rewards/margins": -0.00830664299428463, "rewards/rejected": -0.06065075099468231, "step": 89 }, { "epoch": 0.0673274733495418, "grad_norm": 60.313333614300596, "learning_rate": 4.982357873669588e-07, "logps/chosen": -95.91080474853516, "logps/rejected": -100.54303741455078, "loss": 0.6859, "losses/dpo": 0.6559918522834778, "losses/sft": 0.692446231842041, "losses/total": 0.6559918522834778, "ref_logps/chosen": -95.4013671875, "ref_logps/rejected": -99.86360931396484, "rewards/accuracies": 0.53125, "rewards/chosen": -0.05094433203339577, "rewards/margins": 0.016997840255498886, "rewards/rejected": -0.06794217228889465, "step": 90 }, { "epoch": 0.06807555638675893, "grad_norm": 69.83363089425991, "learning_rate": 4.981631331142367e-07, "logps/chosen": -86.53044891357422, "logps/rejected": -88.71197509765625, "loss": 0.6943, "losses/dpo": 0.6911967396736145, "losses/sft": 0.7096028923988342, "losses/total": 0.6911967396736145, "ref_logps/chosen": -85.82479095458984, "ref_logps/rejected": -88.00960540771484, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0705660730600357, "rewards/margins": -0.00032919086515903473, "rewards/rejected": -0.07023688405752182, "step": 91 }, { "epoch": 0.06882363942397605, "grad_norm": 50.47320602161268, "learning_rate": 4.980890183759825e-07, "logps/chosen": -73.81181335449219, "logps/rejected": -91.15997314453125, "loss": 0.6762, "losses/dpo": 0.7062295079231262, "losses/sft": 0.28434550762176514, "losses/total": 0.7062295079231262, "ref_logps/chosen": -73.39132690429688, "ref_logps/rejected": -90.37786865234375, "rewards/accuracies": 0.65625, "rewards/chosen": -0.04204845428466797, "rewards/margins": 0.03616296499967575, "rewards/rejected": -0.07821141928434372, "step": 92 }, { "epoch": 0.0695717224611932, "grad_norm": 54.73014552539818, "learning_rate": 4.980134435883749e-07, "logps/chosen": -95.4791488647461, "logps/rejected": -100.53861999511719, "loss": 0.6774, "losses/dpo": 0.6859480142593384, "losses/sft": 0.791650116443634, "losses/total": 0.6859480142593384, "ref_logps/chosen": -94.8549575805664, "ref_logps/rejected": -99.5773696899414, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06241985782980919, "rewards/margins": 0.0337047204375267, "rewards/rejected": -0.0961245745420456, "step": 93 }, { "epoch": 0.07031980549841033, "grad_norm": 56.3163679955879, "learning_rate": 4.979364091961855e-07, "logps/chosen": -78.08065795898438, "logps/rejected": -93.63600158691406, "loss": 0.6773, "losses/dpo": 0.6644335389137268, "losses/sft": 1.045114517211914, "losses/total": 0.6644335389137268, "ref_logps/chosen": -77.53814697265625, "ref_logps/rejected": -92.75614929199219, "rewards/accuracies": 0.59375, "rewards/chosen": -0.05425192043185234, "rewards/margins": 0.033732835203409195, "rewards/rejected": -0.08798475563526154, "step": 94 }, { "epoch": 0.07106788853562745, "grad_norm": 185.0654204298231, "learning_rate": 4.978579156527758e-07, "logps/chosen": -82.78665924072266, "logps/rejected": -92.92390441894531, "loss": 0.6786, "losses/dpo": 0.7137663960456848, "losses/sft": 1.143051266670227, "losses/total": 0.7137663960456848, "ref_logps/chosen": -82.29136657714844, "ref_logps/rejected": -92.11564636230469, "rewards/accuracies": 0.65625, "rewards/chosen": -0.049528833478689194, "rewards/margins": 0.03129799664020538, "rewards/rejected": -0.08082681894302368, "step": 95 }, { "epoch": 0.07181597157284458, "grad_norm": 61.95656223839921, "learning_rate": 4.977779634200946e-07, "logps/chosen": -106.30671691894531, "logps/rejected": -115.40925598144531, "loss": 0.6828, "losses/dpo": 0.6933457255363464, "losses/sft": 0.322223961353302, "losses/total": 0.6933457255363464, "ref_logps/chosen": -105.44669342041016, "ref_logps/rejected": -114.3156509399414, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08600245416164398, "rewards/margins": 0.023357883095741272, "rewards/rejected": -0.10936033725738525, "step": 96 }, { "epoch": 0.07256405461006171, "grad_norm": 55.377156820552614, "learning_rate": 4.976965529686756e-07, "logps/chosen": -87.31617736816406, "logps/rejected": -99.21058654785156, "loss": 0.6701, "losses/dpo": 0.6279661655426025, "losses/sft": 1.0962822437286377, "losses/total": 0.6279661655426025, "ref_logps/chosen": -86.67822265625, "ref_logps/rejected": -98.06147766113281, "rewards/accuracies": 0.59375, "rewards/chosen": -0.06379528343677521, "rewards/margins": 0.051116280257701874, "rewards/rejected": -0.11491156369447708, "step": 97 }, { "epoch": 0.07331213764727884, "grad_norm": 67.79994549538101, "learning_rate": 4.976136847776338e-07, "logps/chosen": -103.6918716430664, "logps/rejected": -110.78470611572266, "loss": 0.6817, "losses/dpo": 0.6786997318267822, "losses/sft": 0.7206512093544006, "losses/total": 0.6786997318267822, "ref_logps/chosen": -102.85965728759766, "ref_logps/rejected": -109.69694519042969, "rewards/accuracies": 0.625, "rewards/chosen": -0.08322081714868546, "rewards/margins": 0.025553930550813675, "rewards/rejected": -0.10877473652362823, "step": 98 }, { "epoch": 0.07406022068449598, "grad_norm": 86.12876250487602, "learning_rate": 4.975293593346643e-07, "logps/chosen": -130.41087341308594, "logps/rejected": -127.66023254394531, "loss": 0.6739, "losses/dpo": 0.691582202911377, "losses/sft": 1.7708208560943604, "losses/total": 0.691582202911377, "ref_logps/chosen": -129.33041381835938, "ref_logps/rejected": -126.15959167480469, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1080462783575058, "rewards/margins": 0.04201740771532059, "rewards/rejected": -0.1500636786222458, "step": 99 }, { "epoch": 0.07480830372171311, "grad_norm": 74.91690569518771, "learning_rate": 4.974435771360376e-07, "logps/chosen": -74.93022155761719, "logps/rejected": -74.41163635253906, "loss": 0.6889, "losses/dpo": 0.7068099975585938, "losses/sft": 0.5260930061340332, "losses/total": 0.7068099975585938, "ref_logps/chosen": -74.15255737304688, "ref_logps/rejected": -73.52828979492188, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07776672393083572, "rewards/margins": 0.010567913763225079, "rewards/rejected": -0.08833464235067368, "step": 100 }, { "epoch": 0.07555638675893024, "grad_norm": 64.05353463311741, "learning_rate": 4.973563386865974e-07, "logps/chosen": -80.50483703613281, "logps/rejected": -97.04532623291016, "loss": 0.6599, "losses/dpo": 0.6590644717216492, "losses/sft": 0.6268046498298645, "losses/total": 0.6590644717216492, "ref_logps/chosen": -79.99922180175781, "ref_logps/rejected": -95.83522033691406, "rewards/accuracies": 0.75, "rewards/chosen": -0.05056170001626015, "rewards/margins": 0.07044855505228043, "rewards/rejected": -0.12101025134325027, "step": 101 }, { "epoch": 0.07630446979614737, "grad_norm": 71.56214143754842, "learning_rate": 4.972676444997583e-07, "logps/chosen": -72.7188949584961, "logps/rejected": -80.65179443359375, "loss": 0.6863, "losses/dpo": 0.7365929484367371, "losses/sft": 0.485510915517807, "losses/total": 0.7365929484367371, "ref_logps/chosen": -71.81455993652344, "ref_logps/rejected": -79.56739807128906, "rewards/accuracies": 0.5, "rewards/chosen": -0.09043414890766144, "rewards/margins": 0.01800503209233284, "rewards/rejected": -0.10843917727470398, "step": 102 }, { "epoch": 0.0770525528333645, "grad_norm": 61.33439822240008, "learning_rate": 4.971774950975015e-07, "logps/chosen": -103.31317901611328, "logps/rejected": -110.69197845458984, "loss": 0.6683, "losses/dpo": 0.6355347633361816, "losses/sft": 0.7201902270317078, "losses/total": 0.6355347633361816, "ref_logps/chosen": -102.60247802734375, "ref_logps/rejected": -109.44808197021484, "rewards/accuracies": 0.71875, "rewards/chosen": -0.07107002288103104, "rewards/margins": 0.053320012986660004, "rewards/rejected": -0.12439003586769104, "step": 103 }, { "epoch": 0.07780063587058164, "grad_norm": 79.43470565959389, "learning_rate": 4.97085891010373e-07, "logps/chosen": -109.30094909667969, "logps/rejected": -123.46762084960938, "loss": 0.6713, "losses/dpo": 0.6458479762077332, "losses/sft": 0.4618457853794098, "losses/total": 0.6458479762077332, "ref_logps/chosen": -108.29540252685547, "ref_logps/rejected": -121.97810363769531, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10055407881736755, "rewards/margins": 0.04839760810136795, "rewards/rejected": -0.1489516794681549, "step": 104 }, { "epoch": 0.07854871890779877, "grad_norm": 100.11676456333804, "learning_rate": 4.969928327774797e-07, "logps/chosen": -88.98631286621094, "logps/rejected": -94.40913391113281, "loss": 0.6715, "losses/dpo": 0.7077957391738892, "losses/sft": 0.29774361848831177, "losses/total": 0.7077957391738892, "ref_logps/chosen": -88.1948471069336, "ref_logps/rejected": -93.13722229003906, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07914724946022034, "rewards/margins": 0.048043906688690186, "rewards/rejected": -0.12719115614891052, "step": 105 }, { "epoch": 0.0792968019450159, "grad_norm": 86.83304132667494, "learning_rate": 4.968983209464862e-07, "logps/chosen": -80.76592254638672, "logps/rejected": -89.24547576904297, "loss": 0.6673, "losses/dpo": 0.6833786368370056, "losses/sft": 0.8973764181137085, "losses/total": 0.6833786368370056, "ref_logps/chosen": -80.03070068359375, "ref_logps/rejected": -87.94866943359375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07352153211832047, "rewards/margins": 0.05615898221731186, "rewards/rejected": -0.12968051433563232, "step": 106 }, { "epoch": 0.08004488498223303, "grad_norm": 53.67490453262972, "learning_rate": 4.968023560736121e-07, "logps/chosen": -82.72154235839844, "logps/rejected": -93.75114440917969, "loss": 0.6927, "losses/dpo": 0.6981375813484192, "losses/sft": 0.22609184682369232, "losses/total": 0.6981375813484192, "ref_logps/chosen": -81.68257904052734, "ref_logps/rejected": -92.66921997070312, "rewards/accuracies": 0.5625, "rewards/chosen": -0.10389627516269684, "rewards/margins": 0.004295450169593096, "rewards/rejected": -0.10819172859191895, "step": 107 }, { "epoch": 0.08079296801945016, "grad_norm": 76.67300628766606, "learning_rate": 4.96704938723628e-07, "logps/chosen": -102.84796905517578, "logps/rejected": -108.7225341796875, "loss": 0.6914, "losses/dpo": 0.7207042574882507, "losses/sft": 0.8529693484306335, "losses/total": 0.7207042574882507, "ref_logps/chosen": -101.55142211914062, "ref_logps/rejected": -107.36907196044922, "rewards/accuracies": 0.59375, "rewards/chosen": -0.12965452671051025, "rewards/margins": 0.005692359525710344, "rewards/rejected": -0.1353469043970108, "step": 108 }, { "epoch": 0.08154105105666729, "grad_norm": 100.12104696216377, "learning_rate": 4.966060694698532e-07, "logps/chosen": -91.55227661132812, "logps/rejected": -95.9395980834961, "loss": 0.6777, "losses/dpo": 0.6511812210083008, "losses/sft": 0.7598628997802734, "losses/total": 0.6511812210083008, "ref_logps/chosen": -90.46266174316406, "ref_logps/rejected": -94.51873779296875, "rewards/accuracies": 0.65625, "rewards/chosen": -0.10896223783493042, "rewards/margins": 0.03312448039650917, "rewards/rejected": -0.1420867145061493, "step": 109 }, { "epoch": 0.08228913409388443, "grad_norm": 48.2512698624084, "learning_rate": 4.965057488941513e-07, "logps/chosen": -82.87835693359375, "logps/rejected": -82.44159698486328, "loss": 0.6949, "losses/dpo": 0.6899063587188721, "losses/sft": 0.9825999140739441, "losses/total": 0.6899063587188721, "ref_logps/chosen": -82.02870178222656, "ref_logps/rejected": -81.59894561767578, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08496502786874771, "rewards/margins": -0.0007002444472163916, "rewards/rejected": -0.08426479250192642, "step": 110 }, { "epoch": 0.08303721713110156, "grad_norm": 53.40237280542307, "learning_rate": 4.964039775869271e-07, "logps/chosen": -92.53004455566406, "logps/rejected": -107.21756744384766, "loss": 0.6697, "losses/dpo": 0.7204440832138062, "losses/sft": 1.3529318571090698, "losses/total": 0.7204440832138062, "ref_logps/chosen": -91.29595947265625, "ref_logps/rejected": -105.45274353027344, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12340886890888214, "rewards/margins": 0.053074028342962265, "rewards/rejected": -0.1764829009771347, "step": 111 }, { "epoch": 0.08378530016831869, "grad_norm": 87.60371199106952, "learning_rate": 4.963007561471235e-07, "logps/chosen": -104.42776489257812, "logps/rejected": -107.78184509277344, "loss": 0.6724, "losses/dpo": 0.6549930572509766, "losses/sft": 0.501497209072113, "losses/total": 0.6549930572509766, "ref_logps/chosen": -103.24819946289062, "ref_logps/rejected": -106.12086486816406, "rewards/accuracies": 0.625, "rewards/chosen": -0.11795705556869507, "rewards/margins": 0.04814044386148453, "rewards/rejected": -0.1660975217819214, "step": 112 }, { "epoch": 0.08453338320553581, "grad_norm": 60.45216240687134, "learning_rate": 4.961960851822176e-07, "logps/chosen": -87.74386596679688, "logps/rejected": -104.56695556640625, "loss": 0.6659, "losses/dpo": 0.624608039855957, "losses/sft": 0.6458165049552917, "losses/total": 0.624608039855957, "ref_logps/chosen": -86.51728057861328, "ref_logps/rejected": -102.73149108886719, "rewards/accuracies": 0.65625, "rewards/chosen": -0.12265896797180176, "rewards/margins": 0.06088685244321823, "rewards/rejected": -0.1835458129644394, "step": 113 }, { "epoch": 0.08528146624275294, "grad_norm": 166.19651344764523, "learning_rate": 4.960899653082173e-07, "logps/chosen": -100.76226806640625, "logps/rejected": -101.04169464111328, "loss": 0.6876, "losses/dpo": 0.6737421751022339, "losses/sft": 0.8412036895751953, "losses/total": 0.6737421751022339, "ref_logps/chosen": -99.42174530029297, "ref_logps/rejected": -99.52098083496094, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1340520828962326, "rewards/margins": 0.01801922731101513, "rewards/rejected": -0.15207132697105408, "step": 114 }, { "epoch": 0.08602954927997007, "grad_norm": 55.87954932345798, "learning_rate": 4.959823971496574e-07, "logps/chosen": -71.24462127685547, "logps/rejected": -86.14532470703125, "loss": 0.661, "losses/dpo": 0.640383780002594, "losses/sft": 0.52694171667099, "losses/total": 0.640383780002594, "ref_logps/chosen": -70.31541442871094, "ref_logps/rejected": -84.52534484863281, "rewards/accuracies": 0.75, "rewards/chosen": -0.09292107820510864, "rewards/margins": 0.06907765567302704, "rewards/rejected": -0.16199873387813568, "step": 115 }, { "epoch": 0.08677763231718721, "grad_norm": 100.1165967979672, "learning_rate": 4.958733813395962e-07, "logps/chosen": -87.6973876953125, "logps/rejected": -97.9770278930664, "loss": 0.6786, "losses/dpo": 0.6500837206840515, "losses/sft": 0.6048222780227661, "losses/total": 0.6500837206840515, "ref_logps/chosen": -86.62403106689453, "ref_logps/rejected": -96.54557800292969, "rewards/accuracies": 0.59375, "rewards/chosen": -0.10733576864004135, "rewards/margins": 0.03580884635448456, "rewards/rejected": -0.1431446224451065, "step": 116 }, { "epoch": 0.08752571535440434, "grad_norm": 72.15921930322168, "learning_rate": 4.95762918519612e-07, "logps/chosen": -84.52169799804688, "logps/rejected": -94.06840515136719, "loss": 0.6667, "losses/dpo": 0.6510855555534363, "losses/sft": 0.6643170118331909, "losses/total": 0.6510855555534363, "ref_logps/chosen": -83.56491088867188, "ref_logps/rejected": -92.51815032958984, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09567861258983612, "rewards/margins": 0.05934703350067139, "rewards/rejected": -0.1550256311893463, "step": 117 }, { "epoch": 0.08827379839162147, "grad_norm": 56.611923661586374, "learning_rate": 4.956510093397983e-07, "logps/chosen": -91.15338134765625, "logps/rejected": -100.63397216796875, "loss": 0.6827, "losses/dpo": 0.6635411381721497, "losses/sft": 1.070958137512207, "losses/total": 0.6635411381721497, "ref_logps/chosen": -89.86223602294922, "ref_logps/rejected": -99.06715393066406, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12911465764045715, "rewards/margins": 0.02756778709590435, "rewards/rejected": -0.15668244659900665, "step": 118 }, { "epoch": 0.0890218814288386, "grad_norm": 55.09337817935897, "learning_rate": 4.955376544587615e-07, "logps/chosen": -104.65599822998047, "logps/rejected": -116.99227142333984, "loss": 0.6583, "losses/dpo": 0.6148874759674072, "losses/sft": 1.4517148733139038, "losses/total": 0.6148874759674072, "ref_logps/chosen": -103.10497283935547, "ref_logps/rejected": -114.62989807128906, "rewards/accuracies": 0.75, "rewards/chosen": -0.15510162711143494, "rewards/margins": 0.08113616704940796, "rewards/rejected": -0.2362377792596817, "step": 119 }, { "epoch": 0.08976996446605573, "grad_norm": 47.61258025940019, "learning_rate": 4.954228545436156e-07, "logps/chosen": -69.296630859375, "logps/rejected": -74.24674987792969, "loss": 0.6708, "losses/dpo": 0.6315573453903198, "losses/sft": 0.8897170424461365, "losses/total": 0.6315573453903198, "ref_logps/chosen": -68.50445556640625, "ref_logps/rejected": -72.93648529052734, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07921842485666275, "rewards/margins": 0.051808152347803116, "rewards/rejected": -0.13102658092975616, "step": 120 }, { "epoch": 0.09051804750327286, "grad_norm": 61.24328103094695, "learning_rate": 4.953066102699795e-07, "logps/chosen": -89.46236419677734, "logps/rejected": -89.00853729248047, "loss": 0.702, "losses/dpo": 0.7374367713928223, "losses/sft": 1.2203149795532227, "losses/total": 0.7374367713928223, "ref_logps/chosen": -87.9637222290039, "ref_logps/rejected": -87.63744354248047, "rewards/accuracies": 0.4375, "rewards/chosen": -0.1498635709285736, "rewards/margins": -0.012754621915519238, "rewards/rejected": -0.1371089518070221, "step": 121 }, { "epoch": 0.09126613054049, "grad_norm": 62.78467413383069, "learning_rate": 4.951889223219717e-07, "logps/chosen": -105.12085723876953, "logps/rejected": -114.30026245117188, "loss": 0.6633, "losses/dpo": 0.6107309460639954, "losses/sft": 0.5436482429504395, "losses/total": 0.6107309460639954, "ref_logps/chosen": -103.68383026123047, "ref_logps/rejected": -112.17886352539062, "rewards/accuracies": 0.6875, "rewards/chosen": -0.14370261132717133, "rewards/margins": 0.06843708455562592, "rewards/rejected": -0.21213969588279724, "step": 122 }, { "epoch": 0.09201421357770713, "grad_norm": 68.62057227909783, "learning_rate": 4.950697913922075e-07, "logps/chosen": -97.43536376953125, "logps/rejected": -99.91390991210938, "loss": 0.6734, "losses/dpo": 0.585090160369873, "losses/sft": 1.1336805820465088, "losses/total": 0.585090160369873, "ref_logps/chosen": -96.06147766113281, "ref_logps/rejected": -98.05244445800781, "rewards/accuracies": 0.71875, "rewards/chosen": -0.13738934695720673, "rewards/margins": 0.048757750540971756, "rewards/rejected": -0.18614710867404938, "step": 123 }, { "epoch": 0.09276229661492426, "grad_norm": 59.35928281258265, "learning_rate": 4.949492181817943e-07, "logps/chosen": -106.76986694335938, "logps/rejected": -124.33409118652344, "loss": 0.6664, "losses/dpo": 0.7265645265579224, "losses/sft": 0.7230411767959595, "losses/total": 0.7265645265579224, "ref_logps/chosen": -104.97098541259766, "ref_logps/rejected": -121.89129638671875, "rewards/accuracies": 0.59375, "rewards/chosen": -0.17988818883895874, "rewards/margins": 0.06439057737588882, "rewards/rejected": -0.24427875876426697, "step": 124 }, { "epoch": 0.09351037965214139, "grad_norm": 49.788287003656635, "learning_rate": 4.948272034003275e-07, "logps/chosen": -69.7929916381836, "logps/rejected": -81.24292755126953, "loss": 0.6523, "losses/dpo": 0.6400548219680786, "losses/sft": 0.3020136058330536, "losses/total": 0.6400548219680786, "ref_logps/chosen": -68.77619934082031, "ref_logps/rejected": -79.3360595703125, "rewards/accuracies": 0.75, "rewards/chosen": -0.10167922079563141, "rewards/margins": 0.08900698274374008, "rewards/rejected": -0.1906861960887909, "step": 125 }, { "epoch": 0.09425846268935852, "grad_norm": 78.46080334773565, "learning_rate": 4.947037477658864e-07, "logps/chosen": -110.53145599365234, "logps/rejected": -110.05540466308594, "loss": 0.6652, "losses/dpo": 0.7279879450798035, "losses/sft": 1.1443910598754883, "losses/total": 0.7279879450798035, "ref_logps/chosen": -108.76507568359375, "ref_logps/rejected": -107.66458129882812, "rewards/accuracies": 0.71875, "rewards/chosen": -0.17663787305355072, "rewards/margins": 0.06244327872991562, "rewards/rejected": -0.23908114433288574, "step": 126 }, { "epoch": 0.09500654572657564, "grad_norm": 65.21038304074122, "learning_rate": 4.945788520050301e-07, "logps/chosen": -87.62205505371094, "logps/rejected": -90.99200439453125, "loss": 0.6813, "losses/dpo": 0.7071346640586853, "losses/sft": 0.5275065302848816, "losses/total": 0.7071346640586853, "ref_logps/chosen": -86.02410888671875, "ref_logps/rejected": -89.10859680175781, "rewards/accuracies": 0.59375, "rewards/chosen": -0.15979382395744324, "rewards/margins": 0.028547586873173714, "rewards/rejected": -0.1883414089679718, "step": 127 }, { "epoch": 0.09575462876379279, "grad_norm": 60.346847541600944, "learning_rate": 4.944525168527931e-07, "logps/chosen": -71.22399139404297, "logps/rejected": -79.74027252197266, "loss": 0.6732, "losses/dpo": 0.6760457158088684, "losses/sft": 0.34599775075912476, "losses/total": 0.6760457158088684, "ref_logps/chosen": -70.46864318847656, "ref_logps/rejected": -78.45245361328125, "rewards/accuracies": 0.5, "rewards/chosen": -0.07553420960903168, "rewards/margins": 0.05324753373861313, "rewards/rejected": -0.1287817507982254, "step": 128 }, { "epoch": 0.09650271180100992, "grad_norm": 45.62834988145372, "learning_rate": 4.943247430526809e-07, "logps/chosen": -83.1222915649414, "logps/rejected": -84.09970092773438, "loss": 0.6817, "losses/dpo": 0.7453728318214417, "losses/sft": 0.46439453959465027, "losses/total": 0.7453728318214417, "ref_logps/chosen": -81.83456420898438, "ref_logps/rejected": -82.438720703125, "rewards/accuracies": 0.40625, "rewards/chosen": -0.12877212464809418, "rewards/margins": 0.03732652962207794, "rewards/rejected": -0.16609865427017212, "step": 129 }, { "epoch": 0.09725079483822704, "grad_norm": 74.64337110987802, "learning_rate": 4.941955313566656e-07, "logps/chosen": -86.57962036132812, "logps/rejected": -93.73190307617188, "loss": 0.6753, "losses/dpo": 0.5975279808044434, "losses/sft": 0.8834252953529358, "losses/total": 0.5975279808044434, "ref_logps/chosen": -84.85720825195312, "ref_logps/rejected": -91.56071472167969, "rewards/accuracies": 0.5625, "rewards/chosen": -0.17224165797233582, "rewards/margins": 0.044876500964164734, "rewards/rejected": -0.21711814403533936, "step": 130 }, { "epoch": 0.09799887787544417, "grad_norm": 86.90917688383999, "learning_rate": 4.94064882525182e-07, "logps/chosen": -93.2861328125, "logps/rejected": -92.44575500488281, "loss": 0.7033, "losses/dpo": 0.7257283926010132, "losses/sft": 1.0290143489837646, "losses/total": 0.7257283926010132, "ref_logps/chosen": -91.15612030029297, "ref_logps/rejected": -90.46290588378906, "rewards/accuracies": 0.53125, "rewards/chosen": -0.2130013108253479, "rewards/margins": -0.01471691019833088, "rewards/rejected": -0.19828438758850098, "step": 131 }, { "epoch": 0.0987469609126613, "grad_norm": 69.2287202496006, "learning_rate": 4.939327973271221e-07, "logps/chosen": -90.3099365234375, "logps/rejected": -94.13520812988281, "loss": 0.6961, "losses/dpo": 0.7045786380767822, "losses/sft": 0.7416899800300598, "losses/total": 0.7045786380767822, "ref_logps/chosen": -88.31343078613281, "ref_logps/rejected": -92.11752319335938, "rewards/accuracies": 0.46875, "rewards/chosen": -0.19965049624443054, "rewards/margins": 0.0021174438297748566, "rewards/rejected": -0.2017679512500763, "step": 132 }, { "epoch": 0.09949504394987843, "grad_norm": 51.27685875836551, "learning_rate": 4.937992765398316e-07, "logps/chosen": -85.92092895507812, "logps/rejected": -83.22978210449219, "loss": 0.703, "losses/dpo": 0.7032231688499451, "losses/sft": 0.965686023235321, "losses/total": 0.7032231688499451, "ref_logps/chosen": -84.01990509033203, "ref_logps/rejected": -81.44741821289062, "rewards/accuracies": 0.4375, "rewards/chosen": -0.19010215997695923, "rewards/margins": -0.011865412816405296, "rewards/rejected": -0.17823673784732819, "step": 133 }, { "epoch": 0.10024312698709557, "grad_norm": 70.22436244318027, "learning_rate": 4.936643209491051e-07, "logps/chosen": -104.39210510253906, "logps/rejected": -101.95588684082031, "loss": 0.6868, "losses/dpo": 0.7534513473510742, "losses/sft": 1.1029878854751587, "losses/total": 0.7534513473510742, "ref_logps/chosen": -102.305908203125, "ref_logps/rejected": -99.67426300048828, "rewards/accuracies": 0.625, "rewards/chosen": -0.20861941576004028, "rewards/margins": 0.019542664289474487, "rewards/rejected": -0.22816208004951477, "step": 134 }, { "epoch": 0.10024312698709557, "eval_logps/chosen": -36.50243377685547, "eval_logps/rejected": -40.21092987060547, "eval_loss": 0.6820266246795654, "eval_losses/dpo": 0.6949748992919922, "eval_losses/sft": 0.30724385380744934, "eval_losses/total": 0.6949748992919922, "eval_ref_logps/chosen": -35.788204193115234, "eval_ref_logps/rejected": -39.235660552978516, "eval_rewards/accuracies": 0.6012930870056152, "eval_rewards/chosen": -0.07142283022403717, "eval_rewards/margins": 0.02610405534505844, "eval_rewards/rejected": -0.09752687811851501, "eval_runtime": 38.5561, "eval_samples_per_second": 12.008, "eval_steps_per_second": 1.504, "step": 134 }, { "epoch": 0.1009912100243127, "grad_norm": 60.56204100632714, "learning_rate": 4.935279313491807e-07, "logps/chosen": -83.3544692993164, "logps/rejected": -92.13490295410156, "loss": 0.6983, "losses/dpo": 0.6923500895500183, "losses/sft": 0.9553567171096802, "losses/total": 0.6923500895500183, "ref_logps/chosen": -81.55061340332031, "ref_logps/rejected": -90.30645751953125, "rewards/accuracies": 0.40625, "rewards/chosen": -0.18038541078567505, "rewards/margins": 0.002458639442920685, "rewards/rejected": -0.18284407258033752, "step": 135 }, { "epoch": 0.10173929306152983, "grad_norm": 69.0436030845153, "learning_rate": 4.933901085427362e-07, "logps/chosen": -96.72776794433594, "logps/rejected": -95.67681884765625, "loss": 0.6801, "losses/dpo": 0.6504555344581604, "losses/sft": 0.48715758323669434, "losses/total": 0.6504555344581604, "ref_logps/chosen": -95.04241943359375, "ref_logps/rejected": -93.64482116699219, "rewards/accuracies": 0.59375, "rewards/chosen": -0.16853636503219604, "rewards/margins": 0.03466228395700455, "rewards/rejected": -0.20319864153862, "step": 136 }, { "epoch": 0.10248737609874696, "grad_norm": 58.835496012604445, "learning_rate": 4.932508533408841e-07, "logps/chosen": -87.32805633544922, "logps/rejected": -90.14346313476562, "loss": 0.673, "losses/dpo": 0.7033333778381348, "losses/sft": 0.6027899384498596, "losses/total": 0.7033333778381348, "ref_logps/chosen": -85.83261108398438, "ref_logps/rejected": -88.17204284667969, "rewards/accuracies": 0.625, "rewards/chosen": -0.14954377710819244, "rewards/margins": 0.04759778827428818, "rewards/rejected": -0.19714155793190002, "step": 137 }, { "epoch": 0.10323545913596409, "grad_norm": 85.97638757489446, "learning_rate": 4.931101665631669e-07, "logps/chosen": -105.77809143066406, "logps/rejected": -106.52483367919922, "loss": 0.6853, "losses/dpo": 0.6396278142929077, "losses/sft": 0.7670556306838989, "losses/total": 0.6396278142929077, "ref_logps/chosen": -103.80207824707031, "ref_logps/rejected": -104.24701690673828, "rewards/accuracies": 0.4375, "rewards/chosen": -0.19760118424892426, "rewards/margins": 0.030180038884282112, "rewards/rejected": -0.22778120636940002, "step": 138 }, { "epoch": 0.10398354217318122, "grad_norm": 81.95328731029865, "learning_rate": 4.92968049037552e-07, "logps/chosen": -98.73228454589844, "logps/rejected": -91.72872924804688, "loss": 0.7016, "losses/dpo": 0.6921336054801941, "losses/sft": 1.3560744524002075, "losses/total": 0.6921336054801941, "ref_logps/chosen": -96.8816909790039, "ref_logps/rejected": -89.94955444335938, "rewards/accuracies": 0.5, "rewards/chosen": -0.18505819141864777, "rewards/margins": -0.007140599191188812, "rewards/rejected": -0.17791759967803955, "step": 139 }, { "epoch": 0.10473162521039836, "grad_norm": 61.86020788762767, "learning_rate": 4.928245016004272e-07, "logps/chosen": -89.96234893798828, "logps/rejected": -98.8394546508789, "loss": 0.6887, "losses/dpo": 0.6135082244873047, "losses/sft": 1.2429158687591553, "losses/total": 0.6135082244873047, "ref_logps/chosen": -87.87760162353516, "ref_logps/rejected": -96.56254577636719, "rewards/accuracies": 0.40625, "rewards/chosen": -0.2084745466709137, "rewards/margins": 0.019216448068618774, "rewards/rejected": -0.22769099473953247, "step": 140 }, { "epoch": 0.10547970824761549, "grad_norm": 54.914728820292375, "learning_rate": 4.926795250965951e-07, "logps/chosen": -76.89518737792969, "logps/rejected": -80.98959350585938, "loss": 0.681, "losses/dpo": 0.6235638856887817, "losses/sft": 1.0926423072814941, "losses/total": 0.6235638856887817, "ref_logps/chosen": -75.38357543945312, "ref_logps/rejected": -79.12550354003906, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1511615514755249, "rewards/margins": 0.035248495638370514, "rewards/rejected": -0.18641003966331482, "step": 141 }, { "epoch": 0.10622779128483262, "grad_norm": 79.31904285278556, "learning_rate": 4.925331203792696e-07, "logps/chosen": -81.73404693603516, "logps/rejected": -99.7159194946289, "loss": 0.656, "losses/dpo": 0.6594467163085938, "losses/sft": 0.7253226637840271, "losses/total": 0.6594467163085938, "ref_logps/chosen": -80.31404113769531, "ref_logps/rejected": -97.45824432373047, "rewards/accuracies": 0.625, "rewards/chosen": -0.142001211643219, "rewards/margins": 0.08376571536064148, "rewards/rejected": -0.22576695680618286, "step": 142 }, { "epoch": 0.10697587432204975, "grad_norm": 71.35805194013268, "learning_rate": 4.923852883100688e-07, "logps/chosen": -94.40438079833984, "logps/rejected": -98.6838607788086, "loss": 0.6752, "losses/dpo": 0.6845940947532654, "losses/sft": 0.5453582406044006, "losses/total": 0.6845940947532654, "ref_logps/chosen": -92.57935333251953, "ref_logps/rejected": -96.42033386230469, "rewards/accuracies": 0.59375, "rewards/chosen": -0.18250292539596558, "rewards/margins": 0.0438496433198452, "rewards/rejected": -0.22635257244110107, "step": 143 }, { "epoch": 0.10772395735926688, "grad_norm": 56.92358516497944, "learning_rate": 4.922360297590119e-07, "logps/chosen": -90.72589111328125, "logps/rejected": -98.44126892089844, "loss": 0.6632, "losses/dpo": 0.6082133054733276, "losses/sft": 1.2858704328536987, "losses/total": 0.6082133054733276, "ref_logps/chosen": -89.1635971069336, "ref_logps/rejected": -96.16415405273438, "rewards/accuracies": 0.6875, "rewards/chosen": -0.15622875094413757, "rewards/margins": 0.07148327678442001, "rewards/rejected": -0.2277120053768158, "step": 144 }, { "epoch": 0.108472040396484, "grad_norm": 60.03307270259648, "learning_rate": 4.920853456045125e-07, "logps/chosen": -92.2232666015625, "logps/rejected": -99.55683898925781, "loss": 0.6799, "losses/dpo": 0.6530517935752869, "losses/sft": 0.6887096166610718, "losses/total": 0.6530517935752869, "ref_logps/chosen": -90.43402099609375, "ref_logps/rejected": -97.41462707519531, "rewards/accuracies": 0.5625, "rewards/chosen": -0.17892473936080933, "rewards/margins": 0.035296730697155, "rewards/rejected": -0.21422147750854492, "step": 145 }, { "epoch": 0.10922012343370115, "grad_norm": 76.51478474930543, "learning_rate": 4.919332367333748e-07, "logps/chosen": -86.89801788330078, "logps/rejected": -91.87631225585938, "loss": 0.6641, "losses/dpo": 0.6642838716506958, "losses/sft": 0.6169005632400513, "losses/total": 0.6642838716506958, "ref_logps/chosen": -85.3740005493164, "ref_logps/rejected": -89.56233978271484, "rewards/accuracies": 0.6875, "rewards/chosen": -0.15240201354026794, "rewards/margins": 0.07899493724107742, "rewards/rejected": -0.23139695823192596, "step": 146 }, { "epoch": 0.10996820647091828, "grad_norm": 56.16839541294348, "learning_rate": 4.917797040407869e-07, "logps/chosen": -101.84283447265625, "logps/rejected": -100.79350280761719, "loss": 0.6887, "losses/dpo": 0.6703423857688904, "losses/sft": 1.1580501794815063, "losses/total": 0.6703423857688904, "ref_logps/chosen": -99.70930480957031, "ref_logps/rejected": -98.4612045288086, "rewards/accuracies": 0.5, "rewards/chosen": -0.21335257589817047, "rewards/margins": 0.019876856356859207, "rewards/rejected": -0.23322942852973938, "step": 147 }, { "epoch": 0.1107162895081354, "grad_norm": 57.947243308457395, "learning_rate": 4.916247484303174e-07, "logps/chosen": -91.05974578857422, "logps/rejected": -107.5179672241211, "loss": 0.6705, "losses/dpo": 0.6705751419067383, "losses/sft": 0.4810585677623749, "losses/total": 0.6705751419067383, "ref_logps/chosen": -89.19290161132812, "ref_logps/rejected": -105.0781478881836, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1866830289363861, "rewards/margins": 0.05729846656322479, "rewards/rejected": -0.2439814954996109, "step": 148 }, { "epoch": 0.11146437254535253, "grad_norm": 60.65790570931107, "learning_rate": 4.914683708139083e-07, "logps/chosen": -103.15831756591797, "logps/rejected": -115.44853210449219, "loss": 0.6513, "losses/dpo": 0.6344656944274902, "losses/sft": 1.6918480396270752, "losses/total": 0.6344656944274902, "ref_logps/chosen": -101.20863342285156, "ref_logps/rejected": -112.42498779296875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.19496840238571167, "rewards/margins": 0.10738657414913177, "rewards/rejected": -0.30235499143600464, "step": 149 }, { "epoch": 0.11221245558256966, "grad_norm": 50.21350309525241, "learning_rate": 4.913105721118704e-07, "logps/chosen": -69.80628967285156, "logps/rejected": -82.70751953125, "loss": 0.6667, "losses/dpo": 0.6575927734375, "losses/sft": 0.3539741039276123, "losses/total": 0.6575927734375, "ref_logps/chosen": -68.26954650878906, "ref_logps/rejected": -80.55747985839844, "rewards/accuracies": 0.625, "rewards/chosen": -0.15367525815963745, "rewards/margins": 0.061328694224357605, "rewards/rejected": -0.21500396728515625, "step": 150 }, { "epoch": 0.11296053861978679, "grad_norm": 67.93614693762062, "learning_rate": 4.91151353252878e-07, "logps/chosen": -85.14242553710938, "logps/rejected": -94.68510437011719, "loss": 0.6875, "losses/dpo": 0.7473528385162354, "losses/sft": 0.5720193386077881, "losses/total": 0.7473528385162354, "ref_logps/chosen": -83.10675048828125, "ref_logps/rejected": -92.45664978027344, "rewards/accuracies": 0.53125, "rewards/chosen": -0.20356670022010803, "rewards/margins": 0.019279077649116516, "rewards/rejected": -0.22284576296806335, "step": 151 }, { "epoch": 0.11370862165700393, "grad_norm": 55.35205549991062, "learning_rate": 4.909907151739633e-07, "logps/chosen": -87.07301330566406, "logps/rejected": -85.95860290527344, "loss": 0.6631, "losses/dpo": 0.6323245167732239, "losses/sft": 1.2640571594238281, "losses/total": 0.6323245167732239, "ref_logps/chosen": -85.97431182861328, "ref_logps/rejected": -84.07609558105469, "rewards/accuracies": 0.53125, "rewards/chosen": -0.10987003147602081, "rewards/margins": 0.07838086038827896, "rewards/rejected": -0.18825089931488037, "step": 152 }, { "epoch": 0.11445670469422106, "grad_norm": 76.32923461982115, "learning_rate": 4.90828658820511e-07, "logps/chosen": -87.02693939208984, "logps/rejected": -99.82713317871094, "loss": 0.6596, "losses/dpo": 0.5587536096572876, "losses/sft": 0.758726179599762, "losses/total": 0.5587536096572876, "ref_logps/chosen": -85.1407470703125, "ref_logps/rejected": -97.07820129394531, "rewards/accuracies": 0.625, "rewards/chosen": -0.18861950933933258, "rewards/margins": 0.08627529442310333, "rewards/rejected": -0.2748948037624359, "step": 153 }, { "epoch": 0.11520478773143819, "grad_norm": 73.9777229388073, "learning_rate": 4.906651851462522e-07, "logps/chosen": -86.3282699584961, "logps/rejected": -79.0264892578125, "loss": 0.7081, "losses/dpo": 0.7527448534965515, "losses/sft": 0.7532473802566528, "losses/total": 0.7527448534965515, "ref_logps/chosen": -84.03205108642578, "ref_logps/rejected": -76.87551879882812, "rewards/accuracies": 0.46875, "rewards/chosen": -0.22962193191051483, "rewards/margins": -0.014525026082992554, "rewards/rejected": -0.21509689092636108, "step": 154 }, { "epoch": 0.11595287076865532, "grad_norm": 66.11589799624393, "learning_rate": 4.905002951132597e-07, "logps/chosen": -95.90501403808594, "logps/rejected": -97.0944595336914, "loss": 0.6917, "losses/dpo": 0.7016141414642334, "losses/sft": 0.2860245108604431, "losses/total": 0.7016141414642334, "ref_logps/chosen": -93.73440551757812, "ref_logps/rejected": -94.78968811035156, "rewards/accuracies": 0.34375, "rewards/chosen": -0.2170599102973938, "rewards/margins": 0.013417438603937626, "rewards/rejected": -0.23047733306884766, "step": 155 }, { "epoch": 0.11670095380587245, "grad_norm": 86.04216140961977, "learning_rate": 4.903339896919414e-07, "logps/chosen": -92.77861022949219, "logps/rejected": -105.74507904052734, "loss": 0.6849, "losses/dpo": 0.730445146560669, "losses/sft": 0.6800632476806641, "losses/total": 0.730445146560669, "ref_logps/chosen": -90.86634063720703, "ref_logps/rejected": -103.5136947631836, "rewards/accuracies": 0.59375, "rewards/chosen": -0.19122716784477234, "rewards/margins": 0.03191075474023819, "rewards/rejected": -0.22313791513442993, "step": 156 }, { "epoch": 0.11744903684308958, "grad_norm": 68.24035830740199, "learning_rate": 4.901662698610352e-07, "logps/chosen": -112.68738555908203, "logps/rejected": -107.04827117919922, "loss": 0.7062, "losses/dpo": 0.6697822213172913, "losses/sft": 0.5917311906814575, "losses/total": 0.6697822213172913, "ref_logps/chosen": -110.37052917480469, "ref_logps/rejected": -104.80449676513672, "rewards/accuracies": 0.4375, "rewards/chosen": -0.23168516159057617, "rewards/margins": -0.007307384163141251, "rewards/rejected": -0.22437778115272522, "step": 157 }, { "epoch": 0.11819711988030672, "grad_norm": 79.87877847871249, "learning_rate": 4.899971366076032e-07, "logps/chosen": -116.77674865722656, "logps/rejected": -129.82992553710938, "loss": 0.6615, "losses/dpo": 0.6777714490890503, "losses/sft": 0.9293709993362427, "losses/total": 0.6777714490890503, "ref_logps/chosen": -114.51209259033203, "ref_logps/rejected": -126.74620056152344, "rewards/accuracies": 0.59375, "rewards/chosen": -0.22646574676036835, "rewards/margins": 0.08190611749887466, "rewards/rejected": -0.308371901512146, "step": 158 }, { "epoch": 0.11894520291752385, "grad_norm": 62.95456378624765, "learning_rate": 4.898265909270253e-07, "logps/chosen": -88.93087768554688, "logps/rejected": -85.73485565185547, "loss": 0.7035, "losses/dpo": 0.6681728363037109, "losses/sft": 0.3799082636833191, "losses/total": 0.6681728363037109, "ref_logps/chosen": -86.82762908935547, "ref_logps/rejected": -83.6757583618164, "rewards/accuracies": 0.625, "rewards/chosen": -0.21032488346099854, "rewards/margins": -0.004414811730384827, "rewards/rejected": -0.2059100717306137, "step": 159 }, { "epoch": 0.11969328595474098, "grad_norm": 66.14985100346608, "learning_rate": 4.896546338229944e-07, "logps/chosen": -89.65078735351562, "logps/rejected": -99.16985321044922, "loss": 0.6597, "losses/dpo": 0.5975914597511292, "losses/sft": 0.818283200263977, "losses/total": 0.5975914597511292, "ref_logps/chosen": -87.72842407226562, "ref_logps/rejected": -96.47317504882812, "rewards/accuracies": 0.65625, "rewards/chosen": -0.19223599135875702, "rewards/margins": 0.07743187248706818, "rewards/rejected": -0.2696678936481476, "step": 160 }, { "epoch": 0.1204413689919581, "grad_norm": 53.93627245800194, "learning_rate": 4.894812663075094e-07, "logps/chosen": -78.68441772460938, "logps/rejected": -86.2281494140625, "loss": 0.6574, "losses/dpo": 0.6291898488998413, "losses/sft": 0.6345190405845642, "losses/total": 0.6291898488998413, "ref_logps/chosen": -76.97015380859375, "ref_logps/rejected": -83.71790313720703, "rewards/accuracies": 0.65625, "rewards/chosen": -0.17142683267593384, "rewards/margins": 0.07959761470556259, "rewards/rejected": -0.25102442502975464, "step": 161 }, { "epoch": 0.12118945202917523, "grad_norm": 55.67482854870698, "learning_rate": 4.893064894008701e-07, "logps/chosen": -55.780860900878906, "logps/rejected": -57.99273681640625, "loss": 0.6753, "losses/dpo": 0.6837819218635559, "losses/sft": 0.4737522006034851, "losses/total": 0.6837819218635559, "ref_logps/chosen": -54.612125396728516, "ref_logps/rejected": -56.41606140136719, "rewards/accuracies": 0.5, "rewards/chosen": -0.11687351018190384, "rewards/margins": 0.04079398885369301, "rewards/rejected": -0.15766748785972595, "step": 162 }, { "epoch": 0.12193753506639236, "grad_norm": 113.92318786358513, "learning_rate": 4.891303041316705e-07, "logps/chosen": -100.1951904296875, "logps/rejected": -103.78213500976562, "loss": 0.6821, "losses/dpo": 0.64927738904953, "losses/sft": 0.35727936029434204, "losses/total": 0.64927738904953, "ref_logps/chosen": -97.95149993896484, "ref_logps/rejected": -101.14674377441406, "rewards/accuracies": 0.65625, "rewards/chosen": -0.22436977922916412, "rewards/margins": 0.0391702800989151, "rewards/rejected": -0.2635400593280792, "step": 163 }, { "epoch": 0.1226856181036095, "grad_norm": 58.20626711972638, "learning_rate": 4.88952711536793e-07, "logps/chosen": -104.09432983398438, "logps/rejected": -107.78334045410156, "loss": 0.6786, "losses/dpo": 0.6484010219573975, "losses/sft": 1.0344533920288086, "losses/total": 0.6484010219573975, "ref_logps/chosen": -101.46148681640625, "ref_logps/rejected": -104.62982177734375, "rewards/accuracies": 0.53125, "rewards/chosen": -0.2632841169834137, "rewards/margins": 0.05206887051463127, "rewards/rejected": -0.3153529763221741, "step": 164 }, { "epoch": 0.12343370114082663, "grad_norm": 50.246883295678806, "learning_rate": 4.88773712661403e-07, "logps/chosen": -80.24613952636719, "logps/rejected": -95.1003189086914, "loss": 0.6527, "losses/dpo": 0.6033352613449097, "losses/sft": 1.1214065551757812, "losses/total": 0.6033352613449097, "ref_logps/chosen": -78.42283630371094, "ref_logps/rejected": -92.245361328125, "rewards/accuracies": 0.625, "rewards/chosen": -0.182329922914505, "rewards/margins": 0.1031661331653595, "rewards/rejected": -0.2854960560798645, "step": 165 }, { "epoch": 0.12418178417804376, "grad_norm": 59.993713796648194, "learning_rate": 4.885933085589415e-07, "logps/chosen": -83.73558807373047, "logps/rejected": -97.19931030273438, "loss": 0.6892, "losses/dpo": 0.6878287196159363, "losses/sft": 1.063982367515564, "losses/total": 0.6878287196159363, "ref_logps/chosen": -81.63722229003906, "ref_logps/rejected": -94.86050415039062, "rewards/accuracies": 0.625, "rewards/chosen": -0.2098366618156433, "rewards/margins": 0.02404460683465004, "rewards/rejected": -0.23388127982616425, "step": 166 }, { "epoch": 0.12492986721526089, "grad_norm": 68.42517968329469, "learning_rate": 4.884115002911197e-07, "logps/chosen": -105.46253967285156, "logps/rejected": -117.83299255371094, "loss": 0.6727, "losses/dpo": 0.6893208026885986, "losses/sft": 0.7444704174995422, "losses/total": 0.6893208026885986, "ref_logps/chosen": -102.99386596679688, "ref_logps/rejected": -114.78639221191406, "rewards/accuracies": 0.5625, "rewards/chosen": -0.24686755239963531, "rewards/margins": 0.057792168110609055, "rewards/rejected": -0.30465975403785706, "step": 167 }, { "epoch": 0.12567795025247802, "grad_norm": 50.453410356033025, "learning_rate": 4.882282889279125e-07, "logps/chosen": -77.76266479492188, "logps/rejected": -83.55487823486328, "loss": 0.6613, "losses/dpo": 0.7065508365631104, "losses/sft": 0.4897269010543823, "losses/total": 0.7065508365631104, "ref_logps/chosen": -76.19890594482422, "ref_logps/rejected": -81.210205078125, "rewards/accuracies": 0.59375, "rewards/chosen": -0.156375914812088, "rewards/margins": 0.0780915766954422, "rewards/rejected": -0.2344675064086914, "step": 168 }, { "epoch": 0.12642603328969515, "grad_norm": 78.58564234146374, "learning_rate": 4.880436755475525e-07, "logps/chosen": -81.9049072265625, "logps/rejected": -85.50666046142578, "loss": 0.6996, "losses/dpo": 0.7465134859085083, "losses/sft": 1.4777384996414185, "losses/total": 0.7465134859085083, "ref_logps/chosen": -79.623046875, "ref_logps/rejected": -83.22468566894531, "rewards/accuracies": 0.46875, "rewards/chosen": -0.22818532586097717, "rewards/margins": 1.3111159205436707e-05, "rewards/rejected": -0.22819846868515015, "step": 169 }, { "epoch": 0.12717411632691228, "grad_norm": 57.90099723036197, "learning_rate": 4.878576612365233e-07, "logps/chosen": -104.97842407226562, "logps/rejected": -107.13552856445312, "loss": 0.6839, "losses/dpo": 0.652925968170166, "losses/sft": 0.9745640754699707, "losses/total": 0.652925968170166, "ref_logps/chosen": -102.44204711914062, "ref_logps/rejected": -104.32659912109375, "rewards/accuracies": 0.4375, "rewards/chosen": -0.25363701581954956, "rewards/margins": 0.027255695313215256, "rewards/rejected": -0.2808926999568939, "step": 170 }, { "epoch": 0.1279221993641294, "grad_norm": 69.73315404002163, "learning_rate": 4.876702470895531e-07, "logps/chosen": -98.08821105957031, "logps/rejected": -116.82386779785156, "loss": 0.6759, "losses/dpo": 0.6213781833648682, "losses/sft": 0.9196016788482666, "losses/total": 0.6213781833648682, "ref_logps/chosen": -95.82238006591797, "ref_logps/rejected": -113.99420166015625, "rewards/accuracies": 0.625, "rewards/chosen": -0.22658394277095795, "rewards/margins": 0.05638251453638077, "rewards/rejected": -0.2829664349555969, "step": 171 }, { "epoch": 0.12867028240134654, "grad_norm": 70.56671451656477, "learning_rate": 4.874814342096086e-07, "logps/chosen": -92.7196044921875, "logps/rejected": -112.41795349121094, "loss": 0.6671, "losses/dpo": 0.6604632139205933, "losses/sft": 0.6977664232254028, "losses/total": 0.6604632139205933, "ref_logps/chosen": -90.76507568359375, "ref_logps/rejected": -109.73764038085938, "rewards/accuracies": 0.625, "rewards/chosen": -0.19545236229896545, "rewards/margins": 0.07257819175720215, "rewards/rejected": -0.26803058385849, "step": 172 }, { "epoch": 0.1294183654385637, "grad_norm": 55.24785190393732, "learning_rate": 4.872912237078881e-07, "logps/chosen": -111.37396240234375, "logps/rejected": -119.72882080078125, "loss": 0.6592, "losses/dpo": 0.6299439668655396, "losses/sft": 0.9062247276306152, "losses/total": 0.6299439668655396, "ref_logps/chosen": -109.15583038330078, "ref_logps/rejected": -116.69577026367188, "rewards/accuracies": 0.71875, "rewards/chosen": -0.22181351482868195, "rewards/margins": 0.08149164915084839, "rewards/rejected": -0.30330514907836914, "step": 173 }, { "epoch": 0.13016644847578082, "grad_norm": 53.421842474089345, "learning_rate": 4.870996167038154e-07, "logps/chosen": -98.69496154785156, "logps/rejected": -104.72586059570312, "loss": 0.6941, "losses/dpo": 0.6408145427703857, "losses/sft": 0.7745596170425415, "losses/total": 0.6408145427703857, "ref_logps/chosen": -96.17872619628906, "ref_logps/rejected": -102.05000305175781, "rewards/accuracies": 0.46875, "rewards/chosen": -0.2516232132911682, "rewards/margins": 0.015962546691298485, "rewards/rejected": -0.26758575439453125, "step": 174 }, { "epoch": 0.13091453151299795, "grad_norm": 65.72646570555636, "learning_rate": 4.869066143250328e-07, "logps/chosen": -105.8415298461914, "logps/rejected": -109.89979553222656, "loss": 0.6387, "losses/dpo": 0.6861504316329956, "losses/sft": 0.5257918834686279, "losses/total": 0.6861504316329956, "ref_logps/chosen": -103.8973159790039, "ref_logps/rejected": -106.51354217529297, "rewards/accuracies": 0.6875, "rewards/chosen": -0.19442158937454224, "rewards/margins": 0.1442045420408249, "rewards/rejected": -0.33862611651420593, "step": 175 }, { "epoch": 0.13166261455021508, "grad_norm": 59.263510616289146, "learning_rate": 4.867122177073947e-07, "logps/chosen": -76.27286529541016, "logps/rejected": -81.81326293945312, "loss": 0.6808, "losses/dpo": 0.5401661992073059, "losses/sft": 0.9777116775512695, "losses/total": 0.5401661992073059, "ref_logps/chosen": -74.26249694824219, "ref_logps/rejected": -79.37676239013672, "rewards/accuracies": 0.5625, "rewards/chosen": -0.20103594660758972, "rewards/margins": 0.04261402785778046, "rewards/rejected": -0.24365000426769257, "step": 176 }, { "epoch": 0.1324106975874322, "grad_norm": 54.192022924627295, "learning_rate": 4.865164279949608e-07, "logps/chosen": -90.84090423583984, "logps/rejected": -94.15303039550781, "loss": 0.6701, "losses/dpo": 0.6948055028915405, "losses/sft": 0.6030378341674805, "losses/total": 0.6948055028915405, "ref_logps/chosen": -88.58799743652344, "ref_logps/rejected": -91.28793334960938, "rewards/accuracies": 0.5625, "rewards/chosen": -0.22529026865959167, "rewards/margins": 0.06121928244829178, "rewards/rejected": -0.28650957345962524, "step": 177 }, { "epoch": 0.13315878062464934, "grad_norm": 83.15770343139673, "learning_rate": 4.863192463399895e-07, "logps/chosen": -106.98240661621094, "logps/rejected": -108.50897216796875, "loss": 0.6722, "losses/dpo": 0.6510883569717407, "losses/sft": 0.8943474292755127, "losses/total": 0.6510883569717407, "ref_logps/chosen": -104.36042785644531, "ref_logps/rejected": -105.35926055908203, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2621973156929016, "rewards/margins": 0.05277387797832489, "rewards/rejected": -0.3149711489677429, "step": 178 }, { "epoch": 0.13390686366186647, "grad_norm": 50.24497933239189, "learning_rate": 4.86120673902931e-07, "logps/chosen": -68.96347045898438, "logps/rejected": -79.66374206542969, "loss": 0.677, "losses/dpo": 0.6955000758171082, "losses/sft": 0.5742906928062439, "losses/total": 0.6955000758171082, "ref_logps/chosen": -66.93439483642578, "ref_logps/rejected": -77.18976593017578, "rewards/accuracies": 0.5625, "rewards/chosen": -0.20290765166282654, "rewards/margins": 0.044489722698926926, "rewards/rejected": -0.24739736318588257, "step": 179 }, { "epoch": 0.1346549466990836, "grad_norm": 65.01078032082344, "learning_rate": 4.859207118524203e-07, "logps/chosen": -96.330322265625, "logps/rejected": -97.83934020996094, "loss": 0.6943, "losses/dpo": 0.7036426067352295, "losses/sft": 0.4989013671875, "losses/total": 0.7036426067352295, "ref_logps/chosen": -93.8449935913086, "ref_logps/rejected": -95.25479125976562, "rewards/accuracies": 0.5, "rewards/chosen": -0.24853304028511047, "rewards/margins": 0.009921375662088394, "rewards/rejected": -0.25845441222190857, "step": 180 }, { "epoch": 0.13540302973630072, "grad_norm": 59.17104253392356, "learning_rate": 4.85719361365271e-07, "logps/chosen": -90.72506713867188, "logps/rejected": -105.25631713867188, "loss": 0.681, "losses/dpo": 0.6270008683204651, "losses/sft": 0.6456217169761658, "losses/total": 0.6270008683204651, "ref_logps/chosen": -88.42997741699219, "ref_logps/rejected": -102.6021728515625, "rewards/accuracies": 0.59375, "rewards/chosen": -0.22951003909111023, "rewards/margins": 0.03590420261025429, "rewards/rejected": -0.2654142379760742, "step": 181 }, { "epoch": 0.13615111277351785, "grad_norm": 54.01772776762728, "learning_rate": 4.855166236264675e-07, "logps/chosen": -81.40655517578125, "logps/rejected": -84.75814819335938, "loss": 0.6656, "losses/dpo": 0.6867361068725586, "losses/sft": 0.4522467851638794, "losses/total": 0.6867361068725586, "ref_logps/chosen": -79.35398864746094, "ref_logps/rejected": -81.98771667480469, "rewards/accuracies": 0.6875, "rewards/chosen": -0.20525649189949036, "rewards/margins": 0.07178622484207153, "rewards/rejected": -0.2770427167415619, "step": 182 }, { "epoch": 0.13689919581073498, "grad_norm": 59.51195087143462, "learning_rate": 4.853124998291585e-07, "logps/chosen": -117.21118927001953, "logps/rejected": -131.54893493652344, "loss": 0.6819, "losses/dpo": 0.5886228680610657, "losses/sft": 0.7455391883850098, "losses/total": 0.5886228680610657, "ref_logps/chosen": -114.06651306152344, "ref_logps/rejected": -127.98771667480469, "rewards/accuracies": 0.5, "rewards/chosen": -0.3144669234752655, "rewards/margins": 0.04165399819612503, "rewards/rejected": -0.3561209440231323, "step": 183 }, { "epoch": 0.1376472788479521, "grad_norm": 65.54565090996083, "learning_rate": 4.8510699117465e-07, "logps/chosen": -80.49826049804688, "logps/rejected": -90.28517150878906, "loss": 0.6772, "losses/dpo": 0.6992715001106262, "losses/sft": 1.0478602647781372, "losses/total": 0.6992715001106262, "ref_logps/chosen": -78.21897888183594, "ref_logps/rejected": -87.52845001220703, "rewards/accuracies": 0.625, "rewards/chosen": -0.22792813181877136, "rewards/margins": 0.047743506729602814, "rewards/rejected": -0.2756716310977936, "step": 184 }, { "epoch": 0.13839536188516927, "grad_norm": 52.52546742670221, "learning_rate": 4.849000988723982e-07, "logps/chosen": -76.17027282714844, "logps/rejected": -77.9755859375, "loss": 0.6681, "losses/dpo": 0.6271989345550537, "losses/sft": 1.3566128015518188, "losses/total": 0.6271989345550537, "ref_logps/chosen": -74.2039794921875, "ref_logps/rejected": -75.38041687011719, "rewards/accuracies": 0.65625, "rewards/chosen": -0.19662894308567047, "rewards/margins": 0.06288721412420273, "rewards/rejected": -0.2595161497592926, "step": 185 }, { "epoch": 0.1391434449223864, "grad_norm": 64.41439236289857, "learning_rate": 4.846918241400021e-07, "logps/chosen": -94.28997802734375, "logps/rejected": -110.7164077758789, "loss": 0.63, "losses/dpo": 0.6759794354438782, "losses/sft": 1.2774208784103394, "losses/total": 0.6759794354438782, "ref_logps/chosen": -92.14253997802734, "ref_logps/rejected": -107.11070251464844, "rewards/accuracies": 0.78125, "rewards/chosen": -0.21474316716194153, "rewards/margins": 0.14582814276218414, "rewards/rejected": -0.36057132482528687, "step": 186 }, { "epoch": 0.13989152795960352, "grad_norm": 70.19830013618918, "learning_rate": 4.844821682031968e-07, "logps/chosen": -87.86259460449219, "logps/rejected": -91.18501281738281, "loss": 0.6818, "losses/dpo": 0.7714724540710449, "losses/sft": 0.5422049164772034, "losses/total": 0.7714724540710449, "ref_logps/chosen": -85.28173828125, "ref_logps/rejected": -88.22736358642578, "rewards/accuracies": 0.53125, "rewards/chosen": -0.2580851912498474, "rewards/margins": 0.037679024040699005, "rewards/rejected": -0.2957642376422882, "step": 187 }, { "epoch": 0.14063961099682065, "grad_norm": 48.756836827471226, "learning_rate": 4.842711322958459e-07, "logps/chosen": -92.35582733154297, "logps/rejected": -102.51150512695312, "loss": 0.6588, "losses/dpo": 0.663488507270813, "losses/sft": 0.6719114780426025, "losses/total": 0.663488507270813, "ref_logps/chosen": -90.08038330078125, "ref_logps/rejected": -99.38691711425781, "rewards/accuracies": 0.65625, "rewards/chosen": -0.22754423320293427, "rewards/margins": 0.08491441607475281, "rewards/rejected": -0.31245866417884827, "step": 188 }, { "epoch": 0.14138769403403778, "grad_norm": 61.18191738203066, "learning_rate": 4.840587176599343e-07, "logps/chosen": -90.00007629394531, "logps/rejected": -98.04437255859375, "loss": 0.6514, "losses/dpo": 0.7620391845703125, "losses/sft": 1.3837801218032837, "losses/total": 0.7620391845703125, "ref_logps/chosen": -87.588134765625, "ref_logps/rejected": -94.62313079833984, "rewards/accuracies": 0.65625, "rewards/chosen": -0.24119414389133453, "rewards/margins": 0.10093019902706146, "rewards/rejected": -0.342124342918396, "step": 189 }, { "epoch": 0.1421357770712549, "grad_norm": 58.96958284745105, "learning_rate": 4.838449255455612e-07, "logps/chosen": -91.3669662475586, "logps/rejected": -105.64440155029297, "loss": 0.6738, "losses/dpo": 0.6098815202713013, "losses/sft": 0.9027516841888428, "losses/total": 0.6098815202713013, "ref_logps/chosen": -88.98480987548828, "ref_logps/rejected": -102.65655517578125, "rewards/accuracies": 0.625, "rewards/chosen": -0.23821613192558289, "rewards/margins": 0.060567762702703476, "rewards/rejected": -0.29878389835357666, "step": 190 }, { "epoch": 0.14288386010847204, "grad_norm": 59.42566835110246, "learning_rate": 4.836297572109322e-07, "logps/chosen": -90.8278579711914, "logps/rejected": -92.52567291259766, "loss": 0.688, "losses/dpo": 0.6840787529945374, "losses/sft": 0.7023148536682129, "losses/total": 0.6840787529945374, "ref_logps/chosen": -88.23002624511719, "ref_logps/rejected": -89.64488220214844, "rewards/accuracies": 0.40625, "rewards/chosen": -0.2597828507423401, "rewards/margins": 0.028297439217567444, "rewards/rejected": -0.2880803048610687, "step": 191 }, { "epoch": 0.14363194314568917, "grad_norm": 74.28728704831191, "learning_rate": 4.834132139223526e-07, "logps/chosen": -103.98814392089844, "logps/rejected": -101.87397003173828, "loss": 0.66, "losses/dpo": 0.6446481943130493, "losses/sft": 0.47155994176864624, "losses/total": 0.6446481943130493, "ref_logps/chosen": -101.42027282714844, "ref_logps/rejected": -98.43428802490234, "rewards/accuracies": 0.59375, "rewards/chosen": -0.25678589940071106, "rewards/margins": 0.08718223869800568, "rewards/rejected": -0.34396815299987793, "step": 192 }, { "epoch": 0.1443800261829063, "grad_norm": 47.619286752668025, "learning_rate": 4.831952969542192e-07, "logps/chosen": -93.81659698486328, "logps/rejected": -100.84715270996094, "loss": 0.682, "losses/dpo": 0.7545093894004822, "losses/sft": 0.9159215688705444, "losses/total": 0.7545093894004822, "ref_logps/chosen": -91.10783386230469, "ref_logps/rejected": -97.72148132324219, "rewards/accuracies": 0.46875, "rewards/chosen": -0.2708767056465149, "rewards/margins": 0.04168963432312012, "rewards/rejected": -0.312566339969635, "step": 193 }, { "epoch": 0.14512810922012342, "grad_norm": 71.5957498125419, "learning_rate": 4.829760075890134e-07, "logps/chosen": -88.70191192626953, "logps/rejected": -97.20173645019531, "loss": 0.6216, "losses/dpo": 0.6859922409057617, "losses/sft": 0.9594184756278992, "losses/total": 0.6859922409057617, "ref_logps/chosen": -86.72489929199219, "ref_logps/rejected": -93.61410522460938, "rewards/accuracies": 0.8125, "rewards/chosen": -0.19770172238349915, "rewards/margins": 0.16106204688549042, "rewards/rejected": -0.35876375436782837, "step": 194 }, { "epoch": 0.14587619225734055, "grad_norm": 94.52920627875366, "learning_rate": 4.827553471172934e-07, "logps/chosen": -108.11821746826172, "logps/rejected": -130.4344482421875, "loss": 0.6513, "losses/dpo": 0.5973787903785706, "losses/sft": 1.5188753604888916, "losses/total": 0.5973787903785706, "ref_logps/chosen": -105.25506591796875, "ref_logps/rejected": -126.3784408569336, "rewards/accuracies": 0.625, "rewards/chosen": -0.2863156199455261, "rewards/margins": 0.11928405612707138, "rewards/rejected": -0.4055996537208557, "step": 195 }, { "epoch": 0.14662427529455768, "grad_norm": 65.15008151268123, "learning_rate": 4.825333168376863e-07, "logps/chosen": -95.05158233642578, "logps/rejected": -102.11106872558594, "loss": 0.6665, "losses/dpo": 0.6139391660690308, "losses/sft": 0.4683370888233185, "losses/total": 0.6139391660690308, "ref_logps/chosen": -91.99246215820312, "ref_logps/rejected": -98.3023681640625, "rewards/accuracies": 0.53125, "rewards/chosen": -0.3059118092060089, "rewards/margins": 0.07495856285095215, "rewards/rejected": -0.38087040185928345, "step": 196 }, { "epoch": 0.14737235833177484, "grad_norm": 59.04401721282123, "learning_rate": 4.823099180568812e-07, "logps/chosen": -99.57398986816406, "logps/rejected": -92.76455688476562, "loss": 0.7073, "losses/dpo": 0.7094258666038513, "losses/sft": 1.1847834587097168, "losses/total": 0.7094258666038513, "ref_logps/chosen": -96.42762756347656, "ref_logps/rejected": -89.69480895996094, "rewards/accuracies": 0.46875, "rewards/chosen": -0.3146367073059082, "rewards/margins": -0.007662534713745117, "rewards/rejected": -0.3069741427898407, "step": 197 }, { "epoch": 0.14812044136899197, "grad_norm": 49.29996596135944, "learning_rate": 4.820851520896208e-07, "logps/chosen": -81.24517059326172, "logps/rejected": -92.2049560546875, "loss": 0.6364, "losses/dpo": 0.6491163969039917, "losses/sft": 0.5453506112098694, "losses/total": 0.6491163969039917, "ref_logps/chosen": -79.25790405273438, "ref_logps/rejected": -88.94078826904297, "rewards/accuracies": 0.75, "rewards/chosen": -0.1987273097038269, "rewards/margins": 0.12768976390361786, "rewards/rejected": -0.32641708850860596, "step": 198 }, { "epoch": 0.1488685244062091, "grad_norm": 65.12656826832794, "learning_rate": 4.81859020258694e-07, "logps/chosen": -81.16419219970703, "logps/rejected": -79.90463256835938, "loss": 0.6808, "losses/dpo": 0.7518958449363708, "losses/sft": 0.24330705404281616, "losses/total": 0.7518958449363708, "ref_logps/chosen": -78.94775390625, "ref_logps/rejected": -77.22090911865234, "rewards/accuracies": 0.53125, "rewards/chosen": -0.22164365649223328, "rewards/margins": 0.04672900587320328, "rewards/rejected": -0.26837265491485596, "step": 199 }, { "epoch": 0.14961660744342622, "grad_norm": 66.48025096111463, "learning_rate": 4.816315238949281e-07, "logps/chosen": -90.42564392089844, "logps/rejected": -97.27644348144531, "loss": 0.6993, "losses/dpo": 0.7049291729927063, "losses/sft": 0.5356864333152771, "losses/total": 0.7049291729927063, "ref_logps/chosen": -87.48149871826172, "ref_logps/rejected": -94.1490707397461, "rewards/accuracies": 0.65625, "rewards/chosen": -0.29441383481025696, "rewards/margins": 0.018322303891181946, "rewards/rejected": -0.3127361238002777, "step": 200 }, { "epoch": 0.15036469048064335, "grad_norm": 52.688266264387764, "learning_rate": 4.814026643371809e-07, "logps/chosen": -100.23017883300781, "logps/rejected": -108.89404296875, "loss": 0.6356, "losses/dpo": 0.6619703769683838, "losses/sft": 0.8737683296203613, "losses/total": 0.6619703769683838, "ref_logps/chosen": -97.83048248291016, "ref_logps/rejected": -105.12745666503906, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2399698793888092, "rewards/margins": 0.13668933510780334, "rewards/rejected": -0.37665921449661255, "step": 201 }, { "epoch": 0.15111277351786048, "grad_norm": 60.393559495857765, "learning_rate": 4.811724429323328e-07, "logps/chosen": -104.18501281738281, "logps/rejected": -109.08726501464844, "loss": 0.6839, "losses/dpo": 0.6503579616546631, "losses/sft": 1.43683922290802, "losses/total": 0.6503579616546631, "ref_logps/chosen": -101.21627044677734, "ref_logps/rejected": -105.75186157226562, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2968742251396179, "rewards/margins": 0.03666532784700394, "rewards/rejected": -0.33353954553604126, "step": 202 }, { "epoch": 0.1518608565550776, "grad_norm": 73.69255083551539, "learning_rate": 4.80940861035279e-07, "logps/chosen": -79.20345306396484, "logps/rejected": -79.53129577636719, "loss": 0.6906, "losses/dpo": 0.7741960287094116, "losses/sft": 1.1017115116119385, "losses/total": 0.7741960287094116, "ref_logps/chosen": -76.34427642822266, "ref_logps/rejected": -76.46568298339844, "rewards/accuracies": 0.53125, "rewards/chosen": -0.2859179973602295, "rewards/margins": 0.02064242586493492, "rewards/rejected": -0.3065603971481323, "step": 203 }, { "epoch": 0.15260893959229474, "grad_norm": 49.19765961748352, "learning_rate": 4.807079200089216e-07, "logps/chosen": -64.98310089111328, "logps/rejected": -76.41627502441406, "loss": 0.6375, "losses/dpo": 0.6497167348861694, "losses/sft": 0.3627575933933258, "losses/total": 0.6497167348861694, "ref_logps/chosen": -63.31337356567383, "ref_logps/rejected": -73.45730590820312, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1669730544090271, "rewards/margins": 0.12892372906208038, "rewards/rejected": -0.2958967685699463, "step": 204 }, { "epoch": 0.15335702262951187, "grad_norm": 69.38925829930653, "learning_rate": 4.80473621224161e-07, "logps/chosen": -74.62007141113281, "logps/rejected": -85.60157012939453, "loss": 0.6753, "losses/dpo": 0.6621618866920471, "losses/sft": 1.017303466796875, "losses/total": 0.6621618866920471, "ref_logps/chosen": -72.14251708984375, "ref_logps/rejected": -82.56431579589844, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2477554976940155, "rewards/margins": 0.05596931278705597, "rewards/rejected": -0.30372482538223267, "step": 205 }, { "epoch": 0.154105105666729, "grad_norm": 70.2074930414311, "learning_rate": 4.802379660598887e-07, "logps/chosen": -64.42359924316406, "logps/rejected": -67.1998519897461, "loss": 0.6856, "losses/dpo": 0.5983568429946899, "losses/sft": 0.5124667882919312, "losses/total": 0.5983568429946899, "ref_logps/chosen": -62.42451858520508, "ref_logps/rejected": -64.88651275634766, "rewards/accuracies": 0.65625, "rewards/chosen": -0.19990749657154083, "rewards/margins": 0.031427204608917236, "rewards/rejected": -0.23133471608161926, "step": 206 }, { "epoch": 0.15485318870394613, "grad_norm": 61.42332375660897, "learning_rate": 4.800009559029782e-07, "logps/chosen": -87.3368911743164, "logps/rejected": -96.15825653076172, "loss": 0.6461, "losses/dpo": 0.6595812439918518, "losses/sft": 0.5893433094024658, "losses/total": 0.6595812439918518, "ref_logps/chosen": -85.12808227539062, "ref_logps/rejected": -92.73533630371094, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2208813726902008, "rewards/margins": 0.12141063809394836, "rewards/rejected": -0.34229201078414917, "step": 207 }, { "epoch": 0.15560127174116328, "grad_norm": 65.57418751069652, "learning_rate": 4.797625921482782e-07, "logps/chosen": -107.67082977294922, "logps/rejected": -110.65438079833984, "loss": 0.6374, "losses/dpo": 0.6399276256561279, "losses/sft": 0.6354954242706299, "losses/total": 0.6399276256561279, "ref_logps/chosen": -104.74606323242188, "ref_logps/rejected": -106.28572082519531, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2924765646457672, "rewards/margins": 0.1443903148174286, "rewards/rejected": -0.4368668794631958, "step": 208 }, { "epoch": 0.1563493547783804, "grad_norm": 67.90652655357789, "learning_rate": 4.795228761986028e-07, "logps/chosen": -95.61532592773438, "logps/rejected": -96.87928009033203, "loss": 0.6719, "losses/dpo": 0.7024803161621094, "losses/sft": 1.8974405527114868, "losses/total": 0.7024803161621094, "ref_logps/chosen": -92.75838470458984, "ref_logps/rejected": -93.33810424804688, "rewards/accuracies": 0.625, "rewards/chosen": -0.28569352626800537, "rewards/margins": 0.06842470169067383, "rewards/rejected": -0.3541182279586792, "step": 209 }, { "epoch": 0.15709743781559754, "grad_norm": 65.62172980017024, "learning_rate": 4.792818094647242e-07, "logps/chosen": -72.1073989868164, "logps/rejected": -76.28890228271484, "loss": 0.695, "losses/dpo": 0.7684206366539001, "losses/sft": 0.2527555227279663, "losses/total": 0.7684206366539001, "ref_logps/chosen": -69.9959716796875, "ref_logps/rejected": -74.01309967041016, "rewards/accuracies": 0.5625, "rewards/chosen": -0.21114253997802734, "rewards/margins": 0.016437780112028122, "rewards/rejected": -0.22758030891418457, "step": 210 }, { "epoch": 0.15784552085281467, "grad_norm": 64.98636437223804, "learning_rate": 4.790393933653645e-07, "logps/chosen": -88.76152038574219, "logps/rejected": -96.153564453125, "loss": 0.6612, "losses/dpo": 0.689596951007843, "losses/sft": 0.5103479623794556, "losses/total": 0.689596951007843, "ref_logps/chosen": -86.16497039794922, "ref_logps/rejected": -92.78025817871094, "rewards/accuracies": 0.625, "rewards/chosen": -0.25965583324432373, "rewards/margins": 0.07767565548419952, "rewards/rejected": -0.33733147382736206, "step": 211 }, { "epoch": 0.1585936038900318, "grad_norm": 63.52638165181049, "learning_rate": 4.78795629327187e-07, "logps/chosen": -95.9000244140625, "logps/rejected": -99.3536148071289, "loss": 0.7085, "losses/dpo": 0.5517626404762268, "losses/sft": 0.7672191858291626, "losses/total": 0.5517626404762268, "ref_logps/chosen": -92.79862976074219, "ref_logps/rejected": -96.18455505371094, "rewards/accuracies": 0.5, "rewards/chosen": -0.3101387023925781, "rewards/margins": 0.006766693666577339, "rewards/rejected": -0.3169053792953491, "step": 212 }, { "epoch": 0.15934168692724893, "grad_norm": 67.34923297130956, "learning_rate": 4.785505187847876e-07, "logps/chosen": -109.40829467773438, "logps/rejected": -121.07888793945312, "loss": 0.6546, "losses/dpo": 0.6418917775154114, "losses/sft": 1.3359969854354858, "losses/total": 0.6418917775154114, "ref_logps/chosen": -106.17053985595703, "ref_logps/rejected": -116.66868591308594, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3237748146057129, "rewards/margins": 0.11724531650543213, "rewards/rejected": -0.441020131111145, "step": 213 }, { "epoch": 0.16008976996446606, "grad_norm": 69.35506145816413, "learning_rate": 4.783040631806867e-07, "logps/chosen": -74.02384185791016, "logps/rejected": -74.62344360351562, "loss": 0.662, "losses/dpo": 0.7410420775413513, "losses/sft": 0.23567728698253632, "losses/total": 0.7410420775413513, "ref_logps/chosen": -71.84709167480469, "ref_logps/rejected": -71.62324523925781, "rewards/accuracies": 0.625, "rewards/chosen": -0.2176753282546997, "rewards/margins": 0.08234342187643051, "rewards/rejected": -0.3000187873840332, "step": 214 }, { "epoch": 0.16083785300168318, "grad_norm": 69.67690327224206, "learning_rate": 4.78056263965321e-07, "logps/chosen": -88.0389404296875, "logps/rejected": -100.20819091796875, "loss": 0.66, "losses/dpo": 0.7266356348991394, "losses/sft": 0.9407125115394592, "losses/total": 0.7266356348991394, "ref_logps/chosen": -85.74920654296875, "ref_logps/rejected": -96.88803100585938, "rewards/accuracies": 0.625, "rewards/chosen": -0.22897294163703918, "rewards/margins": 0.10304364562034607, "rewards/rejected": -0.33201658725738525, "step": 215 }, { "epoch": 0.1615859360389003, "grad_norm": 58.34517487725544, "learning_rate": 4.778071225970339e-07, "logps/chosen": -78.19352722167969, "logps/rejected": -100.37606048583984, "loss": 0.6207, "losses/dpo": 0.6452184319496155, "losses/sft": 0.5895382761955261, "losses/total": 0.6452184319496155, "ref_logps/chosen": -75.79325103759766, "ref_logps/rejected": -96.1751708984375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.24002709984779358, "rewards/margins": 0.18006141483783722, "rewards/rejected": -0.420088529586792, "step": 216 }, { "epoch": 0.16233401907611744, "grad_norm": 57.6268942929129, "learning_rate": 4.775566405420683e-07, "logps/chosen": -74.75460815429688, "logps/rejected": -85.69587707519531, "loss": 0.6465, "losses/dpo": 0.6597747206687927, "losses/sft": 0.7359213829040527, "losses/total": 0.6597747206687927, "ref_logps/chosen": -72.70348358154297, "ref_logps/rejected": -82.42532348632812, "rewards/accuracies": 0.625, "rewards/chosen": -0.20511272549629211, "rewards/margins": 0.12194281816482544, "rewards/rejected": -0.32705554366111755, "step": 217 }, { "epoch": 0.16308210211333457, "grad_norm": 88.17080593187724, "learning_rate": 4.77304819274557e-07, "logps/chosen": -75.875732421875, "logps/rejected": -92.7216796875, "loss": 0.648, "losses/dpo": 0.6706427335739136, "losses/sft": 0.848543643951416, "losses/total": 0.6706427335739136, "ref_logps/chosen": -72.89729309082031, "ref_logps/rejected": -88.56626892089844, "rewards/accuracies": 0.5625, "rewards/chosen": -0.29784324765205383, "rewards/margins": 0.11769694089889526, "rewards/rejected": -0.4155401587486267, "step": 218 }, { "epoch": 0.1638301851505517, "grad_norm": 57.789660669490004, "learning_rate": 4.770516602765143e-07, "logps/chosen": -95.96363830566406, "logps/rejected": -103.35125732421875, "loss": 0.7031, "losses/dpo": 0.653843343257904, "losses/sft": 1.1218945980072021, "losses/total": 0.653843343257904, "ref_logps/chosen": -92.65876770019531, "ref_logps/rejected": -99.8332748413086, "rewards/accuracies": 0.59375, "rewards/chosen": -0.33048632740974426, "rewards/margins": 0.021313201636075974, "rewards/rejected": -0.35179954767227173, "step": 219 }, { "epoch": 0.16457826818776886, "grad_norm": 83.40912546322957, "learning_rate": 4.767971650378272e-07, "logps/chosen": -95.22262573242188, "logps/rejected": -104.5376968383789, "loss": 0.6345, "losses/dpo": 0.5925936698913574, "losses/sft": 0.8302385807037354, "losses/total": 0.5925936698913574, "ref_logps/chosen": -92.50894927978516, "ref_logps/rejected": -100.480224609375, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2713678181171417, "rewards/margins": 0.13437838852405548, "rewards/rejected": -0.4057462215423584, "step": 220 }, { "epoch": 0.16532635122498598, "grad_norm": 50.65604723937434, "learning_rate": 4.7654133505624693e-07, "logps/chosen": -85.13906860351562, "logps/rejected": -83.90724182128906, "loss": 0.7001, "losses/dpo": 0.7771542072296143, "losses/sft": 0.7277609705924988, "losses/total": 0.7771542072296143, "ref_logps/chosen": -81.86061096191406, "ref_logps/rejected": -80.58465576171875, "rewards/accuracies": 0.46875, "rewards/chosen": -0.3278459310531616, "rewards/margins": 0.004412924870848656, "rewards/rejected": -0.33225885033607483, "step": 221 }, { "epoch": 0.1660744342622031, "grad_norm": 130.03414316036833, "learning_rate": 4.762841718373799e-07, "logps/chosen": -72.63302612304688, "logps/rejected": -82.07251739501953, "loss": 0.6797, "losses/dpo": 0.6998245716094971, "losses/sft": 0.46153250336647034, "losses/total": 0.6998245716094971, "ref_logps/chosen": -70.30513763427734, "ref_logps/rejected": -79.24848175048828, "rewards/accuracies": 0.5, "rewards/chosen": -0.2327888458967209, "rewards/margins": 0.049615297466516495, "rewards/rejected": -0.2824041545391083, "step": 222 }, { "epoch": 0.16682251729942024, "grad_norm": 57.12479724472424, "learning_rate": 4.7602567689467865e-07, "logps/chosen": -90.8504867553711, "logps/rejected": -108.03956604003906, "loss": 0.6346, "losses/dpo": 0.5499506592750549, "losses/sft": 1.564050555229187, "losses/total": 0.5499506592750549, "ref_logps/chosen": -88.17778778076172, "ref_logps/rejected": -103.74031829833984, "rewards/accuracies": 0.6875, "rewards/chosen": -0.26726964116096497, "rewards/margins": 0.16265526413917542, "rewards/rejected": -0.4299249053001404, "step": 223 }, { "epoch": 0.16757060033663737, "grad_norm": 78.96649217415893, "learning_rate": 4.757658517494334e-07, "logps/chosen": -107.5459213256836, "logps/rejected": -113.39949035644531, "loss": 0.6962, "losses/dpo": 0.6774884462356567, "losses/sft": 1.0425585508346558, "losses/total": 0.6774884462356567, "ref_logps/chosen": -103.65875244140625, "ref_logps/rejected": -109.34613800048828, "rewards/accuracies": 0.5, "rewards/chosen": -0.38871684670448303, "rewards/margins": 0.01661720871925354, "rewards/rejected": -0.4053340256214142, "step": 224 }, { "epoch": 0.1683186833738545, "grad_norm": 72.33695994864885, "learning_rate": 4.7550469793076277e-07, "logps/chosen": -89.77531433105469, "logps/rejected": -99.06884765625, "loss": 0.6534, "losses/dpo": 0.7009602189064026, "losses/sft": 0.6265588998794556, "losses/total": 0.7009602189064026, "ref_logps/chosen": -87.06463623046875, "ref_logps/rejected": -95.24343872070312, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2710670232772827, "rewards/margins": 0.11147421598434448, "rewards/rejected": -0.3825412690639496, "step": 225 }, { "epoch": 0.16906676641107163, "grad_norm": 56.59974679923963, "learning_rate": 4.752422169756047e-07, "logps/chosen": -84.93038940429688, "logps/rejected": -97.15705871582031, "loss": 0.6269, "losses/dpo": 0.723613977432251, "losses/sft": 0.8945620656013489, "losses/total": 0.723613977432251, "ref_logps/chosen": -82.42671203613281, "ref_logps/rejected": -92.802734375, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2503688931465149, "rewards/margins": 0.18506431579589844, "rewards/rejected": -0.43543320894241333, "step": 226 }, { "epoch": 0.16981484944828876, "grad_norm": 61.16953620970551, "learning_rate": 4.7497841042870803e-07, "logps/chosen": -91.43177032470703, "logps/rejected": -99.3667984008789, "loss": 0.6678, "losses/dpo": 0.67955482006073, "losses/sft": 0.6140272617340088, "losses/total": 0.67955482006073, "ref_logps/chosen": -88.38743591308594, "ref_logps/rejected": -95.33942413330078, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3044336140155792, "rewards/margins": 0.09830430150032043, "rewards/rejected": -0.40273791551589966, "step": 227 }, { "epoch": 0.17056293248550589, "grad_norm": 71.63131562194934, "learning_rate": 4.7471327984262237e-07, "logps/chosen": -80.15815734863281, "logps/rejected": -85.17037200927734, "loss": 0.7073, "losses/dpo": 0.7598516941070557, "losses/sft": 0.5849785804748535, "losses/total": 0.7598516941070557, "ref_logps/chosen": -77.1418228149414, "ref_logps/rejected": -82.10748291015625, "rewards/accuracies": 0.53125, "rewards/chosen": -0.3016333281993866, "rewards/margins": 0.004654787480831146, "rewards/rejected": -0.30628809332847595, "step": 228 }, { "epoch": 0.17131101552272301, "grad_norm": 73.56114566304943, "learning_rate": 4.7444682677769e-07, "logps/chosen": -112.607177734375, "logps/rejected": -105.06341552734375, "loss": 0.7263, "losses/dpo": 0.7197579741477966, "losses/sft": 0.5426099300384521, "losses/total": 0.7197579741477966, "ref_logps/chosen": -108.126220703125, "ref_logps/rejected": -100.90140533447266, "rewards/accuracies": 0.46875, "rewards/chosen": -0.4480959177017212, "rewards/margins": -0.03189530968666077, "rewards/rejected": -0.4162006378173828, "step": 229 }, { "epoch": 0.17205909855994014, "grad_norm": 61.402470456353065, "learning_rate": 4.741790528020359e-07, "logps/chosen": -120.32759094238281, "logps/rejected": -123.74312591552734, "loss": 0.6921, "losses/dpo": 0.6256581544876099, "losses/sft": 0.8690157532691956, "losses/total": 0.6256581544876099, "ref_logps/chosen": -116.40013885498047, "ref_logps/rejected": -119.19017791748047, "rewards/accuracies": 0.59375, "rewards/chosen": -0.39274489879608154, "rewards/margins": 0.06255073100328445, "rewards/rejected": -0.4552956521511078, "step": 230 }, { "epoch": 0.17280718159715727, "grad_norm": 65.76615657940492, "learning_rate": 4.739099594915591e-07, "logps/chosen": -108.2529525756836, "logps/rejected": -103.0648422241211, "loss": 0.7342, "losses/dpo": 0.831771969795227, "losses/sft": 0.9272911548614502, "losses/total": 0.831771969795227, "ref_logps/chosen": -103.96061706542969, "ref_logps/rejected": -99.42987060546875, "rewards/accuracies": 0.21875, "rewards/chosen": -0.42923277616500854, "rewards/margins": -0.06573548913002014, "rewards/rejected": -0.3634972870349884, "step": 231 }, { "epoch": 0.17355526463437443, "grad_norm": 132.5769807711101, "learning_rate": 4.7363954842992317e-07, "logps/chosen": -94.0348892211914, "logps/rejected": -96.58341217041016, "loss": 0.6719, "losses/dpo": 0.6388920545578003, "losses/sft": 1.543299913406372, "losses/total": 0.6388920545578003, "ref_logps/chosen": -90.12271118164062, "ref_logps/rejected": -91.96390533447266, "rewards/accuracies": 0.5, "rewards/chosen": -0.3912181854248047, "rewards/margins": 0.07073293626308441, "rewards/rejected": -0.4619510769844055, "step": 232 }, { "epoch": 0.17430334767159156, "grad_norm": 50.40660026981996, "learning_rate": 4.733678212085465e-07, "logps/chosen": -86.1070556640625, "logps/rejected": -96.65918731689453, "loss": 0.6414, "losses/dpo": 0.6229254007339478, "losses/sft": 0.6304973363876343, "losses/total": 0.6229254007339478, "ref_logps/chosen": -83.26679229736328, "ref_logps/rejected": -92.43722534179688, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2840268611907959, "rewards/margins": 0.13816921412944794, "rewards/rejected": -0.42219603061676025, "step": 233 }, { "epoch": 0.1750514307088087, "grad_norm": 64.65840805383058, "learning_rate": 4.730947794265939e-07, "logps/chosen": -106.18317413330078, "logps/rejected": -104.25999450683594, "loss": 0.6712, "losses/dpo": 0.6699508428573608, "losses/sft": 0.3082871437072754, "losses/total": 0.6699508428573608, "ref_logps/chosen": -102.49380493164062, "ref_logps/rejected": -99.91177368164062, "rewards/accuracies": 0.65625, "rewards/chosen": -0.36893749237060547, "rewards/margins": 0.06588459014892578, "rewards/rejected": -0.4348220229148865, "step": 234 }, { "epoch": 0.17579951374602582, "grad_norm": 55.97504490607344, "learning_rate": 4.7282042469096604e-07, "logps/chosen": -91.36622619628906, "logps/rejected": -101.64759826660156, "loss": 0.6197, "losses/dpo": 0.5326117277145386, "losses/sft": 0.9364522695541382, "losses/total": 0.5326117277145386, "ref_logps/chosen": -88.21973419189453, "ref_logps/rejected": -96.84684753417969, "rewards/accuracies": 0.78125, "rewards/chosen": -0.31464883685112, "rewards/margins": 0.1654263138771057, "rewards/rejected": -0.4800751209259033, "step": 235 }, { "epoch": 0.17654759678324294, "grad_norm": 64.93637163285183, "learning_rate": 4.725447586162911e-07, "logps/chosen": -85.95491027832031, "logps/rejected": -98.93338012695312, "loss": 0.6558, "losses/dpo": 0.7036439180374146, "losses/sft": 1.044245719909668, "losses/total": 0.7036439180374146, "ref_logps/chosen": -83.29088592529297, "ref_logps/rejected": -95.32159423828125, "rewards/accuracies": 0.625, "rewards/chosen": -0.26640161871910095, "rewards/margins": 0.09477624297142029, "rewards/rejected": -0.36117789149284363, "step": 236 }, { "epoch": 0.17729567982046007, "grad_norm": 57.5924609614907, "learning_rate": 4.722677828249142e-07, "logps/chosen": -100.56997680664062, "logps/rejected": -111.54186248779297, "loss": 0.616, "losses/dpo": 0.6542568206787109, "losses/sft": 0.886088490486145, "losses/total": 0.6542568206787109, "ref_logps/chosen": -98.06915283203125, "ref_logps/rejected": -106.94844055175781, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2500826120376587, "rewards/margins": 0.20925939083099365, "rewards/rejected": -0.45934200286865234, "step": 237 }, { "epoch": 0.1780437628576772, "grad_norm": 80.24346336291565, "learning_rate": 4.719894989468889e-07, "logps/chosen": -84.34761047363281, "logps/rejected": -87.77833557128906, "loss": 0.6464, "losses/dpo": 0.6471801996231079, "losses/sft": 0.25940489768981934, "losses/total": 0.6471801996231079, "ref_logps/chosen": -82.03948974609375, "ref_logps/rejected": -84.20594024658203, "rewards/accuracies": 0.625, "rewards/chosen": -0.23081135749816895, "rewards/margins": 0.1264292150735855, "rewards/rejected": -0.35724058747291565, "step": 238 }, { "epoch": 0.17879184589489433, "grad_norm": 162.3656912845162, "learning_rate": 4.7170990861996667e-07, "logps/chosen": -92.42719268798828, "logps/rejected": -98.07213592529297, "loss": 0.6512, "losses/dpo": 0.6098382472991943, "losses/sft": 0.8222925662994385, "losses/total": 0.6098382472991943, "ref_logps/chosen": -88.89089965820312, "ref_logps/rejected": -93.48404693603516, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3536303639411926, "rewards/margins": 0.10517877340316772, "rewards/rejected": -0.4588090777397156, "step": 239 }, { "epoch": 0.17953992893211146, "grad_norm": 67.66375191836049, "learning_rate": 4.714290134895879e-07, "logps/chosen": -107.26530456542969, "logps/rejected": -110.9176025390625, "loss": 0.6655, "losses/dpo": 0.6038296818733215, "losses/sft": 1.5718603134155273, "losses/total": 0.6038296818733215, "ref_logps/chosen": -103.86514282226562, "ref_logps/rejected": -106.67314147949219, "rewards/accuracies": 0.59375, "rewards/chosen": -0.34001588821411133, "rewards/margins": 0.08443014323711395, "rewards/rejected": -0.4244459867477417, "step": 240 }, { "epoch": 0.1802880119693286, "grad_norm": 228.18708567606524, "learning_rate": 4.711468152088719e-07, "logps/chosen": -117.52373504638672, "logps/rejected": -114.14984130859375, "loss": 0.6899, "losses/dpo": 0.7335980534553528, "losses/sft": 1.0017744302749634, "losses/total": 0.7335980534553528, "ref_logps/chosen": -113.81645202636719, "ref_logps/rejected": -110.091796875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.37072888016700745, "rewards/margins": 0.03507557138800621, "rewards/rejected": -0.40580445528030396, "step": 241 }, { "epoch": 0.18103609500654572, "grad_norm": 74.52984972264176, "learning_rate": 4.7086331543860735e-07, "logps/chosen": -93.05411529541016, "logps/rejected": -110.33512878417969, "loss": 0.6403, "losses/dpo": 0.7402252554893494, "losses/sft": 1.0248353481292725, "losses/total": 0.7402252554893494, "ref_logps/chosen": -89.87931060791016, "ref_logps/rejected": -105.73238372802734, "rewards/accuracies": 0.625, "rewards/chosen": -0.31748029589653015, "rewards/margins": 0.14279381930828094, "rewards/rejected": -0.4602741301059723, "step": 242 }, { "epoch": 0.18178417804376285, "grad_norm": 50.63689230180159, "learning_rate": 4.705785158472423e-07, "logps/chosen": -85.59888458251953, "logps/rejected": -86.82147216796875, "loss": 0.6556, "losses/dpo": 0.5202546715736389, "losses/sft": 0.6494794487953186, "losses/total": 0.5202546715736389, "ref_logps/chosen": -82.95504760742188, "ref_logps/rejected": -83.22807312011719, "rewards/accuracies": 0.75, "rewards/chosen": -0.26438435912132263, "rewards/margins": 0.09495627135038376, "rewards/rejected": -0.3593406081199646, "step": 243 }, { "epoch": 0.18253226108098, "grad_norm": 81.35985024527513, "learning_rate": 4.702924181108745e-07, "logps/chosen": -90.64015197753906, "logps/rejected": -107.41213989257812, "loss": 0.6497, "losses/dpo": 0.5157041549682617, "losses/sft": 0.7819052934646606, "losses/total": 0.5157041549682617, "ref_logps/chosen": -86.97454071044922, "ref_logps/rejected": -102.51929473876953, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3665601313114166, "rewards/margins": 0.1227240264415741, "rewards/rejected": -0.4892841577529907, "step": 244 }, { "epoch": 0.18328034411819713, "grad_norm": 66.8554654833798, "learning_rate": 4.700050239132417e-07, "logps/chosen": -94.07839965820312, "logps/rejected": -89.87190246582031, "loss": 0.7124, "losses/dpo": 0.613213062286377, "losses/sft": 0.8383092284202576, "losses/total": 0.613213062286377, "ref_logps/chosen": -90.79380798339844, "ref_logps/rejected": -86.69174194335938, "rewards/accuracies": 0.46875, "rewards/chosen": -0.3284587860107422, "rewards/margins": -0.01044262945652008, "rewards/rejected": -0.3180161416530609, "step": 245 }, { "epoch": 0.18402842715541426, "grad_norm": 68.70019763768182, "learning_rate": 4.697163349457114e-07, "logps/chosen": -82.46429443359375, "logps/rejected": -95.87737274169922, "loss": 0.672, "losses/dpo": 0.6279751062393188, "losses/sft": 0.6785013675689697, "losses/total": 0.6279751062393188, "ref_logps/chosen": -79.66818237304688, "ref_logps/rejected": -92.4164810180664, "rewards/accuracies": 0.53125, "rewards/chosen": -0.2796117663383484, "rewards/margins": 0.06647655367851257, "rewards/rejected": -0.34608832001686096, "step": 246 }, { "epoch": 0.1847765101926314, "grad_norm": 80.38386505739918, "learning_rate": 4.694263529072711e-07, "logps/chosen": -77.54638671875, "logps/rejected": -101.57247161865234, "loss": 0.6118, "losses/dpo": 0.6513428688049316, "losses/sft": 0.21454240381717682, "losses/total": 0.6513428688049316, "ref_logps/chosen": -74.94630432128906, "ref_logps/rejected": -96.91883850097656, "rewards/accuracies": 0.71875, "rewards/chosen": -0.26000767946243286, "rewards/margins": 0.20535647869110107, "rewards/rejected": -0.4653642177581787, "step": 247 }, { "epoch": 0.18552459322984852, "grad_norm": 68.58401478567514, "learning_rate": 4.6913507950451825e-07, "logps/chosen": -102.9339599609375, "logps/rejected": -109.94276428222656, "loss": 0.6491, "losses/dpo": 0.644589900970459, "losses/sft": 1.2593624591827393, "losses/total": 0.644589900970459, "ref_logps/chosen": -99.09828186035156, "ref_logps/rejected": -104.84109497070312, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3835674524307251, "rewards/margins": 0.12659871578216553, "rewards/rejected": -0.5101661682128906, "step": 248 }, { "epoch": 0.18627267626706565, "grad_norm": 61.896575803443326, "learning_rate": 4.6884251645165017e-07, "logps/chosen": -80.48027038574219, "logps/rejected": -82.9797134399414, "loss": 0.6833, "losses/dpo": 0.742551326751709, "losses/sft": 0.9538648128509521, "losses/total": 0.742551326751709, "ref_logps/chosen": -77.43159484863281, "ref_logps/rejected": -79.47023010253906, "rewards/accuracies": 0.53125, "rewards/chosen": -0.3048674464225769, "rewards/margins": 0.04608059674501419, "rewards/rejected": -0.3509480655193329, "step": 249 }, { "epoch": 0.18702075930428277, "grad_norm": 65.27289054088986, "learning_rate": 4.6854866547045414e-07, "logps/chosen": -65.68275451660156, "logps/rejected": -78.28736877441406, "loss": 0.639, "losses/dpo": 0.5620585680007935, "losses/sft": 0.6518762707710266, "losses/total": 0.5620585680007935, "ref_logps/chosen": -63.607147216796875, "ref_logps/rejected": -74.86917114257812, "rewards/accuracies": 0.5625, "rewards/chosen": -0.20756109058856964, "rewards/margins": 0.13425865769386292, "rewards/rejected": -0.34181976318359375, "step": 250 }, { "epoch": 0.1877688423414999, "grad_norm": 62.591958746416736, "learning_rate": 4.68253528290297e-07, "logps/chosen": -98.32147216796875, "logps/rejected": -111.00888061523438, "loss": 0.6, "losses/dpo": 0.7482824921607971, "losses/sft": 0.9382582306861877, "losses/total": 0.7482824921607971, "ref_logps/chosen": -95.20552825927734, "ref_logps/rejected": -105.52839660644531, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3115936815738678, "rewards/margins": 0.23645338416099548, "rewards/rejected": -0.5480470657348633, "step": 251 }, { "epoch": 0.18851692537871703, "grad_norm": 74.87759524832592, "learning_rate": 4.679571066481152e-07, "logps/chosen": -98.27581787109375, "logps/rejected": -103.1233139038086, "loss": 0.6554, "losses/dpo": 0.706989586353302, "losses/sft": 0.632639467716217, "losses/total": 0.706989586353302, "ref_logps/chosen": -95.04934692382812, "ref_logps/rejected": -98.99075317382812, "rewards/accuracies": 0.625, "rewards/chosen": -0.32264846563339233, "rewards/margins": 0.09060685336589813, "rewards/rejected": -0.41325533390045166, "step": 252 }, { "epoch": 0.18926500841593416, "grad_norm": 70.27402912121983, "learning_rate": 4.6765940228840444e-07, "logps/chosen": -107.07659912109375, "logps/rejected": -117.1727294921875, "loss": 0.6362, "losses/dpo": 0.6620203256607056, "losses/sft": 0.9039374589920044, "losses/total": 0.6620203256607056, "ref_logps/chosen": -103.27685546875, "ref_logps/rejected": -111.8776626586914, "rewards/accuracies": 0.6875, "rewards/chosen": -0.37997448444366455, "rewards/margins": 0.14953212440013885, "rewards/rejected": -0.5295066237449646, "step": 253 }, { "epoch": 0.1900130914531513, "grad_norm": 65.79017003849319, "learning_rate": 4.673604169632094e-07, "logps/chosen": -96.71389770507812, "logps/rejected": -99.88565826416016, "loss": 0.6677, "losses/dpo": 0.5988122224807739, "losses/sft": 0.47526541352272034, "losses/total": 0.5988122224807739, "ref_logps/chosen": -92.98429870605469, "ref_logps/rejected": -95.46641540527344, "rewards/accuracies": 0.625, "rewards/chosen": -0.37295854091644287, "rewards/margins": 0.06896503269672394, "rewards/rejected": -0.4419235587120056, "step": 254 }, { "epoch": 0.19076117449036842, "grad_norm": 69.23308034283411, "learning_rate": 4.670601524321136e-07, "logps/chosen": -120.55353546142578, "logps/rejected": -136.38595581054688, "loss": 0.6315, "losses/dpo": 0.7070225477218628, "losses/sft": 0.3683015704154968, "losses/total": 0.7070225477218628, "ref_logps/chosen": -116.44075012207031, "ref_logps/rejected": -130.65780639648438, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4112778902053833, "rewards/margins": 0.1615380197763443, "rewards/rejected": -0.5728158950805664, "step": 255 }, { "epoch": 0.19150925752758557, "grad_norm": 60.143574849206374, "learning_rate": 4.6675861046222876e-07, "logps/chosen": -92.890380859375, "logps/rejected": -93.44573974609375, "loss": 0.6747, "losses/dpo": 0.6829701662063599, "losses/sft": 1.037916660308838, "losses/total": 0.6829701662063599, "ref_logps/chosen": -89.27152252197266, "ref_logps/rejected": -89.25787353515625, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3618856370449066, "rewards/margins": 0.05690015107393265, "rewards/rejected": -0.41878578066825867, "step": 256 }, { "epoch": 0.1922573405648027, "grad_norm": 62.20460034374114, "learning_rate": 4.664557928281848e-07, "logps/chosen": -100.36891174316406, "logps/rejected": -89.64251708984375, "loss": 0.6764, "losses/dpo": 0.7482361793518066, "losses/sft": 0.8949476480484009, "losses/total": 0.7482361793518066, "ref_logps/chosen": -97.0692367553711, "ref_logps/rejected": -85.85588073730469, "rewards/accuracies": 0.4375, "rewards/chosen": -0.32996755838394165, "rewards/margins": 0.04869517683982849, "rewards/rejected": -0.37866270542144775, "step": 257 }, { "epoch": 0.19300542360201983, "grad_norm": 60.37074326382915, "learning_rate": 4.6615170131211883e-07, "logps/chosen": -105.66846466064453, "logps/rejected": -122.65033721923828, "loss": 0.6783, "losses/dpo": 0.7969935536384583, "losses/sft": 0.7166157960891724, "losses/total": 0.7969935536384583, "ref_logps/chosen": -101.50288391113281, "ref_logps/rejected": -117.85450744628906, "rewards/accuracies": 0.5, "rewards/chosen": -0.41655784845352173, "rewards/margins": 0.06302426755428314, "rewards/rejected": -0.47958213090896606, "step": 258 }, { "epoch": 0.19375350663923696, "grad_norm": 69.27602869575789, "learning_rate": 4.658463377036653e-07, "logps/chosen": -114.51446533203125, "logps/rejected": -111.44181823730469, "loss": 0.6856, "losses/dpo": 0.7087757587432861, "losses/sft": 1.4299794435501099, "losses/total": 0.7087757587432861, "ref_logps/chosen": -110.12324523925781, "ref_logps/rejected": -106.53305053710938, "rewards/accuracies": 0.5625, "rewards/chosen": -0.43912339210510254, "rewards/margins": 0.051752474159002304, "rewards/rejected": -0.49087584018707275, "step": 259 }, { "epoch": 0.1945015896764541, "grad_norm": 78.66290897618897, "learning_rate": 4.65539703799945e-07, "logps/chosen": -114.02323913574219, "logps/rejected": -117.08036041259766, "loss": 0.6643, "losses/dpo": 0.8926162719726562, "losses/sft": 0.913337230682373, "losses/total": 0.8926162719726562, "ref_logps/chosen": -109.80601501464844, "ref_logps/rejected": -111.79730224609375, "rewards/accuracies": 0.5, "rewards/chosen": -0.421722948551178, "rewards/margins": 0.10658328980207443, "rewards/rejected": -0.528306245803833, "step": 260 }, { "epoch": 0.19524967271367122, "grad_norm": 60.798630123642575, "learning_rate": 4.652318014055546e-07, "logps/chosen": -101.66409301757812, "logps/rejected": -117.11936950683594, "loss": 0.6179, "losses/dpo": 0.7113232612609863, "losses/sft": 0.9840794205665588, "losses/total": 0.7113232612609863, "ref_logps/chosen": -98.35137176513672, "ref_logps/rejected": -111.89437866210938, "rewards/accuracies": 0.6875, "rewards/chosen": -0.331271767616272, "rewards/margins": 0.1912284940481186, "rewards/rejected": -0.522500216960907, "step": 261 }, { "epoch": 0.19599775575088835, "grad_norm": 60.29023364863485, "learning_rate": 4.6492263233255623e-07, "logps/chosen": -70.79317474365234, "logps/rejected": -73.7172622680664, "loss": 0.662, "losses/dpo": 0.6983675360679626, "losses/sft": 0.9190975427627563, "losses/total": 0.6983675360679626, "ref_logps/chosen": -67.98592376708984, "ref_logps/rejected": -69.97438049316406, "rewards/accuracies": 0.65625, "rewards/chosen": -0.28072503209114075, "rewards/margins": 0.09356343746185303, "rewards/rejected": -0.3742884397506714, "step": 262 }, { "epoch": 0.19674583878810548, "grad_norm": 65.50004129190089, "learning_rate": 4.646121984004665e-07, "logps/chosen": -102.90118408203125, "logps/rejected": -109.82378387451172, "loss": 0.6524, "losses/dpo": 0.8180959820747375, "losses/sft": 0.9161641597747803, "losses/total": 0.8180959820747375, "ref_logps/chosen": -99.5177230834961, "ref_logps/rejected": -105.0859375, "rewards/accuracies": 0.65625, "rewards/chosen": -0.338345468044281, "rewards/margins": 0.13543924689292908, "rewards/rejected": -0.4737846851348877, "step": 263 }, { "epoch": 0.1974939218253226, "grad_norm": 66.44439716011385, "learning_rate": 4.6430050143624607e-07, "logps/chosen": -92.55293273925781, "logps/rejected": -99.24625396728516, "loss": 0.6793, "losses/dpo": 0.6664392948150635, "losses/sft": 0.6990221738815308, "losses/total": 0.6664392948150635, "ref_logps/chosen": -89.01671600341797, "ref_logps/rejected": -95.15377044677734, "rewards/accuracies": 0.5625, "rewards/chosen": -0.353621244430542, "rewards/margins": 0.05562663450837135, "rewards/rejected": -0.40924787521362305, "step": 264 }, { "epoch": 0.19824200486253973, "grad_norm": 100.18125379180282, "learning_rate": 4.639875432742886e-07, "logps/chosen": -69.94048309326172, "logps/rejected": -74.97105407714844, "loss": 0.6996, "losses/dpo": 0.6656035780906677, "losses/sft": 0.24509817361831665, "losses/total": 0.6656035780906677, "ref_logps/chosen": -65.98180389404297, "ref_logps/rejected": -70.8719482421875, "rewards/accuracies": 0.5, "rewards/chosen": -0.39586764574050903, "rewards/margins": 0.014042830094695091, "rewards/rejected": -0.4099104404449463, "step": 265 }, { "epoch": 0.19899008789975686, "grad_norm": 71.93541310827106, "learning_rate": 4.636733257564104e-07, "logps/chosen": -81.2688980102539, "logps/rejected": -90.12744140625, "loss": 0.6605, "losses/dpo": 0.6667319536209106, "losses/sft": 0.6594846844673157, "losses/total": 0.6667319536209106, "ref_logps/chosen": -77.82425689697266, "ref_logps/rejected": -85.76838684082031, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3444635272026062, "rewards/margins": 0.09144194424152374, "rewards/rejected": -0.4359055161476135, "step": 266 }, { "epoch": 0.199738170936974, "grad_norm": 63.05122415320751, "learning_rate": 4.63357850731839e-07, "logps/chosen": -105.55098724365234, "logps/rejected": -101.25495910644531, "loss": 0.7066, "losses/dpo": 0.5862468481063843, "losses/sft": 0.9539228081703186, "losses/total": 0.5862468481063843, "ref_logps/chosen": -101.00293731689453, "ref_logps/rejected": -96.6338882446289, "rewards/accuracies": 0.46875, "rewards/chosen": -0.4548049569129944, "rewards/margins": 0.007302125915884972, "rewards/rejected": -0.4621070921421051, "step": 267 }, { "epoch": 0.20048625397419115, "grad_norm": 63.329118511426685, "learning_rate": 4.6304112005720287e-07, "logps/chosen": -80.62962341308594, "logps/rejected": -79.70521545410156, "loss": 0.6788, "losses/dpo": 0.7173545360565186, "losses/sft": 0.6766791343688965, "losses/total": 0.7173545360565186, "ref_logps/chosen": -76.78077697753906, "ref_logps/rejected": -75.28585815429688, "rewards/accuracies": 0.625, "rewards/chosen": -0.38488414883613586, "rewards/margins": 0.057050809264183044, "rewards/rejected": -0.4419349431991577, "step": 268 }, { "epoch": 0.20048625397419115, "eval_logps/chosen": -37.69032287597656, "eval_logps/rejected": -41.96424102783203, "eval_loss": 0.6628445386886597, "eval_losses/dpo": 0.6699182391166687, "eval_losses/sft": 0.31613361835479736, "eval_losses/total": 0.6699182391166687, "eval_ref_logps/chosen": -35.788204193115234, "eval_ref_logps/rejected": -39.235660552978516, "eval_rewards/accuracies": 0.6142241358757019, "eval_rewards/chosen": -0.1902119517326355, "eval_rewards/margins": 0.08264617621898651, "eval_rewards/rejected": -0.2728581130504608, "eval_runtime": 38.051, "eval_samples_per_second": 12.168, "eval_steps_per_second": 1.524, "step": 268 }, { "epoch": 0.20123433701140828, "grad_norm": 74.65253820888512, "learning_rate": 4.627231355965201e-07, "logps/chosen": -96.11650085449219, "logps/rejected": -94.89401245117188, "loss": 0.7328, "losses/dpo": 0.7660701274871826, "losses/sft": 1.2651734352111816, "losses/total": 0.7660701274871826, "ref_logps/chosen": -92.3476791381836, "ref_logps/rejected": -91.51785278320312, "rewards/accuracies": 0.46875, "rewards/chosen": -0.3768818974494934, "rewards/margins": -0.039266668260097504, "rewards/rejected": -0.3376152515411377, "step": 269 }, { "epoch": 0.2019824200486254, "grad_norm": 61.84568502165967, "learning_rate": 4.624038992211874e-07, "logps/chosen": -97.30059814453125, "logps/rejected": -100.09675598144531, "loss": 0.6658, "losses/dpo": 0.560997724533081, "losses/sft": 1.075927495956421, "losses/total": 0.560997724533081, "ref_logps/chosen": -93.96491241455078, "ref_logps/rejected": -95.70162963867188, "rewards/accuracies": 0.625, "rewards/chosen": -0.3335689902305603, "rewards/margins": 0.10594379156827927, "rewards/rejected": -0.43951278924942017, "step": 270 }, { "epoch": 0.20273050308584253, "grad_norm": 54.76846242566015, "learning_rate": 4.6208341280996955e-07, "logps/chosen": -97.57977294921875, "logps/rejected": -103.1713638305664, "loss": 0.6211, "losses/dpo": 0.46571481227874756, "losses/sft": 0.6469762325286865, "losses/total": 0.46571481227874756, "ref_logps/chosen": -93.92818450927734, "ref_logps/rejected": -97.56671142578125, "rewards/accuracies": 0.5625, "rewards/chosen": -0.36515921354293823, "rewards/margins": 0.1953059732913971, "rewards/rejected": -0.5604652166366577, "step": 271 }, { "epoch": 0.20347858612305966, "grad_norm": 83.2639183117204, "learning_rate": 4.617616782489877e-07, "logps/chosen": -105.46704864501953, "logps/rejected": -107.84443664550781, "loss": 0.6541, "losses/dpo": 0.6842960119247437, "losses/sft": 0.6073062419891357, "losses/total": 0.6842960119247437, "ref_logps/chosen": -101.16836547851562, "ref_logps/rejected": -102.29286193847656, "rewards/accuracies": 0.5625, "rewards/chosen": -0.42986780405044556, "rewards/margins": 0.12528976798057556, "rewards/rejected": -0.5551576018333435, "step": 272 }, { "epoch": 0.2042266691602768, "grad_norm": 57.0999191951681, "learning_rate": 4.614386974317088e-07, "logps/chosen": -73.75599670410156, "logps/rejected": -87.5391616821289, "loss": 0.6446, "losses/dpo": 0.6055724620819092, "losses/sft": 0.7901456356048584, "losses/total": 0.6055724620819092, "ref_logps/chosen": -71.3878173828125, "ref_logps/rejected": -83.95172119140625, "rewards/accuracies": 0.53125, "rewards/chosen": -0.2368178367614746, "rewards/margins": 0.12192633748054504, "rewards/rejected": -0.35874420404434204, "step": 273 }, { "epoch": 0.20497475219749392, "grad_norm": 62.15707527330064, "learning_rate": 4.6111447225893405e-07, "logps/chosen": -102.66200256347656, "logps/rejected": -112.4920883178711, "loss": 0.6474, "losses/dpo": 0.5891335010528564, "losses/sft": 0.9484972953796387, "losses/total": 0.5891335010528564, "ref_logps/chosen": -98.54915618896484, "ref_logps/rejected": -106.82636260986328, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4112843871116638, "rewards/margins": 0.15528768301010132, "rewards/rejected": -0.5665720701217651, "step": 274 }, { "epoch": 0.20572283523471105, "grad_norm": 66.65601706162325, "learning_rate": 4.6078900463878823e-07, "logps/chosen": -67.12260437011719, "logps/rejected": -82.78294372558594, "loss": 0.6263, "losses/dpo": 0.658736526966095, "losses/sft": 0.26026275753974915, "losses/total": 0.658736526966095, "ref_logps/chosen": -64.51988220214844, "ref_logps/rejected": -78.55741882324219, "rewards/accuracies": 0.6875, "rewards/chosen": -0.260272353887558, "rewards/margins": 0.16228020191192627, "rewards/rejected": -0.42255258560180664, "step": 275 }, { "epoch": 0.20647091827192818, "grad_norm": 51.99222011321526, "learning_rate": 4.604622964867078e-07, "logps/chosen": -94.54098510742188, "logps/rejected": -113.33656311035156, "loss": 0.6385, "losses/dpo": 0.617497444152832, "losses/sft": 0.9880934953689575, "losses/total": 0.617497444152832, "ref_logps/chosen": -91.16566467285156, "ref_logps/rejected": -108.4525146484375, "rewards/accuracies": 0.625, "rewards/chosen": -0.3375318646430969, "rewards/margins": 0.1508719027042389, "rewards/rejected": -0.4884037673473358, "step": 276 }, { "epoch": 0.2072190013091453, "grad_norm": 54.62479652000256, "learning_rate": 4.6013434972543007e-07, "logps/chosen": -78.4320297241211, "logps/rejected": -89.89421844482422, "loss": 0.6723, "losses/dpo": 0.5096994638442993, "losses/sft": 0.7861394882202148, "losses/total": 0.5096994638442993, "ref_logps/chosen": -75.0577392578125, "ref_logps/rejected": -85.6985855102539, "rewards/accuracies": 0.59375, "rewards/chosen": -0.33742886781692505, "rewards/margins": 0.08213449269533157, "rewards/rejected": -0.41956332325935364, "step": 277 }, { "epoch": 0.20796708434636244, "grad_norm": 115.56814904690303, "learning_rate": 4.5980516628498193e-07, "logps/chosen": -98.96047973632812, "logps/rejected": -111.81144714355469, "loss": 0.6587, "losses/dpo": 0.7479885816574097, "losses/sft": 1.01896333694458, "losses/total": 0.7479885816574097, "ref_logps/chosen": -95.40351867675781, "ref_logps/rejected": -107.20159912109375, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3556954562664032, "rewards/margins": 0.10529030859470367, "rewards/rejected": -0.4609857499599457, "step": 278 }, { "epoch": 0.20871516738357956, "grad_norm": 72.47201863657455, "learning_rate": 4.594747481026684e-07, "logps/chosen": -68.52212524414062, "logps/rejected": -76.67103576660156, "loss": 0.6496, "losses/dpo": 0.6674209833145142, "losses/sft": 0.7476940751075745, "losses/total": 0.6674209833145142, "ref_logps/chosen": -65.82398223876953, "ref_logps/rejected": -72.77177429199219, "rewards/accuracies": 0.6875, "rewards/chosen": -0.26981455087661743, "rewards/margins": 0.12011167407035828, "rewards/rejected": -0.3899262249469757, "step": 279 }, { "epoch": 0.20946325042079672, "grad_norm": 56.98550476678065, "learning_rate": 4.591430971230609e-07, "logps/chosen": -102.06346130371094, "logps/rejected": -114.8024673461914, "loss": 0.6038, "losses/dpo": 0.6156561970710754, "losses/sft": 0.5474119782447815, "losses/total": 0.6156561970710754, "ref_logps/chosen": -99.18653869628906, "ref_logps/rejected": -109.6315689086914, "rewards/accuracies": 0.6875, "rewards/chosen": -0.28769195079803467, "rewards/margins": 0.22939801216125488, "rewards/rejected": -0.5170899629592896, "step": 280 }, { "epoch": 0.21021133345801385, "grad_norm": 59.94290742955156, "learning_rate": 4.5881021529798625e-07, "logps/chosen": -99.44277954101562, "logps/rejected": -109.2183837890625, "loss": 0.6231, "losses/dpo": 0.5537616014480591, "losses/sft": 0.8191556334495544, "losses/total": 0.5537616014480591, "ref_logps/chosen": -96.04500579833984, "ref_logps/rejected": -103.99049377441406, "rewards/accuracies": 0.625, "rewards/chosen": -0.33977776765823364, "rewards/margins": 0.18301048874855042, "rewards/rejected": -0.5227882862091064, "step": 281 }, { "epoch": 0.21095941649523098, "grad_norm": 75.34308210383048, "learning_rate": 4.58476104586515e-07, "logps/chosen": -119.7197265625, "logps/rejected": -117.79696655273438, "loss": 0.6538, "losses/dpo": 0.7036129236221313, "losses/sft": 1.4565376043319702, "losses/total": 0.7036129236221313, "ref_logps/chosen": -115.1100845336914, "ref_logps/rejected": -111.98704528808594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4609641432762146, "rewards/margins": 0.12002761662006378, "rewards/rejected": -0.5809917449951172, "step": 282 }, { "epoch": 0.2117074995324481, "grad_norm": 65.7392427470593, "learning_rate": 4.5814076695495005e-07, "logps/chosen": -102.37005615234375, "logps/rejected": -107.21012115478516, "loss": 0.6283, "losses/dpo": 0.5516294240951538, "losses/sft": 0.337167352437973, "losses/total": 0.5516294240951538, "ref_logps/chosen": -98.79994201660156, "ref_logps/rejected": -102.01918029785156, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3570111393928528, "rewards/margins": 0.16208258271217346, "rewards/rejected": -0.5190937519073486, "step": 283 }, { "epoch": 0.21245558256966524, "grad_norm": 57.279721403563265, "learning_rate": 4.5780420437681477e-07, "logps/chosen": -61.23653030395508, "logps/rejected": -66.80399322509766, "loss": 0.664, "losses/dpo": 0.6609764099121094, "losses/sft": 0.4224961996078491, "losses/total": 0.6609764099121094, "ref_logps/chosen": -58.724693298339844, "ref_logps/rejected": -63.48948287963867, "rewards/accuracies": 0.5, "rewards/chosen": -0.251183420419693, "rewards/margins": 0.08026783913373947, "rewards/rejected": -0.33145126700401306, "step": 284 }, { "epoch": 0.21320366560688236, "grad_norm": 55.51756637532857, "learning_rate": 4.5746641883284166e-07, "logps/chosen": -111.48076629638672, "logps/rejected": -119.73654174804688, "loss": 0.6427, "losses/dpo": 0.7341890931129456, "losses/sft": 1.5750701427459717, "losses/total": 0.7341890931129456, "ref_logps/chosen": -107.68017578125, "ref_logps/rejected": -114.63284301757812, "rewards/accuracies": 0.625, "rewards/chosen": -0.3800576627254486, "rewards/margins": 0.13031215965747833, "rewards/rejected": -0.5103697776794434, "step": 285 }, { "epoch": 0.2139517486440995, "grad_norm": 70.66976924899052, "learning_rate": 4.571274123109605e-07, "logps/chosen": -102.43988037109375, "logps/rejected": -119.24671936035156, "loss": 0.6513, "losses/dpo": 0.7890401482582092, "losses/sft": 0.8652801513671875, "losses/total": 0.7890401482582092, "ref_logps/chosen": -97.83662414550781, "ref_logps/rejected": -113.40361022949219, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4603254199028015, "rewards/margins": 0.12398591637611389, "rewards/rejected": -0.584311306476593, "step": 286 }, { "epoch": 0.21469983168131662, "grad_norm": 65.16792588671848, "learning_rate": 4.567871868062869e-07, "logps/chosen": -92.13160705566406, "logps/rejected": -93.74192810058594, "loss": 0.7253, "losses/dpo": 0.6850562691688538, "losses/sft": 0.6131348013877869, "losses/total": 0.6850562691688538, "ref_logps/chosen": -87.79933166503906, "ref_logps/rejected": -89.5985336303711, "rewards/accuracies": 0.625, "rewards/chosen": -0.43322789669036865, "rewards/margins": -0.018888473510742188, "rewards/rejected": -0.41433945298194885, "step": 287 }, { "epoch": 0.21544791471853375, "grad_norm": 73.72289716917398, "learning_rate": 4.5644574432111025e-07, "logps/chosen": -109.132568359375, "logps/rejected": -109.30514526367188, "loss": 0.7033, "losses/dpo": 0.6788849234580994, "losses/sft": 0.8769365549087524, "losses/total": 0.6788849234580994, "ref_logps/chosen": -104.67325592041016, "ref_logps/rejected": -104.527587890625, "rewards/accuracies": 0.5, "rewards/chosen": -0.4459308981895447, "rewards/margins": 0.03182465583086014, "rewards/rejected": -0.4777555465698242, "step": 288 }, { "epoch": 0.21619599775575088, "grad_norm": 113.5007120521829, "learning_rate": 4.561030868648822e-07, "logps/chosen": -110.25473022460938, "logps/rejected": -123.94915771484375, "loss": 0.7061, "losses/dpo": 0.7723401784896851, "losses/sft": 0.9212793111801147, "losses/total": 0.7723401784896851, "ref_logps/chosen": -105.72904968261719, "ref_logps/rejected": -119.193603515625, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4525676369667053, "rewards/margins": 0.02298782579600811, "rewards/rejected": -0.475555419921875, "step": 289 }, { "epoch": 0.216944080792968, "grad_norm": 69.26665527899279, "learning_rate": 4.5575921645420476e-07, "logps/chosen": -88.7147216796875, "logps/rejected": -94.62202453613281, "loss": 0.6349, "losses/dpo": 0.5935918092727661, "losses/sft": 0.9002220034599304, "losses/total": 0.5935918092727661, "ref_logps/chosen": -86.11112976074219, "ref_logps/rejected": -90.38993072509766, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2603588104248047, "rewards/margins": 0.16285017132759094, "rewards/rejected": -0.42320898175239563, "step": 290 }, { "epoch": 0.21769216383018514, "grad_norm": 64.84596940620118, "learning_rate": 4.554141351128182e-07, "logps/chosen": -112.98928833007812, "logps/rejected": -114.4159164428711, "loss": 0.6488, "losses/dpo": 0.5659849643707275, "losses/sft": 0.8108077049255371, "losses/total": 0.5659849643707275, "ref_logps/chosen": -108.40953826904297, "ref_logps/rejected": -108.4754638671875, "rewards/accuracies": 0.53125, "rewards/chosen": -0.457974910736084, "rewards/margins": 0.13607092201709747, "rewards/rejected": -0.594045877456665, "step": 291 }, { "epoch": 0.2184402468674023, "grad_norm": 58.753431552744416, "learning_rate": 4.550678448715896e-07, "logps/chosen": -57.58449172973633, "logps/rejected": -63.87934494018555, "loss": 0.611, "losses/dpo": 0.5799652338027954, "losses/sft": 0.6151180267333984, "losses/total": 0.5799652338027954, "ref_logps/chosen": -54.82413101196289, "ref_logps/rejected": -59.10428237915039, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2760356068611145, "rewards/margins": 0.20147044956684113, "rewards/rejected": -0.4775060713291168, "step": 292 }, { "epoch": 0.21918832990461942, "grad_norm": 71.25740170765575, "learning_rate": 4.547203477685004e-07, "logps/chosen": -93.28994750976562, "logps/rejected": -91.74594116210938, "loss": 0.6866, "losses/dpo": 0.7994166612625122, "losses/sft": 0.6279712915420532, "losses/total": 0.7994166612625122, "ref_logps/chosen": -89.24925231933594, "ref_logps/rejected": -87.25523376464844, "rewards/accuracies": 0.53125, "rewards/chosen": -0.40406978130340576, "rewards/margins": 0.045001257210969925, "rewards/rejected": -0.4490710198879242, "step": 293 }, { "epoch": 0.21993641294183655, "grad_norm": 63.866248499882396, "learning_rate": 4.5437164584863495e-07, "logps/chosen": -69.24853515625, "logps/rejected": -78.67710876464844, "loss": 0.6502, "losses/dpo": 0.5691109895706177, "losses/sft": 0.9058778285980225, "losses/total": 0.5691109895706177, "ref_logps/chosen": -66.99088287353516, "ref_logps/rejected": -75.15858459472656, "rewards/accuracies": 0.5625, "rewards/chosen": -0.225765198469162, "rewards/margins": 0.12608817219734192, "rewards/rejected": -0.3518533408641815, "step": 294 }, { "epoch": 0.22068449597905368, "grad_norm": 102.89615123509412, "learning_rate": 4.540217411641678e-07, "logps/chosen": -95.79168701171875, "logps/rejected": -111.97608947753906, "loss": 0.6243, "losses/dpo": 0.6829866170883179, "losses/sft": 0.8064054250717163, "losses/total": 0.6829866170883179, "ref_logps/chosen": -91.28604888916016, "ref_logps/rejected": -105.50436401367188, "rewards/accuracies": 0.59375, "rewards/chosen": -0.45056259632110596, "rewards/margins": 0.1966102421283722, "rewards/rejected": -0.6471728086471558, "step": 295 }, { "epoch": 0.2214325790162708, "grad_norm": 84.09414741935367, "learning_rate": 4.5367063577435216e-07, "logps/chosen": -102.76103210449219, "logps/rejected": -104.30159759521484, "loss": 0.7134, "losses/dpo": 0.6440633535385132, "losses/sft": 0.9594119787216187, "losses/total": 0.6440633535385132, "ref_logps/chosen": -97.92121887207031, "ref_logps/rejected": -99.3582534790039, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4839816093444824, "rewards/margins": 0.010353732854127884, "rewards/rejected": -0.4943353533744812, "step": 296 }, { "epoch": 0.22218066205348794, "grad_norm": 69.79138809679907, "learning_rate": 4.533183317455077e-07, "logps/chosen": -85.64404296875, "logps/rejected": -85.5567855834961, "loss": 0.7097, "losses/dpo": 0.7359490394592285, "losses/sft": 1.0437558889389038, "losses/total": 0.7359490394592285, "ref_logps/chosen": -81.53857421875, "ref_logps/rejected": -81.45659637451172, "rewards/accuracies": 0.53125, "rewards/chosen": -0.41054800152778625, "rewards/margins": -0.0005293600261211395, "rewards/rejected": -0.4100186228752136, "step": 297 }, { "epoch": 0.22292874509070507, "grad_norm": 77.15146102659105, "learning_rate": 4.5296483115100814e-07, "logps/chosen": -91.51983642578125, "logps/rejected": -95.52337646484375, "loss": 0.6606, "losses/dpo": 0.6581255793571472, "losses/sft": 0.7308759093284607, "losses/total": 0.6581255793571472, "ref_logps/chosen": -87.45832824707031, "ref_logps/rejected": -90.32709503173828, "rewards/accuracies": 0.53125, "rewards/chosen": -0.40615007281303406, "rewards/margins": 0.11347870528697968, "rewards/rejected": -0.5196288228034973, "step": 298 }, { "epoch": 0.2236768281279222, "grad_norm": 57.76824971623868, "learning_rate": 4.526101360712693e-07, "logps/chosen": -105.67477416992188, "logps/rejected": -116.85748291015625, "loss": 0.598, "losses/dpo": 0.7066806554794312, "losses/sft": 0.6479476690292358, "losses/total": 0.7066806554794312, "ref_logps/chosen": -102.04267883300781, "ref_logps/rejected": -110.74639129638672, "rewards/accuracies": 0.65625, "rewards/chosen": -0.36320871114730835, "rewards/margins": 0.2479003369808197, "rewards/rejected": -0.6111090183258057, "step": 299 }, { "epoch": 0.22442491116513932, "grad_norm": 62.30187879396033, "learning_rate": 4.5225424859373684e-07, "logps/chosen": -97.39226531982422, "logps/rejected": -114.33299255371094, "loss": 0.5679, "losses/dpo": 0.7391623854637146, "losses/sft": 0.9010686874389648, "losses/total": 0.7391623854637146, "ref_logps/chosen": -94.75485229492188, "ref_logps/rejected": -108.48426055908203, "rewards/accuracies": 0.78125, "rewards/chosen": -0.26374244689941406, "rewards/margins": 0.3211306035518646, "rewards/rejected": -0.5848730802536011, "step": 300 }, { "epoch": 0.22517299420235645, "grad_norm": 63.97565029778259, "learning_rate": 4.518971708128737e-07, "logps/chosen": -103.11335754394531, "logps/rejected": -109.44488525390625, "loss": 0.6616, "losses/dpo": 0.6886447072029114, "losses/sft": 1.0084078311920166, "losses/total": 0.6886447072029114, "ref_logps/chosen": -99.09407043457031, "ref_logps/rejected": -104.45931243896484, "rewards/accuracies": 0.5625, "rewards/chosen": -0.40192949771881104, "rewards/margins": 0.09662729501724243, "rewards/rejected": -0.49855679273605347, "step": 301 }, { "epoch": 0.22592107723957358, "grad_norm": 65.72025078598531, "learning_rate": 4.515389048301481e-07, "logps/chosen": -96.44305419921875, "logps/rejected": -92.877197265625, "loss": 0.6843, "losses/dpo": 0.7054979801177979, "losses/sft": 0.5385233163833618, "losses/total": 0.7054979801177979, "ref_logps/chosen": -92.88849639892578, "ref_logps/rejected": -88.86605834960938, "rewards/accuracies": 0.46875, "rewards/chosen": -0.35545557737350464, "rewards/margins": 0.04565828666090965, "rewards/rejected": -0.4011138677597046, "step": 302 }, { "epoch": 0.22666916027679074, "grad_norm": 56.78734065946023, "learning_rate": 4.511794527540211e-07, "logps/chosen": -85.43777465820312, "logps/rejected": -95.66278076171875, "loss": 0.6547, "losses/dpo": 0.6811451315879822, "losses/sft": 0.7239047288894653, "losses/total": 0.6811451315879822, "ref_logps/chosen": -82.072509765625, "ref_logps/rejected": -91.15760040283203, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3365270793437958, "rewards/margins": 0.11399129778146744, "rewards/rejected": -0.450518399477005, "step": 303 }, { "epoch": 0.22741724331400787, "grad_norm": 63.768230060662496, "learning_rate": 4.508188166999339e-07, "logps/chosen": -105.96969604492188, "logps/rejected": -94.26342010498047, "loss": 0.6893, "losses/dpo": 0.7197575569152832, "losses/sft": 1.9526853561401367, "losses/total": 0.7197575569152832, "ref_logps/chosen": -101.29161071777344, "ref_logps/rejected": -88.95541381835938, "rewards/accuracies": 0.59375, "rewards/chosen": -0.46781015396118164, "rewards/margins": 0.06298968940973282, "rewards/rejected": -0.5307998657226562, "step": 304 }, { "epoch": 0.228165326351225, "grad_norm": 61.57955346317575, "learning_rate": 4.50456998790296e-07, "logps/chosen": -98.1336898803711, "logps/rejected": -99.79108428955078, "loss": 0.642, "losses/dpo": 0.7596883177757263, "losses/sft": 1.4791220426559448, "losses/total": 0.7596883177757263, "ref_logps/chosen": -94.00310516357422, "ref_logps/rejected": -94.27605438232422, "rewards/accuracies": 0.6875, "rewards/chosen": -0.41305798292160034, "rewards/margins": 0.13844527304172516, "rewards/rejected": -0.5515032410621643, "step": 305 }, { "epoch": 0.22891340938844212, "grad_norm": 97.75347418111306, "learning_rate": 4.5009400115447206e-07, "logps/chosen": -104.91222381591797, "logps/rejected": -119.73594665527344, "loss": 0.6104, "losses/dpo": 0.5621293783187866, "losses/sft": 1.1335766315460205, "losses/total": 0.5621293783187866, "ref_logps/chosen": -100.89756774902344, "ref_logps/rejected": -113.6295394897461, "rewards/accuracies": 0.75, "rewards/chosen": -0.40146538615226746, "rewards/margins": 0.20917651057243347, "rewards/rejected": -0.6106418967247009, "step": 306 }, { "epoch": 0.22966149242565925, "grad_norm": 60.99319606106913, "learning_rate": 4.4972982592876955e-07, "logps/chosen": -89.1437759399414, "logps/rejected": -95.24713134765625, "loss": 0.6445, "losses/dpo": 0.6288329362869263, "losses/sft": 1.5273544788360596, "losses/total": 0.6288329362869263, "ref_logps/chosen": -85.63395690917969, "ref_logps/rejected": -90.25359344482422, "rewards/accuracies": 0.625, "rewards/chosen": -0.35098153352737427, "rewards/margins": 0.14837296307086945, "rewards/rejected": -0.4993545114994049, "step": 307 }, { "epoch": 0.23040957546287638, "grad_norm": 61.30041688151902, "learning_rate": 4.493644752564266e-07, "logps/chosen": -117.79389953613281, "logps/rejected": -120.77992248535156, "loss": 0.6661, "losses/dpo": 0.5830092430114746, "losses/sft": 1.166965126991272, "losses/total": 0.5830092430114746, "ref_logps/chosen": -113.29170989990234, "ref_logps/rejected": -115.28483581542969, "rewards/accuracies": 0.5625, "rewards/chosen": -0.45021969079971313, "rewards/margins": 0.0992894321680069, "rewards/rejected": -0.5495091676712036, "step": 308 }, { "epoch": 0.2311576585000935, "grad_norm": 64.11805632476585, "learning_rate": 4.4899795128759886e-07, "logps/chosen": -100.50289154052734, "logps/rejected": -100.28108215332031, "loss": 0.6297, "losses/dpo": 0.8114888668060303, "losses/sft": 1.5795881748199463, "losses/total": 0.8114888668060303, "ref_logps/chosen": -96.62437438964844, "ref_logps/rejected": -94.67839050292969, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3878519535064697, "rewards/margins": 0.17241667211055756, "rewards/rejected": -0.5602686405181885, "step": 309 }, { "epoch": 0.23190574153731064, "grad_norm": 82.84091182812915, "learning_rate": 4.4863025617934715e-07, "logps/chosen": -92.64906311035156, "logps/rejected": -93.3626480102539, "loss": 0.6668, "losses/dpo": 0.6645923852920532, "losses/sft": 0.6265849471092224, "losses/total": 0.6645923852920532, "ref_logps/chosen": -88.97686004638672, "ref_logps/rejected": -88.56852722167969, "rewards/accuracies": 0.625, "rewards/chosen": -0.367220401763916, "rewards/margins": 0.11219199746847153, "rewards/rejected": -0.47941234707832336, "step": 310 }, { "epoch": 0.23265382457452777, "grad_norm": 55.92286626252208, "learning_rate": 4.482613920956244e-07, "logps/chosen": -84.6201171875, "logps/rejected": -82.34272003173828, "loss": 0.6792, "losses/dpo": 0.6657872200012207, "losses/sft": 0.15702635049819946, "losses/total": 0.6657872200012207, "ref_logps/chosen": -80.97850799560547, "ref_logps/rejected": -78.26864624023438, "rewards/accuracies": 0.5625, "rewards/chosen": -0.364160418510437, "rewards/margins": 0.04324778914451599, "rewards/rejected": -0.4074082374572754, "step": 311 }, { "epoch": 0.2334019076117449, "grad_norm": 53.39077761446581, "learning_rate": 4.4789136120726355e-07, "logps/chosen": -69.89353942871094, "logps/rejected": -81.66445922851562, "loss": 0.6383, "losses/dpo": 0.672562837600708, "losses/sft": 0.9207233190536499, "losses/total": 0.672562837600708, "ref_logps/chosen": -67.09561157226562, "ref_logps/rejected": -77.44005584716797, "rewards/accuracies": 0.71875, "rewards/chosen": -0.27979373931884766, "rewards/margins": 0.14264701306819916, "rewards/rejected": -0.4224407374858856, "step": 312 }, { "epoch": 0.23414999064896203, "grad_norm": 68.05080805907662, "learning_rate": 4.475201656919642e-07, "logps/chosen": -100.5257797241211, "logps/rejected": -103.95291137695312, "loss": 0.6651, "losses/dpo": 0.7288721799850464, "losses/sft": 0.5363259315490723, "losses/total": 0.7288721799850464, "ref_logps/chosen": -96.30624389648438, "ref_logps/rejected": -98.75265502929688, "rewards/accuracies": 0.625, "rewards/chosen": -0.42195284366607666, "rewards/margins": 0.09807237237691879, "rewards/rejected": -0.5200251936912537, "step": 313 }, { "epoch": 0.23489807368617915, "grad_norm": 72.6624562096509, "learning_rate": 4.4714780773427975e-07, "logps/chosen": -94.72134399414062, "logps/rejected": -106.85720825195312, "loss": 0.6492, "losses/dpo": 0.6413378715515137, "losses/sft": 0.6710217595100403, "losses/total": 0.6413378715515137, "ref_logps/chosen": -91.14076232910156, "ref_logps/rejected": -101.90744018554688, "rewards/accuracies": 0.5625, "rewards/chosen": -0.35805732011795044, "rewards/margins": 0.13691887259483337, "rewards/rejected": -0.4949762225151062, "step": 314 }, { "epoch": 0.2356461567233963, "grad_norm": 73.94004785141495, "learning_rate": 4.4677428952560535e-07, "logps/chosen": -115.55271911621094, "logps/rejected": -122.54920196533203, "loss": 0.6466, "losses/dpo": 0.6968921422958374, "losses/sft": 1.578653335571289, "losses/total": 0.6968921422958374, "ref_logps/chosen": -111.7533187866211, "ref_logps/rejected": -117.61209869384766, "rewards/accuracies": 0.65625, "rewards/chosen": -0.37993958592414856, "rewards/margins": 0.11376997828483582, "rewards/rejected": -0.49370959401130676, "step": 315 }, { "epoch": 0.23639423976061344, "grad_norm": 86.07454435885487, "learning_rate": 4.46399613264164e-07, "logps/chosen": -88.19760131835938, "logps/rejected": -98.60675048828125, "loss": 0.6301, "losses/dpo": 0.8372524976730347, "losses/sft": 1.0015900135040283, "losses/total": 0.8372524976730347, "ref_logps/chosen": -84.29364013671875, "ref_logps/rejected": -92.98052978515625, "rewards/accuracies": 0.65625, "rewards/chosen": -0.39039647579193115, "rewards/margins": 0.1722247153520584, "rewards/rejected": -0.5626212358474731, "step": 316 }, { "epoch": 0.23714232279783057, "grad_norm": 92.92272583459861, "learning_rate": 4.4602378115499424e-07, "logps/chosen": -95.57081604003906, "logps/rejected": -101.58955383300781, "loss": 0.7073, "losses/dpo": 0.742162823677063, "losses/sft": 0.9472687244415283, "losses/total": 0.742162823677063, "ref_logps/chosen": -91.36087036132812, "ref_logps/rejected": -96.93291473388672, "rewards/accuracies": 0.625, "rewards/chosen": -0.42099452018737793, "rewards/margins": 0.044669583439826965, "rewards/rejected": -0.4656640887260437, "step": 317 }, { "epoch": 0.2378904058350477, "grad_norm": 65.23252041127772, "learning_rate": 4.456467954099369e-07, "logps/chosen": -102.75108337402344, "logps/rejected": -118.76388549804688, "loss": 0.633, "losses/dpo": 0.712697446346283, "losses/sft": 1.109299659729004, "losses/total": 0.712697446346283, "ref_logps/chosen": -98.98365020751953, "ref_logps/rejected": -113.31980895996094, "rewards/accuracies": 0.59375, "rewards/chosen": -0.37674403190612793, "rewards/margins": 0.1676635593175888, "rewards/rejected": -0.5444075465202332, "step": 318 }, { "epoch": 0.23863848887226483, "grad_norm": 54.16043381067874, "learning_rate": 4.4526865824762216e-07, "logps/chosen": -82.98881530761719, "logps/rejected": -88.15798950195312, "loss": 0.68, "losses/dpo": 0.6419991254806519, "losses/sft": 0.906076192855835, "losses/total": 0.6419991254806519, "ref_logps/chosen": -79.07994842529297, "ref_logps/rejected": -83.70919036865234, "rewards/accuracies": 0.625, "rewards/chosen": -0.39088666439056396, "rewards/margins": 0.05399385467171669, "rewards/rejected": -0.44488051533699036, "step": 319 }, { "epoch": 0.23938657190948195, "grad_norm": 60.77011315467522, "learning_rate": 4.4488937189345655e-07, "logps/chosen": -106.88552856445312, "logps/rejected": -108.478271484375, "loss": 0.6787, "losses/dpo": 0.8704560995101929, "losses/sft": 1.9296140670776367, "losses/total": 0.8704560995101929, "ref_logps/chosen": -102.94357299804688, "ref_logps/rejected": -103.86480712890625, "rewards/accuracies": 0.5, "rewards/chosen": -0.3941953778266907, "rewards/margins": 0.06715169548988342, "rewards/rejected": -0.4613470435142517, "step": 320 }, { "epoch": 0.24013465494669908, "grad_norm": 70.51264441167186, "learning_rate": 4.4450893857960984e-07, "logps/chosen": -92.30961608886719, "logps/rejected": -105.35087585449219, "loss": 0.6506, "losses/dpo": 0.560577392578125, "losses/sft": 1.0445090532302856, "losses/total": 0.560577392578125, "ref_logps/chosen": -88.71942138671875, "ref_logps/rejected": -100.36598205566406, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3590187430381775, "rewards/margins": 0.1394696682691574, "rewards/rejected": -0.4984884560108185, "step": 321 }, { "epoch": 0.2408827379839162, "grad_norm": 76.05812821248955, "learning_rate": 4.441273605450018e-07, "logps/chosen": -90.47355651855469, "logps/rejected": -100.51823425292969, "loss": 0.6441, "losses/dpo": 0.6161359548568726, "losses/sft": 0.7949166297912598, "losses/total": 0.6161359548568726, "ref_logps/chosen": -87.0403060913086, "ref_logps/rejected": -95.62769317626953, "rewards/accuracies": 0.6875, "rewards/chosen": -0.34332481026649475, "rewards/margins": 0.1457281857728958, "rewards/rejected": -0.489052951335907, "step": 322 }, { "epoch": 0.24163082102113334, "grad_norm": 69.65071866444475, "learning_rate": 4.4374464003528916e-07, "logps/chosen": -105.79873657226562, "logps/rejected": -109.9566650390625, "loss": 0.6772, "losses/dpo": 0.6459444761276245, "losses/sft": 0.9663287401199341, "losses/total": 0.6459444761276245, "ref_logps/chosen": -102.025634765625, "ref_logps/rejected": -105.4859848022461, "rewards/accuracies": 0.5, "rewards/chosen": -0.37731075286865234, "rewards/margins": 0.06975805014371872, "rewards/rejected": -0.4470687508583069, "step": 323 }, { "epoch": 0.24237890405835047, "grad_norm": 66.420405976233, "learning_rate": 4.433607793028522e-07, "logps/chosen": -92.2669448852539, "logps/rejected": -98.89781188964844, "loss": 0.6459, "losses/dpo": 0.5917967557907104, "losses/sft": 0.9478490352630615, "losses/total": 0.5917967557907104, "ref_logps/chosen": -88.58975219726562, "ref_logps/rejected": -93.8061752319336, "rewards/accuracies": 0.65625, "rewards/chosen": -0.36771899461746216, "rewards/margins": 0.14144468307495117, "rewards/rejected": -0.5091636776924133, "step": 324 }, { "epoch": 0.2431269870955676, "grad_norm": 56.76040086928598, "learning_rate": 4.4297578060678174e-07, "logps/chosen": -99.54680633544922, "logps/rejected": -103.07437896728516, "loss": 0.6663, "losses/dpo": 0.5745702385902405, "losses/sft": 1.2190130949020386, "losses/total": 0.5745702385902405, "ref_logps/chosen": -95.34854888916016, "ref_logps/rejected": -98.0794448852539, "rewards/accuracies": 0.625, "rewards/chosen": -0.4198259711265564, "rewards/margins": 0.07966753840446472, "rewards/rejected": -0.4994935095310211, "step": 325 }, { "epoch": 0.24387507013278473, "grad_norm": 114.63918643468406, "learning_rate": 4.4258964621286577e-07, "logps/chosen": -81.22007751464844, "logps/rejected": -96.22056579589844, "loss": 0.6248, "losses/dpo": 0.7075506448745728, "losses/sft": 0.9456363916397095, "losses/total": 0.7075506448745728, "ref_logps/chosen": -77.56159973144531, "ref_logps/rejected": -90.60028076171875, "rewards/accuracies": 0.59375, "rewards/chosen": -0.36584722995758057, "rewards/margins": 0.19618083536624908, "rewards/rejected": -0.5620280504226685, "step": 326 }, { "epoch": 0.24462315317000188, "grad_norm": 59.75814851558422, "learning_rate": 4.4220237839357584e-07, "logps/chosen": -88.87202453613281, "logps/rejected": -99.5818862915039, "loss": 0.6542, "losses/dpo": 0.7841302156448364, "losses/sft": 0.5573152899742126, "losses/total": 0.7841302156448364, "ref_logps/chosen": -85.32697296142578, "ref_logps/rejected": -94.7450942993164, "rewards/accuracies": 0.5625, "rewards/chosen": -0.35450518131256104, "rewards/margins": 0.1291736215353012, "rewards/rejected": -0.48367881774902344, "step": 327 }, { "epoch": 0.245371236207219, "grad_norm": 69.13110272197247, "learning_rate": 4.418139794280541e-07, "logps/chosen": -93.00694274902344, "logps/rejected": -91.38935852050781, "loss": 0.7062, "losses/dpo": 0.5855580568313599, "losses/sft": 0.23086127638816833, "losses/total": 0.5855580568313599, "ref_logps/chosen": -88.88777160644531, "ref_logps/rejected": -87.14383697509766, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4119179844856262, "rewards/margins": 0.012635063380002975, "rewards/rejected": -0.4245530664920807, "step": 328 }, { "epoch": 0.24611931924443614, "grad_norm": 73.77580292179802, "learning_rate": 4.4142445160209976e-07, "logps/chosen": -113.85076141357422, "logps/rejected": -115.35945129394531, "loss": 0.6672, "losses/dpo": 0.7111160755157471, "losses/sft": 1.1357218027114868, "losses/total": 0.7111160755157471, "ref_logps/chosen": -109.80499267578125, "ref_logps/rejected": -110.2974853515625, "rewards/accuracies": 0.53125, "rewards/chosen": -0.40457719564437866, "rewards/margins": 0.10161975026130676, "rewards/rejected": -0.506196916103363, "step": 329 }, { "epoch": 0.24686740228165327, "grad_norm": 61.667776477552884, "learning_rate": 4.410337972081553e-07, "logps/chosen": -113.1512451171875, "logps/rejected": -115.84979248046875, "loss": 0.663, "losses/dpo": 0.601220965385437, "losses/sft": 0.8613933324813843, "losses/total": 0.601220965385437, "ref_logps/chosen": -109.41373443603516, "ref_logps/rejected": -111.20651245117188, "rewards/accuracies": 0.53125, "rewards/chosen": -0.37375035881996155, "rewards/margins": 0.09057748317718506, "rewards/rejected": -0.464327871799469, "step": 330 }, { "epoch": 0.2476154853188704, "grad_norm": 87.9465548605562, "learning_rate": 4.406420185452936e-07, "logps/chosen": -94.77821350097656, "logps/rejected": -92.24978637695312, "loss": 0.7398, "losses/dpo": 0.55787593126297, "losses/sft": 0.22884608805179596, "losses/total": 0.55787593126297, "ref_logps/chosen": -90.32415008544922, "ref_logps/rejected": -88.23858642578125, "rewards/accuracies": 0.375, "rewards/chosen": -0.44540703296661377, "rewards/margins": -0.0442872978746891, "rewards/rejected": -0.4011197090148926, "step": 331 }, { "epoch": 0.24836356835608753, "grad_norm": 56.48724609507025, "learning_rate": 4.4024911791920393e-07, "logps/chosen": -76.1095199584961, "logps/rejected": -84.38812255859375, "loss": 0.6235, "losses/dpo": 0.630761981010437, "losses/sft": 0.5464266538619995, "losses/total": 0.630761981010437, "ref_logps/chosen": -73.39447021484375, "ref_logps/rejected": -79.94483947753906, "rewards/accuracies": 0.625, "rewards/chosen": -0.27150431275367737, "rewards/margins": 0.1728230118751526, "rewards/rejected": -0.44432729482650757, "step": 332 }, { "epoch": 0.24911165139330466, "grad_norm": 64.20034296057057, "learning_rate": 4.3985509764217837e-07, "logps/chosen": -90.61302947998047, "logps/rejected": -104.02531433105469, "loss": 0.6262, "losses/dpo": 0.6523864269256592, "losses/sft": 0.8790729641914368, "losses/total": 0.6523864269256592, "ref_logps/chosen": -87.00902557373047, "ref_logps/rejected": -97.99055480957031, "rewards/accuracies": 0.5625, "rewards/chosen": -0.360400915145874, "rewards/margins": 0.24307531118392944, "rewards/rejected": -0.6034762859344482, "step": 333 }, { "epoch": 0.24985973443052178, "grad_norm": 70.85675354874803, "learning_rate": 4.394599600330986e-07, "logps/chosen": -93.4686508178711, "logps/rejected": -98.6458740234375, "loss": 0.6871, "losses/dpo": 0.636366069316864, "losses/sft": 0.5765417218208313, "losses/total": 0.636366069316864, "ref_logps/chosen": -89.03810119628906, "ref_logps/rejected": -93.50631713867188, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4430542588233948, "rewards/margins": 0.07090127468109131, "rewards/rejected": -0.5139555931091309, "step": 334 }, { "epoch": 0.2506078174677389, "grad_norm": 91.72408433833, "learning_rate": 4.390637074174218e-07, "logps/chosen": -94.0125732421875, "logps/rejected": -104.56366729736328, "loss": 0.6322, "losses/dpo": 0.6224732398986816, "losses/sft": 1.2148048877716064, "losses/total": 0.6224732398986816, "ref_logps/chosen": -90.70438385009766, "ref_logps/rejected": -99.28610229492188, "rewards/accuracies": 0.625, "rewards/chosen": -0.3308182954788208, "rewards/margins": 0.19693754613399506, "rewards/rejected": -0.527755856513977, "step": 335 }, { "epoch": 0.25135590050495604, "grad_norm": 76.67655402848452, "learning_rate": 4.3866634212716735e-07, "logps/chosen": -110.56129455566406, "logps/rejected": -114.31731414794922, "loss": 0.66, "losses/dpo": 0.6016351580619812, "losses/sft": 1.186758279800415, "losses/total": 0.6016351580619812, "ref_logps/chosen": -106.71891784667969, "ref_logps/rejected": -109.24052429199219, "rewards/accuracies": 0.65625, "rewards/chosen": -0.38423842191696167, "rewards/margins": 0.12344083189964294, "rewards/rejected": -0.5076792240142822, "step": 336 }, { "epoch": 0.25210398354217317, "grad_norm": 92.9845665461946, "learning_rate": 4.3826786650090273e-07, "logps/chosen": -102.96378326416016, "logps/rejected": -103.87907409667969, "loss": 0.7366, "losses/dpo": 0.8297593593597412, "losses/sft": 0.8575360774993896, "losses/total": 0.8297593593597412, "ref_logps/chosen": -98.24644470214844, "ref_logps/rejected": -99.63350677490234, "rewards/accuracies": 0.5, "rewards/chosen": -0.47173380851745605, "rewards/margins": -0.047176748514175415, "rewards/rejected": -0.42455700039863586, "step": 337 }, { "epoch": 0.2528520665793903, "grad_norm": 58.772934629172006, "learning_rate": 4.378682828837301e-07, "logps/chosen": -91.87226104736328, "logps/rejected": -92.04378509521484, "loss": 0.672, "losses/dpo": 0.766425371170044, "losses/sft": 0.9694927930831909, "losses/total": 0.766425371170044, "ref_logps/chosen": -88.53239440917969, "ref_logps/rejected": -87.80770111083984, "rewards/accuracies": 0.59375, "rewards/chosen": -0.33398669958114624, "rewards/margins": 0.08962157368659973, "rewards/rejected": -0.42360830307006836, "step": 338 }, { "epoch": 0.25360014961660743, "grad_norm": 64.45045694643481, "learning_rate": 4.374675936272723e-07, "logps/chosen": -84.83343505859375, "logps/rejected": -81.6814956665039, "loss": 0.7063, "losses/dpo": 0.7025765180587769, "losses/sft": 1.6373333930969238, "losses/total": 0.7025765180587769, "ref_logps/chosen": -81.16494750976562, "ref_logps/rejected": -77.97303771972656, "rewards/accuracies": 0.53125, "rewards/chosen": -0.36684906482696533, "rewards/margins": 0.0039953142404556274, "rewards/rejected": -0.37084439396858215, "step": 339 }, { "epoch": 0.25434823265382456, "grad_norm": 58.680052620903815, "learning_rate": 4.3706580108965907e-07, "logps/chosen": -107.62540435791016, "logps/rejected": -110.91204833984375, "loss": 0.5925, "losses/dpo": 0.5699632167816162, "losses/sft": 0.5872005224227905, "losses/total": 0.5699632167816162, "ref_logps/chosen": -104.51457214355469, "ref_logps/rejected": -105.01579284667969, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3110834062099457, "rewards/margins": 0.2785421311855316, "rewards/rejected": -0.5896255373954773, "step": 340 }, { "epoch": 0.2550963156910417, "grad_norm": 114.11851188041071, "learning_rate": 4.3666290763551305e-07, "logps/chosen": -75.84796142578125, "logps/rejected": -91.01274108886719, "loss": 0.6086, "losses/dpo": 0.6125524044036865, "losses/sft": 0.7868267893791199, "losses/total": 0.6125524044036865, "ref_logps/chosen": -72.81411743164062, "ref_logps/rejected": -85.60926818847656, "rewards/accuracies": 0.625, "rewards/chosen": -0.3033832609653473, "rewards/margins": 0.23696300387382507, "rewards/rejected": -0.5403462648391724, "step": 341 }, { "epoch": 0.2558443987282588, "grad_norm": 51.36879406871547, "learning_rate": 4.3625891563593624e-07, "logps/chosen": -90.52652740478516, "logps/rejected": -96.35853576660156, "loss": 0.6292, "losses/dpo": 0.5979079008102417, "losses/sft": 0.6872302293777466, "losses/total": 0.5979079008102417, "ref_logps/chosen": -87.62197875976562, "ref_logps/rejected": -91.77046966552734, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2904537618160248, "rewards/margins": 0.16835284233093262, "rewards/rejected": -0.4588066041469574, "step": 342 }, { "epoch": 0.25659248176547594, "grad_norm": 82.95747740869365, "learning_rate": 4.3585382746849555e-07, "logps/chosen": -84.8115463256836, "logps/rejected": -96.26366424560547, "loss": 0.6908, "losses/dpo": 0.6148281097412109, "losses/sft": 1.002333641052246, "losses/total": 0.6148281097412109, "ref_logps/chosen": -81.5679931640625, "ref_logps/rejected": -92.50774383544922, "rewards/accuracies": 0.5, "rewards/chosen": -0.32435426115989685, "rewards/margins": 0.05123797431588173, "rewards/rejected": -0.3755922317504883, "step": 343 }, { "epoch": 0.2573405648026931, "grad_norm": 66.35562632665474, "learning_rate": 4.354476455172091e-07, "logps/chosen": -106.34852600097656, "logps/rejected": -106.12844848632812, "loss": 0.6482, "losses/dpo": 0.8114491701126099, "losses/sft": 1.3805378675460815, "losses/total": 0.8114491701126099, "ref_logps/chosen": -102.43018341064453, "ref_logps/rejected": -100.83842468261719, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3918343186378479, "rewards/margins": 0.13716742396354675, "rewards/rejected": -0.5290017127990723, "step": 344 }, { "epoch": 0.25808864783991026, "grad_norm": 62.22832874729676, "learning_rate": 4.3504037217253207e-07, "logps/chosen": -90.94232177734375, "logps/rejected": -97.07719421386719, "loss": 0.6533, "losses/dpo": 0.7640217542648315, "losses/sft": 0.8389308452606201, "losses/total": 0.7640217542648315, "ref_logps/chosen": -87.3399658203125, "ref_logps/rejected": -92.22074890136719, "rewards/accuracies": 0.5, "rewards/chosen": -0.36023515462875366, "rewards/margins": 0.1254090517759323, "rewards/rejected": -0.4856441915035248, "step": 345 }, { "epoch": 0.2588367308771274, "grad_norm": 52.374270231493384, "learning_rate": 4.3463200983134283e-07, "logps/chosen": -69.49311065673828, "logps/rejected": -76.28266143798828, "loss": 0.6396, "losses/dpo": 0.6892877817153931, "losses/sft": 0.44505131244659424, "losses/total": 0.6892877817153931, "ref_logps/chosen": -66.22232818603516, "ref_logps/rejected": -71.50410461425781, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3270784318447113, "rewards/margins": 0.15077702701091766, "rewards/rejected": -0.4778554439544678, "step": 346 }, { "epoch": 0.2595848139143445, "grad_norm": 113.4958926226492, "learning_rate": 4.3422256089692864e-07, "logps/chosen": -93.28970336914062, "logps/rejected": -89.62554931640625, "loss": 0.7485, "losses/dpo": 0.6938676834106445, "losses/sft": 0.9398950338363647, "losses/total": 0.6938676834106445, "ref_logps/chosen": -89.2889175415039, "ref_logps/rejected": -86.13784790039062, "rewards/accuracies": 0.4375, "rewards/chosen": -0.40007826685905457, "rewards/margins": -0.051309116184711456, "rewards/rejected": -0.3487691581249237, "step": 347 }, { "epoch": 0.26033289695156164, "grad_norm": 62.72260002328045, "learning_rate": 4.338120277789715e-07, "logps/chosen": -102.0580062866211, "logps/rejected": -105.69828796386719, "loss": 0.6233, "losses/dpo": 0.5578228831291199, "losses/sft": 1.0151971578598022, "losses/total": 0.5578228831291199, "ref_logps/chosen": -99.30459594726562, "ref_logps/rejected": -101.12935638427734, "rewards/accuracies": 0.59375, "rewards/chosen": -0.27534130215644836, "rewards/margins": 0.18155153095722198, "rewards/rejected": -0.45689281821250916, "step": 348 }, { "epoch": 0.26108097998877877, "grad_norm": 64.41984045279547, "learning_rate": 4.334004128935341e-07, "logps/chosen": -89.49449157714844, "logps/rejected": -97.50041198730469, "loss": 0.6444, "losses/dpo": 0.774516224861145, "losses/sft": 0.36122989654541016, "losses/total": 0.774516224861145, "ref_logps/chosen": -86.60538482666016, "ref_logps/rejected": -93.2137680053711, "rewards/accuracies": 0.71875, "rewards/chosen": -0.28891170024871826, "rewards/margins": 0.13975180685520172, "rewards/rejected": -0.4286634922027588, "step": 349 }, { "epoch": 0.2618290630259959, "grad_norm": 59.685870147341916, "learning_rate": 4.3298771866304553e-07, "logps/chosen": -99.8868408203125, "logps/rejected": -101.68441772460938, "loss": 0.6453, "losses/dpo": 0.6648879051208496, "losses/sft": 0.6590445041656494, "losses/total": 0.6648879051208496, "ref_logps/chosen": -97.2677001953125, "ref_logps/rejected": -97.51998901367188, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2619129419326782, "rewards/margins": 0.15452858805656433, "rewards/rejected": -0.41644155979156494, "step": 350 }, { "epoch": 0.26257714606321303, "grad_norm": 71.64177822902906, "learning_rate": 4.3257394751628684e-07, "logps/chosen": -109.29497528076172, "logps/rejected": -118.75592803955078, "loss": 0.6143, "losses/dpo": 0.6475285291671753, "losses/sft": 1.6340135335922241, "losses/total": 0.6475285291671753, "ref_logps/chosen": -105.31428527832031, "ref_logps/rejected": -112.77046203613281, "rewards/accuracies": 0.71875, "rewards/chosen": -0.39806878566741943, "rewards/margins": 0.20047815144062042, "rewards/rejected": -0.5985468626022339, "step": 351 }, { "epoch": 0.26332522910043016, "grad_norm": 88.5311421165792, "learning_rate": 4.321591018883771e-07, "logps/chosen": -100.16071319580078, "logps/rejected": -109.5865478515625, "loss": 0.6356, "losses/dpo": 0.550627589225769, "losses/sft": 0.9531588554382324, "losses/total": 0.550627589225769, "ref_logps/chosen": -96.87012481689453, "ref_logps/rejected": -104.75088500976562, "rewards/accuracies": 0.71875, "rewards/chosen": -0.32905977964401245, "rewards/margins": 0.1545066237449646, "rewards/rejected": -0.48356640338897705, "step": 352 }, { "epoch": 0.2640733121376473, "grad_norm": 704.5662703198763, "learning_rate": 4.317431842207591e-07, "logps/chosen": -103.59830474853516, "logps/rejected": -106.19921112060547, "loss": 0.6871, "losses/dpo": 0.641968846321106, "losses/sft": 1.1789569854736328, "losses/total": 0.641968846321106, "ref_logps/chosen": -99.26826477050781, "ref_logps/rejected": -101.35611724853516, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4330042004585266, "rewards/margins": 0.05130578950047493, "rewards/rejected": -0.48430997133255005, "step": 353 }, { "epoch": 0.2648213951748644, "grad_norm": 58.90920375228225, "learning_rate": 4.3132619696118446e-07, "logps/chosen": -87.79286193847656, "logps/rejected": -101.03178405761719, "loss": 0.6378, "losses/dpo": 0.6195574998855591, "losses/sft": 0.7465136051177979, "losses/total": 0.6195574998855591, "ref_logps/chosen": -84.89408111572266, "ref_logps/rejected": -96.540771484375, "rewards/accuracies": 0.65625, "rewards/chosen": -0.28987783193588257, "rewards/margins": 0.15922433137893677, "rewards/rejected": -0.44910216331481934, "step": 354 }, { "epoch": 0.26556947821208154, "grad_norm": 96.66306041918637, "learning_rate": 4.3090814256369944e-07, "logps/chosen": -73.95314025878906, "logps/rejected": -74.46007537841797, "loss": 0.6461, "losses/dpo": 0.5594350695610046, "losses/sft": 0.37894201278686523, "losses/total": 0.5594350695610046, "ref_logps/chosen": -71.39718627929688, "ref_logps/rejected": -70.59175872802734, "rewards/accuracies": 0.625, "rewards/chosen": -0.2555948793888092, "rewards/margins": 0.13123740255832672, "rewards/rejected": -0.3868322968482971, "step": 355 }, { "epoch": 0.2663175612492987, "grad_norm": 90.02597035714258, "learning_rate": 4.3048902348863106e-07, "logps/chosen": -93.15377807617188, "logps/rejected": -98.01200866699219, "loss": 0.6643, "losses/dpo": 0.6630292534828186, "losses/sft": 0.6847069263458252, "losses/total": 0.6630292534828186, "ref_logps/chosen": -89.27113342285156, "ref_logps/rejected": -93.05946350097656, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3882654309272766, "rewards/margins": 0.10698854923248291, "rewards/rejected": -0.4952539801597595, "step": 356 }, { "epoch": 0.2670656442865158, "grad_norm": 56.39820333554553, "learning_rate": 4.3006884220257187e-07, "logps/chosen": -88.54147338867188, "logps/rejected": -92.16751098632812, "loss": 0.668, "losses/dpo": 0.5299070477485657, "losses/sft": 0.6704638004302979, "losses/total": 0.5299070477485657, "ref_logps/chosen": -85.4273681640625, "ref_logps/rejected": -87.85038757324219, "rewards/accuracies": 0.53125, "rewards/chosen": -0.3114109933376312, "rewards/margins": 0.1203019767999649, "rewards/rejected": -0.4317129850387573, "step": 357 }, { "epoch": 0.26781372732373293, "grad_norm": 77.71691081252565, "learning_rate": 4.296476011783657e-07, "logps/chosen": -88.66586303710938, "logps/rejected": -91.66463470458984, "loss": 0.6942, "losses/dpo": 0.6867028474807739, "losses/sft": 0.4477330148220062, "losses/total": 0.6867028474807739, "ref_logps/chosen": -84.44121551513672, "ref_logps/rejected": -87.20732116699219, "rewards/accuracies": 0.5625, "rewards/chosen": -0.42246460914611816, "rewards/margins": 0.023266321048140526, "rewards/rejected": -0.44573095440864563, "step": 358 }, { "epoch": 0.26856181036095006, "grad_norm": 64.26420703365181, "learning_rate": 4.2922530289509316e-07, "logps/chosen": -101.50946044921875, "logps/rejected": -103.14901733398438, "loss": 0.6659, "losses/dpo": 0.6497390270233154, "losses/sft": 0.8484537601470947, "losses/total": 0.6497390270233154, "ref_logps/chosen": -99.14418029785156, "ref_logps/rejected": -99.83372497558594, "rewards/accuracies": 0.5625, "rewards/chosen": -0.23652781546115875, "rewards/margins": 0.09500101208686829, "rewards/rejected": -0.33152884244918823, "step": 359 }, { "epoch": 0.2693098933981672, "grad_norm": 60.60442209129845, "learning_rate": 4.2880194983805715e-07, "logps/chosen": -83.51825714111328, "logps/rejected": -88.94476318359375, "loss": 0.6729, "losses/dpo": 0.7160643935203552, "losses/sft": 0.7032243013381958, "losses/total": 0.7160643935203552, "ref_logps/chosen": -80.53553771972656, "ref_logps/rejected": -85.09683227539062, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2982712388038635, "rewards/margins": 0.08652173727750778, "rewards/rejected": -0.3847929835319519, "step": 360 }, { "epoch": 0.2700579764353843, "grad_norm": 69.44317018721593, "learning_rate": 4.2837754449876806e-07, "logps/chosen": -82.43925476074219, "logps/rejected": -97.81515502929688, "loss": 0.6504, "losses/dpo": 0.774928092956543, "losses/sft": 0.48593103885650635, "losses/total": 0.774928092956543, "ref_logps/chosen": -79.11768341064453, "ref_logps/rejected": -93.05793762207031, "rewards/accuracies": 0.625, "rewards/chosen": -0.3321569561958313, "rewards/margins": 0.14356482028961182, "rewards/rejected": -0.4757217764854431, "step": 361 }, { "epoch": 0.27080605947260145, "grad_norm": 68.77690237638893, "learning_rate": 4.27952089374929e-07, "logps/chosen": -90.57831573486328, "logps/rejected": -79.0505142211914, "loss": 0.736, "losses/dpo": 0.670172929763794, "losses/sft": 0.8914649486541748, "losses/total": 0.670172929763794, "ref_logps/chosen": -86.48335266113281, "ref_logps/rejected": -75.40269470214844, "rewards/accuracies": 0.46875, "rewards/chosen": -0.4094967842102051, "rewards/margins": -0.044714588671922684, "rewards/rejected": -0.3647821843624115, "step": 362 }, { "epoch": 0.2715541425098186, "grad_norm": 58.47785489801682, "learning_rate": 4.275255869704214e-07, "logps/chosen": -98.15933227539062, "logps/rejected": -113.11212158203125, "loss": 0.6389, "losses/dpo": 0.6357824206352234, "losses/sft": 1.4779934883117676, "losses/total": 0.6357824206352234, "ref_logps/chosen": -94.88140869140625, "ref_logps/rejected": -108.41517639160156, "rewards/accuracies": 0.5625, "rewards/chosen": -0.32779181003570557, "rewards/margins": 0.14190295338630676, "rewards/rejected": -0.46969473361968994, "step": 363 }, { "epoch": 0.2723022255470357, "grad_norm": 53.66322641225347, "learning_rate": 4.2709803979529023e-07, "logps/chosen": -99.82719421386719, "logps/rejected": -120.27503967285156, "loss": 0.5927, "losses/dpo": 0.412897527217865, "losses/sft": 1.5839099884033203, "losses/total": 0.412897527217865, "ref_logps/chosen": -97.06554412841797, "ref_logps/rejected": -114.49451446533203, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2761649489402771, "rewards/margins": 0.3018876910209656, "rewards/rejected": -0.5780525803565979, "step": 364 }, { "epoch": 0.27305030858425283, "grad_norm": 100.59053455817896, "learning_rate": 4.266694503657288e-07, "logps/chosen": -109.48866271972656, "logps/rejected": -103.51502227783203, "loss": 0.747, "losses/dpo": 0.4191114902496338, "losses/sft": 1.2431716918945312, "losses/total": 0.4191114902496338, "ref_logps/chosen": -105.55632781982422, "ref_logps/rejected": -99.81114196777344, "rewards/accuracies": 0.5, "rewards/chosen": -0.39323288202285767, "rewards/margins": -0.022844010964035988, "rewards/rejected": -0.370388925075531, "step": 365 }, { "epoch": 0.27379839162146996, "grad_norm": 73.84745191027748, "learning_rate": 4.2623982120406455e-07, "logps/chosen": -87.04074096679688, "logps/rejected": -108.49356079101562, "loss": 0.6456, "losses/dpo": 0.6161978244781494, "losses/sft": 0.8003149032592773, "losses/total": 0.6161978244781494, "ref_logps/chosen": -84.52778625488281, "ref_logps/rejected": -104.6218490600586, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2512965798377991, "rewards/margins": 0.13587547838687897, "rewards/rejected": -0.38717207312583923, "step": 366 }, { "epoch": 0.2745464746586871, "grad_norm": 93.36429225086475, "learning_rate": 4.2580915483874386e-07, "logps/chosen": -94.72943115234375, "logps/rejected": -106.71089172363281, "loss": 0.6169, "losses/dpo": 0.49447232484817505, "losses/sft": 0.3017962574958801, "losses/total": 0.49447232484817505, "ref_logps/chosen": -91.7420425415039, "ref_logps/rejected": -101.540771484375, "rewards/accuracies": 0.71875, "rewards/chosen": -0.298738569021225, "rewards/margins": 0.21827368438243866, "rewards/rejected": -0.5170122385025024, "step": 367 }, { "epoch": 0.2752945576959042, "grad_norm": 123.07036404647043, "learning_rate": 4.253774538043172e-07, "logps/chosen": -89.19345092773438, "logps/rejected": -99.34358215332031, "loss": 0.6382, "losses/dpo": 0.5755652189254761, "losses/sft": 0.8644188046455383, "losses/total": 0.5755652189254761, "ref_logps/chosen": -85.35987091064453, "ref_logps/rejected": -93.86044311523438, "rewards/accuracies": 0.625, "rewards/chosen": -0.3833569884300232, "rewards/margins": 0.1649562120437622, "rewards/rejected": -0.5483132004737854, "step": 368 }, { "epoch": 0.2760426407331214, "grad_norm": 56.394649997223375, "learning_rate": 4.2494472064142445e-07, "logps/chosen": -86.11345672607422, "logps/rejected": -97.45702362060547, "loss": 0.6536, "losses/dpo": 0.6365755796432495, "losses/sft": 0.20990777015686035, "losses/total": 0.6365755796432495, "ref_logps/chosen": -83.63475799560547, "ref_logps/rejected": -93.97148132324219, "rewards/accuracies": 0.59375, "rewards/chosen": -0.24787023663520813, "rewards/margins": 0.10068462789058685, "rewards/rejected": -0.3485548794269562, "step": 369 }, { "epoch": 0.27679072377033853, "grad_norm": 61.77134131940505, "learning_rate": 4.245109578967794e-07, "logps/chosen": -100.02681732177734, "logps/rejected": -108.10770416259766, "loss": 0.6361, "losses/dpo": 0.7091788053512573, "losses/sft": 0.6594188213348389, "losses/total": 0.7091788053512573, "ref_logps/chosen": -96.90988159179688, "ref_logps/rejected": -103.26136779785156, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3116932809352875, "rewards/margins": 0.17294064164161682, "rewards/rejected": -0.4846339225769043, "step": 370 }, { "epoch": 0.27753880680755566, "grad_norm": 57.91007944240269, "learning_rate": 4.240761681231555e-07, "logps/chosen": -87.16018676757812, "logps/rejected": -98.56135559082031, "loss": 0.6389, "losses/dpo": 0.5826079845428467, "losses/sft": 0.6474307179450989, "losses/total": 0.5826079845428467, "ref_logps/chosen": -84.08444213867188, "ref_logps/rejected": -93.83903503417969, "rewards/accuracies": 0.65625, "rewards/chosen": -0.30757468938827515, "rewards/margins": 0.1646571308374405, "rewards/rejected": -0.47223180532455444, "step": 371 }, { "epoch": 0.2782868898447728, "grad_norm": 64.41819596870741, "learning_rate": 4.2364035387937013e-07, "logps/chosen": -93.35939025878906, "logps/rejected": -90.65776824951172, "loss": 0.5991, "losses/dpo": 0.7410329580307007, "losses/sft": 0.31276944279670715, "losses/total": 0.7410329580307007, "ref_logps/chosen": -91.14404296875, "ref_logps/rejected": -85.81830596923828, "rewards/accuracies": 0.6875, "rewards/chosen": -0.22153609991073608, "rewards/margins": 0.2624104917049408, "rewards/rejected": -0.4839465618133545, "step": 372 }, { "epoch": 0.2790349728819899, "grad_norm": 91.14096159785372, "learning_rate": 4.2320351773027007e-07, "logps/chosen": -108.08280944824219, "logps/rejected": -108.26910400390625, "loss": 0.6755, "losses/dpo": 0.7130191326141357, "losses/sft": 1.484626293182373, "losses/total": 0.7130191326141357, "ref_logps/chosen": -103.84332275390625, "ref_logps/rejected": -103.36087799072266, "rewards/accuracies": 0.5, "rewards/chosen": -0.42394864559173584, "rewards/margins": 0.06687364727258682, "rewards/rejected": -0.49082231521606445, "step": 373 }, { "epoch": 0.27978305591920705, "grad_norm": 126.90766363007087, "learning_rate": 4.227656622467162e-07, "logps/chosen": -87.5870361328125, "logps/rejected": -88.5322036743164, "loss": 0.605, "losses/dpo": 0.572255551815033, "losses/sft": 0.4049743413925171, "losses/total": 0.572255551815033, "ref_logps/chosen": -84.92054748535156, "ref_logps/rejected": -83.50144958496094, "rewards/accuracies": 0.6875, "rewards/chosen": -0.26664918661117554, "rewards/margins": 0.23642632365226746, "rewards/rejected": -0.5030755400657654, "step": 374 }, { "epoch": 0.2805311389564242, "grad_norm": 53.653798051196745, "learning_rate": 4.2232679000556813e-07, "logps/chosen": -91.92958068847656, "logps/rejected": -90.0485610961914, "loss": 0.7187, "losses/dpo": 0.710106611251831, "losses/sft": 1.3844250440597534, "losses/total": 0.710106611251831, "ref_logps/chosen": -88.3966064453125, "ref_logps/rejected": -86.68307495117188, "rewards/accuracies": 0.5, "rewards/chosen": -0.3532969057559967, "rewards/margins": -0.016747869551181793, "rewards/rejected": -0.3365490138530731, "step": 375 }, { "epoch": 0.2812792219936413, "grad_norm": 67.69722998402347, "learning_rate": 4.218869035896697e-07, "logps/chosen": -109.99205017089844, "logps/rejected": -108.04450988769531, "loss": 0.7034, "losses/dpo": 0.6756234765052795, "losses/sft": 1.4084258079528809, "losses/total": 0.6756234765052795, "ref_logps/chosen": -106.27424621582031, "ref_logps/rejected": -104.15574645996094, "rewards/accuracies": 0.53125, "rewards/chosen": -0.3717804551124573, "rewards/margins": 0.017095983028411865, "rewards/rejected": -0.38887640833854675, "step": 376 }, { "epoch": 0.28202730503085843, "grad_norm": 66.6517655678922, "learning_rate": 4.2144600558783284e-07, "logps/chosen": -82.02978515625, "logps/rejected": -97.30498504638672, "loss": 0.5734, "losses/dpo": 0.6335276365280151, "losses/sft": 0.708867609500885, "losses/total": 0.6335276365280151, "ref_logps/chosen": -79.55414581298828, "ref_logps/rejected": -91.77738952636719, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2475641816854477, "rewards/margins": 0.305195152759552, "rewards/rejected": -0.5527592897415161, "step": 377 }, { "epoch": 0.28277538806807556, "grad_norm": 74.86105046062416, "learning_rate": 4.210040985948234e-07, "logps/chosen": -95.55134582519531, "logps/rejected": -100.42412567138672, "loss": 0.6345, "losses/dpo": 0.6745758652687073, "losses/sft": 1.2199480533599854, "losses/total": 0.6745758652687073, "ref_logps/chosen": -92.65342712402344, "ref_logps/rejected": -95.82487487792969, "rewards/accuracies": 0.53125, "rewards/chosen": -0.2897922992706299, "rewards/margins": 0.17013272643089294, "rewards/rejected": -0.4599250257015228, "step": 378 }, { "epoch": 0.2835234711052927, "grad_norm": 45.95057454792911, "learning_rate": 4.2056118521134474e-07, "logps/chosen": -56.288055419921875, "logps/rejected": -63.91986083984375, "loss": 0.6252, "losses/dpo": 0.6295973658561707, "losses/sft": 0.5429747700691223, "losses/total": 0.6295973658561707, "ref_logps/chosen": -53.94113540649414, "ref_logps/rejected": -59.8658561706543, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2346915602684021, "rewards/margins": 0.17070892453193665, "rewards/rejected": -0.40540048480033875, "step": 379 }, { "epoch": 0.2842715541425098, "grad_norm": 61.09872069759873, "learning_rate": 4.2011726804402366e-07, "logps/chosen": -94.00880432128906, "logps/rejected": -110.77256774902344, "loss": 0.6022, "losses/dpo": 0.6889244914054871, "losses/sft": 0.8656389117240906, "losses/total": 0.6889244914054871, "ref_logps/chosen": -91.28462982177734, "ref_logps/rejected": -105.58648681640625, "rewards/accuracies": 0.65625, "rewards/chosen": -0.27241766452789307, "rewards/margins": 0.24619126319885254, "rewards/rejected": -0.5186089277267456, "step": 380 }, { "epoch": 0.28501963717972695, "grad_norm": 66.87421800387928, "learning_rate": 4.196723497053938e-07, "logps/chosen": -123.52484130859375, "logps/rejected": -126.07083129882812, "loss": 0.6395, "losses/dpo": 0.7798928022384644, "losses/sft": 1.173722267150879, "losses/total": 0.7798928022384644, "ref_logps/chosen": -120.11604309082031, "ref_logps/rejected": -120.73371887207031, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3408803641796112, "rewards/margins": 0.1928310990333557, "rewards/rejected": -0.5337114930152893, "step": 381 }, { "epoch": 0.2857677202169441, "grad_norm": 78.86587233408062, "learning_rate": 4.192264328138812e-07, "logps/chosen": -91.67349243164062, "logps/rejected": -92.37512969970703, "loss": 0.6303, "losses/dpo": 0.6725355982780457, "losses/sft": 0.7597410678863525, "losses/total": 0.6725355982780457, "ref_logps/chosen": -88.24591064453125, "ref_logps/rejected": -87.18920135498047, "rewards/accuracies": 0.625, "rewards/chosen": -0.3427579998970032, "rewards/margins": 0.1758340299129486, "rewards/rejected": -0.5185920596122742, "step": 382 }, { "epoch": 0.2865158032541612, "grad_norm": 51.42948241108186, "learning_rate": 4.187795199937886e-07, "logps/chosen": -88.22174835205078, "logps/rejected": -94.05795288085938, "loss": 0.625, "losses/dpo": 0.5679365396499634, "losses/sft": 0.4570709466934204, "losses/total": 0.5679365396499634, "ref_logps/chosen": -85.09529113769531, "ref_logps/rejected": -89.05624389648438, "rewards/accuracies": 0.625, "rewards/chosen": -0.31264516711235046, "rewards/margins": 0.18752610683441162, "rewards/rejected": -0.5001713037490845, "step": 383 }, { "epoch": 0.28726388629137833, "grad_norm": 64.25532923761236, "learning_rate": 4.1833161387527984e-07, "logps/chosen": -80.52953338623047, "logps/rejected": -88.19551086425781, "loss": 0.6905, "losses/dpo": 0.8063064813613892, "losses/sft": 0.7960500121116638, "losses/total": 0.8063064813613892, "ref_logps/chosen": -77.16768646240234, "ref_logps/rejected": -84.47203063964844, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3361847996711731, "rewards/margins": 0.036162685602903366, "rewards/rejected": -0.37234747409820557, "step": 384 }, { "epoch": 0.28801196932859546, "grad_norm": 54.74629648302411, "learning_rate": 4.1788271709436444e-07, "logps/chosen": -100.41122436523438, "logps/rejected": -104.57575988769531, "loss": 0.6554, "losses/dpo": 0.5923233032226562, "losses/sft": 1.0751452445983887, "losses/total": 0.5923233032226562, "ref_logps/chosen": -97.29524230957031, "ref_logps/rejected": -100.06254577636719, "rewards/accuracies": 0.625, "rewards/chosen": -0.31159815192222595, "rewards/margins": 0.13972365856170654, "rewards/rejected": -0.4513218402862549, "step": 385 }, { "epoch": 0.2887600523658126, "grad_norm": 60.21882113901399, "learning_rate": 4.174328322928824e-07, "logps/chosen": -76.7869644165039, "logps/rejected": -79.00904846191406, "loss": 0.6893, "losses/dpo": 0.6839309930801392, "losses/sft": 0.7250419855117798, "losses/total": 0.6839309930801392, "ref_logps/chosen": -73.55694580078125, "ref_logps/rejected": -75.03411865234375, "rewards/accuracies": 0.53125, "rewards/chosen": -0.32300153374671936, "rewards/margins": 0.07449106872081757, "rewards/rejected": -0.39749258756637573, "step": 386 }, { "epoch": 0.2895081354030297, "grad_norm": 91.18661594812076, "learning_rate": 4.1698196211848815e-07, "logps/chosen": -97.15957641601562, "logps/rejected": -101.00285339355469, "loss": 0.7212, "losses/dpo": 0.8845137357711792, "losses/sft": 1.352766513824463, "losses/total": 0.8845137357711792, "ref_logps/chosen": -93.53697204589844, "ref_logps/rejected": -97.43905639648438, "rewards/accuracies": 0.46875, "rewards/chosen": -0.36226069927215576, "rewards/margins": -0.00587979331612587, "rewards/rejected": -0.3563809096813202, "step": 387 }, { "epoch": 0.29025621844024685, "grad_norm": 180.07973307985904, "learning_rate": 4.1653010922463537e-07, "logps/chosen": -87.074951171875, "logps/rejected": -89.51736450195312, "loss": 0.6313, "losses/dpo": 0.6230663061141968, "losses/sft": 0.482770174741745, "losses/total": 0.6230663061141968, "ref_logps/chosen": -84.31674194335938, "ref_logps/rejected": -84.77002716064453, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2758212685585022, "rewards/margins": 0.19891220331192017, "rewards/rejected": -0.47473347187042236, "step": 388 }, { "epoch": 0.291004301477464, "grad_norm": 72.80567243924784, "learning_rate": 4.160772762705611e-07, "logps/chosen": -103.52108001708984, "logps/rejected": -129.72653198242188, "loss": 0.5769, "losses/dpo": 0.6337815523147583, "losses/sft": 0.2983129024505615, "losses/total": 0.6337815523147583, "ref_logps/chosen": -100.64726257324219, "ref_logps/rejected": -123.97457885742188, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2873810827732086, "rewards/margins": 0.2878149151802063, "rewards/rejected": -0.5751960277557373, "step": 389 }, { "epoch": 0.2917523845146811, "grad_norm": 53.11174005515542, "learning_rate": 4.1562346592127044e-07, "logps/chosen": -66.69302368164062, "logps/rejected": -74.94159698486328, "loss": 0.627, "losses/dpo": 0.6542713046073914, "losses/sft": 0.4709785580635071, "losses/total": 0.6542713046073914, "ref_logps/chosen": -64.6053695678711, "ref_logps/rejected": -71.20217895507812, "rewards/accuracies": 0.71875, "rewards/chosen": -0.20876523852348328, "rewards/margins": 0.1651758849620819, "rewards/rejected": -0.3739411234855652, "step": 390 }, { "epoch": 0.29250046755189824, "grad_norm": 65.58858261363594, "learning_rate": 4.151686808475203e-07, "logps/chosen": -93.3748779296875, "logps/rejected": -91.1708984375, "loss": 0.6734, "losses/dpo": 0.7169638276100159, "losses/sft": 0.1976374387741089, "losses/total": 0.7169638276100159, "ref_logps/chosen": -89.40373229980469, "ref_logps/rejected": -86.19606018066406, "rewards/accuracies": 0.65625, "rewards/chosen": -0.397114634513855, "rewards/margins": 0.10037027299404144, "rewards/rejected": -0.49748489260673523, "step": 391 }, { "epoch": 0.29324855058911536, "grad_norm": 86.80081089273857, "learning_rate": 4.1471292372580443e-07, "logps/chosen": -85.85636901855469, "logps/rejected": -90.66569519042969, "loss": 0.6608, "losses/dpo": 0.6535675525665283, "losses/sft": 0.8053929209709167, "losses/total": 0.6535675525665283, "ref_logps/chosen": -82.15685272216797, "ref_logps/rejected": -86.03886413574219, "rewards/accuracies": 0.65625, "rewards/chosen": -0.36995214223861694, "rewards/margins": 0.09272924065589905, "rewards/rejected": -0.46268144249916077, "step": 392 }, { "epoch": 0.29399663362633255, "grad_norm": 101.42791928440496, "learning_rate": 4.1425619723833683e-07, "logps/chosen": -119.17550659179688, "logps/rejected": -121.9960708618164, "loss": 0.6195, "losses/dpo": 0.6728461980819702, "losses/sft": 1.3620843887329102, "losses/total": 0.6728461980819702, "ref_logps/chosen": -116.452880859375, "ref_logps/rejected": -117.20929718017578, "rewards/accuracies": 0.78125, "rewards/chosen": -0.27226197719573975, "rewards/margins": 0.2064143866300583, "rewards/rejected": -0.47867631912231445, "step": 393 }, { "epoch": 0.2947447166635497, "grad_norm": 75.47512849820177, "learning_rate": 4.1379850407303667e-07, "logps/chosen": -104.0035629272461, "logps/rejected": -105.18464660644531, "loss": 0.6667, "losses/dpo": 0.6452575922012329, "losses/sft": 1.3853174448013306, "losses/total": 0.6452575922012329, "ref_logps/chosen": -100.5472640991211, "ref_logps/rejected": -100.67808532714844, "rewards/accuracies": 0.59375, "rewards/chosen": -0.34563010931015015, "rewards/margins": 0.10502530634403229, "rewards/rejected": -0.45065540075302124, "step": 394 }, { "epoch": 0.2954927997007668, "grad_norm": 61.151185525301976, "learning_rate": 4.133398469235122e-07, "logps/chosen": -89.93748474121094, "logps/rejected": -89.63323974609375, "loss": 0.6674, "losses/dpo": 0.7023779153823853, "losses/sft": 0.26025834679603577, "losses/total": 0.7023779153823853, "ref_logps/chosen": -86.72557067871094, "ref_logps/rejected": -85.14253234863281, "rewards/accuracies": 0.625, "rewards/chosen": -0.3211905062198639, "rewards/margins": 0.12787987291812897, "rewards/rejected": -0.44907036423683167, "step": 395 }, { "epoch": 0.29624088273798393, "grad_norm": 62.71644524947364, "learning_rate": 4.128802284890448e-07, "logps/chosen": -101.12677001953125, "logps/rejected": -110.56572723388672, "loss": 0.5928, "losses/dpo": 0.5784692764282227, "losses/sft": 0.9545797109603882, "losses/total": 0.5784692764282227, "ref_logps/chosen": -98.42140197753906, "ref_logps/rejected": -105.34101104736328, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2705361545085907, "rewards/margins": 0.2519353926181793, "rewards/rejected": -0.5224716067314148, "step": 396 }, { "epoch": 0.29698896577520106, "grad_norm": 112.99006231040057, "learning_rate": 4.124196514745731e-07, "logps/chosen": -103.50193786621094, "logps/rejected": -100.85554504394531, "loss": 0.6899, "losses/dpo": 0.9087870717048645, "losses/sft": 1.038790225982666, "losses/total": 0.9087870717048645, "ref_logps/chosen": -99.77083587646484, "ref_logps/rejected": -96.61971282958984, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3731100559234619, "rewards/margins": 0.05047391355037689, "rewards/rejected": -0.423583984375, "step": 397 }, { "epoch": 0.2977370488124182, "grad_norm": 71.67526933654698, "learning_rate": 4.119581185906775e-07, "logps/chosen": -62.57926940917969, "logps/rejected": -68.94880676269531, "loss": 0.6841, "losses/dpo": 0.7435628175735474, "losses/sft": 0.5898489952087402, "losses/total": 0.7435628175735474, "ref_logps/chosen": -60.03893280029297, "ref_logps/rejected": -65.92884826660156, "rewards/accuracies": 0.5, "rewards/chosen": -0.2540333867073059, "rewards/margins": 0.04796294867992401, "rewards/rejected": -0.3019963502883911, "step": 398 }, { "epoch": 0.2984851318496353, "grad_norm": 57.782508403140554, "learning_rate": 4.1149563255356344e-07, "logps/chosen": -89.75532531738281, "logps/rejected": -98.67219543457031, "loss": 0.5889, "losses/dpo": 0.5487593412399292, "losses/sft": 0.6175507307052612, "losses/total": 0.5487593412399292, "ref_logps/chosen": -87.03262329101562, "ref_logps/rejected": -93.24620056152344, "rewards/accuracies": 0.71875, "rewards/chosen": -0.27226966619491577, "rewards/margins": 0.2703302204608917, "rewards/rejected": -0.5425998568534851, "step": 399 }, { "epoch": 0.29923321488685245, "grad_norm": 61.11271356774331, "learning_rate": 4.1103219608504605e-07, "logps/chosen": -68.95610046386719, "logps/rejected": -76.6453857421875, "loss": 0.7046, "losses/dpo": 0.7525333166122437, "losses/sft": 0.3280377686023712, "losses/total": 0.7525333166122437, "ref_logps/chosen": -66.00277709960938, "ref_logps/rejected": -73.59567260742188, "rewards/accuracies": 0.5, "rewards/chosen": -0.29533249139785767, "rewards/margins": 0.009638383984565735, "rewards/rejected": -0.3049708604812622, "step": 400 }, { "epoch": 0.2999812979240696, "grad_norm": 64.351919284585, "learning_rate": 4.1056781191253396e-07, "logps/chosen": -104.71839904785156, "logps/rejected": -103.46099853515625, "loss": 0.6594, "losses/dpo": 0.6111490726470947, "losses/sft": 1.2278536558151245, "losses/total": 0.6111490726470947, "ref_logps/chosen": -101.38566589355469, "ref_logps/rejected": -98.89288330078125, "rewards/accuracies": 0.59375, "rewards/chosen": -0.33327388763427734, "rewards/margins": 0.12353702634572983, "rewards/rejected": -0.45681092143058777, "step": 401 }, { "epoch": 0.3007293809612867, "grad_norm": 79.57813957269649, "learning_rate": 4.1010248276901317e-07, "logps/chosen": -106.5688247680664, "logps/rejected": -115.73623657226562, "loss": 0.6427, "losses/dpo": 0.7220480442047119, "losses/sft": 0.3787539005279541, "losses/total": 0.7220480442047119, "ref_logps/chosen": -103.11180114746094, "ref_logps/rejected": -110.77111053466797, "rewards/accuracies": 0.6875, "rewards/chosen": -0.34570246934890747, "rewards/margins": 0.15080928802490234, "rewards/rejected": -0.4965117573738098, "step": 402 }, { "epoch": 0.3007293809612867, "eval_logps/chosen": -38.12191390991211, "eval_logps/rejected": -42.93342590332031, "eval_loss": 0.6449341177940369, "eval_losses/dpo": 0.6665902137756348, "eval_losses/sft": 0.3194921016693115, "eval_losses/total": 0.6665902137756348, "eval_ref_logps/chosen": -35.788204193115234, "eval_ref_logps/rejected": -39.235660552978516, "eval_rewards/accuracies": 0.631465494632721, "eval_rewards/chosen": -0.23337091505527496, "eval_rewards/margins": 0.13640545308589935, "eval_rewards/rejected": -0.3697763681411743, "eval_runtime": 38.0998, "eval_samples_per_second": 12.152, "eval_steps_per_second": 1.522, "step": 402 }, { "epoch": 0.30147746399850384, "grad_norm": 79.80624228898748, "learning_rate": 4.0963621139303094e-07, "logps/chosen": -109.87112426757812, "logps/rejected": -119.45699310302734, "loss": 0.6003, "losses/dpo": 0.49853605031967163, "losses/sft": 1.3727030754089355, "losses/total": 0.49853605031967163, "ref_logps/chosen": -107.29054260253906, "ref_logps/rejected": -114.48904418945312, "rewards/accuracies": 0.71875, "rewards/chosen": -0.25805726647377014, "rewards/margins": 0.23873820900917053, "rewards/rejected": -0.4967954754829407, "step": 403 }, { "epoch": 0.30222554703572097, "grad_norm": 68.1067629000617, "learning_rate": 4.091690005286799e-07, "logps/chosen": -85.49311828613281, "logps/rejected": -89.87053680419922, "loss": 0.6964, "losses/dpo": 0.6462661623954773, "losses/sft": 0.5406739115715027, "losses/total": 0.6462661623954773, "ref_logps/chosen": -81.82528686523438, "ref_logps/rejected": -85.94021606445312, "rewards/accuracies": 0.65625, "rewards/chosen": -0.36678290367126465, "rewards/margins": 0.026249976828694344, "rewards/rejected": -0.39303287863731384, "step": 404 }, { "epoch": 0.3029736300729381, "grad_norm": 72.81388807815516, "learning_rate": 4.087008529255814e-07, "logps/chosen": -91.80622863769531, "logps/rejected": -104.83479309082031, "loss": 0.6177, "losses/dpo": 0.5293331146240234, "losses/sft": 1.0656768083572388, "losses/total": 0.5293331146240234, "ref_logps/chosen": -88.36183166503906, "ref_logps/rejected": -99.43157958984375, "rewards/accuracies": 0.75, "rewards/chosen": -0.3444405198097229, "rewards/margins": 0.19588156044483185, "rewards/rejected": -0.5403220653533936, "step": 405 }, { "epoch": 0.3037217131101552, "grad_norm": 53.00269922544611, "learning_rate": 4.0823177133887016e-07, "logps/chosen": -91.5296859741211, "logps/rejected": -92.5514907836914, "loss": 0.69, "losses/dpo": 0.5607931017875671, "losses/sft": 0.5130151510238647, "losses/total": 0.5607931017875671, "ref_logps/chosen": -88.2492446899414, "ref_logps/rejected": -88.65289306640625, "rewards/accuracies": 0.5625, "rewards/chosen": -0.32804417610168457, "rewards/margins": 0.06181597709655762, "rewards/rejected": -0.3898601531982422, "step": 406 }, { "epoch": 0.30446979614737235, "grad_norm": 57.36131291244925, "learning_rate": 4.077617585291772e-07, "logps/chosen": -74.71708679199219, "logps/rejected": -81.5923843383789, "loss": 0.6395, "losses/dpo": 0.5574181079864502, "losses/sft": 0.6046332716941833, "losses/total": 0.5574181079864502, "ref_logps/chosen": -72.27204132080078, "ref_logps/rejected": -77.46245574951172, "rewards/accuracies": 0.5625, "rewards/chosen": -0.24450382590293884, "rewards/margins": 0.1684892326593399, "rewards/rejected": -0.41299304366111755, "step": 407 }, { "epoch": 0.3052178791845895, "grad_norm": 60.01478124381459, "learning_rate": 4.0729081726261403e-07, "logps/chosen": -79.79627227783203, "logps/rejected": -89.93247985839844, "loss": 0.6144, "losses/dpo": 0.5408573150634766, "losses/sft": 1.2792030572891235, "losses/total": 0.5408573150634766, "ref_logps/chosen": -78.05073547363281, "ref_logps/rejected": -86.1183853149414, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1745530515909195, "rewards/margins": 0.20685599744319916, "rewards/rejected": -0.38140904903411865, "step": 408 }, { "epoch": 0.3059659622218066, "grad_norm": 55.812001361734, "learning_rate": 4.068189503107564e-07, "logps/chosen": -86.1483383178711, "logps/rejected": -95.67485046386719, "loss": 0.6178, "losses/dpo": 0.4712520241737366, "losses/sft": 0.7892611622810364, "losses/total": 0.4712520241737366, "ref_logps/chosen": -83.71187591552734, "ref_logps/rejected": -91.16522979736328, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2436455488204956, "rewards/margins": 0.20731624960899353, "rewards/rejected": -0.45096179842948914, "step": 409 }, { "epoch": 0.30671404525902374, "grad_norm": 71.0461636086716, "learning_rate": 4.0634616045062786e-07, "logps/chosen": -69.23989868164062, "logps/rejected": -86.7921142578125, "loss": 0.6303, "losses/dpo": 0.5790273547172546, "losses/sft": 0.19181804358959198, "losses/total": 0.5790273547172546, "ref_logps/chosen": -67.4405288696289, "ref_logps/rejected": -83.12232208251953, "rewards/accuracies": 0.5625, "rewards/chosen": -0.17993669211864471, "rewards/margins": 0.18704192340373993, "rewards/rejected": -0.36697864532470703, "step": 410 }, { "epoch": 0.30746212829624087, "grad_norm": 66.74769704039176, "learning_rate": 4.058724504646834e-07, "logps/chosen": -95.54651641845703, "logps/rejected": -91.33610534667969, "loss": 0.6537, "losses/dpo": 0.6427196264266968, "losses/sft": 0.5492734909057617, "losses/total": 0.6427196264266968, "ref_logps/chosen": -92.57904052734375, "ref_logps/rejected": -87.233154296875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2967478632926941, "rewards/margins": 0.11354569345712662, "rewards/rejected": -0.4102935791015625, "step": 411 }, { "epoch": 0.308210211333458, "grad_norm": 68.54545854401054, "learning_rate": 4.0539782314079303e-07, "logps/chosen": -87.63526916503906, "logps/rejected": -90.53598022460938, "loss": 0.6523, "losses/dpo": 0.5802704095840454, "losses/sft": 0.3004165291786194, "losses/total": 0.5802704095840454, "ref_logps/chosen": -84.59001159667969, "ref_logps/rejected": -86.20924377441406, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3045269250869751, "rewards/margins": 0.12814565002918243, "rewards/rejected": -0.43267256021499634, "step": 412 }, { "epoch": 0.3089582943706751, "grad_norm": 82.06577605356378, "learning_rate": 4.049222812722256e-07, "logps/chosen": -109.60850524902344, "logps/rejected": -121.68104553222656, "loss": 0.5744, "losses/dpo": 0.647861123085022, "losses/sft": 0.634648859500885, "losses/total": 0.647861123085022, "ref_logps/chosen": -106.61862182617188, "ref_logps/rejected": -115.44195556640625, "rewards/accuracies": 0.75, "rewards/chosen": -0.29898756742477417, "rewards/margins": 0.3249210715293884, "rewards/rejected": -0.6239086389541626, "step": 413 }, { "epoch": 0.30970637740789225, "grad_norm": 80.32936958041873, "learning_rate": 4.0444582765763225e-07, "logps/chosen": -103.69288635253906, "logps/rejected": -117.85601806640625, "loss": 0.6076, "losses/dpo": 0.6093718409538269, "losses/sft": 1.119607925415039, "losses/total": 0.6093718409538269, "ref_logps/chosen": -100.667724609375, "ref_logps/rejected": -112.56735229492188, "rewards/accuracies": 0.75, "rewards/chosen": -0.30251544713974, "rewards/margins": 0.22635163366794586, "rewards/rejected": -0.5288670659065247, "step": 414 }, { "epoch": 0.3104544604451094, "grad_norm": 64.6525682274081, "learning_rate": 4.0396846510102983e-07, "logps/chosen": -97.1929931640625, "logps/rejected": -109.17264556884766, "loss": 0.6264, "losses/dpo": 0.5272024869918823, "losses/sft": 0.41647017002105713, "losses/total": 0.5272024869918823, "ref_logps/chosen": -94.81771850585938, "ref_logps/rejected": -104.61764526367188, "rewards/accuracies": 0.71875, "rewards/chosen": -0.23752787709236145, "rewards/margins": 0.21797136962413788, "rewards/rejected": -0.4554992616176605, "step": 415 }, { "epoch": 0.31120254348232657, "grad_norm": 126.45872825464531, "learning_rate": 4.0349019641178437e-07, "logps/chosen": -115.88329315185547, "logps/rejected": -114.70958709716797, "loss": 0.644, "losses/dpo": 0.6127883195877075, "losses/sft": 1.0987428426742554, "losses/total": 0.6127883195877075, "ref_logps/chosen": -112.39251708984375, "ref_logps/rejected": -109.62857055664062, "rewards/accuracies": 0.625, "rewards/chosen": -0.34907814860343933, "rewards/margins": 0.15902383625507355, "rewards/rejected": -0.5081019997596741, "step": 416 }, { "epoch": 0.3119506265195437, "grad_norm": 75.30562960995778, "learning_rate": 4.03011024404595e-07, "logps/chosen": -110.96200561523438, "logps/rejected": -111.28706359863281, "loss": 0.6571, "losses/dpo": 0.7238119840621948, "losses/sft": 0.629623293876648, "losses/total": 0.7238119840621948, "ref_logps/chosen": -107.95684814453125, "ref_logps/rejected": -106.89905548095703, "rewards/accuracies": 0.53125, "rewards/chosen": -0.30051541328430176, "rewards/margins": 0.13828566670417786, "rewards/rejected": -0.4388010799884796, "step": 417 }, { "epoch": 0.3126987095567608, "grad_norm": 68.90252973067639, "learning_rate": 4.025309518994767e-07, "logps/chosen": -108.11424255371094, "logps/rejected": -117.80757904052734, "loss": 0.6806, "losses/dpo": 0.6468265056610107, "losses/sft": 1.2807934284210205, "losses/total": 0.6468265056610107, "ref_logps/chosen": -105.21532440185547, "ref_logps/rejected": -114.34671020507812, "rewards/accuracies": 0.53125, "rewards/chosen": -0.28989264369010925, "rewards/margins": 0.05619426444172859, "rewards/rejected": -0.34608688950538635, "step": 418 }, { "epoch": 0.31344679259397795, "grad_norm": 59.983359492957725, "learning_rate": 4.020499817217441e-07, "logps/chosen": -75.89962768554688, "logps/rejected": -92.03089904785156, "loss": 0.6107, "losses/dpo": 0.502295732498169, "losses/sft": 0.24557626247406006, "losses/total": 0.502295732498169, "ref_logps/chosen": -73.68333435058594, "ref_logps/rejected": -87.51895141601562, "rewards/accuracies": 0.65625, "rewards/chosen": -0.22162964940071106, "rewards/margins": 0.22956444323062897, "rewards/rejected": -0.4511941075325012, "step": 419 }, { "epoch": 0.3141948756311951, "grad_norm": 103.62029109328599, "learning_rate": 4.01568116701995e-07, "logps/chosen": -102.99961853027344, "logps/rejected": -104.78469848632812, "loss": 0.6763, "losses/dpo": 0.7222643494606018, "losses/sft": 1.1946238279342651, "losses/total": 0.7222643494606018, "ref_logps/chosen": -98.93156433105469, "ref_logps/rejected": -99.92823028564453, "rewards/accuracies": 0.625, "rewards/chosen": -0.4068056344985962, "rewards/margins": 0.07884092628955841, "rewards/rejected": -0.4856465458869934, "step": 420 }, { "epoch": 0.3149429586684122, "grad_norm": 81.7134793334181, "learning_rate": 4.0108535967609323e-07, "logps/chosen": -91.32058715820312, "logps/rejected": -104.98336791992188, "loss": 0.644, "losses/dpo": 0.5873275399208069, "losses/sft": 0.5764497518539429, "losses/total": 0.5873275399208069, "ref_logps/chosen": -88.02153778076172, "ref_logps/rejected": -100.3485107421875, "rewards/accuracies": 0.59375, "rewards/chosen": -0.32990553975105286, "rewards/margins": 0.133580282330513, "rewards/rejected": -0.46348583698272705, "step": 421 }, { "epoch": 0.31569104170562934, "grad_norm": 77.65641041054074, "learning_rate": 4.0060171348515257e-07, "logps/chosen": -86.51165771484375, "logps/rejected": -102.88876342773438, "loss": 0.6366, "losses/dpo": 0.8151021003723145, "losses/sft": 0.618018627166748, "losses/total": 0.8151021003723145, "ref_logps/chosen": -83.52452087402344, "ref_logps/rejected": -98.32380676269531, "rewards/accuracies": 0.65625, "rewards/chosen": -0.29871419072151184, "rewards/margins": 0.1577829122543335, "rewards/rejected": -0.45649710297584534, "step": 422 }, { "epoch": 0.31643912474284647, "grad_norm": 84.20082293853554, "learning_rate": 4.001171809755195e-07, "logps/chosen": -105.68197631835938, "logps/rejected": -111.67355346679688, "loss": 0.6807, "losses/dpo": 0.6474246978759766, "losses/sft": 1.086479663848877, "losses/total": 0.6474246978759766, "ref_logps/chosen": -101.92076873779297, "ref_logps/rejected": -107.04911041259766, "rewards/accuracies": 0.5, "rewards/chosen": -0.37612003087997437, "rewards/margins": 0.0863235741853714, "rewards/rejected": -0.46244359016418457, "step": 423 }, { "epoch": 0.3171872077800636, "grad_norm": 65.37865554479792, "learning_rate": 3.996317649987565e-07, "logps/chosen": -89.56013488769531, "logps/rejected": -105.94629669189453, "loss": 0.6207, "losses/dpo": 0.6066243648529053, "losses/sft": 0.7007786631584167, "losses/total": 0.6066243648529053, "ref_logps/chosen": -86.41014099121094, "ref_logps/rejected": -100.9263916015625, "rewards/accuracies": 0.625, "rewards/chosen": -0.3150005042552948, "rewards/margins": 0.18698954582214355, "rewards/rejected": -0.5019900798797607, "step": 424 }, { "epoch": 0.3179352908172807, "grad_norm": 69.94140575912805, "learning_rate": 3.991454684116257e-07, "logps/chosen": -94.8255844116211, "logps/rejected": -109.76188659667969, "loss": 0.6283, "losses/dpo": 0.5830463171005249, "losses/sft": 1.3820254802703857, "losses/total": 0.5830463171005249, "ref_logps/chosen": -91.85401153564453, "ref_logps/rejected": -104.49398040771484, "rewards/accuracies": 0.625, "rewards/chosen": -0.2971574664115906, "rewards/margins": 0.22963380813598633, "rewards/rejected": -0.5267912745475769, "step": 425 }, { "epoch": 0.31868337385449785, "grad_norm": 72.60349705157908, "learning_rate": 3.9865829407607166e-07, "logps/chosen": -87.78775787353516, "logps/rejected": -103.01887512207031, "loss": 0.6292, "losses/dpo": 0.7087821960449219, "losses/sft": 1.0878392457962036, "losses/total": 0.7087821960449219, "ref_logps/chosen": -84.41118621826172, "ref_logps/rejected": -97.68626403808594, "rewards/accuracies": 0.71875, "rewards/chosen": -0.33765673637390137, "rewards/margins": 0.1956038475036621, "rewards/rejected": -0.5332606434822083, "step": 426 }, { "epoch": 0.319431456891715, "grad_norm": 71.25085429967727, "learning_rate": 3.981702448592046e-07, "logps/chosen": -74.51376342773438, "logps/rejected": -82.4559555053711, "loss": 0.7019, "losses/dpo": 0.5549836754798889, "losses/sft": 1.0648905038833618, "losses/total": 0.5549836754798889, "ref_logps/chosen": -71.43677520751953, "ref_logps/rejected": -79.18876647949219, "rewards/accuracies": 0.59375, "rewards/chosen": -0.30769893527030945, "rewards/margins": 0.019019566476345062, "rewards/rejected": -0.3267184793949127, "step": 427 }, { "epoch": 0.3201795399289321, "grad_norm": 76.70767777173339, "learning_rate": 3.976813236332837e-07, "logps/chosen": -110.92650604248047, "logps/rejected": -115.1159439086914, "loss": 0.6372, "losses/dpo": 0.5650621652603149, "losses/sft": 1.0259181261062622, "losses/total": 0.5650621652603149, "ref_logps/chosen": -107.53527069091797, "ref_logps/rejected": -110.10155487060547, "rewards/accuracies": 0.625, "rewards/chosen": -0.339123010635376, "rewards/margins": 0.16231590509414673, "rewards/rejected": -0.5014389157295227, "step": 428 }, { "epoch": 0.32092762296614924, "grad_norm": 53.660761079797275, "learning_rate": 3.971915332756999e-07, "logps/chosen": -97.02236938476562, "logps/rejected": -96.38919830322266, "loss": 0.6518, "losses/dpo": 0.5432475209236145, "losses/sft": 0.9187198281288147, "losses/total": 0.5432475209236145, "ref_logps/chosen": -94.91315460205078, "ref_logps/rejected": -93.0185317993164, "rewards/accuracies": 0.71875, "rewards/chosen": -0.21092185378074646, "rewards/margins": 0.1261441707611084, "rewards/rejected": -0.33706602454185486, "step": 429 }, { "epoch": 0.32167570600336637, "grad_norm": 92.11065164575945, "learning_rate": 3.9670087666895936e-07, "logps/chosen": -95.7829360961914, "logps/rejected": -88.99809265136719, "loss": 0.7249, "losses/dpo": 0.639995276927948, "losses/sft": 0.22489672899246216, "losses/total": 0.639995276927948, "ref_logps/chosen": -92.1137924194336, "ref_logps/rejected": -85.10140991210938, "rewards/accuracies": 0.40625, "rewards/chosen": -0.36691486835479736, "rewards/margins": 0.022753456607460976, "rewards/rejected": -0.3896683156490326, "step": 430 }, { "epoch": 0.3224237890405835, "grad_norm": 52.88174099709453, "learning_rate": 3.9620935670066615e-07, "logps/chosen": -84.58985137939453, "logps/rejected": -79.594970703125, "loss": 0.7251, "losses/dpo": 0.7397390604019165, "losses/sft": 0.2404203712940216, "losses/total": 0.7397390604019165, "ref_logps/chosen": -81.58450317382812, "ref_logps/rejected": -76.62251281738281, "rewards/accuracies": 0.34375, "rewards/chosen": -0.30053502321243286, "rewards/margins": -0.0032885540276765823, "rewards/rejected": -0.2972464859485626, "step": 431 }, { "epoch": 0.3231718720778006, "grad_norm": 73.47612483671486, "learning_rate": 3.957169762635053e-07, "logps/chosen": -86.64166259765625, "logps/rejected": -87.47342681884766, "loss": 0.6723, "losses/dpo": 0.5861333608627319, "losses/sft": 0.5796988606452942, "losses/total": 0.5861333608627319, "ref_logps/chosen": -83.99126434326172, "ref_logps/rejected": -83.97229766845703, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2650403082370758, "rewards/margins": 0.08507286757230759, "rewards/rejected": -0.3501132130622864, "step": 432 }, { "epoch": 0.32391995511501775, "grad_norm": 71.21478954114552, "learning_rate": 3.95223738255226e-07, "logps/chosen": -92.5272216796875, "logps/rejected": -101.27699279785156, "loss": 0.653, "losses/dpo": 0.796826958656311, "losses/sft": 1.137900471687317, "losses/total": 0.796826958656311, "ref_logps/chosen": -89.09515380859375, "ref_logps/rejected": -96.33619689941406, "rewards/accuracies": 0.5625, "rewards/chosen": -0.34320729970932007, "rewards/margins": 0.15087108314037323, "rewards/rejected": -0.4940783679485321, "step": 433 }, { "epoch": 0.3246680381522349, "grad_norm": 92.48796242408724, "learning_rate": 3.947296455786244e-07, "logps/chosen": -100.68257141113281, "logps/rejected": -105.42575073242188, "loss": 0.6541, "losses/dpo": 0.9157425761222839, "losses/sft": 0.3389385938644409, "losses/total": 0.9157425761222839, "ref_logps/chosen": -97.0872802734375, "ref_logps/rejected": -100.33599853515625, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3595285415649414, "rewards/margins": 0.14944753050804138, "rewards/rejected": -0.5089760422706604, "step": 434 }, { "epoch": 0.325416121189452, "grad_norm": 73.10862504248749, "learning_rate": 3.9423470114152635e-07, "logps/chosen": -83.29928588867188, "logps/rejected": -92.48457336425781, "loss": 0.609, "losses/dpo": 0.6235782504081726, "losses/sft": 1.1920592784881592, "losses/total": 0.6235782504081726, "ref_logps/chosen": -80.30609130859375, "ref_logps/rejected": -87.0528564453125, "rewards/accuracies": 0.59375, "rewards/chosen": -0.29931890964508057, "rewards/margins": 0.24385233223438263, "rewards/rejected": -0.5431711673736572, "step": 435 }, { "epoch": 0.32616420422666914, "grad_norm": 62.890380919557415, "learning_rate": 3.9373890785677074e-07, "logps/chosen": -100.17208862304688, "logps/rejected": -110.0091781616211, "loss": 0.5872, "losses/dpo": 0.43491512537002563, "losses/sft": 0.604377269744873, "losses/total": 0.43491512537002563, "ref_logps/chosen": -97.15734100341797, "ref_logps/rejected": -104.12722778320312, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3014741837978363, "rewards/margins": 0.2867213785648346, "rewards/rejected": -0.5881955623626709, "step": 436 }, { "epoch": 0.32691228726388627, "grad_norm": 72.5896326244794, "learning_rate": 3.9324226864219186e-07, "logps/chosen": -93.6436767578125, "logps/rejected": -97.98384094238281, "loss": 0.6422, "losses/dpo": 0.5582523345947266, "losses/sft": 1.0219215154647827, "losses/total": 0.5582523345947266, "ref_logps/chosen": -91.07538604736328, "ref_logps/rejected": -93.84363555908203, "rewards/accuracies": 0.625, "rewards/chosen": -0.2568296194076538, "rewards/margins": 0.1571904867887497, "rewards/rejected": -0.4140201210975647, "step": 437 }, { "epoch": 0.3276603703011034, "grad_norm": 76.07549348558922, "learning_rate": 3.927447864206025e-07, "logps/chosen": -92.69998168945312, "logps/rejected": -93.96725463867188, "loss": 0.6749, "losses/dpo": 0.8948017358779907, "losses/sft": 1.0610474348068237, "losses/total": 0.8948017358779907, "ref_logps/chosen": -88.8744888305664, "ref_logps/rejected": -89.21636199951172, "rewards/accuracies": 0.375, "rewards/chosen": -0.38254913687705994, "rewards/margins": 0.09254151582717896, "rewards/rejected": -0.4750906229019165, "step": 438 }, { "epoch": 0.3284084533383205, "grad_norm": 58.93158583028018, "learning_rate": 3.9224646411977683e-07, "logps/chosen": -88.2759780883789, "logps/rejected": -98.56522369384766, "loss": 0.6315, "losses/dpo": 0.6478318572044373, "losses/sft": 0.8289915323257446, "losses/total": 0.6478318572044373, "ref_logps/chosen": -84.90039825439453, "ref_logps/rejected": -93.23046875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3375582695007324, "rewards/margins": 0.1959175169467926, "rewards/rejected": -0.5334757566452026, "step": 439 }, { "epoch": 0.3291565363755377, "grad_norm": 68.44040542078916, "learning_rate": 3.917473046724329e-07, "logps/chosen": -105.46665954589844, "logps/rejected": -124.25345611572266, "loss": 0.6002, "losses/dpo": 0.7857096195220947, "losses/sft": 0.9716615676879883, "losses/total": 0.7857096195220947, "ref_logps/chosen": -102.0205078125, "ref_logps/rejected": -118.31937408447266, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3446151614189148, "rewards/margins": 0.248792365193367, "rewards/rejected": -0.5934075117111206, "step": 440 }, { "epoch": 0.32990461941275484, "grad_norm": 57.44461111851674, "learning_rate": 3.9124731101621555e-07, "logps/chosen": -111.64830017089844, "logps/rejected": -114.1006088256836, "loss": 0.6477, "losses/dpo": 0.4028710126876831, "losses/sft": 0.5666754245758057, "losses/total": 0.4028710126876831, "ref_logps/chosen": -107.81105041503906, "ref_logps/rejected": -108.57315063476562, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3837239742279053, "rewards/margins": 0.1690213680267334, "rewards/rejected": -0.5527453422546387, "step": 441 }, { "epoch": 0.33065270244997197, "grad_norm": 65.51098080674878, "learning_rate": 3.9074648609367913e-07, "logps/chosen": -101.65794372558594, "logps/rejected": -108.99177551269531, "loss": 0.6373, "losses/dpo": 0.652005672454834, "losses/sft": 0.6236210465431213, "losses/total": 0.652005672454834, "ref_logps/chosen": -99.07010650634766, "ref_logps/rejected": -104.65498352050781, "rewards/accuracies": 0.6875, "rewards/chosen": -0.25878340005874634, "rewards/margins": 0.17489539086818695, "rewards/rejected": -0.4336787462234497, "step": 442 }, { "epoch": 0.3314007854871891, "grad_norm": 74.55011039434045, "learning_rate": 3.9024483285227014e-07, "logps/chosen": -88.26228332519531, "logps/rejected": -93.7977523803711, "loss": 0.6457, "losses/dpo": 0.7524773478507996, "losses/sft": 0.9942690134048462, "losses/total": 0.7524773478507996, "ref_logps/chosen": -85.48783874511719, "ref_logps/rejected": -89.47219848632812, "rewards/accuracies": 0.625, "rewards/chosen": -0.27744418382644653, "rewards/margins": 0.15511168539524078, "rewards/rejected": -0.4325559139251709, "step": 443 }, { "epoch": 0.3321488685244062, "grad_norm": 50.775069880097014, "learning_rate": 3.897423542443098e-07, "logps/chosen": -90.06489562988281, "logps/rejected": -93.7593994140625, "loss": 0.6109, "losses/dpo": 0.6973180174827576, "losses/sft": 0.9053441286087036, "losses/total": 0.6973180174827576, "ref_logps/chosen": -87.59675598144531, "ref_logps/rejected": -89.04324340820312, "rewards/accuracies": 0.65625, "rewards/chosen": -0.24681319296360016, "rewards/margins": 0.224802166223526, "rewards/rejected": -0.47161537408828735, "step": 444 }, { "epoch": 0.33289695156162336, "grad_norm": 70.11678996448937, "learning_rate": 3.892390532269768e-07, "logps/chosen": -86.44689178466797, "logps/rejected": -91.95427703857422, "loss": 0.6259, "losses/dpo": 0.7251797914505005, "losses/sft": 1.0454893112182617, "losses/total": 0.7251797914505005, "ref_logps/chosen": -84.00619506835938, "ref_logps/rejected": -87.61637878417969, "rewards/accuracies": 0.625, "rewards/chosen": -0.24406886100769043, "rewards/margins": 0.18972155451774597, "rewards/rejected": -0.4337904751300812, "step": 445 }, { "epoch": 0.3336450345988405, "grad_norm": 70.47975563331701, "learning_rate": 3.8873493276229006e-07, "logps/chosen": -72.170654296875, "logps/rejected": -80.54757690429688, "loss": 0.6909, "losses/dpo": 0.5073265433311462, "losses/sft": 0.1933509111404419, "losses/total": 0.5073265433311462, "ref_logps/chosen": -68.72400665283203, "ref_logps/rejected": -76.66437530517578, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3446650505065918, "rewards/margins": 0.04365500807762146, "rewards/rejected": -0.38832008838653564, "step": 446 }, { "epoch": 0.3343931176360576, "grad_norm": 66.36656326440445, "learning_rate": 3.882299958170908e-07, "logps/chosen": -74.01231384277344, "logps/rejected": -77.60809326171875, "loss": 0.6615, "losses/dpo": 0.649003267288208, "losses/sft": 0.6935522556304932, "losses/total": 0.649003267288208, "ref_logps/chosen": -71.01026916503906, "ref_logps/rejected": -73.65438079833984, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3002045750617981, "rewards/margins": 0.09516683220863342, "rewards/rejected": -0.3953714072704315, "step": 447 }, { "epoch": 0.33514120067327474, "grad_norm": 67.58472834259388, "learning_rate": 3.877242453630256e-07, "logps/chosen": -93.02215576171875, "logps/rejected": -98.6417465209961, "loss": 0.6642, "losses/dpo": 0.6402926445007324, "losses/sft": 0.940190851688385, "losses/total": 0.6402926445007324, "ref_logps/chosen": -89.68971252441406, "ref_logps/rejected": -93.89846801757812, "rewards/accuracies": 0.53125, "rewards/chosen": -0.33324360847473145, "rewards/margins": 0.14108437299728394, "rewards/rejected": -0.4743279814720154, "step": 448 }, { "epoch": 0.33588928371049187, "grad_norm": 69.81211381632954, "learning_rate": 3.8721768437652844e-07, "logps/chosen": -80.84601593017578, "logps/rejected": -93.20015716552734, "loss": 0.6452, "losses/dpo": 0.6386904716491699, "losses/sft": 0.7573591470718384, "losses/total": 0.6386904716491699, "ref_logps/chosen": -78.45005798339844, "ref_logps/rejected": -89.23762512207031, "rewards/accuracies": 0.59375, "rewards/chosen": -0.23959574103355408, "rewards/margins": 0.15665757656097412, "rewards/rejected": -0.3962532877922058, "step": 449 }, { "epoch": 0.336637366747709, "grad_norm": 66.11251831516034, "learning_rate": 3.867103158388039e-07, "logps/chosen": -108.20269775390625, "logps/rejected": -120.94900512695312, "loss": 0.635, "losses/dpo": 0.5267238616943359, "losses/sft": 1.0224450826644897, "losses/total": 0.5267238616943359, "ref_logps/chosen": -104.97881317138672, "ref_logps/rejected": -115.40927124023438, "rewards/accuracies": 0.65625, "rewards/chosen": -0.322388231754303, "rewards/margins": 0.23158498108386993, "rewards/rejected": -0.5539731979370117, "step": 450 }, { "epoch": 0.33738544978492613, "grad_norm": 57.06259838818595, "learning_rate": 3.862021427358084e-07, "logps/chosen": -89.01814270019531, "logps/rejected": -93.53445434570312, "loss": 0.6719, "losses/dpo": 0.6098772287368774, "losses/sft": 1.0963928699493408, "losses/total": 0.6098772287368774, "ref_logps/chosen": -86.06135559082031, "ref_logps/rejected": -89.73529052734375, "rewards/accuracies": 0.625, "rewards/chosen": -0.29567933082580566, "rewards/margins": 0.08423683792352676, "rewards/rejected": -0.37991613149642944, "step": 451 }, { "epoch": 0.33813353282214326, "grad_norm": 52.13640575810865, "learning_rate": 3.8569316805823416e-07, "logps/chosen": -75.651611328125, "logps/rejected": -94.55577087402344, "loss": 0.5914, "losses/dpo": 0.6240452527999878, "losses/sft": 0.6982599496841431, "losses/total": 0.6240452527999878, "ref_logps/chosen": -73.29026794433594, "ref_logps/rejected": -89.50032806396484, "rewards/accuracies": 0.65625, "rewards/chosen": -0.23613393306732178, "rewards/margins": 0.2694106996059418, "rewards/rejected": -0.5055446028709412, "step": 452 }, { "epoch": 0.3388816158593604, "grad_norm": 74.29949802778472, "learning_rate": 3.851833948014903e-07, "logps/chosen": -107.18257141113281, "logps/rejected": -109.71834564208984, "loss": 0.6483, "losses/dpo": 0.6943519115447998, "losses/sft": 0.6750003695487976, "losses/total": 0.6943519115447998, "ref_logps/chosen": -103.74610137939453, "ref_logps/rejected": -104.83622741699219, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3436462879180908, "rewards/margins": 0.14456552267074585, "rewards/rejected": -0.48821181058883667, "step": 453 }, { "epoch": 0.3396296988965775, "grad_norm": 61.243913900882276, "learning_rate": 3.8467282596568595e-07, "logps/chosen": -112.67169952392578, "logps/rejected": -121.51107788085938, "loss": 0.5847, "losses/dpo": 0.4702145457267761, "losses/sft": 1.2025010585784912, "losses/total": 0.4702145457267761, "ref_logps/chosen": -109.80242156982422, "ref_logps/rejected": -115.68096923828125, "rewards/accuracies": 0.75, "rewards/chosen": -0.2869284152984619, "rewards/margins": 0.29608213901519775, "rewards/rejected": -0.5830105543136597, "step": 454 }, { "epoch": 0.34037778193379464, "grad_norm": 66.08455623955962, "learning_rate": 3.8416146455561225e-07, "logps/chosen": -108.4802017211914, "logps/rejected": -100.60990142822266, "loss": 0.661, "losses/dpo": 0.48239973187446594, "losses/sft": 0.9943763017654419, "losses/total": 0.48239973187446594, "ref_logps/chosen": -105.38045501708984, "ref_logps/rejected": -96.4166488647461, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3099748492240906, "rewards/margins": 0.10934992879629135, "rewards/rejected": -0.4193247854709625, "step": 455 }, { "epoch": 0.34112586497101177, "grad_norm": 91.15563221834256, "learning_rate": 3.8364931358072496e-07, "logps/chosen": -113.60888671875, "logps/rejected": -119.38871765136719, "loss": 0.7155, "losses/dpo": 0.5682973861694336, "losses/sft": 0.6286466717720032, "losses/total": 0.5682973861694336, "ref_logps/chosen": -110.18307495117188, "ref_logps/rejected": -115.61735534667969, "rewards/accuracies": 0.53125, "rewards/chosen": -0.34257984161376953, "rewards/margins": 0.03455526381731033, "rewards/rejected": -0.37713512778282166, "step": 456 }, { "epoch": 0.3418739480082289, "grad_norm": 56.80367352332619, "learning_rate": 3.831363760551263e-07, "logps/chosen": -95.91716003417969, "logps/rejected": -109.7703857421875, "loss": 0.59, "losses/dpo": 0.6157217621803284, "losses/sft": 1.0158534049987793, "losses/total": 0.6157217621803284, "ref_logps/chosen": -93.69451904296875, "ref_logps/rejected": -104.6258773803711, "rewards/accuracies": 0.625, "rewards/chosen": -0.22226452827453613, "rewards/margins": 0.29218626022338867, "rewards/rejected": -0.5144507884979248, "step": 457 }, { "epoch": 0.34262203104544603, "grad_norm": 124.5856709306697, "learning_rate": 3.8262265499754775e-07, "logps/chosen": -79.60511016845703, "logps/rejected": -82.61204528808594, "loss": 0.6783, "losses/dpo": 0.7586066722869873, "losses/sft": 0.7274318337440491, "losses/total": 0.7586066722869873, "ref_logps/chosen": -76.11509704589844, "ref_logps/rejected": -78.4538803100586, "rewards/accuracies": 0.5, "rewards/chosen": -0.3490024507045746, "rewards/margins": 0.06681393086910248, "rewards/rejected": -0.41581639647483826, "step": 458 }, { "epoch": 0.34337011408266316, "grad_norm": 50.530105625044456, "learning_rate": 3.8210815343133183e-07, "logps/chosen": -77.39887237548828, "logps/rejected": -96.75808715820312, "loss": 0.5402, "losses/dpo": 0.6288225650787354, "losses/sft": 0.9749691486358643, "losses/total": 0.6288225650787354, "ref_logps/chosen": -74.44740295410156, "ref_logps/rejected": -89.68289947509766, "rewards/accuracies": 0.75, "rewards/chosen": -0.2951467037200928, "rewards/margins": 0.41237252950668335, "rewards/rejected": -0.7075192332267761, "step": 459 }, { "epoch": 0.3441181971198803, "grad_norm": 62.506476928799444, "learning_rate": 3.8159287438441475e-07, "logps/chosen": -75.51727294921875, "logps/rejected": -77.00557708740234, "loss": 0.6384, "losses/dpo": 0.6503839492797852, "losses/sft": 0.3092862367630005, "losses/total": 0.6503839492797852, "ref_logps/chosen": -72.55814361572266, "ref_logps/rejected": -72.30506134033203, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2959137260913849, "rewards/margins": 0.1741379052400589, "rewards/rejected": -0.4700516164302826, "step": 460 }, { "epoch": 0.3448662801570974, "grad_norm": 90.02749283805099, "learning_rate": 3.810768208893079e-07, "logps/chosen": -85.18295288085938, "logps/rejected": -100.55577087402344, "loss": 0.6415, "losses/dpo": 0.7288193702697754, "losses/sft": 0.7021851539611816, "losses/total": 0.7288193702697754, "ref_logps/chosen": -82.52938842773438, "ref_logps/rejected": -96.18135070800781, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2653561234474182, "rewards/margins": 0.17208635807037354, "rewards/rejected": -0.43744248151779175, "step": 461 }, { "epoch": 0.34561436319431454, "grad_norm": 51.84451236359774, "learning_rate": 3.8055999598308094e-07, "logps/chosen": -82.33088684082031, "logps/rejected": -89.21057891845703, "loss": 0.5598, "losses/dpo": 0.6004656553268433, "losses/sft": 0.6818792819976807, "losses/total": 0.6004656553268433, "ref_logps/chosen": -79.78987121582031, "ref_logps/rejected": -83.14875793457031, "rewards/accuracies": 0.75, "rewards/chosen": -0.25410065054893494, "rewards/margins": 0.352081298828125, "rewards/rejected": -0.6061819791793823, "step": 462 }, { "epoch": 0.3463624462315317, "grad_norm": 59.52428983915998, "learning_rate": 3.80042402707343e-07, "logps/chosen": -100.10381317138672, "logps/rejected": -99.96148681640625, "loss": 0.6598, "losses/dpo": 0.8139052391052246, "losses/sft": 1.0459139347076416, "losses/total": 0.8139052391052246, "ref_logps/chosen": -96.11210632324219, "ref_logps/rejected": -94.53898620605469, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3991697430610657, "rewards/margins": 0.14307931065559387, "rewards/rejected": -0.5422490835189819, "step": 463 }, { "epoch": 0.34711052926874886, "grad_norm": 58.926254623580256, "learning_rate": 3.795240441082255e-07, "logps/chosen": -69.44772338867188, "logps/rejected": -80.36278533935547, "loss": 0.6114, "losses/dpo": 0.6272427439689636, "losses/sft": 0.35434386134147644, "losses/total": 0.6272427439689636, "ref_logps/chosen": -67.19667053222656, "ref_logps/rejected": -75.66275787353516, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2251054048538208, "rewards/margins": 0.24489745497703552, "rewards/rejected": -0.4700028896331787, "step": 464 }, { "epoch": 0.347858612305966, "grad_norm": 48.94351219769304, "learning_rate": 3.790049232363639e-07, "logps/chosen": -89.3467788696289, "logps/rejected": -106.08296203613281, "loss": 0.5219, "losses/dpo": 0.5602531433105469, "losses/sft": 1.173064947128296, "losses/total": 0.5602531433105469, "ref_logps/chosen": -87.4158706665039, "ref_logps/rejected": -99.81707763671875, "rewards/accuracies": 0.875, "rewards/chosen": -0.19309069216251373, "rewards/margins": 0.4334970712661743, "rewards/rejected": -0.6265877485275269, "step": 465 }, { "epoch": 0.3486066953431831, "grad_norm": 66.2037077917035, "learning_rate": 3.7848504314687944e-07, "logps/chosen": -85.39662170410156, "logps/rejected": -91.81763458251953, "loss": 0.6255, "losses/dpo": 0.789120614528656, "losses/sft": 1.2053353786468506, "losses/total": 0.789120614528656, "ref_logps/chosen": -81.74397277832031, "ref_logps/rejected": -86.08926391601562, "rewards/accuracies": 0.65625, "rewards/chosen": -0.36526477336883545, "rewards/margins": 0.2075720727443695, "rewards/rejected": -0.5728368759155273, "step": 466 }, { "epoch": 0.34935477838040024, "grad_norm": 50.58528832685531, "learning_rate": 3.779644068993621e-07, "logps/chosen": -65.6746826171875, "logps/rejected": -79.60144805908203, "loss": 0.6053, "losses/dpo": 0.7104791402816772, "losses/sft": 0.3954317271709442, "losses/total": 0.7104791402816772, "ref_logps/chosen": -63.181190490722656, "ref_logps/rejected": -74.6443099975586, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2493494749069214, "rewards/margins": 0.2463640719652176, "rewards/rejected": -0.495713472366333, "step": 467 }, { "epoch": 0.3501028614176174, "grad_norm": 64.45631256914241, "learning_rate": 3.7744301755785136e-07, "logps/chosen": -103.80078125, "logps/rejected": -108.73640441894531, "loss": 0.6777, "losses/dpo": 0.4021540880203247, "losses/sft": 0.19124622642993927, "losses/total": 0.4021540880203247, "ref_logps/chosen": -99.82046508789062, "ref_logps/rejected": -103.71883392333984, "rewards/accuracies": 0.59375, "rewards/chosen": -0.398030549287796, "rewards/margins": 0.10372571647167206, "rewards/rejected": -0.5017562508583069, "step": 468 }, { "epoch": 0.3508509444548345, "grad_norm": 48.56994830027659, "learning_rate": 3.769208781908192e-07, "logps/chosen": -83.70832824707031, "logps/rejected": -92.22117614746094, "loss": 0.5985, "losses/dpo": 0.8795135617256165, "losses/sft": 1.0818628072738647, "losses/total": 0.8795135617256165, "ref_logps/chosen": -81.02307891845703, "ref_logps/rejected": -86.75225830078125, "rewards/accuracies": 0.59375, "rewards/chosen": -0.26852506399154663, "rewards/margins": 0.2783676087856293, "rewards/rejected": -0.5468926429748535, "step": 469 }, { "epoch": 0.35159902749205163, "grad_norm": 94.27573618129765, "learning_rate": 3.763979918711514e-07, "logps/chosen": -113.58733367919922, "logps/rejected": -122.62984466552734, "loss": 0.6166, "losses/dpo": 0.4915837049484253, "losses/sft": 0.6423507928848267, "losses/total": 0.4915837049484253, "ref_logps/chosen": -110.58096313476562, "ref_logps/rejected": -117.236083984375, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3006364703178406, "rewards/margins": 0.23873981833457947, "rewards/rejected": -0.5393762588500977, "step": 470 }, { "epoch": 0.35234711052926876, "grad_norm": 62.996812845779715, "learning_rate": 3.7587436167612995e-07, "logps/chosen": -91.30633544921875, "logps/rejected": -94.43800354003906, "loss": 0.6023, "losses/dpo": 0.5618022084236145, "losses/sft": 1.1489226818084717, "losses/total": 0.5618022084236145, "ref_logps/chosen": -88.74603271484375, "ref_logps/rejected": -89.3563232421875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2560303807258606, "rewards/margins": 0.252137690782547, "rewards/rejected": -0.50816810131073, "step": 471 }, { "epoch": 0.3530951935664859, "grad_norm": 85.59505621814156, "learning_rate": 3.7534999068741456e-07, "logps/chosen": -83.7705307006836, "logps/rejected": -100.367919921875, "loss": 0.5742, "losses/dpo": 0.5463736653327942, "losses/sft": 0.6255823969841003, "losses/total": 0.5463736653327942, "ref_logps/chosen": -81.34942626953125, "ref_logps/rejected": -94.68207550048828, "rewards/accuracies": 0.8125, "rewards/chosen": -0.24211113154888153, "rewards/margins": 0.3264732360839844, "rewards/rejected": -0.5685843229293823, "step": 472 }, { "epoch": 0.353843276603703, "grad_norm": 63.61164727834134, "learning_rate": 3.7482488199102446e-07, "logps/chosen": -87.62928009033203, "logps/rejected": -92.08415985107422, "loss": 0.6065, "losses/dpo": 0.35730642080307007, "losses/sft": 1.1170786619186401, "losses/total": 0.35730642080307007, "ref_logps/chosen": -85.0799789428711, "ref_logps/rejected": -87.00650787353516, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2549295425415039, "rewards/margins": 0.25283563137054443, "rewards/rejected": -0.5077651143074036, "step": 473 }, { "epoch": 0.35459135964092015, "grad_norm": 69.95471493616371, "learning_rate": 3.742990386773207e-07, "logps/chosen": -77.73030090332031, "logps/rejected": -95.21199798583984, "loss": 0.6001, "losses/dpo": 0.4861801266670227, "losses/sft": 1.0502934455871582, "losses/total": 0.4861801266670227, "ref_logps/chosen": -76.02202606201172, "ref_logps/rejected": -90.99285888671875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17082807421684265, "rewards/margins": 0.251086562871933, "rewards/rejected": -0.421914666891098, "step": 474 }, { "epoch": 0.3553394426781373, "grad_norm": 59.31599805061532, "learning_rate": 3.737724638409876e-07, "logps/chosen": -92.96549987792969, "logps/rejected": -93.07750701904297, "loss": 0.6938, "losses/dpo": 0.7219908833503723, "losses/sft": 0.9679485559463501, "losses/total": 0.7219908833503723, "ref_logps/chosen": -89.99276733398438, "ref_logps/rejected": -89.64840698242188, "rewards/accuracies": 0.5625, "rewards/chosen": -0.29727286100387573, "rewards/margins": 0.04563693702220917, "rewards/rejected": -0.3429098427295685, "step": 475 }, { "epoch": 0.3560875257153544, "grad_norm": 68.19258007842704, "learning_rate": 3.732451605810145e-07, "logps/chosen": -102.75858306884766, "logps/rejected": -107.38939666748047, "loss": 0.6454, "losses/dpo": 0.6033260226249695, "losses/sft": 0.8392477035522461, "losses/total": 0.6033260226249695, "ref_logps/chosen": -100.2508773803711, "ref_logps/rejected": -103.41471862792969, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2507703900337219, "rewards/margins": 0.14669808745384216, "rewards/rejected": -0.3974684476852417, "step": 476 }, { "epoch": 0.35683560875257153, "grad_norm": 62.58047339977496, "learning_rate": 3.727171320006779e-07, "logps/chosen": -89.43415832519531, "logps/rejected": -94.81315612792969, "loss": 0.6821, "losses/dpo": 0.6052768230438232, "losses/sft": 0.5130563974380493, "losses/total": 0.6052768230438232, "ref_logps/chosen": -85.94160461425781, "ref_logps/rejected": -90.63307189941406, "rewards/accuracies": 0.46875, "rewards/chosen": -0.34925585985183716, "rewards/margins": 0.06875312328338623, "rewards/rejected": -0.4180089831352234, "step": 477 }, { "epoch": 0.35758369178978866, "grad_norm": 58.02661730588597, "learning_rate": 3.721883812075227e-07, "logps/chosen": -90.50814819335938, "logps/rejected": -106.12510681152344, "loss": 0.6258, "losses/dpo": 0.7365454435348511, "losses/sft": 0.5449880957603455, "losses/total": 0.7365454435348511, "ref_logps/chosen": -87.94220733642578, "ref_logps/rejected": -101.4798583984375, "rewards/accuracies": 0.625, "rewards/chosen": -0.25659462809562683, "rewards/margins": 0.20793044567108154, "rewards/rejected": -0.464525043964386, "step": 478 }, { "epoch": 0.3583317748270058, "grad_norm": 79.10120375111549, "learning_rate": 3.716589113133443e-07, "logps/chosen": -103.71589660644531, "logps/rejected": -99.13236236572266, "loss": 0.6793, "losses/dpo": 0.7117807269096375, "losses/sft": 0.9677572846412659, "losses/total": 0.7117807269096375, "ref_logps/chosen": -99.99458312988281, "ref_logps/rejected": -94.27881622314453, "rewards/accuracies": 0.53125, "rewards/chosen": -0.37213242053985596, "rewards/margins": 0.11322171986103058, "rewards/rejected": -0.4853541851043701, "step": 479 }, { "epoch": 0.3590798578642229, "grad_norm": 66.4079926385548, "learning_rate": 3.7112872543416997e-07, "logps/chosen": -77.66653442382812, "logps/rejected": -79.48382568359375, "loss": 0.6758, "losses/dpo": 0.44012030959129333, "losses/sft": 0.6699324250221252, "losses/total": 0.44012030959129333, "ref_logps/chosen": -74.20112609863281, "ref_logps/rejected": -74.75756072998047, "rewards/accuracies": 0.625, "rewards/chosen": -0.3465409576892853, "rewards/margins": 0.1260850727558136, "rewards/rejected": -0.4726260304450989, "step": 480 }, { "epoch": 0.35982794090144005, "grad_norm": 58.43617012020261, "learning_rate": 3.705978266902409e-07, "logps/chosen": -89.28042602539062, "logps/rejected": -97.10018157958984, "loss": 0.5491, "losses/dpo": 0.6274359822273254, "losses/sft": 1.0351243019104004, "losses/total": 0.6274359822273254, "ref_logps/chosen": -86.90560150146484, "ref_logps/rejected": -90.38480377197266, "rewards/accuracies": 0.6875, "rewards/chosen": -0.23748309910297394, "rewards/margins": 0.4340546131134033, "rewards/rejected": -0.6715376973152161, "step": 481 }, { "epoch": 0.3605760239386572, "grad_norm": 80.59071034929828, "learning_rate": 3.7006621820599357e-07, "logps/chosen": -96.16352844238281, "logps/rejected": -106.44424438476562, "loss": 0.6364, "losses/dpo": 0.5953168869018555, "losses/sft": 0.9837155342102051, "losses/total": 0.5953168869018555, "ref_logps/chosen": -93.17951965332031, "ref_logps/rejected": -101.77664184570312, "rewards/accuracies": 0.65625, "rewards/chosen": -0.29840031266212463, "rewards/margins": 0.16836047172546387, "rewards/rejected": -0.4667607843875885, "step": 482 }, { "epoch": 0.3613241069758743, "grad_norm": 74.27373882957637, "learning_rate": 3.6953390311004125e-07, "logps/chosen": -111.01522064208984, "logps/rejected": -104.23545837402344, "loss": 0.6404, "losses/dpo": 0.7659198045730591, "losses/sft": 0.7670500874519348, "losses/total": 0.7659198045730591, "ref_logps/chosen": -107.23226928710938, "ref_logps/rejected": -98.61796569824219, "rewards/accuracies": 0.59375, "rewards/chosen": -0.37829530239105225, "rewards/margins": 0.1834537535905838, "rewards/rejected": -0.5617491006851196, "step": 483 }, { "epoch": 0.36207219001309143, "grad_norm": 93.79208908313568, "learning_rate": 3.6900088453515596e-07, "logps/chosen": -102.17916870117188, "logps/rejected": -101.65907287597656, "loss": 0.7005, "losses/dpo": 0.6308658123016357, "losses/sft": 0.5679488182067871, "losses/total": 0.6308658123016357, "ref_logps/chosen": -98.22813415527344, "ref_logps/rejected": -96.8544921875, "rewards/accuracies": 0.53125, "rewards/chosen": -0.3951035141944885, "rewards/margins": 0.08535487949848175, "rewards/rejected": -0.48045843839645386, "step": 484 }, { "epoch": 0.36282027305030856, "grad_norm": 69.91972336511422, "learning_rate": 3.6846716561824967e-07, "logps/chosen": -88.28865051269531, "logps/rejected": -97.94123840332031, "loss": 0.6616, "losses/dpo": 0.5589333772659302, "losses/sft": 0.6026472449302673, "losses/total": 0.5589333772659302, "ref_logps/chosen": -83.84707641601562, "ref_logps/rejected": -91.9183349609375, "rewards/accuracies": 0.5625, "rewards/chosen": -0.44415658712387085, "rewards/margins": 0.15813417732715607, "rewards/rejected": -0.6022907495498657, "step": 485 }, { "epoch": 0.3635683560875257, "grad_norm": 56.30946164079247, "learning_rate": 3.679327495003561e-07, "logps/chosen": -104.57087707519531, "logps/rejected": -121.51707458496094, "loss": 0.5977, "losses/dpo": 0.5934650897979736, "losses/sft": 1.5577067136764526, "losses/total": 0.5934650897979736, "ref_logps/chosen": -102.20411682128906, "ref_logps/rejected": -115.95957946777344, "rewards/accuracies": 0.625, "rewards/chosen": -0.2366756945848465, "rewards/margins": 0.31907394528388977, "rewards/rejected": -0.5557496547698975, "step": 486 }, { "epoch": 0.3643164391247428, "grad_norm": 63.696403427635445, "learning_rate": 3.67397639326612e-07, "logps/chosen": -83.48893737792969, "logps/rejected": -88.74971008300781, "loss": 0.6336, "losses/dpo": 0.5997482538223267, "losses/sft": 0.8705397844314575, "losses/total": 0.5997482538223267, "ref_logps/chosen": -80.57660675048828, "ref_logps/rejected": -84.09112548828125, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2912323474884033, "rewards/margins": 0.17462578415870667, "rewards/rejected": -0.4658581018447876, "step": 487 }, { "epoch": 0.36506452216196, "grad_norm": 67.17095282188482, "learning_rate": 3.6686183824623887e-07, "logps/chosen": -100.67367553710938, "logps/rejected": -114.22627258300781, "loss": 0.6232, "losses/dpo": 0.7044239044189453, "losses/sft": 1.1110897064208984, "losses/total": 0.7044239044189453, "ref_logps/chosen": -97.42585754394531, "ref_logps/rejected": -108.87762451171875, "rewards/accuracies": 0.65625, "rewards/chosen": -0.324782133102417, "rewards/margins": 0.21008339524269104, "rewards/rejected": -0.5348654985427856, "step": 488 }, { "epoch": 0.36581260519917713, "grad_norm": 77.2105845908692, "learning_rate": 3.6632534941252436e-07, "logps/chosen": -105.77287292480469, "logps/rejected": -116.0762939453125, "loss": 0.6588, "losses/dpo": 0.6067355275154114, "losses/sft": 1.2944308519363403, "losses/total": 0.6067355275154114, "ref_logps/chosen": -103.0094223022461, "ref_logps/rejected": -111.68614196777344, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2763444781303406, "rewards/margins": 0.1626703441143036, "rewards/rejected": -0.4390147924423218, "step": 489 }, { "epoch": 0.36656068823639426, "grad_norm": 74.3964674413515, "learning_rate": 3.6578817598280353e-07, "logps/chosen": -97.46862030029297, "logps/rejected": -110.53195190429688, "loss": 0.6457, "losses/dpo": 0.6782180666923523, "losses/sft": 0.622215747833252, "losses/total": 0.6782180666923523, "ref_logps/chosen": -94.48513793945312, "ref_logps/rejected": -105.95850372314453, "rewards/accuracies": 0.625, "rewards/chosen": -0.2983492314815521, "rewards/margins": 0.1589956283569336, "rewards/rejected": -0.4573448598384857, "step": 490 }, { "epoch": 0.3673087712736114, "grad_norm": 57.52541148052239, "learning_rate": 3.6525032111844053e-07, "logps/chosen": -75.58665466308594, "logps/rejected": -81.41671752929688, "loss": 0.6695, "losses/dpo": 0.6985927224159241, "losses/sft": 0.6491951942443848, "losses/total": 0.6985927224159241, "ref_logps/chosen": -71.99436950683594, "ref_logps/rejected": -76.842041015625, "rewards/accuracies": 0.625, "rewards/chosen": -0.3592279553413391, "rewards/margins": 0.098239004611969, "rewards/rejected": -0.4574669599533081, "step": 491 }, { "epoch": 0.3680568543108285, "grad_norm": 89.96773675558025, "learning_rate": 3.647117879848098e-07, "logps/chosen": -92.61830139160156, "logps/rejected": -97.61871337890625, "loss": 0.6816, "losses/dpo": 0.5646923780441284, "losses/sft": 0.9582585096359253, "losses/total": 0.5646923780441284, "ref_logps/chosen": -88.44613647460938, "ref_logps/rejected": -92.05915069580078, "rewards/accuracies": 0.5, "rewards/chosen": -0.4172162115573883, "rewards/margins": 0.13874047994613647, "rewards/rejected": -0.5559566617012024, "step": 492 }, { "epoch": 0.36880493734804565, "grad_norm": 66.83638541307302, "learning_rate": 3.6417257975127756e-07, "logps/chosen": -95.16051483154297, "logps/rejected": -107.23004150390625, "loss": 0.6001, "losses/dpo": 0.4767833948135376, "losses/sft": 1.0820449590682983, "losses/total": 0.4767833948135376, "ref_logps/chosen": -92.16758728027344, "ref_logps/rejected": -101.32286071777344, "rewards/accuracies": 0.625, "rewards/chosen": -0.2992931306362152, "rewards/margins": 0.291424959897995, "rewards/rejected": -0.5907180905342102, "step": 493 }, { "epoch": 0.3695530203852628, "grad_norm": 99.71629337642813, "learning_rate": 3.6363269959118313e-07, "logps/chosen": -113.97933197021484, "logps/rejected": -114.32070922851562, "loss": 0.6973, "losses/dpo": 0.6015608906745911, "losses/sft": 1.3181772232055664, "losses/total": 0.6015608906745911, "ref_logps/chosen": -110.61275482177734, "ref_logps/rejected": -110.1690902709961, "rewards/accuracies": 0.625, "rewards/chosen": -0.3366580307483673, "rewards/margins": 0.07850445061922073, "rewards/rejected": -0.41516244411468506, "step": 494 }, { "epoch": 0.3703011034224799, "grad_norm": 71.81558401525669, "learning_rate": 3.630921506818203e-07, "logps/chosen": -99.5779800415039, "logps/rejected": -101.12135314941406, "loss": 0.6067, "losses/dpo": 0.5507004261016846, "losses/sft": 0.7061862349510193, "losses/total": 0.5507004261016846, "ref_logps/chosen": -97.52459716796875, "ref_logps/rejected": -96.85233306884766, "rewards/accuracies": 0.75, "rewards/chosen": -0.2053391933441162, "rewards/margins": 0.22156298160552979, "rewards/rejected": -0.426902174949646, "step": 495 }, { "epoch": 0.37104918645969703, "grad_norm": 91.92340876136015, "learning_rate": 3.625509362044183e-07, "logps/chosen": -81.48184204101562, "logps/rejected": -88.7281265258789, "loss": 0.5826, "losses/dpo": 0.6188513040542603, "losses/sft": 0.4833926558494568, "losses/total": 0.6188513040542603, "ref_logps/chosen": -79.05908203125, "ref_logps/rejected": -83.47209930419922, "rewards/accuracies": 0.78125, "rewards/chosen": -0.24227532744407654, "rewards/margins": 0.28332728147506714, "rewards/rejected": -0.5256025791168213, "step": 496 }, { "epoch": 0.37179726949691416, "grad_norm": 57.65251259204285, "learning_rate": 3.6200905934412373e-07, "logps/chosen": -105.7718505859375, "logps/rejected": -128.13916015625, "loss": 0.5575, "losses/dpo": 0.3958369493484497, "losses/sft": 1.3600988388061523, "losses/total": 0.3958369493484497, "ref_logps/chosen": -102.94503021240234, "ref_logps/rejected": -121.52565002441406, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2826817035675049, "rewards/margins": 0.37866851687431335, "rewards/rejected": -0.6613502502441406, "step": 497 }, { "epoch": 0.3725453525341313, "grad_norm": 60.572820061704746, "learning_rate": 3.6146652328998115e-07, "logps/chosen": -86.0465316772461, "logps/rejected": -89.35520935058594, "loss": 0.6211, "losses/dpo": 0.6312898397445679, "losses/sft": 0.2970695197582245, "losses/total": 0.6312898397445679, "ref_logps/chosen": -83.36029815673828, "ref_logps/rejected": -84.43804931640625, "rewards/accuracies": 0.75, "rewards/chosen": -0.2686236500740051, "rewards/margins": 0.22309203445911407, "rewards/rejected": -0.491715669631958, "step": 498 }, { "epoch": 0.3732934355713484, "grad_norm": 56.98283567825864, "learning_rate": 3.609233312349148e-07, "logps/chosen": -86.68248748779297, "logps/rejected": -111.05752563476562, "loss": 0.5995, "losses/dpo": 0.4464918375015259, "losses/sft": 0.6396567225456238, "losses/total": 0.4464918375015259, "ref_logps/chosen": -84.15753173828125, "ref_logps/rejected": -105.9564437866211, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2524960935115814, "rewards/margins": 0.25761187076568604, "rewards/rejected": -0.5101079344749451, "step": 499 }, { "epoch": 0.37404151860856555, "grad_norm": 72.02704869924722, "learning_rate": 3.603794863757094e-07, "logps/chosen": -88.96958923339844, "logps/rejected": -92.83810424804688, "loss": 0.6738, "losses/dpo": 0.7756743431091309, "losses/sft": 1.642956256866455, "losses/total": 0.7756743431091309, "ref_logps/chosen": -86.12216186523438, "ref_logps/rejected": -89.12744903564453, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2847422957420349, "rewards/margins": 0.08632314205169678, "rewards/rejected": -0.3710654377937317, "step": 500 }, { "epoch": 0.3747896016457827, "grad_norm": 68.80113129073807, "learning_rate": 3.598349919129917e-07, "logps/chosen": -92.23851013183594, "logps/rejected": -107.35994720458984, "loss": 0.6228, "losses/dpo": 0.7407935857772827, "losses/sft": 0.5513738393783569, "losses/total": 0.7407935857772827, "ref_logps/chosen": -89.11861419677734, "ref_logps/rejected": -101.91899871826172, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3119903802871704, "rewards/margins": 0.2321043759584427, "rewards/rejected": -0.5440947413444519, "step": 501 }, { "epoch": 0.3755376846829998, "grad_norm": 62.41616653866761, "learning_rate": 3.592898510512113e-07, "logps/chosen": -92.89141082763672, "logps/rejected": -107.72120666503906, "loss": 0.6103, "losses/dpo": 0.6522347927093506, "losses/sft": 0.8331180810928345, "losses/total": 0.6522347927093506, "ref_logps/chosen": -90.15361785888672, "ref_logps/rejected": -102.77294921875, "rewards/accuracies": 0.625, "rewards/chosen": -0.27377834916114807, "rewards/margins": 0.2210468053817749, "rewards/rejected": -0.49482518434524536, "step": 502 }, { "epoch": 0.37628576772021693, "grad_norm": 58.235892081349796, "learning_rate": 3.587440669986224e-07, "logps/chosen": -107.63162231445312, "logps/rejected": -107.09001922607422, "loss": 0.6428, "losses/dpo": 0.5043058395385742, "losses/sft": 1.2684407234191895, "losses/total": 0.5043058395385742, "ref_logps/chosen": -104.14076232910156, "ref_logps/rejected": -102.08076477050781, "rewards/accuracies": 0.625, "rewards/chosen": -0.34908559918403625, "rewards/margins": 0.15183879435062408, "rewards/rejected": -0.5009243488311768, "step": 503 }, { "epoch": 0.37703385075743406, "grad_norm": 56.83561521668184, "learning_rate": 3.581976429672639e-07, "logps/chosen": -94.71292877197266, "logps/rejected": -89.58069610595703, "loss": 0.7178, "losses/dpo": 0.6541300415992737, "losses/sft": 1.6750656366348267, "losses/total": 0.6541300415992737, "ref_logps/chosen": -91.5320053100586, "ref_logps/rejected": -86.22120666503906, "rewards/accuracies": 0.5, "rewards/chosen": -0.31809258460998535, "rewards/margins": 0.017856501042842865, "rewards/rejected": -0.3359490633010864, "step": 504 }, { "epoch": 0.3777819337946512, "grad_norm": 81.80792806394874, "learning_rate": 3.576505821729416e-07, "logps/chosen": -93.62876892089844, "logps/rejected": -108.25489807128906, "loss": 0.5418, "losses/dpo": 0.5640100240707397, "losses/sft": 0.9062073826789856, "losses/total": 0.5640100240707397, "ref_logps/chosen": -91.65287780761719, "ref_logps/rejected": -102.290771484375, "rewards/accuracies": 0.75, "rewards/chosen": -0.19758978486061096, "rewards/margins": 0.3988226652145386, "rewards/rejected": -0.5964124202728271, "step": 505 }, { "epoch": 0.3785300168318683, "grad_norm": 66.74174889991176, "learning_rate": 3.5710288783520835e-07, "logps/chosen": -88.50070190429688, "logps/rejected": -102.47048950195312, "loss": 0.6406, "losses/dpo": 0.8640881776809692, "losses/sft": 1.1905866861343384, "losses/total": 0.8640881776809692, "ref_logps/chosen": -85.97560119628906, "ref_logps/rejected": -98.04273986816406, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2525096833705902, "rewards/margins": 0.19026589393615723, "rewards/rejected": -0.44277554750442505, "step": 506 }, { "epoch": 0.37927809986908545, "grad_norm": 55.495065452536394, "learning_rate": 3.56554563177346e-07, "logps/chosen": -75.65396118164062, "logps/rejected": -83.85263061523438, "loss": 0.6154, "losses/dpo": 0.7450136542320251, "losses/sft": 0.40612176060676575, "losses/total": 0.7450136542320251, "ref_logps/chosen": -72.53784942626953, "ref_logps/rejected": -78.2455062866211, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3116113543510437, "rewards/margins": 0.24910219013690948, "rewards/rejected": -0.560713529586792, "step": 507 }, { "epoch": 0.3800261829063026, "grad_norm": 87.9766830417765, "learning_rate": 3.560056114263455e-07, "logps/chosen": -103.10520935058594, "logps/rejected": -105.3595962524414, "loss": 0.7405, "losses/dpo": 0.9892740249633789, "losses/sft": 0.7787941098213196, "losses/total": 0.9892740249633789, "ref_logps/chosen": -98.64010620117188, "ref_logps/rejected": -100.91387176513672, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4465096592903137, "rewards/margins": -0.0019371584057807922, "rewards/rejected": -0.4445725083351135, "step": 508 }, { "epoch": 0.3807742659435197, "grad_norm": 63.98116454476197, "learning_rate": 3.554560358128886e-07, "logps/chosen": -104.62046813964844, "logps/rejected": -127.22407531738281, "loss": 0.6035, "losses/dpo": 0.6028349995613098, "losses/sft": 1.5269896984100342, "losses/total": 0.6028349995613098, "ref_logps/chosen": -101.95005798339844, "ref_logps/rejected": -121.93203735351562, "rewards/accuracies": 0.71875, "rewards/chosen": -0.26704102754592896, "rewards/margins": 0.26216351985931396, "rewards/rejected": -0.5292045474052429, "step": 509 }, { "epoch": 0.38152234898073684, "grad_norm": 93.69905879180024, "learning_rate": 3.549058395713285e-07, "logps/chosen": -107.06275939941406, "logps/rejected": -117.33446502685547, "loss": 0.6574, "losses/dpo": 0.5617856979370117, "losses/sft": 1.31675124168396, "losses/total": 0.5617856979370117, "ref_logps/chosen": -102.84420776367188, "ref_logps/rejected": -111.41387939453125, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4218553304672241, "rewards/margins": 0.1702028065919876, "rewards/rejected": -0.5920581817626953, "step": 510 }, { "epoch": 0.382270432017954, "grad_norm": 53.49682046230229, "learning_rate": 3.54355025939671e-07, "logps/chosen": -83.67735290527344, "logps/rejected": -98.4323959350586, "loss": 0.637, "losses/dpo": 0.5505675077438354, "losses/sft": 0.6961530447006226, "losses/total": 0.5505675077438354, "ref_logps/chosen": -79.99559783935547, "ref_logps/rejected": -92.9800033569336, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3681759834289551, "rewards/margins": 0.17706209421157837, "rewards/rejected": -0.5452380776405334, "step": 511 }, { "epoch": 0.38301851505517115, "grad_norm": 68.29065968777897, "learning_rate": 3.5380359815955526e-07, "logps/chosen": -108.1059341430664, "logps/rejected": -114.28530883789062, "loss": 0.6613, "losses/dpo": 0.7203066349029541, "losses/sft": 0.8195047378540039, "losses/total": 0.7203066349029541, "ref_logps/chosen": -104.4546890258789, "ref_logps/rejected": -108.85444641113281, "rewards/accuracies": 0.6875, "rewards/chosen": -0.36512434482574463, "rewards/margins": 0.17796197533607483, "rewards/rejected": -0.5430862903594971, "step": 512 }, { "epoch": 0.3837665980923883, "grad_norm": 68.69363821577797, "learning_rate": 3.5325155947623475e-07, "logps/chosen": -91.43203735351562, "logps/rejected": -104.4222412109375, "loss": 0.601, "losses/dpo": 0.6240744590759277, "losses/sft": 0.2309247851371765, "losses/total": 0.6240744590759277, "ref_logps/chosen": -87.8249740600586, "ref_logps/rejected": -97.88746643066406, "rewards/accuracies": 0.65625, "rewards/chosen": -0.36070504784584045, "rewards/margins": 0.2927727699279785, "rewards/rejected": -0.6534779071807861, "step": 513 }, { "epoch": 0.3845146811296054, "grad_norm": 73.17342364480578, "learning_rate": 3.526989131385586e-07, "logps/chosen": -95.7757797241211, "logps/rejected": -95.12107849121094, "loss": 0.7075, "losses/dpo": 0.7475035786628723, "losses/sft": 1.036118984222412, "losses/total": 0.7475035786628723, "ref_logps/chosen": -91.52781677246094, "ref_logps/rejected": -90.68326568603516, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4247950315475464, "rewards/margins": 0.018986139446496964, "rewards/rejected": -0.44378119707107544, "step": 514 }, { "epoch": 0.38526276416682254, "grad_norm": 78.06323173430441, "learning_rate": 3.521456623989515e-07, "logps/chosen": -86.89875030517578, "logps/rejected": -92.2366943359375, "loss": 0.6417, "losses/dpo": 0.63551265001297, "losses/sft": 0.3895583152770996, "losses/total": 0.63551265001297, "ref_logps/chosen": -82.08674621582031, "ref_logps/rejected": -85.7799072265625, "rewards/accuracies": 0.59375, "rewards/chosen": -0.48120057582855225, "rewards/margins": 0.16447722911834717, "rewards/rejected": -0.6456778049468994, "step": 515 }, { "epoch": 0.38601084720403966, "grad_norm": 93.38636400146862, "learning_rate": 3.515918105133957e-07, "logps/chosen": -115.03431701660156, "logps/rejected": -114.19931030273438, "loss": 0.6515, "losses/dpo": 0.7637505531311035, "losses/sft": 1.4620559215545654, "losses/total": 0.7637505531311035, "ref_logps/chosen": -110.83012390136719, "ref_logps/rejected": -108.55613708496094, "rewards/accuracies": 0.5, "rewards/chosen": -0.42041975259780884, "rewards/margins": 0.14389878511428833, "rewards/rejected": -0.5643185377120972, "step": 516 }, { "epoch": 0.3867589302412568, "grad_norm": 72.74952968956639, "learning_rate": 3.51037360741411e-07, "logps/chosen": -72.86038208007812, "logps/rejected": -82.91361236572266, "loss": 0.6661, "losses/dpo": 0.7210302352905273, "losses/sft": 0.2418709099292755, "losses/total": 0.7210302352905273, "ref_logps/chosen": -68.90402221679688, "ref_logps/rejected": -77.32935333251953, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3956355154514313, "rewards/margins": 0.1627901941537857, "rewards/rejected": -0.5584257245063782, "step": 517 }, { "epoch": 0.3875070132784739, "grad_norm": 111.20555114886686, "learning_rate": 3.5048231634603597e-07, "logps/chosen": -84.8773193359375, "logps/rejected": -93.94743347167969, "loss": 0.5758, "losses/dpo": 0.737694263458252, "losses/sft": 1.1148535013198853, "losses/total": 0.737694263458252, "ref_logps/chosen": -81.95550537109375, "ref_logps/rejected": -87.702880859375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2921810746192932, "rewards/margins": 0.3322726786136627, "rewards/rejected": -0.6244537830352783, "step": 518 }, { "epoch": 0.38825509631569105, "grad_norm": 93.04906223650651, "learning_rate": 3.499266805938086e-07, "logps/chosen": -94.6605453491211, "logps/rejected": -107.85733032226562, "loss": 0.5519, "losses/dpo": 0.5351649522781372, "losses/sft": 0.3558017909526825, "losses/total": 0.5351649522781372, "ref_logps/chosen": -93.43859100341797, "ref_logps/rejected": -102.48019409179688, "rewards/accuracies": 0.8125, "rewards/chosen": -0.12219510972499847, "rewards/margins": 0.4155177175998688, "rewards/rejected": -0.537712812423706, "step": 519 }, { "epoch": 0.3890031793529082, "grad_norm": 59.014791614565645, "learning_rate": 3.493704567547472e-07, "logps/chosen": -78.48353576660156, "logps/rejected": -90.63548278808594, "loss": 0.6339, "losses/dpo": 0.7476864457130432, "losses/sft": 0.8500849008560181, "losses/total": 0.7476864457130432, "ref_logps/chosen": -74.6740951538086, "ref_logps/rejected": -84.66331481933594, "rewards/accuracies": 0.59375, "rewards/chosen": -0.38094422221183777, "rewards/margins": 0.21627211570739746, "rewards/rejected": -0.5972163677215576, "step": 520 }, { "epoch": 0.3897512623901253, "grad_norm": 77.57076486821951, "learning_rate": 3.4881364810233096e-07, "logps/chosen": -113.48897552490234, "logps/rejected": -112.85272979736328, "loss": 0.6216, "losses/dpo": 0.4176411032676697, "losses/sft": 0.6320239305496216, "losses/total": 0.4176411032676697, "ref_logps/chosen": -110.06195068359375, "ref_logps/rejected": -107.20074462890625, "rewards/accuracies": 0.625, "rewards/chosen": -0.342702716588974, "rewards/margins": 0.22249627113342285, "rewards/rejected": -0.5651990175247192, "step": 521 }, { "epoch": 0.39049934542734244, "grad_norm": 105.83525205860334, "learning_rate": 3.482562579134809e-07, "logps/chosen": -92.09410095214844, "logps/rejected": -97.76458740234375, "loss": 0.6743, "losses/dpo": 0.5999992489814758, "losses/sft": 0.9807921051979065, "losses/total": 0.5999992489814758, "ref_logps/chosen": -88.23837280273438, "ref_logps/rejected": -92.83773803710938, "rewards/accuracies": 0.53125, "rewards/chosen": -0.3855735659599304, "rewards/margins": 0.10711109638214111, "rewards/rejected": -0.49268466234207153, "step": 522 }, { "epoch": 0.39124742846455957, "grad_norm": 53.1034044959575, "learning_rate": 3.476982894685405e-07, "logps/chosen": -60.92084503173828, "logps/rejected": -68.71604919433594, "loss": 0.6674, "losses/dpo": 0.8303675651550293, "losses/sft": 0.2863978147506714, "losses/total": 0.8303675651550293, "ref_logps/chosen": -57.35423278808594, "ref_logps/rejected": -64.12004089355469, "rewards/accuracies": 0.5625, "rewards/chosen": -0.356661319732666, "rewards/margins": 0.10293994843959808, "rewards/rejected": -0.4596012532711029, "step": 523 }, { "epoch": 0.3919955115017767, "grad_norm": 63.68019014136044, "learning_rate": 3.471397460512563e-07, "logps/chosen": -67.70696258544922, "logps/rejected": -74.84326171875, "loss": 0.6177, "losses/dpo": 0.7199791669845581, "losses/sft": 0.630373477935791, "losses/total": 0.7199791669845581, "ref_logps/chosen": -64.40567779541016, "ref_logps/rejected": -69.50814819335938, "rewards/accuracies": 0.65625, "rewards/chosen": -0.33012843132019043, "rewards/margins": 0.20338377356529236, "rewards/rejected": -0.5335121750831604, "step": 524 }, { "epoch": 0.3927435945389938, "grad_norm": 81.87327452726943, "learning_rate": 3.465806309487588e-07, "logps/chosen": -99.93244934082031, "logps/rejected": -95.87660217285156, "loss": 0.7519, "losses/dpo": 0.7244669198989868, "losses/sft": 0.8510613441467285, "losses/total": 0.7244669198989868, "ref_logps/chosen": -95.64258575439453, "ref_logps/rejected": -91.98545837402344, "rewards/accuracies": 0.5625, "rewards/chosen": -0.42898690700531006, "rewards/margins": -0.03987149894237518, "rewards/rejected": -0.3891153931617737, "step": 525 }, { "epoch": 0.39349167757621095, "grad_norm": 74.06338448941952, "learning_rate": 3.4602094745154275e-07, "logps/chosen": -78.84849548339844, "logps/rejected": -88.3840560913086, "loss": 0.5579, "losses/dpo": 0.2942348122596741, "losses/sft": 0.23167628049850464, "losses/total": 0.2942348122596741, "ref_logps/chosen": -75.59613800048828, "ref_logps/rejected": -81.04273223876953, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3252354860305786, "rewards/margins": 0.40889641642570496, "rewards/rejected": -0.734131932258606, "step": 526 }, { "epoch": 0.3942397606134281, "grad_norm": 69.18290309228493, "learning_rate": 3.454606988534482e-07, "logps/chosen": -91.56443786621094, "logps/rejected": -113.8514404296875, "loss": 0.5872, "losses/dpo": 0.3813515305519104, "losses/sft": 1.3907618522644043, "losses/total": 0.3813515305519104, "ref_logps/chosen": -88.92665100097656, "ref_logps/rejected": -108.0140609741211, "rewards/accuracies": 0.625, "rewards/chosen": -0.26377835869789124, "rewards/margins": 0.3199589252471924, "rewards/rejected": -0.583737313747406, "step": 527 }, { "epoch": 0.3949878436506452, "grad_norm": 81.91448264898807, "learning_rate": 3.4489988845164087e-07, "logps/chosen": -85.55101776123047, "logps/rejected": -89.02488708496094, "loss": 0.6938, "losses/dpo": 0.6395630240440369, "losses/sft": 1.0777130126953125, "losses/total": 0.6395630240440369, "ref_logps/chosen": -82.3196029663086, "ref_logps/rejected": -84.34416961669922, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3231416940689087, "rewards/margins": 0.14493007957935333, "rewards/rejected": -0.46807175874710083, "step": 528 }, { "epoch": 0.39573592668786234, "grad_norm": 71.97560130083797, "learning_rate": 3.4433851954659264e-07, "logps/chosen": -90.76234436035156, "logps/rejected": -93.51187896728516, "loss": 0.6068, "losses/dpo": 0.5862928628921509, "losses/sft": 0.648563027381897, "losses/total": 0.5862928628921509, "ref_logps/chosen": -87.58815002441406, "ref_logps/rejected": -87.51338195800781, "rewards/accuracies": 0.625, "rewards/chosen": -0.3174194097518921, "rewards/margins": 0.2824299931526184, "rewards/rejected": -0.5998494625091553, "step": 529 }, { "epoch": 0.39648400972507947, "grad_norm": 72.98195955676889, "learning_rate": 3.437765954420627e-07, "logps/chosen": -94.31344604492188, "logps/rejected": -100.07025146484375, "loss": 0.6876, "losses/dpo": 0.4359629154205322, "losses/sft": 0.8687266111373901, "losses/total": 0.4359629154205322, "ref_logps/chosen": -89.98812866210938, "ref_logps/rejected": -95.09148406982422, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4325307011604309, "rewards/margins": 0.06534530967473984, "rewards/rejected": -0.49787604808807373, "step": 530 }, { "epoch": 0.3972320927622966, "grad_norm": 65.42925762160554, "learning_rate": 3.4321411944507714e-07, "logps/chosen": -112.68701171875, "logps/rejected": -118.56505584716797, "loss": 0.6762, "losses/dpo": 0.7439157366752625, "losses/sft": 1.1327104568481445, "losses/total": 0.7439157366752625, "ref_logps/chosen": -107.73435974121094, "ref_logps/rejected": -111.941650390625, "rewards/accuracies": 0.5, "rewards/chosen": -0.4952648878097534, "rewards/margins": 0.1670762151479721, "rewards/rejected": -0.6623411178588867, "step": 531 }, { "epoch": 0.3979801757995137, "grad_norm": 83.25635664673673, "learning_rate": 3.4265109486591047e-07, "logps/chosen": -106.44083404541016, "logps/rejected": -109.14124298095703, "loss": 0.75, "losses/dpo": 1.0142651796340942, "losses/sft": 1.1016578674316406, "losses/total": 1.0142651796340942, "ref_logps/chosen": -100.38831329345703, "ref_logps/rejected": -103.36605834960938, "rewards/accuracies": 0.46875, "rewards/chosen": -0.6052523255348206, "rewards/margins": -0.02773328870534897, "rewards/rejected": -0.5775189995765686, "step": 532 }, { "epoch": 0.39872825883673085, "grad_norm": 99.77003534018732, "learning_rate": 3.420875250180656e-07, "logps/chosen": -100.97187042236328, "logps/rejected": -98.57984924316406, "loss": 0.6647, "losses/dpo": 0.8182965517044067, "losses/sft": 1.6899068355560303, "losses/total": 0.8182965517044067, "ref_logps/chosen": -96.15628051757812, "ref_logps/rejected": -92.35212707519531, "rewards/accuracies": 0.5625, "rewards/chosen": -0.481559157371521, "rewards/margins": 0.14121367037296295, "rewards/rejected": -0.6227728128433228, "step": 533 }, { "epoch": 0.399476341873948, "grad_norm": 60.73090048965488, "learning_rate": 3.4152341321825423e-07, "logps/chosen": -87.34199523925781, "logps/rejected": -93.20065307617188, "loss": 0.6186, "losses/dpo": 0.7929102182388306, "losses/sft": 0.7664826512336731, "losses/total": 0.7929102182388306, "ref_logps/chosen": -84.55140686035156, "ref_logps/rejected": -88.03488159179688, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2790587842464447, "rewards/margins": 0.23751848936080933, "rewards/rejected": -0.5165772438049316, "step": 534 }, { "epoch": 0.40022442491116517, "grad_norm": 64.84729467791757, "learning_rate": 3.409587627863779e-07, "logps/chosen": -107.0971908569336, "logps/rejected": -110.19012451171875, "loss": 0.6496, "losses/dpo": 0.7651079893112183, "losses/sft": 1.30637526512146, "losses/total": 0.7651079893112183, "ref_logps/chosen": -103.86644744873047, "ref_logps/rejected": -104.882568359375, "rewards/accuracies": 0.5625, "rewards/chosen": -0.32307517528533936, "rewards/margins": 0.20767998695373535, "rewards/rejected": -0.5307551622390747, "step": 535 }, { "epoch": 0.4009725079483823, "grad_norm": 83.24975260563197, "learning_rate": 3.403935770455077e-07, "logps/chosen": -95.24365234375, "logps/rejected": -100.7398681640625, "loss": 0.6886, "losses/dpo": 0.6735537648200989, "losses/sft": 1.089953064918518, "losses/total": 0.6735537648200989, "ref_logps/chosen": -91.4373779296875, "ref_logps/rejected": -96.3736572265625, "rewards/accuracies": 0.5, "rewards/chosen": -0.380628764629364, "rewards/margins": 0.0559929683804512, "rewards/rejected": -0.4366217851638794, "step": 536 }, { "epoch": 0.4009725079483823, "eval_logps/chosen": -38.94425964355469, "eval_logps/rejected": -44.36123275756836, "eval_loss": 0.6313894391059875, "eval_losses/dpo": 0.6363487839698792, "eval_losses/sft": 0.3253152668476105, "eval_losses/total": 0.6363487839698792, "eval_ref_logps/chosen": -35.788204193115234, "eval_ref_logps/rejected": -39.235660552978516, "eval_rewards/accuracies": 0.6271551847457886, "eval_rewards/chosen": -0.3156052529811859, "eval_rewards/margins": 0.1969514787197113, "eval_rewards/rejected": -0.5125567317008972, "eval_runtime": 38.114, "eval_samples_per_second": 12.148, "eval_steps_per_second": 1.522, "step": 536 }, { "epoch": 0.4017205909855994, "grad_norm": 70.27661618247365, "learning_rate": 3.3982785932186554e-07, "logps/chosen": -88.62275695800781, "logps/rejected": -94.8654556274414, "loss": 0.7058, "losses/dpo": 0.5458791255950928, "losses/sft": 0.5830483436584473, "losses/total": 0.5458791255950928, "ref_logps/chosen": -84.63155364990234, "ref_logps/rejected": -89.81563568115234, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3991207480430603, "rewards/margins": 0.10586109012365341, "rewards/rejected": -0.5049818158149719, "step": 537 }, { "epoch": 0.40246867402281655, "grad_norm": 64.99569420858215, "learning_rate": 3.392616129448038e-07, "logps/chosen": -86.03614807128906, "logps/rejected": -89.63469696044922, "loss": 0.6483, "losses/dpo": 0.5993373394012451, "losses/sft": 0.7860309481620789, "losses/total": 0.5993373394012451, "ref_logps/chosen": -82.88345336914062, "ref_logps/rejected": -85.1463623046875, "rewards/accuracies": 0.625, "rewards/chosen": -0.3152706027030945, "rewards/margins": 0.13356247544288635, "rewards/rejected": -0.44883304834365845, "step": 538 }, { "epoch": 0.4032167570600337, "grad_norm": 67.65700395617716, "learning_rate": 3.386948412467863e-07, "logps/chosen": -95.02510070800781, "logps/rejected": -104.71397399902344, "loss": 0.6168, "losses/dpo": 0.7122653722763062, "losses/sft": 0.7215588688850403, "losses/total": 0.7122653722763062, "ref_logps/chosen": -91.15094757080078, "ref_logps/rejected": -98.71623992919922, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3874160051345825, "rewards/margins": 0.21235668659210205, "rewards/rejected": -0.5997726917266846, "step": 539 }, { "epoch": 0.4039648400972508, "grad_norm": 207.33764782962, "learning_rate": 3.381275475633683e-07, "logps/chosen": -114.63864135742188, "logps/rejected": -120.53153991699219, "loss": 0.6347, "losses/dpo": 0.6241196990013123, "losses/sft": 0.7308965921401978, "losses/total": 0.6241196990013123, "ref_logps/chosen": -111.1640853881836, "ref_logps/rejected": -114.83798217773438, "rewards/accuracies": 0.59375, "rewards/chosen": -0.347455233335495, "rewards/margins": 0.22190049290657043, "rewards/rejected": -0.5693557262420654, "step": 540 }, { "epoch": 0.40471292313446794, "grad_norm": 91.545324825165, "learning_rate": 3.3755973523317716e-07, "logps/chosen": -86.60053253173828, "logps/rejected": -90.36318969726562, "loss": 0.7481, "losses/dpo": 0.8153946399688721, "losses/sft": 0.8318772912025452, "losses/total": 0.8153946399688721, "ref_logps/chosen": -81.64947509765625, "ref_logps/rejected": -85.55439758300781, "rewards/accuracies": 0.46875, "rewards/chosen": -0.4951050579547882, "rewards/margins": -0.014224782586097717, "rewards/rejected": -0.4808802902698517, "step": 541 }, { "epoch": 0.40546100617168507, "grad_norm": 89.02743658144895, "learning_rate": 3.369914075978926e-07, "logps/chosen": -105.09552001953125, "logps/rejected": -119.03538513183594, "loss": 0.6739, "losses/dpo": 0.49392253160476685, "losses/sft": 1.3437527418136597, "losses/total": 0.49392253160476685, "ref_logps/chosen": -100.62794494628906, "ref_logps/rejected": -113.63702392578125, "rewards/accuracies": 0.46875, "rewards/chosen": -0.4467574954032898, "rewards/margins": 0.09307905286550522, "rewards/rejected": -0.539836585521698, "step": 542 }, { "epoch": 0.4062090892089022, "grad_norm": 74.76389676032852, "learning_rate": 3.364225680022268e-07, "logps/chosen": -72.75846099853516, "logps/rejected": -74.41178894042969, "loss": 0.6542, "losses/dpo": 0.5088807344436646, "losses/sft": 0.7471147775650024, "losses/total": 0.5088807344436646, "ref_logps/chosen": -69.61845397949219, "ref_logps/rejected": -69.67154693603516, "rewards/accuracies": 0.625, "rewards/chosen": -0.3140007555484772, "rewards/margins": 0.16002362966537476, "rewards/rejected": -0.47402438521385193, "step": 543 }, { "epoch": 0.4069571722461193, "grad_norm": 72.36950950023052, "learning_rate": 3.3585321979390515e-07, "logps/chosen": -96.66557312011719, "logps/rejected": -108.13249206542969, "loss": 0.6465, "losses/dpo": 0.669330358505249, "losses/sft": 0.935883641242981, "losses/total": 0.669330358505249, "ref_logps/chosen": -93.12315368652344, "ref_logps/rejected": -102.98783874511719, "rewards/accuracies": 0.59375, "rewards/chosen": -0.35424160957336426, "rewards/margins": 0.16022345423698425, "rewards/rejected": -0.5144650936126709, "step": 544 }, { "epoch": 0.40770525528333645, "grad_norm": 75.42732819986884, "learning_rate": 3.352833663236462e-07, "logps/chosen": -82.45219421386719, "logps/rejected": -83.53385925292969, "loss": 0.65, "losses/dpo": 0.6775598526000977, "losses/sft": 0.6163861751556396, "losses/total": 0.6775598526000977, "ref_logps/chosen": -78.66213989257812, "ref_logps/rejected": -77.98927307128906, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3790058493614197, "rewards/margins": 0.17545285820960999, "rewards/rejected": -0.554458737373352, "step": 545 }, { "epoch": 0.4084533383205536, "grad_norm": 58.77997326110771, "learning_rate": 3.347130109451422e-07, "logps/chosen": -104.91368865966797, "logps/rejected": -108.94544219970703, "loss": 0.6267, "losses/dpo": 0.6614294648170471, "losses/sft": 0.7864300012588501, "losses/total": 0.6614294648170471, "ref_logps/chosen": -101.45561981201172, "ref_logps/rejected": -103.56749725341797, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3458077609539032, "rewards/margins": 0.1919858157634735, "rewards/rejected": -0.5377935767173767, "step": 546 }, { "epoch": 0.4092014213577707, "grad_norm": 65.42224303206484, "learning_rate": 3.34142157015039e-07, "logps/chosen": -86.69021606445312, "logps/rejected": -88.6554946899414, "loss": 0.7011, "losses/dpo": 0.8945934176445007, "losses/sft": 0.6026625633239746, "losses/total": 0.8945934176445007, "ref_logps/chosen": -83.32637023925781, "ref_logps/rejected": -84.69400787353516, "rewards/accuracies": 0.59375, "rewards/chosen": -0.336384117603302, "rewards/margins": 0.059764765202999115, "rewards/rejected": -0.3961488902568817, "step": 547 }, { "epoch": 0.40994950439498784, "grad_norm": 88.96232435459144, "learning_rate": 3.3357080789291657e-07, "logps/chosen": -94.96780395507812, "logps/rejected": -89.79536437988281, "loss": 0.6699, "losses/dpo": 0.7245442271232605, "losses/sft": 1.4581332206726074, "losses/total": 0.7245442271232605, "ref_logps/chosen": -91.74343872070312, "ref_logps/rejected": -85.22018432617188, "rewards/accuracies": 0.5, "rewards/chosen": -0.32243698835372925, "rewards/margins": 0.13508103787899017, "rewards/rejected": -0.4575180411338806, "step": 548 }, { "epoch": 0.41069758743220497, "grad_norm": 58.6251844228353, "learning_rate": 3.3299896694126937e-07, "logps/chosen": -108.19862365722656, "logps/rejected": -119.58702087402344, "loss": 0.6146, "losses/dpo": 0.5806111097335815, "losses/sft": 1.6926099061965942, "losses/total": 0.5806111097335815, "ref_logps/chosen": -105.59727478027344, "ref_logps/rejected": -114.7877197265625, "rewards/accuracies": 0.6875, "rewards/chosen": -0.26013433933258057, "rewards/margins": 0.21979546546936035, "rewards/rejected": -0.47992977499961853, "step": 549 }, { "epoch": 0.4114456704694221, "grad_norm": 64.62622677138549, "learning_rate": 3.3242663752548616e-07, "logps/chosen": -90.39683532714844, "logps/rejected": -94.21862030029297, "loss": 0.6843, "losses/dpo": 0.6595144271850586, "losses/sft": 0.34095773100852966, "losses/total": 0.6595144271850586, "ref_logps/chosen": -86.8149642944336, "ref_logps/rejected": -89.16761016845703, "rewards/accuracies": 0.53125, "rewards/chosen": -0.35818803310394287, "rewards/margins": 0.1469123512506485, "rewards/rejected": -0.5051003694534302, "step": 550 }, { "epoch": 0.4121937535066392, "grad_norm": 69.54361906638411, "learning_rate": 3.3185382301383056e-07, "logps/chosen": -107.34070587158203, "logps/rejected": -113.35000610351562, "loss": 0.631, "losses/dpo": 0.8718062043190002, "losses/sft": 0.55952388048172, "losses/total": 0.8718062043190002, "ref_logps/chosen": -103.72281646728516, "ref_logps/rejected": -107.73136901855469, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3617892563343048, "rewards/margins": 0.20007365942001343, "rewards/rejected": -0.5618628859519958, "step": 551 }, { "epoch": 0.41294183654385636, "grad_norm": 64.60565008014262, "learning_rate": 3.3128052677742086e-07, "logps/chosen": -107.95960235595703, "logps/rejected": -117.19122314453125, "loss": 0.6367, "losses/dpo": 0.7117491364479065, "losses/sft": 1.0145138502120972, "losses/total": 0.7117491364479065, "ref_logps/chosen": -104.67643737792969, "ref_logps/rejected": -111.88963317871094, "rewards/accuracies": 0.625, "rewards/chosen": -0.3283158242702484, "rewards/margins": 0.20184314250946045, "rewards/rejected": -0.5301588773727417, "step": 552 }, { "epoch": 0.4136899195810735, "grad_norm": 55.41213622349196, "learning_rate": 3.3070675219021056e-07, "logps/chosen": -89.73562622070312, "logps/rejected": -99.87869262695312, "loss": 0.5998, "losses/dpo": 0.6481798887252808, "losses/sft": 1.334252953529358, "losses/total": 0.6481798887252808, "ref_logps/chosen": -87.10983276367188, "ref_logps/rejected": -94.39178466796875, "rewards/accuracies": 0.59375, "rewards/chosen": -0.26257964968681335, "rewards/margins": 0.28611117601394653, "rewards/rejected": -0.548690915107727, "step": 553 }, { "epoch": 0.4144380026182906, "grad_norm": 71.33640583349579, "learning_rate": 3.301325026289683e-07, "logps/chosen": -96.17607879638672, "logps/rejected": -125.7203598022461, "loss": 0.6097, "losses/dpo": 0.6327696442604065, "losses/sft": 1.4134737253189087, "losses/total": 0.6327696442604065, "ref_logps/chosen": -93.20203399658203, "ref_logps/rejected": -120.39506530761719, "rewards/accuracies": 0.6875, "rewards/chosen": -0.29740479588508606, "rewards/margins": 0.23512482643127441, "rewards/rejected": -0.5325295925140381, "step": 554 }, { "epoch": 0.41518608565550774, "grad_norm": 62.93041755352251, "learning_rate": 3.2955778147325795e-07, "logps/chosen": -92.91413879394531, "logps/rejected": -95.86158752441406, "loss": 0.6515, "losses/dpo": 0.6873270273208618, "losses/sft": 0.7733550667762756, "losses/total": 0.6873270273208618, "ref_logps/chosen": -90.2946548461914, "ref_logps/rejected": -91.56927490234375, "rewards/accuracies": 0.59375, "rewards/chosen": -0.26194876432418823, "rewards/margins": 0.16728270053863525, "rewards/rejected": -0.42923152446746826, "step": 555 }, { "epoch": 0.41593416869272487, "grad_norm": 68.32654115213809, "learning_rate": 3.28982592105419e-07, "logps/chosen": -105.28170776367188, "logps/rejected": -107.42449188232422, "loss": 0.591, "losses/dpo": 0.7157949209213257, "losses/sft": 1.0213172435760498, "losses/total": 0.7157949209213257, "ref_logps/chosen": -103.11618041992188, "ref_logps/rejected": -102.47515106201172, "rewards/accuracies": 0.625, "rewards/chosen": -0.21655279397964478, "rewards/margins": 0.27838125824928284, "rewards/rejected": -0.4949340522289276, "step": 556 }, { "epoch": 0.416682251729942, "grad_norm": 54.880690051542246, "learning_rate": 3.2840693791054625e-07, "logps/chosen": -106.82046508789062, "logps/rejected": -114.80335235595703, "loss": 0.6162, "losses/dpo": 0.6282214522361755, "losses/sft": 1.3896490335464478, "losses/total": 0.6282214522361755, "ref_logps/chosen": -103.9823989868164, "ref_logps/rejected": -109.63600158691406, "rewards/accuracies": 0.625, "rewards/chosen": -0.28380659222602844, "rewards/margins": 0.2329292595386505, "rewards/rejected": -0.516735851764679, "step": 557 }, { "epoch": 0.41743033476715913, "grad_norm": 62.25583837315822, "learning_rate": 3.278308222764702e-07, "logps/chosen": -89.1195068359375, "logps/rejected": -88.68072509765625, "loss": 0.6928, "losses/dpo": 0.6509997844696045, "losses/sft": 0.6958317756652832, "losses/total": 0.6509997844696045, "ref_logps/chosen": -85.60920715332031, "ref_logps/rejected": -84.78532409667969, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3510305881500244, "rewards/margins": 0.03850971534848213, "rewards/rejected": -0.38954031467437744, "step": 558 }, { "epoch": 0.4181784178043763, "grad_norm": 60.25138644447483, "learning_rate": 3.272542485937368e-07, "logps/chosen": -98.06887817382812, "logps/rejected": -98.65748596191406, "loss": 0.593, "losses/dpo": 0.4702034890651703, "losses/sft": 1.0627949237823486, "losses/total": 0.4702034890651703, "ref_logps/chosen": -95.6250991821289, "ref_logps/rejected": -93.07026672363281, "rewards/accuracies": 0.59375, "rewards/chosen": -0.24437853693962097, "rewards/margins": 0.31434276700019836, "rewards/rejected": -0.5587213039398193, "step": 559 }, { "epoch": 0.41892650084159344, "grad_norm": 66.59040326462816, "learning_rate": 3.266772202555882e-07, "logps/chosen": -71.45325469970703, "logps/rejected": -72.72718048095703, "loss": 0.7115, "losses/dpo": 0.6638317704200745, "losses/sft": 0.6801550984382629, "losses/total": 0.6638317704200745, "ref_logps/chosen": -67.53736877441406, "ref_logps/rejected": -68.42790985107422, "rewards/accuracies": 0.6875, "rewards/chosen": -0.39158791303634644, "rewards/margins": 0.038339100778102875, "rewards/rejected": -0.4299269914627075, "step": 560 }, { "epoch": 0.41967458387881057, "grad_norm": 66.22693878999338, "learning_rate": 3.260997406579417e-07, "logps/chosen": -105.10626220703125, "logps/rejected": -114.35891723632812, "loss": 0.6504, "losses/dpo": 0.4142158627510071, "losses/sft": 0.35586196184158325, "losses/total": 0.4142158627510071, "ref_logps/chosen": -101.24308776855469, "ref_logps/rejected": -108.59030151367188, "rewards/accuracies": 0.65625, "rewards/chosen": -0.38631826639175415, "rewards/margins": 0.1905430257320404, "rewards/rejected": -0.5768612623214722, "step": 561 }, { "epoch": 0.4204226669160277, "grad_norm": 61.530440480654846, "learning_rate": 3.255218131993707e-07, "logps/chosen": -95.99339294433594, "logps/rejected": -100.72029113769531, "loss": 0.6354, "losses/dpo": 0.5294334292411804, "losses/sft": 1.0159187316894531, "losses/total": 0.5294334292411804, "ref_logps/chosen": -93.13920593261719, "ref_logps/rejected": -95.8665542602539, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2854187488555908, "rewards/margins": 0.19995543360710144, "rewards/rejected": -0.48537421226501465, "step": 562 }, { "epoch": 0.4211707499532448, "grad_norm": 76.55668571971655, "learning_rate": 3.249434412810841e-07, "logps/chosen": -87.65216064453125, "logps/rejected": -103.87351989746094, "loss": 0.5893, "losses/dpo": 0.5677692890167236, "losses/sft": 0.4379825294017792, "losses/total": 0.5677692890167236, "ref_logps/chosen": -84.57865142822266, "ref_logps/rejected": -97.59904479980469, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3073505461215973, "rewards/margins": 0.32009661197662354, "rewards/rejected": -0.6274471879005432, "step": 563 }, { "epoch": 0.42191883299046196, "grad_norm": 65.87712290634902, "learning_rate": 3.243646283069068e-07, "logps/chosen": -109.8886947631836, "logps/rejected": -119.0110092163086, "loss": 0.6058, "losses/dpo": 0.3630932569503784, "losses/sft": 0.5180153250694275, "losses/total": 0.3630932569503784, "ref_logps/chosen": -107.08323669433594, "ref_logps/rejected": -113.43138885498047, "rewards/accuracies": 0.625, "rewards/chosen": -0.2805459499359131, "rewards/margins": 0.2774158716201782, "rewards/rejected": -0.5579618215560913, "step": 564 }, { "epoch": 0.4226669160276791, "grad_norm": 105.41479826667373, "learning_rate": 3.237853776832593e-07, "logps/chosen": -89.50782775878906, "logps/rejected": -96.37557983398438, "loss": 0.6274, "losses/dpo": 0.4444139897823334, "losses/sft": 0.7910679578781128, "losses/total": 0.4444139897823334, "ref_logps/chosen": -86.23632049560547, "ref_logps/rejected": -90.62142944335938, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3271498680114746, "rewards/margins": 0.248266339302063, "rewards/rejected": -0.5754162073135376, "step": 565 }, { "epoch": 0.4234149990648962, "grad_norm": 70.53690864062276, "learning_rate": 3.2320569281913754e-07, "logps/chosen": -116.66152954101562, "logps/rejected": -128.68881225585938, "loss": 0.6372, "losses/dpo": 0.6119585633277893, "losses/sft": 1.305147647857666, "losses/total": 0.6119585633277893, "ref_logps/chosen": -112.30741882324219, "ref_logps/rejected": -122.73194122314453, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4354117214679718, "rewards/margins": 0.1602742075920105, "rewards/rejected": -0.5956859588623047, "step": 566 }, { "epoch": 0.42416308210211334, "grad_norm": 67.94996715373635, "learning_rate": 3.226255771260935e-07, "logps/chosen": -117.61006927490234, "logps/rejected": -123.28350067138672, "loss": 0.6226, "losses/dpo": 0.8232445120811462, "losses/sft": 1.183520793914795, "losses/total": 0.8232445120811462, "ref_logps/chosen": -114.17011260986328, "ref_logps/rejected": -117.84645080566406, "rewards/accuracies": 0.75, "rewards/chosen": -0.34399569034576416, "rewards/margins": 0.19970980286598206, "rewards/rejected": -0.5437054634094238, "step": 567 }, { "epoch": 0.42491116513933047, "grad_norm": 79.89191127601845, "learning_rate": 3.2204503401821405e-07, "logps/chosen": -97.13182830810547, "logps/rejected": -111.55624389648438, "loss": 0.6443, "losses/dpo": 0.9807767868041992, "losses/sft": 1.540515661239624, "losses/total": 0.9807767868041992, "ref_logps/chosen": -94.08938598632812, "ref_logps/rejected": -106.2016830444336, "rewards/accuracies": 0.625, "rewards/chosen": -0.3042442500591278, "rewards/margins": 0.23121193051338196, "rewards/rejected": -0.5354561805725098, "step": 568 }, { "epoch": 0.4256592481765476, "grad_norm": 43.79545873501666, "learning_rate": 3.21464066912102e-07, "logps/chosen": -70.56554412841797, "logps/rejected": -76.53166198730469, "loss": 0.6168, "losses/dpo": 0.599460244178772, "losses/sft": 0.6058323383331299, "losses/total": 0.599460244178772, "ref_logps/chosen": -67.25223541259766, "ref_logps/rejected": -70.6937026977539, "rewards/accuracies": 0.625, "rewards/chosen": -0.3313309848308563, "rewards/margins": 0.2524654269218445, "rewards/rejected": -0.5837963819503784, "step": 569 }, { "epoch": 0.42640733121376473, "grad_norm": 71.0731572435931, "learning_rate": 3.208826792268552e-07, "logps/chosen": -102.1513671875, "logps/rejected": -114.25880432128906, "loss": 0.592, "losses/dpo": 0.48905670642852783, "losses/sft": 0.5965745449066162, "losses/total": 0.48905670642852783, "ref_logps/chosen": -99.26106262207031, "ref_logps/rejected": -108.68408966064453, "rewards/accuracies": 0.75, "rewards/chosen": -0.28903090953826904, "rewards/margins": 0.2684408128261566, "rewards/rejected": -0.557471752166748, "step": 570 }, { "epoch": 0.42715541425098186, "grad_norm": 59.4284662226778, "learning_rate": 3.203008743840468e-07, "logps/chosen": -92.17697143554688, "logps/rejected": -107.18965148925781, "loss": 0.6025, "losses/dpo": 0.48621609807014465, "losses/sft": 0.8535603880882263, "losses/total": 0.48621609807014465, "ref_logps/chosen": -89.46080017089844, "ref_logps/rejected": -101.96305084228516, "rewards/accuracies": 0.6875, "rewards/chosen": -0.27161705493927, "rewards/margins": 0.2510426342487335, "rewards/rejected": -0.5226596593856812, "step": 571 }, { "epoch": 0.427903497288199, "grad_norm": 100.4918629592795, "learning_rate": 3.19718655807705e-07, "logps/chosen": -82.79302215576172, "logps/rejected": -97.14334869384766, "loss": 0.6022, "losses/dpo": 0.5265401601791382, "losses/sft": 0.8754407167434692, "losses/total": 0.5265401601791382, "ref_logps/chosen": -80.36764526367188, "ref_logps/rejected": -92.15611267089844, "rewards/accuracies": 0.71875, "rewards/chosen": -0.24253755807876587, "rewards/margins": 0.2561861574649811, "rewards/rejected": -0.49872371554374695, "step": 572 }, { "epoch": 0.4286515803254161, "grad_norm": 71.30129355157369, "learning_rate": 3.1913602692429275e-07, "logps/chosen": -108.59872436523438, "logps/rejected": -111.6538314819336, "loss": 0.5808, "losses/dpo": 0.6087496876716614, "losses/sft": 0.8173922896385193, "losses/total": 0.6087496876716614, "ref_logps/chosen": -106.47711944580078, "ref_logps/rejected": -106.30694580078125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.21216163039207458, "rewards/margins": 0.3225269913673401, "rewards/rejected": -0.5346886515617371, "step": 573 }, { "epoch": 0.42939966336263324, "grad_norm": 87.09631208108067, "learning_rate": 3.18552991162688e-07, "logps/chosen": -99.32667541503906, "logps/rejected": -104.52302551269531, "loss": 0.6316, "losses/dpo": 0.5396655201911926, "losses/sft": 0.8887767195701599, "losses/total": 0.5396655201911926, "ref_logps/chosen": -96.53710174560547, "ref_logps/rejected": -99.96852111816406, "rewards/accuracies": 0.59375, "rewards/chosen": -0.27895742654800415, "rewards/margins": 0.17649319767951965, "rewards/rejected": -0.4554505944252014, "step": 574 }, { "epoch": 0.4301477463998504, "grad_norm": 70.57103925584333, "learning_rate": 3.179695519541628e-07, "logps/chosen": -99.565185546875, "logps/rejected": -116.73143005371094, "loss": 0.599, "losses/dpo": 0.5669975876808167, "losses/sft": 1.0618538856506348, "losses/total": 0.5669975876808167, "ref_logps/chosen": -97.2781753540039, "ref_logps/rejected": -111.54563903808594, "rewards/accuracies": 0.71875, "rewards/chosen": -0.22869986295700073, "rewards/margins": 0.2898791432380676, "rewards/rejected": -0.5185790061950684, "step": 575 }, { "epoch": 0.4308958294370675, "grad_norm": 75.57854450985592, "learning_rate": 3.1738571273236414e-07, "logps/chosen": -94.3277587890625, "logps/rejected": -104.15563201904297, "loss": 0.6212, "losses/dpo": 0.543371319770813, "losses/sft": 1.1963356733322144, "losses/total": 0.543371319770813, "ref_logps/chosen": -91.29320526123047, "ref_logps/rejected": -98.93323516845703, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3034552335739136, "rewards/margins": 0.21878503262996674, "rewards/rejected": -0.5222402811050415, "step": 576 }, { "epoch": 0.43164391247428463, "grad_norm": 73.1098583227472, "learning_rate": 3.1680147693329276e-07, "logps/chosen": -83.40862274169922, "logps/rejected": -89.52815246582031, "loss": 0.6334, "losses/dpo": 0.6343467235565186, "losses/sft": 0.583922266960144, "losses/total": 0.6343467235565186, "ref_logps/chosen": -79.90872192382812, "ref_logps/rejected": -84.09806823730469, "rewards/accuracies": 0.5625, "rewards/chosen": -0.34999069571495056, "rewards/margins": 0.1930173635482788, "rewards/rejected": -0.5430080890655518, "step": 577 }, { "epoch": 0.43239199551150176, "grad_norm": 79.17913906696012, "learning_rate": 3.162168479952834e-07, "logps/chosen": -101.01660919189453, "logps/rejected": -107.38810729980469, "loss": 0.6403, "losses/dpo": 0.643623948097229, "losses/sft": 1.021632194519043, "losses/total": 0.643623948097229, "ref_logps/chosen": -98.23513793945312, "ref_logps/rejected": -103.07054138183594, "rewards/accuracies": 0.75, "rewards/chosen": -0.2781481444835663, "rewards/margins": 0.15360984206199646, "rewards/rejected": -0.43175798654556274, "step": 578 }, { "epoch": 0.4331400785487189, "grad_norm": 68.4160344012926, "learning_rate": 3.1563182935898457e-07, "logps/chosen": -91.25782775878906, "logps/rejected": -95.88185119628906, "loss": 0.7075, "losses/dpo": 0.737692654132843, "losses/sft": 0.746170699596405, "losses/total": 0.737692654132843, "ref_logps/chosen": -88.90159606933594, "ref_logps/rejected": -93.29762268066406, "rewards/accuracies": 0.59375, "rewards/chosen": -0.23562397062778473, "rewards/margins": 0.022799521684646606, "rewards/rejected": -0.25842347741127014, "step": 579 }, { "epoch": 0.433888161585936, "grad_norm": 85.25340470034314, "learning_rate": 3.1504642446733826e-07, "logps/chosen": -97.22107696533203, "logps/rejected": -104.99661254882812, "loss": 0.5991, "losses/dpo": 0.6104007959365845, "losses/sft": 0.9574998021125793, "losses/total": 0.6104007959365845, "ref_logps/chosen": -94.47223663330078, "ref_logps/rejected": -99.46815490722656, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2748849391937256, "rewards/margins": 0.27795982360839844, "rewards/rejected": -0.552844762802124, "step": 580 }, { "epoch": 0.43463624462315315, "grad_norm": 65.91670620897396, "learning_rate": 3.1446063676555944e-07, "logps/chosen": -72.59549713134766, "logps/rejected": -83.93070983886719, "loss": 0.6496, "losses/dpo": 0.4790010452270508, "losses/sft": 0.7397443056106567, "losses/total": 0.4790010452270508, "ref_logps/chosen": -70.15288543701172, "ref_logps/rejected": -79.6253662109375, "rewards/accuracies": 0.625, "rewards/chosen": -0.2442607283592224, "rewards/margins": 0.18627384305000305, "rewards/rejected": -0.43053460121154785, "step": 581 }, { "epoch": 0.4353843276603703, "grad_norm": 60.89165599518891, "learning_rate": 3.1387446970111633e-07, "logps/chosen": -96.09858703613281, "logps/rejected": -104.29052734375, "loss": 0.5771, "losses/dpo": 0.5431051850318909, "losses/sft": 1.2324751615524292, "losses/total": 0.5431051850318909, "ref_logps/chosen": -93.1080322265625, "ref_logps/rejected": -97.64286041259766, "rewards/accuracies": 0.65625, "rewards/chosen": -0.29905620217323303, "rewards/margins": 0.36571067571640015, "rewards/rejected": -0.6647669076919556, "step": 582 }, { "epoch": 0.43613241069758746, "grad_norm": 66.62794513884765, "learning_rate": 3.132879267237093e-07, "logps/chosen": -100.37779235839844, "logps/rejected": -113.95277404785156, "loss": 0.6154, "losses/dpo": 0.5648993849754333, "losses/sft": 0.9180312752723694, "losses/total": 0.5648993849754333, "ref_logps/chosen": -98.15699005126953, "ref_logps/rejected": -109.71261596679688, "rewards/accuracies": 0.6875, "rewards/chosen": -0.22208113968372345, "rewards/margins": 0.2019346058368683, "rewards/rejected": -0.42401570081710815, "step": 583 }, { "epoch": 0.4368804937348046, "grad_norm": 82.98709323218546, "learning_rate": 3.127010112852514e-07, "logps/chosen": -92.833740234375, "logps/rejected": -96.1993179321289, "loss": 0.6526, "losses/dpo": 0.5558457970619202, "losses/sft": 1.0278451442718506, "losses/total": 0.5558457970619202, "ref_logps/chosen": -89.47407531738281, "ref_logps/rejected": -91.50212860107422, "rewards/accuracies": 0.59375, "rewards/chosen": -0.33596593141555786, "rewards/margins": 0.1337534487247467, "rewards/rejected": -0.46971940994262695, "step": 584 }, { "epoch": 0.4376285767720217, "grad_norm": 65.54915165181269, "learning_rate": 3.121137268398477e-07, "logps/chosen": -69.38302612304688, "logps/rejected": -71.73271179199219, "loss": 0.7105, "losses/dpo": 0.6269130706787109, "losses/sft": 0.5185239315032959, "losses/total": 0.6269130706787109, "ref_logps/chosen": -66.07160186767578, "ref_logps/rejected": -68.24417877197266, "rewards/accuracies": 0.5, "rewards/chosen": -0.3311431407928467, "rewards/margins": 0.01770991086959839, "rewards/rejected": -0.34885305166244507, "step": 585 }, { "epoch": 0.43837665980923884, "grad_norm": 84.82328463572983, "learning_rate": 3.1152607684377467e-07, "logps/chosen": -92.81410217285156, "logps/rejected": -96.60641479492188, "loss": 0.6285, "losses/dpo": 0.6292883157730103, "losses/sft": 0.5830672979354858, "losses/total": 0.6292883157730103, "ref_logps/chosen": -89.56978607177734, "ref_logps/rejected": -91.41510009765625, "rewards/accuracies": 0.625, "rewards/chosen": -0.3244333267211914, "rewards/margins": 0.1946980059146881, "rewards/rejected": -0.5191313624382019, "step": 586 }, { "epoch": 0.439124742846456, "grad_norm": 77.32140608044344, "learning_rate": 3.109380647554604e-07, "logps/chosen": -87.85423278808594, "logps/rejected": -87.98773956298828, "loss": 0.6518, "losses/dpo": 0.7079410552978516, "losses/sft": 0.19831973314285278, "losses/total": 0.7079410552978516, "ref_logps/chosen": -84.3681411743164, "ref_logps/rejected": -82.62571716308594, "rewards/accuracies": 0.59375, "rewards/chosen": -0.34860938787460327, "rewards/margins": 0.18759284913539886, "rewards/rejected": -0.5362021923065186, "step": 587 }, { "epoch": 0.4398728258836731, "grad_norm": 58.156657730468886, "learning_rate": 3.103496940354637e-07, "logps/chosen": -88.40480041503906, "logps/rejected": -95.12994384765625, "loss": 0.6647, "losses/dpo": 0.7173474431037903, "losses/sft": 0.6402688026428223, "losses/total": 0.7173474431037903, "ref_logps/chosen": -84.89547729492188, "ref_logps/rejected": -90.66358947753906, "rewards/accuracies": 0.625, "rewards/chosen": -0.35093221068382263, "rewards/margins": 0.0957026481628418, "rewards/rejected": -0.44663482904434204, "step": 588 }, { "epoch": 0.44062090892089023, "grad_norm": 63.99737799757085, "learning_rate": 3.097609681464542e-07, "logps/chosen": -101.99127197265625, "logps/rejected": -103.31422424316406, "loss": 0.623, "losses/dpo": 0.7317803502082825, "losses/sft": 0.7122834920883179, "losses/total": 0.7317803502082825, "ref_logps/chosen": -98.70166778564453, "ref_logps/rejected": -97.81468200683594, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3289600610733032, "rewards/margins": 0.220994234085083, "rewards/rejected": -0.5499542951583862, "step": 589 }, { "epoch": 0.44136899195810736, "grad_norm": 67.34773261735945, "learning_rate": 3.091718905531916e-07, "logps/chosen": -104.08946990966797, "logps/rejected": -114.51091766357422, "loss": 0.644, "losses/dpo": 0.7823739051818848, "losses/sft": 1.006361722946167, "losses/total": 0.7823739051818848, "ref_logps/chosen": -100.74207305908203, "ref_logps/rejected": -109.56292724609375, "rewards/accuracies": 0.625, "rewards/chosen": -0.33474037051200867, "rewards/margins": 0.1600579023361206, "rewards/rejected": -0.49479830265045166, "step": 590 }, { "epoch": 0.4421170749953245, "grad_norm": 55.43429207793735, "learning_rate": 3.085824647225056e-07, "logps/chosen": -93.21284484863281, "logps/rejected": -116.74262237548828, "loss": 0.6128, "losses/dpo": 0.5922693014144897, "losses/sft": 0.9347488880157471, "losses/total": 0.5922693014144897, "ref_logps/chosen": -90.23699951171875, "ref_logps/rejected": -111.43778991699219, "rewards/accuracies": 0.65625, "rewards/chosen": -0.29758548736572266, "rewards/margins": 0.23289784789085388, "rewards/rejected": -0.5304833650588989, "step": 591 }, { "epoch": 0.4428651580325416, "grad_norm": 83.76550454627544, "learning_rate": 3.079926941232753e-07, "logps/chosen": -79.63245391845703, "logps/rejected": -92.5086669921875, "loss": 0.6326, "losses/dpo": 0.6509387493133545, "losses/sft": 1.2367701530456543, "losses/total": 0.6509387493133545, "ref_logps/chosen": -76.95489501953125, "ref_logps/rejected": -87.728759765625, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2677555978298187, "rewards/margins": 0.21023614704608917, "rewards/rejected": -0.4779917597770691, "step": 592 }, { "epoch": 0.44361324106975875, "grad_norm": 91.0379722933897, "learning_rate": 3.0740258222640863e-07, "logps/chosen": -81.41388702392578, "logps/rejected": -91.30450439453125, "loss": 0.6598, "losses/dpo": 0.7942617535591125, "losses/sft": 0.5874863862991333, "losses/total": 0.7942617535591125, "ref_logps/chosen": -77.5085678100586, "ref_logps/rejected": -85.88697814941406, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3905317783355713, "rewards/margins": 0.1512201726436615, "rewards/rejected": -0.5417519211769104, "step": 593 }, { "epoch": 0.4443613241069759, "grad_norm": 63.01917616838073, "learning_rate": 3.068121325048225e-07, "logps/chosen": -84.73677062988281, "logps/rejected": -87.90411376953125, "loss": 0.7072, "losses/dpo": 0.7133409976959229, "losses/sft": 0.7097502946853638, "losses/total": 0.7133409976959229, "ref_logps/chosen": -80.9483642578125, "ref_logps/rejected": -83.99176025390625, "rewards/accuracies": 0.53125, "rewards/chosen": -0.3788405954837799, "rewards/margins": 0.012394292280077934, "rewards/rejected": -0.3912348747253418, "step": 594 }, { "epoch": 0.445109407144193, "grad_norm": 69.74280601236602, "learning_rate": 3.0622134843342164e-07, "logps/chosen": -105.42146301269531, "logps/rejected": -116.7693099975586, "loss": 0.5984, "losses/dpo": 0.6619859933853149, "losses/sft": 1.5512886047363281, "losses/total": 0.6619859933853149, "ref_logps/chosen": -101.73481750488281, "ref_logps/rejected": -109.71050262451172, "rewards/accuracies": 0.625, "rewards/chosen": -0.3686646819114685, "rewards/margins": 0.33721643686294556, "rewards/rejected": -0.7058811187744141, "step": 595 }, { "epoch": 0.44585749018141013, "grad_norm": 95.39165540383925, "learning_rate": 3.056302334890786e-07, "logps/chosen": -86.69895935058594, "logps/rejected": -89.07525634765625, "loss": 0.683, "losses/dpo": 0.4644124507904053, "losses/sft": 0.839553952217102, "losses/total": 0.4644124507904053, "ref_logps/chosen": -83.46466064453125, "ref_logps/rejected": -84.72047424316406, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3234294354915619, "rewards/margins": 0.11204937100410461, "rewards/rejected": -0.4354788064956665, "step": 596 }, { "epoch": 0.44660557321862726, "grad_norm": 82.0909706809841, "learning_rate": 3.050387911506132e-07, "logps/chosen": -97.25286102294922, "logps/rejected": -91.03499603271484, "loss": 0.6751, "losses/dpo": 0.8827547430992126, "losses/sft": 1.4183205366134644, "losses/total": 0.8827547430992126, "ref_logps/chosen": -93.77871704101562, "ref_logps/rejected": -86.68141174316406, "rewards/accuracies": 0.625, "rewards/chosen": -0.34741470217704773, "rewards/margins": 0.08794364333152771, "rewards/rejected": -0.4353583753108978, "step": 597 }, { "epoch": 0.4473536562558444, "grad_norm": 56.42608195878682, "learning_rate": 3.0444702489877216e-07, "logps/chosen": -112.54423522949219, "logps/rejected": -118.77587890625, "loss": 0.6302, "losses/dpo": 0.8036903142929077, "losses/sft": 1.4605646133422852, "losses/total": 0.8036903142929077, "ref_logps/chosen": -109.21426391601562, "ref_logps/rejected": -113.4338150024414, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3329964280128479, "rewards/margins": 0.20121099054813385, "rewards/rejected": -0.5342074632644653, "step": 598 }, { "epoch": 0.4481017392930615, "grad_norm": 101.9499360432782, "learning_rate": 3.038549382162081e-07, "logps/chosen": -111.16635131835938, "logps/rejected": -111.83452606201172, "loss": 0.7488, "losses/dpo": 0.6103127002716064, "losses/sft": 1.1141865253448486, "losses/total": 0.6103127002716064, "ref_logps/chosen": -106.96540832519531, "ref_logps/rejected": -107.99913024902344, "rewards/accuracies": 0.53125, "rewards/chosen": -0.42009487748146057, "rewards/margins": -0.03655475005507469, "rewards/rejected": -0.3835401237010956, "step": 599 }, { "epoch": 0.44884982233027865, "grad_norm": 63.70914580295086, "learning_rate": 3.0326253458746e-07, "logps/chosen": -107.8551025390625, "logps/rejected": -112.22441101074219, "loss": 0.6543, "losses/dpo": 0.6091989874839783, "losses/sft": 1.1566228866577148, "losses/total": 0.6091989874839783, "ref_logps/chosen": -103.47481536865234, "ref_logps/rejected": -106.19183349609375, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4380283057689667, "rewards/margins": 0.16522935032844543, "rewards/rejected": -0.6032576560974121, "step": 600 }, { "epoch": 0.4495979053674958, "grad_norm": 63.42340696542314, "learning_rate": 3.0266981749893154e-07, "logps/chosen": -121.48268127441406, "logps/rejected": -124.24080657958984, "loss": 0.6821, "losses/dpo": 0.47733116149902344, "losses/sft": 1.2274082899093628, "losses/total": 0.47733116149902344, "ref_logps/chosen": -117.20759582519531, "ref_logps/rejected": -118.68032836914062, "rewards/accuracies": 0.65625, "rewards/chosen": -0.42750823497772217, "rewards/margins": 0.12853950262069702, "rewards/rejected": -0.5560477375984192, "step": 601 }, { "epoch": 0.4503459884047129, "grad_norm": 78.83500133575131, "learning_rate": 3.020767904388716e-07, "logps/chosen": -94.61759948730469, "logps/rejected": -106.8546142578125, "loss": 0.5936, "losses/dpo": 0.46889209747314453, "losses/sft": 1.0110926628112793, "losses/total": 0.46889209747314453, "ref_logps/chosen": -92.49598693847656, "ref_logps/rejected": -101.53587341308594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.212161585688591, "rewards/margins": 0.3197129964828491, "rewards/rejected": -0.5318745374679565, "step": 602 }, { "epoch": 0.45109407144193003, "grad_norm": 104.60024233802746, "learning_rate": 3.014834568973532e-07, "logps/chosen": -118.50169372558594, "logps/rejected": -120.34476470947266, "loss": 0.6428, "losses/dpo": 0.5077962279319763, "losses/sft": 0.9012305736541748, "losses/total": 0.5077962279319763, "ref_logps/chosen": -114.6470947265625, "ref_logps/rejected": -114.52182006835938, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3854594826698303, "rewards/margins": 0.1968350112438202, "rewards/rejected": -0.5822944641113281, "step": 603 }, { "epoch": 0.45184215447914716, "grad_norm": 57.85468957500076, "learning_rate": 3.0088982036625273e-07, "logps/chosen": -80.93902587890625, "logps/rejected": -97.2328872680664, "loss": 0.5664, "losses/dpo": 0.6743850708007812, "losses/sft": 0.6833662986755371, "losses/total": 0.6743850708007812, "ref_logps/chosen": -79.35145568847656, "ref_logps/rejected": -92.3011703491211, "rewards/accuracies": 0.75, "rewards/chosen": -0.15875637531280518, "rewards/margins": 0.3344150185585022, "rewards/rejected": -0.4931713938713074, "step": 604 }, { "epoch": 0.4525902375163643, "grad_norm": 81.51641203032585, "learning_rate": 3.0029588433923024e-07, "logps/chosen": -95.56951904296875, "logps/rejected": -114.81880187988281, "loss": 0.6234, "losses/dpo": 0.7082030773162842, "losses/sft": 1.2077592611312866, "losses/total": 0.7082030773162842, "ref_logps/chosen": -92.85069274902344, "ref_logps/rejected": -108.72093963623047, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2718830704689026, "rewards/margins": 0.3379019498825073, "rewards/rejected": -0.6097850203514099, "step": 605 }, { "epoch": 0.4533383205535815, "grad_norm": 86.62240301645818, "learning_rate": 2.997016523117081e-07, "logps/chosen": -78.72555541992188, "logps/rejected": -87.86203002929688, "loss": 0.5934, "losses/dpo": 0.5316370725631714, "losses/sft": 0.6614341139793396, "losses/total": 0.5316370725631714, "ref_logps/chosen": -76.09919738769531, "ref_logps/rejected": -82.60214233398438, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2626360356807709, "rewards/margins": 0.2633529305458069, "rewards/rejected": -0.5259889364242554, "step": 606 }, { "epoch": 0.4540864035907986, "grad_norm": 65.18166862877885, "learning_rate": 2.9910712778085065e-07, "logps/chosen": -93.04177856445312, "logps/rejected": -99.390869140625, "loss": 0.6753, "losses/dpo": 0.6468925476074219, "losses/sft": 0.5286334156990051, "losses/total": 0.6468925476074219, "ref_logps/chosen": -89.05633544921875, "ref_logps/rejected": -94.04995727539062, "rewards/accuracies": 0.53125, "rewards/chosen": -0.3985441029071808, "rewards/margins": 0.13554655015468597, "rewards/rejected": -0.5340906381607056, "step": 607 }, { "epoch": 0.45483448662801573, "grad_norm": 61.35449469780354, "learning_rate": 2.985123142455438e-07, "logps/chosen": -103.71066284179688, "logps/rejected": -104.96378326416016, "loss": 0.6956, "losses/dpo": 0.4971983730792999, "losses/sft": 0.7978631258010864, "losses/total": 0.4971983730792999, "ref_logps/chosen": -101.63436889648438, "ref_logps/rejected": -102.25829315185547, "rewards/accuracies": 0.5, "rewards/chosen": -0.20762915909290314, "rewards/margins": 0.06292077898979187, "rewards/rejected": -0.2705499231815338, "step": 608 }, { "epoch": 0.45558256966523286, "grad_norm": 54.566009775649746, "learning_rate": 2.9791721520637425e-07, "logps/chosen": -96.64830780029297, "logps/rejected": -113.66720581054688, "loss": 0.6208, "losses/dpo": 0.48595505952835083, "losses/sft": 1.2291420698165894, "losses/total": 0.48595505952835083, "ref_logps/chosen": -93.74925231933594, "ref_logps/rejected": -108.36664581298828, "rewards/accuracies": 0.65625, "rewards/chosen": -0.28990525007247925, "rewards/margins": 0.2401517629623413, "rewards/rejected": -0.5300570130348206, "step": 609 }, { "epoch": 0.45633065270245, "grad_norm": 106.86927507024892, "learning_rate": 2.9732183416560897e-07, "logps/chosen": -89.1168212890625, "logps/rejected": -90.38671112060547, "loss": 0.7183, "losses/dpo": 0.9705816507339478, "losses/sft": 0.574019193649292, "losses/total": 0.9705816507339478, "ref_logps/chosen": -84.63175964355469, "ref_logps/rejected": -85.46549987792969, "rewards/accuracies": 0.625, "rewards/chosen": -0.4485051929950714, "rewards/margins": 0.04361598566174507, "rewards/rejected": -0.492121160030365, "step": 610 }, { "epoch": 0.4570787357396671, "grad_norm": 76.50160159510128, "learning_rate": 2.967261746271744e-07, "logps/chosen": -84.42436981201172, "logps/rejected": -98.92903137207031, "loss": 0.6781, "losses/dpo": 0.8697295188903809, "losses/sft": 0.4226045608520508, "losses/total": 0.8697295188903809, "ref_logps/chosen": -79.716796875, "ref_logps/rejected": -93.19950866699219, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4707571864128113, "rewards/margins": 0.102195605635643, "rewards/rejected": -0.5729528069496155, "step": 611 }, { "epoch": 0.45782681877688425, "grad_norm": 67.39820864076891, "learning_rate": 2.961302400966363e-07, "logps/chosen": -83.63417053222656, "logps/rejected": -89.096435546875, "loss": 0.6399, "losses/dpo": 0.7351549863815308, "losses/sft": 0.6155878901481628, "losses/total": 0.7351549863815308, "ref_logps/chosen": -81.05464172363281, "ref_logps/rejected": -84.85009002685547, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2579520642757416, "rewards/margins": 0.16668176651000977, "rewards/rejected": -0.42463386058807373, "step": 612 }, { "epoch": 0.4585749018141014, "grad_norm": 96.39037227118249, "learning_rate": 2.9553403408117844e-07, "logps/chosen": -107.87495422363281, "logps/rejected": -120.14285278320312, "loss": 0.6147, "losses/dpo": 0.4879547953605652, "losses/sft": 0.8243243098258972, "losses/total": 0.4879547953605652, "ref_logps/chosen": -104.35291290283203, "ref_logps/rejected": -113.98173522949219, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3522043228149414, "rewards/margins": 0.26390811800956726, "rewards/rejected": -0.616112470626831, "step": 613 }, { "epoch": 0.4593229848513185, "grad_norm": 63.095973897192664, "learning_rate": 2.949375600895827e-07, "logps/chosen": -91.67424011230469, "logps/rejected": -97.52037811279297, "loss": 0.6445, "losses/dpo": 0.5908305644989014, "losses/sft": 0.9467368125915527, "losses/total": 0.5908305644989014, "ref_logps/chosen": -87.92755126953125, "ref_logps/rejected": -91.97042846679688, "rewards/accuracies": 0.59375, "rewards/chosen": -0.37466809153556824, "rewards/margins": 0.18032705783843994, "rewards/rejected": -0.5549952387809753, "step": 614 }, { "epoch": 0.46007106788853563, "grad_norm": 56.01399633362238, "learning_rate": 2.943408216322077e-07, "logps/chosen": -108.11190795898438, "logps/rejected": -111.06094360351562, "loss": 0.6069, "losses/dpo": 0.5386508107185364, "losses/sft": 0.58930903673172, "losses/total": 0.5386508107185364, "ref_logps/chosen": -105.25895690917969, "ref_logps/rejected": -105.18547821044922, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2852940559387207, "rewards/margins": 0.30225181579589844, "rewards/rejected": -0.5875458717346191, "step": 615 }, { "epoch": 0.46081915092575276, "grad_norm": 80.42153873080547, "learning_rate": 2.9374382222096885e-07, "logps/chosen": -83.96395111083984, "logps/rejected": -97.21951293945312, "loss": 0.637, "losses/dpo": 0.6527711153030396, "losses/sft": 1.2284557819366455, "losses/total": 0.6527711153030396, "ref_logps/chosen": -80.24812316894531, "ref_logps/rejected": -91.61285400390625, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3715823292732239, "rewards/margins": 0.18908309936523438, "rewards/rejected": -0.5606654286384583, "step": 616 }, { "epoch": 0.4615672339629699, "grad_norm": 64.93808742400691, "learning_rate": 2.93146565369317e-07, "logps/chosen": -109.4530029296875, "logps/rejected": -124.95814514160156, "loss": 0.5482, "losses/dpo": 0.57236647605896, "losses/sft": 0.9468920230865479, "losses/total": 0.57236647605896, "ref_logps/chosen": -106.29130554199219, "ref_logps/rejected": -117.29688262939453, "rewards/accuracies": 0.625, "rewards/chosen": -0.3161698877811432, "rewards/margins": 0.44995760917663574, "rewards/rejected": -0.7661274671554565, "step": 617 }, { "epoch": 0.462315317000187, "grad_norm": 66.967899617943, "learning_rate": 2.9254905459221814e-07, "logps/chosen": -103.9727783203125, "logps/rejected": -121.77484893798828, "loss": 0.6245, "losses/dpo": 0.5996555089950562, "losses/sft": 0.8984191417694092, "losses/total": 0.5996555089950562, "ref_logps/chosen": -101.27645111083984, "ref_logps/rejected": -117.02986145019531, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2696329951286316, "rewards/margins": 0.2048652619123459, "rewards/rejected": -0.47449830174446106, "step": 618 }, { "epoch": 0.46306340003740415, "grad_norm": 73.79468913882505, "learning_rate": 2.919512934061329e-07, "logps/chosen": -83.04483795166016, "logps/rejected": -88.12823486328125, "loss": 0.6501, "losses/dpo": 0.43642163276672363, "losses/sft": 0.8354369401931763, "losses/total": 0.43642163276672363, "ref_logps/chosen": -79.55371856689453, "ref_logps/rejected": -82.95499420166016, "rewards/accuracies": 0.5625, "rewards/chosen": -0.34911102056503296, "rewards/margins": 0.16821226477622986, "rewards/rejected": -0.5173232555389404, "step": 619 }, { "epoch": 0.4638114830746213, "grad_norm": 63.840839216935485, "learning_rate": 2.9135328532899536e-07, "logps/chosen": -90.75587463378906, "logps/rejected": -100.31360626220703, "loss": 0.5935, "losses/dpo": 0.5329927206039429, "losses/sft": 1.072456955909729, "losses/total": 0.5329927206039429, "ref_logps/chosen": -88.20568084716797, "ref_logps/rejected": -94.59412384033203, "rewards/accuracies": 0.65625, "rewards/chosen": -0.25501954555511475, "rewards/margins": 0.31692925095558167, "rewards/rejected": -0.571948766708374, "step": 620 }, { "epoch": 0.4645595661118384, "grad_norm": 63.90669034698916, "learning_rate": 2.9075503388019267e-07, "logps/chosen": -73.2017822265625, "logps/rejected": -81.55227661132812, "loss": 0.5888, "losses/dpo": 0.5857675671577454, "losses/sft": 0.7977795600891113, "losses/total": 0.5857675671577454, "ref_logps/chosen": -71.20689392089844, "ref_logps/rejected": -76.33358001708984, "rewards/accuracies": 0.65625, "rewards/chosen": -0.19948861002922058, "rewards/margins": 0.32238221168518066, "rewards/rejected": -0.5218708515167236, "step": 621 }, { "epoch": 0.46530764914905554, "grad_norm": 54.310185741991255, "learning_rate": 2.9015654258054426e-07, "logps/chosen": -85.47588348388672, "logps/rejected": -87.39404296875, "loss": 0.6299, "losses/dpo": 0.7449665069580078, "losses/sft": 0.7726209163665771, "losses/total": 0.7449665069580078, "ref_logps/chosen": -82.31523132324219, "ref_logps/rejected": -82.18983459472656, "rewards/accuracies": 0.6875, "rewards/chosen": -0.31606534123420715, "rewards/margins": 0.20435620844364166, "rewards/rejected": -0.5204215049743652, "step": 622 }, { "epoch": 0.46605573218627266, "grad_norm": 61.3448284752012, "learning_rate": 2.8955781495228126e-07, "logps/chosen": -108.13063049316406, "logps/rejected": -115.95240783691406, "loss": 0.6168, "losses/dpo": 0.8445759415626526, "losses/sft": 0.8298963904380798, "losses/total": 0.8445759415626526, "ref_logps/chosen": -105.2891845703125, "ref_logps/rejected": -110.56217956542969, "rewards/accuracies": 0.65625, "rewards/chosen": -0.28414440155029297, "rewards/margins": 0.2548787593841553, "rewards/rejected": -0.539023220539093, "step": 623 }, { "epoch": 0.4668038152234898, "grad_norm": 58.50298011557671, "learning_rate": 2.8895885451902546e-07, "logps/chosen": -84.53626251220703, "logps/rejected": -87.92662048339844, "loss": 0.7239, "losses/dpo": 0.7927569150924683, "losses/sft": 0.6557831764221191, "losses/total": 0.7927569150924683, "ref_logps/chosen": -80.91655731201172, "ref_logps/rejected": -84.25662994384766, "rewards/accuracies": 0.53125, "rewards/chosen": -0.36197036504745483, "rewards/margins": 0.0050286054611206055, "rewards/rejected": -0.36699897050857544, "step": 624 }, { "epoch": 0.4675518982607069, "grad_norm": 83.58018467437503, "learning_rate": 2.8835966480576875e-07, "logps/chosen": -95.48876190185547, "logps/rejected": -101.90271759033203, "loss": 0.6685, "losses/dpo": 0.704746663570404, "losses/sft": 0.7333493232727051, "losses/total": 0.704746663570404, "ref_logps/chosen": -91.4062271118164, "ref_logps/rejected": -96.44110107421875, "rewards/accuracies": 0.625, "rewards/chosen": -0.4082525968551636, "rewards/margins": 0.13790974020957947, "rewards/rejected": -0.5461623072624207, "step": 625 }, { "epoch": 0.46829998129792405, "grad_norm": 105.66362042854614, "learning_rate": 2.8776024933885244e-07, "logps/chosen": -79.38973236083984, "logps/rejected": -89.39334106445312, "loss": 0.651, "losses/dpo": 0.8554584980010986, "losses/sft": 0.7256470322608948, "losses/total": 0.8554584980010986, "ref_logps/chosen": -76.23359680175781, "ref_logps/rejected": -84.42933654785156, "rewards/accuracies": 0.75, "rewards/chosen": -0.31561318039894104, "rewards/margins": 0.18078818917274475, "rewards/rejected": -0.49640142917633057, "step": 626 }, { "epoch": 0.4690480643351412, "grad_norm": 104.61336127186574, "learning_rate": 2.871606116459465e-07, "logps/chosen": -87.3015365600586, "logps/rejected": -85.04820251464844, "loss": 0.6826, "losses/dpo": 0.622020959854126, "losses/sft": 0.7871265411376953, "losses/total": 0.622020959854126, "ref_logps/chosen": -84.03062438964844, "ref_logps/rejected": -80.83650207519531, "rewards/accuracies": 0.5, "rewards/chosen": -0.32709187269210815, "rewards/margins": 0.09407911449670792, "rewards/rejected": -0.42117100954055786, "step": 627 }, { "epoch": 0.4697961473723583, "grad_norm": 75.9480109330393, "learning_rate": 2.8656075525602876e-07, "logps/chosen": -82.5948486328125, "logps/rejected": -96.23271179199219, "loss": 0.582, "losses/dpo": 0.5040274262428284, "losses/sft": 1.381479263305664, "losses/total": 0.5040274262428284, "ref_logps/chosen": -80.4783935546875, "ref_logps/rejected": -91.20248413085938, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2116459161043167, "rewards/margins": 0.29137641191482544, "rewards/rejected": -0.5030223727226257, "step": 628 }, { "epoch": 0.47054423040957544, "grad_norm": 107.4382109737783, "learning_rate": 2.859606836993638e-07, "logps/chosen": -105.25887298583984, "logps/rejected": -102.4459228515625, "loss": 0.6604, "losses/dpo": 0.55781090259552, "losses/sft": 0.7668987512588501, "losses/total": 0.55781090259552, "ref_logps/chosen": -101.90347290039062, "ref_logps/rejected": -97.83383178710938, "rewards/accuracies": 0.625, "rewards/chosen": -0.3355400264263153, "rewards/margins": 0.12566983699798584, "rewards/rejected": -0.46120989322662354, "step": 629 }, { "epoch": 0.4712923134467926, "grad_norm": 85.35190253078721, "learning_rate": 2.85360400507483e-07, "logps/chosen": -100.90940856933594, "logps/rejected": -109.98139190673828, "loss": 0.6934, "losses/dpo": 0.6291580200195312, "losses/sft": 1.4324426651000977, "losses/total": 0.6291580200195312, "ref_logps/chosen": -97.3980941772461, "ref_logps/rejected": -105.80810546875, "rewards/accuracies": 0.53125, "rewards/chosen": -0.35113152861595154, "rewards/margins": 0.06619678437709808, "rewards/rejected": -0.4173282980918884, "step": 630 }, { "epoch": 0.47204039648400975, "grad_norm": 62.31897023240216, "learning_rate": 2.847599092131629e-07, "logps/chosen": -111.14581298828125, "logps/rejected": -122.2906494140625, "loss": 0.645, "losses/dpo": 0.5636265873908997, "losses/sft": 0.6017358303070068, "losses/total": 0.5636265873908997, "ref_logps/chosen": -107.1126708984375, "ref_logps/rejected": -116.34638214111328, "rewards/accuracies": 0.65625, "rewards/chosen": -0.40331438183784485, "rewards/margins": 0.19111253321170807, "rewards/rejected": -0.5944269299507141, "step": 631 }, { "epoch": 0.4727884795212269, "grad_norm": 72.68344049770214, "learning_rate": 2.841592133504047e-07, "logps/chosen": -111.66691589355469, "logps/rejected": -109.91522216796875, "loss": 0.6535, "losses/dpo": 0.6693626642227173, "losses/sft": 0.9479665756225586, "losses/total": 0.6693626642227173, "ref_logps/chosen": -108.19189453125, "ref_logps/rejected": -104.71543884277344, "rewards/accuracies": 0.625, "rewards/chosen": -0.3475017547607422, "rewards/margins": 0.1724775731563568, "rewards/rejected": -0.5199793577194214, "step": 632 }, { "epoch": 0.473536562558444, "grad_norm": 70.48722279875169, "learning_rate": 2.8355831645441387e-07, "logps/chosen": -89.65467834472656, "logps/rejected": -116.25346374511719, "loss": 0.5348, "losses/dpo": 0.5188152194023132, "losses/sft": 0.8509321212768555, "losses/total": 0.5188152194023132, "ref_logps/chosen": -87.34147644042969, "ref_logps/rejected": -108.99853515625, "rewards/accuracies": 0.75, "rewards/chosen": -0.23132039606571198, "rewards/margins": 0.4941716492176056, "rewards/rejected": -0.7254920601844788, "step": 633 }, { "epoch": 0.47428464559566114, "grad_norm": 69.20922984492414, "learning_rate": 2.829572220615787e-07, "logps/chosen": -79.7682113647461, "logps/rejected": -91.057373046875, "loss": 0.6504, "losses/dpo": 0.8790660500526428, "losses/sft": 0.677997350692749, "losses/total": 0.8790660500526428, "ref_logps/chosen": -77.14723205566406, "ref_logps/rejected": -87.10033416748047, "rewards/accuracies": 0.6875, "rewards/chosen": -0.26209691166877747, "rewards/margins": 0.13360729813575745, "rewards/rejected": -0.3957042396068573, "step": 634 }, { "epoch": 0.47503272863287826, "grad_norm": 70.01423400845641, "learning_rate": 2.823559337094499e-07, "logps/chosen": -100.03290557861328, "logps/rejected": -115.72378540039062, "loss": 0.637, "losses/dpo": 0.6210981011390686, "losses/sft": 0.5253406167030334, "losses/total": 0.6210981011390686, "ref_logps/chosen": -95.4913330078125, "ref_logps/rejected": -109.08293914794922, "rewards/accuracies": 0.625, "rewards/chosen": -0.4541570246219635, "rewards/margins": 0.20992791652679443, "rewards/rejected": -0.6640849113464355, "step": 635 }, { "epoch": 0.4757808116700954, "grad_norm": 76.98909401139194, "learning_rate": 2.8175445493671966e-07, "logps/chosen": -96.89875793457031, "logps/rejected": -102.63162994384766, "loss": 0.6509, "losses/dpo": 0.524368405342102, "losses/sft": 0.5325725078582764, "losses/total": 0.524368405342102, "ref_logps/chosen": -93.36029052734375, "ref_logps/rejected": -97.38685607910156, "rewards/accuracies": 0.6875, "rewards/chosen": -0.35384610295295715, "rewards/margins": 0.1706310212612152, "rewards/rejected": -0.5244771242141724, "step": 636 }, { "epoch": 0.4765288947073125, "grad_norm": 73.69041541935093, "learning_rate": 2.8115278928320087e-07, "logps/chosen": -112.60880279541016, "logps/rejected": -126.40345764160156, "loss": 0.6292, "losses/dpo": 0.5563470125198364, "losses/sft": 0.9829625487327576, "losses/total": 0.5563470125198364, "ref_logps/chosen": -109.28243255615234, "ref_logps/rejected": -119.9560546875, "rewards/accuracies": 0.625, "rewards/chosen": -0.3326367735862732, "rewards/margins": 0.3121032416820526, "rewards/rejected": -0.6447399854660034, "step": 637 }, { "epoch": 0.47727697774452965, "grad_norm": 104.62322039440873, "learning_rate": 2.8055094028980616e-07, "logps/chosen": -112.68001556396484, "logps/rejected": -118.03590393066406, "loss": 0.5991, "losses/dpo": 0.5715678930282593, "losses/sft": 1.4588537216186523, "losses/total": 0.5715678930282593, "ref_logps/chosen": -110.20771026611328, "ref_logps/rejected": -112.77491760253906, "rewards/accuracies": 0.625, "rewards/chosen": -0.24723094701766968, "rewards/margins": 0.2788682281970978, "rewards/rejected": -0.5260992050170898, "step": 638 }, { "epoch": 0.4780250607817468, "grad_norm": 66.86159671979887, "learning_rate": 2.7994891149852705e-07, "logps/chosen": -87.70005798339844, "logps/rejected": -100.07765197753906, "loss": 0.6435, "losses/dpo": 0.4684246778488159, "losses/sft": 0.37634074687957764, "losses/total": 0.4684246778488159, "ref_logps/chosen": -83.86164855957031, "ref_logps/rejected": -94.23680114746094, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3838408589363098, "rewards/margins": 0.20024389028549194, "rewards/rejected": -0.5840847492218018, "step": 639 }, { "epoch": 0.4787731438189639, "grad_norm": 59.5173084023962, "learning_rate": 2.793467064524136e-07, "logps/chosen": -89.32329559326172, "logps/rejected": -99.15227508544922, "loss": 0.5932, "losses/dpo": 0.5397840142250061, "losses/sft": 1.1670854091644287, "losses/total": 0.5397840142250061, "ref_logps/chosen": -86.52674102783203, "ref_logps/rejected": -93.6219711303711, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2796555161476135, "rewards/margins": 0.2733749747276306, "rewards/rejected": -0.5530304908752441, "step": 640 }, { "epoch": 0.47952122685618104, "grad_norm": 67.13503848122991, "learning_rate": 2.7874432869555275e-07, "logps/chosen": -91.361572265625, "logps/rejected": -89.38398742675781, "loss": 0.6604, "losses/dpo": 0.7017619609832764, "losses/sft": 1.1407450437545776, "losses/total": 0.7017619609832764, "ref_logps/chosen": -87.50657653808594, "ref_logps/rejected": -84.03380584716797, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3854999840259552, "rewards/margins": 0.14951710402965546, "rewards/rejected": -0.5350170731544495, "step": 641 }, { "epoch": 0.48026930989339817, "grad_norm": 103.67459436341412, "learning_rate": 2.781417817730481e-07, "logps/chosen": -98.46597290039062, "logps/rejected": -119.8273696899414, "loss": 0.6734, "losses/dpo": 0.7204196453094482, "losses/sft": 1.3199150562286377, "losses/total": 0.7204196453094482, "ref_logps/chosen": -93.54175567626953, "ref_logps/rejected": -113.93548583984375, "rewards/accuracies": 0.53125, "rewards/chosen": -0.49242323637008667, "rewards/margins": 0.09676433354616165, "rewards/rejected": -0.5891875624656677, "step": 642 }, { "epoch": 0.4810173929306153, "grad_norm": 57.06441298220728, "learning_rate": 2.7753906923099864e-07, "logps/chosen": -92.265869140625, "logps/rejected": -105.91110229492188, "loss": 0.6061, "losses/dpo": 0.6111069321632385, "losses/sft": 1.1844030618667603, "losses/total": 0.6111069321632385, "ref_logps/chosen": -89.45256042480469, "ref_logps/rejected": -100.34557342529297, "rewards/accuracies": 0.65625, "rewards/chosen": -0.28132951259613037, "rewards/margins": 0.2752232849597931, "rewards/rejected": -0.5565527677536011, "step": 643 }, { "epoch": 0.4817654759678324, "grad_norm": 69.2383306703808, "learning_rate": 2.769361946164782e-07, "logps/chosen": -110.30276489257812, "logps/rejected": -109.08043670654297, "loss": 0.6247, "losses/dpo": 0.6310604810714722, "losses/sft": 0.27715224027633667, "losses/total": 0.6310604810714722, "ref_logps/chosen": -106.37232971191406, "ref_logps/rejected": -102.97792053222656, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3930424451828003, "rewards/margins": 0.21720948815345764, "rewards/rejected": -0.6102519035339355, "step": 644 }, { "epoch": 0.48251355900504955, "grad_norm": 70.84051869767379, "learning_rate": 2.763331614775146e-07, "logps/chosen": -103.56897735595703, "logps/rejected": -112.4870376586914, "loss": 0.6395, "losses/dpo": 0.5490050315856934, "losses/sft": 0.8892979621887207, "losses/total": 0.5490050315856934, "ref_logps/chosen": -100.91931915283203, "ref_logps/rejected": -108.16096496582031, "rewards/accuracies": 0.65625, "rewards/chosen": -0.26496621966362, "rewards/margins": 0.1676405370235443, "rewards/rejected": -0.4326067864894867, "step": 645 }, { "epoch": 0.4832616420422667, "grad_norm": 69.66661477277123, "learning_rate": 2.757299733630681e-07, "logps/chosen": -78.86598205566406, "logps/rejected": -93.98143005371094, "loss": 0.616, "losses/dpo": 0.49852198362350464, "losses/sft": 0.527184009552002, "losses/total": 0.49852198362350464, "ref_logps/chosen": -75.46049499511719, "ref_logps/rejected": -87.98657989501953, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3405489921569824, "rewards/margins": 0.25893649458885193, "rewards/rejected": -0.599485456943512, "step": 646 }, { "epoch": 0.4840097250794838, "grad_norm": 103.39403468472554, "learning_rate": 2.7512663382301153e-07, "logps/chosen": -109.42111206054688, "logps/rejected": -102.67023468017578, "loss": 0.7114, "losses/dpo": 0.5247329473495483, "losses/sft": 0.6042976975440979, "losses/total": 0.5247329473495483, "ref_logps/chosen": -104.51200866699219, "ref_logps/rejected": -97.28778076171875, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4909093379974365, "rewards/margins": 0.04733698442578316, "rewards/rejected": -0.5382462739944458, "step": 647 }, { "epoch": 0.48475780811670094, "grad_norm": 90.5803643424097, "learning_rate": 2.7452314640810866e-07, "logps/chosen": -85.43313598632812, "logps/rejected": -86.32101440429688, "loss": 0.7375, "losses/dpo": 0.7320919036865234, "losses/sft": 1.0480103492736816, "losses/total": 0.7320919036865234, "ref_logps/chosen": -81.82070922851562, "ref_logps/rejected": -83.10172271728516, "rewards/accuracies": 0.46875, "rewards/chosen": -0.3612425923347473, "rewards/margins": -0.03931254893541336, "rewards/rejected": -0.32193008065223694, "step": 648 }, { "epoch": 0.48550589115391807, "grad_norm": 57.50582230985903, "learning_rate": 2.739195146699935e-07, "logps/chosen": -85.98284912109375, "logps/rejected": -87.46293640136719, "loss": 0.6653, "losses/dpo": 0.54607093334198, "losses/sft": 1.4009051322937012, "losses/total": 0.54607093334198, "ref_logps/chosen": -82.33602905273438, "ref_logps/rejected": -82.79840850830078, "rewards/accuracies": 0.65625, "rewards/chosen": -0.36468201875686646, "rewards/margins": 0.10177057981491089, "rewards/rejected": -0.46645259857177734, "step": 649 }, { "epoch": 0.4862539741911352, "grad_norm": 57.006390062830725, "learning_rate": 2.7331574216114963e-07, "logps/chosen": -124.98674011230469, "logps/rejected": -134.55123901367188, "loss": 0.6878, "losses/dpo": 0.43806785345077515, "losses/sft": 1.0159426927566528, "losses/total": 0.43806785345077515, "ref_logps/chosen": -119.58325958251953, "ref_logps/rejected": -127.65470886230469, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5403473377227783, "rewards/margins": 0.14930513501167297, "rewards/rejected": -0.6896525621414185, "step": 650 }, { "epoch": 0.4870020572283523, "grad_norm": 67.96337493338913, "learning_rate": 2.727118324348888e-07, "logps/chosen": -77.57115936279297, "logps/rejected": -78.29637145996094, "loss": 0.5905, "losses/dpo": 0.6331087350845337, "losses/sft": 0.9575177431106567, "losses/total": 0.6331087350845337, "ref_logps/chosen": -74.26507568359375, "ref_logps/rejected": -71.84017181396484, "rewards/accuracies": 0.6875, "rewards/chosen": -0.33060845732688904, "rewards/margins": 0.31501132249832153, "rewards/rejected": -0.645619809627533, "step": 651 }, { "epoch": 0.48775014026556945, "grad_norm": 51.87012467052567, "learning_rate": 2.721077890453305e-07, "logps/chosen": -80.28841400146484, "logps/rejected": -94.88671875, "loss": 0.5744, "losses/dpo": 0.6046202182769775, "losses/sft": 0.8270553350448608, "losses/total": 0.6046202182769775, "ref_logps/chosen": -76.90766906738281, "ref_logps/rejected": -87.86087036132812, "rewards/accuracies": 0.6875, "rewards/chosen": -0.33807435631752014, "rewards/margins": 0.3645097613334656, "rewards/rejected": -0.7025840878486633, "step": 652 }, { "epoch": 0.4884982233027866, "grad_norm": 78.34932247957101, "learning_rate": 2.7150361554738075e-07, "logps/chosen": -82.85256958007812, "logps/rejected": -88.84716033935547, "loss": 0.6019, "losses/dpo": 0.5671592354774475, "losses/sft": 0.7192299365997314, "losses/total": 0.5671592354774475, "ref_logps/chosen": -80.37715911865234, "ref_logps/rejected": -83.43225860595703, "rewards/accuracies": 0.71875, "rewards/chosen": -0.24754050374031067, "rewards/margins": 0.29395002126693726, "rewards/rejected": -0.5414905548095703, "step": 653 }, { "epoch": 0.48924630634000377, "grad_norm": 81.99710729195296, "learning_rate": 2.7089931549671153e-07, "logps/chosen": -96.03602600097656, "logps/rejected": -105.89593505859375, "loss": 0.645, "losses/dpo": 0.6052750945091248, "losses/sft": 0.6013673543930054, "losses/total": 0.6052750945091248, "ref_logps/chosen": -92.5604248046875, "ref_logps/rejected": -100.4410400390625, "rewards/accuracies": 0.5625, "rewards/chosen": -0.34756067395210266, "rewards/margins": 0.19792842864990234, "rewards/rejected": -0.5454891324043274, "step": 654 }, { "epoch": 0.4899943893772209, "grad_norm": 67.82294008956393, "learning_rate": 2.7029489244973927e-07, "logps/chosen": -79.21932220458984, "logps/rejected": -95.53096771240234, "loss": 0.663, "losses/dpo": 0.8186168074607849, "losses/sft": 1.2383259534835815, "losses/total": 0.8186168074607849, "ref_logps/chosen": -77.07193756103516, "ref_logps/rejected": -91.2093276977539, "rewards/accuracies": 0.5, "rewards/chosen": -0.21473780274391174, "rewards/margins": 0.21742649376392365, "rewards/rejected": -0.4321643114089966, "step": 655 }, { "epoch": 0.490742472414438, "grad_norm": 87.00914247203141, "learning_rate": 2.6969034996360445e-07, "logps/chosen": -101.4977035522461, "logps/rejected": -114.26630401611328, "loss": 0.5863, "losses/dpo": 0.8372029066085815, "losses/sft": 1.6462410688400269, "losses/total": 0.8372029066085815, "ref_logps/chosen": -97.58805847167969, "ref_logps/rejected": -107.25834655761719, "rewards/accuracies": 0.71875, "rewards/chosen": -0.39096391201019287, "rewards/margins": 0.309831827878952, "rewards/rejected": -0.7007957100868225, "step": 656 }, { "epoch": 0.49149055545165515, "grad_norm": 102.10219949787303, "learning_rate": 2.6908569159615033e-07, "logps/chosen": -108.42770385742188, "logps/rejected": -118.88230895996094, "loss": 0.5273, "losses/dpo": 0.4941050410270691, "losses/sft": 0.7011599540710449, "losses/total": 0.4941050410270691, "ref_logps/chosen": -106.49918365478516, "ref_logps/rejected": -112.74472045898438, "rewards/accuracies": 0.875, "rewards/chosen": -0.1928524225950241, "rewards/margins": 0.42090705037117004, "rewards/rejected": -0.6137595176696777, "step": 657 }, { "epoch": 0.4922386384888723, "grad_norm": 74.67251292257737, "learning_rate": 2.6848092090590246e-07, "logps/chosen": -78.27899932861328, "logps/rejected": -80.03541564941406, "loss": 0.6682, "losses/dpo": 0.7566322684288025, "losses/sft": 0.5758327841758728, "losses/total": 0.7566322684288025, "ref_logps/chosen": -75.083251953125, "ref_logps/rejected": -75.47998809814453, "rewards/accuracies": 0.59375, "rewards/chosen": -0.31957513093948364, "rewards/margins": 0.13596704602241516, "rewards/rejected": -0.4555421769618988, "step": 658 }, { "epoch": 0.4929867215260894, "grad_norm": 68.69569482765282, "learning_rate": 2.678760414520471e-07, "logps/chosen": -76.64510345458984, "logps/rejected": -88.01742553710938, "loss": 0.6334, "losses/dpo": 0.6530583500862122, "losses/sft": 0.5798169374465942, "losses/total": 0.6530583500862122, "ref_logps/chosen": -72.2594985961914, "ref_logps/rejected": -81.7091293334961, "rewards/accuracies": 0.5, "rewards/chosen": -0.4385610520839691, "rewards/margins": 0.1922685205936432, "rewards/rejected": -0.6308295726776123, "step": 659 }, { "epoch": 0.49373480456330654, "grad_norm": 57.54379810286909, "learning_rate": 2.6727105679441066e-07, "logps/chosen": -88.1878662109375, "logps/rejected": -96.45850372314453, "loss": 0.696, "losses/dpo": 0.7304913401603699, "losses/sft": 0.6091845035552979, "losses/total": 0.7304913401603699, "ref_logps/chosen": -84.26338195800781, "ref_logps/rejected": -91.76119232177734, "rewards/accuracies": 0.5, "rewards/chosen": -0.39244887232780457, "rewards/margins": 0.07728195190429688, "rewards/rejected": -0.46973085403442383, "step": 660 }, { "epoch": 0.49448288760052367, "grad_norm": 60.90999509160986, "learning_rate": 2.666659704934388e-07, "logps/chosen": -99.45357513427734, "logps/rejected": -110.97360229492188, "loss": 0.5898, "losses/dpo": 0.4070979952812195, "losses/sft": 0.6833369135856628, "losses/total": 0.4070979952812195, "ref_logps/chosen": -96.50969696044922, "ref_logps/rejected": -105.02960968017578, "rewards/accuracies": 0.75, "rewards/chosen": -0.29438796639442444, "rewards/margins": 0.3000109791755676, "rewards/rejected": -0.5943989753723145, "step": 661 }, { "epoch": 0.4952309706377408, "grad_norm": 55.945458452332396, "learning_rate": 2.6606078611017536e-07, "logps/chosen": -91.210693359375, "logps/rejected": -92.87973022460938, "loss": 0.6651, "losses/dpo": 0.6303207874298096, "losses/sft": 0.45312851667404175, "losses/total": 0.6303207874298096, "ref_logps/chosen": -88.07672119140625, "ref_logps/rejected": -87.91793060302734, "rewards/accuracies": 0.46875, "rewards/chosen": -0.31339672207832336, "rewards/margins": 0.18278446793556213, "rewards/rejected": -0.4961811900138855, "step": 662 }, { "epoch": 0.4959790536749579, "grad_norm": 70.74766097229319, "learning_rate": 2.6545550720624146e-07, "logps/chosen": -116.49992370605469, "logps/rejected": -122.80101776123047, "loss": 0.6232, "losses/dpo": 0.668146550655365, "losses/sft": 0.9113481044769287, "losses/total": 0.668146550655365, "ref_logps/chosen": -113.33161163330078, "ref_logps/rejected": -117.3842544555664, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3168312907218933, "rewards/margins": 0.2248448133468628, "rewards/rejected": -0.5416761040687561, "step": 663 }, { "epoch": 0.49672713671217505, "grad_norm": 55.107423397904455, "learning_rate": 2.648501373438142e-07, "logps/chosen": -67.83524322509766, "logps/rejected": -83.55148315429688, "loss": 0.6244, "losses/dpo": 0.758920431137085, "losses/sft": 0.5348086357116699, "losses/total": 0.758920431137085, "ref_logps/chosen": -65.28556823730469, "ref_logps/rejected": -78.53582763671875, "rewards/accuracies": 0.625, "rewards/chosen": -0.25496697425842285, "rewards/margins": 0.2465991973876953, "rewards/rejected": -0.5015661716461182, "step": 664 }, { "epoch": 0.4974752197493922, "grad_norm": 58.96655924355811, "learning_rate": 2.6424468008560636e-07, "logps/chosen": -104.88054656982422, "logps/rejected": -116.67018127441406, "loss": 0.6029, "losses/dpo": 0.43534040451049805, "losses/sft": 0.657228410243988, "losses/total": 0.43534040451049805, "ref_logps/chosen": -101.39546966552734, "ref_logps/rejected": -110.2042236328125, "rewards/accuracies": 0.625, "rewards/chosen": -0.34850865602493286, "rewards/margins": 0.2980863153934479, "rewards/rejected": -0.6465950012207031, "step": 665 }, { "epoch": 0.4982233027866093, "grad_norm": 72.53355015530622, "learning_rate": 2.6363913899484485e-07, "logps/chosen": -97.74931335449219, "logps/rejected": -105.62248229980469, "loss": 0.6665, "losses/dpo": 0.6227249503135681, "losses/sft": 0.8071325421333313, "losses/total": 0.6227249503135681, "ref_logps/chosen": -94.58921813964844, "ref_logps/rejected": -100.99276733398438, "rewards/accuracies": 0.625, "rewards/chosen": -0.3160099685192108, "rewards/margins": 0.14696191251277924, "rewards/rejected": -0.46297186613082886, "step": 666 }, { "epoch": 0.49897138582382644, "grad_norm": 62.759934037965884, "learning_rate": 2.6303351763524993e-07, "logps/chosen": -89.4681396484375, "logps/rejected": -103.35067749023438, "loss": 0.6273, "losses/dpo": 0.7424946427345276, "losses/sft": 0.992725133895874, "losses/total": 0.7424946427345276, "ref_logps/chosen": -85.66310119628906, "ref_logps/rejected": -97.38096618652344, "rewards/accuracies": 0.71875, "rewards/chosen": -0.38050276041030884, "rewards/margins": 0.2164691686630249, "rewards/rejected": -0.596971869468689, "step": 667 }, { "epoch": 0.49971946886104357, "grad_norm": 92.87885603979775, "learning_rate": 2.6242781957101453e-07, "logps/chosen": -100.13358306884766, "logps/rejected": -103.92334747314453, "loss": 0.5859, "losses/dpo": 0.504057765007019, "losses/sft": 0.6822654604911804, "losses/total": 0.504057765007019, "ref_logps/chosen": -97.08492279052734, "ref_logps/rejected": -97.73219299316406, "rewards/accuracies": 0.71875, "rewards/chosen": -0.30486512184143066, "rewards/margins": 0.31425052881240845, "rewards/rejected": -0.6191156506538391, "step": 668 }, { "epoch": 0.5004675518982608, "grad_norm": 72.70457191101475, "learning_rate": 2.618220483667825e-07, "logps/chosen": -114.6578598022461, "logps/rejected": -123.06483459472656, "loss": 0.6612, "losses/dpo": 0.6481232643127441, "losses/sft": 0.6576825976371765, "losses/total": 0.6481232643127441, "ref_logps/chosen": -110.91487121582031, "ref_logps/rejected": -117.30381774902344, "rewards/accuracies": 0.65625, "rewards/chosen": -0.37429869174957275, "rewards/margins": 0.20180293917655945, "rewards/rejected": -0.5761016607284546, "step": 669 }, { "epoch": 0.5012156349354778, "grad_norm": 71.23053312222893, "learning_rate": 2.6121620758762875e-07, "logps/chosen": -95.67866516113281, "logps/rejected": -102.78934478759766, "loss": 0.6265, "losses/dpo": 0.5072833299636841, "losses/sft": 0.7827895283699036, "losses/total": 0.5072833299636841, "ref_logps/chosen": -92.1739730834961, "ref_logps/rejected": -97.3194808959961, "rewards/accuracies": 0.75, "rewards/chosen": -0.35046863555908203, "rewards/margins": 0.1965176910161972, "rewards/rejected": -0.5469863414764404, "step": 670 }, { "epoch": 0.5012156349354778, "eval_logps/chosen": -39.127403259277344, "eval_logps/rejected": -44.93039321899414, "eval_loss": 0.6225560903549194, "eval_losses/dpo": 0.6291965842247009, "eval_losses/sft": 0.3258923292160034, "eval_losses/total": 0.6291965842247009, "eval_ref_logps/chosen": -35.788204193115234, "eval_ref_logps/rejected": -39.235660552978516, "eval_rewards/accuracies": 0.6400862336158752, "eval_rewards/chosen": -0.3339199423789978, "eval_rewards/margins": 0.23555341362953186, "eval_rewards/rejected": -0.569473385810852, "eval_runtime": 38.1077, "eval_samples_per_second": 12.15, "eval_steps_per_second": 1.522, "step": 670 }, { "epoch": 0.501963717972695, "grad_norm": 72.88068472411825, "learning_rate": 2.606103007990371e-07, "logps/chosen": -96.36322021484375, "logps/rejected": -100.99651336669922, "loss": 0.6179, "losses/dpo": 0.6000449061393738, "losses/sft": 0.48722758889198303, "losses/total": 0.6000449061393738, "ref_logps/chosen": -93.91279602050781, "ref_logps/rejected": -96.50965881347656, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2450428009033203, "rewards/margins": 0.20364195108413696, "rewards/rejected": -0.4486847519874573, "step": 671 }, { "epoch": 0.5027118010099121, "grad_norm": 57.72381108622117, "learning_rate": 2.600043315668801e-07, "logps/chosen": -86.52278900146484, "logps/rejected": -97.85930633544922, "loss": 0.6435, "losses/dpo": 0.44575852155685425, "losses/sft": 0.7721992135047913, "losses/total": 0.44575852155685425, "ref_logps/chosen": -83.11795043945312, "ref_logps/rejected": -92.20844268798828, "rewards/accuracies": 0.625, "rewards/chosen": -0.340483158826828, "rewards/margins": 0.2246038317680359, "rewards/rejected": -0.5650869607925415, "step": 672 }, { "epoch": 0.5034598840471293, "grad_norm": 60.93248203059926, "learning_rate": 2.593983034573979e-07, "logps/chosen": -89.59574127197266, "logps/rejected": -89.29147338867188, "loss": 0.6615, "losses/dpo": 0.715979814529419, "losses/sft": 1.5634551048278809, "losses/total": 0.715979814529419, "ref_logps/chosen": -86.5543212890625, "ref_logps/rejected": -85.08010864257812, "rewards/accuracies": 0.71875, "rewards/chosen": -0.30414143204689026, "rewards/margins": 0.11699514091014862, "rewards/rejected": -0.4211365580558777, "step": 673 }, { "epoch": 0.5042079670843463, "grad_norm": 61.49579362517975, "learning_rate": 2.587922200371768e-07, "logps/chosen": -83.95984649658203, "logps/rejected": -98.97450256347656, "loss": 0.5574, "losses/dpo": 0.631995439529419, "losses/sft": 0.3142673969268799, "losses/total": 0.631995439529419, "ref_logps/chosen": -81.01990509033203, "ref_logps/rejected": -92.45726013183594, "rewards/accuracies": 0.8125, "rewards/chosen": -0.29399412870407104, "rewards/margins": 0.3577296733856201, "rewards/rejected": -0.6517238020896912, "step": 674 }, { "epoch": 0.5049560501215635, "grad_norm": 78.47926916958446, "learning_rate": 2.581860848731291e-07, "logps/chosen": -112.81756591796875, "logps/rejected": -122.94268798828125, "loss": 0.6129, "losses/dpo": 0.6472572088241577, "losses/sft": 0.8743788003921509, "losses/total": 0.6472572088241577, "ref_logps/chosen": -108.91326904296875, "ref_logps/rejected": -116.67937469482422, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3904297649860382, "rewards/margins": 0.2359020709991455, "rewards/rejected": -0.6263318061828613, "step": 675 }, { "epoch": 0.5057041331587806, "grad_norm": 71.29531217254583, "learning_rate": 2.5757990153247124e-07, "logps/chosen": -89.78809356689453, "logps/rejected": -94.52871704101562, "loss": 0.6983, "losses/dpo": 0.6018109321594238, "losses/sft": 1.6305218935012817, "losses/total": 0.6018109321594238, "ref_logps/chosen": -85.9620361328125, "ref_logps/rejected": -90.31598663330078, "rewards/accuracies": 0.4375, "rewards/chosen": -0.38260579109191895, "rewards/margins": 0.038666822016239166, "rewards/rejected": -0.4212726056575775, "step": 676 }, { "epoch": 0.5064522161959978, "grad_norm": 59.857669687537644, "learning_rate": 2.5697367358270323e-07, "logps/chosen": -97.23737335205078, "logps/rejected": -113.94136810302734, "loss": 0.5534, "losses/dpo": 0.5108659863471985, "losses/sft": 0.8576421141624451, "losses/total": 0.5108659863471985, "ref_logps/chosen": -94.78330993652344, "ref_logps/rejected": -108.00068664550781, "rewards/accuracies": 0.84375, "rewards/chosen": -0.24540656805038452, "rewards/margins": 0.3486618101596832, "rewards/rejected": -0.5940684080123901, "step": 677 }, { "epoch": 0.5072002992332149, "grad_norm": 70.01708887498528, "learning_rate": 2.5636740459158773e-07, "logps/chosen": -87.63397216796875, "logps/rejected": -92.24628448486328, "loss": 0.6769, "losses/dpo": 0.664787769317627, "losses/sft": 0.6448372602462769, "losses/total": 0.664787769317627, "ref_logps/chosen": -84.03873443603516, "ref_logps/rejected": -87.47845458984375, "rewards/accuracies": 0.625, "rewards/chosen": -0.35952410101890564, "rewards/margins": 0.1172589659690857, "rewards/rejected": -0.47678306698799133, "step": 678 }, { "epoch": 0.507948382270432, "grad_norm": 78.91144047652301, "learning_rate": 2.55761098127129e-07, "logps/chosen": -113.3352279663086, "logps/rejected": -130.6089630126953, "loss": 0.6385, "losses/dpo": 0.4341987073421478, "losses/sft": 1.2237602472305298, "losses/total": 0.4341987073421478, "ref_logps/chosen": -110.07793426513672, "ref_logps/rejected": -125.04549407958984, "rewards/accuracies": 0.625, "rewards/chosen": -0.32572996616363525, "rewards/margins": 0.23061636090278625, "rewards/rejected": -0.5563463568687439, "step": 679 }, { "epoch": 0.5086964653076491, "grad_norm": 70.43702796432616, "learning_rate": 2.551547577575517e-07, "logps/chosen": -82.06181335449219, "logps/rejected": -97.10562133789062, "loss": 0.6626, "losses/dpo": 0.5204773545265198, "losses/sft": 0.9677653908729553, "losses/total": 0.5204773545265198, "ref_logps/chosen": -78.19664764404297, "ref_logps/rejected": -91.50259399414062, "rewards/accuracies": 0.5625, "rewards/chosen": -0.386516273021698, "rewards/margins": 0.17378658056259155, "rewards/rejected": -0.5603028535842896, "step": 680 }, { "epoch": 0.5094445483448663, "grad_norm": 61.746686622590595, "learning_rate": 2.545483870512799e-07, "logps/chosen": -108.64253234863281, "logps/rejected": -116.38257598876953, "loss": 0.6271, "losses/dpo": 0.7151970863342285, "losses/sft": 1.092226505279541, "losses/total": 0.7151970863342285, "ref_logps/chosen": -105.2730484008789, "ref_logps/rejected": -110.80535125732422, "rewards/accuracies": 0.75, "rewards/chosen": -0.33694833517074585, "rewards/margins": 0.22077462077140808, "rewards/rejected": -0.5577229857444763, "step": 681 }, { "epoch": 0.5101926313820834, "grad_norm": 66.65538119480017, "learning_rate": 2.5394198957691655e-07, "logps/chosen": -94.84297180175781, "logps/rejected": -104.7146224975586, "loss": 0.6229, "losses/dpo": 0.747527003288269, "losses/sft": 0.5363923907279968, "losses/total": 0.747527003288269, "ref_logps/chosen": -91.75859069824219, "ref_logps/rejected": -99.07186889648438, "rewards/accuracies": 0.65625, "rewards/chosen": -0.30843865871429443, "rewards/margins": 0.25583750009536743, "rewards/rejected": -0.5642761588096619, "step": 682 }, { "epoch": 0.5109407144193006, "grad_norm": 54.6875925292186, "learning_rate": 2.533355689032218e-07, "logps/chosen": -84.684326171875, "logps/rejected": -92.05369567871094, "loss": 0.6339, "losses/dpo": 0.49828362464904785, "losses/sft": 0.5583574771881104, "losses/total": 0.49828362464904785, "ref_logps/chosen": -81.28089141845703, "ref_logps/rejected": -86.49082946777344, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3403439223766327, "rewards/margins": 0.2159425914287567, "rewards/rejected": -0.5562864542007446, "step": 683 }, { "epoch": 0.5116887974565176, "grad_norm": 90.7760485001012, "learning_rate": 2.5272912859909267e-07, "logps/chosen": -109.61640167236328, "logps/rejected": -105.40859985351562, "loss": 0.7092, "losses/dpo": 0.7424784898757935, "losses/sft": 1.0770177841186523, "losses/total": 0.7424784898757935, "ref_logps/chosen": -105.1974868774414, "ref_logps/rejected": -100.43496704101562, "rewards/accuracies": 0.5, "rewards/chosen": -0.44189149141311646, "rewards/margins": 0.055472902953624725, "rewards/rejected": -0.4973644018173218, "step": 684 }, { "epoch": 0.5124368804937348, "grad_norm": 49.86581851706285, "learning_rate": 2.521226722335414e-07, "logps/chosen": -66.9931869506836, "logps/rejected": -80.81024169921875, "loss": 0.6039, "losses/dpo": 0.49564677476882935, "losses/sft": 0.6097928285598755, "losses/total": 0.49564677476882935, "ref_logps/chosen": -64.9532699584961, "ref_logps/rejected": -75.63845825195312, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2039916217327118, "rewards/margins": 0.31318604946136475, "rewards/rejected": -0.5171776413917542, "step": 685 }, { "epoch": 0.5131849635309519, "grad_norm": 136.90087241987808, "learning_rate": 2.515162033756749e-07, "logps/chosen": -75.35491943359375, "logps/rejected": -83.08030700683594, "loss": 0.6241, "losses/dpo": 0.5287154912948608, "losses/sft": 0.2265559881925583, "losses/total": 0.5287154912948608, "ref_logps/chosen": -71.79147338867188, "ref_logps/rejected": -77.27041625976562, "rewards/accuracies": 0.71875, "rewards/chosen": -0.35634520649909973, "rewards/margins": 0.22464412450790405, "rewards/rejected": -0.5809893608093262, "step": 686 }, { "epoch": 0.5139330465681691, "grad_norm": 65.7129260651097, "learning_rate": 2.509097255946736e-07, "logps/chosen": -110.98150634765625, "logps/rejected": -115.03165435791016, "loss": 0.6681, "losses/dpo": 0.7933095693588257, "losses/sft": 0.5781087875366211, "losses/total": 0.7933095693588257, "ref_logps/chosen": -107.127685546875, "ref_logps/rejected": -109.62287902832031, "rewards/accuracies": 0.46875, "rewards/chosen": -0.3853823244571686, "rewards/margins": 0.1554957926273346, "rewards/rejected": -0.5408781170845032, "step": 687 }, { "epoch": 0.5146811296053861, "grad_norm": 78.35204138320849, "learning_rate": 2.5030324245977035e-07, "logps/chosen": -94.81838989257812, "logps/rejected": -95.92982482910156, "loss": 0.6571, "losses/dpo": 0.4613385796546936, "losses/sft": 0.805184006690979, "losses/total": 0.4613385796546936, "ref_logps/chosen": -90.8537826538086, "ref_logps/rejected": -90.43589782714844, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3964606523513794, "rewards/margins": 0.15293201804161072, "rewards/rejected": -0.5493927001953125, "step": 688 }, { "epoch": 0.5154292126426033, "grad_norm": 68.38346843825452, "learning_rate": 2.4969675754022963e-07, "logps/chosen": -88.98490905761719, "logps/rejected": -96.13491821289062, "loss": 0.6398, "losses/dpo": 0.5922999382019043, "losses/sft": 0.7085567116737366, "losses/total": 0.5922999382019043, "ref_logps/chosen": -85.92530822753906, "ref_logps/rejected": -90.35888671875, "rewards/accuracies": 0.625, "rewards/chosen": -0.30596017837524414, "rewards/margins": 0.2716427147388458, "rewards/rejected": -0.5776028633117676, "step": 689 }, { "epoch": 0.5161772956798205, "grad_norm": 282.5364231631355, "learning_rate": 2.4909027440532644e-07, "logps/chosen": -88.92112731933594, "logps/rejected": -95.29736328125, "loss": 0.6054, "losses/dpo": 0.5793170928955078, "losses/sft": 0.8071298599243164, "losses/total": 0.5793170928955078, "ref_logps/chosen": -85.38200378417969, "ref_logps/rejected": -88.45188903808594, "rewards/accuracies": 0.75, "rewards/chosen": -0.35391300916671753, "rewards/margins": 0.33063483238220215, "rewards/rejected": -0.6845478415489197, "step": 690 }, { "epoch": 0.5169253787170376, "grad_norm": 63.23294825130813, "learning_rate": 2.4848379662432513e-07, "logps/chosen": -81.29460144042969, "logps/rejected": -85.32061767578125, "loss": 0.6329, "losses/dpo": 0.624758243560791, "losses/sft": 1.0950003862380981, "losses/total": 0.624758243560791, "ref_logps/chosen": -77.89892578125, "ref_logps/rejected": -80.0139389038086, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3395668864250183, "rewards/margins": 0.19110140204429626, "rewards/rejected": -0.5306682586669922, "step": 691 }, { "epoch": 0.5176734617542548, "grad_norm": 67.80453973273413, "learning_rate": 2.4787732776645863e-07, "logps/chosen": -109.1640853881836, "logps/rejected": -115.73435974121094, "loss": 0.5836, "losses/dpo": 0.5772618055343628, "losses/sft": 0.8778402805328369, "losses/total": 0.5772618055343628, "ref_logps/chosen": -105.86685180664062, "ref_logps/rejected": -109.239013671875, "rewards/accuracies": 0.71875, "rewards/chosen": -0.32972317934036255, "rewards/margins": 0.3198109269142151, "rewards/rejected": -0.6495341062545776, "step": 692 }, { "epoch": 0.5184215447914718, "grad_norm": 86.03629357754224, "learning_rate": 2.4727087140090736e-07, "logps/chosen": -94.00679016113281, "logps/rejected": -110.55882263183594, "loss": 0.5574, "losses/dpo": 0.6019448041915894, "losses/sft": 1.4256713390350342, "losses/total": 0.6019448041915894, "ref_logps/chosen": -90.89849090576172, "ref_logps/rejected": -104.05091857910156, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3108289837837219, "rewards/margins": 0.3399624824523926, "rewards/rejected": -0.6507914662361145, "step": 693 }, { "epoch": 0.519169627828689, "grad_norm": 76.02676447357733, "learning_rate": 2.466644310967781e-07, "logps/chosen": -91.47792053222656, "logps/rejected": -92.13813781738281, "loss": 0.6589, "losses/dpo": 0.5310184955596924, "losses/sft": 1.0966370105743408, "losses/total": 0.5310184955596924, "ref_logps/chosen": -87.2587890625, "ref_logps/rejected": -86.52764892578125, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4219134449958801, "rewards/margins": 0.13913552463054657, "rewards/rejected": -0.5610489845275879, "step": 694 }, { "epoch": 0.5199177108659061, "grad_norm": 68.75053685758961, "learning_rate": 2.460580104230835e-07, "logps/chosen": -84.37950134277344, "logps/rejected": -91.59773254394531, "loss": 0.7001, "losses/dpo": 0.6195641160011292, "losses/sft": 1.172484278678894, "losses/total": 0.6195641160011292, "ref_logps/chosen": -80.340087890625, "ref_logps/rejected": -86.96273040771484, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4039418697357178, "rewards/margins": 0.05955827236175537, "rewards/rejected": -0.4635002017021179, "step": 695 }, { "epoch": 0.5206657939031233, "grad_norm": 71.85238904810448, "learning_rate": 2.454516129487201e-07, "logps/chosen": -108.54081726074219, "logps/rejected": -105.09943389892578, "loss": 0.6327, "losses/dpo": 0.5990722179412842, "losses/sft": 1.5294528007507324, "losses/total": 0.5990722179412842, "ref_logps/chosen": -105.09336853027344, "ref_logps/rejected": -99.77605438232422, "rewards/accuracies": 0.65625, "rewards/chosen": -0.34474581480026245, "rewards/margins": 0.18759159743785858, "rewards/rejected": -0.5323374271392822, "step": 696 }, { "epoch": 0.5214138769403404, "grad_norm": 57.53921952546675, "learning_rate": 2.448452422424484e-07, "logps/chosen": -77.7223129272461, "logps/rejected": -97.03621673583984, "loss": 0.6221, "losses/dpo": 0.5358192920684814, "losses/sft": 1.1361356973648071, "losses/total": 0.5358192920684814, "ref_logps/chosen": -73.69986724853516, "ref_logps/rejected": -90.63489532470703, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4022449553012848, "rewards/margins": 0.23788690567016602, "rewards/rejected": -0.6401318311691284, "step": 697 }, { "epoch": 0.5221619599775575, "grad_norm": 61.11849989390787, "learning_rate": 2.4423890187287095e-07, "logps/chosen": -80.745361328125, "logps/rejected": -90.81849670410156, "loss": 0.6316, "losses/dpo": 0.6917493343353271, "losses/sft": 0.4991472065448761, "losses/total": 0.6917493343353271, "ref_logps/chosen": -77.48837280273438, "ref_logps/rejected": -85.57796478271484, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3256998658180237, "rewards/margins": 0.198353111743927, "rewards/rejected": -0.5240530371665955, "step": 698 }, { "epoch": 0.5229100430147746, "grad_norm": 66.62509369977764, "learning_rate": 2.436325954084122e-07, "logps/chosen": -85.08863067626953, "logps/rejected": -100.91558837890625, "loss": 0.598, "losses/dpo": 0.5166934132575989, "losses/sft": 0.836710512638092, "losses/total": 0.5166934132575989, "ref_logps/chosen": -81.29307556152344, "ref_logps/rejected": -94.26953887939453, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3795556128025055, "rewards/margins": 0.28504958748817444, "rewards/rejected": -0.6646052002906799, "step": 699 }, { "epoch": 0.5236581260519918, "grad_norm": 58.78128770485317, "learning_rate": 2.4302632641729675e-07, "logps/chosen": -92.14263916015625, "logps/rejected": -105.92510986328125, "loss": 0.6415, "losses/dpo": 0.6809870004653931, "losses/sft": 1.049342155456543, "losses/total": 0.6809870004653931, "ref_logps/chosen": -89.37870788574219, "ref_logps/rejected": -101.40763092041016, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2763928771018982, "rewards/margins": 0.17535445094108582, "rewards/rejected": -0.451747328042984, "step": 700 }, { "epoch": 0.5244062090892089, "grad_norm": 57.80399156799319, "learning_rate": 2.424200984675288e-07, "logps/chosen": -83.53791046142578, "logps/rejected": -87.5425033569336, "loss": 0.6146, "losses/dpo": 0.48310670256614685, "losses/sft": 1.006746530532837, "losses/total": 0.48310670256614685, "ref_logps/chosen": -80.137451171875, "ref_logps/rejected": -81.54119873046875, "rewards/accuracies": 0.625, "rewards/chosen": -0.3400461971759796, "rewards/margins": 0.26008379459381104, "rewards/rejected": -0.600130021572113, "step": 701 }, { "epoch": 0.5251542921264261, "grad_norm": 65.1435908692133, "learning_rate": 2.4181391512687096e-07, "logps/chosen": -90.80833435058594, "logps/rejected": -110.29483032226562, "loss": 0.5452, "losses/dpo": 0.3509484529495239, "losses/sft": 0.5017501711845398, "losses/total": 0.3509484529495239, "ref_logps/chosen": -87.31624603271484, "ref_logps/rejected": -102.33565521240234, "rewards/accuracies": 0.75, "rewards/chosen": -0.34920942783355713, "rewards/margins": 0.4467083513736725, "rewards/rejected": -0.795917809009552, "step": 702 }, { "epoch": 0.5259023751636431, "grad_norm": 115.5329242035023, "learning_rate": 2.412077799628231e-07, "logps/chosen": -89.86199951171875, "logps/rejected": -105.52149200439453, "loss": 0.5516, "losses/dpo": 0.5171655416488647, "losses/sft": 0.675730288028717, "losses/total": 0.5171655416488647, "ref_logps/chosen": -86.30116271972656, "ref_logps/rejected": -97.8641586303711, "rewards/accuracies": 0.8125, "rewards/chosen": -0.35608258843421936, "rewards/margins": 0.40965116024017334, "rewards/rejected": -0.7657338380813599, "step": 703 }, { "epoch": 0.5266504582008603, "grad_norm": 50.79868644082403, "learning_rate": 2.4060169654260214e-07, "logps/chosen": -77.38911437988281, "logps/rejected": -89.18460083007812, "loss": 0.587, "losses/dpo": 0.5247385501861572, "losses/sft": 0.5334351062774658, "losses/total": 0.5247385501861572, "ref_logps/chosen": -73.66905212402344, "ref_logps/rejected": -82.22248840332031, "rewards/accuracies": 0.65625, "rewards/chosen": -0.37200695276260376, "rewards/margins": 0.3242054283618927, "rewards/rejected": -0.6962123513221741, "step": 704 }, { "epoch": 0.5273985412380774, "grad_norm": 53.321453617167805, "learning_rate": 2.399956684331199e-07, "logps/chosen": -69.79513549804688, "logps/rejected": -87.66065979003906, "loss": 0.5903, "losses/dpo": 0.5406615734100342, "losses/sft": 0.7056463360786438, "losses/total": 0.5406615734100342, "ref_logps/chosen": -66.83075714111328, "ref_logps/rejected": -81.27027893066406, "rewards/accuracies": 0.75, "rewards/chosen": -0.29643839597702026, "rewards/margins": 0.34260010719299316, "rewards/rejected": -0.6390385627746582, "step": 705 }, { "epoch": 0.5281466242752946, "grad_norm": 79.82206273276564, "learning_rate": 2.3938969920096296e-07, "logps/chosen": -101.91224670410156, "logps/rejected": -111.30854797363281, "loss": 0.6283, "losses/dpo": 0.5982733368873596, "losses/sft": 0.775154173374176, "losses/total": 0.5982733368873596, "ref_logps/chosen": -97.85707092285156, "ref_logps/rejected": -104.5997085571289, "rewards/accuracies": 0.71875, "rewards/chosen": -0.40551674365997314, "rewards/margins": 0.2653675079345703, "rewards/rejected": -0.6708842515945435, "step": 706 }, { "epoch": 0.5288947073125116, "grad_norm": 54.56414825641577, "learning_rate": 2.3878379241237134e-07, "logps/chosen": -72.93849182128906, "logps/rejected": -93.69490814208984, "loss": 0.5227, "losses/dpo": 0.549423098564148, "losses/sft": 0.9043461680412292, "losses/total": 0.549423098564148, "ref_logps/chosen": -69.78181457519531, "ref_logps/rejected": -85.92265319824219, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3156678378582001, "rewards/margins": 0.46155813336372375, "rewards/rejected": -0.7772259712219238, "step": 707 }, { "epoch": 0.5296427903497288, "grad_norm": 78.46403213957088, "learning_rate": 2.381779516332174e-07, "logps/chosen": -103.76178741455078, "logps/rejected": -109.49440002441406, "loss": 0.6497, "losses/dpo": 0.5675169825553894, "losses/sft": 1.1026463508605957, "losses/total": 0.5675169825553894, "ref_logps/chosen": -99.96040344238281, "ref_logps/rejected": -104.37030029296875, "rewards/accuracies": 0.625, "rewards/chosen": -0.3801378607749939, "rewards/margins": 0.13227173686027527, "rewards/rejected": -0.5124095678329468, "step": 708 }, { "epoch": 0.5303908733869459, "grad_norm": 73.97603649613494, "learning_rate": 2.375721804289855e-07, "logps/chosen": -97.09569549560547, "logps/rejected": -101.75843811035156, "loss": 0.5748, "losses/dpo": 0.6392579674720764, "losses/sft": 1.3564532995224, "losses/total": 0.6392579674720764, "ref_logps/chosen": -92.99765014648438, "ref_logps/rejected": -93.54541015625, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4098036587238312, "rewards/margins": 0.41150015592575073, "rewards/rejected": -0.8213037848472595, "step": 709 }, { "epoch": 0.5311389564241631, "grad_norm": 70.56390711933089, "learning_rate": 2.3696648236475005e-07, "logps/chosen": -78.31350708007812, "logps/rejected": -84.2790756225586, "loss": 0.582, "losses/dpo": 0.4544145166873932, "losses/sft": 0.5516341924667358, "losses/total": 0.4544145166873932, "ref_logps/chosen": -76.09793090820312, "ref_logps/rejected": -77.94314575195312, "rewards/accuracies": 0.65625, "rewards/chosen": -0.22155706584453583, "rewards/margins": 0.4120369553565979, "rewards/rejected": -0.6335939764976501, "step": 710 }, { "epoch": 0.5318870394613802, "grad_norm": 89.52965329329237, "learning_rate": 2.3636086100515523e-07, "logps/chosen": -78.86627197265625, "logps/rejected": -85.2455062866211, "loss": 0.6363, "losses/dpo": 0.8554917573928833, "losses/sft": 1.335742712020874, "losses/total": 0.8554917573928833, "ref_logps/chosen": -74.85356903076172, "ref_logps/rejected": -79.48191833496094, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4012700915336609, "rewards/margins": 0.17508837580680847, "rewards/rejected": -0.576358437538147, "step": 711 }, { "epoch": 0.5326351224985973, "grad_norm": 70.42278649938102, "learning_rate": 2.3575531991439361e-07, "logps/chosen": -131.61952209472656, "logps/rejected": -126.86418151855469, "loss": 0.6887, "losses/dpo": 0.7068346738815308, "losses/sft": 0.9871075749397278, "losses/total": 0.7068346738815308, "ref_logps/chosen": -127.0519790649414, "ref_logps/rejected": -121.030517578125, "rewards/accuracies": 0.53125, "rewards/chosen": -0.45675408840179443, "rewards/margins": 0.12661129236221313, "rewards/rejected": -0.5833654403686523, "step": 712 }, { "epoch": 0.5333832055358145, "grad_norm": 80.75194214177124, "learning_rate": 2.3514986265618577e-07, "logps/chosen": -91.47715759277344, "logps/rejected": -89.07122039794922, "loss": 0.6505, "losses/dpo": 0.5775810480117798, "losses/sft": 0.30990585684776306, "losses/total": 0.5775810480117798, "ref_logps/chosen": -87.92662048339844, "ref_logps/rejected": -83.57295227050781, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3550543487071991, "rewards/margins": 0.19477222859859467, "rewards/rejected": -0.5498265624046326, "step": 713 }, { "epoch": 0.5341312885730316, "grad_norm": 52.406835217242254, "learning_rate": 2.345444927937586e-07, "logps/chosen": -86.93862915039062, "logps/rejected": -97.16441345214844, "loss": 0.5917, "losses/dpo": 0.7890301942825317, "losses/sft": 0.8277908563613892, "losses/total": 0.7890301942825317, "ref_logps/chosen": -82.89651489257812, "ref_logps/rejected": -89.84542846679688, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4042123556137085, "rewards/margins": 0.3276847004890442, "rewards/rejected": -0.7318970561027527, "step": 714 }, { "epoch": 0.5348793716102488, "grad_norm": 59.93970688306222, "learning_rate": 2.3393921388982462e-07, "logps/chosen": -84.82798767089844, "logps/rejected": -95.9739990234375, "loss": 0.6249, "losses/dpo": 0.5692265033721924, "losses/sft": 0.7359537482261658, "losses/total": 0.5692265033721924, "ref_logps/chosen": -81.63194274902344, "ref_logps/rejected": -90.45500183105469, "rewards/accuracies": 0.8125, "rewards/chosen": -0.319603830575943, "rewards/margins": 0.2322961986064911, "rewards/rejected": -0.5519000291824341, "step": 715 }, { "epoch": 0.5356274546474659, "grad_norm": 73.92491179465178, "learning_rate": 2.3333402950656121e-07, "logps/chosen": -107.69517517089844, "logps/rejected": -123.24177551269531, "loss": 0.608, "losses/dpo": 0.5328341722488403, "losses/sft": 0.6241583228111267, "losses/total": 0.5328341722488403, "ref_logps/chosen": -104.22622680664062, "ref_logps/rejected": -117.18858337402344, "rewards/accuracies": 0.71875, "rewards/chosen": -0.34689462184906006, "rewards/margins": 0.25842517614364624, "rewards/rejected": -0.6053197979927063, "step": 716 }, { "epoch": 0.536375537684683, "grad_norm": 73.26707022639384, "learning_rate": 2.3272894320558935e-07, "logps/chosen": -86.48255920410156, "logps/rejected": -98.23998260498047, "loss": 0.6096, "losses/dpo": 0.5729621648788452, "losses/sft": 0.6427972316741943, "losses/total": 0.5729621648788452, "ref_logps/chosen": -82.32496643066406, "ref_logps/rejected": -91.10469055175781, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4157603979110718, "rewards/margins": 0.2977689206600189, "rewards/rejected": -0.7135293483734131, "step": 717 }, { "epoch": 0.5371236207219001, "grad_norm": 69.67269975679244, "learning_rate": 2.3212395854795292e-07, "logps/chosen": -103.0217056274414, "logps/rejected": -122.42633056640625, "loss": 0.6611, "losses/dpo": 0.8159101605415344, "losses/sft": 1.5428260564804077, "losses/total": 0.8159101605415344, "ref_logps/chosen": -98.18865966796875, "ref_logps/rejected": -116.20372772216797, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4833041727542877, "rewards/margins": 0.13895674049854279, "rewards/rejected": -0.6222609281539917, "step": 718 }, { "epoch": 0.5378717037591173, "grad_norm": 57.062565029479096, "learning_rate": 2.3151907909409754e-07, "logps/chosen": -85.75300598144531, "logps/rejected": -89.2967529296875, "loss": 0.6903, "losses/dpo": 0.5976507067680359, "losses/sft": 0.5204651355743408, "losses/total": 0.5976507067680359, "ref_logps/chosen": -81.48199462890625, "ref_logps/rejected": -84.35638427734375, "rewards/accuracies": 0.53125, "rewards/chosen": -0.42710080742836, "rewards/margins": 0.06693603098392487, "rewards/rejected": -0.49403685331344604, "step": 719 }, { "epoch": 0.5386197867963344, "grad_norm": 47.891846424351975, "learning_rate": 2.3091430840384962e-07, "logps/chosen": -79.11569213867188, "logps/rejected": -102.59222412109375, "loss": 0.542, "losses/dpo": 0.5298305153846741, "losses/sft": 1.2060401439666748, "losses/total": 0.5298305153846741, "ref_logps/chosen": -76.39502716064453, "ref_logps/rejected": -95.49906921386719, "rewards/accuracies": 0.75, "rewards/chosen": -0.27206623554229736, "rewards/margins": 0.4372493028640747, "rewards/rejected": -0.7093155980110168, "step": 720 }, { "epoch": 0.5393678698335516, "grad_norm": 68.05659093506043, "learning_rate": 2.3030965003639563e-07, "logps/chosen": -104.19457244873047, "logps/rejected": -115.70348358154297, "loss": 0.6103, "losses/dpo": 0.6236035823822021, "losses/sft": 1.2431223392486572, "losses/total": 0.6236035823822021, "ref_logps/chosen": -100.59078979492188, "ref_logps/rejected": -109.69145202636719, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3603784143924713, "rewards/margins": 0.24082490801811218, "rewards/rejected": -0.6012033224105835, "step": 721 }, { "epoch": 0.5401159528707686, "grad_norm": 74.14282243886903, "learning_rate": 2.297051075502607e-07, "logps/chosen": -109.65520477294922, "logps/rejected": -124.1748275756836, "loss": 0.5766, "losses/dpo": 0.6045805215835571, "losses/sft": 0.9817010760307312, "losses/total": 0.6045805215835571, "ref_logps/chosen": -105.50120544433594, "ref_logps/rejected": -116.75082397460938, "rewards/accuracies": 0.6875, "rewards/chosen": -0.41540050506591797, "rewards/margins": 0.32699957489967346, "rewards/rejected": -0.742400050163269, "step": 722 }, { "epoch": 0.5408640359079858, "grad_norm": 70.147961479906, "learning_rate": 2.2910068450328845e-07, "logps/chosen": -99.06129455566406, "logps/rejected": -99.15098571777344, "loss": 0.6599, "losses/dpo": 0.9183305501937866, "losses/sft": 0.9993204474449158, "losses/total": 0.9183305501937866, "ref_logps/chosen": -94.60346984863281, "ref_logps/rejected": -93.07196807861328, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4457828402519226, "rewards/margins": 0.16211912035942078, "rewards/rejected": -0.607901930809021, "step": 723 }, { "epoch": 0.5416121189452029, "grad_norm": 90.67841508659029, "learning_rate": 2.284963844526192e-07, "logps/chosen": -107.07978820800781, "logps/rejected": -105.13143920898438, "loss": 0.6185, "losses/dpo": 0.6027236580848694, "losses/sft": 0.9120121002197266, "losses/total": 0.6027236580848694, "ref_logps/chosen": -103.0954818725586, "ref_logps/rejected": -98.85100555419922, "rewards/accuracies": 0.65625, "rewards/chosen": -0.39842987060546875, "rewards/margins": 0.2296137809753418, "rewards/rejected": -0.6280436515808105, "step": 724 }, { "epoch": 0.5423602019824201, "grad_norm": 286.3495630616974, "learning_rate": 2.2789221095466956e-07, "logps/chosen": -80.08975982666016, "logps/rejected": -93.85139465332031, "loss": 0.5889, "losses/dpo": 0.7795612215995789, "losses/sft": 1.0739223957061768, "losses/total": 0.7795612215995789, "ref_logps/chosen": -77.70636749267578, "ref_logps/rejected": -88.33365631103516, "rewards/accuracies": 0.71875, "rewards/chosen": -0.23833975195884705, "rewards/margins": 0.31343382596969604, "rewards/rejected": -0.5517735481262207, "step": 725 }, { "epoch": 0.5431082850196371, "grad_norm": 109.03009225018002, "learning_rate": 2.2728816756511117e-07, "logps/chosen": -92.20936584472656, "logps/rejected": -90.03421020507812, "loss": 0.635, "losses/dpo": 0.6320152282714844, "losses/sft": 0.9238537549972534, "losses/total": 0.6320152282714844, "ref_logps/chosen": -88.59683990478516, "ref_logps/rejected": -84.49883270263672, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3612528443336487, "rewards/margins": 0.1922861784696579, "rewards/rejected": -0.553538978099823, "step": 726 }, { "epoch": 0.5438563680568543, "grad_norm": 93.2081375245067, "learning_rate": 2.2668425783885037e-07, "logps/chosen": -128.27464294433594, "logps/rejected": -138.5731201171875, "loss": 0.642, "losses/dpo": 0.5524704456329346, "losses/sft": 0.7206475138664246, "losses/total": 0.5524704456329346, "ref_logps/chosen": -124.0890121459961, "ref_logps/rejected": -131.81869506835938, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4185630977153778, "rewards/margins": 0.25688081979751587, "rewards/rejected": -0.6754439473152161, "step": 727 }, { "epoch": 0.5446044510940714, "grad_norm": 50.250323944565906, "learning_rate": 2.2608048533000645e-07, "logps/chosen": -89.5003662109375, "logps/rejected": -93.81486511230469, "loss": 0.5523, "losses/dpo": 0.6750078201293945, "losses/sft": 0.9348331093788147, "losses/total": 0.6750078201293945, "ref_logps/chosen": -86.96153259277344, "ref_logps/rejected": -87.403564453125, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2538829743862152, "rewards/margins": 0.38724735379219055, "rewards/rejected": -0.6411303877830505, "step": 728 }, { "epoch": 0.5453525341312886, "grad_norm": 61.59586862049005, "learning_rate": 2.2547685359189137e-07, "logps/chosen": -94.36911010742188, "logps/rejected": -105.14651489257812, "loss": 0.5912, "losses/dpo": 0.5413070917129517, "losses/sft": 0.9782027006149292, "losses/total": 0.5413070917129517, "ref_logps/chosen": -90.91891479492188, "ref_logps/rejected": -98.44121551513672, "rewards/accuracies": 0.625, "rewards/chosen": -0.34501904249191284, "rewards/margins": 0.32551148533821106, "rewards/rejected": -0.6705305576324463, "step": 729 }, { "epoch": 0.5461006171685057, "grad_norm": 70.95381931350806, "learning_rate": 2.2487336617698852e-07, "logps/chosen": -93.1278076171875, "logps/rejected": -108.15263366699219, "loss": 0.5673, "losses/dpo": 0.5824554562568665, "losses/sft": 0.7092291116714478, "losses/total": 0.5824554562568665, "ref_logps/chosen": -90.0436782836914, "ref_logps/rejected": -101.1503677368164, "rewards/accuracies": 0.65625, "rewards/chosen": -0.30841338634490967, "rewards/margins": 0.39181363582611084, "rewards/rejected": -0.7002270221710205, "step": 730 }, { "epoch": 0.5468487002057228, "grad_norm": 63.382647261750364, "learning_rate": 2.2427002663693187e-07, "logps/chosen": -100.72113037109375, "logps/rejected": -117.80908203125, "loss": 0.5495, "losses/dpo": 0.52482008934021, "losses/sft": 1.4687871932983398, "losses/total": 0.52482008934021, "ref_logps/chosen": -98.05984497070312, "ref_logps/rejected": -111.19479370117188, "rewards/accuracies": 0.71875, "rewards/chosen": -0.26612818241119385, "rewards/margins": 0.39530062675476074, "rewards/rejected": -0.6614288091659546, "step": 731 }, { "epoch": 0.5475967832429399, "grad_norm": 65.77079964670023, "learning_rate": 2.2366683852248545e-07, "logps/chosen": -96.79289245605469, "logps/rejected": -101.00920867919922, "loss": 0.7094, "losses/dpo": 0.711287260055542, "losses/sft": 0.21270640194416046, "losses/total": 0.711287260055542, "ref_logps/chosen": -91.46063232421875, "ref_logps/rejected": -95.33415985107422, "rewards/accuracies": 0.5, "rewards/chosen": -0.5332256555557251, "rewards/margins": 0.034278929233551025, "rewards/rejected": -0.5675046443939209, "step": 732 }, { "epoch": 0.5483448662801571, "grad_norm": 52.918007875201226, "learning_rate": 2.2306380538352177e-07, "logps/chosen": -106.34062194824219, "logps/rejected": -111.54022979736328, "loss": 0.5888, "losses/dpo": 0.5213849544525146, "losses/sft": 0.8818036913871765, "losses/total": 0.5213849544525146, "ref_logps/chosen": -104.25498962402344, "ref_logps/rejected": -106.55091094970703, "rewards/accuracies": 0.65625, "rewards/chosen": -0.20856335759162903, "rewards/margins": 0.2903677821159363, "rewards/rejected": -0.4989311695098877, "step": 733 }, { "epoch": 0.5490929493173742, "grad_norm": 115.54773254282652, "learning_rate": 2.2246093076900142e-07, "logps/chosen": -87.37506103515625, "logps/rejected": -85.2151107788086, "loss": 0.6118, "losses/dpo": 0.5641258955001831, "losses/sft": 0.5849325060844421, "losses/total": 0.5641258955001831, "ref_logps/chosen": -83.98192596435547, "ref_logps/rejected": -79.51646423339844, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3393133580684662, "rewards/margins": 0.2305520474910736, "rewards/rejected": -0.5698654055595398, "step": 734 }, { "epoch": 0.5498410323545914, "grad_norm": 88.85148064748687, "learning_rate": 2.21858218226952e-07, "logps/chosen": -92.69469451904297, "logps/rejected": -99.46385192871094, "loss": 0.6623, "losses/dpo": 0.6105802655220032, "losses/sft": 1.0474035739898682, "losses/total": 0.6105802655220032, "ref_logps/chosen": -88.09894561767578, "ref_logps/rejected": -92.84990692138672, "rewards/accuracies": 0.59375, "rewards/chosen": -0.45957517623901367, "rewards/margins": 0.20181919634342194, "rewards/rejected": -0.6613944172859192, "step": 735 }, { "epoch": 0.5505891153918084, "grad_norm": 62.86797621156276, "learning_rate": 2.2125567130444723e-07, "logps/chosen": -97.19766235351562, "logps/rejected": -110.39530181884766, "loss": 0.6213, "losses/dpo": 0.5825662612915039, "losses/sft": 0.6707823276519775, "losses/total": 0.5825662612915039, "ref_logps/chosen": -93.45549774169922, "ref_logps/rejected": -104.14497375488281, "rewards/accuracies": 0.59375, "rewards/chosen": -0.37421679496765137, "rewards/margins": 0.2508149743080139, "rewards/rejected": -0.6250317096710205, "step": 736 }, { "epoch": 0.5513371984290256, "grad_norm": 70.091100879938, "learning_rate": 2.2065329354758638e-07, "logps/chosen": -95.3030014038086, "logps/rejected": -105.50682830810547, "loss": 0.6836, "losses/dpo": 0.4437069594860077, "losses/sft": 0.476842999458313, "losses/total": 0.4437069594860077, "ref_logps/chosen": -91.55587005615234, "ref_logps/rejected": -100.77605438232422, "rewards/accuracies": 0.59375, "rewards/chosen": -0.374713271856308, "rewards/margins": 0.09836406260728836, "rewards/rejected": -0.47307732701301575, "step": 737 }, { "epoch": 0.5520852814662428, "grad_norm": 118.29340148348805, "learning_rate": 2.200510885014729e-07, "logps/chosen": -70.60629272460938, "logps/rejected": -81.63068389892578, "loss": 0.5626, "losses/dpo": 0.3705311119556427, "losses/sft": 0.6740475296974182, "losses/total": 0.3705311119556427, "ref_logps/chosen": -67.916259765625, "ref_logps/rejected": -74.76850891113281, "rewards/accuracies": 0.75, "rewards/chosen": -0.26900339126586914, "rewards/margins": 0.417214035987854, "rewards/rejected": -0.6862174272537231, "step": 738 }, { "epoch": 0.5528333645034599, "grad_norm": 52.13175065308176, "learning_rate": 2.1944905971019393e-07, "logps/chosen": -95.3685073852539, "logps/rejected": -106.41765594482422, "loss": 0.616, "losses/dpo": 0.5383334159851074, "losses/sft": 0.7453177571296692, "losses/total": 0.5383334159851074, "ref_logps/chosen": -91.6680679321289, "ref_logps/rejected": -100.1531982421875, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3700443506240845, "rewards/margins": 0.2564008831977844, "rewards/rejected": -0.6264451742172241, "step": 739 }, { "epoch": 0.5535814475406771, "grad_norm": 74.64533440943701, "learning_rate": 2.1884721071679909e-07, "logps/chosen": -99.03276062011719, "logps/rejected": -115.39696502685547, "loss": 0.5549, "losses/dpo": 0.3648090958595276, "losses/sft": 0.9212062358856201, "losses/total": 0.3648090958595276, "ref_logps/chosen": -96.08860778808594, "ref_logps/rejected": -108.12482452392578, "rewards/accuracies": 0.625, "rewards/chosen": -0.2944161295890808, "rewards/margins": 0.43279799818992615, "rewards/rejected": -0.7272140979766846, "step": 740 }, { "epoch": 0.5543295305778941, "grad_norm": 64.16346541837207, "learning_rate": 2.182455450632803e-07, "logps/chosen": -129.76925659179688, "logps/rejected": -134.19627380371094, "loss": 0.655, "losses/dpo": 0.5490491390228271, "losses/sft": 0.8984916806221008, "losses/total": 0.5490491390228271, "ref_logps/chosen": -125.25975036621094, "ref_logps/rejected": -127.20679473876953, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4509505033493042, "rewards/margins": 0.2479972541332245, "rewards/rejected": -0.6989477276802063, "step": 741 }, { "epoch": 0.5550776136151113, "grad_norm": 65.32180010275705, "learning_rate": 2.176440662905501e-07, "logps/chosen": -71.41459655761719, "logps/rejected": -78.16667175292969, "loss": 0.6399, "losses/dpo": 0.4967697858810425, "losses/sft": 0.23712058365345, "losses/total": 0.4967697858810425, "ref_logps/chosen": -68.50955963134766, "ref_logps/rejected": -73.17625427246094, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2905043959617615, "rewards/margins": 0.20853713154792786, "rewards/rejected": -0.49904152750968933, "step": 742 }, { "epoch": 0.5558256966523284, "grad_norm": 73.8471777689843, "learning_rate": 2.1704277793842135e-07, "logps/chosen": -97.61365509033203, "logps/rejected": -96.49516296386719, "loss": 0.6751, "losses/dpo": 0.6266957521438599, "losses/sft": 1.2450876235961914, "losses/total": 0.6266957521438599, "ref_logps/chosen": -93.4267807006836, "ref_logps/rejected": -91.34673309326172, "rewards/accuracies": 0.5625, "rewards/chosen": -0.41868749260902405, "rewards/margins": 0.09615587443113327, "rewards/rejected": -0.5148433446884155, "step": 743 }, { "epoch": 0.5565737796895456, "grad_norm": 106.83445904198675, "learning_rate": 2.164416835455862e-07, "logps/chosen": -95.68092346191406, "logps/rejected": -113.15487670898438, "loss": 0.6679, "losses/dpo": 0.5772762298583984, "losses/sft": 1.1061562299728394, "losses/total": 0.5772762298583984, "ref_logps/chosen": -90.56390380859375, "ref_logps/rejected": -106.50032043457031, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5117028951644897, "rewards/margins": 0.15375366806983948, "rewards/rejected": -0.6654565930366516, "step": 744 }, { "epoch": 0.5573218627267627, "grad_norm": 62.17135815478693, "learning_rate": 2.1584078664959526e-07, "logps/chosen": -86.84593200683594, "logps/rejected": -92.05323791503906, "loss": 0.6008, "losses/dpo": 0.8202056884765625, "losses/sft": 0.7336035370826721, "losses/total": 0.8202056884765625, "ref_logps/chosen": -83.567138671875, "ref_logps/rejected": -86.08370971679688, "rewards/accuracies": 0.65625, "rewards/chosen": -0.32787972688674927, "rewards/margins": 0.2690735459327698, "rewards/rejected": -0.596953272819519, "step": 745 }, { "epoch": 0.5580699457639798, "grad_norm": 63.79609412491561, "learning_rate": 2.1524009078683714e-07, "logps/chosen": -121.11738586425781, "logps/rejected": -129.61062622070312, "loss": 0.6321, "losses/dpo": 0.9354363679885864, "losses/sft": 0.8125040531158447, "losses/total": 0.9354363679885864, "ref_logps/chosen": -116.3980712890625, "ref_logps/rejected": -122.44114685058594, "rewards/accuracies": 0.625, "rewards/chosen": -0.4719323515892029, "rewards/margins": 0.24501438438892365, "rewards/rejected": -0.7169467806816101, "step": 746 }, { "epoch": 0.5588180288011969, "grad_norm": 90.54594848264108, "learning_rate": 2.1463959949251696e-07, "logps/chosen": -108.13744354248047, "logps/rejected": -113.57553100585938, "loss": 0.6964, "losses/dpo": 0.7254799604415894, "losses/sft": 1.2958778142929077, "losses/total": 0.7254799604415894, "ref_logps/chosen": -103.6173095703125, "ref_logps/rejected": -108.61776733398438, "rewards/accuracies": 0.59375, "rewards/chosen": -0.45201367139816284, "rewards/margins": 0.04376233369112015, "rewards/rejected": -0.49577596783638, "step": 747 }, { "epoch": 0.5595661118384141, "grad_norm": 61.81989360596539, "learning_rate": 2.1403931630063614e-07, "logps/chosen": -101.16410827636719, "logps/rejected": -108.0478515625, "loss": 0.6814, "losses/dpo": 0.5967063903808594, "losses/sft": 0.641335666179657, "losses/total": 0.5967063903808594, "ref_logps/chosen": -96.92571258544922, "ref_logps/rejected": -103.06588745117188, "rewards/accuracies": 0.5, "rewards/chosen": -0.42383870482444763, "rewards/margins": 0.07435711473226547, "rewards/rejected": -0.4981957972049713, "step": 748 }, { "epoch": 0.5603141948756312, "grad_norm": 63.33877039543062, "learning_rate": 2.134392447439713e-07, "logps/chosen": -69.73612976074219, "logps/rejected": -83.73373413085938, "loss": 0.6121, "losses/dpo": 0.7220572829246521, "losses/sft": 0.5745623111724854, "losses/total": 0.7220572829246521, "ref_logps/chosen": -66.77163696289062, "ref_logps/rejected": -78.28790283203125, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2964499592781067, "rewards/margins": 0.24813267588615417, "rewards/rejected": -0.5445826649665833, "step": 749 }, { "epoch": 0.5610622779128484, "grad_norm": 77.55865380451277, "learning_rate": 2.1283938835405341e-07, "logps/chosen": -111.56031799316406, "logps/rejected": -119.48397064208984, "loss": 0.6636, "losses/dpo": 0.9218258261680603, "losses/sft": 1.6219269037246704, "losses/total": 0.9218258261680603, "ref_logps/chosen": -106.97537231445312, "ref_logps/rejected": -113.69757080078125, "rewards/accuracies": 0.53125, "rewards/chosen": -0.458493709564209, "rewards/margins": 0.12014550715684891, "rewards/rejected": -0.5786392688751221, "step": 750 }, { "epoch": 0.5618103609500654, "grad_norm": 95.63441809855293, "learning_rate": 2.122397506611475e-07, "logps/chosen": -80.64571380615234, "logps/rejected": -80.87684631347656, "loss": 0.6231, "losses/dpo": 0.5769687294960022, "losses/sft": 0.9036279916763306, "losses/total": 0.5769687294960022, "ref_logps/chosen": -76.71385955810547, "ref_logps/rejected": -74.74583435058594, "rewards/accuracies": 0.65625, "rewards/chosen": -0.39318498969078064, "rewards/margins": 0.2199159413576126, "rewards/rejected": -0.6131009459495544, "step": 751 }, { "epoch": 0.5625584439872826, "grad_norm": 62.882134671704335, "learning_rate": 2.1164033519423128e-07, "logps/chosen": -80.54865264892578, "logps/rejected": -95.72635650634766, "loss": 0.6123, "losses/dpo": 0.6589013934135437, "losses/sft": 0.9523687362670898, "losses/total": 0.6589013934135437, "ref_logps/chosen": -76.36927795410156, "ref_logps/rejected": -88.98316955566406, "rewards/accuracies": 0.65625, "rewards/chosen": -0.41793733835220337, "rewards/margins": 0.25638124346733093, "rewards/rejected": -0.6743186116218567, "step": 752 }, { "epoch": 0.5633065270244997, "grad_norm": 52.19353677703568, "learning_rate": 2.110411454809746e-07, "logps/chosen": -107.68157958984375, "logps/rejected": -115.11443328857422, "loss": 0.5217, "losses/dpo": 0.3013450503349304, "losses/sft": 1.04268217086792, "losses/total": 0.3013450503349304, "ref_logps/chosen": -104.78569030761719, "ref_logps/rejected": -107.62266540527344, "rewards/accuracies": 0.84375, "rewards/chosen": -0.28958892822265625, "rewards/margins": 0.4595869779586792, "rewards/rejected": -0.7491759657859802, "step": 753 }, { "epoch": 0.5640546100617169, "grad_norm": 77.09853021331513, "learning_rate": 2.1044218504771867e-07, "logps/chosen": -93.32787322998047, "logps/rejected": -98.23147583007812, "loss": 0.6005, "losses/dpo": 0.6605114340782166, "losses/sft": 0.8071670532226562, "losses/total": 0.6605114340782166, "ref_logps/chosen": -89.79879760742188, "ref_logps/rejected": -92.0653076171875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.35290849208831787, "rewards/margins": 0.26370829343795776, "rewards/rejected": -0.6166167259216309, "step": 754 }, { "epoch": 0.5648026930989339, "grad_norm": 160.55890926802198, "learning_rate": 2.0984345741945566e-07, "logps/chosen": -97.63972473144531, "logps/rejected": -103.98592376708984, "loss": 0.6125, "losses/dpo": 0.50493323802948, "losses/sft": 0.9360910654067993, "losses/total": 0.50493323802948, "ref_logps/chosen": -93.8392105102539, "ref_logps/rejected": -97.74765014648438, "rewards/accuracies": 0.625, "rewards/chosen": -0.3800511956214905, "rewards/margins": 0.243775874376297, "rewards/rejected": -0.6238270998001099, "step": 755 }, { "epoch": 0.5655507761361511, "grad_norm": 70.02075642342521, "learning_rate": 2.0924496611980733e-07, "logps/chosen": -88.7099380493164, "logps/rejected": -104.07511138916016, "loss": 0.6291, "losses/dpo": 0.6287540793418884, "losses/sft": 0.27915292978286743, "losses/total": 0.6287540793418884, "ref_logps/chosen": -85.93697357177734, "ref_logps/rejected": -99.27788543701172, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2772962749004364, "rewards/margins": 0.20242594182491302, "rewards/rejected": -0.47972220182418823, "step": 756 }, { "epoch": 0.5662988591733682, "grad_norm": 72.34294203766387, "learning_rate": 2.0864671467100467e-07, "logps/chosen": -114.95625305175781, "logps/rejected": -125.27938842773438, "loss": 0.6146, "losses/dpo": 0.6062053442001343, "losses/sft": 1.0386062860488892, "losses/total": 0.6062053442001343, "ref_logps/chosen": -111.93860626220703, "ref_logps/rejected": -119.24504089355469, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3017652630805969, "rewards/margins": 0.30166926980018616, "rewards/rejected": -0.6034345626831055, "step": 757 }, { "epoch": 0.5670469422105854, "grad_norm": 110.15220803062006, "learning_rate": 2.0804870659386714e-07, "logps/chosen": -65.70130920410156, "logps/rejected": -80.42312622070312, "loss": 0.5974, "losses/dpo": 0.4914743900299072, "losses/sft": 0.863038182258606, "losses/total": 0.4914743900299072, "ref_logps/chosen": -62.14784240722656, "ref_logps/rejected": -73.34054565429688, "rewards/accuracies": 0.65625, "rewards/chosen": -0.355347216129303, "rewards/margins": 0.3529113531112671, "rewards/rejected": -0.7082585096359253, "step": 758 }, { "epoch": 0.5677950252478025, "grad_norm": 82.27136588356939, "learning_rate": 2.0745094540778179e-07, "logps/chosen": -92.39468383789062, "logps/rejected": -94.21600341796875, "loss": 0.6615, "losses/dpo": 0.7468023896217346, "losses/sft": 1.069977879524231, "losses/total": 0.7468023896217346, "ref_logps/chosen": -88.49598693847656, "ref_logps/rejected": -89.01250457763672, "rewards/accuracies": 0.65625, "rewards/chosen": -0.389869749546051, "rewards/margins": 0.13047988712787628, "rewards/rejected": -0.5203496217727661, "step": 759 }, { "epoch": 0.5685431082850196, "grad_norm": 69.71489470665915, "learning_rate": 2.0685343463068302e-07, "logps/chosen": -109.38277435302734, "logps/rejected": -125.15309143066406, "loss": 0.5664, "losses/dpo": 0.49979233741760254, "losses/sft": 1.2296056747436523, "losses/total": 0.49979233741760254, "ref_logps/chosen": -105.53657531738281, "ref_logps/rejected": -117.5227279663086, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3846200406551361, "rewards/margins": 0.37841618061065674, "rewards/rejected": -0.7630361914634705, "step": 760 }, { "epoch": 0.5692911913222368, "grad_norm": 55.48870232867773, "learning_rate": 2.0625617777903118e-07, "logps/chosen": -93.91212463378906, "logps/rejected": -97.11994934082031, "loss": 0.6827, "losses/dpo": 0.7457760572433472, "losses/sft": 1.1726499795913696, "losses/total": 0.7457760572433472, "ref_logps/chosen": -88.16736602783203, "ref_logps/rejected": -90.06395721435547, "rewards/accuracies": 0.625, "rewards/chosen": -0.5744746923446655, "rewards/margins": 0.13112470507621765, "rewards/rejected": -0.7055993676185608, "step": 761 }, { "epoch": 0.5700392743594539, "grad_norm": 74.27611693804273, "learning_rate": 2.0565917836779228e-07, "logps/chosen": -80.68683624267578, "logps/rejected": -77.5755386352539, "loss": 0.709, "losses/dpo": 0.6218888163566589, "losses/sft": 0.6372210383415222, "losses/total": 0.6218888163566589, "ref_logps/chosen": -75.7669677734375, "ref_logps/rejected": -72.226318359375, "rewards/accuracies": 0.5, "rewards/chosen": -0.4919864535331726, "rewards/margins": 0.04293542727828026, "rewards/rejected": -0.5349218845367432, "step": 762 }, { "epoch": 0.5707873573966711, "grad_norm": 53.41296132051636, "learning_rate": 2.0506243991041734e-07, "logps/chosen": -108.84615325927734, "logps/rejected": -120.52518463134766, "loss": 0.5858, "losses/dpo": 0.35987260937690735, "losses/sft": 1.0621179342269897, "losses/total": 0.35987260937690735, "ref_logps/chosen": -105.68606567382812, "ref_logps/rejected": -114.43099975585938, "rewards/accuracies": 0.625, "rewards/chosen": -0.31600871682167053, "rewards/margins": 0.2934095561504364, "rewards/rejected": -0.6094182729721069, "step": 763 }, { "epoch": 0.5715354404338882, "grad_norm": 54.12563159741502, "learning_rate": 2.044659659188215e-07, "logps/chosen": -92.49252319335938, "logps/rejected": -100.6617660522461, "loss": 0.5736, "losses/dpo": 0.7229492664337158, "losses/sft": 1.054240345954895, "losses/total": 0.7229492664337158, "ref_logps/chosen": -89.2479248046875, "ref_logps/rejected": -93.36345672607422, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3244601786136627, "rewards/margins": 0.4053717851638794, "rewards/rejected": -0.7298319935798645, "step": 764 }, { "epoch": 0.5722835234711053, "grad_norm": 66.21877174683124, "learning_rate": 2.0386975990336375e-07, "logps/chosen": -68.58554077148438, "logps/rejected": -73.67927551269531, "loss": 0.6026, "losses/dpo": 0.5939115285873413, "losses/sft": 0.9384081363677979, "losses/total": 0.5939115285873413, "ref_logps/chosen": -65.00981140136719, "ref_logps/rejected": -67.7872314453125, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3575727641582489, "rewards/margins": 0.23163148760795593, "rewards/rejected": -0.5892043113708496, "step": 765 }, { "epoch": 0.5730316065083224, "grad_norm": 53.15053515519223, "learning_rate": 2.0327382537282563e-07, "logps/chosen": -107.42680358886719, "logps/rejected": -111.0868148803711, "loss": 0.6274, "losses/dpo": 0.6981675624847412, "losses/sft": 0.826099693775177, "losses/total": 0.6981675624847412, "ref_logps/chosen": -104.00175476074219, "ref_logps/rejected": -105.03826141357422, "rewards/accuracies": 0.65625, "rewards/chosen": -0.34250596165657043, "rewards/margins": 0.26235005259513855, "rewards/rejected": -0.604856014251709, "step": 766 }, { "epoch": 0.5737796895455396, "grad_norm": 65.91716509926773, "learning_rate": 2.0267816583439109e-07, "logps/chosen": -79.35228729248047, "logps/rejected": -84.22964477539062, "loss": 0.5591, "losses/dpo": 0.5209208726882935, "losses/sft": 0.23242077231407166, "losses/total": 0.5209208726882935, "ref_logps/chosen": -75.86894226074219, "ref_logps/rejected": -75.96397399902344, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3483344316482544, "rewards/margins": 0.47823235392570496, "rewards/rejected": -0.826566755771637, "step": 767 }, { "epoch": 0.5745277725827567, "grad_norm": 64.63720912451096, "learning_rate": 2.020827847936257e-07, "logps/chosen": -107.87992858886719, "logps/rejected": -121.41731262207031, "loss": 0.6583, "losses/dpo": 0.8020015954971313, "losses/sft": 0.4757208824157715, "losses/total": 0.8020015954971313, "ref_logps/chosen": -103.17607879638672, "ref_logps/rejected": -115.58634185791016, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4703848958015442, "rewards/margins": 0.11271172761917114, "rewards/rejected": -0.5830966234207153, "step": 768 }, { "epoch": 0.5752758556199739, "grad_norm": 69.10220529484806, "learning_rate": 2.0148768575445614e-07, "logps/chosen": -81.27722930908203, "logps/rejected": -96.58660888671875, "loss": 0.5859, "losses/dpo": 0.6772838234901428, "losses/sft": 1.0419573783874512, "losses/total": 0.6772838234901428, "ref_logps/chosen": -77.56193542480469, "ref_logps/rejected": -88.92700958251953, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3715291917324066, "rewards/margins": 0.3944319784641266, "rewards/rejected": -0.7659611701965332, "step": 769 }, { "epoch": 0.5760239386571909, "grad_norm": 90.24632822107088, "learning_rate": 2.0089287221914935e-07, "logps/chosen": -107.05518341064453, "logps/rejected": -127.2039794921875, "loss": 0.5742, "losses/dpo": 0.49876904487609863, "losses/sft": 1.3318110704421997, "losses/total": 0.49876904487609863, "ref_logps/chosen": -103.68029022216797, "ref_logps/rejected": -120.25936126708984, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3374895453453064, "rewards/margins": 0.35697269439697266, "rewards/rejected": -0.6944621801376343, "step": 770 }, { "epoch": 0.5767720216944081, "grad_norm": 95.55033295082838, "learning_rate": 2.0029834768829196e-07, "logps/chosen": -76.22683715820312, "logps/rejected": -83.72325897216797, "loss": 0.669, "losses/dpo": 0.8706452250480652, "losses/sft": 1.086737871170044, "losses/total": 0.8706452250480652, "ref_logps/chosen": -72.45101928710938, "ref_logps/rejected": -78.70630645751953, "rewards/accuracies": 0.53125, "rewards/chosen": -0.37758171558380127, "rewards/margins": 0.12411396205425262, "rewards/rejected": -0.5016956925392151, "step": 771 }, { "epoch": 0.5775201047316252, "grad_norm": 82.7274859822942, "learning_rate": 1.9970411566076976e-07, "logps/chosen": -115.42694091796875, "logps/rejected": -117.89756774902344, "loss": 0.7603, "losses/dpo": 0.6025121212005615, "losses/sft": 1.1138159036636353, "losses/total": 0.6025121212005615, "ref_logps/chosen": -109.75875091552734, "ref_logps/rejected": -112.14763641357422, "rewards/accuracies": 0.53125, "rewards/chosen": -0.5668185353279114, "rewards/margins": 0.00817469134926796, "rewards/rejected": -0.5749931335449219, "step": 772 }, { "epoch": 0.5782681877688424, "grad_norm": 75.47652868774149, "learning_rate": 1.9911017963374725e-07, "logps/chosen": -108.31341552734375, "logps/rejected": -117.06892395019531, "loss": 0.5874, "losses/dpo": 0.5170814394950867, "losses/sft": 0.7010729312896729, "losses/total": 0.5170814394950867, "ref_logps/chosen": -104.51863098144531, "ref_logps/rejected": -109.97779846191406, "rewards/accuracies": 0.6875, "rewards/chosen": -0.37947767972946167, "rewards/margins": 0.32963448762893677, "rewards/rejected": -0.7091121673583984, "step": 773 }, { "epoch": 0.5790162708060594, "grad_norm": 58.45876065037082, "learning_rate": 1.9851654310264688e-07, "logps/chosen": -96.27960968017578, "logps/rejected": -96.58181762695312, "loss": 0.678, "losses/dpo": 0.6980656385421753, "losses/sft": 0.17926400899887085, "losses/total": 0.6980656385421753, "ref_logps/chosen": -90.53779602050781, "ref_logps/rejected": -89.50775146484375, "rewards/accuracies": 0.625, "rewards/chosen": -0.5741814374923706, "rewards/margins": 0.13322517275810242, "rewards/rejected": -0.7074066400527954, "step": 774 }, { "epoch": 0.5797643538432766, "grad_norm": 66.37260103649862, "learning_rate": 1.979232095611284e-07, "logps/chosen": -92.8978271484375, "logps/rejected": -95.70602416992188, "loss": 0.6463, "losses/dpo": 0.6969864964485168, "losses/sft": 1.2029526233673096, "losses/total": 0.6969864964485168, "ref_logps/chosen": -89.89938354492188, "ref_logps/rejected": -91.23823547363281, "rewards/accuracies": 0.625, "rewards/chosen": -0.299843966960907, "rewards/margins": 0.14693601429462433, "rewards/rejected": -0.4467799663543701, "step": 775 }, { "epoch": 0.5805124368804937, "grad_norm": 71.26160100889419, "learning_rate": 1.973301825010685e-07, "logps/chosen": -88.37908935546875, "logps/rejected": -101.75452423095703, "loss": 0.6417, "losses/dpo": 0.5130168199539185, "losses/sft": 0.9170936942100525, "losses/total": 0.5130168199539185, "ref_logps/chosen": -84.4129638671875, "ref_logps/rejected": -95.70857238769531, "rewards/accuracies": 0.6875, "rewards/chosen": -0.39661267399787903, "rewards/margins": 0.2079828381538391, "rewards/rejected": -0.6045954823493958, "step": 776 }, { "epoch": 0.5812605199177109, "grad_norm": 68.59142778849593, "learning_rate": 1.9673746541254003e-07, "logps/chosen": -120.58090209960938, "logps/rejected": -110.83314514160156, "loss": 0.7438, "losses/dpo": 0.8751140236854553, "losses/sft": 1.059327244758606, "losses/total": 0.8751140236854553, "ref_logps/chosen": -116.61100769042969, "ref_logps/rejected": -106.769287109375, "rewards/accuracies": 0.53125, "rewards/chosen": -0.39699018001556396, "rewards/margins": 0.009395703673362732, "rewards/rejected": -0.4063858389854431, "step": 777 }, { "epoch": 0.582008602954928, "grad_norm": 69.51957193780592, "learning_rate": 1.9614506178379182e-07, "logps/chosen": -100.4166259765625, "logps/rejected": -108.32567596435547, "loss": 0.6233, "losses/dpo": 0.5978978276252747, "losses/sft": 0.6159957051277161, "losses/total": 0.5978978276252747, "ref_logps/chosen": -96.73644256591797, "ref_logps/rejected": -102.20785522460938, "rewards/accuracies": 0.59375, "rewards/chosen": -0.36801809072494507, "rewards/margins": 0.24376346170902252, "rewards/rejected": -0.6117815971374512, "step": 778 }, { "epoch": 0.5827566859921451, "grad_norm": 57.80987853009261, "learning_rate": 1.9555297510122787e-07, "logps/chosen": -94.88877868652344, "logps/rejected": -111.8741683959961, "loss": 0.57, "losses/dpo": 0.5288835167884827, "losses/sft": 1.0264379978179932, "losses/total": 0.5288835167884827, "ref_logps/chosen": -91.55485534667969, "ref_logps/rejected": -104.73049926757812, "rewards/accuracies": 0.75, "rewards/chosen": -0.3333926796913147, "rewards/margins": 0.3809751868247986, "rewards/rejected": -0.7143678665161133, "step": 779 }, { "epoch": 0.5835047690293622, "grad_norm": 86.21196854825654, "learning_rate": 1.949612088493868e-07, "logps/chosen": -77.43599700927734, "logps/rejected": -87.36795806884766, "loss": 0.6598, "losses/dpo": 0.6476399898529053, "losses/sft": 0.39604806900024414, "losses/total": 0.6476399898529053, "ref_logps/chosen": -74.07025146484375, "ref_logps/rejected": -82.49320983886719, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3365745544433594, "rewards/margins": 0.1509002447128296, "rewards/rejected": -0.48747479915618896, "step": 780 }, { "epoch": 0.5842528520665794, "grad_norm": 52.179715006478794, "learning_rate": 1.9436976651092142e-07, "logps/chosen": -90.93185424804688, "logps/rejected": -106.17645263671875, "loss": 0.6048, "losses/dpo": 0.44085800647735596, "losses/sft": 1.073333501815796, "losses/total": 0.44085800647735596, "ref_logps/chosen": -87.79855346679688, "ref_logps/rejected": -100.4344482421875, "rewards/accuracies": 0.75, "rewards/chosen": -0.3133293092250824, "rewards/margins": 0.26087096333503723, "rewards/rejected": -0.5742002725601196, "step": 781 }, { "epoch": 0.5850009351037965, "grad_norm": 76.14917281134237, "learning_rate": 1.9377865156657841e-07, "logps/chosen": -97.59228515625, "logps/rejected": -110.4793472290039, "loss": 0.6364, "losses/dpo": 0.775155782699585, "losses/sft": 0.5104193687438965, "losses/total": 0.775155782699585, "ref_logps/chosen": -93.94828796386719, "ref_logps/rejected": -104.84809875488281, "rewards/accuracies": 0.59375, "rewards/chosen": -0.36440014839172363, "rewards/margins": 0.19872505962848663, "rewards/rejected": -0.5631252527236938, "step": 782 }, { "epoch": 0.5857490181410137, "grad_norm": 56.170670166676594, "learning_rate": 1.9318786749517752e-07, "logps/chosen": -95.55096435546875, "logps/rejected": -102.0835952758789, "loss": 0.6658, "losses/dpo": 0.6004563570022583, "losses/sft": 1.1498953104019165, "losses/total": 0.6004563570022583, "ref_logps/chosen": -92.13764953613281, "ref_logps/rejected": -97.39167022705078, "rewards/accuracies": 0.46875, "rewards/chosen": -0.34133195877075195, "rewards/margins": 0.12786129117012024, "rewards/rejected": -0.4691932201385498, "step": 783 }, { "epoch": 0.5864971011782307, "grad_norm": 57.94324370290473, "learning_rate": 1.9259741777359137e-07, "logps/chosen": -96.54763793945312, "logps/rejected": -109.80841827392578, "loss": 0.6708, "losses/dpo": 0.8395880460739136, "losses/sft": 0.2780463397502899, "losses/total": 0.8395880460739136, "ref_logps/chosen": -91.49314880371094, "ref_logps/rejected": -102.36381530761719, "rewards/accuracies": 0.53125, "rewards/chosen": -0.5054490566253662, "rewards/margins": 0.23901115357875824, "rewards/rejected": -0.7444602251052856, "step": 784 }, { "epoch": 0.5872451842154479, "grad_norm": 107.83455956384356, "learning_rate": 1.9200730587672474e-07, "logps/chosen": -87.02191162109375, "logps/rejected": -95.44355773925781, "loss": 0.6553, "losses/dpo": 0.5397589206695557, "losses/sft": 1.4075525999069214, "losses/total": 0.5397589206695557, "ref_logps/chosen": -82.25819396972656, "ref_logps/rejected": -88.82997131347656, "rewards/accuracies": 0.5, "rewards/chosen": -0.4763721227645874, "rewards/margins": 0.1849876493215561, "rewards/rejected": -0.6613597869873047, "step": 785 }, { "epoch": 0.5879932672526651, "grad_norm": 60.579679471651225, "learning_rate": 1.914175352774944e-07, "logps/chosen": -75.82745361328125, "logps/rejected": -83.18760681152344, "loss": 0.6573, "losses/dpo": 0.6396875381469727, "losses/sft": 1.1041486263275146, "losses/total": 0.6396875381469727, "ref_logps/chosen": -71.95909881591797, "ref_logps/rejected": -77.6596908569336, "rewards/accuracies": 0.625, "rewards/chosen": -0.3868345320224762, "rewards/margins": 0.1659567952156067, "rewards/rejected": -0.5527913570404053, "step": 786 }, { "epoch": 0.5887413502898822, "grad_norm": 59.447886613413004, "learning_rate": 1.908281094468084e-07, "logps/chosen": -78.57683563232422, "logps/rejected": -91.90240478515625, "loss": 0.6363, "losses/dpo": 0.6715387105941772, "losses/sft": 0.5975856781005859, "losses/total": 0.6715387105941772, "ref_logps/chosen": -75.04911804199219, "ref_logps/rejected": -86.35759735107422, "rewards/accuracies": 0.71875, "rewards/chosen": -0.35277244448661804, "rewards/margins": 0.20170852541923523, "rewards/rejected": -0.554481029510498, "step": 787 }, { "epoch": 0.5894894333270994, "grad_norm": 83.65566740520704, "learning_rate": 1.9023903185354583e-07, "logps/chosen": -94.48074340820312, "logps/rejected": -98.87528991699219, "loss": 0.6481, "losses/dpo": 0.8775651454925537, "losses/sft": 1.1517698764801025, "losses/total": 0.8775651454925537, "ref_logps/chosen": -89.25123596191406, "ref_logps/rejected": -91.93014526367188, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5229513645172119, "rewards/margins": 0.1715620458126068, "rewards/rejected": -0.6945134401321411, "step": 788 }, { "epoch": 0.5902375163643164, "grad_norm": 70.02156715572252, "learning_rate": 1.8965030596453635e-07, "logps/chosen": -89.76758575439453, "logps/rejected": -96.82927703857422, "loss": 0.6703, "losses/dpo": 0.6160288453102112, "losses/sft": 0.8414068222045898, "losses/total": 0.6160288453102112, "ref_logps/chosen": -85.98950958251953, "ref_logps/rejected": -91.94967651367188, "rewards/accuracies": 0.625, "rewards/chosen": -0.3778069019317627, "rewards/margins": 0.11015267670154572, "rewards/rejected": -0.4879595637321472, "step": 789 }, { "epoch": 0.5909855994015336, "grad_norm": 63.180890153589765, "learning_rate": 1.890619352445396e-07, "logps/chosen": -93.4820556640625, "logps/rejected": -101.33648681640625, "loss": 0.6085, "losses/dpo": 0.46952471137046814, "losses/sft": 0.6298993229866028, "losses/total": 0.46952471137046814, "ref_logps/chosen": -90.15571594238281, "ref_logps/rejected": -95.6712875366211, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3326336741447449, "rewards/margins": 0.23388665914535522, "rewards/rejected": -0.5665203332901001, "step": 790 }, { "epoch": 0.5917336824387507, "grad_norm": 67.09183643385761, "learning_rate": 1.884739231562253e-07, "logps/chosen": -128.0453643798828, "logps/rejected": -132.15682983398438, "loss": 0.6333, "losses/dpo": 0.6022886633872986, "losses/sft": 1.2249799966812134, "losses/total": 0.6022886633872986, "ref_logps/chosen": -124.14269256591797, "ref_logps/rejected": -126.03939819335938, "rewards/accuracies": 0.53125, "rewards/chosen": -0.390266478061676, "rewards/margins": 0.22147634625434875, "rewards/rejected": -0.6117428541183472, "step": 791 }, { "epoch": 0.5924817654759679, "grad_norm": 68.72677836027044, "learning_rate": 1.8788627316015225e-07, "logps/chosen": -83.35537719726562, "logps/rejected": -92.7883529663086, "loss": 0.5957, "losses/dpo": 0.7697493433952332, "losses/sft": 0.5797915458679199, "losses/total": 0.7697493433952332, "ref_logps/chosen": -78.71635437011719, "ref_logps/rejected": -84.5157470703125, "rewards/accuracies": 0.65625, "rewards/chosen": -0.46390241384506226, "rewards/margins": 0.3633584976196289, "rewards/rejected": -0.8272609710693359, "step": 792 }, { "epoch": 0.5932298485131849, "grad_norm": 80.20043295127464, "learning_rate": 1.8729898871474858e-07, "logps/chosen": -101.16033172607422, "logps/rejected": -103.7900390625, "loss": 0.6315, "losses/dpo": 0.5633218884468079, "losses/sft": 1.0717202425003052, "losses/total": 0.5633218884468079, "ref_logps/chosen": -97.90293884277344, "ref_logps/rejected": -98.85025024414062, "rewards/accuracies": 0.625, "rewards/chosen": -0.32574009895324707, "rewards/margins": 0.16823825240135193, "rewards/rejected": -0.4939783811569214, "step": 793 }, { "epoch": 0.5939779315504021, "grad_norm": 72.17029348632202, "learning_rate": 1.867120732762907e-07, "logps/chosen": -94.26893615722656, "logps/rejected": -106.57215881347656, "loss": 0.6139, "losses/dpo": 0.6024343967437744, "losses/sft": 1.4335709810256958, "losses/total": 0.6024343967437744, "ref_logps/chosen": -89.58304595947266, "ref_logps/rejected": -99.1763916015625, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4685889184474945, "rewards/margins": 0.2709865868091583, "rewards/rejected": -0.7395755052566528, "step": 794 }, { "epoch": 0.5947260145876192, "grad_norm": 54.05681473490699, "learning_rate": 1.8612553029888373e-07, "logps/chosen": -73.75337219238281, "logps/rejected": -97.30532836914062, "loss": 0.5683, "losses/dpo": 0.4088338017463684, "losses/sft": 0.6208564043045044, "losses/total": 0.4088338017463684, "ref_logps/chosen": -70.34674835205078, "ref_logps/rejected": -89.97283935546875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.34066256880760193, "rewards/margins": 0.3925855755805969, "rewards/rejected": -0.7332481741905212, "step": 795 }, { "epoch": 0.5954740976248364, "grad_norm": 59.202012857807595, "learning_rate": 1.8553936323444056e-07, "logps/chosen": -76.73450469970703, "logps/rejected": -85.76051330566406, "loss": 0.5999, "losses/dpo": 0.4565966725349426, "losses/sft": 0.8538500666618347, "losses/total": 0.4565966725349426, "ref_logps/chosen": -73.76720428466797, "ref_logps/rejected": -79.73815155029297, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2967304289340973, "rewards/margins": 0.30550602078437805, "rewards/rejected": -0.6022364497184753, "step": 796 }, { "epoch": 0.5962221806620535, "grad_norm": 81.66721900701258, "learning_rate": 1.8495357553266175e-07, "logps/chosen": -75.580810546875, "logps/rejected": -84.20205688476562, "loss": 0.6153, "losses/dpo": 0.45167428255081177, "losses/sft": 0.22966411709785461, "losses/total": 0.45167428255081177, "ref_logps/chosen": -71.53180694580078, "ref_logps/rejected": -77.66207885742188, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4049006700515747, "rewards/margins": 0.2490973323583603, "rewards/rejected": -0.6539980173110962, "step": 797 }, { "epoch": 0.5969702636992706, "grad_norm": 71.70277565301384, "learning_rate": 1.8436817064101543e-07, "logps/chosen": -107.25288391113281, "logps/rejected": -119.61359405517578, "loss": 0.6302, "losses/dpo": 0.6033729314804077, "losses/sft": 1.0677926540374756, "losses/total": 0.6033729314804077, "ref_logps/chosen": -102.08615112304688, "ref_logps/rejected": -111.72511291503906, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5166739225387573, "rewards/margins": 0.27217310667037964, "rewards/rejected": -0.788847029209137, "step": 798 }, { "epoch": 0.5977183467364877, "grad_norm": 55.03238130497306, "learning_rate": 1.8378315200471662e-07, "logps/chosen": -79.69654846191406, "logps/rejected": -86.79519653320312, "loss": 0.6477, "losses/dpo": 0.5214998722076416, "losses/sft": 1.2507280111312866, "losses/total": 0.5214998722076416, "ref_logps/chosen": -76.0411376953125, "ref_logps/rejected": -81.34851837158203, "rewards/accuracies": 0.46875, "rewards/chosen": -0.36554011702537537, "rewards/margins": 0.17912712693214417, "rewards/rejected": -0.5446673035621643, "step": 799 }, { "epoch": 0.5984664297737049, "grad_norm": 77.74409705320033, "learning_rate": 1.8319852306670727e-07, "logps/chosen": -103.3642578125, "logps/rejected": -116.17051696777344, "loss": 0.6346, "losses/dpo": 0.6713306903839111, "losses/sft": 0.5955090522766113, "losses/total": 0.6713306903839111, "ref_logps/chosen": -97.82376098632812, "ref_logps/rejected": -108.24122619628906, "rewards/accuracies": 0.59375, "rewards/chosen": -0.554048478603363, "rewards/margins": 0.2388797402381897, "rewards/rejected": -0.7929282188415527, "step": 800 }, { "epoch": 0.599214512810922, "grad_norm": 60.65866046343374, "learning_rate": 1.8261428726763584e-07, "logps/chosen": -108.50553131103516, "logps/rejected": -120.1315689086914, "loss": 0.5781, "losses/dpo": 0.5246008634567261, "losses/sft": 1.7560762166976929, "losses/total": 0.5246008634567261, "ref_logps/chosen": -105.18856811523438, "ref_logps/rejected": -113.51537322998047, "rewards/accuracies": 0.71875, "rewards/chosen": -0.331695556640625, "rewards/margins": 0.3299238085746765, "rewards/rejected": -0.6616193652153015, "step": 801 }, { "epoch": 0.5999625958481392, "grad_norm": 58.52679581198845, "learning_rate": 1.8203044804583717e-07, "logps/chosen": -67.74640655517578, "logps/rejected": -75.06791687011719, "loss": 0.6639, "losses/dpo": 0.49471643567085266, "losses/sft": 0.7362301349639893, "losses/total": 0.49471643567085266, "ref_logps/chosen": -64.40644836425781, "ref_logps/rejected": -70.1292953491211, "rewards/accuracies": 0.625, "rewards/chosen": -0.3339956998825073, "rewards/margins": 0.15986649692058563, "rewards/rejected": -0.4938621520996094, "step": 802 }, { "epoch": 0.6007106788853562, "grad_norm": 180.0575645800531, "learning_rate": 1.8144700883731205e-07, "logps/chosen": -121.63482666015625, "logps/rejected": -119.35884857177734, "loss": 0.6276, "losses/dpo": 0.6016679406166077, "losses/sft": 0.763638436794281, "losses/total": 0.6016679406166077, "ref_logps/chosen": -118.15718841552734, "ref_logps/rejected": -113.53517150878906, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3477637469768524, "rewards/margins": 0.23460371792316437, "rewards/rejected": -0.582367479801178, "step": 803 }, { "epoch": 0.6014587619225734, "grad_norm": 79.73964485217577, "learning_rate": 1.8086397307570723e-07, "logps/chosen": -81.3271713256836, "logps/rejected": -78.93311309814453, "loss": 0.7538, "losses/dpo": 0.8053181767463684, "losses/sft": 0.6104837656021118, "losses/total": 0.8053181767463684, "ref_logps/chosen": -75.62232971191406, "ref_logps/rejected": -73.66278076171875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5704835653305054, "rewards/margins": -0.04345016926527023, "rewards/rejected": -0.5270333886146545, "step": 804 }, { "epoch": 0.6014587619225734, "eval_logps/chosen": -39.372833251953125, "eval_logps/rejected": -45.46757888793945, "eval_loss": 0.6183610558509827, "eval_losses/dpo": 0.6199437975883484, "eval_losses/sft": 0.3277609050273895, "eval_losses/total": 0.6199437975883484, "eval_ref_logps/chosen": -35.788204193115234, "eval_ref_logps/rejected": -39.235660552978516, "eval_rewards/accuracies": 0.6357758641242981, "eval_rewards/chosen": -0.3584630787372589, "eval_rewards/margins": 0.2647288143634796, "eval_rewards/rejected": -0.6231919527053833, "eval_runtime": 38.0947, "eval_samples_per_second": 12.154, "eval_steps_per_second": 1.523, "step": 804 }, { "epoch": 0.6022068449597905, "grad_norm": 75.53496057868041, "learning_rate": 1.80281344192295e-07, "logps/chosen": -81.55149841308594, "logps/rejected": -83.89018249511719, "loss": 0.6284, "losses/dpo": 0.6287651658058167, "losses/sft": 0.9576244354248047, "losses/total": 0.6287651658058167, "ref_logps/chosen": -78.09294128417969, "ref_logps/rejected": -78.38331604003906, "rewards/accuracies": 0.5625, "rewards/chosen": -0.34585562348365784, "rewards/margins": 0.20483186841011047, "rewards/rejected": -0.5506874918937683, "step": 805 }, { "epoch": 0.6029549279970077, "grad_norm": 70.21321899741358, "learning_rate": 1.7969912561595315e-07, "logps/chosen": -65.81419372558594, "logps/rejected": -71.39042663574219, "loss": 0.6486, "losses/dpo": 0.8052828311920166, "losses/sft": 0.5955185890197754, "losses/total": 0.8052828311920166, "ref_logps/chosen": -62.95854949951172, "ref_logps/rejected": -66.85308837890625, "rewards/accuracies": 0.5625, "rewards/chosen": -0.28556469082832336, "rewards/margins": 0.16816894710063934, "rewards/rejected": -0.4537336230278015, "step": 806 }, { "epoch": 0.6037030110342247, "grad_norm": 62.615178691508824, "learning_rate": 1.7911732077314478e-07, "logps/chosen": -84.5088882446289, "logps/rejected": -101.94137573242188, "loss": 0.5608, "losses/dpo": 0.6334854364395142, "losses/sft": 0.6950664520263672, "losses/total": 0.6334854364395142, "ref_logps/chosen": -81.60356903076172, "ref_logps/rejected": -95.00187683105469, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2905316948890686, "rewards/margins": 0.403419554233551, "rewards/rejected": -0.6939512491226196, "step": 807 }, { "epoch": 0.6044510940714419, "grad_norm": 123.49008741250428, "learning_rate": 1.7853593308789799e-07, "logps/chosen": -111.71012878417969, "logps/rejected": -129.01846313476562, "loss": 0.6252, "losses/dpo": 0.7270880341529846, "losses/sft": 1.1436893939971924, "losses/total": 0.7270880341529846, "ref_logps/chosen": -107.37156677246094, "ref_logps/rejected": -122.3121566772461, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4338560104370117, "rewards/margins": 0.23677581548690796, "rewards/rejected": -0.6706318259239197, "step": 808 }, { "epoch": 0.6051991771086591, "grad_norm": 68.57746410942299, "learning_rate": 1.7795496598178598e-07, "logps/chosen": -89.79459381103516, "logps/rejected": -95.2530746459961, "loss": 0.6073, "losses/dpo": 0.5504270792007446, "losses/sft": 1.1245629787445068, "losses/total": 0.5504270792007446, "ref_logps/chosen": -86.15068054199219, "ref_logps/rejected": -89.37052154541016, "rewards/accuracies": 0.75, "rewards/chosen": -0.36439189314842224, "rewards/margins": 0.22386333346366882, "rewards/rejected": -0.5882552266120911, "step": 809 }, { "epoch": 0.6059472601458762, "grad_norm": 108.6004072824502, "learning_rate": 1.7737442287390658e-07, "logps/chosen": -80.16291046142578, "logps/rejected": -89.69806671142578, "loss": 0.572, "losses/dpo": 0.636359453201294, "losses/sft": 0.5387392044067383, "losses/total": 0.636359453201294, "ref_logps/chosen": -76.88577270507812, "ref_logps/rejected": -82.65753936767578, "rewards/accuracies": 0.65625, "rewards/chosen": -0.32771363854408264, "rewards/margins": 0.3763388693332672, "rewards/rejected": -0.7040524482727051, "step": 810 }, { "epoch": 0.6066953431830934, "grad_norm": 61.94270578442824, "learning_rate": 1.767943071808624e-07, "logps/chosen": -102.47097778320312, "logps/rejected": -103.94987487792969, "loss": 0.6709, "losses/dpo": 0.5812995433807373, "losses/sft": 0.518752932548523, "losses/total": 0.5812995433807373, "ref_logps/chosen": -97.57785034179688, "ref_logps/rejected": -97.31158447265625, "rewards/accuracies": 0.625, "rewards/chosen": -0.4893132448196411, "rewards/margins": 0.17451506853103638, "rewards/rejected": -0.6638283133506775, "step": 811 }, { "epoch": 0.6074434262203104, "grad_norm": 84.85892771485531, "learning_rate": 1.762146223167407e-07, "logps/chosen": -118.16815185546875, "logps/rejected": -115.39945220947266, "loss": 0.7208, "losses/dpo": 0.7017483711242676, "losses/sft": 1.3114286661148071, "losses/total": 0.7017483711242676, "ref_logps/chosen": -112.65155792236328, "ref_logps/rejected": -109.66409301757812, "rewards/accuracies": 0.625, "rewards/chosen": -0.5516600608825684, "rewards/margins": 0.021875184029340744, "rewards/rejected": -0.5735352039337158, "step": 812 }, { "epoch": 0.6081915092575276, "grad_norm": 82.73350777752376, "learning_rate": 1.7563537169309317e-07, "logps/chosen": -73.69676208496094, "logps/rejected": -98.30213928222656, "loss": 0.5579, "losses/dpo": 0.5864120721817017, "losses/sft": 0.4919869601726532, "losses/total": 0.5864120721817017, "ref_logps/chosen": -69.98674011230469, "ref_logps/rejected": -90.29547882080078, "rewards/accuracies": 0.75, "rewards/chosen": -0.3710029125213623, "rewards/margins": 0.42966270446777344, "rewards/rejected": -0.8006656765937805, "step": 813 }, { "epoch": 0.6089395922947447, "grad_norm": 166.97959750146268, "learning_rate": 1.7505655871891595e-07, "logps/chosen": -96.67410278320312, "logps/rejected": -113.57124328613281, "loss": 0.6934, "losses/dpo": 0.8046914339065552, "losses/sft": 0.715644896030426, "losses/total": 0.8046914339065552, "ref_logps/chosen": -91.25386810302734, "ref_logps/rejected": -106.64505004882812, "rewards/accuracies": 0.53125, "rewards/chosen": -0.5420226454734802, "rewards/margins": 0.15059754252433777, "rewards/rejected": -0.6926202774047852, "step": 814 }, { "epoch": 0.6096876753319619, "grad_norm": 77.92497051141773, "learning_rate": 1.7447818680062938e-07, "logps/chosen": -81.6982421875, "logps/rejected": -95.38542175292969, "loss": 0.6479, "losses/dpo": 0.483390748500824, "losses/sft": 0.6990163922309875, "losses/total": 0.483390748500824, "ref_logps/chosen": -76.9735336303711, "ref_logps/rejected": -88.07687377929688, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4724714756011963, "rewards/margins": 0.25838369131088257, "rewards/rejected": -0.7308552265167236, "step": 815 }, { "epoch": 0.610435758369179, "grad_norm": 82.1089819117464, "learning_rate": 1.7390025934205835e-07, "logps/chosen": -85.41120910644531, "logps/rejected": -96.96945190429688, "loss": 0.6812, "losses/dpo": 0.5822662115097046, "losses/sft": 0.517145574092865, "losses/total": 0.5822662115097046, "ref_logps/chosen": -80.58287048339844, "ref_logps/rejected": -90.82737731933594, "rewards/accuracies": 0.625, "rewards/chosen": -0.48283347487449646, "rewards/margins": 0.13137422502040863, "rewards/rejected": -0.6142076849937439, "step": 816 }, { "epoch": 0.6111838414063961, "grad_norm": 80.07516234039325, "learning_rate": 1.7332277974441182e-07, "logps/chosen": -106.01931762695312, "logps/rejected": -104.59400177001953, "loss": 0.6609, "losses/dpo": 0.6088835000991821, "losses/sft": 0.9316043257713318, "losses/total": 0.6088835000991821, "ref_logps/chosen": -102.00434875488281, "ref_logps/rejected": -99.30659484863281, "rewards/accuracies": 0.59375, "rewards/chosen": -0.40149661898612976, "rewards/margins": 0.12724335491657257, "rewards/rejected": -0.5287399888038635, "step": 817 }, { "epoch": 0.6119319244436132, "grad_norm": 72.06052350313857, "learning_rate": 1.7274575140626315e-07, "logps/chosen": -100.52535247802734, "logps/rejected": -108.72064208984375, "loss": 0.673, "losses/dpo": 0.7512806057929993, "losses/sft": 1.0774856805801392, "losses/total": 0.7512806057929993, "ref_logps/chosen": -95.16819763183594, "ref_logps/rejected": -101.86100006103516, "rewards/accuracies": 0.40625, "rewards/chosen": -0.5357153415679932, "rewards/margins": 0.15024907886981964, "rewards/rejected": -0.6859644055366516, "step": 818 }, { "epoch": 0.6126800074808304, "grad_norm": 62.0466654881341, "learning_rate": 1.721691777235299e-07, "logps/chosen": -106.51571655273438, "logps/rejected": -129.2591094970703, "loss": 0.535, "losses/dpo": 0.4566825032234192, "losses/sft": 0.23955190181732178, "losses/total": 0.4566825032234192, "ref_logps/chosen": -103.54396057128906, "ref_logps/rejected": -121.87126922607422, "rewards/accuracies": 0.84375, "rewards/chosen": -0.29717573523521423, "rewards/margins": 0.44160664081573486, "rewards/rejected": -0.7387824058532715, "step": 819 }, { "epoch": 0.6134280905180475, "grad_norm": 115.78089970215227, "learning_rate": 1.7159306208945375e-07, "logps/chosen": -92.68099975585938, "logps/rejected": -107.07107543945312, "loss": 0.5986, "losses/dpo": 0.4128184914588928, "losses/sft": 0.841388463973999, "losses/total": 0.4128184914588928, "ref_logps/chosen": -88.37820434570312, "ref_logps/rejected": -99.28573608398438, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4302789568901062, "rewards/margins": 0.34825599193573, "rewards/rejected": -0.778535008430481, "step": 820 }, { "epoch": 0.6141761735552647, "grad_norm": 60.10591704554135, "learning_rate": 1.7101740789458098e-07, "logps/chosen": -102.10384368896484, "logps/rejected": -99.80221557617188, "loss": 0.6397, "losses/dpo": 0.4501940608024597, "losses/sft": 0.6157944202423096, "losses/total": 0.4501940608024597, "ref_logps/chosen": -97.96077728271484, "ref_logps/rejected": -93.33872985839844, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4143068790435791, "rewards/margins": 0.232041135430336, "rewards/rejected": -0.6463480591773987, "step": 821 }, { "epoch": 0.6149242565924817, "grad_norm": 66.76980241365493, "learning_rate": 1.70442218526742e-07, "logps/chosen": -113.30827331542969, "logps/rejected": -112.96620178222656, "loss": 0.6805, "losses/dpo": 0.7164897918701172, "losses/sft": 1.3170849084854126, "losses/total": 0.7164897918701172, "ref_logps/chosen": -109.45071411132812, "ref_logps/rejected": -108.27215576171875, "rewards/accuracies": 0.5, "rewards/chosen": -0.38575488328933716, "rewards/margins": 0.08364902436733246, "rewards/rejected": -0.4694039225578308, "step": 822 }, { "epoch": 0.6156723396296989, "grad_norm": 92.7205995298723, "learning_rate": 1.6986749737103174e-07, "logps/chosen": -94.03459167480469, "logps/rejected": -97.99684143066406, "loss": 0.6807, "losses/dpo": 0.5866599082946777, "losses/sft": 1.541839599609375, "losses/total": 0.5866599082946777, "ref_logps/chosen": -89.51844787597656, "ref_logps/rejected": -91.93400573730469, "rewards/accuracies": 0.59375, "rewards/chosen": -0.45161473751068115, "rewards/margins": 0.15466901659965515, "rewards/rejected": -0.6062837839126587, "step": 823 }, { "epoch": 0.616420422666916, "grad_norm": 70.47059164072519, "learning_rate": 1.692932478097895e-07, "logps/chosen": -108.3052978515625, "logps/rejected": -111.16838073730469, "loss": 0.6582, "losses/dpo": 0.8987479209899902, "losses/sft": 0.6299847364425659, "losses/total": 0.8987479209899902, "ref_logps/chosen": -104.6656494140625, "ref_logps/rejected": -106.12389373779297, "rewards/accuracies": 0.59375, "rewards/chosen": -0.36396530270576477, "rewards/margins": 0.14048297703266144, "rewards/rejected": -0.5044482946395874, "step": 824 }, { "epoch": 0.6171685057041332, "grad_norm": 72.55325542378584, "learning_rate": 1.6871947322257912e-07, "logps/chosen": -107.68380737304688, "logps/rejected": -114.98779296875, "loss": 0.6249, "losses/dpo": 0.6401402354240417, "losses/sft": 0.8152303695678711, "losses/total": 0.6401402354240417, "ref_logps/chosen": -103.98624420166016, "ref_logps/rejected": -109.08587646484375, "rewards/accuracies": 0.59375, "rewards/chosen": -0.369756281375885, "rewards/margins": 0.2204357385635376, "rewards/rejected": -0.5901919603347778, "step": 825 }, { "epoch": 0.6179165887413502, "grad_norm": 48.87931980252907, "learning_rate": 1.6814617698616944e-07, "logps/chosen": -75.26078033447266, "logps/rejected": -82.82573699951172, "loss": 0.6153, "losses/dpo": 0.7844967246055603, "losses/sft": 1.090637445449829, "losses/total": 0.7844967246055603, "ref_logps/chosen": -71.65459442138672, "ref_logps/rejected": -76.56295776367188, "rewards/accuracies": 0.75, "rewards/chosen": -0.3606187105178833, "rewards/margins": 0.26565951108932495, "rewards/rejected": -0.6262781620025635, "step": 826 }, { "epoch": 0.6186646717785674, "grad_norm": 85.370843431485, "learning_rate": 1.6757336247451382e-07, "logps/chosen": -99.05094909667969, "logps/rejected": -101.34825897216797, "loss": 0.7251, "losses/dpo": 0.8346315622329712, "losses/sft": 0.6230324506759644, "losses/total": 0.8346315622329712, "ref_logps/chosen": -94.18124389648438, "ref_logps/rejected": -95.98501586914062, "rewards/accuracies": 0.40625, "rewards/chosen": -0.48697036504745483, "rewards/margins": 0.04935334622859955, "rewards/rejected": -0.5363237261772156, "step": 827 }, { "epoch": 0.6194127548157845, "grad_norm": 62.0502909995486, "learning_rate": 1.6700103305873063e-07, "logps/chosen": -114.51808166503906, "logps/rejected": -128.8008575439453, "loss": 0.557, "losses/dpo": 0.434485524892807, "losses/sft": 0.9067302942276001, "losses/total": 0.434485524892807, "ref_logps/chosen": -110.9065933227539, "ref_logps/rejected": -120.86555480957031, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3611488342285156, "rewards/margins": 0.4323815703392029, "rewards/rejected": -0.7935304641723633, "step": 828 }, { "epoch": 0.6201608378530017, "grad_norm": 79.06778778366348, "learning_rate": 1.664291921070834e-07, "logps/chosen": -112.6204833984375, "logps/rejected": -117.35979461669922, "loss": 0.6661, "losses/dpo": 0.612920880317688, "losses/sft": 1.311180591583252, "losses/total": 0.612920880317688, "ref_logps/chosen": -108.17169952392578, "ref_logps/rejected": -111.43861389160156, "rewards/accuracies": 0.625, "rewards/chosen": -0.44487762451171875, "rewards/margins": 0.14724008738994598, "rewards/rejected": -0.5921176671981812, "step": 829 }, { "epoch": 0.6209089208902188, "grad_norm": 75.1912744958811, "learning_rate": 1.6585784298496102e-07, "logps/chosen": -107.66201782226562, "logps/rejected": -119.63121032714844, "loss": 0.6468, "losses/dpo": 0.6837844848632812, "losses/sft": 0.755013644695282, "losses/total": 0.6837844848632812, "ref_logps/chosen": -103.69987487792969, "ref_logps/rejected": -113.54867553710938, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3962143063545227, "rewards/margins": 0.21203990280628204, "rewards/rejected": -0.6082541942596436, "step": 830 }, { "epoch": 0.621657003927436, "grad_norm": 82.06389289597729, "learning_rate": 1.6528698905485783e-07, "logps/chosen": -58.1610107421875, "logps/rejected": -69.28992462158203, "loss": 0.594, "losses/dpo": 0.38826367259025574, "losses/sft": 0.5779051184654236, "losses/total": 0.38826367259025574, "ref_logps/chosen": -55.428958892822266, "ref_logps/rejected": -63.5007209777832, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2732049524784088, "rewards/margins": 0.3057152330875397, "rewards/rejected": -0.5789201259613037, "step": 831 }, { "epoch": 0.6224050869646531, "grad_norm": 57.883277630671614, "learning_rate": 1.647166336763538e-07, "logps/chosen": -105.94047546386719, "logps/rejected": -118.23344421386719, "loss": 0.616, "losses/dpo": 0.5893733501434326, "losses/sft": 1.1758700609207153, "losses/total": 0.5893733501434326, "ref_logps/chosen": -101.99162292480469, "ref_logps/rejected": -111.40547180175781, "rewards/accuracies": 0.65625, "rewards/chosen": -0.39488571882247925, "rewards/margins": 0.2879120409488678, "rewards/rejected": -0.6827977895736694, "step": 832 }, { "epoch": 0.6231531700018702, "grad_norm": 62.503283452569875, "learning_rate": 1.641467802060949e-07, "logps/chosen": -100.9263687133789, "logps/rejected": -112.94486999511719, "loss": 0.6066, "losses/dpo": 0.620216965675354, "losses/sft": 1.1485962867736816, "losses/total": 0.620216965675354, "ref_logps/chosen": -96.58686828613281, "ref_logps/rejected": -106.16860961914062, "rewards/accuracies": 0.75, "rewards/chosen": -0.43395015597343445, "rewards/margins": 0.24367575347423553, "rewards/rejected": -0.6776258945465088, "step": 833 }, { "epoch": 0.6239012530390874, "grad_norm": 73.06249018425137, "learning_rate": 1.635774319977732e-07, "logps/chosen": -98.28231811523438, "logps/rejected": -110.61717987060547, "loss": 0.565, "losses/dpo": 0.6510128378868103, "losses/sft": 1.214104175567627, "losses/total": 0.6510128378868103, "ref_logps/chosen": -93.67021942138672, "ref_logps/rejected": -101.69581604003906, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4612104296684265, "rewards/margins": 0.43092525005340576, "rewards/rejected": -0.8921356797218323, "step": 834 }, { "epoch": 0.6246493360763045, "grad_norm": 55.261593555953894, "learning_rate": 1.6300859240210745e-07, "logps/chosen": -83.79402923583984, "logps/rejected": -93.42515563964844, "loss": 0.6175, "losses/dpo": 0.5053466558456421, "losses/sft": 0.27431732416152954, "losses/total": 0.5053466558456421, "ref_logps/chosen": -80.60182189941406, "ref_logps/rejected": -87.61500549316406, "rewards/accuracies": 0.625, "rewards/chosen": -0.319220632314682, "rewards/margins": 0.26179516315460205, "rewards/rejected": -0.5810158252716064, "step": 835 }, { "epoch": 0.6253974191135216, "grad_norm": 77.14335045530635, "learning_rate": 1.6244026476682287e-07, "logps/chosen": -93.5936508178711, "logps/rejected": -103.15538024902344, "loss": 0.6134, "losses/dpo": 0.7355585694313049, "losses/sft": 1.0507447719573975, "losses/total": 0.7355585694313049, "ref_logps/chosen": -88.20671081542969, "ref_logps/rejected": -95.18181610107422, "rewards/accuracies": 0.71875, "rewards/chosen": -0.538693904876709, "rewards/margins": 0.25866228342056274, "rewards/rejected": -0.7973562479019165, "step": 836 }, { "epoch": 0.6261455021507387, "grad_norm": 68.21706478817605, "learning_rate": 1.6187245243663175e-07, "logps/chosen": -86.92250061035156, "logps/rejected": -96.41758728027344, "loss": 0.6377, "losses/dpo": 0.6292951703071594, "losses/sft": 1.1053705215454102, "losses/total": 0.6292951703071594, "ref_logps/chosen": -82.41895294189453, "ref_logps/rejected": -89.23081970214844, "rewards/accuracies": 0.65625, "rewards/chosen": -0.450354665517807, "rewards/margins": 0.2683212161064148, "rewards/rejected": -0.7186758518218994, "step": 837 }, { "epoch": 0.6268935851879559, "grad_norm": 78.15161507097537, "learning_rate": 1.6130515875321377e-07, "logps/chosen": -95.88468933105469, "logps/rejected": -116.12866973876953, "loss": 0.5674, "losses/dpo": 0.6081186532974243, "losses/sft": 1.2379179000854492, "losses/total": 0.6081186532974243, "ref_logps/chosen": -92.06822967529297, "ref_logps/rejected": -108.40458679199219, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3816449046134949, "rewards/margins": 0.3907619118690491, "rewards/rejected": -0.772406816482544, "step": 838 }, { "epoch": 0.627641668225173, "grad_norm": 61.076148562536524, "learning_rate": 1.6073838705519615e-07, "logps/chosen": -117.2767105102539, "logps/rejected": -115.36387634277344, "loss": 0.6843, "losses/dpo": 0.54190593957901, "losses/sft": 0.817786693572998, "losses/total": 0.54190593957901, "ref_logps/chosen": -111.49823760986328, "ref_logps/rejected": -108.4613037109375, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5778464078903198, "rewards/margins": 0.11240940541028976, "rewards/rejected": -0.6902558207511902, "step": 839 }, { "epoch": 0.6283897512623902, "grad_norm": 65.63678637065604, "learning_rate": 1.6017214067813444e-07, "logps/chosen": -91.19856262207031, "logps/rejected": -98.4168701171875, "loss": 0.5469, "losses/dpo": 0.43406566977500916, "losses/sft": 0.56181401014328, "losses/total": 0.43406566977500916, "ref_logps/chosen": -87.44392395019531, "ref_logps/rejected": -90.3638687133789, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3754647672176361, "rewards/margins": 0.42983514070510864, "rewards/rejected": -0.8052998781204224, "step": 840 }, { "epoch": 0.6291378342996072, "grad_norm": 75.51284354904824, "learning_rate": 1.596064229544923e-07, "logps/chosen": -101.19005584716797, "logps/rejected": -103.63406372070312, "loss": 0.6395, "losses/dpo": 0.6441993117332458, "losses/sft": 0.6481130123138428, "losses/total": 0.6441993117332458, "ref_logps/chosen": -96.46707153320312, "ref_logps/rejected": -96.51808166503906, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4722990393638611, "rewards/margins": 0.239299476146698, "rewards/rejected": -0.7115985155105591, "step": 841 }, { "epoch": 0.6298859173368244, "grad_norm": 62.2120420555964, "learning_rate": 1.5904123721362217e-07, "logps/chosen": -65.95295715332031, "logps/rejected": -73.34744262695312, "loss": 0.6351, "losses/dpo": 0.6173135042190552, "losses/sft": 0.8595802783966064, "losses/total": 0.6173135042190552, "ref_logps/chosen": -62.46685028076172, "ref_logps/rejected": -68.22369384765625, "rewards/accuracies": 0.65625, "rewards/chosen": -0.34861013293266296, "rewards/margins": 0.16376423835754395, "rewards/rejected": -0.5123744010925293, "step": 842 }, { "epoch": 0.6306340003740415, "grad_norm": 60.77046620237682, "learning_rate": 1.584765867817457e-07, "logps/chosen": -81.6565933227539, "logps/rejected": -87.93914794921875, "loss": 0.5966, "losses/dpo": 0.6111971139907837, "losses/sft": 1.2322076559066772, "losses/total": 0.6111971139907837, "ref_logps/chosen": -78.01158905029297, "ref_logps/rejected": -81.77125549316406, "rewards/accuracies": 0.75, "rewards/chosen": -0.3644999861717224, "rewards/margins": 0.25228962302207947, "rewards/rejected": -0.6167895793914795, "step": 843 }, { "epoch": 0.6313820834112587, "grad_norm": 64.20943033946838, "learning_rate": 1.579124749819344e-07, "logps/chosen": -77.90571594238281, "logps/rejected": -85.15892791748047, "loss": 0.6303, "losses/dpo": 0.5703545808792114, "losses/sft": 1.2747451066970825, "losses/total": 0.5703545808792114, "ref_logps/chosen": -73.53707122802734, "ref_logps/rejected": -78.57096862792969, "rewards/accuracies": 0.65625, "rewards/chosen": -0.43686428666114807, "rewards/margins": 0.22193196415901184, "rewards/rejected": -0.6587962508201599, "step": 844 }, { "epoch": 0.6321301664484757, "grad_norm": 80.77245006922563, "learning_rate": 1.5734890513408949e-07, "logps/chosen": -97.78323364257812, "logps/rejected": -116.37800598144531, "loss": 0.5921, "losses/dpo": 0.46191638708114624, "losses/sft": 0.9602814316749573, "losses/total": 0.46191638708114624, "ref_logps/chosen": -93.37318420410156, "ref_logps/rejected": -107.93324279785156, "rewards/accuracies": 0.59375, "rewards/chosen": -0.44100624322891235, "rewards/margins": 0.40346989035606384, "rewards/rejected": -0.8444761037826538, "step": 845 }, { "epoch": 0.6328782494856929, "grad_norm": 54.71410065597946, "learning_rate": 1.5678588055492286e-07, "logps/chosen": -77.17852020263672, "logps/rejected": -85.58736419677734, "loss": 0.6602, "losses/dpo": 0.6916992664337158, "losses/sft": 0.7471706867218018, "losses/total": 0.6916992664337158, "ref_logps/chosen": -72.5195541381836, "ref_logps/rejected": -79.43811798095703, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4658960700035095, "rewards/margins": 0.14902816712856293, "rewards/rejected": -0.6149242520332336, "step": 846 }, { "epoch": 0.63362633252291, "grad_norm": 70.76989089058311, "learning_rate": 1.5622340455793736e-07, "logps/chosen": -76.5459976196289, "logps/rejected": -84.93930053710938, "loss": 0.7356, "losses/dpo": 0.7424417734146118, "losses/sft": 0.5850024223327637, "losses/total": 0.7424417734146118, "ref_logps/chosen": -71.5974349975586, "ref_logps/rejected": -79.96824645996094, "rewards/accuracies": 0.53125, "rewards/chosen": -0.49485620856285095, "rewards/margins": 0.002249062992632389, "rewards/rejected": -0.49710530042648315, "step": 847 }, { "epoch": 0.6343744155601272, "grad_norm": 62.981020869112434, "learning_rate": 1.5566148045340726e-07, "logps/chosen": -101.96143341064453, "logps/rejected": -111.3128662109375, "loss": 0.6319, "losses/dpo": 0.8279223442077637, "losses/sft": 0.8009216785430908, "losses/total": 0.8279223442077637, "ref_logps/chosen": -97.61417388916016, "ref_logps/rejected": -104.96592712402344, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4347264766693115, "rewards/margins": 0.19996701180934906, "rewards/rejected": -0.6346935033798218, "step": 848 }, { "epoch": 0.6351224985973443, "grad_norm": 62.912759109651695, "learning_rate": 1.5510011154835916e-07, "logps/chosen": -69.99032592773438, "logps/rejected": -67.0713119506836, "loss": 0.6758, "losses/dpo": 0.9217380285263062, "losses/sft": 0.8994337916374207, "losses/total": 0.9217380285263062, "ref_logps/chosen": -66.040771484375, "ref_logps/rejected": -61.840023040771484, "rewards/accuracies": 0.625, "rewards/chosen": -0.3949558138847351, "rewards/margins": 0.12817291915416718, "rewards/rejected": -0.5231287479400635, "step": 849 }, { "epoch": 0.6358705816345614, "grad_norm": 72.59623888062265, "learning_rate": 1.5453930114655183e-07, "logps/chosen": -86.19720458984375, "logps/rejected": -87.71266174316406, "loss": 0.6748, "losses/dpo": 0.704290509223938, "losses/sft": 0.7054144144058228, "losses/total": 0.704290509223938, "ref_logps/chosen": -81.45414733886719, "ref_logps/rejected": -81.90373229980469, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4743054509162903, "rewards/margins": 0.10658827424049377, "rewards/rejected": -0.5808937549591064, "step": 850 }, { "epoch": 0.6366186646717785, "grad_norm": 53.05197905535149, "learning_rate": 1.5397905254845728e-07, "logps/chosen": -114.73582458496094, "logps/rejected": -111.88125610351562, "loss": 0.6623, "losses/dpo": 1.0512712001800537, "losses/sft": 0.9365469217300415, "losses/total": 1.0512712001800537, "ref_logps/chosen": -110.10737609863281, "ref_logps/rejected": -105.36962127685547, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4628455638885498, "rewards/margins": 0.18831881880760193, "rewards/rejected": -0.6511643528938293, "step": 851 }, { "epoch": 0.6373667477089957, "grad_norm": 57.25740349886754, "learning_rate": 1.5341936905124125e-07, "logps/chosen": -69.19288635253906, "logps/rejected": -80.198974609375, "loss": 0.6268, "losses/dpo": 0.4484587013721466, "losses/sft": 0.5077493190765381, "losses/total": 0.4484587013721466, "ref_logps/chosen": -65.71357727050781, "ref_logps/rejected": -74.23944854736328, "rewards/accuracies": 0.53125, "rewards/chosen": -0.3479304909706116, "rewards/margins": 0.24802245199680328, "rewards/rejected": -0.5959529280662537, "step": 852 }, { "epoch": 0.6381148307462128, "grad_norm": 60.47362037778687, "learning_rate": 1.5286025394874363e-07, "logps/chosen": -104.34928131103516, "logps/rejected": -108.6212158203125, "loss": 0.6027, "losses/dpo": 0.6682823300361633, "losses/sft": 0.7053226232528687, "losses/total": 0.6682823300361633, "ref_logps/chosen": -101.11181640625, "ref_logps/rejected": -101.37751007080078, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3237472176551819, "rewards/margins": 0.40062397718429565, "rewards/rejected": -0.7243711948394775, "step": 853 }, { "epoch": 0.63886291378343, "grad_norm": 75.09180855910317, "learning_rate": 1.5230171053145948e-07, "logps/chosen": -93.39785766601562, "logps/rejected": -102.39068603515625, "loss": 0.5921, "losses/dpo": 0.6078600883483887, "losses/sft": 0.4464501142501831, "losses/total": 0.6078600883483887, "ref_logps/chosen": -89.33592224121094, "ref_logps/rejected": -95.1160659790039, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4061945378780365, "rewards/margins": 0.3212672472000122, "rewards/rejected": -0.7274617552757263, "step": 854 }, { "epoch": 0.639610996820647, "grad_norm": 77.33787387913517, "learning_rate": 1.517437420865191e-07, "logps/chosen": -75.8436508178711, "logps/rejected": -82.11386108398438, "loss": 0.6472, "losses/dpo": 0.668043851852417, "losses/sft": 0.306137353181839, "losses/total": 0.668043851852417, "ref_logps/chosen": -71.94375610351562, "ref_logps/rejected": -75.89907836914062, "rewards/accuracies": 0.53125, "rewards/chosen": -0.3899902105331421, "rewards/margins": 0.2314871996641159, "rewards/rejected": -0.6214774250984192, "step": 855 }, { "epoch": 0.6403590798578642, "grad_norm": 72.0137892472761, "learning_rate": 1.511863518976691e-07, "logps/chosen": -89.48446655273438, "logps/rejected": -96.9632339477539, "loss": 0.6438, "losses/dpo": 0.7075892090797424, "losses/sft": 1.1989761590957642, "losses/total": 0.7075892090797424, "ref_logps/chosen": -84.8335952758789, "ref_logps/rejected": -89.59751892089844, "rewards/accuracies": 0.46875, "rewards/chosen": -0.4650876224040985, "rewards/margins": 0.27148330211639404, "rewards/rejected": -0.7365708947181702, "step": 856 }, { "epoch": 0.6411071628950814, "grad_norm": 92.47505738358937, "learning_rate": 1.5062954324525278e-07, "logps/chosen": -85.81840515136719, "logps/rejected": -106.0491714477539, "loss": 0.5387, "losses/dpo": 0.48836657404899597, "losses/sft": 0.2880285978317261, "losses/total": 0.48836657404899597, "ref_logps/chosen": -81.91718292236328, "ref_logps/rejected": -97.19711303710938, "rewards/accuracies": 0.75, "rewards/chosen": -0.3901214897632599, "rewards/margins": 0.4950851798057556, "rewards/rejected": -0.8852065801620483, "step": 857 }, { "epoch": 0.6418552459322985, "grad_norm": 65.45629577897371, "learning_rate": 1.5007331940619138e-07, "logps/chosen": -95.9244384765625, "logps/rejected": -108.56822204589844, "loss": 0.593, "losses/dpo": 0.4896422028541565, "losses/sft": 1.3211220502853394, "losses/total": 0.4896422028541565, "ref_logps/chosen": -92.26240539550781, "ref_logps/rejected": -102.15351867675781, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3662027418613434, "rewards/margins": 0.2752683162689209, "rewards/rejected": -0.6414710283279419, "step": 858 }, { "epoch": 0.6426033289695157, "grad_norm": 67.02438572591802, "learning_rate": 1.4951768365396403e-07, "logps/chosen": -107.28053283691406, "logps/rejected": -119.16412353515625, "loss": 0.6069, "losses/dpo": 0.4804196357727051, "losses/sft": 1.2686249017715454, "losses/total": 0.4804196357727051, "ref_logps/chosen": -103.79369354248047, "ref_logps/rejected": -112.21794128417969, "rewards/accuracies": 0.78125, "rewards/chosen": -0.34868308901786804, "rewards/margins": 0.34593597054481506, "rewards/rejected": -0.6946190595626831, "step": 859 }, { "epoch": 0.6433514120067327, "grad_norm": 63.52502846264943, "learning_rate": 1.4896263925858902e-07, "logps/chosen": -101.20115661621094, "logps/rejected": -116.24148559570312, "loss": 0.5894, "losses/dpo": 0.4327818751335144, "losses/sft": 0.6280657649040222, "losses/total": 0.4327818751335144, "ref_logps/chosen": -97.88961791992188, "ref_logps/rejected": -110.04209899902344, "rewards/accuracies": 0.75, "rewards/chosen": -0.3311537504196167, "rewards/margins": 0.28878387808799744, "rewards/rejected": -0.6199376583099365, "step": 860 }, { "epoch": 0.6440994950439499, "grad_norm": 77.67672826675499, "learning_rate": 1.4840818948660432e-07, "logps/chosen": -72.9485855102539, "logps/rejected": -81.31387329101562, "loss": 0.6609, "losses/dpo": 0.6441037654876709, "losses/sft": 0.32470670342445374, "losses/total": 0.6441037654876709, "ref_logps/chosen": -69.01898193359375, "ref_logps/rejected": -75.80313110351562, "rewards/accuracies": 0.5, "rewards/chosen": -0.39295998215675354, "rewards/margins": 0.1581144779920578, "rewards/rejected": -0.5510743856430054, "step": 861 }, { "epoch": 0.644847578081167, "grad_norm": 58.932758549149916, "learning_rate": 1.4785433760104844e-07, "logps/chosen": -81.9010009765625, "logps/rejected": -94.84883117675781, "loss": 0.6243, "losses/dpo": 0.579012393951416, "losses/sft": 0.9030636548995972, "losses/total": 0.579012393951416, "ref_logps/chosen": -77.36561584472656, "ref_logps/rejected": -87.41515350341797, "rewards/accuracies": 0.625, "rewards/chosen": -0.45353907346725464, "rewards/margins": 0.28982844948768616, "rewards/rejected": -0.7433674931526184, "step": 862 }, { "epoch": 0.6455956611183842, "grad_norm": 81.40947395135068, "learning_rate": 1.4730108686144143e-07, "logps/chosen": -118.11990356445312, "logps/rejected": -123.44241333007812, "loss": 0.6279, "losses/dpo": 0.7593077421188354, "losses/sft": 0.5556212663650513, "losses/total": 0.7593077421188354, "ref_logps/chosen": -113.49488830566406, "ref_logps/rejected": -116.58934020996094, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4625011086463928, "rewards/margins": 0.22280552983283997, "rewards/rejected": -0.6853066086769104, "step": 863 }, { "epoch": 0.6463437441556013, "grad_norm": 80.55402809741115, "learning_rate": 1.4674844052376522e-07, "logps/chosen": -103.13996124267578, "logps/rejected": -119.45301055908203, "loss": 0.5914, "losses/dpo": 0.5776234865188599, "losses/sft": 1.0813196897506714, "losses/total": 0.5776234865188599, "ref_logps/chosen": -98.4354248046875, "ref_logps/rejected": -111.47126770019531, "rewards/accuracies": 0.71875, "rewards/chosen": -0.47045421600341797, "rewards/margins": 0.3277195692062378, "rewards/rejected": -0.7981737852096558, "step": 864 }, { "epoch": 0.6470918271928184, "grad_norm": 109.79176102597394, "learning_rate": 1.4619640184044482e-07, "logps/chosen": -81.49754333496094, "logps/rejected": -87.56987762451172, "loss": 0.6591, "losses/dpo": 0.5397162437438965, "losses/sft": 0.4585627317428589, "losses/total": 0.5397162437438965, "ref_logps/chosen": -76.84395599365234, "ref_logps/rejected": -81.02088928222656, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4653578996658325, "rewards/margins": 0.1895400732755661, "rewards/rejected": -0.654897928237915, "step": 865 }, { "epoch": 0.6478399102300355, "grad_norm": 70.0263708850011, "learning_rate": 1.456449740603291e-07, "logps/chosen": -95.58631896972656, "logps/rejected": -99.63969421386719, "loss": 0.6133, "losses/dpo": 0.7998800277709961, "losses/sft": 0.7078521251678467, "losses/total": 0.7998800277709961, "ref_logps/chosen": -92.26144409179688, "ref_logps/rejected": -93.69593811035156, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3324868083000183, "rewards/margins": 0.2618873119354248, "rewards/rejected": -0.5943740606307983, "step": 866 }, { "epoch": 0.6485879932672527, "grad_norm": 63.86100739702067, "learning_rate": 1.4509416042867148e-07, "logps/chosen": -112.4775390625, "logps/rejected": -132.40322875976562, "loss": 0.5735, "losses/dpo": 0.5978193879127502, "losses/sft": 1.2292463779449463, "losses/total": 0.5978193879127502, "ref_logps/chosen": -108.99334716796875, "ref_logps/rejected": -125.28577423095703, "rewards/accuracies": 0.75, "rewards/chosen": -0.3484179675579071, "rewards/margins": 0.36332836747169495, "rewards/rejected": -0.711746335029602, "step": 867 }, { "epoch": 0.6493360763044698, "grad_norm": 59.86274444090473, "learning_rate": 1.445439641871114e-07, "logps/chosen": -74.64927673339844, "logps/rejected": -73.90097045898438, "loss": 0.6552, "losses/dpo": 0.7012574672698975, "losses/sft": 0.22254768013954163, "losses/total": 0.7012574672698975, "ref_logps/chosen": -71.21803283691406, "ref_logps/rejected": -68.31608581542969, "rewards/accuracies": 0.53125, "rewards/chosen": -0.34312450885772705, "rewards/margins": 0.2153652310371399, "rewards/rejected": -0.5584897398948669, "step": 868 }, { "epoch": 0.650084159341687, "grad_norm": 128.01121412267798, "learning_rate": 1.439943885736545e-07, "logps/chosen": -78.41645812988281, "logps/rejected": -78.89535522460938, "loss": 0.6527, "losses/dpo": 1.0203320980072021, "losses/sft": 0.5521835684776306, "losses/total": 1.0203320980072021, "ref_logps/chosen": -73.94929504394531, "ref_logps/rejected": -72.327392578125, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4467167854309082, "rewards/margins": 0.2100788801908493, "rewards/rejected": -0.6567957401275635, "step": 869 }, { "epoch": 0.650832242378904, "grad_norm": 67.29234866651926, "learning_rate": 1.4344543682265403e-07, "logps/chosen": -81.84754943847656, "logps/rejected": -99.52090454101562, "loss": 0.596, "losses/dpo": 0.6485967636108398, "losses/sft": 0.5555850267410278, "losses/total": 0.6485967636108398, "ref_logps/chosen": -78.77497863769531, "ref_logps/rejected": -93.11347961425781, "rewards/accuracies": 0.625, "rewards/chosen": -0.3072560429573059, "rewards/margins": 0.3334876000881195, "rewards/rejected": -0.6407436728477478, "step": 870 }, { "epoch": 0.6515803254161212, "grad_norm": 84.27102186058531, "learning_rate": 1.4289711216479155e-07, "logps/chosen": -99.86886596679688, "logps/rejected": -107.48023986816406, "loss": 0.6824, "losses/dpo": 0.6554781198501587, "losses/sft": 0.6920866966247559, "losses/total": 0.6554781198501587, "ref_logps/chosen": -95.05282592773438, "ref_logps/rejected": -101.24940490722656, "rewards/accuracies": 0.53125, "rewards/chosen": -0.48160475492477417, "rewards/margins": 0.141479030251503, "rewards/rejected": -0.6230838298797607, "step": 871 }, { "epoch": 0.6523284084533383, "grad_norm": 63.87154604537199, "learning_rate": 1.4234941782705843e-07, "logps/chosen": -119.591064453125, "logps/rejected": -123.18486022949219, "loss": 0.7175, "losses/dpo": 0.6336790323257446, "losses/sft": 1.0507208108901978, "losses/total": 0.6336790323257446, "ref_logps/chosen": -114.37103271484375, "ref_logps/rejected": -117.19393920898438, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5220023989677429, "rewards/margins": 0.07708971202373505, "rewards/rejected": -0.5990920662879944, "step": 872 }, { "epoch": 0.6530764914905555, "grad_norm": 68.9221872181243, "learning_rate": 1.4180235703273606e-07, "logps/chosen": -100.21214294433594, "logps/rejected": -108.0419692993164, "loss": 0.6287, "losses/dpo": 0.5640860199928284, "losses/sft": 0.994064211845398, "losses/total": 0.5640860199928284, "ref_logps/chosen": -94.5927734375, "ref_logps/rejected": -100.13450622558594, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5619379281997681, "rewards/margins": 0.22880786657333374, "rewards/rejected": -0.790745735168457, "step": 873 }, { "epoch": 0.6538245745277725, "grad_norm": 60.077410559008854, "learning_rate": 1.4125593300137765e-07, "logps/chosen": -70.59876251220703, "logps/rejected": -82.33596801757812, "loss": 0.6189, "losses/dpo": 0.6844773888587952, "losses/sft": 0.6990987062454224, "losses/total": 0.6844773888587952, "ref_logps/chosen": -67.67605590820312, "ref_logps/rejected": -76.97392272949219, "rewards/accuracies": 0.6875, "rewards/chosen": -0.29227137565612793, "rewards/margins": 0.24393245577812195, "rewards/rejected": -0.5362038612365723, "step": 874 }, { "epoch": 0.6545726575649897, "grad_norm": 83.52089895612335, "learning_rate": 1.4071014894878868e-07, "logps/chosen": -89.32740020751953, "logps/rejected": -96.62516784667969, "loss": 0.6354, "losses/dpo": 0.6689467430114746, "losses/sft": 1.7289495468139648, "losses/total": 0.6689467430114746, "ref_logps/chosen": -84.43315124511719, "ref_logps/rejected": -89.41510772705078, "rewards/accuracies": 0.5625, "rewards/chosen": -0.48942503333091736, "rewards/margins": 0.2315816581249237, "rewards/rejected": -0.7210067510604858, "step": 875 }, { "epoch": 0.6553207406022068, "grad_norm": 83.12091216167853, "learning_rate": 1.401650080870083e-07, "logps/chosen": -74.17728424072266, "logps/rejected": -87.66437530517578, "loss": 0.5806, "losses/dpo": 0.6406930088996887, "losses/sft": 1.0336557626724243, "losses/total": 0.6406930088996887, "ref_logps/chosen": -70.69469451904297, "ref_logps/rejected": -81.07592010498047, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3482595384120941, "rewards/margins": 0.3105853199958801, "rewards/rejected": -0.6588448286056519, "step": 876 }, { "epoch": 0.656068823639424, "grad_norm": 92.22029255708293, "learning_rate": 1.3962051362429058e-07, "logps/chosen": -129.12530517578125, "logps/rejected": -138.2911376953125, "loss": 0.6062, "losses/dpo": 0.718751847743988, "losses/sft": 1.393606424331665, "losses/total": 0.718751847743988, "ref_logps/chosen": -124.32588195800781, "ref_logps/rejected": -130.61810302734375, "rewards/accuracies": 0.625, "rewards/chosen": -0.47994256019592285, "rewards/margins": 0.2873610854148865, "rewards/rejected": -0.7673036456108093, "step": 877 }, { "epoch": 0.656816906676641, "grad_norm": 53.67752508328008, "learning_rate": 1.3907666876508522e-07, "logps/chosen": -105.52940368652344, "logps/rejected": -120.91755676269531, "loss": 0.568, "losses/dpo": 0.7006046772003174, "losses/sft": 1.2002087831497192, "losses/total": 0.7006046772003174, "ref_logps/chosen": -101.54853820800781, "ref_logps/rejected": -113.274658203125, "rewards/accuracies": 0.75, "rewards/chosen": -0.39808616042137146, "rewards/margins": 0.36620408296585083, "rewards/rejected": -0.7642902731895447, "step": 878 }, { "epoch": 0.6575649897138582, "grad_norm": 83.7646338132547, "learning_rate": 1.3853347671001885e-07, "logps/chosen": -100.25201416015625, "logps/rejected": -115.5998764038086, "loss": 0.635, "losses/dpo": 0.4814826250076294, "losses/sft": 0.6212716102600098, "losses/total": 0.4814826250076294, "ref_logps/chosen": -95.93391418457031, "ref_logps/rejected": -109.28721618652344, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4318104386329651, "rewards/margins": 0.19945570826530457, "rewards/rejected": -0.631266176700592, "step": 879 }, { "epoch": 0.6583130727510754, "grad_norm": 114.06613797173598, "learning_rate": 1.379909406558763e-07, "logps/chosen": -93.38902282714844, "logps/rejected": -95.93753051757812, "loss": 0.7312, "losses/dpo": 0.6931320428848267, "losses/sft": 0.9360780715942383, "losses/total": 0.6931320428848267, "ref_logps/chosen": -86.52688598632812, "ref_logps/rejected": -88.21418762207031, "rewards/accuracies": 0.5, "rewards/chosen": -0.6862133741378784, "rewards/margins": 0.08612091839313507, "rewards/rejected": -0.7723343372344971, "step": 880 }, { "epoch": 0.6590611557882925, "grad_norm": 69.00373277380542, "learning_rate": 1.3744906379558164e-07, "logps/chosen": -76.0380859375, "logps/rejected": -98.59235382080078, "loss": 0.6466, "losses/dpo": 0.6565413475036621, "losses/sft": 0.48741403222084045, "losses/total": 0.6565413475036621, "ref_logps/chosen": -71.91915893554688, "ref_logps/rejected": -92.29669952392578, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4118931293487549, "rewards/margins": 0.2176721841096878, "rewards/rejected": -0.6295653581619263, "step": 881 }, { "epoch": 0.6598092388255097, "grad_norm": 65.87434994921537, "learning_rate": 1.3690784931817973e-07, "logps/chosen": -92.40971374511719, "logps/rejected": -102.97704315185547, "loss": 0.614, "losses/dpo": 0.6478174924850464, "losses/sft": 1.2128344774246216, "losses/total": 0.6478174924850464, "ref_logps/chosen": -88.72975158691406, "ref_logps/rejected": -97.11844635009766, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3679962754249573, "rewards/margins": 0.21786300837993622, "rewards/rejected": -0.5858592987060547, "step": 882 }, { "epoch": 0.6605573218627268, "grad_norm": 63.865293717144596, "learning_rate": 1.3636730040881685e-07, "logps/chosen": -94.02407836914062, "logps/rejected": -106.70199584960938, "loss": 0.5778, "losses/dpo": 0.5090200901031494, "losses/sft": 1.3564445972442627, "losses/total": 0.5090200901031494, "ref_logps/chosen": -91.65679931640625, "ref_logps/rejected": -101.12808227539062, "rewards/accuracies": 0.75, "rewards/chosen": -0.23672904074192047, "rewards/margins": 0.320663183927536, "rewards/rejected": -0.5573922395706177, "step": 883 }, { "epoch": 0.6613054048999439, "grad_norm": 94.57089863736937, "learning_rate": 1.3582742024872247e-07, "logps/chosen": -91.92747497558594, "logps/rejected": -98.66450500488281, "loss": 0.6497, "losses/dpo": 0.6277391910552979, "losses/sft": 1.0526618957519531, "losses/total": 0.6277391910552979, "ref_logps/chosen": -88.14312744140625, "ref_logps/rejected": -93.03421783447266, "rewards/accuracies": 0.5625, "rewards/chosen": -0.37843579053878784, "rewards/margins": 0.1845933496952057, "rewards/rejected": -0.5630291700363159, "step": 884 }, { "epoch": 0.662053487937161, "grad_norm": 65.36479865982233, "learning_rate": 1.3528821201519017e-07, "logps/chosen": -67.87739562988281, "logps/rejected": -90.14691162109375, "loss": 0.5863, "losses/dpo": 0.3528575897216797, "losses/sft": 0.49917206168174744, "losses/total": 0.3528575897216797, "ref_logps/chosen": -64.546875, "ref_logps/rejected": -83.46306610107422, "rewards/accuracies": 0.625, "rewards/chosen": -0.33305251598358154, "rewards/margins": 0.3353327512741089, "rewards/rejected": -0.6683852672576904, "step": 885 }, { "epoch": 0.6628015709743782, "grad_norm": 63.67652473556234, "learning_rate": 1.3474967888155947e-07, "logps/chosen": -111.52072143554688, "logps/rejected": -113.14498138427734, "loss": 0.6621, "losses/dpo": 0.7713974118232727, "losses/sft": 1.256648302078247, "losses/total": 0.7713974118232727, "ref_logps/chosen": -106.80848693847656, "ref_logps/rejected": -106.83968353271484, "rewards/accuracies": 0.625, "rewards/chosen": -0.47122353315353394, "rewards/margins": 0.15930643677711487, "rewards/rejected": -0.6305299997329712, "step": 886 }, { "epoch": 0.6635496540115953, "grad_norm": 91.20767954985155, "learning_rate": 1.3421182401719647e-07, "logps/chosen": -78.28451538085938, "logps/rejected": -91.2057876586914, "loss": 0.5462, "losses/dpo": 0.35587963461875916, "losses/sft": 0.48903030157089233, "losses/total": 0.35587963461875916, "ref_logps/chosen": -74.33362579345703, "ref_logps/rejected": -82.47058868408203, "rewards/accuracies": 0.75, "rewards/chosen": -0.3950886130332947, "rewards/margins": 0.4784317910671234, "rewards/rejected": -0.8735203742980957, "step": 887 }, { "epoch": 0.6642977370488125, "grad_norm": 68.57037414012554, "learning_rate": 1.3367465058747565e-07, "logps/chosen": -106.19804382324219, "logps/rejected": -117.12458801269531, "loss": 0.6308, "losses/dpo": 0.4932895302772522, "losses/sft": 0.757040798664093, "losses/total": 0.4932895302772522, "ref_logps/chosen": -102.08763122558594, "ref_logps/rejected": -110.94412994384766, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4110422730445862, "rewards/margins": 0.20700329542160034, "rewards/rejected": -0.6180455684661865, "step": 888 }, { "epoch": 0.6650458200860295, "grad_norm": 95.08440203543908, "learning_rate": 1.3313816175376113e-07, "logps/chosen": -127.63916015625, "logps/rejected": -115.54049682617188, "loss": 0.7076, "losses/dpo": 0.6321174502372742, "losses/sft": 0.30129188299179077, "losses/total": 0.6321174502372742, "ref_logps/chosen": -123.00010681152344, "ref_logps/rejected": -110.44422912597656, "rewards/accuracies": 0.4375, "rewards/chosen": -0.46390584111213684, "rewards/margins": 0.04572165012359619, "rewards/rejected": -0.5096274614334106, "step": 889 }, { "epoch": 0.6657939031232467, "grad_norm": 71.97563704923908, "learning_rate": 1.3260236067338798e-07, "logps/chosen": -88.58604431152344, "logps/rejected": -104.54383850097656, "loss": 0.5899, "losses/dpo": 0.6685219407081604, "losses/sft": 0.6179054379463196, "losses/total": 0.6685219407081604, "ref_logps/chosen": -85.82688903808594, "ref_logps/rejected": -98.90264892578125, "rewards/accuracies": 0.65625, "rewards/chosen": -0.27591589093208313, "rewards/margins": 0.2882038354873657, "rewards/rejected": -0.5641197562217712, "step": 890 }, { "epoch": 0.6665419861604638, "grad_norm": 68.88045525798034, "learning_rate": 1.320672504996439e-07, "logps/chosen": -92.89900207519531, "logps/rejected": -104.85590362548828, "loss": 0.5812, "losses/dpo": 0.6178168058395386, "losses/sft": 1.3342570066452026, "losses/total": 0.6178168058395386, "ref_logps/chosen": -89.69923400878906, "ref_logps/rejected": -98.04458618164062, "rewards/accuracies": 0.75, "rewards/chosen": -0.3199771046638489, "rewards/margins": 0.3611542284488678, "rewards/rejected": -0.6811313629150391, "step": 891 }, { "epoch": 0.667290069197681, "grad_norm": 62.1878303557396, "learning_rate": 1.3153283438175034e-07, "logps/chosen": -105.09220123291016, "logps/rejected": -112.4149398803711, "loss": 0.6429, "losses/dpo": 0.4707898497581482, "losses/sft": 0.6660915017127991, "losses/total": 0.4707898497581482, "ref_logps/chosen": -100.98382568359375, "ref_logps/rejected": -105.89016723632812, "rewards/accuracies": 0.65625, "rewards/chosen": -0.41083723306655884, "rewards/margins": 0.24164126813411713, "rewards/rejected": -0.6524784564971924, "step": 892 }, { "epoch": 0.668038152234898, "grad_norm": 72.5352545951256, "learning_rate": 1.309991154648441e-07, "logps/chosen": -81.1202392578125, "logps/rejected": -91.28315734863281, "loss": 0.6159, "losses/dpo": 0.7430461049079895, "losses/sft": 0.6106572151184082, "losses/total": 0.7430461049079895, "ref_logps/chosen": -76.05604553222656, "ref_logps/rejected": -83.42156219482422, "rewards/accuracies": 0.75, "rewards/chosen": -0.5064194202423096, "rewards/margins": 0.27974027395248413, "rewards/rejected": -0.7861597537994385, "step": 893 }, { "epoch": 0.6687862352721152, "grad_norm": 72.02916871082242, "learning_rate": 1.304660968899588e-07, "logps/chosen": -94.16915130615234, "logps/rejected": -102.92616271972656, "loss": 0.6087, "losses/dpo": 0.6852619647979736, "losses/sft": 0.935711681842804, "losses/total": 0.6852619647979736, "ref_logps/chosen": -91.0193862915039, "ref_logps/rejected": -97.15489959716797, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3149760961532593, "rewards/margins": 0.26215019822120667, "rewards/rejected": -0.5771262645721436, "step": 894 }, { "epoch": 0.6695343183093323, "grad_norm": 66.34527327396519, "learning_rate": 1.2993378179400644e-07, "logps/chosen": -76.2083969116211, "logps/rejected": -97.72783660888672, "loss": 0.575, "losses/dpo": 0.6906417012214661, "losses/sft": 0.5095822215080261, "losses/total": 0.6906417012214661, "ref_logps/chosen": -73.99163818359375, "ref_logps/rejected": -92.33368682861328, "rewards/accuracies": 0.75, "rewards/chosen": -0.22167576849460602, "rewards/margins": 0.31773868203163147, "rewards/rejected": -0.5394144654273987, "step": 895 }, { "epoch": 0.6702824013465495, "grad_norm": 73.78588780227241, "learning_rate": 1.294021733097591e-07, "logps/chosen": -98.44496154785156, "logps/rejected": -100.68054962158203, "loss": 0.6508, "losses/dpo": 0.6215669512748718, "losses/sft": 0.41767024993896484, "losses/total": 0.6215669512748718, "ref_logps/chosen": -94.05329895019531, "ref_logps/rejected": -94.74860382080078, "rewards/accuracies": 0.59375, "rewards/chosen": -0.43916553258895874, "rewards/margins": 0.1540295034646988, "rewards/rejected": -0.5931950807571411, "step": 896 }, { "epoch": 0.6710304843837666, "grad_norm": 62.582728893071895, "learning_rate": 1.2887127456583008e-07, "logps/chosen": -88.60198974609375, "logps/rejected": -92.06727600097656, "loss": 0.5939, "losses/dpo": 0.6045426726341248, "losses/sft": 0.589317798614502, "losses/total": 0.6045426726341248, "ref_logps/chosen": -85.0143814086914, "ref_logps/rejected": -85.11647033691406, "rewards/accuracies": 0.71875, "rewards/chosen": -0.35875993967056274, "rewards/margins": 0.33632123470306396, "rewards/rejected": -0.6950811743736267, "step": 897 }, { "epoch": 0.6717785674209837, "grad_norm": 70.01999267085587, "learning_rate": 1.283410886866558e-07, "logps/chosen": -96.78480529785156, "logps/rejected": -101.7309341430664, "loss": 0.6535, "losses/dpo": 0.6506568193435669, "losses/sft": 0.9145074486732483, "losses/total": 0.6506568193435669, "ref_logps/chosen": -91.6741943359375, "ref_logps/rejected": -94.85902404785156, "rewards/accuracies": 0.625, "rewards/chosen": -0.5110617876052856, "rewards/margins": 0.17612864077091217, "rewards/rejected": -0.6871904134750366, "step": 898 }, { "epoch": 0.6725266504582008, "grad_norm": 162.86794658620684, "learning_rate": 1.2781161879247727e-07, "logps/chosen": -86.25369262695312, "logps/rejected": -110.26692199707031, "loss": 0.4886, "losses/dpo": 0.4912080466747284, "losses/sft": 1.0439960956573486, "losses/total": 0.4912080466747284, "ref_logps/chosen": -83.47697448730469, "ref_logps/rejected": -100.38314819335938, "rewards/accuracies": 0.75, "rewards/chosen": -0.27767202258110046, "rewards/margins": 0.710705578327179, "rewards/rejected": -0.988377571105957, "step": 899 }, { "epoch": 0.673274733495418, "grad_norm": 57.722787382296104, "learning_rate": 1.272828679993221e-07, "logps/chosen": -88.9894790649414, "logps/rejected": -105.04479217529297, "loss": 0.5534, "losses/dpo": 0.40993306040763855, "losses/sft": 1.0370712280273438, "losses/total": 0.40993306040763855, "ref_logps/chosen": -84.25951385498047, "ref_logps/rejected": -95.41533660888672, "rewards/accuracies": 0.625, "rewards/chosen": -0.4729967415332794, "rewards/margins": 0.4899495840072632, "rewards/rejected": -0.962946355342865, "step": 900 }, { "epoch": 0.6740228165326351, "grad_norm": 62.83777936422193, "learning_rate": 1.2675483941898545e-07, "logps/chosen": -103.11067199707031, "logps/rejected": -126.13377380371094, "loss": 0.5731, "losses/dpo": 0.6142141819000244, "losses/sft": 1.3821146488189697, "losses/total": 0.6142141819000244, "ref_logps/chosen": -99.46526336669922, "ref_logps/rejected": -118.31388854980469, "rewards/accuracies": 0.75, "rewards/chosen": -0.36454054713249207, "rewards/margins": 0.4174489974975586, "rewards/rejected": -0.781989574432373, "step": 901 }, { "epoch": 0.6747708995698523, "grad_norm": 65.04214996249566, "learning_rate": 1.2622753615901243e-07, "logps/chosen": -89.5456314086914, "logps/rejected": -102.70330810546875, "loss": 0.5309, "losses/dpo": 0.5857716798782349, "losses/sft": 1.0723731517791748, "losses/total": 0.5857716798782349, "ref_logps/chosen": -86.73637390136719, "ref_logps/rejected": -95.58146667480469, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2809256613254547, "rewards/margins": 0.4312595725059509, "rewards/rejected": -0.7121852040290833, "step": 902 }, { "epoch": 0.6755189826070693, "grad_norm": 49.23588212496092, "learning_rate": 1.2570096132267933e-07, "logps/chosen": -103.28697204589844, "logps/rejected": -109.32987976074219, "loss": 0.5974, "losses/dpo": 0.7241487503051758, "losses/sft": 1.1693843603134155, "losses/total": 0.7241487503051758, "ref_logps/chosen": -99.33975219726562, "ref_logps/rejected": -102.00694274902344, "rewards/accuracies": 0.75, "rewards/chosen": -0.39472195506095886, "rewards/margins": 0.33757108449935913, "rewards/rejected": -0.7322930097579956, "step": 903 }, { "epoch": 0.6762670656442865, "grad_norm": 64.64378054530611, "learning_rate": 1.2517511800897552e-07, "logps/chosen": -102.49919128417969, "logps/rejected": -117.33930969238281, "loss": 0.5738, "losses/dpo": 0.6210899949073792, "losses/sft": 1.303556203842163, "losses/total": 0.6210899949073792, "ref_logps/chosen": -98.21879577636719, "ref_logps/rejected": -109.66181945800781, "rewards/accuracies": 0.6875, "rewards/chosen": -0.42803990840911865, "rewards/margins": 0.33970966935157776, "rewards/rejected": -0.7677496075630188, "step": 904 }, { "epoch": 0.6770151486815037, "grad_norm": 71.70775995758595, "learning_rate": 1.2465000931258547e-07, "logps/chosen": -86.61361694335938, "logps/rejected": -105.09761047363281, "loss": 0.602, "losses/dpo": 0.4323791265487671, "losses/sft": 1.0261324644088745, "losses/total": 0.4323791265487671, "ref_logps/chosen": -81.9705810546875, "ref_logps/rejected": -97.84169006347656, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4643043279647827, "rewards/margins": 0.2612861394882202, "rewards/rejected": -0.7255904674530029, "step": 905 }, { "epoch": 0.6777632317187208, "grad_norm": 84.4572635451774, "learning_rate": 1.2412563832387003e-07, "logps/chosen": -97.61595153808594, "logps/rejected": -100.83129119873047, "loss": 0.6375, "losses/dpo": 0.7208411693572998, "losses/sft": 0.6266285181045532, "losses/total": 0.7208411693572998, "ref_logps/chosen": -93.7735595703125, "ref_logps/rejected": -95.1733627319336, "rewards/accuracies": 0.625, "rewards/chosen": -0.38423991203308105, "rewards/margins": 0.18155233561992645, "rewards/rejected": -0.5657922625541687, "step": 906 }, { "epoch": 0.678511314755938, "grad_norm": 61.18874804399721, "learning_rate": 1.2360200812884861e-07, "logps/chosen": -88.47825622558594, "logps/rejected": -98.27239990234375, "loss": 0.628, "losses/dpo": 0.684822678565979, "losses/sft": 0.5248903632164001, "losses/total": 0.684822678565979, "ref_logps/chosen": -83.94464111328125, "ref_logps/rejected": -91.6527099609375, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4533611536026001, "rewards/margins": 0.2086091786623001, "rewards/rejected": -0.6619703769683838, "step": 907 }, { "epoch": 0.679259397793155, "grad_norm": 93.15399156946903, "learning_rate": 1.230791218091809e-07, "logps/chosen": -108.70833587646484, "logps/rejected": -108.48677062988281, "loss": 0.7555, "losses/dpo": 0.7113544940948486, "losses/sft": 1.2018117904663086, "losses/total": 0.7113544940948486, "ref_logps/chosen": -103.48741149902344, "ref_logps/rejected": -103.1236572265625, "rewards/accuracies": 0.53125, "rewards/chosen": -0.5220940113067627, "rewards/margins": 0.014216654002666473, "rewards/rejected": -0.5363106727600098, "step": 908 }, { "epoch": 0.6800074808303722, "grad_norm": 58.27104487930248, "learning_rate": 1.2255698244214862e-07, "logps/chosen": -89.88724517822266, "logps/rejected": -93.42657470703125, "loss": 0.6368, "losses/dpo": 0.6827441453933716, "losses/sft": 1.0738017559051514, "losses/total": 0.6827441453933716, "ref_logps/chosen": -85.9961929321289, "ref_logps/rejected": -87.74903869628906, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3891048729419708, "rewards/margins": 0.17864906787872314, "rewards/rejected": -0.5677539706230164, "step": 909 }, { "epoch": 0.6807555638675893, "grad_norm": 66.01133741912777, "learning_rate": 1.2203559310063792e-07, "logps/chosen": -81.42305755615234, "logps/rejected": -99.993896484375, "loss": 0.5594, "losses/dpo": 0.42174386978149414, "losses/sft": 0.5892874598503113, "losses/total": 0.42174386978149414, "ref_logps/chosen": -77.91547393798828, "ref_logps/rejected": -92.24710845947266, "rewards/accuracies": 0.71875, "rewards/chosen": -0.35075896978378296, "rewards/margins": 0.4239192306995392, "rewards/rejected": -0.7746782302856445, "step": 910 }, { "epoch": 0.6815036469048065, "grad_norm": 77.48649816725413, "learning_rate": 1.2151495685312053e-07, "logps/chosen": -99.61642456054688, "logps/rejected": -116.72686767578125, "loss": 0.6569, "losses/dpo": 0.5282369256019592, "losses/sft": 0.9057922959327698, "losses/total": 0.5282369256019592, "ref_logps/chosen": -95.48169708251953, "ref_logps/rejected": -110.59881591796875, "rewards/accuracies": 0.5, "rewards/chosen": -0.41347217559814453, "rewards/margins": 0.1993330717086792, "rewards/rejected": -0.6128052473068237, "step": 911 }, { "epoch": 0.6822517299420235, "grad_norm": 75.4307436568343, "learning_rate": 1.2099507676363616e-07, "logps/chosen": -95.08317565917969, "logps/rejected": -99.4542236328125, "loss": 0.5863, "losses/dpo": 0.8498323559761047, "losses/sft": 0.8454011082649231, "losses/total": 0.8498323559761047, "ref_logps/chosen": -91.20317077636719, "ref_logps/rejected": -91.71695709228516, "rewards/accuracies": 0.65625, "rewards/chosen": -0.38800016045570374, "rewards/margins": 0.3857271075248718, "rewards/rejected": -0.773727297782898, "step": 912 }, { "epoch": 0.6829998129792407, "grad_norm": 62.3493296791392, "learning_rate": 1.2047595589177441e-07, "logps/chosen": -96.84097290039062, "logps/rejected": -105.31169128417969, "loss": 0.6143, "losses/dpo": 0.712195098400116, "losses/sft": 0.5572599172592163, "losses/total": 0.712195098400116, "ref_logps/chosen": -93.25594329833984, "ref_logps/rejected": -98.8716812133789, "rewards/accuracies": 0.6875, "rewards/chosen": -0.35850298404693604, "rewards/margins": 0.28549832105636597, "rewards/rejected": -0.644001305103302, "step": 913 }, { "epoch": 0.6837478960164578, "grad_norm": 66.44261731081252, "learning_rate": 1.1995759729265697e-07, "logps/chosen": -82.06236267089844, "logps/rejected": -97.02055358886719, "loss": 0.5222, "losses/dpo": 0.48918047547340393, "losses/sft": 0.8534044027328491, "losses/total": 0.48918047547340393, "ref_logps/chosen": -79.70663452148438, "ref_logps/rejected": -90.14797973632812, "rewards/accuracies": 0.8125, "rewards/chosen": -0.23557224869728088, "rewards/margins": 0.4516845941543579, "rewards/rejected": -0.6872568130493164, "step": 914 }, { "epoch": 0.684495979053675, "grad_norm": 73.21577130395511, "learning_rate": 1.194400040169191e-07, "logps/chosen": -67.56504821777344, "logps/rejected": -90.09709167480469, "loss": 0.547, "losses/dpo": 0.477711945772171, "losses/sft": 0.989822268486023, "losses/total": 0.477711945772171, "ref_logps/chosen": -63.718536376953125, "ref_logps/rejected": -81.79573059082031, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3846522271633148, "rewards/margins": 0.44548261165618896, "rewards/rejected": -0.8301348090171814, "step": 915 }, { "epoch": 0.6852440620908921, "grad_norm": 86.10457582620363, "learning_rate": 1.189231791106921e-07, "logps/chosen": -87.12594604492188, "logps/rejected": -92.33296966552734, "loss": 0.681, "losses/dpo": 0.7317131757736206, "losses/sft": 0.5928212404251099, "losses/total": 0.7317131757736206, "ref_logps/chosen": -82.14408111572266, "ref_logps/rejected": -86.61641693115234, "rewards/accuracies": 0.5625, "rewards/chosen": -0.49818551540374756, "rewards/margins": 0.07346992939710617, "rewards/rejected": -0.5716555118560791, "step": 916 }, { "epoch": 0.6859921451281092, "grad_norm": 69.31379595302602, "learning_rate": 1.1840712561558532e-07, "logps/chosen": -90.21749114990234, "logps/rejected": -98.60611724853516, "loss": 0.6383, "losses/dpo": 0.8536398410797119, "losses/sft": 0.9129378199577332, "losses/total": 0.8536398410797119, "ref_logps/chosen": -85.92369079589844, "ref_logps/rejected": -92.05956268310547, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4293791651725769, "rewards/margins": 0.22527654469013214, "rewards/rejected": -0.6546556949615479, "step": 917 }, { "epoch": 0.6867402281653263, "grad_norm": 68.44148377968826, "learning_rate": 1.1789184656866811e-07, "logps/chosen": -89.43386840820312, "logps/rejected": -103.04425048828125, "loss": 0.5929, "losses/dpo": 0.8087693452835083, "losses/sft": 0.5895588397979736, "losses/total": 0.8087693452835083, "ref_logps/chosen": -86.0171890258789, "ref_logps/rejected": -95.758544921875, "rewards/accuracies": 0.625, "rewards/chosen": -0.3416684865951538, "rewards/margins": 0.3869022727012634, "rewards/rejected": -0.7285707592964172, "step": 918 }, { "epoch": 0.6874883112025435, "grad_norm": 64.4431744878793, "learning_rate": 1.1737734500245225e-07, "logps/chosen": -89.24295043945312, "logps/rejected": -107.95516967773438, "loss": 0.5758, "losses/dpo": 0.6025048494338989, "losses/sft": 0.6378225088119507, "losses/total": 0.6025048494338989, "ref_logps/chosen": -85.06976318359375, "ref_logps/rejected": -99.44955444335938, "rewards/accuracies": 0.6875, "rewards/chosen": -0.41731852293014526, "rewards/margins": 0.4332442283630371, "rewards/rejected": -0.8505626916885376, "step": 919 }, { "epoch": 0.6882363942397606, "grad_norm": 95.34587186329605, "learning_rate": 1.168636239448737e-07, "logps/chosen": -107.00914001464844, "logps/rejected": -118.36149597167969, "loss": 0.7052, "losses/dpo": 0.6910490989685059, "losses/sft": 1.4612302780151367, "losses/total": 0.6910490989685059, "ref_logps/chosen": -101.49693298339844, "ref_logps/rejected": -112.04182434082031, "rewards/accuracies": 0.5, "rewards/chosen": -0.5512199401855469, "rewards/margins": 0.08074717223644257, "rewards/rejected": -0.6319670677185059, "step": 920 }, { "epoch": 0.6889844772769778, "grad_norm": 77.62020102392566, "learning_rate": 1.163506864192751e-07, "logps/chosen": -114.79393768310547, "logps/rejected": -115.82930755615234, "loss": 0.706, "losses/dpo": 0.6121622920036316, "losses/sft": 1.028231143951416, "losses/total": 0.6121622920036316, "ref_logps/chosen": -110.03482055664062, "ref_logps/rejected": -109.93075561523438, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4759118854999542, "rewards/margins": 0.11394377052783966, "rewards/rejected": -0.5898556709289551, "step": 921 }, { "epoch": 0.6897325603141948, "grad_norm": 69.18795067727399, "learning_rate": 1.1583853544438777e-07, "logps/chosen": -94.78605651855469, "logps/rejected": -99.6746826171875, "loss": 0.7021, "losses/dpo": 0.5805174708366394, "losses/sft": 0.4857397675514221, "losses/total": 0.5805174708366394, "ref_logps/chosen": -90.238037109375, "ref_logps/rejected": -94.1716079711914, "rewards/accuracies": 0.46875, "rewards/chosen": -0.4548020362854004, "rewards/margins": 0.09550591558218002, "rewards/rejected": -0.5503079891204834, "step": 922 }, { "epoch": 0.690480643351412, "grad_norm": 57.20884794747909, "learning_rate": 1.1532717403431403e-07, "logps/chosen": -91.88741302490234, "logps/rejected": -102.22807312011719, "loss": 0.5828, "losses/dpo": 0.5715733766555786, "losses/sft": 0.9209566712379456, "losses/total": 0.5715733766555786, "ref_logps/chosen": -89.22407531738281, "ref_logps/rejected": -96.23377990722656, "rewards/accuracies": 0.75, "rewards/chosen": -0.2663334012031555, "rewards/margins": 0.33309608697891235, "rewards/rejected": -0.5994294881820679, "step": 923 }, { "epoch": 0.6912287263886291, "grad_norm": 57.26648999402671, "learning_rate": 1.1481660519850969e-07, "logps/chosen": -99.96662139892578, "logps/rejected": -114.50553894042969, "loss": 0.5255, "losses/dpo": 0.649815559387207, "losses/sft": 0.8855980038642883, "losses/total": 0.649815559387207, "ref_logps/chosen": -96.35102844238281, "ref_logps/rejected": -105.1385726928711, "rewards/accuracies": 0.71875, "rewards/chosen": -0.36155885457992554, "rewards/margins": 0.5751376152038574, "rewards/rejected": -0.9366964101791382, "step": 924 }, { "epoch": 0.6919768094258463, "grad_norm": 116.08290117178329, "learning_rate": 1.1430683194176585e-07, "logps/chosen": -111.74695587158203, "logps/rejected": -122.084716796875, "loss": 0.6234, "losses/dpo": 0.6630674600601196, "losses/sft": 0.3248690068721771, "losses/total": 0.6630674600601196, "ref_logps/chosen": -107.24793243408203, "ref_logps/rejected": -114.58414459228516, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4499027729034424, "rewards/margins": 0.3001546859741211, "rewards/rejected": -0.7500575184822083, "step": 925 }, { "epoch": 0.6927248924630633, "grad_norm": 50.3028941017715, "learning_rate": 1.1379785726419161e-07, "logps/chosen": -96.22076416015625, "logps/rejected": -97.48571014404297, "loss": 0.6205, "losses/dpo": 0.7137981653213501, "losses/sft": 0.7557131052017212, "losses/total": 0.7137981653213501, "ref_logps/chosen": -92.3294906616211, "ref_logps/rejected": -91.41908264160156, "rewards/accuracies": 0.6875, "rewards/chosen": -0.38912659883499146, "rewards/margins": 0.21753650903701782, "rewards/rejected": -0.6066631078720093, "step": 926 }, { "epoch": 0.6934729755002805, "grad_norm": 66.88760549515226, "learning_rate": 1.1328968416119611e-07, "logps/chosen": -90.71990966796875, "logps/rejected": -94.01922607421875, "loss": 0.6315, "losses/dpo": 0.4527641832828522, "losses/sft": 0.8102312684059143, "losses/total": 0.4527641832828522, "ref_logps/chosen": -86.59362030029297, "ref_logps/rejected": -87.19009399414062, "rewards/accuracies": 0.59375, "rewards/chosen": -0.41262930631637573, "rewards/margins": 0.27028346061706543, "rewards/rejected": -0.6829128265380859, "step": 927 }, { "epoch": 0.6942210585374977, "grad_norm": 55.94130941877661, "learning_rate": 1.1278231562347145e-07, "logps/chosen": -93.42855072021484, "logps/rejected": -102.68055725097656, "loss": 0.6307, "losses/dpo": 0.8295868635177612, "losses/sft": 1.2467650175094604, "losses/total": 0.8295868635177612, "ref_logps/chosen": -90.44097137451172, "ref_logps/rejected": -97.1676025390625, "rewards/accuracies": 0.59375, "rewards/chosen": -0.29875802993774414, "rewards/margins": 0.25253742933273315, "rewards/rejected": -0.5512954592704773, "step": 928 }, { "epoch": 0.6949691415747148, "grad_norm": 72.02625519156307, "learning_rate": 1.1227575463697439e-07, "logps/chosen": -87.80792236328125, "logps/rejected": -95.11567687988281, "loss": 0.661, "losses/dpo": 0.5990602374076843, "losses/sft": 0.9604889154434204, "losses/total": 0.5990602374076843, "ref_logps/chosen": -82.8058853149414, "ref_logps/rejected": -88.50723266601562, "rewards/accuracies": 0.625, "rewards/chosen": -0.5002036094665527, "rewards/margins": 0.16064101457595825, "rewards/rejected": -0.6608446836471558, "step": 929 }, { "epoch": 0.695717224611932, "grad_norm": 87.46444910725442, "learning_rate": 1.1177000418290916e-07, "logps/chosen": -80.61663818359375, "logps/rejected": -98.06109619140625, "loss": 0.5852, "losses/dpo": 0.4830561578273773, "losses/sft": 0.9942502975463867, "losses/total": 0.4830561578273773, "ref_logps/chosen": -76.43595886230469, "ref_logps/rejected": -90.90336608886719, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4180683493614197, "rewards/margins": 0.2977043688297272, "rewards/rejected": -0.7157727479934692, "step": 930 }, { "epoch": 0.696465307649149, "grad_norm": 81.42055826507837, "learning_rate": 1.1126506723770995e-07, "logps/chosen": -66.68682861328125, "logps/rejected": -80.57721710205078, "loss": 0.6122, "losses/dpo": 0.6352646350860596, "losses/sft": 0.6957409381866455, "losses/total": 0.6352646350860596, "ref_logps/chosen": -61.889286041259766, "ref_logps/rejected": -71.4878921508789, "rewards/accuracies": 0.5625, "rewards/chosen": -0.47975409030914307, "rewards/margins": 0.4291788935661316, "rewards/rejected": -0.9089329838752747, "step": 931 }, { "epoch": 0.6972133906863662, "grad_norm": 61.77304964340602, "learning_rate": 1.1076094677302311e-07, "logps/chosen": -96.69369506835938, "logps/rejected": -97.6051025390625, "loss": 0.6507, "losses/dpo": 0.6483075618743896, "losses/sft": 0.8829600214958191, "losses/total": 0.6483075618743896, "ref_logps/chosen": -91.88961791992188, "ref_logps/rejected": -91.15496063232422, "rewards/accuracies": 0.625, "rewards/chosen": -0.48040759563446045, "rewards/margins": 0.1646071970462799, "rewards/rejected": -0.6450148224830627, "step": 932 }, { "epoch": 0.6979614737235833, "grad_norm": 82.98095767340398, "learning_rate": 1.1025764575569021e-07, "logps/chosen": -108.8394546508789, "logps/rejected": -111.40000915527344, "loss": 0.6806, "losses/dpo": 0.9170799851417542, "losses/sft": 0.9800631403923035, "losses/total": 0.9170799851417542, "ref_logps/chosen": -104.46269226074219, "ref_logps/rejected": -105.09199523925781, "rewards/accuracies": 0.625, "rewards/chosen": -0.4376755654811859, "rewards/margins": 0.19312575459480286, "rewards/rejected": -0.6308013200759888, "step": 933 }, { "epoch": 0.6987095567608005, "grad_norm": 82.15015266576084, "learning_rate": 1.097551671477299e-07, "logps/chosen": -89.82624053955078, "logps/rejected": -90.47570037841797, "loss": 0.6292, "losses/dpo": 0.7431254386901855, "losses/sft": 0.9907591938972473, "losses/total": 0.7431254386901855, "ref_logps/chosen": -85.29288482666016, "ref_logps/rejected": -83.62765502929688, "rewards/accuracies": 0.625, "rewards/chosen": -0.45333588123321533, "rewards/margins": 0.23146933317184448, "rewards/rejected": -0.684805154800415, "step": 934 }, { "epoch": 0.6994576397980176, "grad_norm": 58.07691676391941, "learning_rate": 1.0925351390632087e-07, "logps/chosen": -72.30995178222656, "logps/rejected": -75.90328979492188, "loss": 0.6546, "losses/dpo": 0.6227418184280396, "losses/sft": 0.7382521629333496, "losses/total": 0.6227418184280396, "ref_logps/chosen": -68.44235229492188, "ref_logps/rejected": -69.98274230957031, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3867604434490204, "rewards/margins": 0.20529451966285706, "rewards/rejected": -0.5920549631118774, "step": 935 }, { "epoch": 0.7002057228352347, "grad_norm": 470.02296943540426, "learning_rate": 1.0875268898378449e-07, "logps/chosen": -86.57674407958984, "logps/rejected": -86.04463195800781, "loss": 0.6528, "losses/dpo": 0.6046533584594727, "losses/sft": 1.1077549457550049, "losses/total": 0.6046533584594727, "ref_logps/chosen": -82.30596923828125, "ref_logps/rejected": -80.23637390136719, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4270775616168976, "rewards/margins": 0.15374812483787537, "rewards/rejected": -0.580825686454773, "step": 936 }, { "epoch": 0.7009538058724518, "grad_norm": 71.72701041328258, "learning_rate": 1.0825269532756706e-07, "logps/chosen": -80.24886322021484, "logps/rejected": -89.53234100341797, "loss": 0.5872, "losses/dpo": 0.4908347725868225, "losses/sft": 0.5085676312446594, "losses/total": 0.4908347725868225, "ref_logps/chosen": -75.6297607421875, "ref_logps/rejected": -80.61908721923828, "rewards/accuracies": 0.71875, "rewards/chosen": -0.46191006898880005, "rewards/margins": 0.4294154942035675, "rewards/rejected": -0.8913256525993347, "step": 937 }, { "epoch": 0.701701888909669, "grad_norm": 75.35950360019254, "learning_rate": 1.0775353588022315e-07, "logps/chosen": -73.13348388671875, "logps/rejected": -94.74896240234375, "loss": 0.6078, "losses/dpo": 0.5174742937088013, "losses/sft": 0.48052164912223816, "losses/total": 0.5174742937088013, "ref_logps/chosen": -69.59345245361328, "ref_logps/rejected": -88.39555358886719, "rewards/accuracies": 0.65625, "rewards/chosen": -0.35400262475013733, "rewards/margins": 0.28133824467658997, "rewards/rejected": -0.6353408694267273, "step": 938 }, { "epoch": 0.701701888909669, "eval_logps/chosen": -39.516353607177734, "eval_logps/rejected": -45.744110107421875, "eval_loss": 0.6168679594993591, "eval_losses/dpo": 0.6087402105331421, "eval_losses/sft": 0.3285222351551056, "eval_losses/total": 0.6087402105331421, "eval_ref_logps/chosen": -35.788204193115234, "eval_ref_logps/rejected": -39.235660552978516, "eval_rewards/accuracies": 0.642241358757019, "eval_rewards/chosen": -0.37281522154808044, "eval_rewards/margins": 0.2780299484729767, "eval_rewards/rejected": -0.6508451104164124, "eval_runtime": 38.1168, "eval_samples_per_second": 12.147, "eval_steps_per_second": 1.522, "step": 938 }, { "epoch": 0.7024499719468861, "grad_norm": 74.25365297726941, "learning_rate": 1.0725521357939749e-07, "logps/chosen": -92.58843994140625, "logps/rejected": -115.03865051269531, "loss": 0.5443, "losses/dpo": 0.6191145181655884, "losses/sft": 1.098022699356079, "losses/total": 0.6191145181655884, "ref_logps/chosen": -88.34347534179688, "ref_logps/rejected": -106.0360336303711, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4244961142539978, "rewards/margins": 0.4757658839225769, "rewards/rejected": -0.9002619981765747, "step": 939 }, { "epoch": 0.7031980549841033, "grad_norm": 78.50181296860933, "learning_rate": 1.0675773135780818e-07, "logps/chosen": -102.86430358886719, "logps/rejected": -105.06809997558594, "loss": 0.6441, "losses/dpo": 0.6573469638824463, "losses/sft": 0.6220128536224365, "losses/total": 0.6573469638824463, "ref_logps/chosen": -99.31413269042969, "ref_logps/rejected": -99.58451843261719, "rewards/accuracies": 0.625, "rewards/chosen": -0.3550169765949249, "rewards/margins": 0.193341463804245, "rewards/rejected": -0.5483584403991699, "step": 940 }, { "epoch": 0.7039461380213203, "grad_norm": 79.60182607271851, "learning_rate": 1.0626109214322923e-07, "logps/chosen": -76.4559326171875, "logps/rejected": -83.23793029785156, "loss": 0.6303, "losses/dpo": 0.5670258402824402, "losses/sft": 1.5826183557510376, "losses/total": 0.5670258402824402, "ref_logps/chosen": -72.4742660522461, "ref_logps/rejected": -76.96290588378906, "rewards/accuracies": 0.75, "rewards/chosen": -0.3981670141220093, "rewards/margins": 0.22933612763881683, "rewards/rejected": -0.6275031566619873, "step": 941 }, { "epoch": 0.7046942210585375, "grad_norm": 96.14519156713307, "learning_rate": 1.0576529885847363e-07, "logps/chosen": -104.78475952148438, "logps/rejected": -93.47880554199219, "loss": 0.7316, "losses/dpo": 0.5526950359344482, "losses/sft": 1.0436444282531738, "losses/total": 0.5526950359344482, "ref_logps/chosen": -99.56076049804688, "ref_logps/rejected": -87.89488983154297, "rewards/accuracies": 0.625, "rewards/chosen": -0.5224006175994873, "rewards/margins": 0.035991370677948, "rewards/rejected": -0.5583920478820801, "step": 942 }, { "epoch": 0.7054423040957546, "grad_norm": 80.357134077768, "learning_rate": 1.0527035442137564e-07, "logps/chosen": -104.2413330078125, "logps/rejected": -107.84243774414062, "loss": 0.7098, "losses/dpo": 0.4770321249961853, "losses/sft": 0.8625088334083557, "losses/total": 0.4770321249961853, "ref_logps/chosen": -98.90699005126953, "ref_logps/rejected": -101.22731018066406, "rewards/accuracies": 0.625, "rewards/chosen": -0.5334346890449524, "rewards/margins": 0.1280781775712967, "rewards/rejected": -0.6615128517150879, "step": 943 }, { "epoch": 0.7061903871329718, "grad_norm": 88.04005539215994, "learning_rate": 1.0477626174477403e-07, "logps/chosen": -101.61292266845703, "logps/rejected": -118.08647918701172, "loss": 0.6728, "losses/dpo": 0.4579921066761017, "losses/sft": 1.0059088468551636, "losses/total": 0.4579921066761017, "ref_logps/chosen": -96.62654113769531, "ref_logps/rejected": -111.5544662475586, "rewards/accuracies": 0.53125, "rewards/chosen": -0.49863871932029724, "rewards/margins": 0.15456272661685944, "rewards/rejected": -0.6532014608383179, "step": 944 }, { "epoch": 0.7069384701701888, "grad_norm": 69.94591997203011, "learning_rate": 1.0428302373649475e-07, "logps/chosen": -99.69346618652344, "logps/rejected": -107.16061401367188, "loss": 0.5855, "losses/dpo": 0.6895566582679749, "losses/sft": 1.0714055299758911, "losses/total": 0.6895566582679749, "ref_logps/chosen": -95.67269897460938, "ref_logps/rejected": -98.94403076171875, "rewards/accuracies": 0.75, "rewards/chosen": -0.4020768404006958, "rewards/margins": 0.419582337141037, "rewards/rejected": -0.8216592073440552, "step": 945 }, { "epoch": 0.707686553207406, "grad_norm": 67.10394691479466, "learning_rate": 1.0379064329933384e-07, "logps/chosen": -100.07565307617188, "logps/rejected": -117.09846496582031, "loss": 0.5487, "losses/dpo": 0.5486155152320862, "losses/sft": 1.0134385824203491, "losses/total": 0.5486155152320862, "ref_logps/chosen": -96.42488098144531, "ref_logps/rejected": -109.37775421142578, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3650766909122467, "rewards/margins": 0.40699341893196106, "rewards/rejected": -0.7720701098442078, "step": 946 }, { "epoch": 0.7084346362446231, "grad_norm": 77.26541988704291, "learning_rate": 1.0329912333104063e-07, "logps/chosen": -110.72129821777344, "logps/rejected": -112.89335632324219, "loss": 0.5801, "losses/dpo": 0.4491061568260193, "losses/sft": 0.6199053525924683, "losses/total": 0.4491061568260193, "ref_logps/chosen": -106.42683410644531, "ref_logps/rejected": -105.23750305175781, "rewards/accuracies": 0.6875, "rewards/chosen": -0.42944687604904175, "rewards/margins": 0.33613836765289307, "rewards/rejected": -0.7655852437019348, "step": 947 }, { "epoch": 0.7091827192818403, "grad_norm": 84.9786733800199, "learning_rate": 1.028084667243001e-07, "logps/chosen": -99.47093963623047, "logps/rejected": -113.368408203125, "loss": 0.5917, "losses/dpo": 0.8304334282875061, "losses/sft": 0.979951024055481, "losses/total": 0.8304334282875061, "ref_logps/chosen": -94.6408920288086, "ref_logps/rejected": -105.19464111328125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.48300498723983765, "rewards/margins": 0.3343716859817505, "rewards/rejected": -0.8173766732215881, "step": 948 }, { "epoch": 0.7099308023190574, "grad_norm": 89.96469767468656, "learning_rate": 1.0231867636671637e-07, "logps/chosen": -81.81849670410156, "logps/rejected": -99.97933197021484, "loss": 0.5384, "losses/dpo": 0.43085959553718567, "losses/sft": 0.20705662667751312, "losses/total": 0.43085959553718567, "ref_logps/chosen": -77.75130462646484, "ref_logps/rejected": -91.62901306152344, "rewards/accuracies": 0.75, "rewards/chosen": -0.40671923756599426, "rewards/margins": 0.42831259965896606, "rewards/rejected": -0.8350318670272827, "step": 949 }, { "epoch": 0.7106788853562745, "grad_norm": 61.50976605912425, "learning_rate": 1.0182975514079545e-07, "logps/chosen": -103.16618347167969, "logps/rejected": -98.66064453125, "loss": 0.5751, "losses/dpo": 0.3587545156478882, "losses/sft": 0.6092569828033447, "losses/total": 0.3587545156478882, "ref_logps/chosen": -99.32875061035156, "ref_logps/rejected": -91.09042358398438, "rewards/accuracies": 0.75, "rewards/chosen": -0.38374394178390503, "rewards/margins": 0.37327808141708374, "rewards/rejected": -0.7570220232009888, "step": 950 }, { "epoch": 0.7114269683934917, "grad_norm": 59.007324669264776, "learning_rate": 1.0134170592392836e-07, "logps/chosen": -68.79620361328125, "logps/rejected": -73.83695220947266, "loss": 0.6399, "losses/dpo": 0.7580937147140503, "losses/sft": 0.6734577417373657, "losses/total": 0.7580937147140503, "ref_logps/chosen": -65.21788024902344, "ref_logps/rejected": -68.33442687988281, "rewards/accuracies": 0.65625, "rewards/chosen": -0.35783272981643677, "rewards/margins": 0.19242005050182343, "rewards/rejected": -0.5502527952194214, "step": 951 }, { "epoch": 0.7121750514307088, "grad_norm": 75.39312380745939, "learning_rate": 1.0085453158837429e-07, "logps/chosen": -92.28594970703125, "logps/rejected": -112.37348937988281, "loss": 0.5853, "losses/dpo": 0.6823874711990356, "losses/sft": 0.5866425633430481, "losses/total": 0.6823874711990356, "ref_logps/chosen": -88.33073425292969, "ref_logps/rejected": -104.61531829833984, "rewards/accuracies": 0.6875, "rewards/chosen": -0.39552074670791626, "rewards/margins": 0.38029682636260986, "rewards/rejected": -0.7758176326751709, "step": 952 }, { "epoch": 0.712923134467926, "grad_norm": 94.37530234585262, "learning_rate": 1.003682350012435e-07, "logps/chosen": -96.34440612792969, "logps/rejected": -96.02486419677734, "loss": 0.6698, "losses/dpo": 0.7298629283905029, "losses/sft": 1.6227757930755615, "losses/total": 0.7298629283905029, "ref_logps/chosen": -91.61669158935547, "ref_logps/rejected": -90.13005065917969, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4727723002433777, "rewards/margins": 0.11670969426631927, "rewards/rejected": -0.5894819498062134, "step": 953 }, { "epoch": 0.7136712175051431, "grad_norm": 72.8320400230871, "learning_rate": 9.988281902448054e-08, "logps/chosen": -88.3904037475586, "logps/rejected": -96.00267028808594, "loss": 0.6626, "losses/dpo": 0.8188714385032654, "losses/sft": 0.801943302154541, "losses/total": 0.8188714385032654, "ref_logps/chosen": -83.91116333007812, "ref_logps/rejected": -89.82953643798828, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4479244351387024, "rewards/margins": 0.16938813030719757, "rewards/rejected": -0.6173125505447388, "step": 954 }, { "epoch": 0.7144193005423602, "grad_norm": 81.84675968399888, "learning_rate": 9.939828651484733e-08, "logps/chosen": -134.05999755859375, "logps/rejected": -124.5965805053711, "loss": 0.7739, "losses/dpo": 0.5915156602859497, "losses/sft": 1.9007251262664795, "losses/total": 0.5915156602859497, "ref_logps/chosen": -128.3988037109375, "ref_logps/rejected": -119.63256072998047, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5661199688911438, "rewards/margins": -0.06971850991249084, "rewards/rejected": -0.49640148878097534, "step": 955 }, { "epoch": 0.7151673835795773, "grad_norm": 64.03813060693496, "learning_rate": 9.89146403239067e-08, "logps/chosen": -89.00627136230469, "logps/rejected": -97.12864685058594, "loss": 0.65, "losses/dpo": 0.9478813409805298, "losses/sft": 1.6702804565429688, "losses/total": 0.9478813409805298, "ref_logps/chosen": -83.60423278808594, "ref_logps/rejected": -88.69108581542969, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5402047634124756, "rewards/margins": 0.3035515546798706, "rewards/rejected": -0.843756377696991, "step": 956 }, { "epoch": 0.7159154666167945, "grad_norm": 59.91422294687735, "learning_rate": 9.843188329800503e-08, "logps/chosen": -92.77175903320312, "logps/rejected": -92.10127258300781, "loss": 0.6242, "losses/dpo": 0.6582964062690735, "losses/sft": 0.9741860628128052, "losses/total": 0.6582964062690735, "ref_logps/chosen": -88.25143432617188, "ref_logps/rejected": -84.57859802246094, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4520327150821686, "rewards/margins": 0.30023497343063354, "rewards/rejected": -0.7522676587104797, "step": 957 }, { "epoch": 0.7166635496540116, "grad_norm": 68.56439732140586, "learning_rate": 9.795001827825595e-08, "logps/chosen": -94.72044372558594, "logps/rejected": -96.1901626586914, "loss": 0.7231, "losses/dpo": 0.800869882106781, "losses/sft": 1.0139927864074707, "losses/total": 0.800869882106781, "ref_logps/chosen": -90.29800415039062, "ref_logps/rejected": -91.73719787597656, "rewards/accuracies": 0.59375, "rewards/chosen": -0.44224390387535095, "rewards/margins": 0.003052288666367531, "rewards/rejected": -0.4452962279319763, "step": 958 }, { "epoch": 0.7174116326912288, "grad_norm": 47.48700440969825, "learning_rate": 9.74690481005234e-08, "logps/chosen": -85.14610290527344, "logps/rejected": -95.38029479980469, "loss": 0.5809, "losses/dpo": 0.5369951725006104, "losses/sft": 1.2008922100067139, "losses/total": 0.5369951725006104, "ref_logps/chosen": -81.61279296875, "ref_logps/rejected": -87.9732437133789, "rewards/accuracies": 0.6875, "rewards/chosen": -0.35333147644996643, "rewards/margins": 0.38737398386001587, "rewards/rejected": -0.7407054305076599, "step": 959 }, { "epoch": 0.7181597157284458, "grad_norm": 70.6654502416076, "learning_rate": 9.698897559540497e-08, "logps/chosen": -118.08851623535156, "logps/rejected": -119.81986236572266, "loss": 0.6665, "losses/dpo": 0.7811059951782227, "losses/sft": 1.0768506526947021, "losses/total": 0.7811059951782227, "ref_logps/chosen": -114.10249328613281, "ref_logps/rejected": -114.42371368408203, "rewards/accuracies": 0.53125, "rewards/chosen": -0.3986029624938965, "rewards/margins": 0.1410127580165863, "rewards/rejected": -0.5396157503128052, "step": 960 }, { "epoch": 0.718907798765663, "grad_norm": 66.45582636426673, "learning_rate": 9.65098035882156e-08, "logps/chosen": -98.67167663574219, "logps/rejected": -115.3238754272461, "loss": 0.58, "losses/dpo": 0.4268215298652649, "losses/sft": 1.344132900238037, "losses/total": 0.4268215298652649, "ref_logps/chosen": -94.45909118652344, "ref_logps/rejected": -107.22911834716797, "rewards/accuracies": 0.625, "rewards/chosen": -0.42125892639160156, "rewards/margins": 0.38821709156036377, "rewards/rejected": -0.8094760179519653, "step": 961 }, { "epoch": 0.7196558818028801, "grad_norm": 56.89693111932954, "learning_rate": 9.60315348989702e-08, "logps/chosen": -98.30780029296875, "logps/rejected": -112.20681762695312, "loss": 0.5671, "losses/dpo": 0.7943713068962097, "losses/sft": 0.6139572858810425, "losses/total": 0.7943713068962097, "ref_logps/chosen": -94.49311828613281, "ref_logps/rejected": -104.7347640991211, "rewards/accuracies": 0.6875, "rewards/chosen": -0.38146787881851196, "rewards/margins": 0.36573898792266846, "rewards/rejected": -0.7472069263458252, "step": 962 }, { "epoch": 0.7204039648400973, "grad_norm": 49.27564190659835, "learning_rate": 9.555417234236776e-08, "logps/chosen": -77.69087219238281, "logps/rejected": -92.51446533203125, "loss": 0.568, "losses/dpo": 0.7909185886383057, "losses/sft": 0.7699014544487, "losses/total": 0.7909185886383057, "ref_logps/chosen": -74.04238891601562, "ref_logps/rejected": -84.99803161621094, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3648490309715271, "rewards/margins": 0.38679471611976624, "rewards/rejected": -0.751643717288971, "step": 963 }, { "epoch": 0.7211520478773144, "grad_norm": 55.195042493285314, "learning_rate": 9.507771872777442e-08, "logps/chosen": -89.1159439086914, "logps/rejected": -96.03311157226562, "loss": 0.6639, "losses/dpo": 0.5111933350563049, "losses/sft": 1.0277103185653687, "losses/total": 0.5111933350563049, "ref_logps/chosen": -85.15815734863281, "ref_logps/rejected": -89.71009826660156, "rewards/accuracies": 0.53125, "rewards/chosen": -0.39577794075012207, "rewards/margins": 0.23652280867099762, "rewards/rejected": -0.6323007345199585, "step": 964 }, { "epoch": 0.7219001309145315, "grad_norm": 57.24166627888446, "learning_rate": 9.460217685920696e-08, "logps/chosen": -91.1390609741211, "logps/rejected": -104.36399841308594, "loss": 0.5783, "losses/dpo": 0.4931465685367584, "losses/sft": 0.2107858806848526, "losses/total": 0.4931465685367584, "ref_logps/chosen": -87.85647583007812, "ref_logps/rejected": -96.90055847167969, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3282585144042969, "rewards/margins": 0.41808563470840454, "rewards/rejected": -0.7463441491127014, "step": 965 }, { "epoch": 0.7226482139517486, "grad_norm": 80.67440226274032, "learning_rate": 9.412754953531663e-08, "logps/chosen": -80.66847229003906, "logps/rejected": -89.36790466308594, "loss": 0.6393, "losses/dpo": 0.5935115814208984, "losses/sft": 0.8180366158485413, "losses/total": 0.5935115814208984, "ref_logps/chosen": -75.66791534423828, "ref_logps/rejected": -81.71306610107422, "rewards/accuracies": 0.625, "rewards/chosen": -0.5000560283660889, "rewards/margins": 0.26542818546295166, "rewards/rejected": -0.7654841542243958, "step": 966 }, { "epoch": 0.7233962969889658, "grad_norm": 82.36760524501625, "learning_rate": 9.365383954937215e-08, "logps/chosen": -92.56245422363281, "logps/rejected": -103.27366638183594, "loss": 0.6622, "losses/dpo": 0.7354505062103271, "losses/sft": 1.5215747356414795, "losses/total": 0.7354505062103271, "ref_logps/chosen": -87.7088623046875, "ref_logps/rejected": -96.86590576171875, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4853586256504059, "rewards/margins": 0.15541811287403107, "rewards/rejected": -0.6407766938209534, "step": 967 }, { "epoch": 0.7241443800261829, "grad_norm": 71.99674402928143, "learning_rate": 9.318104968924358e-08, "logps/chosen": -104.11263275146484, "logps/rejected": -122.8359146118164, "loss": 0.5857, "losses/dpo": 0.7559438943862915, "losses/sft": 1.0009828805923462, "losses/total": 0.7559438943862915, "ref_logps/chosen": -99.71062469482422, "ref_logps/rejected": -115.1356201171875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.44020193815231323, "rewards/margins": 0.32982635498046875, "rewards/rejected": -0.7700282335281372, "step": 968 }, { "epoch": 0.7248924630634, "grad_norm": 85.3975846004846, "learning_rate": 9.270918273738601e-08, "logps/chosen": -90.22044372558594, "logps/rejected": -91.92231750488281, "loss": 0.6825, "losses/dpo": 0.6476396918296814, "losses/sft": 1.3503754138946533, "losses/total": 0.6476396918296814, "ref_logps/chosen": -85.10803985595703, "ref_logps/rejected": -85.733154296875, "rewards/accuracies": 0.75, "rewards/chosen": -0.5112394690513611, "rewards/margins": 0.1076778843998909, "rewards/rejected": -0.6189173460006714, "step": 969 }, { "epoch": 0.7256405461006171, "grad_norm": 52.69837061192668, "learning_rate": 9.223824147082282e-08, "logps/chosen": -81.40966033935547, "logps/rejected": -88.17707061767578, "loss": 0.5641, "losses/dpo": 0.5558119416236877, "losses/sft": 0.5560341477394104, "losses/total": 0.5558119416236877, "ref_logps/chosen": -78.75767517089844, "ref_logps/rejected": -81.92312622070312, "rewards/accuracies": 0.6875, "rewards/chosen": -0.26519837975502014, "rewards/margins": 0.36019670963287354, "rewards/rejected": -0.6253951787948608, "step": 970 }, { "epoch": 0.7263886291378343, "grad_norm": 96.01074099057398, "learning_rate": 9.176822866112987e-08, "logps/chosen": -81.12785339355469, "logps/rejected": -81.6131591796875, "loss": 0.669, "losses/dpo": 0.6059054732322693, "losses/sft": 1.0971956253051758, "losses/total": 0.6059054732322693, "ref_logps/chosen": -78.0201644897461, "ref_logps/rejected": -77.4030532836914, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3107697069644928, "rewards/margins": 0.1102408841252327, "rewards/rejected": -0.4210105836391449, "step": 971 }, { "epoch": 0.7271367121750514, "grad_norm": 47.404746879832906, "learning_rate": 9.129914707441863e-08, "logps/chosen": -87.09758758544922, "logps/rejected": -104.20289611816406, "loss": 0.5558, "losses/dpo": 0.3995906710624695, "losses/sft": 0.848328173160553, "losses/total": 0.3995906710624695, "ref_logps/chosen": -82.79769897460938, "ref_logps/rejected": -95.31553649902344, "rewards/accuracies": 0.75, "rewards/chosen": -0.4299876093864441, "rewards/margins": 0.45874881744384766, "rewards/rejected": -0.888736367225647, "step": 972 }, { "epoch": 0.7278847952122686, "grad_norm": 96.94365290403961, "learning_rate": 9.083099947132022e-08, "logps/chosen": -107.3482666015625, "logps/rejected": -124.57662200927734, "loss": 0.6234, "losses/dpo": 0.4952501952648163, "losses/sft": 0.7373408675193787, "losses/total": 0.4952501952648163, "ref_logps/chosen": -103.45287322998047, "ref_logps/rejected": -117.9139633178711, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3895387649536133, "rewards/margins": 0.2767268717288971, "rewards/rejected": -0.6662656664848328, "step": 973 }, { "epoch": 0.7286328782494856, "grad_norm": 60.83598520255751, "learning_rate": 9.036378860696903e-08, "logps/chosen": -86.00257873535156, "logps/rejected": -83.31922912597656, "loss": 0.6447, "losses/dpo": 0.33701401948928833, "losses/sft": 0.5828405618667603, "losses/total": 0.33701401948928833, "ref_logps/chosen": -81.78369140625, "ref_logps/rejected": -76.79242706298828, "rewards/accuracies": 0.625, "rewards/chosen": -0.42188823223114014, "rewards/margins": 0.23079130053520203, "rewards/rejected": -0.6526795625686646, "step": 974 }, { "epoch": 0.7293809612867028, "grad_norm": 50.435000181586695, "learning_rate": 8.989751723098682e-08, "logps/chosen": -81.49234771728516, "logps/rejected": -90.46658325195312, "loss": 0.6307, "losses/dpo": 0.5673946142196655, "losses/sft": 1.1260919570922852, "losses/total": 0.5673946142196655, "ref_logps/chosen": -77.95211791992188, "ref_logps/rejected": -84.72193908691406, "rewards/accuracies": 0.59375, "rewards/chosen": -0.354022741317749, "rewards/margins": 0.22044163942337036, "rewards/rejected": -0.5744643807411194, "step": 975 }, { "epoch": 0.73012904432392, "grad_norm": 60.379849667584686, "learning_rate": 8.943218808746602e-08, "logps/chosen": -64.19792175292969, "logps/rejected": -71.66119384765625, "loss": 0.6457, "losses/dpo": 0.5927824974060059, "losses/sft": 0.5314130783081055, "losses/total": 0.5927824974060059, "ref_logps/chosen": -60.85179138183594, "ref_logps/rejected": -66.2085952758789, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3346136808395386, "rewards/margins": 0.21064648032188416, "rewards/rejected": -0.5452601313591003, "step": 976 }, { "epoch": 0.7308771273611371, "grad_norm": 79.17902047657809, "learning_rate": 8.896780391495398e-08, "logps/chosen": -82.30524444580078, "logps/rejected": -90.56928253173828, "loss": 0.6081, "losses/dpo": 0.5142737030982971, "losses/sft": 0.9998654723167419, "losses/total": 0.5142737030982971, "ref_logps/chosen": -78.85182189941406, "ref_logps/rejected": -84.53229522705078, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3453429043292999, "rewards/margins": 0.2583563029766083, "rewards/rejected": -0.603699266910553, "step": 977 }, { "epoch": 0.7316252103983543, "grad_norm": 63.15115506527082, "learning_rate": 8.850436744643663e-08, "logps/chosen": -78.42195129394531, "logps/rejected": -84.0863037109375, "loss": 0.4665, "losses/dpo": 0.4258180856704712, "losses/sft": 0.6624420285224915, "losses/total": 0.4258180856704712, "ref_logps/chosen": -74.8340072631836, "ref_logps/rejected": -74.1183090209961, "rewards/accuracies": 0.875, "rewards/chosen": -0.3587941825389862, "rewards/margins": 0.6380045413970947, "rewards/rejected": -0.9967986941337585, "step": 978 }, { "epoch": 0.7323732934355713, "grad_norm": 60.890108634437084, "learning_rate": 8.804188140932251e-08, "logps/chosen": -91.03530883789062, "logps/rejected": -107.33451080322266, "loss": 0.5769, "losses/dpo": 0.4391547739505768, "losses/sft": 0.6462388038635254, "losses/total": 0.4391547739505768, "ref_logps/chosen": -88.4545669555664, "ref_logps/rejected": -101.30510711669922, "rewards/accuracies": 0.65625, "rewards/chosen": -0.25807368755340576, "rewards/margins": 0.3448658287525177, "rewards/rejected": -0.6029394865036011, "step": 979 }, { "epoch": 0.7331213764727885, "grad_norm": 66.44456830468572, "learning_rate": 8.758034852542684e-08, "logps/chosen": -86.22628784179688, "logps/rejected": -85.81117248535156, "loss": 0.6661, "losses/dpo": 0.6328878998756409, "losses/sft": 0.9830456972122192, "losses/total": 0.6328878998756409, "ref_logps/chosen": -81.58238983154297, "ref_logps/rejected": -79.57984924316406, "rewards/accuracies": 0.65625, "rewards/chosen": -0.46438971161842346, "rewards/margins": 0.15874265134334564, "rewards/rejected": -0.6231323480606079, "step": 980 }, { "epoch": 0.7338694595100056, "grad_norm": 75.60927795416542, "learning_rate": 8.711977151095523e-08, "logps/chosen": -94.26533508300781, "logps/rejected": -107.03697204589844, "loss": 0.6027, "losses/dpo": 0.6385862827301025, "losses/sft": 1.6770917177200317, "losses/total": 0.6385862827301025, "ref_logps/chosen": -90.62223815917969, "ref_logps/rejected": -100.41301727294922, "rewards/accuracies": 0.59375, "rewards/chosen": -0.36431071162223816, "rewards/margins": 0.2980859875679016, "rewards/rejected": -0.6623967289924622, "step": 981 }, { "epoch": 0.7346175425472228, "grad_norm": 92.72205355383626, "learning_rate": 8.666015307648783e-08, "logps/chosen": -79.80712890625, "logps/rejected": -86.32670593261719, "loss": 0.6748, "losses/dpo": 0.8978291153907776, "losses/sft": 0.494566410779953, "losses/total": 0.8978291153907776, "ref_logps/chosen": -74.74178314208984, "ref_logps/rejected": -79.25187683105469, "rewards/accuracies": 0.5, "rewards/chosen": -0.5065357089042664, "rewards/margins": 0.20094707608222961, "rewards/rejected": -0.7074827551841736, "step": 982 }, { "epoch": 0.7353656255844399, "grad_norm": 59.1720114304204, "learning_rate": 8.620149592696338e-08, "logps/chosen": -104.3839340209961, "logps/rejected": -107.8423080444336, "loss": 0.6127, "losses/dpo": 0.6859056949615479, "losses/sft": 1.863389253616333, "losses/total": 0.6859056949615479, "ref_logps/chosen": -100.00182342529297, "ref_logps/rejected": -100.72987365722656, "rewards/accuracies": 0.71875, "rewards/chosen": -0.43821072578430176, "rewards/margins": 0.2730334401130676, "rewards/rejected": -0.7112441658973694, "step": 983 }, { "epoch": 0.736113708621657, "grad_norm": 73.30956213642779, "learning_rate": 8.574380276166318e-08, "logps/chosen": -96.08053588867188, "logps/rejected": -103.93871307373047, "loss": 0.6613, "losses/dpo": 0.6156884431838989, "losses/sft": 0.9521422982215881, "losses/total": 0.6156884431838989, "ref_logps/chosen": -91.27934265136719, "ref_logps/rejected": -97.44454193115234, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4801192879676819, "rewards/margins": 0.16929763555526733, "rewards/rejected": -0.6494169235229492, "step": 984 }, { "epoch": 0.7368617916588741, "grad_norm": 60.45361887951186, "learning_rate": 8.52870762741956e-08, "logps/chosen": -95.40397644042969, "logps/rejected": -102.6895751953125, "loss": 0.5567, "losses/dpo": 0.5030571222305298, "losses/sft": 0.7702058553695679, "losses/total": 0.5030571222305298, "ref_logps/chosen": -91.33152770996094, "ref_logps/rejected": -94.20455932617188, "rewards/accuracies": 0.625, "rewards/chosen": -0.40724486112594604, "rewards/margins": 0.4412562847137451, "rewards/rejected": -0.8485010862350464, "step": 985 }, { "epoch": 0.7376098746960913, "grad_norm": 86.10860989195578, "learning_rate": 8.483131915247967e-08, "logps/chosen": -104.17076110839844, "logps/rejected": -122.96798706054688, "loss": 0.6168, "losses/dpo": 0.5660765767097473, "losses/sft": 0.5969060659408569, "losses/total": 0.5660765767097473, "ref_logps/chosen": -99.71800231933594, "ref_logps/rejected": -115.66714477539062, "rewards/accuracies": 0.71875, "rewards/chosen": -0.44527578353881836, "rewards/margins": 0.28480827808380127, "rewards/rejected": -0.7300840616226196, "step": 986 }, { "epoch": 0.7383579577333084, "grad_norm": 74.0797667386809, "learning_rate": 8.43765340787296e-08, "logps/chosen": -86.65313720703125, "logps/rejected": -93.74101257324219, "loss": 0.6363, "losses/dpo": 0.7028229236602783, "losses/sft": 1.0819555521011353, "losses/total": 0.7028229236602783, "ref_logps/chosen": -81.380859375, "ref_logps/rejected": -85.87786865234375, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5272272229194641, "rewards/margins": 0.2590871751308441, "rewards/rejected": -0.7863144278526306, "step": 987 }, { "epoch": 0.7391060407705256, "grad_norm": 66.0490459214406, "learning_rate": 8.392272372943884e-08, "logps/chosen": -99.05435180664062, "logps/rejected": -94.43911743164062, "loss": 0.6271, "losses/dpo": 0.4510901868343353, "losses/sft": 1.2138041257858276, "losses/total": 0.4510901868343353, "ref_logps/chosen": -95.29829406738281, "ref_logps/rejected": -88.58636474609375, "rewards/accuracies": 0.5625, "rewards/chosen": -0.375606894493103, "rewards/margins": 0.2096690535545349, "rewards/rejected": -0.5852759480476379, "step": 988 }, { "epoch": 0.7398541238077426, "grad_norm": 50.23153537063065, "learning_rate": 8.346989077536462e-08, "logps/chosen": -96.77127838134766, "logps/rejected": -103.9964370727539, "loss": 0.5681, "losses/dpo": 0.8807244896888733, "losses/sft": 1.4917346239089966, "losses/total": 0.8807244896888733, "ref_logps/chosen": -93.55957794189453, "ref_logps/rejected": -96.20384216308594, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3211705982685089, "rewards/margins": 0.45808732509613037, "rewards/rejected": -0.7792578935623169, "step": 989 }, { "epoch": 0.7406022068449598, "grad_norm": 74.81449943553993, "learning_rate": 8.301803788151184e-08, "logps/chosen": -92.80767822265625, "logps/rejected": -107.08785247802734, "loss": 0.5562, "losses/dpo": 0.35315456986427307, "losses/sft": 0.6789018511772156, "losses/total": 0.35315456986427307, "ref_logps/chosen": -88.84330749511719, "ref_logps/rejected": -98.98309326171875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3964374363422394, "rewards/margins": 0.4140383303165436, "rewards/rejected": -0.810475766658783, "step": 990 }, { "epoch": 0.7413502898821769, "grad_norm": 64.79970215976225, "learning_rate": 8.256716770711763e-08, "logps/chosen": -66.74786376953125, "logps/rejected": -74.37936401367188, "loss": 0.6061, "losses/dpo": 0.5506273508071899, "losses/sft": 1.1433011293411255, "losses/total": 0.5506273508071899, "ref_logps/chosen": -63.869956970214844, "ref_logps/rejected": -68.5228500366211, "rewards/accuracies": 0.65625, "rewards/chosen": -0.28779011964797974, "rewards/margins": 0.29786163568496704, "rewards/rejected": -0.5856517553329468, "step": 991 }, { "epoch": 0.7420983729193941, "grad_norm": 89.42448150339413, "learning_rate": 8.211728290563561e-08, "logps/chosen": -115.632568359375, "logps/rejected": -121.27906799316406, "loss": 0.6578, "losses/dpo": 0.5371885299682617, "losses/sft": 0.6473545432090759, "losses/total": 0.5371885299682617, "ref_logps/chosen": -110.6832046508789, "ref_logps/rejected": -114.64506530761719, "rewards/accuracies": 0.5, "rewards/chosen": -0.49493634700775146, "rewards/margins": 0.16846494376659393, "rewards/rejected": -0.6634013056755066, "step": 992 }, { "epoch": 0.7428464559566111, "grad_norm": 59.75745061285294, "learning_rate": 8.166838612472019e-08, "logps/chosen": -83.66972351074219, "logps/rejected": -95.94206237792969, "loss": 0.5477, "losses/dpo": 0.35932043194770813, "losses/sft": 0.6676799058914185, "losses/total": 0.35932043194770813, "ref_logps/chosen": -80.73318481445312, "ref_logps/rejected": -88.81767272949219, "rewards/accuracies": 0.8125, "rewards/chosen": -0.29365456104278564, "rewards/margins": 0.41878318786621094, "rewards/rejected": -0.7124376893043518, "step": 993 }, { "epoch": 0.7435945389938283, "grad_norm": 107.98490208599408, "learning_rate": 8.12204800062114e-08, "logps/chosen": -66.35089111328125, "logps/rejected": -85.45648956298828, "loss": 0.5832, "losses/dpo": 0.5278238654136658, "losses/sft": 0.196109801530838, "losses/total": 0.5278238654136658, "ref_logps/chosen": -62.728736877441406, "ref_logps/rejected": -78.68112182617188, "rewards/accuracies": 0.84375, "rewards/chosen": -0.36221590638160706, "rewards/margins": 0.31532156467437744, "rewards/rejected": -0.6775375008583069, "step": 994 }, { "epoch": 0.7443426220310454, "grad_norm": 81.0331162937678, "learning_rate": 8.07735671861188e-08, "logps/chosen": -109.33090209960938, "logps/rejected": -125.7088851928711, "loss": 0.6385, "losses/dpo": 0.7992033958435059, "losses/sft": 1.3740782737731934, "losses/total": 0.7992033958435059, "ref_logps/chosen": -104.79682922363281, "ref_logps/rejected": -119.03904724121094, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4534066319465637, "rewards/margins": 0.21357640624046326, "rewards/rejected": -0.6669830679893494, "step": 995 }, { "epoch": 0.7450907050682626, "grad_norm": 78.0580897555259, "learning_rate": 8.032765029460626e-08, "logps/chosen": -113.63278198242188, "logps/rejected": -128.903564453125, "loss": 0.5801, "losses/dpo": 0.49901536107063293, "losses/sft": 0.8164950609207153, "losses/total": 0.49901536107063293, "ref_logps/chosen": -108.43418884277344, "ref_logps/rejected": -120.09234619140625, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5198593735694885, "rewards/margins": 0.36126312613487244, "rewards/rejected": -0.8811224699020386, "step": 996 }, { "epoch": 0.7458387881054797, "grad_norm": 75.64889950755465, "learning_rate": 7.988273195597642e-08, "logps/chosen": -103.36155700683594, "logps/rejected": -116.01659393310547, "loss": 0.6111, "losses/dpo": 0.7176466584205627, "losses/sft": 1.3714088201522827, "losses/total": 0.7176466584205627, "ref_logps/chosen": -98.79507446289062, "ref_logps/rejected": -108.54293823242188, "rewards/accuracies": 0.71875, "rewards/chosen": -0.45664918422698975, "rewards/margins": 0.2907162308692932, "rewards/rejected": -0.7473654747009277, "step": 997 }, { "epoch": 0.7465868711426968, "grad_norm": 63.115653299732436, "learning_rate": 7.943881478865519e-08, "logps/chosen": -83.91370391845703, "logps/rejected": -96.88471984863281, "loss": 0.6826, "losses/dpo": 0.5743423700332642, "losses/sft": 0.9113293886184692, "losses/total": 0.5743423700332642, "ref_logps/chosen": -79.41746520996094, "ref_logps/rejected": -90.66163635253906, "rewards/accuracies": 0.625, "rewards/chosen": -0.44962388277053833, "rewards/margins": 0.1726834774017334, "rewards/rejected": -0.6223073601722717, "step": 998 }, { "epoch": 0.747334954179914, "grad_norm": 101.7346768100101, "learning_rate": 7.899590140517665e-08, "logps/chosen": -103.4684066772461, "logps/rejected": -112.18274688720703, "loss": 0.6632, "losses/dpo": 0.7285593748092651, "losses/sft": 0.5847342014312744, "losses/total": 0.7285593748092651, "ref_logps/chosen": -98.86613464355469, "ref_logps/rejected": -105.77024841308594, "rewards/accuracies": 0.59375, "rewards/chosen": -0.46022647619247437, "rewards/margins": 0.1810237318277359, "rewards/rejected": -0.6412502527236938, "step": 999 }, { "epoch": 0.7480830372171311, "grad_norm": 70.05029119721036, "learning_rate": 7.855399441216714e-08, "logps/chosen": -69.84858703613281, "logps/rejected": -79.0323715209961, "loss": 0.6049, "losses/dpo": 0.5557985901832581, "losses/sft": 0.28978872299194336, "losses/total": 0.5557985901832581, "ref_logps/chosen": -66.24164581298828, "ref_logps/rejected": -72.44877624511719, "rewards/accuracies": 0.71875, "rewards/chosen": -0.36069390177726746, "rewards/margins": 0.2976655960083008, "rewards/rejected": -0.6583595275878906, "step": 1000 }, { "epoch": 0.7488311202543483, "grad_norm": 72.14884411790703, "learning_rate": 7.81130964103304e-08, "logps/chosen": -89.40018463134766, "logps/rejected": -99.1524429321289, "loss": 0.6503, "losses/dpo": 0.6820061206817627, "losses/sft": 1.2664680480957031, "losses/total": 0.6820061206817627, "ref_logps/chosen": -85.28707885742188, "ref_logps/rejected": -93.49934387207031, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4113111197948456, "rewards/margins": 0.1539984494447708, "rewards/rejected": -0.5653095841407776, "step": 1001 }, { "epoch": 0.7495792032915654, "grad_norm": 87.61230461674643, "learning_rate": 7.767320999443183e-08, "logps/chosen": -110.92351531982422, "logps/rejected": -115.99482727050781, "loss": 0.6391, "losses/dpo": 0.5988960266113281, "losses/sft": 0.9202499389648438, "losses/total": 0.5988960266113281, "ref_logps/chosen": -105.55900573730469, "ref_logps/rejected": -108.80528259277344, "rewards/accuracies": 0.625, "rewards/chosen": -0.5364509224891663, "rewards/margins": 0.18250323832035065, "rewards/rejected": -0.7189541459083557, "step": 1002 }, { "epoch": 0.7503272863287825, "grad_norm": 96.43371044533082, "learning_rate": 7.723433775328384e-08, "logps/chosen": -74.35077667236328, "logps/rejected": -87.07794189453125, "loss": 0.5918, "losses/dpo": 0.8020116090774536, "losses/sft": 0.6002021431922913, "losses/total": 0.8020116090774536, "ref_logps/chosen": -70.48533630371094, "ref_logps/rejected": -79.9522705078125, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3865435719490051, "rewards/margins": 0.32602402567863464, "rewards/rejected": -0.7125675678253174, "step": 1003 }, { "epoch": 0.7510753693659996, "grad_norm": 57.91834912863749, "learning_rate": 7.679648226972992e-08, "logps/chosen": -105.20048522949219, "logps/rejected": -129.45004272460938, "loss": 0.5321, "losses/dpo": 0.34714120626449585, "losses/sft": 0.8363871574401855, "losses/total": 0.34714120626449585, "ref_logps/chosen": -101.0451431274414, "ref_logps/rejected": -120.25540924072266, "rewards/accuracies": 0.75, "rewards/chosen": -0.4155339002609253, "rewards/margins": 0.5039291381835938, "rewards/rejected": -0.919463038444519, "step": 1004 }, { "epoch": 0.7518234524032168, "grad_norm": 75.82245666836732, "learning_rate": 7.635964612062992e-08, "logps/chosen": -106.61898803710938, "logps/rejected": -122.29280090332031, "loss": 0.5826, "losses/dpo": 0.6798314452171326, "losses/sft": 0.742943286895752, "losses/total": 0.6798314452171326, "ref_logps/chosen": -103.52427673339844, "ref_logps/rejected": -116.08075714111328, "rewards/accuracies": 0.6875, "rewards/chosen": -0.30947166681289673, "rewards/margins": 0.3117330074310303, "rewards/rejected": -0.6212046146392822, "step": 1005 }, { "epoch": 0.7525715354404339, "grad_norm": 68.24021268540898, "learning_rate": 7.592383187684456e-08, "logps/chosen": -89.1611557006836, "logps/rejected": -78.28771209716797, "loss": 0.7147, "losses/dpo": 1.0418040752410889, "losses/sft": 1.108994483947754, "losses/total": 1.0418040752410889, "ref_logps/chosen": -83.7933578491211, "ref_logps/rejected": -71.8532485961914, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5367789268493652, "rewards/margins": 0.10666751861572266, "rewards/rejected": -0.6434464454650879, "step": 1006 }, { "epoch": 0.753319618477651, "grad_norm": 68.48352818941402, "learning_rate": 7.548904210322058e-08, "logps/chosen": -81.68315887451172, "logps/rejected": -85.50244140625, "loss": 0.6689, "losses/dpo": 0.8874818682670593, "losses/sft": 1.206020712852478, "losses/total": 0.8874818682670593, "ref_logps/chosen": -77.57335662841797, "ref_logps/rejected": -80.29820251464844, "rewards/accuracies": 0.5, "rewards/chosen": -0.4109799861907959, "rewards/margins": 0.1094435304403305, "rewards/rejected": -0.5204235315322876, "step": 1007 }, { "epoch": 0.7540677015148681, "grad_norm": 69.94423372443813, "learning_rate": 7.505527935857556e-08, "logps/chosen": -92.00767517089844, "logps/rejected": -92.43267059326172, "loss": 0.6696, "losses/dpo": 0.6555395722389221, "losses/sft": 0.6610907316207886, "losses/total": 0.6555395722389221, "ref_logps/chosen": -86.49188995361328, "ref_logps/rejected": -84.98275756835938, "rewards/accuracies": 0.625, "rewards/chosen": -0.5515780448913574, "rewards/margins": 0.19341439008712769, "rewards/rejected": -0.7449923753738403, "step": 1008 }, { "epoch": 0.7548157845520853, "grad_norm": 70.12785330830644, "learning_rate": 7.462254619568278e-08, "logps/chosen": -91.1064224243164, "logps/rejected": -99.8120346069336, "loss": 0.6228, "losses/dpo": 0.6712719202041626, "losses/sft": 0.6099578142166138, "losses/total": 0.6712719202041626, "ref_logps/chosen": -86.93667602539062, "ref_logps/rejected": -93.174072265625, "rewards/accuracies": 0.625, "rewards/chosen": -0.4169749319553375, "rewards/margins": 0.24682077765464783, "rewards/rejected": -0.6637957096099854, "step": 1009 }, { "epoch": 0.7555638675893024, "grad_norm": 83.49021903446665, "learning_rate": 7.419084516125617e-08, "logps/chosen": -102.27244567871094, "logps/rejected": -100.81736755371094, "loss": 0.7091, "losses/dpo": 0.9380107522010803, "losses/sft": 0.6370511651039124, "losses/total": 0.9380107522010803, "ref_logps/chosen": -97.64830017089844, "ref_logps/rejected": -95.25178527832031, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4624151885509491, "rewards/margins": 0.0941433385014534, "rewards/rejected": -0.5565584897994995, "step": 1010 }, { "epoch": 0.7563119506265196, "grad_norm": 103.68834455535905, "learning_rate": 7.376017879593552e-08, "logps/chosen": -93.50180053710938, "logps/rejected": -101.43135070800781, "loss": 0.6167, "losses/dpo": 0.3778122663497925, "losses/sft": 0.7615069150924683, "losses/total": 0.3778122663497925, "ref_logps/chosen": -88.82647705078125, "ref_logps/rejected": -93.96170043945312, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4675329327583313, "rewards/margins": 0.27943286299705505, "rewards/rejected": -0.7469658255577087, "step": 1011 }, { "epoch": 0.7570600336637366, "grad_norm": 74.96806723763578, "learning_rate": 7.333054963427122e-08, "logps/chosen": -112.14035034179688, "logps/rejected": -124.09134674072266, "loss": 0.678, "losses/dpo": 0.4515395164489746, "losses/sft": 0.9849974513053894, "losses/total": 0.4515395164489746, "ref_logps/chosen": -107.11947631835938, "ref_logps/rejected": -117.25135803222656, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5020882487297058, "rewards/margins": 0.1819106936454773, "rewards/rejected": -0.6839989423751831, "step": 1012 }, { "epoch": 0.7578081167009538, "grad_norm": 78.35921865734808, "learning_rate": 7.290196020470981e-08, "logps/chosen": -90.26255798339844, "logps/rejected": -87.80238342285156, "loss": 0.758, "losses/dpo": 0.7318345308303833, "losses/sft": 0.9125560522079468, "losses/total": 0.7318345308303833, "ref_logps/chosen": -85.17508697509766, "ref_logps/rejected": -82.45013427734375, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5087469220161438, "rewards/margins": 0.02647818997502327, "rewards/rejected": -0.5352251529693604, "step": 1013 }, { "epoch": 0.7585561997381709, "grad_norm": 56.87727606274861, "learning_rate": 7.247441302957857e-08, "logps/chosen": -86.05436706542969, "logps/rejected": -95.66836547851562, "loss": 0.5812, "losses/dpo": 0.5849775671958923, "losses/sft": 0.3152744770050049, "losses/total": 0.5849775671958923, "ref_logps/chosen": -82.7445068359375, "ref_logps/rejected": -88.93501281738281, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3309858739376068, "rewards/margins": 0.34234923124313354, "rewards/rejected": -0.673335075378418, "step": 1014 }, { "epoch": 0.7593042827753881, "grad_norm": 55.34356503318408, "learning_rate": 7.204791062507101e-08, "logps/chosen": -79.29975128173828, "logps/rejected": -89.11936950683594, "loss": 0.5413, "losses/dpo": 0.7767267227172852, "losses/sft": 1.0600477457046509, "losses/total": 0.7767267227172852, "ref_logps/chosen": -75.68464660644531, "ref_logps/rejected": -80.2979736328125, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3615102767944336, "rewards/margins": 0.5206283330917358, "rewards/rejected": -0.882138729095459, "step": 1015 }, { "epoch": 0.7600523658126052, "grad_norm": 53.32545006227207, "learning_rate": 7.162245550123192e-08, "logps/chosen": -90.93733978271484, "logps/rejected": -100.69717407226562, "loss": 0.5575, "losses/dpo": 0.5540570020675659, "losses/sft": 1.0219390392303467, "losses/total": 0.5540570020675659, "ref_logps/chosen": -86.56536102294922, "ref_logps/rejected": -91.30601501464844, "rewards/accuracies": 0.6875, "rewards/chosen": -0.43719804286956787, "rewards/margins": 0.5019182562828064, "rewards/rejected": -0.9391162991523743, "step": 1016 }, { "epoch": 0.7608004488498223, "grad_norm": 51.31339587452917, "learning_rate": 7.11980501619428e-08, "logps/chosen": -98.04167938232422, "logps/rejected": -106.34609985351562, "loss": 0.6164, "losses/dpo": 0.6924769282341003, "losses/sft": 1.2281851768493652, "losses/total": 0.6924769282341003, "ref_logps/chosen": -93.92728424072266, "ref_logps/rejected": -99.69718933105469, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4114391505718231, "rewards/margins": 0.25345170497894287, "rewards/rejected": -0.6648908853530884, "step": 1017 }, { "epoch": 0.7615485318870394, "grad_norm": 66.23708974895142, "learning_rate": 7.077469710490683e-08, "logps/chosen": -95.28810119628906, "logps/rejected": -105.32768249511719, "loss": 0.6518, "losses/dpo": 0.6177943348884583, "losses/sft": 1.1514954566955566, "losses/total": 0.6177943348884583, "ref_logps/chosen": -92.13168334960938, "ref_logps/rejected": -100.15909576416016, "rewards/accuracies": 0.5625, "rewards/chosen": -0.31564217805862427, "rewards/margins": 0.20121626555919647, "rewards/rejected": -0.5168584585189819, "step": 1018 }, { "epoch": 0.7622966149242566, "grad_norm": 59.715776833581714, "learning_rate": 7.035239882163435e-08, "logps/chosen": -99.90554809570312, "logps/rejected": -100.84248352050781, "loss": 0.6811, "losses/dpo": 0.45930448174476624, "losses/sft": 1.0746204853057861, "losses/total": 0.45930448174476624, "ref_logps/chosen": -95.88411712646484, "ref_logps/rejected": -95.1957778930664, "rewards/accuracies": 0.59375, "rewards/chosen": -0.402143269777298, "rewards/margins": 0.16252732276916504, "rewards/rejected": -0.5646705627441406, "step": 1019 }, { "epoch": 0.7630446979614737, "grad_norm": 72.97576224963689, "learning_rate": 6.993115779742816e-08, "logps/chosen": -110.4044189453125, "logps/rejected": -124.24600219726562, "loss": 0.5344, "losses/dpo": 0.5320330858230591, "losses/sft": 0.7725009918212891, "losses/total": 0.5320330858230591, "ref_logps/chosen": -106.26726531982422, "ref_logps/rejected": -115.08377075195312, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4137161374092102, "rewards/margins": 0.5025075674057007, "rewards/rejected": -0.9162237644195557, "step": 1020 }, { "epoch": 0.7637927809986909, "grad_norm": 92.24025829950645, "learning_rate": 6.951097651136889e-08, "logps/chosen": -101.33998107910156, "logps/rejected": -97.82330322265625, "loss": 0.7115, "losses/dpo": 1.058343529701233, "losses/sft": 1.093414306640625, "losses/total": 1.058343529701233, "ref_logps/chosen": -96.85401916503906, "ref_logps/rejected": -91.76667785644531, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4485958516597748, "rewards/margins": 0.15706613659858704, "rewards/rejected": -0.605661928653717, "step": 1021 }, { "epoch": 0.764540864035908, "grad_norm": 86.91982683658686, "learning_rate": 6.909185743630052e-08, "logps/chosen": -107.56549835205078, "logps/rejected": -116.68840789794922, "loss": 0.5861, "losses/dpo": 0.5757924914360046, "losses/sft": 0.9868168830871582, "losses/total": 0.5757924914360046, "ref_logps/chosen": -102.89581298828125, "ref_logps/rejected": -108.44178771972656, "rewards/accuracies": 0.625, "rewards/chosen": -0.4669678509235382, "rewards/margins": 0.357694536447525, "rewards/rejected": -0.8246623873710632, "step": 1022 }, { "epoch": 0.7652889470731251, "grad_norm": 81.70992811151987, "learning_rate": 6.867380303881563e-08, "logps/chosen": -119.03904724121094, "logps/rejected": -121.443603515625, "loss": 0.5911, "losses/dpo": 0.6551112532615662, "losses/sft": 1.320961833000183, "losses/total": 0.6551112532615662, "ref_logps/chosen": -115.64828491210938, "ref_logps/rejected": -114.8560562133789, "rewards/accuracies": 0.65625, "rewards/chosen": -0.33907580375671387, "rewards/margins": 0.3196791410446167, "rewards/rejected": -0.6587550044059753, "step": 1023 }, { "epoch": 0.7660370301103423, "grad_norm": 94.32292238035699, "learning_rate": 6.825681577924092e-08, "logps/chosen": -92.47276306152344, "logps/rejected": -108.95951843261719, "loss": 0.5746, "losses/dpo": 0.5790554285049438, "losses/sft": 1.1271398067474365, "losses/total": 0.5790554285049438, "ref_logps/chosen": -88.95327758789062, "ref_logps/rejected": -100.75961303710938, "rewards/accuracies": 0.65625, "rewards/chosen": -0.351949006319046, "rewards/margins": 0.468041330575943, "rewards/rejected": -0.8199903964996338, "step": 1024 }, { "epoch": 0.7667851131475594, "grad_norm": 79.50273948124034, "learning_rate": 6.784089811162288e-08, "logps/chosen": -97.55268859863281, "logps/rejected": -115.31951141357422, "loss": 0.5684, "losses/dpo": 0.8881692886352539, "losses/sft": 1.0661778450012207, "losses/total": 0.8881692886352539, "ref_logps/chosen": -94.66803741455078, "ref_logps/rejected": -108.20758819580078, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2884654700756073, "rewards/margins": 0.42272722721099854, "rewards/rejected": -0.7111926674842834, "step": 1025 }, { "epoch": 0.7675331961847766, "grad_norm": 103.81164572477375, "learning_rate": 6.742605248371319e-08, "logps/chosen": -112.34683227539062, "logps/rejected": -136.99044799804688, "loss": 0.6209, "losses/dpo": 0.5439213514328003, "losses/sft": 1.4474096298217773, "losses/total": 0.5439213514328003, "ref_logps/chosen": -107.91981506347656, "ref_logps/rejected": -130.15902709960938, "rewards/accuracies": 0.6875, "rewards/chosen": -0.44270193576812744, "rewards/margins": 0.24044016003608704, "rewards/rejected": -0.6831420660018921, "step": 1026 }, { "epoch": 0.7682812792219936, "grad_norm": 67.61582720978807, "learning_rate": 6.701228133695456e-08, "logps/chosen": -96.33879089355469, "logps/rejected": -113.12725830078125, "loss": 0.6283, "losses/dpo": 0.6340144872665405, "losses/sft": 0.7582727074623108, "losses/total": 0.6340144872665405, "ref_logps/chosen": -92.90228271484375, "ref_logps/rejected": -107.91539764404297, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3436504304409027, "rewards/margins": 0.1775357574224472, "rewards/rejected": -0.5211861729621887, "step": 1027 }, { "epoch": 0.7690293622592108, "grad_norm": 70.00759109480789, "learning_rate": 6.65995871064659e-08, "logps/chosen": -105.21048736572266, "logps/rejected": -109.6395034790039, "loss": 0.675, "losses/dpo": 0.7064166069030762, "losses/sft": 0.6358790397644043, "losses/total": 0.7064166069030762, "ref_logps/chosen": -100.4697036743164, "ref_logps/rejected": -103.369873046875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.47407859563827515, "rewards/margins": 0.1528836190700531, "rewards/rejected": -0.6269621849060059, "step": 1028 }, { "epoch": 0.7697774452964279, "grad_norm": 124.39266995609569, "learning_rate": 6.618797222102851e-08, "logps/chosen": -92.91346740722656, "logps/rejected": -106.94474792480469, "loss": 0.683, "losses/dpo": 0.5592671632766724, "losses/sft": 0.898262619972229, "losses/total": 0.5592671632766724, "ref_logps/chosen": -88.84615325927734, "ref_logps/rejected": -101.93097686767578, "rewards/accuracies": 0.59375, "rewards/chosen": -0.40673163533210754, "rewards/margins": 0.09464625269174576, "rewards/rejected": -0.5013778805732727, "step": 1029 }, { "epoch": 0.7705255283336451, "grad_norm": 91.99025892435023, "learning_rate": 6.577743910307132e-08, "logps/chosen": -91.5950927734375, "logps/rejected": -96.01811981201172, "loss": 0.5868, "losses/dpo": 0.5092297792434692, "losses/sft": 0.6808981895446777, "losses/total": 0.5092297792434692, "ref_logps/chosen": -88.18611145019531, "ref_logps/rejected": -89.66848754882812, "rewards/accuracies": 0.65625, "rewards/chosen": -0.34089773893356323, "rewards/margins": 0.2940656244754791, "rewards/rejected": -0.63496333360672, "step": 1030 }, { "epoch": 0.7712736113708621, "grad_norm": 99.09400257089234, "learning_rate": 6.536799016865713e-08, "logps/chosen": -100.369140625, "logps/rejected": -109.7389907836914, "loss": 0.6243, "losses/dpo": 0.4972114562988281, "losses/sft": 0.721799910068512, "losses/total": 0.4972114562988281, "ref_logps/chosen": -95.22123718261719, "ref_logps/rejected": -101.57746124267578, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5147906541824341, "rewards/margins": 0.3013620376586914, "rewards/rejected": -0.8161526918411255, "step": 1031 }, { "epoch": 0.7720216944080793, "grad_norm": 75.82931772296627, "learning_rate": 6.495962782746792e-08, "logps/chosen": -111.82367706298828, "logps/rejected": -118.31967163085938, "loss": 0.5912, "losses/dpo": 0.7382789254188538, "losses/sft": 1.6949158906936646, "losses/total": 0.7382789254188538, "ref_logps/chosen": -107.93275451660156, "ref_logps/rejected": -110.91831970214844, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3890916109085083, "rewards/margins": 0.3510432541370392, "rewards/rejected": -0.7401348352432251, "step": 1032 }, { "epoch": 0.7727697774452964, "grad_norm": 86.37279352000385, "learning_rate": 6.455235448279095e-08, "logps/chosen": -109.32792663574219, "logps/rejected": -110.48274230957031, "loss": 0.6603, "losses/dpo": 0.6441943645477295, "losses/sft": 0.7046254873275757, "losses/total": 0.6441943645477295, "ref_logps/chosen": -105.1324462890625, "ref_logps/rejected": -104.66371154785156, "rewards/accuracies": 0.59375, "rewards/chosen": -0.41954758763313293, "rewards/margins": 0.16235552728176117, "rewards/rejected": -0.5819031596183777, "step": 1033 }, { "epoch": 0.7735178604825136, "grad_norm": 48.98563945300325, "learning_rate": 6.41461725315045e-08, "logps/chosen": -96.23460388183594, "logps/rejected": -107.5826416015625, "loss": 0.5637, "losses/dpo": 0.5795682668685913, "losses/sft": 0.534438967704773, "losses/total": 0.5795682668685913, "ref_logps/chosen": -92.95858764648438, "ref_logps/rejected": -100.52711486816406, "rewards/accuracies": 0.6875, "rewards/chosen": -0.327600359916687, "rewards/margins": 0.37795254588127136, "rewards/rejected": -0.7055529356002808, "step": 1034 }, { "epoch": 0.7742659435197307, "grad_norm": 48.75920378049774, "learning_rate": 6.374108436406372e-08, "logps/chosen": -106.50497436523438, "logps/rejected": -106.75904846191406, "loss": 0.5821, "losses/dpo": 0.7603746652603149, "losses/sft": 0.8416817784309387, "losses/total": 0.7603746652603149, "ref_logps/chosen": -102.59258270263672, "ref_logps/rejected": -98.93065643310547, "rewards/accuracies": 0.65625, "rewards/chosen": -0.39123931527137756, "rewards/margins": 0.3915998935699463, "rewards/rejected": -0.7828392386436462, "step": 1035 }, { "epoch": 0.7750140265569478, "grad_norm": 54.68978723593078, "learning_rate": 6.333709236448692e-08, "logps/chosen": -81.23406982421875, "logps/rejected": -87.93891143798828, "loss": 0.6018, "losses/dpo": 0.43837523460388184, "losses/sft": 0.20582491159439087, "losses/total": 0.43837523460388184, "ref_logps/chosen": -77.38722229003906, "ref_logps/rejected": -79.38449096679688, "rewards/accuracies": 0.53125, "rewards/chosen": -0.38468486070632935, "rewards/margins": 0.47075650095939636, "rewards/rejected": -0.8554413318634033, "step": 1036 }, { "epoch": 0.7757621095941649, "grad_norm": 83.10663281352782, "learning_rate": 6.293419891034099e-08, "logps/chosen": -112.1369400024414, "logps/rejected": -107.77398681640625, "loss": 0.6986, "losses/dpo": 0.7493065595626831, "losses/sft": 1.016202688217163, "losses/total": 0.7493065595626831, "ref_logps/chosen": -107.74221801757812, "ref_logps/rejected": -102.6214370727539, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4394723176956177, "rewards/margins": 0.07578157633543015, "rewards/rejected": -0.5152539014816284, "step": 1037 }, { "epoch": 0.7765101926313821, "grad_norm": 65.71281272054752, "learning_rate": 6.253240637272775e-08, "logps/chosen": -106.97222900390625, "logps/rejected": -113.97898864746094, "loss": 0.7288, "losses/dpo": 0.5729687213897705, "losses/sft": 0.9969080686569214, "losses/total": 0.5729687213897705, "ref_logps/chosen": -102.195556640625, "ref_logps/rejected": -108.6382827758789, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4776668846607208, "rewards/margins": 0.05640335753560066, "rewards/rejected": -0.5340702533721924, "step": 1038 }, { "epoch": 0.7772582756685992, "grad_norm": 54.11655479954533, "learning_rate": 6.213171711626997e-08, "logps/chosen": -92.20938110351562, "logps/rejected": -110.21195983886719, "loss": 0.5868, "losses/dpo": 0.5953448414802551, "losses/sft": 0.5777280926704407, "losses/total": 0.5953448414802551, "ref_logps/chosen": -88.10272216796875, "ref_logps/rejected": -103.07279968261719, "rewards/accuracies": 0.71875, "rewards/chosen": -0.41066625714302063, "rewards/margins": 0.30324992537498474, "rewards/rejected": -0.7139161825180054, "step": 1039 }, { "epoch": 0.7780063587058164, "grad_norm": 101.62021472733291, "learning_rate": 6.173213349909728e-08, "logps/chosen": -85.96343994140625, "logps/rejected": -97.71114349365234, "loss": 0.639, "losses/dpo": 0.5023886561393738, "losses/sft": 0.6905549764633179, "losses/total": 0.5023886561393738, "ref_logps/chosen": -81.55586242675781, "ref_logps/rejected": -90.9962158203125, "rewards/accuracies": 0.625, "rewards/chosen": -0.44075721502304077, "rewards/margins": 0.2307361215353012, "rewards/rejected": -0.6714933514595032, "step": 1040 }, { "epoch": 0.7787544417430334, "grad_norm": 67.0511876757044, "learning_rate": 6.133365787283268e-08, "logps/chosen": -107.9659423828125, "logps/rejected": -119.11787414550781, "loss": 0.641, "losses/dpo": 0.46713733673095703, "losses/sft": 0.775205135345459, "losses/total": 0.46713733673095703, "ref_logps/chosen": -103.55845642089844, "ref_logps/rejected": -112.53814697265625, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4407494068145752, "rewards/margins": 0.21722356975078583, "rewards/rejected": -0.6579729318618774, "step": 1041 }, { "epoch": 0.7795025247802506, "grad_norm": 67.83479433415553, "learning_rate": 6.093629258257821e-08, "logps/chosen": -101.33431243896484, "logps/rejected": -114.38524627685547, "loss": 0.5877, "losses/dpo": 0.4735599756240845, "losses/sft": 0.971044659614563, "losses/total": 0.4735599756240845, "ref_logps/chosen": -96.52960205078125, "ref_logps/rejected": -106.68820190429688, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4804714322090149, "rewards/margins": 0.28923359513282776, "rewards/rejected": -0.7697049975395203, "step": 1042 }, { "epoch": 0.7802506078174677, "grad_norm": 47.79193855924459, "learning_rate": 6.054003996690146e-08, "logps/chosen": -80.55650329589844, "logps/rejected": -99.25193786621094, "loss": 0.5177, "losses/dpo": 0.5611035227775574, "losses/sft": 0.8866967558860779, "losses/total": 0.5611035227775574, "ref_logps/chosen": -78.51459503173828, "ref_logps/rejected": -92.16867065429688, "rewards/accuracies": 0.78125, "rewards/chosen": -0.20419079065322876, "rewards/margins": 0.5041360855102539, "rewards/rejected": -0.7083268165588379, "step": 1043 }, { "epoch": 0.7809986908546849, "grad_norm": 59.729897569012586, "learning_rate": 6.01449023578216e-08, "logps/chosen": -95.1822509765625, "logps/rejected": -101.63084411621094, "loss": 0.6316, "losses/dpo": 0.4327763020992279, "losses/sft": 1.1222854852676392, "losses/total": 0.4327763020992279, "ref_logps/chosen": -90.75390625, "ref_logps/rejected": -94.4352035522461, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4428347945213318, "rewards/margins": 0.2767282724380493, "rewards/rejected": -0.7195631265640259, "step": 1044 }, { "epoch": 0.781746773891902, "grad_norm": 63.44061019113341, "learning_rate": 5.975088208079609e-08, "logps/chosen": -79.3946304321289, "logps/rejected": -90.06181335449219, "loss": 0.637, "losses/dpo": 0.6500882506370544, "losses/sft": 0.8037538528442383, "losses/total": 0.6500882506370544, "ref_logps/chosen": -74.90873718261719, "ref_logps/rejected": -83.46295166015625, "rewards/accuracies": 0.625, "rewards/chosen": -0.4485889673233032, "rewards/margins": 0.21129702031612396, "rewards/rejected": -0.6598860025405884, "step": 1045 }, { "epoch": 0.7824948569291191, "grad_norm": 102.18054959519532, "learning_rate": 5.935798145470638e-08, "logps/chosen": -113.8867416381836, "logps/rejected": -107.81144714355469, "loss": 0.6834, "losses/dpo": 0.7469902038574219, "losses/sft": 0.7309746742248535, "losses/total": 0.7469902038574219, "ref_logps/chosen": -108.11772155761719, "ref_logps/rejected": -100.61721801757812, "rewards/accuracies": 0.53125, "rewards/chosen": -0.5769026279449463, "rewards/margins": 0.14252088963985443, "rewards/rejected": -0.7194235324859619, "step": 1046 }, { "epoch": 0.7832429399663363, "grad_norm": 80.17443548562242, "learning_rate": 5.896620279184472e-08, "logps/chosen": -108.11359405517578, "logps/rejected": -117.58694458007812, "loss": 0.6203, "losses/dpo": 0.5854874849319458, "losses/sft": 1.4133630990982056, "losses/total": 0.5854874849319458, "ref_logps/chosen": -103.68317413330078, "ref_logps/rejected": -110.65059661865234, "rewards/accuracies": 0.75, "rewards/chosen": -0.44304192066192627, "rewards/margins": 0.25059205293655396, "rewards/rejected": -0.6936339735984802, "step": 1047 }, { "epoch": 0.7839910230035534, "grad_norm": 72.76843269408054, "learning_rate": 5.857554839790033e-08, "logps/chosen": -94.1788101196289, "logps/rejected": -99.77445983886719, "loss": 0.6703, "losses/dpo": 0.541928231716156, "losses/sft": 0.5668236613273621, "losses/total": 0.541928231716156, "ref_logps/chosen": -91.0389633178711, "ref_logps/rejected": -95.13823699951172, "rewards/accuracies": 0.53125, "rewards/chosen": -0.3139844536781311, "rewards/margins": 0.1496378481388092, "rewards/rejected": -0.4636223018169403, "step": 1048 }, { "epoch": 0.7847391060407706, "grad_norm": 57.69937180254817, "learning_rate": 5.8186020571945884e-08, "logps/chosen": -63.79075241088867, "logps/rejected": -67.18778991699219, "loss": 0.7107, "losses/dpo": 0.6735530495643616, "losses/sft": 0.740562379360199, "losses/total": 0.6735530495643616, "ref_logps/chosen": -59.11504364013672, "ref_logps/rejected": -61.81911849975586, "rewards/accuracies": 0.53125, "rewards/chosen": -0.46757036447525024, "rewards/margins": 0.0692971795797348, "rewards/rejected": -0.5368675589561462, "step": 1049 }, { "epoch": 0.7854871890779876, "grad_norm": 81.95856080042257, "learning_rate": 5.779762160642418e-08, "logps/chosen": -98.32041931152344, "logps/rejected": -108.07073974609375, "loss": 0.6074, "losses/dpo": 0.44867759943008423, "losses/sft": 0.541968047618866, "losses/total": 0.44867759943008423, "ref_logps/chosen": -94.95465087890625, "ref_logps/rejected": -102.09809112548828, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3365764617919922, "rewards/margins": 0.26068809628486633, "rewards/rejected": -0.5972645878791809, "step": 1050 }, { "epoch": 0.7862352721152048, "grad_norm": 44.87540618372628, "learning_rate": 5.741035378713427e-08, "logps/chosen": -91.80799865722656, "logps/rejected": -100.53845977783203, "loss": 0.5885, "losses/dpo": 0.753753662109375, "losses/sft": 0.5949954986572266, "losses/total": 0.753753662109375, "ref_logps/chosen": -88.34529113769531, "ref_logps/rejected": -93.7988052368164, "rewards/accuracies": 0.65625, "rewards/chosen": -0.34627020359039307, "rewards/margins": 0.3276955485343933, "rewards/rejected": -0.6739657521247864, "step": 1051 }, { "epoch": 0.7869833551524219, "grad_norm": 54.24360729822454, "learning_rate": 5.7024219393218246e-08, "logps/chosen": -64.59310913085938, "logps/rejected": -66.84296417236328, "loss": 0.6182, "losses/dpo": 0.65285325050354, "losses/sft": 0.15168553590774536, "losses/total": 0.65285325050354, "ref_logps/chosen": -62.23286056518555, "ref_logps/rejected": -62.188758850097656, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2360251247882843, "rewards/margins": 0.22939518094062805, "rewards/rejected": -0.46542030572891235, "step": 1052 }, { "epoch": 0.7877314381896391, "grad_norm": 73.74996275820034, "learning_rate": 5.6639220697147826e-08, "logps/chosen": -78.4983901977539, "logps/rejected": -82.96682739257812, "loss": 0.7192, "losses/dpo": 0.7694375514984131, "losses/sft": 0.7765686511993408, "losses/total": 0.7694375514984131, "ref_logps/chosen": -73.95169067382812, "ref_logps/rejected": -78.0721206665039, "rewards/accuracies": 0.46875, "rewards/chosen": -0.4546699523925781, "rewards/margins": 0.03480175882577896, "rewards/rejected": -0.4894717335700989, "step": 1053 }, { "epoch": 0.7884795212268562, "grad_norm": 63.417883515014324, "learning_rate": 5.625535996471081e-08, "logps/chosen": -104.39308166503906, "logps/rejected": -102.14535522460938, "loss": 0.7267, "losses/dpo": 0.5292753577232361, "losses/sft": 1.3058750629425049, "losses/total": 0.5292753577232361, "ref_logps/chosen": -99.23677062988281, "ref_logps/rejected": -96.49530029296875, "rewards/accuracies": 0.46875, "rewards/chosen": -0.5156323909759521, "rewards/margins": 0.04937323182821274, "rewards/rejected": -0.5650056600570679, "step": 1054 }, { "epoch": 0.7892276042640733, "grad_norm": 75.05789710423115, "learning_rate": 5.5872639454998174e-08, "logps/chosen": -107.80365753173828, "logps/rejected": -114.95257568359375, "loss": 0.7187, "losses/dpo": 0.7487943172454834, "losses/sft": 0.7472344636917114, "losses/total": 0.7487943172454834, "ref_logps/chosen": -103.2298812866211, "ref_logps/rejected": -109.55419158935547, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4573770761489868, "rewards/margins": 0.08246117830276489, "rewards/rejected": -0.5398382544517517, "step": 1055 }, { "epoch": 0.7899756873012904, "grad_norm": 74.88794943990307, "learning_rate": 5.5491061420390174e-08, "logps/chosen": -96.5494384765625, "logps/rejected": -95.85224151611328, "loss": 0.6742, "losses/dpo": 0.7517311573028564, "losses/sft": 0.6456807851791382, "losses/total": 0.7517311573028564, "ref_logps/chosen": -92.56754302978516, "ref_logps/rejected": -90.4353256225586, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3981894850730896, "rewards/margins": 0.14350254833698273, "rewards/rejected": -0.5416920185089111, "step": 1056 }, { "epoch": 0.7907237703385076, "grad_norm": 95.98464227061977, "learning_rate": 5.511062810654349e-08, "logps/chosen": -107.55001068115234, "logps/rejected": -105.58332824707031, "loss": 0.7112, "losses/dpo": 0.6548105478286743, "losses/sft": 1.1387739181518555, "losses/total": 0.6548105478286743, "ref_logps/chosen": -102.29004669189453, "ref_logps/rejected": -99.99060821533203, "rewards/accuracies": 0.53125, "rewards/chosen": -0.5259960889816284, "rewards/margins": 0.03327678143978119, "rewards/rejected": -0.5592728853225708, "step": 1057 }, { "epoch": 0.7914718533757247, "grad_norm": 67.03720137490318, "learning_rate": 5.473134175237784e-08, "logps/chosen": -96.81930541992188, "logps/rejected": -115.07467651367188, "loss": 0.5536, "losses/dpo": 0.543090283870697, "losses/sft": 0.45889490842819214, "losses/total": 0.543090283870697, "ref_logps/chosen": -93.04784393310547, "ref_logps/rejected": -107.38002014160156, "rewards/accuracies": 0.71875, "rewards/chosen": -0.377146452665329, "rewards/margins": 0.39232051372528076, "rewards/rejected": -0.7694669961929321, "step": 1058 }, { "epoch": 0.7922199364129419, "grad_norm": 85.73761286112855, "learning_rate": 5.4353204590063125e-08, "logps/chosen": -87.00918579101562, "logps/rejected": -99.44050598144531, "loss": 0.5909, "losses/dpo": 0.4832344651222229, "losses/sft": 0.8927839994430542, "losses/total": 0.4832344651222229, "ref_logps/chosen": -83.68099975585938, "ref_logps/rejected": -92.97984313964844, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3328184485435486, "rewards/margins": 0.31324678659439087, "rewards/rejected": -0.6460652351379395, "step": 1059 }, { "epoch": 0.7929680194501589, "grad_norm": 55.71724337318246, "learning_rate": 5.3976218845005775e-08, "logps/chosen": -94.69587707519531, "logps/rejected": -102.09786987304688, "loss": 0.5495, "losses/dpo": 0.5731711387634277, "losses/sft": 0.5250707864761353, "losses/total": 0.5731711387634277, "ref_logps/chosen": -90.57020568847656, "ref_logps/rejected": -94.06102752685547, "rewards/accuracies": 0.71875, "rewards/chosen": -0.41256779432296753, "rewards/margins": 0.39111626148223877, "rewards/rejected": -0.8036840558052063, "step": 1060 }, { "epoch": 0.7937161024873761, "grad_norm": 101.58223959313378, "learning_rate": 5.3600386735836e-08, "logps/chosen": -91.72227478027344, "logps/rejected": -98.1314926147461, "loss": 0.6438, "losses/dpo": 0.5091481804847717, "losses/sft": 1.121121883392334, "losses/total": 0.5091481804847717, "ref_logps/chosen": -87.85374450683594, "ref_logps/rejected": -92.3218994140625, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3868531584739685, "rewards/margins": 0.19410669803619385, "rewards/rejected": -0.5809598565101624, "step": 1061 }, { "epoch": 0.7944641855245932, "grad_norm": 60.36712149637059, "learning_rate": 5.322571047439467e-08, "logps/chosen": -94.49606323242188, "logps/rejected": -112.84645080566406, "loss": 0.5628, "losses/dpo": 0.6066350936889648, "losses/sft": 0.3802853226661682, "losses/total": 0.6066350936889648, "ref_logps/chosen": -90.80522155761719, "ref_logps/rejected": -105.4261474609375, "rewards/accuracies": 0.8125, "rewards/chosen": -0.36908453702926636, "rewards/margins": 0.37294459342956543, "rewards/rejected": -0.742029070854187, "step": 1062 }, { "epoch": 0.7952122685618104, "grad_norm": 84.61864927683011, "learning_rate": 5.2852192265720215e-08, "logps/chosen": -59.30805206298828, "logps/rejected": -73.0915298461914, "loss": 0.5742, "losses/dpo": 0.5109359622001648, "losses/sft": 0.5165445804595947, "losses/total": 0.5109359622001648, "ref_logps/chosen": -56.97167205810547, "ref_logps/rejected": -67.55985260009766, "rewards/accuracies": 0.78125, "rewards/chosen": -0.23363783955574036, "rewards/margins": 0.31952962279319763, "rewards/rejected": -0.553167462348938, "step": 1063 }, { "epoch": 0.7959603515990274, "grad_norm": 79.77130217940349, "learning_rate": 5.2479834308035903e-08, "logps/chosen": -110.14955139160156, "logps/rejected": -116.67107391357422, "loss": 0.6392, "losses/dpo": 0.6202516555786133, "losses/sft": 1.0165596008300781, "losses/total": 0.6202516555786133, "ref_logps/chosen": -105.22299194335938, "ref_logps/rejected": -109.6368408203125, "rewards/accuracies": 0.625, "rewards/chosen": -0.4926570653915405, "rewards/margins": 0.21076688170433044, "rewards/rejected": -0.7034239172935486, "step": 1064 }, { "epoch": 0.7967084346362446, "grad_norm": 77.3393154899821, "learning_rate": 5.210863879273647e-08, "logps/chosen": -75.7182388305664, "logps/rejected": -78.96273040771484, "loss": 0.6062, "losses/dpo": 0.4306662082672119, "losses/sft": 0.32084500789642334, "losses/total": 0.4306662082672119, "ref_logps/chosen": -71.15099334716797, "ref_logps/rejected": -71.38204193115234, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4567248821258545, "rewards/margins": 0.3013445734977722, "rewards/rejected": -0.7580694556236267, "step": 1065 }, { "epoch": 0.7974565176734617, "grad_norm": 72.60974930071005, "learning_rate": 5.173860790437562e-08, "logps/chosen": -104.185791015625, "logps/rejected": -112.74453735351562, "loss": 0.5731, "losses/dpo": 0.6273537874221802, "losses/sft": 1.4866056442260742, "losses/total": 0.6273537874221802, "ref_logps/chosen": -100.82456970214844, "ref_logps/rejected": -106.02487182617188, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3361221253871918, "rewards/margins": 0.3358440101146698, "rewards/rejected": -0.6719661355018616, "step": 1066 }, { "epoch": 0.7982046007106789, "grad_norm": 89.60376678450888, "learning_rate": 5.136974382065293e-08, "logps/chosen": -88.36153411865234, "logps/rejected": -101.67266082763672, "loss": 0.6309, "losses/dpo": 0.2295730859041214, "losses/sft": 0.6593545079231262, "losses/total": 0.2295730859041214, "ref_logps/chosen": -83.36433410644531, "ref_logps/rejected": -93.13810729980469, "rewards/accuracies": 0.53125, "rewards/chosen": -0.49972012639045715, "rewards/margins": 0.35373514890670776, "rewards/rejected": -0.8534553050994873, "step": 1067 }, { "epoch": 0.798952683747896, "grad_norm": 74.36832963539513, "learning_rate": 5.10020487124011e-08, "logps/chosen": -72.70225524902344, "logps/rejected": -86.18916320800781, "loss": 0.5499, "losses/dpo": 0.5392440557479858, "losses/sft": 0.2796069085597992, "losses/total": 0.5392440557479858, "ref_logps/chosen": -68.6154556274414, "ref_logps/rejected": -77.38018798828125, "rewards/accuracies": 0.75, "rewards/chosen": -0.40868058800697327, "rewards/margins": 0.47221729159355164, "rewards/rejected": -0.8808978199958801, "step": 1068 }, { "epoch": 0.7997007667851131, "grad_norm": 101.62516170925211, "learning_rate": 5.063552474357338e-08, "logps/chosen": -109.6627197265625, "logps/rejected": -108.42852020263672, "loss": 0.6319, "losses/dpo": 0.6035622358322144, "losses/sft": 0.9169294238090515, "losses/total": 0.6035622358322144, "ref_logps/chosen": -105.66830444335938, "ref_logps/rejected": -102.04185485839844, "rewards/accuracies": 0.625, "rewards/chosen": -0.3994414806365967, "rewards/margins": 0.2392255663871765, "rewards/rejected": -0.6386670470237732, "step": 1069 }, { "epoch": 0.8004488498223303, "grad_norm": 76.57601331139372, "learning_rate": 5.0270174071230463e-08, "logps/chosen": -116.54289245605469, "logps/rejected": -126.27084350585938, "loss": 0.6906, "losses/dpo": 0.609625518321991, "losses/sft": 1.3942070007324219, "losses/total": 0.609625518321991, "ref_logps/chosen": -111.43043518066406, "ref_logps/rejected": -120.15016174316406, "rewards/accuracies": 0.5, "rewards/chosen": -0.5112460851669312, "rewards/margins": 0.10082217305898666, "rewards/rejected": -0.6120682954788208, "step": 1070 }, { "epoch": 0.8011969328595474, "grad_norm": 77.64420816863154, "learning_rate": 4.990599884552801e-08, "logps/chosen": -106.07501983642578, "logps/rejected": -104.33073425292969, "loss": 0.6549, "losses/dpo": 0.47374439239501953, "losses/sft": 0.7467329502105713, "losses/total": 0.47374439239501953, "ref_logps/chosen": -101.1135025024414, "ref_logps/rejected": -97.45083618164062, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4961521029472351, "rewards/margins": 0.19183726608753204, "rewards/rejected": -0.687989354133606, "step": 1071 }, { "epoch": 0.8019450158967646, "grad_norm": 56.592432791401244, "learning_rate": 4.9543001209703976e-08, "logps/chosen": -101.3875503540039, "logps/rejected": -101.52508544921875, "loss": 0.628, "losses/dpo": 0.6735670566558838, "losses/sft": 1.1268534660339355, "losses/total": 0.6735670566558838, "ref_logps/chosen": -97.52438354492188, "ref_logps/rejected": -95.23223876953125, "rewards/accuracies": 0.625, "rewards/chosen": -0.38631653785705566, "rewards/margins": 0.24296733736991882, "rewards/rejected": -0.6292839050292969, "step": 1072 }, { "epoch": 0.8019450158967646, "eval_logps/chosen": -39.4996452331543, "eval_logps/rejected": -45.777122497558594, "eval_loss": 0.615176260471344, "eval_losses/dpo": 0.6067831516265869, "eval_losses/sft": 0.32809561491012573, "eval_losses/total": 0.6067831516265869, "eval_ref_logps/chosen": -35.788204193115234, "eval_ref_logps/rejected": -39.235660552978516, "eval_rewards/accuracies": 0.6400862336158752, "eval_rewards/chosen": -0.37114447355270386, "eval_rewards/margins": 0.2830016314983368, "eval_rewards/rejected": -0.654146134853363, "eval_runtime": 38.1263, "eval_samples_per_second": 12.144, "eval_steps_per_second": 1.521, "step": 1072 }, { "epoch": 0.8026930989339817, "grad_norm": 81.35086929991623, "learning_rate": 4.9181183300066045e-08, "logps/chosen": -94.97135925292969, "logps/rejected": -84.58641815185547, "loss": 0.7231, "losses/dpo": 0.5987409353256226, "losses/sft": 0.6383897066116333, "losses/total": 0.5987409353256226, "ref_logps/chosen": -89.83317565917969, "ref_logps/rejected": -78.73716735839844, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5138182044029236, "rewards/margins": 0.07110674679279327, "rewards/rejected": -0.5849249362945557, "step": 1073 }, { "epoch": 0.8034411819711988, "grad_norm": 58.441922808875056, "learning_rate": 4.882054724597892e-08, "logps/chosen": -84.03362274169922, "logps/rejected": -97.62710571289062, "loss": 0.5775, "losses/dpo": 0.7452080249786377, "losses/sft": 1.2960538864135742, "losses/total": 0.7452080249786377, "ref_logps/chosen": -79.93785095214844, "ref_logps/rejected": -89.45124816894531, "rewards/accuracies": 0.6875, "rewards/chosen": -0.40957731008529663, "rewards/margins": 0.4080072045326233, "rewards/rejected": -0.8175844550132751, "step": 1074 }, { "epoch": 0.8041892650084159, "grad_norm": 110.74784328866554, "learning_rate": 4.846109516985192e-08, "logps/chosen": -82.70014953613281, "logps/rejected": -81.66864013671875, "loss": 0.6175, "losses/dpo": 0.6339119672775269, "losses/sft": 0.44855451583862305, "losses/total": 0.6339119672775269, "ref_logps/chosen": -78.1333999633789, "ref_logps/rejected": -74.7086181640625, "rewards/accuracies": 0.6875, "rewards/chosen": -0.45667558908462524, "rewards/margins": 0.23932604491710663, "rewards/rejected": -0.6960015892982483, "step": 1075 }, { "epoch": 0.8049373480456331, "grad_norm": 76.29084959344372, "learning_rate": 4.810282918712635e-08, "logps/chosen": -82.66168975830078, "logps/rejected": -99.56742095947266, "loss": 0.6068, "losses/dpo": 0.5571731925010681, "losses/sft": 0.9069884419441223, "losses/total": 0.5571731925010681, "ref_logps/chosen": -79.33191680908203, "ref_logps/rejected": -93.6165542602539, "rewards/accuracies": 0.75, "rewards/chosen": -0.3329765796661377, "rewards/margins": 0.2621096074581146, "rewards/rejected": -0.5950862169265747, "step": 1076 }, { "epoch": 0.8056854310828502, "grad_norm": 84.27219313635277, "learning_rate": 4.774575140626316e-08, "logps/chosen": -101.57394409179688, "logps/rejected": -106.0834732055664, "loss": 0.6033, "losses/dpo": 0.5265019536018372, "losses/sft": 0.6159636378288269, "losses/total": 0.5265019536018372, "ref_logps/chosen": -96.49073791503906, "ref_logps/rejected": -97.74458312988281, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5083203911781311, "rewards/margins": 0.32556837797164917, "rewards/rejected": -0.833888828754425, "step": 1077 }, { "epoch": 0.8064335141200674, "grad_norm": 58.7325394595543, "learning_rate": 4.738986392873068e-08, "logps/chosen": -90.41338348388672, "logps/rejected": -107.95455932617188, "loss": 0.635, "losses/dpo": 0.6543173789978027, "losses/sft": 1.3476097583770752, "losses/total": 0.6543173789978027, "ref_logps/chosen": -86.33336639404297, "ref_logps/rejected": -101.93058776855469, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4080018401145935, "rewards/margins": 0.19439643621444702, "rewards/rejected": -0.6023982763290405, "step": 1078 }, { "epoch": 0.8071815971572844, "grad_norm": 94.29566646582121, "learning_rate": 4.703516884899189e-08, "logps/chosen": -76.92872619628906, "logps/rejected": -64.78780364990234, "loss": 0.7677, "losses/dpo": 0.7244080305099487, "losses/sft": 1.2473118305206299, "losses/total": 0.7244080305099487, "ref_logps/chosen": -71.1082534790039, "ref_logps/rejected": -59.883018493652344, "rewards/accuracies": 0.46875, "rewards/chosen": -0.5820476412773132, "rewards/margins": -0.09156913310289383, "rewards/rejected": -0.4904784858226776, "step": 1079 }, { "epoch": 0.8079296801945016, "grad_norm": 95.4381409869, "learning_rate": 4.668166825449238e-08, "logps/chosen": -106.8064956665039, "logps/rejected": -109.89035034179688, "loss": 0.6763, "losses/dpo": 0.6602586507797241, "losses/sft": 1.095442295074463, "losses/total": 0.6602586507797241, "ref_logps/chosen": -102.43681335449219, "ref_logps/rejected": -104.4804458618164, "rewards/accuracies": 0.53125, "rewards/chosen": -0.43696701526641846, "rewards/margins": 0.10402293503284454, "rewards/rejected": -0.5409899353981018, "step": 1080 }, { "epoch": 0.8086777632317187, "grad_norm": 54.42056823966537, "learning_rate": 4.6329364225647844e-08, "logps/chosen": -105.68653869628906, "logps/rejected": -109.35784912109375, "loss": 0.6145, "losses/dpo": 0.6098994612693787, "losses/sft": 0.4401474595069885, "losses/total": 0.6098994612693787, "ref_logps/chosen": -100.73770904541016, "ref_logps/rejected": -101.22428894042969, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4948827922344208, "rewards/margins": 0.31847336888313293, "rewards/rejected": -0.8133561611175537, "step": 1081 }, { "epoch": 0.8094258462689359, "grad_norm": 151.41277781174205, "learning_rate": 4.5978258835832226e-08, "logps/chosen": -86.69976806640625, "logps/rejected": -99.63167572021484, "loss": 0.6177, "losses/dpo": 0.4993326961994171, "losses/sft": 0.6881393790245056, "losses/total": 0.4993326961994171, "ref_logps/chosen": -82.31400299072266, "ref_logps/rejected": -92.32279968261719, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4385766386985779, "rewards/margins": 0.2923104166984558, "rewards/rejected": -0.7308870553970337, "step": 1082 }, { "epoch": 0.810173929306153, "grad_norm": 73.74045079365838, "learning_rate": 4.562835415136507e-08, "logps/chosen": -99.2420883178711, "logps/rejected": -106.12223052978516, "loss": 0.5821, "losses/dpo": 0.42542240023612976, "losses/sft": 0.9958387017250061, "losses/total": 0.42542240023612976, "ref_logps/chosen": -95.26861572265625, "ref_logps/rejected": -98.96634674072266, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3973466455936432, "rewards/margins": 0.31824174523353577, "rewards/rejected": -0.7155883312225342, "step": 1083 }, { "epoch": 0.8109220123433701, "grad_norm": 65.26715430946783, "learning_rate": 4.527965223149957e-08, "logps/chosen": -102.68143463134766, "logps/rejected": -109.97267150878906, "loss": 0.6719, "losses/dpo": 0.6484699249267578, "losses/sft": 0.4245877265930176, "losses/total": 0.6484699249267578, "ref_logps/chosen": -97.36044311523438, "ref_logps/rejected": -102.95113372802734, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5320993661880493, "rewards/margins": 0.17005473375320435, "rewards/rejected": -0.7021540999412537, "step": 1084 }, { "epoch": 0.8116700953805872, "grad_norm": 76.79414505942707, "learning_rate": 4.493215512841045e-08, "logps/chosen": -79.42460632324219, "logps/rejected": -90.59722900390625, "loss": 0.6181, "losses/dpo": 0.4789654612541199, "losses/sft": 1.205140471458435, "losses/total": 0.4789654612541199, "ref_logps/chosen": -76.56993103027344, "ref_logps/rejected": -85.0374755859375, "rewards/accuracies": 0.75, "rewards/chosen": -0.28546684980392456, "rewards/margins": 0.27050840854644775, "rewards/rejected": -0.5559753179550171, "step": 1085 }, { "epoch": 0.8124181784178044, "grad_norm": 62.26682816973918, "learning_rate": 4.4585864887181776e-08, "logps/chosen": -102.46062469482422, "logps/rejected": -118.60747528076172, "loss": 0.5675, "losses/dpo": 0.7034720182418823, "losses/sft": 0.7146216630935669, "losses/total": 0.7034720182418823, "ref_logps/chosen": -98.48602294921875, "ref_logps/rejected": -110.82355499267578, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3974606990814209, "rewards/margins": 0.38093167543411255, "rewards/rejected": -0.7783923149108887, "step": 1086 }, { "epoch": 0.8131662614550215, "grad_norm": 77.25605656359141, "learning_rate": 4.4240783545795274e-08, "logps/chosen": -106.13206481933594, "logps/rejected": -123.43043518066406, "loss": 0.5653, "losses/dpo": 0.4355410635471344, "losses/sft": 0.6239435076713562, "losses/total": 0.4355410635471344, "ref_logps/chosen": -102.17520141601562, "ref_logps/rejected": -115.06063079833984, "rewards/accuracies": 0.625, "rewards/chosen": -0.3956863284111023, "rewards/margins": 0.4412946105003357, "rewards/rejected": -0.836980938911438, "step": 1087 }, { "epoch": 0.8139143444922387, "grad_norm": 70.6135148963918, "learning_rate": 4.389691313511781e-08, "logps/chosen": -84.7960205078125, "logps/rejected": -83.1664810180664, "loss": 0.5946, "losses/dpo": 0.606174647808075, "losses/sft": 1.2345696687698364, "losses/total": 0.606174647808075, "ref_logps/chosen": -81.07846069335938, "ref_logps/rejected": -76.52119445800781, "rewards/accuracies": 0.65625, "rewards/chosen": -0.37175607681274414, "rewards/margins": 0.2927728295326233, "rewards/rejected": -0.6645288467407227, "step": 1088 }, { "epoch": 0.8146624275294557, "grad_norm": 63.02967557893516, "learning_rate": 4.355425567888979e-08, "logps/chosen": -92.26737976074219, "logps/rejected": -104.60704803466797, "loss": 0.5586, "losses/dpo": 0.470695436000824, "losses/sft": 0.9050682187080383, "losses/total": 0.470695436000824, "ref_logps/chosen": -88.5831298828125, "ref_logps/rejected": -97.22747802734375, "rewards/accuracies": 0.75, "rewards/chosen": -0.36842459440231323, "rewards/margins": 0.369532972574234, "rewards/rejected": -0.7379575371742249, "step": 1089 }, { "epoch": 0.8154105105666729, "grad_norm": 74.86450008332682, "learning_rate": 4.321281319371309e-08, "logps/chosen": -75.14773559570312, "logps/rejected": -83.40277862548828, "loss": 0.6679, "losses/dpo": 0.7771933078765869, "losses/sft": 0.6676187515258789, "losses/total": 0.7771933078765869, "ref_logps/chosen": -70.9697494506836, "ref_logps/rejected": -77.79756164550781, "rewards/accuracies": 0.59375, "rewards/chosen": -0.41779905557632446, "rewards/margins": 0.14272169768810272, "rewards/rejected": -0.5605207681655884, "step": 1090 }, { "epoch": 0.81615859360389, "grad_norm": 70.78599021146263, "learning_rate": 4.287258768903948e-08, "logps/chosen": -109.89376831054688, "logps/rejected": -112.6286392211914, "loss": 0.5528, "losses/dpo": 0.4702899754047394, "losses/sft": 0.6116170287132263, "losses/total": 0.4702899754047394, "ref_logps/chosen": -106.17707061767578, "ref_logps/rejected": -104.06587982177734, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3716699182987213, "rewards/margins": 0.4846055209636688, "rewards/rejected": -0.8562755584716797, "step": 1091 }, { "epoch": 0.8169066766411072, "grad_norm": 77.66583305806532, "learning_rate": 4.2533581167158357e-08, "logps/chosen": -126.1910400390625, "logps/rejected": -113.80059814453125, "loss": 0.7235, "losses/dpo": 0.5216203927993774, "losses/sft": 1.2957059144973755, "losses/total": 0.5216203927993774, "ref_logps/chosen": -120.44058227539062, "ref_logps/rejected": -107.31217193603516, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5750458240509033, "rewards/margins": 0.0737977921962738, "rewards/rejected": -0.6488436460494995, "step": 1092 }, { "epoch": 0.8176547596783242, "grad_norm": 66.34151930434672, "learning_rate": 4.2195795623185234e-08, "logps/chosen": -80.58029174804688, "logps/rejected": -85.40762329101562, "loss": 0.7105, "losses/dpo": 0.8583267331123352, "losses/sft": 0.7293281555175781, "losses/total": 0.8583267331123352, "ref_logps/chosen": -76.5372543334961, "ref_logps/rejected": -80.88524627685547, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4043034315109253, "rewards/margins": 0.04793338105082512, "rewards/rejected": -0.4522368013858795, "step": 1093 }, { "epoch": 0.8184028427155414, "grad_norm": 73.35596914621948, "learning_rate": 4.185923304504996e-08, "logps/chosen": -97.21061706542969, "logps/rejected": -100.73197174072266, "loss": 0.647, "losses/dpo": 0.478425532579422, "losses/sft": 0.5571839809417725, "losses/total": 0.478425532579422, "ref_logps/chosen": -93.4908447265625, "ref_logps/rejected": -95.21526336669922, "rewards/accuracies": 0.59375, "rewards/chosen": -0.37197792530059814, "rewards/margins": 0.17969387769699097, "rewards/rejected": -0.5516718029975891, "step": 1094 }, { "epoch": 0.8191509257527586, "grad_norm": 58.40328035515103, "learning_rate": 4.152389541348494e-08, "logps/chosen": -94.23403930664062, "logps/rejected": -103.5373306274414, "loss": 0.6308, "losses/dpo": 0.48351743817329407, "losses/sft": 0.7311797142028809, "losses/total": 0.48351743817329407, "ref_logps/chosen": -90.38697814941406, "ref_logps/rejected": -97.73787689208984, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3847062587738037, "rewards/margins": 0.19523946940898895, "rewards/rejected": -0.5799456834793091, "step": 1095 }, { "epoch": 0.8198990087899757, "grad_norm": 68.2129327632744, "learning_rate": 4.1189784702013784e-08, "logps/chosen": -96.92897033691406, "logps/rejected": -103.84425354003906, "loss": 0.6001, "losses/dpo": 0.42799001932144165, "losses/sft": 0.9604641795158386, "losses/total": 0.42799001932144165, "ref_logps/chosen": -93.23411560058594, "ref_logps/rejected": -97.05563354492188, "rewards/accuracies": 0.65625, "rewards/chosen": -0.369486540555954, "rewards/margins": 0.3093754053115845, "rewards/rejected": -0.6788619160652161, "step": 1096 }, { "epoch": 0.8206470918271929, "grad_norm": 72.3473060079986, "learning_rate": 4.085690287693916e-08, "logps/chosen": -93.28019714355469, "logps/rejected": -106.07257843017578, "loss": 0.6173, "losses/dpo": 0.6693598031997681, "losses/sft": 1.0620605945587158, "losses/total": 0.6693598031997681, "ref_logps/chosen": -89.23619079589844, "ref_logps/rejected": -99.14531707763672, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4044008255004883, "rewards/margins": 0.2883254885673523, "rewards/rejected": -0.6927263736724854, "step": 1097 }, { "epoch": 0.8213951748644099, "grad_norm": 93.24290725692302, "learning_rate": 4.052525189733166e-08, "logps/chosen": -88.6192626953125, "logps/rejected": -92.65288543701172, "loss": 0.7055, "losses/dpo": 0.7005810141563416, "losses/sft": 0.3891492187976837, "losses/total": 0.7005810141563416, "ref_logps/chosen": -83.7149429321289, "ref_logps/rejected": -86.96125793457031, "rewards/accuracies": 0.40625, "rewards/chosen": -0.49043142795562744, "rewards/margins": 0.07873179018497467, "rewards/rejected": -0.5691632032394409, "step": 1098 }, { "epoch": 0.8221432579016271, "grad_norm": 55.27362509975342, "learning_rate": 4.0194833715018025e-08, "logps/chosen": -83.96549224853516, "logps/rejected": -106.64033508300781, "loss": 0.5734, "losses/dpo": 0.5840281248092651, "losses/sft": 1.2166422605514526, "losses/total": 0.5840281248092651, "ref_logps/chosen": -80.93157958984375, "ref_logps/rejected": -99.95775604248047, "rewards/accuracies": 0.75, "rewards/chosen": -0.3033914864063263, "rewards/margins": 0.36486655473709106, "rewards/rejected": -0.668258011341095, "step": 1099 }, { "epoch": 0.8228913409388442, "grad_norm": 67.79176436848822, "learning_rate": 3.9865650274569954e-08, "logps/chosen": -116.33053588867188, "logps/rejected": -129.295654296875, "loss": 0.5836, "losses/dpo": 0.558629035949707, "losses/sft": 1.2512686252593994, "losses/total": 0.558629035949707, "ref_logps/chosen": -113.05713653564453, "ref_logps/rejected": -122.72959899902344, "rewards/accuracies": 0.65625, "rewards/chosen": -0.32734084129333496, "rewards/margins": 0.3292643129825592, "rewards/rejected": -0.6566051244735718, "step": 1100 }, { "epoch": 0.8236394239760614, "grad_norm": 67.7642114943522, "learning_rate": 3.953770351329227e-08, "logps/chosen": -66.98492431640625, "logps/rejected": -87.38168334960938, "loss": 0.6191, "losses/dpo": 0.5885561108589172, "losses/sft": 0.810598611831665, "losses/total": 0.5885561108589172, "ref_logps/chosen": -63.16120147705078, "ref_logps/rejected": -80.55136108398438, "rewards/accuracies": 0.625, "rewards/chosen": -0.38237205147743225, "rewards/margins": 0.3006594181060791, "rewards/rejected": -0.683031439781189, "step": 1101 }, { "epoch": 0.8243875070132785, "grad_norm": 79.57227017930327, "learning_rate": 3.9210995361211835e-08, "logps/chosen": -100.87260437011719, "logps/rejected": -115.896240234375, "loss": 0.6306, "losses/dpo": 0.47589507699012756, "losses/sft": 1.1687517166137695, "losses/total": 0.47589507699012756, "ref_logps/chosen": -96.73904418945312, "ref_logps/rejected": -109.11121368408203, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4133557677268982, "rewards/margins": 0.2651479244232178, "rewards/rejected": -0.678503692150116, "step": 1102 }, { "epoch": 0.8251355900504956, "grad_norm": 69.08120576399293, "learning_rate": 3.888552774106596e-08, "logps/chosen": -84.4455795288086, "logps/rejected": -92.23174285888672, "loss": 0.6722, "losses/dpo": 0.7034687995910645, "losses/sft": 1.085005283355713, "losses/total": 0.7034687995910645, "ref_logps/chosen": -79.77639770507812, "ref_logps/rejected": -85.11955261230469, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4669187068939209, "rewards/margins": 0.24430036544799805, "rewards/rejected": -0.7112191319465637, "step": 1103 }, { "epoch": 0.8258836730877127, "grad_norm": 52.70021093858346, "learning_rate": 3.8561302568291207e-08, "logps/chosen": -83.05165100097656, "logps/rejected": -93.15257263183594, "loss": 0.6035, "losses/dpo": 0.693987250328064, "losses/sft": 0.31116706132888794, "losses/total": 0.693987250328064, "ref_logps/chosen": -78.94000244140625, "ref_logps/rejected": -85.71685791015625, "rewards/accuracies": 0.59375, "rewards/chosen": -0.411164790391922, "rewards/margins": 0.33240649104118347, "rewards/rejected": -0.7435712814331055, "step": 1104 }, { "epoch": 0.8266317561249299, "grad_norm": 49.93429694568004, "learning_rate": 3.82383217510123e-08, "logps/chosen": -91.09173583984375, "logps/rejected": -109.5943832397461, "loss": 0.5923, "losses/dpo": 0.5987730026245117, "losses/sft": 0.5703917741775513, "losses/total": 0.5987730026245117, "ref_logps/chosen": -87.352783203125, "ref_logps/rejected": -102.68049621582031, "rewards/accuracies": 0.625, "rewards/chosen": -0.373896062374115, "rewards/margins": 0.31749317049980164, "rewards/rejected": -0.6913892030715942, "step": 1105 }, { "epoch": 0.827379839162147, "grad_norm": 69.0708594324821, "learning_rate": 3.7916587190030464e-08, "logps/chosen": -113.4881591796875, "logps/rejected": -116.32492065429688, "loss": 0.6198, "losses/dpo": 0.722990870475769, "losses/sft": 0.5421802997589111, "losses/total": 0.722990870475769, "ref_logps/chosen": -109.3351821899414, "ref_logps/rejected": -109.956787109375, "rewards/accuracies": 0.625, "rewards/chosen": -0.41529715061187744, "rewards/margins": 0.2215169221162796, "rewards/rejected": -0.6368141174316406, "step": 1106 }, { "epoch": 0.8281279221993642, "grad_norm": 65.81028432277164, "learning_rate": 3.759610077881259e-08, "logps/chosen": -76.41390228271484, "logps/rejected": -89.26223754882812, "loss": 0.6331, "losses/dpo": 0.6481855511665344, "losses/sft": 0.559194028377533, "losses/total": 0.6481855511665344, "ref_logps/chosen": -73.1112060546875, "ref_logps/rejected": -83.7293701171875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3302696943283081, "rewards/margins": 0.22301596403121948, "rewards/rejected": -0.5532856583595276, "step": 1107 }, { "epoch": 0.8288760052365812, "grad_norm": 52.075370983573244, "learning_rate": 3.727686440347991e-08, "logps/chosen": -85.6903305053711, "logps/rejected": -97.22527313232422, "loss": 0.5686, "losses/dpo": 0.4838050603866577, "losses/sft": 0.9236255884170532, "losses/total": 0.4838050603866577, "ref_logps/chosen": -82.43492126464844, "ref_logps/rejected": -90.28751373291016, "rewards/accuracies": 0.6875, "rewards/chosen": -0.32554006576538086, "rewards/margins": 0.36823567748069763, "rewards/rejected": -0.6937757730484009, "step": 1108 }, { "epoch": 0.8296240882737984, "grad_norm": 133.60981889779694, "learning_rate": 3.6958879942797114e-08, "logps/chosen": -94.16575622558594, "logps/rejected": -91.43103790283203, "loss": 0.6806, "losses/dpo": 0.7754580974578857, "losses/sft": 0.6095079779624939, "losses/total": 0.7754580974578857, "ref_logps/chosen": -89.62761688232422, "ref_logps/rejected": -85.6177749633789, "rewards/accuracies": 0.5625, "rewards/chosen": -0.45381444692611694, "rewards/margins": 0.1275121569633484, "rewards/rejected": -0.5813266038894653, "step": 1109 }, { "epoch": 0.8303721713110155, "grad_norm": 81.11922244995333, "learning_rate": 3.6642149268161005e-08, "logps/chosen": -101.45236206054688, "logps/rejected": -111.53697204589844, "loss": 0.5839, "losses/dpo": 0.6592105031013489, "losses/sft": 0.6112967133522034, "losses/total": 0.6592105031013489, "ref_logps/chosen": -97.49604797363281, "ref_logps/rejected": -104.12814331054688, "rewards/accuracies": 0.71875, "rewards/chosen": -0.39563101530075073, "rewards/margins": 0.34525182843208313, "rewards/rejected": -0.7408828735351562, "step": 1110 }, { "epoch": 0.8311202543482327, "grad_norm": 69.90919463189081, "learning_rate": 3.632667424358965e-08, "logps/chosen": -105.75830841064453, "logps/rejected": -117.47185516357422, "loss": 0.5987, "losses/dpo": 0.6054035425186157, "losses/sft": 1.2080955505371094, "losses/total": 0.6054035425186157, "ref_logps/chosen": -101.15011596679688, "ref_logps/rejected": -109.72505950927734, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4608197808265686, "rewards/margins": 0.3138594627380371, "rewards/rejected": -0.7746792435646057, "step": 1111 }, { "epoch": 0.8318683373854497, "grad_norm": 72.22518769395312, "learning_rate": 3.601245672571143e-08, "logps/chosen": -88.4651107788086, "logps/rejected": -98.90802764892578, "loss": 0.6048, "losses/dpo": 0.6995025873184204, "losses/sft": 0.5877116918563843, "losses/total": 0.6995025873184204, "ref_logps/chosen": -83.65252685546875, "ref_logps/rejected": -90.94853973388672, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4812582731246948, "rewards/margins": 0.3146902620792389, "rewards/rejected": -0.7959485650062561, "step": 1112 }, { "epoch": 0.8326164204226669, "grad_norm": 84.01858201444644, "learning_rate": 3.569949856375393e-08, "logps/chosen": -104.42627716064453, "logps/rejected": -107.17235565185547, "loss": 0.6599, "losses/dpo": 0.6388543844223022, "losses/sft": 0.47329288721084595, "losses/total": 0.6388543844223022, "ref_logps/chosen": -100.91382598876953, "ref_logps/rejected": -102.11565399169922, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3512451648712158, "rewards/margins": 0.15442442893981934, "rewards/rejected": -0.5056695938110352, "step": 1113 }, { "epoch": 0.833364503459884, "grad_norm": 82.3537622607996, "learning_rate": 3.538780159953347e-08, "logps/chosen": -74.94674682617188, "logps/rejected": -83.75489044189453, "loss": 0.6089, "losses/dpo": 0.5794873237609863, "losses/sft": 0.38139480352401733, "losses/total": 0.5794873237609863, "ref_logps/chosen": -70.81851196289062, "ref_logps/rejected": -76.54469299316406, "rewards/accuracies": 0.75, "rewards/chosen": -0.412822961807251, "rewards/margins": 0.3081958293914795, "rewards/rejected": -0.7210188508033752, "step": 1114 }, { "epoch": 0.8341125864971012, "grad_norm": 96.887046192983, "learning_rate": 3.507736766744376e-08, "logps/chosen": -100.86360931396484, "logps/rejected": -106.59542846679688, "loss": 0.6068, "losses/dpo": 0.7685271501541138, "losses/sft": 1.492434024810791, "losses/total": 0.7685271501541138, "ref_logps/chosen": -97.3539810180664, "ref_logps/rejected": -99.59307861328125, "rewards/accuracies": 0.71875, "rewards/chosen": -0.35096287727355957, "rewards/margins": 0.3492714762687683, "rewards/rejected": -0.7002343535423279, "step": 1115 }, { "epoch": 0.8348606695343183, "grad_norm": 61.72420452496826, "learning_rate": 3.476819859444538e-08, "logps/chosen": -107.64030456542969, "logps/rejected": -114.44557189941406, "loss": 0.6712, "losses/dpo": 0.5542910099029541, "losses/sft": 0.8711963891983032, "losses/total": 0.5542910099029541, "ref_logps/chosen": -101.88948822021484, "ref_logps/rejected": -106.64985656738281, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5750818252563477, "rewards/margins": 0.20449009537696838, "rewards/rejected": -0.7795719504356384, "step": 1116 }, { "epoch": 0.8356087525715354, "grad_norm": 109.24938748185792, "learning_rate": 3.4460296200054975e-08, "logps/chosen": -109.87751770019531, "logps/rejected": -109.65902709960938, "loss": 0.7333, "losses/dpo": 1.1011136770248413, "losses/sft": 1.1937294006347656, "losses/total": 1.1011136770248413, "ref_logps/chosen": -103.67094421386719, "ref_logps/rejected": -102.95014953613281, "rewards/accuracies": 0.46875, "rewards/chosen": -0.6206570863723755, "rewards/margins": 0.05023185908794403, "rewards/rejected": -0.6708889007568359, "step": 1117 }, { "epoch": 0.8363568356087526, "grad_norm": 97.84578065035129, "learning_rate": 3.4153662296334676e-08, "logps/chosen": -89.03369140625, "logps/rejected": -109.23141479492188, "loss": 0.5816, "losses/dpo": 0.65996915102005, "losses/sft": 1.017493486404419, "losses/total": 0.65996915102005, "ref_logps/chosen": -84.95965576171875, "ref_logps/rejected": -101.18563079833984, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4074046313762665, "rewards/margins": 0.3971736431121826, "rewards/rejected": -0.8045783042907715, "step": 1118 }, { "epoch": 0.8371049186459697, "grad_norm": 81.49078183611613, "learning_rate": 3.3848298687881136e-08, "logps/chosen": -87.49349975585938, "logps/rejected": -95.74250030517578, "loss": 0.6403, "losses/dpo": 0.7775467038154602, "losses/sft": 1.4247559309005737, "losses/total": 0.7775467038154602, "ref_logps/chosen": -82.2662353515625, "ref_logps/rejected": -88.41285705566406, "rewards/accuracies": 0.625, "rewards/chosen": -0.5227270126342773, "rewards/margins": 0.21023711562156677, "rewards/rejected": -0.7329641580581665, "step": 1119 }, { "epoch": 0.8378530016831869, "grad_norm": 94.11324663477863, "learning_rate": 3.354420717181522e-08, "logps/chosen": -109.89578247070312, "logps/rejected": -117.68363952636719, "loss": 0.6293, "losses/dpo": 0.6375395059585571, "losses/sft": 1.1761213541030884, "losses/total": 0.6375395059585571, "ref_logps/chosen": -105.76811218261719, "ref_logps/rejected": -111.17877960205078, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4127661883831024, "rewards/margins": 0.23771992325782776, "rewards/rejected": -0.6504861116409302, "step": 1120 }, { "epoch": 0.838601084720404, "grad_norm": 64.8085671154016, "learning_rate": 3.324138953777117e-08, "logps/chosen": -90.02445983886719, "logps/rejected": -92.56121063232422, "loss": 0.6613, "losses/dpo": 0.640059232711792, "losses/sft": 1.1902602910995483, "losses/total": 0.640059232711792, "ref_logps/chosen": -85.85026550292969, "ref_logps/rejected": -86.93789672851562, "rewards/accuracies": 0.59375, "rewards/chosen": -0.41741910576820374, "rewards/margins": 0.14491330087184906, "rewards/rejected": -0.5623323917388916, "step": 1121 }, { "epoch": 0.8393491677576211, "grad_norm": 71.14277013958302, "learning_rate": 3.29398475678864e-08, "logps/chosen": -111.824951171875, "logps/rejected": -117.24454498291016, "loss": 0.6621, "losses/dpo": 0.5515626668930054, "losses/sft": 0.8536291718482971, "losses/total": 0.5515626668930054, "ref_logps/chosen": -106.62355041503906, "ref_logps/rejected": -110.38106536865234, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5201402902603149, "rewards/margins": 0.16620796918869019, "rewards/rejected": -0.6863481998443604, "step": 1122 }, { "epoch": 0.8400972507948382, "grad_norm": 54.88700599804292, "learning_rate": 3.263958303679057e-08, "logps/chosen": -91.23989868164062, "logps/rejected": -108.99897003173828, "loss": 0.5938, "losses/dpo": 0.7433619499206543, "losses/sft": 0.9011365175247192, "losses/total": 0.7433619499206543, "ref_logps/chosen": -87.34468078613281, "ref_logps/rejected": -101.37701416015625, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3895218074321747, "rewards/margins": 0.37267404794692993, "rewards/rejected": -0.762195885181427, "step": 1123 }, { "epoch": 0.8408453338320554, "grad_norm": 63.49493879137905, "learning_rate": 3.234059771159556e-08, "logps/chosen": -100.90934753417969, "logps/rejected": -111.4334716796875, "loss": 0.6477, "losses/dpo": 0.5764752626419067, "losses/sft": 0.6418586373329163, "losses/total": 0.5764752626419067, "ref_logps/chosen": -96.98014068603516, "ref_logps/rejected": -105.87957763671875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3929208815097809, "rewards/margins": 0.16246992349624634, "rewards/rejected": -0.5553908348083496, "step": 1124 }, { "epoch": 0.8415934168692725, "grad_norm": 71.09678827857017, "learning_rate": 3.2042893351884783e-08, "logps/chosen": -115.41576385498047, "logps/rejected": -113.395263671875, "loss": 0.7059, "losses/dpo": 0.8953909873962402, "losses/sft": 0.9754602313041687, "losses/total": 0.8953909873962402, "ref_logps/chosen": -110.19416809082031, "ref_logps/rejected": -107.05018615722656, "rewards/accuracies": 0.46875, "rewards/chosen": -0.5221593379974365, "rewards/margins": 0.11234819144010544, "rewards/rejected": -0.6345075368881226, "step": 1125 }, { "epoch": 0.8423414999064897, "grad_norm": 75.89019736314204, "learning_rate": 3.174647170970296e-08, "logps/chosen": -105.60362243652344, "logps/rejected": -107.98330688476562, "loss": 0.5608, "losses/dpo": 0.5649166107177734, "losses/sft": 0.4450972378253937, "losses/total": 0.5649166107177734, "ref_logps/chosen": -102.36888122558594, "ref_logps/rejected": -100.28034973144531, "rewards/accuracies": 0.75, "rewards/chosen": -0.3234744966030121, "rewards/margins": 0.44682055711746216, "rewards/rejected": -0.7702950239181519, "step": 1126 }, { "epoch": 0.8430895829437067, "grad_norm": 55.13186329438975, "learning_rate": 3.145133452954585e-08, "logps/chosen": -70.80465698242188, "logps/rejected": -81.47305297851562, "loss": 0.5899, "losses/dpo": 0.596772313117981, "losses/sft": 0.8159319758415222, "losses/total": 0.596772313117981, "ref_logps/chosen": -67.36268615722656, "ref_logps/rejected": -74.81404113769531, "rewards/accuracies": 0.625, "rewards/chosen": -0.3441975712776184, "rewards/margins": 0.32170361280441284, "rewards/rejected": -0.665901243686676, "step": 1127 }, { "epoch": 0.8438376659809239, "grad_norm": 73.0924600044572, "learning_rate": 3.1157483548349824e-08, "logps/chosen": -93.08203887939453, "logps/rejected": -100.78540802001953, "loss": 0.6182, "losses/dpo": 0.6675162315368652, "losses/sft": 1.1752870082855225, "losses/total": 0.6675162315368652, "ref_logps/chosen": -88.34362030029297, "ref_logps/rejected": -92.9242172241211, "rewards/accuracies": 0.65625, "rewards/chosen": -0.47384220361709595, "rewards/margins": 0.3122768998146057, "rewards/rejected": -0.7861190438270569, "step": 1128 }, { "epoch": 0.844585749018141, "grad_norm": 74.18400286536126, "learning_rate": 3.086492049548178e-08, "logps/chosen": -98.4913101196289, "logps/rejected": -110.99407196044922, "loss": 0.5991, "losses/dpo": 0.5986804962158203, "losses/sft": 1.3908436298370361, "losses/total": 0.5986804962158203, "ref_logps/chosen": -94.24397277832031, "ref_logps/rejected": -103.57630920410156, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4247336685657501, "rewards/margins": 0.31704288721084595, "rewards/rejected": -0.7417765855789185, "step": 1129 }, { "epoch": 0.8453338320553582, "grad_norm": 112.51921100620544, "learning_rate": 3.057364709272886e-08, "logps/chosen": -108.24406433105469, "logps/rejected": -106.16944122314453, "loss": 0.6709, "losses/dpo": 0.7350795269012451, "losses/sft": 1.4501315355300903, "losses/total": 0.7350795269012451, "ref_logps/chosen": -103.32905578613281, "ref_logps/rejected": -99.8653564453125, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4915008246898651, "rewards/margins": 0.13890817761421204, "rewards/rejected": -0.6304090023040771, "step": 1130 }, { "epoch": 0.8460819150925752, "grad_norm": 68.69780170083037, "learning_rate": 3.028366505428856e-08, "logps/chosen": -80.54450988769531, "logps/rejected": -103.29612731933594, "loss": 0.6025, "losses/dpo": 0.8235004544258118, "losses/sft": 1.133596658706665, "losses/total": 0.8235004544258118, "ref_logps/chosen": -75.99964904785156, "ref_logps/rejected": -95.76078033447266, "rewards/accuracies": 0.625, "rewards/chosen": -0.4544859230518341, "rewards/margins": 0.29904842376708984, "rewards/rejected": -0.7535344362258911, "step": 1131 }, { "epoch": 0.8468299981297924, "grad_norm": 157.68592870581827, "learning_rate": 2.999497608675827e-08, "logps/chosen": -87.0016098022461, "logps/rejected": -97.91226196289062, "loss": 0.5906, "losses/dpo": 0.5648943185806274, "losses/sft": 1.3172342777252197, "losses/total": 0.5648943185806274, "ref_logps/chosen": -83.59552001953125, "ref_logps/rejected": -91.65560913085938, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3406086564064026, "rewards/margins": 0.28505751490592957, "rewards/rejected": -0.6256661415100098, "step": 1132 }, { "epoch": 0.8475780811670095, "grad_norm": 67.9258764688486, "learning_rate": 2.9707581889125504e-08, "logps/chosen": -86.97413635253906, "logps/rejected": -95.74651336669922, "loss": 0.588, "losses/dpo": 0.5409899950027466, "losses/sft": 1.3401576280593872, "losses/total": 0.5409899950027466, "ref_logps/chosen": -83.33476257324219, "ref_logps/rejected": -88.2034912109375, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3639376163482666, "rewards/margins": 0.39036497473716736, "rewards/rejected": -0.7543026208877563, "step": 1133 }, { "epoch": 0.8483261642042267, "grad_norm": 65.5382980318465, "learning_rate": 2.9421484152757748e-08, "logps/chosen": -112.93919372558594, "logps/rejected": -135.03106689453125, "loss": 0.6407, "losses/dpo": 0.6012345552444458, "losses/sft": 1.5231225490570068, "losses/total": 0.6012345552444458, "ref_logps/chosen": -109.02582550048828, "ref_logps/rejected": -129.02035522460938, "rewards/accuracies": 0.65625, "rewards/chosen": -0.39133644104003906, "rewards/margins": 0.20973454415798187, "rewards/rejected": -0.6010710000991821, "step": 1134 }, { "epoch": 0.8490742472414438, "grad_norm": 100.22248032624287, "learning_rate": 2.913668456139265e-08, "logps/chosen": -115.21800231933594, "logps/rejected": -111.00267028808594, "loss": 0.6889, "losses/dpo": 0.5557447671890259, "losses/sft": 0.9240932464599609, "losses/total": 0.5557447671890259, "ref_logps/chosen": -110.38092803955078, "ref_logps/rejected": -105.23908996582031, "rewards/accuracies": 0.625, "rewards/chosen": -0.4837072193622589, "rewards/margins": 0.09265114367008209, "rewards/rejected": -0.5763583779335022, "step": 1135 }, { "epoch": 0.8498223302786609, "grad_norm": 63.98951840996392, "learning_rate": 2.8853184791128084e-08, "logps/chosen": -89.15528869628906, "logps/rejected": -95.6718978881836, "loss": 0.6646, "losses/dpo": 0.46943992376327515, "losses/sft": 0.5857738852500916, "losses/total": 0.46943992376327515, "ref_logps/chosen": -84.20246887207031, "ref_logps/rejected": -88.58855438232422, "rewards/accuracies": 0.53125, "rewards/chosen": -0.49528250098228455, "rewards/margins": 0.21305182576179504, "rewards/rejected": -0.7083343267440796, "step": 1136 }, { "epoch": 0.850570413315878, "grad_norm": 47.84455983518663, "learning_rate": 2.8570986510412092e-08, "logps/chosen": -90.47746276855469, "logps/rejected": -102.31077575683594, "loss": 0.6075, "losses/dpo": 0.5457146167755127, "losses/sft": 1.1434704065322876, "losses/total": 0.5457146167755127, "ref_logps/chosen": -87.06696319580078, "ref_logps/rejected": -96.3960952758789, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3410496711730957, "rewards/margins": 0.2504180669784546, "rewards/rejected": -0.5914677381515503, "step": 1137 }, { "epoch": 0.8513184963530952, "grad_norm": 57.50989951456439, "learning_rate": 2.8290091380033326e-08, "logps/chosen": -84.9250259399414, "logps/rejected": -93.33457946777344, "loss": 0.6256, "losses/dpo": 0.6087413430213928, "losses/sft": 0.8893285989761353, "losses/total": 0.6087413430213928, "ref_logps/chosen": -81.02131652832031, "ref_logps/rejected": -87.01056671142578, "rewards/accuracies": 0.5625, "rewards/chosen": -0.39037173986434937, "rewards/margins": 0.242029070854187, "rewards/rejected": -0.6324007511138916, "step": 1138 }, { "epoch": 0.8520665793903123, "grad_norm": 80.88877232540848, "learning_rate": 2.8010501053111062e-08, "logps/chosen": -96.27772521972656, "logps/rejected": -110.56034088134766, "loss": 0.5628, "losses/dpo": 0.3949304223060608, "losses/sft": 1.3884507417678833, "losses/total": 0.3949304223060608, "ref_logps/chosen": -92.75360870361328, "ref_logps/rejected": -102.92814636230469, "rewards/accuracies": 0.625, "rewards/chosen": -0.3524121642112732, "rewards/margins": 0.41080760955810547, "rewards/rejected": -0.7632197737693787, "step": 1139 }, { "epoch": 0.8528146624275295, "grad_norm": 80.955573814535, "learning_rate": 2.7732217175085726e-08, "logps/chosen": -75.83497619628906, "logps/rejected": -92.5840835571289, "loss": 0.6474, "losses/dpo": 0.9026886224746704, "losses/sft": 0.43227118253707886, "losses/total": 0.9026886224746704, "ref_logps/chosen": -72.50325012207031, "ref_logps/rejected": -86.64991760253906, "rewards/accuracies": 0.6875, "rewards/chosen": -0.33317217230796814, "rewards/margins": 0.2602441608905792, "rewards/rejected": -0.5934163331985474, "step": 1140 }, { "epoch": 0.8535627454647466, "grad_norm": 75.79437434172966, "learning_rate": 2.7455241383708917e-08, "logps/chosen": -86.57710266113281, "logps/rejected": -97.93742370605469, "loss": 0.6254, "losses/dpo": 0.5387163758277893, "losses/sft": 0.5509043335914612, "losses/total": 0.5387163758277893, "ref_logps/chosen": -82.0560302734375, "ref_logps/rejected": -90.4111099243164, "rewards/accuracies": 0.59375, "rewards/chosen": -0.45210695266723633, "rewards/margins": 0.30052414536476135, "rewards/rejected": -0.7526310682296753, "step": 1141 }, { "epoch": 0.8543108285019637, "grad_norm": 61.423409306226, "learning_rate": 2.717957530903392e-08, "logps/chosen": -85.51630401611328, "logps/rejected": -90.50651550292969, "loss": 0.6194, "losses/dpo": 0.8363171815872192, "losses/sft": 1.2145143747329712, "losses/total": 0.8363171815872192, "ref_logps/chosen": -81.33365631103516, "ref_logps/rejected": -83.84645080566406, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4182642996311188, "rewards/margins": 0.24774070084095, "rewards/rejected": -0.66600501537323, "step": 1142 }, { "epoch": 0.8550589115391809, "grad_norm": 70.99613442442094, "learning_rate": 2.6905220573406136e-08, "logps/chosen": -115.96942901611328, "logps/rejected": -116.58712768554688, "loss": 0.6728, "losses/dpo": 0.7554218173027039, "losses/sft": 1.1608208417892456, "losses/total": 0.7554218173027039, "ref_logps/chosen": -112.08939361572266, "ref_logps/rejected": -111.1513442993164, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3880032002925873, "rewards/margins": 0.1555750072002411, "rewards/rejected": -0.5435781478881836, "step": 1143 }, { "epoch": 0.855806994576398, "grad_norm": 59.56542439601839, "learning_rate": 2.6632178791453435e-08, "logps/chosen": -82.50006866455078, "logps/rejected": -95.4716796875, "loss": 0.58, "losses/dpo": 0.5229045152664185, "losses/sft": 0.6639829874038696, "losses/total": 0.5229045152664185, "ref_logps/chosen": -78.66897583007812, "ref_logps/rejected": -88.20501708984375, "rewards/accuracies": 0.71875, "rewards/chosen": -0.38310953974723816, "rewards/margins": 0.3435564339160919, "rewards/rejected": -0.7266659736633301, "step": 1144 }, { "epoch": 0.8565550776136152, "grad_norm": 67.66079141294355, "learning_rate": 2.636045157007685e-08, "logps/chosen": -88.34735107421875, "logps/rejected": -89.22833251953125, "loss": 0.6047, "losses/dpo": 0.6035076379776001, "losses/sft": 0.7181600332260132, "losses/total": 0.6035076379776001, "ref_logps/chosen": -84.36204528808594, "ref_logps/rejected": -82.03738403320312, "rewards/accuracies": 0.625, "rewards/chosen": -0.39852991700172424, "rewards/margins": 0.32056617736816406, "rewards/rejected": -0.7190960049629211, "step": 1145 }, { "epoch": 0.8573031606508322, "grad_norm": 75.73294950654368, "learning_rate": 2.6090040508440854e-08, "logps/chosen": -102.03672790527344, "logps/rejected": -105.2076416015625, "loss": 0.6901, "losses/dpo": 0.6781447529792786, "losses/sft": 0.673710823059082, "losses/total": 0.6781447529792786, "ref_logps/chosen": -97.44439697265625, "ref_logps/rejected": -99.40052032470703, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4592330753803253, "rewards/margins": 0.1214805394411087, "rewards/rejected": -0.5807135701179504, "step": 1146 }, { "epoch": 0.8580512436880494, "grad_norm": 51.44817114601859, "learning_rate": 2.58209471979641e-08, "logps/chosen": -88.88390350341797, "logps/rejected": -98.87480926513672, "loss": 0.5576, "losses/dpo": 0.49291855096817017, "losses/sft": 0.6518338918685913, "losses/total": 0.49291855096817017, "ref_logps/chosen": -85.61624908447266, "ref_logps/rejected": -91.17314147949219, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3267652988433838, "rewards/margins": 0.4434020519256592, "rewards/rejected": -0.770167350769043, "step": 1147 }, { "epoch": 0.8587993267252665, "grad_norm": 77.04823031265236, "learning_rate": 2.5553173222309987e-08, "logps/chosen": -79.91291809082031, "logps/rejected": -85.07067108154297, "loss": 0.6822, "losses/dpo": 0.668333888053894, "losses/sft": 0.7661940455436707, "losses/total": 0.668333888053894, "ref_logps/chosen": -75.323486328125, "ref_logps/rejected": -79.50672912597656, "rewards/accuracies": 0.5625, "rewards/chosen": -0.458943247795105, "rewards/margins": 0.09745055437088013, "rewards/rejected": -0.5563938021659851, "step": 1148 }, { "epoch": 0.8595474097624837, "grad_norm": 76.43881356708421, "learning_rate": 2.5286720157377593e-08, "logps/chosen": -101.80975341796875, "logps/rejected": -102.92137145996094, "loss": 0.6321, "losses/dpo": 0.5816869139671326, "losses/sft": 0.5986847877502441, "losses/total": 0.5816869139671326, "ref_logps/chosen": -97.02253723144531, "ref_logps/rejected": -95.71148681640625, "rewards/accuracies": 0.5, "rewards/chosen": -0.4787222743034363, "rewards/margins": 0.24226585030555725, "rewards/rejected": -0.7209880948066711, "step": 1149 }, { "epoch": 0.8602954927997007, "grad_norm": 96.71576453609953, "learning_rate": 2.5021589571291968e-08, "logps/chosen": -84.8042221069336, "logps/rejected": -94.77652740478516, "loss": 0.6466, "losses/dpo": 0.5385458469390869, "losses/sft": 0.8765192627906799, "losses/total": 0.5385458469390869, "ref_logps/chosen": -80.1457290649414, "ref_logps/rejected": -87.81051635742188, "rewards/accuracies": 0.59375, "rewards/chosen": -0.46584978699684143, "rewards/margins": 0.23075199127197266, "rewards/rejected": -0.6966018080711365, "step": 1150 }, { "epoch": 0.8610435758369179, "grad_norm": 70.19938016981921, "learning_rate": 2.475778302439524e-08, "logps/chosen": -82.00315856933594, "logps/rejected": -93.39854431152344, "loss": 0.5805, "losses/dpo": 0.7268797755241394, "losses/sft": 0.9754918813705444, "losses/total": 0.7268797755241394, "ref_logps/chosen": -78.14553833007812, "ref_logps/rejected": -85.49080657958984, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3857613801956177, "rewards/margins": 0.40501219034194946, "rewards/rejected": -0.7907735109329224, "step": 1151 }, { "epoch": 0.861791658874135, "grad_norm": 54.37572645381734, "learning_rate": 2.449530206923728e-08, "logps/chosen": -79.25394439697266, "logps/rejected": -90.66407775878906, "loss": 0.621, "losses/dpo": 0.7191840410232544, "losses/sft": 0.2479294389486313, "losses/total": 0.7191840410232544, "ref_logps/chosen": -75.09860229492188, "ref_logps/rejected": -84.3380126953125, "rewards/accuracies": 0.625, "rewards/chosen": -0.4155338704586029, "rewards/margins": 0.2170724719762802, "rewards/rejected": -0.6326063871383667, "step": 1152 }, { "epoch": 0.8625397419113522, "grad_norm": 63.558090297057085, "learning_rate": 2.4234148250566578e-08, "logps/chosen": -89.92494201660156, "logps/rejected": -90.96697235107422, "loss": 0.659, "losses/dpo": 0.5409822463989258, "losses/sft": 0.5708655714988708, "losses/total": 0.5409822463989258, "ref_logps/chosen": -85.23956298828125, "ref_logps/rejected": -84.52637481689453, "rewards/accuracies": 0.53125, "rewards/chosen": -0.46853795647621155, "rewards/margins": 0.1755211353302002, "rewards/rejected": -0.6440591216087341, "step": 1153 }, { "epoch": 0.8632878249485693, "grad_norm": 67.28316984436317, "learning_rate": 2.3974323105321325e-08, "logps/chosen": -98.94322204589844, "logps/rejected": -104.6902847290039, "loss": 0.6502, "losses/dpo": 0.5233403444290161, "losses/sft": 1.481066346168518, "losses/total": 0.5233403444290161, "ref_logps/chosen": -94.5351791381836, "ref_logps/rejected": -97.14918518066406, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4408040940761566, "rewards/margins": 0.3133062720298767, "rewards/rejected": -0.7541103363037109, "step": 1154 }, { "epoch": 0.8640359079857864, "grad_norm": 51.19772068217717, "learning_rate": 2.3715828162620105e-08, "logps/chosen": -81.43318176269531, "logps/rejected": -99.11446380615234, "loss": 0.5751, "losses/dpo": 0.38150307536125183, "losses/sft": 0.47608667612075806, "losses/total": 0.38150307536125183, "ref_logps/chosen": -77.24576568603516, "ref_logps/rejected": -91.38117218017578, "rewards/accuracies": 0.75, "rewards/chosen": -0.4187413454055786, "rewards/margins": 0.35458868741989136, "rewards/rejected": -0.7733300924301147, "step": 1155 }, { "epoch": 0.8647839910230035, "grad_norm": 86.3114035700409, "learning_rate": 2.3458664943753066e-08, "logps/chosen": -107.96903991699219, "logps/rejected": -121.83621215820312, "loss": 0.5779, "losses/dpo": 0.5069944858551025, "losses/sft": 1.0802676677703857, "losses/total": 0.5069944858551025, "ref_logps/chosen": -104.58070373535156, "ref_logps/rejected": -114.52365112304688, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3388333320617676, "rewards/margins": 0.3924230933189392, "rewards/rejected": -0.7312564253807068, "step": 1156 }, { "epoch": 0.8655320740602207, "grad_norm": 63.59081006201974, "learning_rate": 2.3202834962172762e-08, "logps/chosen": -85.67227172851562, "logps/rejected": -96.78801727294922, "loss": 0.6109, "losses/dpo": 0.5983768105506897, "losses/sft": 0.24932709336280823, "losses/total": 0.5983768105506897, "ref_logps/chosen": -82.05587005615234, "ref_logps/rejected": -90.9583969116211, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3616408705711365, "rewards/margins": 0.22132112085819244, "rewards/rejected": -0.5829619765281677, "step": 1157 }, { "epoch": 0.8662801570974378, "grad_norm": 73.20669034338395, "learning_rate": 2.2948339723485728e-08, "logps/chosen": -107.63867950439453, "logps/rejected": -113.50035095214844, "loss": 0.5919, "losses/dpo": 0.5718940496444702, "losses/sft": 0.8070001006126404, "losses/total": 0.5718940496444702, "ref_logps/chosen": -103.04488372802734, "ref_logps/rejected": -105.82594299316406, "rewards/accuracies": 0.625, "rewards/chosen": -0.459379106760025, "rewards/margins": 0.30806204676628113, "rewards/rejected": -0.7674411535263062, "step": 1158 }, { "epoch": 0.867028240134655, "grad_norm": 108.54884715476223, "learning_rate": 2.2695180725442992e-08, "logps/chosen": -89.85911560058594, "logps/rejected": -92.41352844238281, "loss": 0.6663, "losses/dpo": 0.6420770287513733, "losses/sft": 0.8353440165519714, "losses/total": 0.6420770287513733, "ref_logps/chosen": -85.36913299560547, "ref_logps/rejected": -86.74481201171875, "rewards/accuracies": 0.625, "rewards/chosen": -0.44899803400039673, "rewards/margins": 0.11787408590316772, "rewards/rejected": -0.5668721199035645, "step": 1159 }, { "epoch": 0.867776323171872, "grad_norm": 54.85213422456936, "learning_rate": 2.244335945793166e-08, "logps/chosen": -92.22483825683594, "logps/rejected": -95.1583023071289, "loss": 0.6103, "losses/dpo": 0.38839542865753174, "losses/sft": 0.708568811416626, "losses/total": 0.38839542865753174, "ref_logps/chosen": -87.72701263427734, "ref_logps/rejected": -87.56669616699219, "rewards/accuracies": 0.65625, "rewards/chosen": -0.44978222250938416, "rewards/margins": 0.3093784749507904, "rewards/rejected": -0.7591606378555298, "step": 1160 }, { "epoch": 0.8685244062090892, "grad_norm": 67.35634928159114, "learning_rate": 2.2192877402966048e-08, "logps/chosen": -58.97206115722656, "logps/rejected": -68.1800537109375, "loss": 0.6361, "losses/dpo": 0.7070176005363464, "losses/sft": 0.6218264698982239, "losses/total": 0.7070176005363464, "ref_logps/chosen": -54.67668151855469, "ref_logps/rejected": -62.300682067871094, "rewards/accuracies": 0.71875, "rewards/chosen": -0.429537832736969, "rewards/margins": 0.15839950740337372, "rewards/rejected": -0.5879373550415039, "step": 1161 }, { "epoch": 0.8692724892463063, "grad_norm": 157.51261594592637, "learning_rate": 2.1943736034679027e-08, "logps/chosen": -109.71380615234375, "logps/rejected": -122.34896850585938, "loss": 0.6643, "losses/dpo": 0.5939483046531677, "losses/sft": 1.342995524406433, "losses/total": 0.5939483046531677, "ref_logps/chosen": -104.99046325683594, "ref_logps/rejected": -115.81129455566406, "rewards/accuracies": 0.5625, "rewards/chosen": -0.47233372926712036, "rewards/margins": 0.18143290281295776, "rewards/rejected": -0.6537666320800781, "step": 1162 }, { "epoch": 0.8700205722835235, "grad_norm": 62.412641156932956, "learning_rate": 2.16959368193132e-08, "logps/chosen": -78.15038299560547, "logps/rejected": -99.9508056640625, "loss": 0.5718, "losses/dpo": 0.4342176914215088, "losses/sft": 0.7904868721961975, "losses/total": 0.4342176914215088, "ref_logps/chosen": -74.38076782226562, "ref_logps/rejected": -92.62963104248047, "rewards/accuracies": 0.75, "rewards/chosen": -0.376961350440979, "rewards/margins": 0.3551555573940277, "rewards/rejected": -0.7321168780326843, "step": 1163 }, { "epoch": 0.8707686553207405, "grad_norm": 95.37912374358318, "learning_rate": 2.1449481215212394e-08, "logps/chosen": -89.08294677734375, "logps/rejected": -95.38442993164062, "loss": 0.6663, "losses/dpo": 0.6194444298744202, "losses/sft": 0.4968166947364807, "losses/total": 0.6194444298744202, "ref_logps/chosen": -85.13109588623047, "ref_logps/rejected": -89.10859680175781, "rewards/accuracies": 0.59375, "rewards/chosen": -0.395184189081192, "rewards/margins": 0.2323993295431137, "rewards/rejected": -0.6275835633277893, "step": 1164 }, { "epoch": 0.8715167383579577, "grad_norm": 126.54592456924661, "learning_rate": 2.1204370672812972e-08, "logps/chosen": -84.32413482666016, "logps/rejected": -96.82430267333984, "loss": 0.6212, "losses/dpo": 0.5865350365638733, "losses/sft": 0.7650852799415588, "losses/total": 0.5865350365638733, "ref_logps/chosen": -79.147705078125, "ref_logps/rejected": -89.02294921875, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5176421999931335, "rewards/margins": 0.26249346137046814, "rewards/rejected": -0.7801356315612793, "step": 1165 }, { "epoch": 0.8722648213951749, "grad_norm": 117.64666104294344, "learning_rate": 2.0960606634635364e-08, "logps/chosen": -95.71849060058594, "logps/rejected": -91.96115112304688, "loss": 0.6697, "losses/dpo": 0.6341527700424194, "losses/sft": 0.962943434715271, "losses/total": 0.6341527700424194, "ref_logps/chosen": -91.07966613769531, "ref_logps/rejected": -85.91368103027344, "rewards/accuracies": 0.5, "rewards/chosen": -0.4638822078704834, "rewards/margins": 0.14086508750915527, "rewards/rejected": -0.6047472953796387, "step": 1166 }, { "epoch": 0.873012904432392, "grad_norm": 69.17375993626993, "learning_rate": 2.0718190535275766e-08, "logps/chosen": -81.90461730957031, "logps/rejected": -93.05802154541016, "loss": 0.5851, "losses/dpo": 0.3106456398963928, "losses/sft": 0.8920408487319946, "losses/total": 0.3106456398963928, "ref_logps/chosen": -78.32534790039062, "ref_logps/rejected": -85.54450988769531, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3579273819923401, "rewards/margins": 0.39342373609542847, "rewards/rejected": -0.7513511180877686, "step": 1167 }, { "epoch": 0.8737609874696092, "grad_norm": 122.3357351900588, "learning_rate": 2.0477123801397265e-08, "logps/chosen": -75.91789245605469, "logps/rejected": -79.87205505371094, "loss": 0.6913, "losses/dpo": 0.6401243805885315, "losses/sft": 1.3149278163909912, "losses/total": 0.6401243805885315, "ref_logps/chosen": -71.6534423828125, "ref_logps/rejected": -74.30660247802734, "rewards/accuracies": 0.53125, "rewards/chosen": -0.42644479870796204, "rewards/margins": 0.13010051846504211, "rewards/rejected": -0.5565453767776489, "step": 1168 }, { "epoch": 0.8745090705068262, "grad_norm": 188.68908039756644, "learning_rate": 2.0237407851721816e-08, "logps/chosen": -111.12744140625, "logps/rejected": -118.86097717285156, "loss": 0.6163, "losses/dpo": 0.6467564702033997, "losses/sft": 1.0833383798599243, "losses/total": 0.6467564702033997, "ref_logps/chosen": -106.91584777832031, "ref_logps/rejected": -111.54750061035156, "rewards/accuracies": 0.625, "rewards/chosen": -0.42115944623947144, "rewards/margins": 0.3101879060268402, "rewards/rejected": -0.7313473224639893, "step": 1169 }, { "epoch": 0.8752571535440434, "grad_norm": 66.94218495391235, "learning_rate": 1.9999044097021718e-08, "logps/chosen": -100.16592407226562, "logps/rejected": -98.76060485839844, "loss": 0.6591, "losses/dpo": 0.7223728895187378, "losses/sft": 0.9038819074630737, "losses/total": 0.7223728895187378, "ref_logps/chosen": -97.2405776977539, "ref_logps/rejected": -94.08317565917969, "rewards/accuracies": 0.5625, "rewards/chosen": -0.292534202337265, "rewards/margins": 0.1752086877822876, "rewards/rejected": -0.467742919921875, "step": 1170 }, { "epoch": 0.8760052365812605, "grad_norm": 101.76561176774969, "learning_rate": 1.9762033940111332e-08, "logps/chosen": -101.41230010986328, "logps/rejected": -106.99668884277344, "loss": 0.5912, "losses/dpo": 0.47926804423332214, "losses/sft": 1.6109135150909424, "losses/total": 0.47926804423332214, "ref_logps/chosen": -96.96591186523438, "ref_logps/rejected": -99.27009582519531, "rewards/accuracies": 0.6875, "rewards/chosen": -0.44463902711868286, "rewards/margins": 0.3280203640460968, "rewards/rejected": -0.772659420967102, "step": 1171 }, { "epoch": 0.8767533196184777, "grad_norm": 64.00630887049986, "learning_rate": 1.9526378775838958e-08, "logps/chosen": -83.88371276855469, "logps/rejected": -92.83914184570312, "loss": 0.6217, "losses/dpo": 0.5785311460494995, "losses/sft": 0.8561851978302002, "losses/total": 0.5785311460494995, "ref_logps/chosen": -79.16523742675781, "ref_logps/rejected": -85.556396484375, "rewards/accuracies": 0.65625, "rewards/chosen": -0.47184640169143677, "rewards/margins": 0.25642693042755127, "rewards/rejected": -0.7282732725143433, "step": 1172 }, { "epoch": 0.8775014026556948, "grad_norm": 64.54122064517199, "learning_rate": 1.9292079991078375e-08, "logps/chosen": -89.17818450927734, "logps/rejected": -96.53414154052734, "loss": 0.609, "losses/dpo": 0.34232932329177856, "losses/sft": 0.9403371214866638, "losses/total": 0.34232932329177856, "ref_logps/chosen": -86.13164520263672, "ref_logps/rejected": -90.6599349975586, "rewards/accuracies": 0.65625, "rewards/chosen": -0.30465397238731384, "rewards/margins": 0.28276723623275757, "rewards/rejected": -0.587421178817749, "step": 1173 }, { "epoch": 0.878249485692912, "grad_norm": 53.441364902580446, "learning_rate": 1.905913896472089e-08, "logps/chosen": -67.80721282958984, "logps/rejected": -81.80953979492188, "loss": 0.5357, "losses/dpo": 0.5458329916000366, "losses/sft": 0.562875509262085, "losses/total": 0.5458329916000366, "ref_logps/chosen": -65.13392639160156, "ref_logps/rejected": -74.41642761230469, "rewards/accuracies": 0.78125, "rewards/chosen": -0.26732903718948364, "rewards/margins": 0.4719824194908142, "rewards/rejected": -0.7393114566802979, "step": 1174 }, { "epoch": 0.878997568730129, "grad_norm": 70.83416943075356, "learning_rate": 1.8827557067667142e-08, "logps/chosen": -97.619384765625, "logps/rejected": -99.8822250366211, "loss": 0.7397, "losses/dpo": 0.5207787156105042, "losses/sft": 0.880002498626709, "losses/total": 0.5207787156105042, "ref_logps/chosen": -91.93922424316406, "ref_logps/rejected": -94.02887725830078, "rewards/accuracies": 0.40625, "rewards/chosen": -0.5680150389671326, "rewards/margins": 0.01731903851032257, "rewards/rejected": -0.585334062576294, "step": 1175 }, { "epoch": 0.8797456517673462, "grad_norm": 68.07561950065492, "learning_rate": 1.8597335662819096e-08, "logps/chosen": -90.13739013671875, "logps/rejected": -101.52740478515625, "loss": 0.5697, "losses/dpo": 0.5070142149925232, "losses/sft": 0.9785212278366089, "losses/total": 0.5070142149925232, "ref_logps/chosen": -87.24261474609375, "ref_logps/rejected": -95.07058715820312, "rewards/accuracies": 0.6875, "rewards/chosen": -0.28947770595550537, "rewards/margins": 0.35620272159576416, "rewards/rejected": -0.6456804275512695, "step": 1176 }, { "epoch": 0.8804937348045633, "grad_norm": 60.93029334076102, "learning_rate": 1.836847610507189e-08, "logps/chosen": -93.79440307617188, "logps/rejected": -103.10572814941406, "loss": 0.6213, "losses/dpo": 0.5130705237388611, "losses/sft": 0.8510052561759949, "losses/total": 0.5130705237388611, "ref_logps/chosen": -89.87837219238281, "ref_logps/rejected": -96.90989685058594, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3916037678718567, "rewards/margins": 0.2279798835515976, "rewards/rejected": -0.6195836663246155, "step": 1177 }, { "epoch": 0.8812418178417805, "grad_norm": 64.55228537511013, "learning_rate": 1.814097974130599e-08, "logps/chosen": -139.90975952148438, "logps/rejected": -143.37432861328125, "loss": 0.667, "losses/dpo": 0.461854487657547, "losses/sft": 1.5330777168273926, "losses/total": 0.461854487657547, "ref_logps/chosen": -135.6502227783203, "ref_logps/rejected": -137.7778778076172, "rewards/accuracies": 0.5625, "rewards/chosen": -0.42595475912094116, "rewards/margins": 0.13368919491767883, "rewards/rejected": -0.5596439242362976, "step": 1178 }, { "epoch": 0.8819899008789975, "grad_norm": 54.50149266359982, "learning_rate": 1.7914847910379173e-08, "logps/chosen": -91.94363403320312, "logps/rejected": -97.67964172363281, "loss": 0.6979, "losses/dpo": 0.6378160715103149, "losses/sft": 1.0768276453018188, "losses/total": 0.6378160715103149, "ref_logps/chosen": -87.10527038574219, "ref_logps/rejected": -92.01933288574219, "rewards/accuracies": 0.5625, "rewards/chosen": -0.48383620381355286, "rewards/margins": 0.08219408988952637, "rewards/rejected": -0.5660302639007568, "step": 1179 }, { "epoch": 0.8827379839162147, "grad_norm": 92.27993499614674, "learning_rate": 1.7690081943118718e-08, "logps/chosen": -97.090576171875, "logps/rejected": -105.4790267944336, "loss": 0.6251, "losses/dpo": 0.5040746331214905, "losses/sft": 0.6190780997276306, "losses/total": 0.5040746331214905, "ref_logps/chosen": -93.60408782958984, "ref_logps/rejected": -99.72844696044922, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3486500382423401, "rewards/margins": 0.22640812397003174, "rewards/rejected": -0.5750582218170166, "step": 1180 }, { "epoch": 0.8834860669534318, "grad_norm": 64.95379998042434, "learning_rate": 1.746668316231359e-08, "logps/chosen": -98.84261322021484, "logps/rejected": -99.54290771484375, "loss": 0.6932, "losses/dpo": 0.872649610042572, "losses/sft": 0.6211157441139221, "losses/total": 0.872649610042572, "ref_logps/chosen": -95.01368713378906, "ref_logps/rejected": -95.16305541992188, "rewards/accuracies": 0.5, "rewards/chosen": -0.3828924596309662, "rewards/margins": 0.05509256571531296, "rewards/rejected": -0.43798500299453735, "step": 1181 }, { "epoch": 0.884234149990649, "grad_norm": 61.65391066647736, "learning_rate": 1.7244652882706545e-08, "logps/chosen": -78.12931823730469, "logps/rejected": -90.226318359375, "loss": 0.59, "losses/dpo": 0.5050398707389832, "losses/sft": 0.8863462805747986, "losses/total": 0.5050398707389832, "ref_logps/chosen": -74.83971405029297, "ref_logps/rejected": -84.11788940429688, "rewards/accuracies": 0.6875, "rewards/chosen": -0.32896023988723755, "rewards/margins": 0.28188252449035645, "rewards/rejected": -0.610842764377594, "step": 1182 }, { "epoch": 0.884982233027866, "grad_norm": 91.13376980248917, "learning_rate": 1.7023992410986482e-08, "logps/chosen": -95.09606170654297, "logps/rejected": -99.22698974609375, "loss": 0.6082, "losses/dpo": 0.5402988791465759, "losses/sft": 0.774093747138977, "losses/total": 0.5402988791465759, "ref_logps/chosen": -91.5191879272461, "ref_logps/rejected": -93.0804443359375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.35768747329711914, "rewards/margins": 0.2569666802883148, "rewards/rejected": -0.6146541833877563, "step": 1183 }, { "epoch": 0.8857303160650832, "grad_norm": 100.54411816496678, "learning_rate": 1.6804703045780743e-08, "logps/chosen": -104.70806884765625, "logps/rejected": -115.229248046875, "loss": 0.6963, "losses/dpo": 0.737223744392395, "losses/sft": 0.9855235815048218, "losses/total": 0.737223744392395, "ref_logps/chosen": -99.69223022460938, "ref_logps/rejected": -109.23773193359375, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5015841126441956, "rewards/margins": 0.09756730496883392, "rewards/rejected": -0.5991514325141907, "step": 1184 }, { "epoch": 0.8864783991023003, "grad_norm": 75.70084686158349, "learning_rate": 1.658678607764735e-08, "logps/chosen": -81.70988464355469, "logps/rejected": -90.62261962890625, "loss": 0.5519, "losses/dpo": 0.6473878622055054, "losses/sft": 0.9900181889533997, "losses/total": 0.6473878622055054, "ref_logps/chosen": -77.739990234375, "ref_logps/rejected": -82.26460266113281, "rewards/accuracies": 0.75, "rewards/chosen": -0.3969905376434326, "rewards/margins": 0.4388114809989929, "rewards/rejected": -0.8358020782470703, "step": 1185 }, { "epoch": 0.8872264821395175, "grad_norm": 67.84998047279521, "learning_rate": 1.637024278906776e-08, "logps/chosen": -87.250732421875, "logps/rejected": -93.65349578857422, "loss": 0.6352, "losses/dpo": 0.735305666923523, "losses/sft": 1.0375291109085083, "losses/total": 0.735305666923523, "ref_logps/chosen": -83.97109985351562, "ref_logps/rejected": -88.54185485839844, "rewards/accuracies": 0.65625, "rewards/chosen": -0.32796406745910645, "rewards/margins": 0.1832004189491272, "rewards/rejected": -0.5111644864082336, "step": 1186 }, { "epoch": 0.8879745651767346, "grad_norm": 119.31234557360112, "learning_rate": 1.615507445443884e-08, "logps/chosen": -85.59403991699219, "logps/rejected": -78.44100952148438, "loss": 0.686, "losses/dpo": 0.5815693140029907, "losses/sft": 0.34256285429000854, "losses/total": 0.5815693140029907, "ref_logps/chosen": -80.78814697265625, "ref_logps/rejected": -72.86872863769531, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4805888533592224, "rewards/margins": 0.07663826644420624, "rewards/rejected": -0.5572271347045898, "step": 1187 }, { "epoch": 0.8887226482139517, "grad_norm": 83.20717442647357, "learning_rate": 1.5941282340065697e-08, "logps/chosen": -112.90686798095703, "logps/rejected": -119.269287109375, "loss": 0.6385, "losses/dpo": 0.5314191579818726, "losses/sft": 1.0642062425613403, "losses/total": 0.5314191579818726, "ref_logps/chosen": -109.08158111572266, "ref_logps/rejected": -112.91943359375, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3825281858444214, "rewards/margins": 0.25245726108551025, "rewards/rejected": -0.6349854469299316, "step": 1188 }, { "epoch": 0.8894707312511689, "grad_norm": 71.53416682904599, "learning_rate": 1.5728867704154075e-08, "logps/chosen": -90.3236083984375, "logps/rejected": -98.970947265625, "loss": 0.6668, "losses/dpo": 0.7133692502975464, "losses/sft": 1.0422884225845337, "losses/total": 0.7133692502975464, "ref_logps/chosen": -84.74412536621094, "ref_logps/rejected": -91.62421417236328, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5579488277435303, "rewards/margins": 0.1767248809337616, "rewards/rejected": -0.7346737384796143, "step": 1189 }, { "epoch": 0.890218814288386, "grad_norm": 88.5583632316717, "learning_rate": 1.5517831796803132e-08, "logps/chosen": -89.34171295166016, "logps/rejected": -98.92312622070312, "loss": 0.6189, "losses/dpo": 0.6558191180229187, "losses/sft": 1.002737283706665, "losses/total": 0.6558191180229187, "ref_logps/chosen": -85.32910919189453, "ref_logps/rejected": -92.25552368164062, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4012604355812073, "rewards/margins": 0.26549965143203735, "rewards/rejected": -0.6667600870132446, "step": 1190 }, { "epoch": 0.8909668973256032, "grad_norm": 55.64626079106174, "learning_rate": 1.530817585999783e-08, "logps/chosen": -82.23662567138672, "logps/rejected": -89.59709930419922, "loss": 0.5438, "losses/dpo": 0.6480134725570679, "losses/sft": 0.6912856698036194, "losses/total": 0.6480134725570679, "ref_logps/chosen": -78.61979675292969, "ref_logps/rejected": -82.0531005859375, "rewards/accuracies": 0.84375, "rewards/chosen": -0.36168360710144043, "rewards/margins": 0.39271628856658936, "rewards/rejected": -0.7543998956680298, "step": 1191 }, { "epoch": 0.8917149803628203, "grad_norm": 72.56623676939621, "learning_rate": 1.509990112760176e-08, "logps/chosen": -88.96058654785156, "logps/rejected": -107.69384002685547, "loss": 0.5958, "losses/dpo": 0.7551692724227905, "losses/sft": 0.6273984909057617, "losses/total": 0.7551692724227905, "ref_logps/chosen": -84.81060791015625, "ref_logps/rejected": -100.19931030273438, "rewards/accuracies": 0.71875, "rewards/chosen": -0.41499900817871094, "rewards/margins": 0.33445417881011963, "rewards/rejected": -0.7494531869888306, "step": 1192 }, { "epoch": 0.8924630634000374, "grad_norm": 51.17759761581664, "learning_rate": 1.4893008825349968e-08, "logps/chosen": -70.46275329589844, "logps/rejected": -82.35395812988281, "loss": 0.5118, "losses/dpo": 0.513863205909729, "losses/sft": 0.9126659631729126, "losses/total": 0.513863205909729, "ref_logps/chosen": -68.80720520019531, "ref_logps/rejected": -75.35858154296875, "rewards/accuracies": 0.8125, "rewards/chosen": -0.16555523872375488, "rewards/margins": 0.5339823961257935, "rewards/rejected": -0.6995375752449036, "step": 1193 }, { "epoch": 0.8932111464372545, "grad_norm": 57.98474030519675, "learning_rate": 1.4687500170841472e-08, "logps/chosen": -99.10517883300781, "logps/rejected": -114.45341491699219, "loss": 0.6292, "losses/dpo": 0.621487557888031, "losses/sft": 0.8560210466384888, "losses/total": 0.621487557888031, "ref_logps/chosen": -94.82589721679688, "ref_logps/rejected": -107.2366943359375, "rewards/accuracies": 0.625, "rewards/chosen": -0.4279276430606842, "rewards/margins": 0.2937450408935547, "rewards/rejected": -0.7216726541519165, "step": 1194 }, { "epoch": 0.8939592294744717, "grad_norm": 56.476866737777904, "learning_rate": 1.4483376373532502e-08, "logps/chosen": -82.25628662109375, "logps/rejected": -105.68310546875, "loss": 0.5826, "losses/dpo": 0.5692982077598572, "losses/sft": 0.5048052072525024, "losses/total": 0.5692982077598572, "ref_logps/chosen": -78.11888122558594, "ref_logps/rejected": -96.83406066894531, "rewards/accuracies": 0.75, "rewards/chosen": -0.41374096274375916, "rewards/margins": 0.4711635410785675, "rewards/rejected": -0.8849045038223267, "step": 1195 }, { "epoch": 0.8947073125116888, "grad_norm": 64.08815316510449, "learning_rate": 1.4280638634728948e-08, "logps/chosen": -104.41556549072266, "logps/rejected": -121.40997314453125, "loss": 0.5888, "losses/dpo": 0.8215691447257996, "losses/sft": 1.3254694938659668, "losses/total": 0.8215691447257996, "ref_logps/chosen": -100.76889038085938, "ref_logps/rejected": -114.43211364746094, "rewards/accuracies": 0.6875, "rewards/chosen": -0.36466720700263977, "rewards/margins": 0.33311891555786133, "rewards/rejected": -0.6977860927581787, "step": 1196 }, { "epoch": 0.895455395548906, "grad_norm": 100.27038258564116, "learning_rate": 1.407928814757961e-08, "logps/chosen": -97.32841491699219, "logps/rejected": -101.60464477539062, "loss": 0.7262, "losses/dpo": 0.6972538828849792, "losses/sft": 0.6678569316864014, "losses/total": 0.6972538828849792, "ref_logps/chosen": -92.82171630859375, "ref_logps/rejected": -95.99501037597656, "rewards/accuracies": 0.46875, "rewards/chosen": -0.4506699740886688, "rewards/margins": 0.1102931797504425, "rewards/rejected": -0.5609631538391113, "step": 1197 }, { "epoch": 0.896203478586123, "grad_norm": 54.30335384219037, "learning_rate": 1.3879326097069016e-08, "logps/chosen": -99.19525909423828, "logps/rejected": -105.5774154663086, "loss": 0.6494, "losses/dpo": 0.7258752584457397, "losses/sft": 0.36333441734313965, "losses/total": 0.7258752584457397, "ref_logps/chosen": -95.7838134765625, "ref_logps/rejected": -100.2542724609375, "rewards/accuracies": 0.625, "rewards/chosen": -0.3411451578140259, "rewards/margins": 0.1911696493625641, "rewards/rejected": -0.5323147773742676, "step": 1198 }, { "epoch": 0.8969515616233402, "grad_norm": 59.524095592973936, "learning_rate": 1.3680753660010424e-08, "logps/chosen": -80.54620361328125, "logps/rejected": -87.02122497558594, "loss": 0.6133, "losses/dpo": 0.6121180057525635, "losses/sft": 1.1485800743103027, "losses/total": 0.6121180057525635, "ref_logps/chosen": -76.28221893310547, "ref_logps/rejected": -80.50298309326172, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4263981878757477, "rewards/margins": 0.2254265397787094, "rewards/rejected": -0.6518247127532959, "step": 1199 }, { "epoch": 0.8976996446605573, "grad_norm": 78.47913734537428, "learning_rate": 1.3483572005039106e-08, "logps/chosen": -78.0251693725586, "logps/rejected": -83.66975402832031, "loss": 0.5771, "losses/dpo": 0.5119936466217041, "losses/sft": 0.6642171144485474, "losses/total": 0.5119936466217041, "ref_logps/chosen": -74.209228515625, "ref_logps/rejected": -75.31120300292969, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3815932273864746, "rewards/margins": 0.4542616605758667, "rewards/rejected": -0.8358548879623413, "step": 1200 }, { "epoch": 0.8984477276977745, "grad_norm": 93.42791764233489, "learning_rate": 1.3287782292605243e-08, "logps/chosen": -115.3221664428711, "logps/rejected": -128.95071411132812, "loss": 0.6006, "losses/dpo": 0.46432873606681824, "losses/sft": 1.4807358980178833, "losses/total": 0.46432873606681824, "ref_logps/chosen": -112.0633544921875, "ref_logps/rejected": -122.71842193603516, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3258817493915558, "rewards/margins": 0.29734793305397034, "rewards/rejected": -0.6232296824455261, "step": 1201 }, { "epoch": 0.8991958107349916, "grad_norm": 86.61442074159253, "learning_rate": 1.309338567496715e-08, "logps/chosen": -103.75633239746094, "logps/rejected": -95.3408432006836, "loss": 0.7837, "losses/dpo": 0.4627956748008728, "losses/sft": 1.2205826044082642, "losses/total": 0.4627956748008728, "ref_logps/chosen": -98.5155029296875, "ref_logps/rejected": -90.25297546386719, "rewards/accuracies": 0.53125, "rewards/chosen": -0.5240835547447205, "rewards/margins": -0.015296995639801025, "rewards/rejected": -0.5087865591049194, "step": 1202 }, { "epoch": 0.8999438937722087, "grad_norm": 79.02098345639604, "learning_rate": 1.2900383296184536e-08, "logps/chosen": -98.83900451660156, "logps/rejected": -121.49928283691406, "loss": 0.6227, "losses/dpo": 0.6483698487281799, "losses/sft": 1.016440510749817, "losses/total": 0.6483698487281799, "ref_logps/chosen": -94.05921936035156, "ref_logps/rejected": -113.8110122680664, "rewards/accuracies": 0.625, "rewards/chosen": -0.4779786467552185, "rewards/margins": 0.2908487319946289, "rewards/rejected": -0.7688274383544922, "step": 1203 }, { "epoch": 0.9006919768094258, "grad_norm": 83.16202245013541, "learning_rate": 1.2708776292111866e-08, "logps/chosen": -106.70623779296875, "logps/rejected": -118.276611328125, "loss": 0.5532, "losses/dpo": 0.6167342066764832, "losses/sft": 0.9434674382209778, "losses/total": 0.6167342066764832, "ref_logps/chosen": -103.95538330078125, "ref_logps/rejected": -111.0067138671875, "rewards/accuracies": 0.75, "rewards/chosen": -0.2750844955444336, "rewards/margins": 0.45190638303756714, "rewards/rejected": -0.7269909381866455, "step": 1204 }, { "epoch": 0.901440059846643, "grad_norm": 70.48634749984147, "learning_rate": 1.2518565790391427e-08, "logps/chosen": -80.87684631347656, "logps/rejected": -94.36087036132812, "loss": 0.4978, "losses/dpo": 0.530651330947876, "losses/sft": 0.5802378058433533, "losses/total": 0.530651330947876, "ref_logps/chosen": -76.9498291015625, "ref_logps/rejected": -84.4564437866211, "rewards/accuracies": 0.75, "rewards/chosen": -0.3927006721496582, "rewards/margins": 0.5977415442466736, "rewards/rejected": -0.9904422760009766, "step": 1205 }, { "epoch": 0.9021881428838601, "grad_norm": 97.13364702809861, "learning_rate": 1.2329752910446912e-08, "logps/chosen": -98.20262145996094, "logps/rejected": -110.62193298339844, "loss": 0.5591, "losses/dpo": 0.5436981916427612, "losses/sft": 1.743634581565857, "losses/total": 0.5436981916427612, "ref_logps/chosen": -94.9195556640625, "ref_logps/rejected": -103.08102416992188, "rewards/accuracies": 0.75, "rewards/chosen": -0.3283054232597351, "rewards/margins": 0.42578527331352234, "rewards/rejected": -0.7540907263755798, "step": 1206 }, { "epoch": 0.9021881428838601, "eval_logps/chosen": -39.461917877197266, "eval_logps/rejected": -45.744667053222656, "eval_loss": 0.6151586174964905, "eval_losses/dpo": 0.6054712533950806, "eval_losses/sft": 0.32769978046417236, "eval_losses/total": 0.6054712533950806, "eval_ref_logps/chosen": -35.788204193115234, "eval_ref_logps/rejected": -39.235660552978516, "eval_rewards/accuracies": 0.6465517282485962, "eval_rewards/chosen": -0.3673713803291321, "eval_rewards/margins": 0.2835288941860199, "eval_rewards/rejected": -0.6509003043174744, "eval_runtime": 38.1192, "eval_samples_per_second": 12.146, "eval_steps_per_second": 1.522, "step": 1206 }, { "epoch": 0.9029362259210773, "grad_norm": 61.334316016009616, "learning_rate": 1.2142338763476728e-08, "logps/chosen": -87.6827621459961, "logps/rejected": -97.32237243652344, "loss": 0.5809, "losses/dpo": 0.570033073425293, "losses/sft": 0.9806172251701355, "losses/total": 0.570033073425293, "ref_logps/chosen": -84.20079803466797, "ref_logps/rejected": -90.13585662841797, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3481951951980591, "rewards/margins": 0.3704562187194824, "rewards/rejected": -0.7186514735221863, "step": 1207 }, { "epoch": 0.9036843089582943, "grad_norm": 85.84366356866497, "learning_rate": 1.1956324452447458e-08, "logps/chosen": -84.45806121826172, "logps/rejected": -94.032470703125, "loss": 0.5676, "losses/dpo": 0.6686390042304993, "losses/sft": 0.2874106764793396, "losses/total": 0.6686390042304993, "ref_logps/chosen": -80.8031997680664, "ref_logps/rejected": -86.15808868408203, "rewards/accuracies": 0.625, "rewards/chosen": -0.36548686027526855, "rewards/margins": 0.42195212841033936, "rewards/rejected": -0.7874389886856079, "step": 1208 }, { "epoch": 0.9044323919955115, "grad_norm": 73.78563561143191, "learning_rate": 1.1771711072087465e-08, "logps/chosen": -105.98214721679688, "logps/rejected": -125.29637145996094, "loss": 0.6464, "losses/dpo": 0.45219090580940247, "losses/sft": 1.0445858240127563, "losses/total": 0.45219090580940247, "ref_logps/chosen": -101.32015228271484, "ref_logps/rejected": -118.15959930419922, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4661999046802521, "rewards/margins": 0.2474784106016159, "rewards/rejected": -0.7136783003807068, "step": 1209 }, { "epoch": 0.9051804750327286, "grad_norm": 58.665343956052396, "learning_rate": 1.1588499708880316e-08, "logps/chosen": -77.42050170898438, "logps/rejected": -97.37828063964844, "loss": 0.5226, "losses/dpo": 0.4467446804046631, "losses/sft": 0.9742170572280884, "losses/total": 0.4467446804046631, "ref_logps/chosen": -74.06903076171875, "ref_logps/rejected": -88.84786224365234, "rewards/accuracies": 0.75, "rewards/chosen": -0.33514708280563354, "rewards/margins": 0.5178946852684021, "rewards/rejected": -0.8530417680740356, "step": 1210 }, { "epoch": 0.9059285580699458, "grad_norm": 67.60852851452374, "learning_rate": 1.1406691441058491e-08, "logps/chosen": -100.19975280761719, "logps/rejected": -105.18648529052734, "loss": 0.6372, "losses/dpo": 0.5663295388221741, "losses/sft": 0.7018232941627502, "losses/total": 0.5663295388221741, "ref_logps/chosen": -96.63040924072266, "ref_logps/rejected": -99.06703186035156, "rewards/accuracies": 0.625, "rewards/chosen": -0.35693439841270447, "rewards/margins": 0.2550112307071686, "rewards/rejected": -0.611945629119873, "step": 1211 }, { "epoch": 0.906676641107163, "grad_norm": 54.59227180772513, "learning_rate": 1.1226287338596929e-08, "logps/chosen": -89.69628143310547, "logps/rejected": -103.05441284179688, "loss": 0.594, "losses/dpo": 0.6247140169143677, "losses/sft": 1.4976203441619873, "losses/total": 0.6247140169143677, "ref_logps/chosen": -85.44764709472656, "ref_logps/rejected": -95.10794830322266, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4248633086681366, "rewards/margins": 0.3697834312915802, "rewards/rejected": -0.7946467399597168, "step": 1212 }, { "epoch": 0.90742472414438, "grad_norm": 64.3834797727753, "learning_rate": 1.1047288463206878e-08, "logps/chosen": -91.82964324951172, "logps/rejected": -94.64665222167969, "loss": 0.6616, "losses/dpo": 0.8805596828460693, "losses/sft": 0.3327178955078125, "losses/total": 0.8805596828460693, "ref_logps/chosen": -86.78225708007812, "ref_logps/rejected": -88.12875366210938, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5047390460968018, "rewards/margins": 0.14705145359039307, "rewards/rejected": -0.6517904996871948, "step": 1213 }, { "epoch": 0.9081728071815972, "grad_norm": 151.02493529928532, "learning_rate": 1.0869695868329532e-08, "logps/chosen": -105.73439025878906, "logps/rejected": -108.05950927734375, "loss": 0.657, "losses/dpo": 0.6713093519210815, "losses/sft": 1.7762961387634277, "losses/total": 0.6713093519210815, "ref_logps/chosen": -102.12596893310547, "ref_logps/rejected": -102.81082153320312, "rewards/accuracies": 0.625, "rewards/chosen": -0.3608425259590149, "rewards/margins": 0.16402646899223328, "rewards/rejected": -0.5248689651489258, "step": 1214 }, { "epoch": 0.9089208902188143, "grad_norm": 73.1414811822351, "learning_rate": 1.0693510599129873e-08, "logps/chosen": -95.92245483398438, "logps/rejected": -101.75703430175781, "loss": 0.6717, "losses/dpo": 0.5314512252807617, "losses/sft": 0.6209264397621155, "losses/total": 0.5314512252807617, "ref_logps/chosen": -91.57395935058594, "ref_logps/rejected": -95.90263366699219, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4348500370979309, "rewards/margins": 0.15058976411819458, "rewards/rejected": -0.5854398012161255, "step": 1215 }, { "epoch": 0.9096689732560315, "grad_norm": 96.75968876707022, "learning_rate": 1.0518733692490511e-08, "logps/chosen": -104.16519165039062, "logps/rejected": -105.49054718017578, "loss": 0.7153, "losses/dpo": 0.6068311929702759, "losses/sft": 1.0752065181732178, "losses/total": 0.6068311929702759, "ref_logps/chosen": -98.12090301513672, "ref_logps/rejected": -98.33621978759766, "rewards/accuracies": 0.53125, "rewards/chosen": -0.6044292449951172, "rewards/margins": 0.11100432276725769, "rewards/rejected": -0.7154335379600525, "step": 1216 }, { "epoch": 0.9104170562932485, "grad_norm": 78.9598380986325, "learning_rate": 1.0345366177005544e-08, "logps/chosen": -76.14225769042969, "logps/rejected": -90.80464935302734, "loss": 0.626, "losses/dpo": 0.5814436078071594, "losses/sft": 0.7934982180595398, "losses/total": 0.5814436078071594, "ref_logps/chosen": -72.16297149658203, "ref_logps/rejected": -84.71333312988281, "rewards/accuracies": 0.625, "rewards/chosen": -0.39792829751968384, "rewards/margins": 0.21120423078536987, "rewards/rejected": -0.6091325283050537, "step": 1217 }, { "epoch": 0.9111651393304657, "grad_norm": 68.46796890711649, "learning_rate": 1.0173409072974649e-08, "logps/chosen": -81.63391876220703, "logps/rejected": -91.849853515625, "loss": 0.6685, "losses/dpo": 0.6831374168395996, "losses/sft": 0.8743407726287842, "losses/total": 0.6831374168395996, "ref_logps/chosen": -77.73782348632812, "ref_logps/rejected": -86.55191802978516, "rewards/accuracies": 0.5625, "rewards/chosen": -0.38960933685302734, "rewards/margins": 0.14018376171588898, "rewards/rejected": -0.5297930836677551, "step": 1218 }, { "epoch": 0.9119132223676828, "grad_norm": 82.18826748449726, "learning_rate": 1.0002863392396865e-08, "logps/chosen": -86.57591247558594, "logps/rejected": -97.12445831298828, "loss": 0.6436, "losses/dpo": 0.9228140711784363, "losses/sft": 0.9930399060249329, "losses/total": 0.9228140711784363, "ref_logps/chosen": -82.40779876708984, "ref_logps/rejected": -90.97876739501953, "rewards/accuracies": 0.625, "rewards/chosen": -0.41681158542633057, "rewards/margins": 0.1977570652961731, "rewards/rejected": -0.6145687103271484, "step": 1219 }, { "epoch": 0.9126613054049, "grad_norm": 64.9027808800027, "learning_rate": 9.833730138964768e-09, "logps/chosen": -93.90243530273438, "logps/rejected": -115.99784088134766, "loss": 0.5691, "losses/dpo": 0.5937448143959045, "losses/sft": 0.2722456157207489, "losses/total": 0.5937448143959045, "ref_logps/chosen": -90.82390594482422, "ref_logps/rejected": -109.12409210205078, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3078533709049225, "rewards/margins": 0.3795211911201477, "rewards/rejected": -0.6873745918273926, "step": 1220 }, { "epoch": 0.913409388442117, "grad_norm": 59.76746770673356, "learning_rate": 9.666010308058609e-09, "logps/chosen": -100.3577880859375, "logps/rejected": -108.58538818359375, "loss": 0.5856, "losses/dpo": 0.5739750862121582, "losses/sft": 0.878434419631958, "losses/total": 0.5739750862121582, "ref_logps/chosen": -96.65365600585938, "ref_logps/rejected": -101.04320526123047, "rewards/accuracies": 0.625, "rewards/chosen": -0.3704131543636322, "rewards/margins": 0.38380563259124756, "rewards/rejected": -0.7542187571525574, "step": 1221 }, { "epoch": 0.9141574714793342, "grad_norm": 64.9305276808336, "learning_rate": 9.499704886740289e-09, "logps/chosen": -107.69004821777344, "logps/rejected": -118.91885375976562, "loss": 0.5931, "losses/dpo": 0.7027384042739868, "losses/sft": 1.2442491054534912, "losses/total": 0.7027384042739868, "ref_logps/chosen": -103.57691955566406, "ref_logps/rejected": -111.1987533569336, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4113123416900635, "rewards/margins": 0.3606984615325928, "rewards/rejected": -0.772010862827301, "step": 1222 }, { "epoch": 0.9149055545165513, "grad_norm": 70.095591425252, "learning_rate": 9.334814853747736e-09, "logps/chosen": -95.0744400024414, "logps/rejected": -99.38558959960938, "loss": 0.6077, "losses/dpo": 0.6192693114280701, "losses/sft": 0.7291459441184998, "losses/total": 0.6192693114280701, "ref_logps/chosen": -90.88148498535156, "ref_logps/rejected": -92.09321594238281, "rewards/accuracies": 0.625, "rewards/chosen": -0.4192955791950226, "rewards/margins": 0.3099405765533447, "rewards/rejected": -0.7292361855506897, "step": 1223 }, { "epoch": 0.9156536375537685, "grad_norm": 73.42089536556837, "learning_rate": 9.171341179489034e-09, "logps/chosen": -89.0706558227539, "logps/rejected": -99.43917846679688, "loss": 0.6503, "losses/dpo": 1.1034104824066162, "losses/sft": 0.6623272895812988, "losses/total": 1.1034104824066162, "ref_logps/chosen": -85.90594482421875, "ref_logps/rejected": -93.59788513183594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3164706528186798, "rewards/margins": 0.26765769720077515, "rewards/rejected": -0.5841283798217773, "step": 1224 }, { "epoch": 0.9164017205909856, "grad_norm": 63.57962035124288, "learning_rate": 9.009284826036689e-09, "logps/chosen": -58.07750701904297, "logps/rejected": -75.81666564941406, "loss": 0.56, "losses/dpo": 0.5785367488861084, "losses/sft": 0.3009987473487854, "losses/total": 0.5785367488861084, "ref_logps/chosen": -54.28105163574219, "ref_logps/rejected": -67.39247131347656, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3796457052230835, "rewards/margins": 0.4627727270126343, "rewards/rejected": -0.8424185514450073, "step": 1225 }, { "epoch": 0.9171498036282028, "grad_norm": 66.80988257395683, "learning_rate": 8.848646747122074e-09, "logps/chosen": -101.5813980102539, "logps/rejected": -102.90713500976562, "loss": 0.6255, "losses/dpo": 0.5370602607727051, "losses/sft": 0.7325765490531921, "losses/total": 0.5370602607727051, "ref_logps/chosen": -98.76565551757812, "ref_logps/rejected": -97.8707275390625, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2815733253955841, "rewards/margins": 0.22206859290599823, "rewards/rejected": -0.5036419034004211, "step": 1226 }, { "epoch": 0.9178978866654198, "grad_norm": 54.67747034144367, "learning_rate": 8.68942788812968e-09, "logps/chosen": -96.49148559570312, "logps/rejected": -102.87699890136719, "loss": 0.6456, "losses/dpo": 0.678427517414093, "losses/sft": 1.353109359741211, "losses/total": 0.678427517414093, "ref_logps/chosen": -92.78562927246094, "ref_logps/rejected": -97.35267639160156, "rewards/accuracies": 0.59375, "rewards/chosen": -0.37058597803115845, "rewards/margins": 0.18184658885002136, "rewards/rejected": -0.5524325370788574, "step": 1227 }, { "epoch": 0.918645969702637, "grad_norm": 56.47519869522461, "learning_rate": 8.531629186091738e-09, "logps/chosen": -115.764892578125, "logps/rejected": -128.95680236816406, "loss": 0.6204, "losses/dpo": 0.4113309979438782, "losses/sft": 0.7693597078323364, "losses/total": 0.4113309979438782, "ref_logps/chosen": -111.37454223632812, "ref_logps/rejected": -121.63496398925781, "rewards/accuracies": 0.5, "rewards/chosen": -0.4390355944633484, "rewards/margins": 0.2931479811668396, "rewards/rejected": -0.7321836948394775, "step": 1228 }, { "epoch": 0.9193940527398541, "grad_norm": 74.00570636089226, "learning_rate": 8.375251569682551e-09, "logps/chosen": -89.13972473144531, "logps/rejected": -98.11454010009766, "loss": 0.5802, "losses/dpo": 0.680308997631073, "losses/sft": 0.7951639890670776, "losses/total": 0.680308997631073, "ref_logps/chosen": -85.46868896484375, "ref_logps/rejected": -91.359375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3671041429042816, "rewards/margins": 0.3084116578102112, "rewards/rejected": -0.6755157709121704, "step": 1229 }, { "epoch": 0.9201421357770713, "grad_norm": 83.42137084968645, "learning_rate": 8.220295959213003e-09, "logps/chosen": -80.21194458007812, "logps/rejected": -79.70440673828125, "loss": 0.7552, "losses/dpo": 0.6728357076644897, "losses/sft": 0.31911689043045044, "losses/total": 0.6728357076644897, "ref_logps/chosen": -75.84626770019531, "ref_logps/rejected": -75.79236602783203, "rewards/accuracies": 0.40625, "rewards/chosen": -0.4365677237510681, "rewards/margins": -0.04536406695842743, "rewards/rejected": -0.3912036716938019, "step": 1230 }, { "epoch": 0.9208902188142883, "grad_norm": 65.61878682106527, "learning_rate": 8.066763266625282e-09, "logps/chosen": -102.26033020019531, "logps/rejected": -109.15116882324219, "loss": 0.6174, "losses/dpo": 0.5230758786201477, "losses/sft": 1.13373863697052, "losses/total": 0.5230758786201477, "ref_logps/chosen": -98.15084075927734, "ref_logps/rejected": -102.23223876953125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4109484553337097, "rewards/margins": 0.28094419836997986, "rewards/rejected": -0.6918926239013672, "step": 1231 }, { "epoch": 0.9216383018515055, "grad_norm": 76.57637582029298, "learning_rate": 7.914654395487469e-09, "logps/chosen": -77.3440170288086, "logps/rejected": -87.10324096679688, "loss": 0.5336, "losses/dpo": 0.47026050090789795, "losses/sft": 0.6543353796005249, "losses/total": 0.47026050090789795, "ref_logps/chosen": -73.34780883789062, "ref_logps/rejected": -77.99209594726562, "rewards/accuracies": 0.75, "rewards/chosen": -0.39962077140808105, "rewards/margins": 0.5114942789077759, "rewards/rejected": -0.9111150503158569, "step": 1232 }, { "epoch": 0.9223863848887226, "grad_norm": 123.28148219639097, "learning_rate": 7.763970240988127e-09, "logps/chosen": -121.96391296386719, "logps/rejected": -154.203369140625, "loss": 0.6091, "losses/dpo": 0.3362734019756317, "losses/sft": 0.5747027397155762, "losses/total": 0.3362734019756317, "ref_logps/chosen": -116.5965576171875, "ref_logps/rejected": -145.54180908203125, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5367357134819031, "rewards/margins": 0.32942134141921997, "rewards/rejected": -0.866157054901123, "step": 1233 }, { "epoch": 0.9231344679259398, "grad_norm": 54.064387417519725, "learning_rate": 7.614711689931107e-09, "logps/chosen": -88.62516784667969, "logps/rejected": -93.30401611328125, "loss": 0.6653, "losses/dpo": 0.7139730453491211, "losses/sft": 0.8886280059814453, "losses/total": 0.7139730453491211, "ref_logps/chosen": -84.6685791015625, "ref_logps/rejected": -87.1683349609375, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3956584930419922, "rewards/margins": 0.21791018545627594, "rewards/rejected": -0.6135687232017517, "step": 1234 }, { "epoch": 0.9238825509631569, "grad_norm": 87.26263108953322, "learning_rate": 7.466879620730393e-09, "logps/chosen": -95.45063781738281, "logps/rejected": -97.47332763671875, "loss": 0.6578, "losses/dpo": 0.6997188329696655, "losses/sft": 0.6749987006187439, "losses/total": 0.6997188329696655, "ref_logps/chosen": -91.72111511230469, "ref_logps/rejected": -91.97481536865234, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3729521334171295, "rewards/margins": 0.1768990159034729, "rewards/rejected": -0.5498511791229248, "step": 1235 }, { "epoch": 0.924630634000374, "grad_norm": 78.3178719567927, "learning_rate": 7.320474903404761e-09, "logps/chosen": -79.00079345703125, "logps/rejected": -79.6113052368164, "loss": 0.6698, "losses/dpo": 0.721586287021637, "losses/sft": 1.0320589542388916, "losses/total": 0.721586287021637, "ref_logps/chosen": -74.470458984375, "ref_logps/rejected": -73.15997314453125, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4530331790447235, "rewards/margins": 0.1920996904373169, "rewards/rejected": -0.6451328992843628, "step": 1236 }, { "epoch": 0.9253787170375912, "grad_norm": 57.15135109306424, "learning_rate": 7.175498399572877e-09, "logps/chosen": -87.14069366455078, "logps/rejected": -100.14297485351562, "loss": 0.604, "losses/dpo": 0.4727008640766144, "losses/sft": 0.9298191070556641, "losses/total": 0.4727008640766144, "ref_logps/chosen": -83.98503112792969, "ref_logps/rejected": -93.74565887451172, "rewards/accuracies": 0.53125, "rewards/chosen": -0.3155660331249237, "rewards/margins": 0.3241654336452484, "rewards/rejected": -0.6397314667701721, "step": 1237 }, { "epoch": 0.9261268000748083, "grad_norm": 81.11441081431924, "learning_rate": 7.031950962447991e-09, "logps/chosen": -93.28671264648438, "logps/rejected": -90.82169342041016, "loss": 0.7279, "losses/dpo": 1.1815458536148071, "losses/sft": 1.0409330129623413, "losses/total": 1.1815458536148071, "ref_logps/chosen": -87.54960632324219, "ref_logps/rejected": -84.65003967285156, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5737096667289734, "rewards/margins": 0.04345623776316643, "rewards/rejected": -0.6171658635139465, "step": 1238 }, { "epoch": 0.9268748831120255, "grad_norm": 74.0459764035259, "learning_rate": 6.889833436833081e-09, "logps/chosen": -95.54714965820312, "logps/rejected": -108.80522918701172, "loss": 0.6791, "losses/dpo": 0.653688907623291, "losses/sft": 0.5761976838111877, "losses/total": 0.653688907623291, "ref_logps/chosen": -91.09417724609375, "ref_logps/rejected": -103.49273681640625, "rewards/accuracies": 0.59375, "rewards/chosen": -0.44529712200164795, "rewards/margins": 0.08595234155654907, "rewards/rejected": -0.5312495231628418, "step": 1239 }, { "epoch": 0.9276229661492426, "grad_norm": 69.20934446339935, "learning_rate": 6.749146659115884e-09, "logps/chosen": -109.87352752685547, "logps/rejected": -112.47776794433594, "loss": 0.6669, "losses/dpo": 0.6426227688789368, "losses/sft": 0.7678746581077576, "losses/total": 0.6426227688789368, "ref_logps/chosen": -105.5895767211914, "ref_logps/rejected": -105.39892578125, "rewards/accuracies": 0.625, "rewards/chosen": -0.42839470505714417, "rewards/margins": 0.27948832511901855, "rewards/rejected": -0.7078830003738403, "step": 1240 }, { "epoch": 0.9283710491864597, "grad_norm": 99.15452307283203, "learning_rate": 6.6098914572638165e-09, "logps/chosen": -110.61351776123047, "logps/rejected": -110.61475372314453, "loss": 0.653, "losses/dpo": 0.6470342874526978, "losses/sft": 0.8137529492378235, "losses/total": 0.6470342874526978, "ref_logps/chosen": -106.12715148925781, "ref_logps/rejected": -103.7268295288086, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4486357271671295, "rewards/margins": 0.24015597999095917, "rewards/rejected": -0.6887916922569275, "step": 1241 }, { "epoch": 0.9291191322236768, "grad_norm": 68.83922780578564, "learning_rate": 6.47206865081934e-09, "logps/chosen": -90.7609634399414, "logps/rejected": -100.46366119384766, "loss": 0.6594, "losses/dpo": 0.700676441192627, "losses/sft": 1.3327327966690063, "losses/total": 0.700676441192627, "ref_logps/chosen": -86.2029037475586, "ref_logps/rejected": -93.86976623535156, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4558061361312866, "rewards/margins": 0.20358309149742126, "rewards/rejected": -0.6593892574310303, "step": 1242 }, { "epoch": 0.929867215260894, "grad_norm": 92.67243301051231, "learning_rate": 6.335679050894882e-09, "logps/chosen": -95.54631042480469, "logps/rejected": -107.45690155029297, "loss": 0.6048, "losses/dpo": 0.47858893871307373, "losses/sft": 1.0310101509094238, "losses/total": 0.47858893871307373, "ref_logps/chosen": -92.20785522460938, "ref_logps/rejected": -100.50596618652344, "rewards/accuracies": 0.625, "rewards/chosen": -0.33384543657302856, "rewards/margins": 0.3612479567527771, "rewards/rejected": -0.6950933933258057, "step": 1243 }, { "epoch": 0.9306152982981111, "grad_norm": 71.3026856765959, "learning_rate": 6.200723460168283e-09, "logps/chosen": -105.4922103881836, "logps/rejected": -112.46110534667969, "loss": 0.5843, "losses/dpo": 0.6741883158683777, "losses/sft": 1.028874397277832, "losses/total": 0.6741883158683777, "ref_logps/chosen": -101.82792663574219, "ref_logps/rejected": -105.48645782470703, "rewards/accuracies": 0.75, "rewards/chosen": -0.36642926931381226, "rewards/margins": 0.3310351073741913, "rewards/rejected": -0.6974642872810364, "step": 1244 }, { "epoch": 0.9313633813353283, "grad_norm": 61.41682562770829, "learning_rate": 6.067202672877886e-09, "logps/chosen": -79.63488006591797, "logps/rejected": -84.09317016601562, "loss": 0.6701, "losses/dpo": 0.43688374757766724, "losses/sft": 0.9685957431793213, "losses/total": 0.43688374757766724, "ref_logps/chosen": -74.95785522460938, "ref_logps/rejected": -77.4916763305664, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4677029252052307, "rewards/margins": 0.19244712591171265, "rewards/rejected": -0.6601500511169434, "step": 1245 }, { "epoch": 0.9321114643725453, "grad_norm": 66.10149304641376, "learning_rate": 5.935117474818036e-09, "logps/chosen": -91.5979232788086, "logps/rejected": -92.09662628173828, "loss": 0.6292, "losses/dpo": 0.7972444891929626, "losses/sft": 0.7688626646995544, "losses/total": 0.7972444891929626, "ref_logps/chosen": -88.14134216308594, "ref_logps/rejected": -86.80597686767578, "rewards/accuracies": 0.65625, "rewards/chosen": -0.34565767645835876, "rewards/margins": 0.18340745568275452, "rewards/rejected": -0.5290651321411133, "step": 1246 }, { "epoch": 0.9328595474097625, "grad_norm": 61.54462256163135, "learning_rate": 5.804468643334337e-09, "logps/chosen": -98.03744506835938, "logps/rejected": -111.68763732910156, "loss": 0.5992, "losses/dpo": 0.7667149305343628, "losses/sft": 0.9969642758369446, "losses/total": 0.7667149305343628, "ref_logps/chosen": -94.59860229492188, "ref_logps/rejected": -105.16889190673828, "rewards/accuracies": 0.6875, "rewards/chosen": -0.34388336539268494, "rewards/margins": 0.30799058079719543, "rewards/rejected": -0.6518739461898804, "step": 1247 }, { "epoch": 0.9336076304469796, "grad_norm": 64.32522277734151, "learning_rate": 5.675256947319101e-09, "logps/chosen": -91.57855987548828, "logps/rejected": -106.18276977539062, "loss": 0.6406, "losses/dpo": 0.7943019866943359, "losses/sft": 1.297351360321045, "losses/total": 0.7943019866943359, "ref_logps/chosen": -87.93257141113281, "ref_logps/rejected": -100.49467468261719, "rewards/accuracies": 0.625, "rewards/chosen": -0.36459892988204956, "rewards/margins": 0.20421108603477478, "rewards/rejected": -0.568809986114502, "step": 1248 }, { "epoch": 0.9343557134841968, "grad_norm": 71.67894808936154, "learning_rate": 5.5474831472068494e-09, "logps/chosen": -102.41458892822266, "logps/rejected": -113.39787292480469, "loss": 0.645, "losses/dpo": 0.7413765788078308, "losses/sft": 0.7929729223251343, "losses/total": 0.7413765788078308, "ref_logps/chosen": -97.41638946533203, "ref_logps/rejected": -106.37448120117188, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4998195767402649, "rewards/margins": 0.20251986384391785, "rewards/rejected": -0.7023395299911499, "step": 1249 }, { "epoch": 0.9351037965214138, "grad_norm": 72.96977427251781, "learning_rate": 5.421147994969816e-09, "logps/chosen": -89.41911315917969, "logps/rejected": -94.68203735351562, "loss": 0.6558, "losses/dpo": 0.6785516738891602, "losses/sft": 1.1680729389190674, "losses/total": 0.6785516738891602, "ref_logps/chosen": -84.53211212158203, "ref_logps/rejected": -88.12242889404297, "rewards/accuracies": 0.59375, "rewards/chosen": -0.48870038986206055, "rewards/margins": 0.16726049780845642, "rewards/rejected": -0.6559609174728394, "step": 1250 }, { "epoch": 0.935851879558631, "grad_norm": 57.67469088269401, "learning_rate": 5.296252234113563e-09, "logps/chosen": -86.19396209716797, "logps/rejected": -101.0201187133789, "loss": 0.554, "losses/dpo": 0.8114815950393677, "losses/sft": 1.3663127422332764, "losses/total": 0.8114815950393677, "ref_logps/chosen": -82.28874206542969, "ref_logps/rejected": -93.31858825683594, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3905225992202759, "rewards/margins": 0.379629909992218, "rewards/rejected": -0.7701525092124939, "step": 1251 }, { "epoch": 0.9365999625958481, "grad_norm": 57.569191145125565, "learning_rate": 5.172796599672485e-09, "logps/chosen": -78.7724380493164, "logps/rejected": -92.44013214111328, "loss": 0.6243, "losses/dpo": 0.619408130645752, "losses/sft": 0.522476315498352, "losses/total": 0.619408130645752, "ref_logps/chosen": -74.90829467773438, "ref_logps/rejected": -86.20232391357422, "rewards/accuracies": 0.5, "rewards/chosen": -0.3864150047302246, "rewards/margins": 0.23736697435379028, "rewards/rejected": -0.6237819194793701, "step": 1252 }, { "epoch": 0.9373480456330653, "grad_norm": 146.86623389523317, "learning_rate": 5.050781818205674e-09, "logps/chosen": -105.94985961914062, "logps/rejected": -117.50100708007812, "loss": 0.626, "losses/dpo": 0.6326907277107239, "losses/sft": 1.4153473377227783, "losses/total": 0.6326907277107239, "ref_logps/chosen": -101.58988189697266, "ref_logps/rejected": -110.65091705322266, "rewards/accuracies": 0.5625, "rewards/chosen": -0.435998797416687, "rewards/margins": 0.24900966882705688, "rewards/rejected": -0.6850084662437439, "step": 1253 }, { "epoch": 0.9380961286702824, "grad_norm": 80.26729437792785, "learning_rate": 4.930208607792446e-09, "logps/chosen": -87.78329467773438, "logps/rejected": -91.76683807373047, "loss": 0.6255, "losses/dpo": 1.0189101696014404, "losses/sft": 0.5597437620162964, "losses/total": 1.0189101696014404, "ref_logps/chosen": -83.38225555419922, "ref_logps/rejected": -84.48130798339844, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4401032328605652, "rewards/margins": 0.2884497046470642, "rewards/rejected": -0.7285529375076294, "step": 1254 }, { "epoch": 0.9388442117074995, "grad_norm": 54.8894738853845, "learning_rate": 4.811077678028269e-09, "logps/chosen": -83.20854949951172, "logps/rejected": -92.90223693847656, "loss": 0.5746, "losses/dpo": 0.6476658582687378, "losses/sft": 1.1551953554153442, "losses/total": 0.6476658582687378, "ref_logps/chosen": -79.10472106933594, "ref_logps/rejected": -84.98426818847656, "rewards/accuracies": 0.71875, "rewards/chosen": -0.41038310527801514, "rewards/margins": 0.3814135193824768, "rewards/rejected": -0.7917966246604919, "step": 1255 }, { "epoch": 0.9395922947447166, "grad_norm": 75.39583346544406, "learning_rate": 4.693389730020508e-09, "logps/chosen": -83.12604522705078, "logps/rejected": -96.59599304199219, "loss": 0.6907, "losses/dpo": 0.4767940044403076, "losses/sft": 0.6338489651679993, "losses/total": 0.4767940044403076, "ref_logps/chosen": -77.7559814453125, "ref_logps/rejected": -89.19522857666016, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5370060801506042, "rewards/margins": 0.2030705064535141, "rewards/rejected": -0.7400766015052795, "step": 1256 }, { "epoch": 0.9403403777819338, "grad_norm": 65.16521247678683, "learning_rate": 4.577145456384296e-09, "logps/chosen": -86.16363525390625, "logps/rejected": -92.77691650390625, "loss": 0.5917, "losses/dpo": 0.4787185788154602, "losses/sft": 1.1264342069625854, "losses/total": 0.4787185788154602, "ref_logps/chosen": -82.29118347167969, "ref_logps/rejected": -85.85858917236328, "rewards/accuracies": 0.71875, "rewards/chosen": -0.38724541664123535, "rewards/margins": 0.3045872449874878, "rewards/rejected": -0.6918326616287231, "step": 1257 }, { "epoch": 0.9410884608191509, "grad_norm": 86.40632386023347, "learning_rate": 4.4623455412385044e-09, "logps/chosen": -95.58470153808594, "logps/rejected": -99.73617553710938, "loss": 0.6945, "losses/dpo": 0.8956193327903748, "losses/sft": 1.4506009817123413, "losses/total": 0.8956193327903748, "ref_logps/chosen": -91.6731185913086, "ref_logps/rejected": -95.19740295410156, "rewards/accuracies": 0.625, "rewards/chosen": -0.39115816354751587, "rewards/margins": 0.06271902471780777, "rewards/rejected": -0.45387721061706543, "step": 1258 }, { "epoch": 0.9418365438563681, "grad_norm": 73.68328854446528, "learning_rate": 4.348990660201668e-09, "logps/chosen": -79.42451477050781, "logps/rejected": -82.93222045898438, "loss": 0.6391, "losses/dpo": 0.604732096195221, "losses/sft": 0.28594261407852173, "losses/total": 0.604732096195221, "ref_logps/chosen": -75.20147705078125, "ref_logps/rejected": -76.94418334960938, "rewards/accuracies": 0.5, "rewards/chosen": -0.42230409383773804, "rewards/margins": 0.17650005221366882, "rewards/rejected": -0.5988041758537292, "step": 1259 }, { "epoch": 0.9425846268935852, "grad_norm": 72.20067789916602, "learning_rate": 4.237081480388066e-09, "logps/chosen": -101.03079223632812, "logps/rejected": -104.33650207519531, "loss": 0.6666, "losses/dpo": 0.5850532650947571, "losses/sft": 0.8747610449790955, "losses/total": 0.5850532650947571, "ref_logps/chosen": -96.96773529052734, "ref_logps/rejected": -98.49101257324219, "rewards/accuracies": 0.625, "rewards/chosen": -0.40630537271499634, "rewards/margins": 0.17824366688728333, "rewards/rejected": -0.5845490097999573, "step": 1260 }, { "epoch": 0.9433327099308023, "grad_norm": 100.38386243790212, "learning_rate": 4.12661866040373e-09, "logps/chosen": -89.02254486083984, "logps/rejected": -96.82244873046875, "loss": 0.5978, "losses/dpo": 0.5262622833251953, "losses/sft": 1.1326239109039307, "losses/total": 0.5262622833251953, "ref_logps/chosen": -84.91191101074219, "ref_logps/rejected": -89.5614013671875, "rewards/accuracies": 0.71875, "rewards/chosen": -0.41106319427490234, "rewards/margins": 0.3150419592857361, "rewards/rejected": -0.7261051535606384, "step": 1261 }, { "epoch": 0.9440807929680195, "grad_norm": 74.92751568353475, "learning_rate": 4.0176028503425826e-09, "logps/chosen": -90.55907440185547, "logps/rejected": -89.90573120117188, "loss": 0.694, "losses/dpo": 0.6154741644859314, "losses/sft": 0.2331765592098236, "losses/total": 0.6154741644859314, "ref_logps/chosen": -86.22978210449219, "ref_logps/rejected": -84.57247924804688, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4329291582107544, "rewards/margins": 0.10039594024419785, "rewards/rejected": -0.5333250761032104, "step": 1262 }, { "epoch": 0.9448288760052366, "grad_norm": 76.4670645805167, "learning_rate": 3.910034691782693e-09, "logps/chosen": -90.55133056640625, "logps/rejected": -103.38450622558594, "loss": 0.6687, "losses/dpo": 0.6687107086181641, "losses/sft": 1.005706787109375, "losses/total": 0.6687107086181641, "ref_logps/chosen": -85.851806640625, "ref_logps/rejected": -97.51974487304688, "rewards/accuracies": 0.71875, "rewards/chosen": -0.469951868057251, "rewards/margins": 0.11652311682701111, "rewards/rejected": -0.5864749550819397, "step": 1263 }, { "epoch": 0.9455769590424538, "grad_norm": 64.59379080820479, "learning_rate": 3.803914817782333e-09, "logps/chosen": -57.3265380859375, "logps/rejected": -60.78833770751953, "loss": 0.6477, "losses/dpo": 0.6887168884277344, "losses/sft": 0.7462136745452881, "losses/total": 0.6887168884277344, "ref_logps/chosen": -54.22631072998047, "ref_logps/rejected": -55.7993278503418, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3100225329399109, "rewards/margins": 0.1888783723115921, "rewards/rejected": -0.4989008903503418, "step": 1264 }, { "epoch": 0.9463250420796708, "grad_norm": 64.81958169238959, "learning_rate": 3.6992438528764545e-09, "logps/chosen": -85.41490173339844, "logps/rejected": -85.54429626464844, "loss": 0.6921, "losses/dpo": 0.5190759897232056, "losses/sft": 0.6561710238456726, "losses/total": 0.5190759897232056, "ref_logps/chosen": -81.23255920410156, "ref_logps/rejected": -80.04067993164062, "rewards/accuracies": 0.59375, "rewards/chosen": -0.41823485493659973, "rewards/margins": 0.1321275234222412, "rewards/rejected": -0.5503623485565186, "step": 1265 }, { "epoch": 0.947073125116888, "grad_norm": 102.79159658053612, "learning_rate": 3.5960224130728858e-09, "logps/chosen": -96.9426040649414, "logps/rejected": -100.3900375366211, "loss": 0.685, "losses/dpo": 0.8092766404151917, "losses/sft": 0.729993462562561, "losses/total": 0.8092766404151917, "ref_logps/chosen": -92.78230285644531, "ref_logps/rejected": -95.28356170654297, "rewards/accuracies": 0.5, "rewards/chosen": -0.41603001952171326, "rewards/margins": 0.09461796283721924, "rewards/rejected": -0.5106479525566101, "step": 1266 }, { "epoch": 0.9478212081541051, "grad_norm": 74.88077445204371, "learning_rate": 3.4942511058486946e-09, "logps/chosen": -96.21996307373047, "logps/rejected": -102.39236450195312, "loss": 0.5341, "losses/dpo": 0.5375908613204956, "losses/sft": 0.9822946786880493, "losses/total": 0.5375908613204956, "ref_logps/chosen": -92.39061737060547, "ref_logps/rejected": -94.28872680664062, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3829345405101776, "rewards/margins": 0.42742928862571716, "rewards/rejected": -0.8103638291358948, "step": 1267 }, { "epoch": 0.9485692911913223, "grad_norm": 73.38887506209397, "learning_rate": 3.3939305301467213e-09, "logps/chosen": -94.9256591796875, "logps/rejected": -95.28760528564453, "loss": 0.6558, "losses/dpo": 0.6746376752853394, "losses/sft": 1.2706103324890137, "losses/total": 0.6746376752853394, "ref_logps/chosen": -91.43588256835938, "ref_logps/rejected": -90.21084594726562, "rewards/accuracies": 0.625, "rewards/chosen": -0.34897685050964355, "rewards/margins": 0.15869800746440887, "rewards/rejected": -0.5076748728752136, "step": 1268 }, { "epoch": 0.9493173742285393, "grad_norm": 71.41141378817586, "learning_rate": 3.2950612763718575e-09, "logps/chosen": -120.37641906738281, "logps/rejected": -120.72488403320312, "loss": 0.6753, "losses/dpo": 0.8567748665809631, "losses/sft": 1.141908884048462, "losses/total": 0.8567748665809631, "ref_logps/chosen": -115.78253173828125, "ref_logps/rejected": -115.0727767944336, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4593888223171234, "rewards/margins": 0.10582160949707031, "rewards/rejected": -0.5652104616165161, "step": 1269 }, { "epoch": 0.9500654572657565, "grad_norm": 106.75397518001773, "learning_rate": 3.1976439263879108e-09, "logps/chosen": -108.487548828125, "logps/rejected": -117.48857116699219, "loss": 0.6266, "losses/dpo": 0.589027464389801, "losses/sft": 0.7077415585517883, "losses/total": 0.589027464389801, "ref_logps/chosen": -103.2886734008789, "ref_logps/rejected": -109.93490600585938, "rewards/accuracies": 0.625, "rewards/chosen": -0.5198880434036255, "rewards/margins": 0.2354787290096283, "rewards/rejected": -0.7553667426109314, "step": 1270 }, { "epoch": 0.9508135403029736, "grad_norm": 112.98434601441662, "learning_rate": 3.101679053513745e-09, "logps/chosen": -81.78652954101562, "logps/rejected": -81.23521423339844, "loss": 0.6674, "losses/dpo": 0.6749790906906128, "losses/sft": 0.4219992160797119, "losses/total": 0.6749790906906128, "ref_logps/chosen": -77.88331604003906, "ref_logps/rejected": -75.68495178222656, "rewards/accuracies": 0.46875, "rewards/chosen": -0.3903220593929291, "rewards/margins": 0.1647038757801056, "rewards/rejected": -0.5550259351730347, "step": 1271 }, { "epoch": 0.9515616233401908, "grad_norm": 60.276765821706164, "learning_rate": 3.007167222520285e-09, "logps/chosen": -83.98312377929688, "logps/rejected": -95.92047882080078, "loss": 0.6272, "losses/dpo": 0.5152997374534607, "losses/sft": 1.0699357986450195, "losses/total": 0.5152997374534607, "ref_logps/chosen": -80.39696502685547, "ref_logps/rejected": -89.82666015625, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3586156666278839, "rewards/margins": 0.25076696276664734, "rewards/rejected": -0.609382688999176, "step": 1272 }, { "epoch": 0.9523097063774079, "grad_norm": 71.46365625187725, "learning_rate": 2.9141089896269343e-09, "logps/chosen": -75.966064453125, "logps/rejected": -89.46366119384766, "loss": 0.5836, "losses/dpo": 0.47240912914276123, "losses/sft": 0.8180992007255554, "losses/total": 0.47240912914276123, "ref_logps/chosen": -73.65571594238281, "ref_logps/rejected": -84.02318572998047, "rewards/accuracies": 0.6875, "rewards/chosen": -0.23103468120098114, "rewards/margins": 0.3130127787590027, "rewards/rejected": -0.544047474861145, "step": 1273 }, { "epoch": 0.953057789414625, "grad_norm": 77.83106889500719, "learning_rate": 2.8225049024984958e-09, "logps/chosen": -99.20821380615234, "logps/rejected": -100.75440979003906, "loss": 0.7004, "losses/dpo": 0.8866727352142334, "losses/sft": 1.1024925708770752, "losses/total": 0.8866727352142334, "ref_logps/chosen": -95.60607147216797, "ref_logps/rejected": -96.66063690185547, "rewards/accuracies": 0.46875, "rewards/chosen": -0.3602145314216614, "rewards/margins": 0.04916220158338547, "rewards/rejected": -0.40937677025794983, "step": 1274 }, { "epoch": 0.9538058724518421, "grad_norm": 79.96821849865503, "learning_rate": 2.732355500241784e-09, "logps/chosen": -83.5235595703125, "logps/rejected": -96.28414154052734, "loss": 0.6632, "losses/dpo": 0.632606029510498, "losses/sft": 0.7773022055625916, "losses/total": 0.632606029510498, "ref_logps/chosen": -78.8705062866211, "ref_logps/rejected": -89.7353515625, "rewards/accuracies": 0.625, "rewards/chosen": -0.4653058648109436, "rewards/margins": 0.18957401812076569, "rewards/rejected": -0.6548798680305481, "step": 1275 }, { "epoch": 0.9545539554890593, "grad_norm": 63.0955049706202, "learning_rate": 2.643661313402601e-09, "logps/chosen": -83.42410278320312, "logps/rejected": -91.70938873291016, "loss": 0.6239, "losses/dpo": 0.541158139705658, "losses/sft": 0.8344438076019287, "losses/total": 0.541158139705658, "ref_logps/chosen": -79.14330291748047, "ref_logps/rejected": -84.63862609863281, "rewards/accuracies": 0.65625, "rewards/chosen": -0.42808040976524353, "rewards/margins": 0.27899545431137085, "rewards/rejected": -0.7070759534835815, "step": 1276 }, { "epoch": 0.9553020385262764, "grad_norm": 81.44591798991807, "learning_rate": 2.5564228639624595e-09, "logps/chosen": -107.83403778076172, "logps/rejected": -101.70565795898438, "loss": 0.6936, "losses/dpo": 0.529888927936554, "losses/sft": 1.4031325578689575, "losses/total": 0.529888927936554, "ref_logps/chosen": -103.25970458984375, "ref_logps/rejected": -96.03228759765625, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4574335813522339, "rewards/margins": 0.10990400612354279, "rewards/rejected": -0.5673376321792603, "step": 1277 }, { "epoch": 0.9560501215634936, "grad_norm": 79.28039337405126, "learning_rate": 2.470640665335616e-09, "logps/chosen": -92.14910888671875, "logps/rejected": -87.18870544433594, "loss": 0.7061, "losses/dpo": 0.6503806114196777, "losses/sft": 0.439592570066452, "losses/total": 0.6503806114196777, "ref_logps/chosen": -88.0286865234375, "ref_logps/rejected": -82.41332244873047, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4120418131351471, "rewards/margins": 0.06549699604511261, "rewards/rejected": -0.4775388240814209, "step": 1278 }, { "epoch": 0.9567982046007106, "grad_norm": 86.54434542549949, "learning_rate": 2.38631522236607e-09, "logps/chosen": -102.78597259521484, "logps/rejected": -106.81980895996094, "loss": 0.6551, "losses/dpo": 0.4825819730758667, "losses/sft": 1.4411890506744385, "losses/total": 0.4825819730758667, "ref_logps/chosen": -98.07181549072266, "ref_logps/rejected": -100.0429458618164, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4714156687259674, "rewards/margins": 0.20627160370349884, "rewards/rejected": -0.6776872873306274, "step": 1279 }, { "epoch": 0.9575462876379278, "grad_norm": 67.51035968260344, "learning_rate": 2.3034470313244846e-09, "logps/chosen": -91.01301574707031, "logps/rejected": -96.21817779541016, "loss": 0.637, "losses/dpo": 0.717863917350769, "losses/sft": 0.9386616349220276, "losses/total": 0.717863917350769, "ref_logps/chosen": -86.67925262451172, "ref_logps/rejected": -89.9822998046875, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4333759546279907, "rewards/margins": 0.1902109980583191, "rewards/rejected": -0.623586893081665, "step": 1280 }, { "epoch": 0.9582943706751449, "grad_norm": 117.22914910925059, "learning_rate": 2.222036579905384e-09, "logps/chosen": -73.81900024414062, "logps/rejected": -80.35090637207031, "loss": 0.6378, "losses/dpo": 0.6475811004638672, "losses/sft": 0.6526294946670532, "losses/total": 0.6475811004638672, "ref_logps/chosen": -70.14810943603516, "ref_logps/rejected": -74.48573303222656, "rewards/accuracies": 0.6875, "rewards/chosen": -0.36708909273147583, "rewards/margins": 0.21942967176437378, "rewards/rejected": -0.5865187644958496, "step": 1281 }, { "epoch": 0.9590424537123621, "grad_norm": 75.74459294576353, "learning_rate": 2.1420843472241823e-09, "logps/chosen": -101.13557434082031, "logps/rejected": -109.37317657470703, "loss": 0.6558, "losses/dpo": 0.46930447220802307, "losses/sft": 0.5531554222106934, "losses/total": 0.46930447220802307, "ref_logps/chosen": -96.80693054199219, "ref_logps/rejected": -103.29495239257812, "rewards/accuracies": 0.65625, "rewards/chosen": -0.43286368250846863, "rewards/margins": 0.1749587506055832, "rewards/rejected": -0.6078224182128906, "step": 1282 }, { "epoch": 0.9597905367495791, "grad_norm": 75.33937048000686, "learning_rate": 2.063590803814408e-09, "logps/chosen": -131.90916442871094, "logps/rejected": -134.93414306640625, "loss": 0.6663, "losses/dpo": 0.7749282717704773, "losses/sft": 1.7364041805267334, "losses/total": 0.7749282717704773, "ref_logps/chosen": -126.7481689453125, "ref_logps/rejected": -127.85588073730469, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5160984992980957, "rewards/margins": 0.1917291134595871, "rewards/rejected": -0.707827627658844, "step": 1283 }, { "epoch": 0.9605386197867963, "grad_norm": 73.89731768126526, "learning_rate": 1.986556411625012e-09, "logps/chosen": -69.86791229248047, "logps/rejected": -70.85253143310547, "loss": 0.6365, "losses/dpo": 0.6384409070014954, "losses/sft": 1.014013409614563, "losses/total": 0.6384409070014954, "ref_logps/chosen": -66.80302429199219, "ref_logps/rejected": -66.27749633789062, "rewards/accuracies": 0.75, "rewards/chosen": -0.3064892888069153, "rewards/margins": 0.15101465582847595, "rewards/rejected": -0.45750391483306885, "step": 1284 }, { "epoch": 0.9612867028240135, "grad_norm": 61.652487654047626, "learning_rate": 1.9109816240174547e-09, "logps/chosen": -104.24888610839844, "logps/rejected": -111.75657653808594, "loss": 0.6803, "losses/dpo": 0.8186298608779907, "losses/sft": 1.0933948755264282, "losses/total": 0.8186298608779907, "ref_logps/chosen": -98.85260009765625, "ref_logps/rejected": -105.16178894042969, "rewards/accuracies": 0.625, "rewards/chosen": -0.5396279096603394, "rewards/margins": 0.11985226720571518, "rewards/rejected": -0.6594801545143127, "step": 1285 }, { "epoch": 0.9620347858612306, "grad_norm": 56.14765216474034, "learning_rate": 1.8368668857632331e-09, "logps/chosen": -81.17070770263672, "logps/rejected": -96.7817153930664, "loss": 0.5741, "losses/dpo": 0.582453727722168, "losses/sft": 0.9944047331809998, "losses/total": 0.582453727722168, "ref_logps/chosen": -77.49075317382812, "ref_logps/rejected": -89.53260040283203, "rewards/accuracies": 0.65625, "rewards/chosen": -0.36799633502960205, "rewards/margins": 0.3569154143333435, "rewards/rejected": -0.7249118089675903, "step": 1286 }, { "epoch": 0.9627828688984478, "grad_norm": 54.63398722865739, "learning_rate": 1.7642126330411622e-09, "logps/chosen": -87.83833312988281, "logps/rejected": -98.63851928710938, "loss": 0.6223, "losses/dpo": 0.5525274276733398, "losses/sft": 0.6002968549728394, "losses/total": 0.5525274276733398, "ref_logps/chosen": -84.23351287841797, "ref_logps/rejected": -92.71544647216797, "rewards/accuracies": 0.71875, "rewards/chosen": -0.36048251390457153, "rewards/margins": 0.2318248748779297, "rewards/rejected": -0.5923073887825012, "step": 1287 }, { "epoch": 0.9635309519356648, "grad_norm": 61.855616398241544, "learning_rate": 1.6930192934348497e-09, "logps/chosen": -86.21490478515625, "logps/rejected": -87.61050415039062, "loss": 0.6101, "losses/dpo": 0.6766643524169922, "losses/sft": 1.2049895524978638, "losses/total": 0.6766643524169922, "ref_logps/chosen": -82.41419219970703, "ref_logps/rejected": -81.24259185791016, "rewards/accuracies": 0.65625, "rewards/chosen": -0.38007161021232605, "rewards/margins": 0.25671902298927307, "rewards/rejected": -0.6367906332015991, "step": 1288 }, { "epoch": 0.964279034972882, "grad_norm": 49.5852251135411, "learning_rate": 1.6232872859301694e-09, "logps/chosen": -88.26910400390625, "logps/rejected": -92.61994934082031, "loss": 0.5994, "losses/dpo": 0.6616922616958618, "losses/sft": 0.47489282488822937, "losses/total": 0.6616922616958618, "ref_logps/chosen": -84.48704528808594, "ref_logps/rejected": -86.00242614746094, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3782059848308563, "rewards/margins": 0.28354552388191223, "rewards/rejected": -0.6617515087127686, "step": 1289 }, { "epoch": 0.9650271180100991, "grad_norm": 69.74730610174358, "learning_rate": 1.5550170209127355e-09, "logps/chosen": -71.93753051757812, "logps/rejected": -83.51567840576172, "loss": 0.6419, "losses/dpo": 0.5912843942642212, "losses/sft": 0.2716396749019623, "losses/total": 0.5912843942642212, "ref_logps/chosen": -67.86094665527344, "ref_logps/rejected": -77.68801879882812, "rewards/accuracies": 0.71875, "rewards/chosen": -0.40765923261642456, "rewards/margins": 0.17510655522346497, "rewards/rejected": -0.5827657580375671, "step": 1290 }, { "epoch": 0.9657752010473163, "grad_norm": 64.78936345789371, "learning_rate": 1.4882089001655718e-09, "logps/chosen": -89.64180755615234, "logps/rejected": -97.2757568359375, "loss": 0.6424, "losses/dpo": 0.6967378854751587, "losses/sft": 0.7279509902000427, "losses/total": 0.6967378854751587, "ref_logps/chosen": -85.8584976196289, "ref_logps/rejected": -91.52503204345703, "rewards/accuracies": 0.5625, "rewards/chosen": -0.37833118438720703, "rewards/margins": 0.19674065709114075, "rewards/rejected": -0.5750718116760254, "step": 1291 }, { "epoch": 0.9665232840845334, "grad_norm": 70.302702428824, "learning_rate": 1.4228633168667514e-09, "logps/chosen": -84.2426528930664, "logps/rejected": -114.12158203125, "loss": 0.5274, "losses/dpo": 0.42201685905456543, "losses/sft": 0.46827995777130127, "losses/total": 0.42201685905456543, "ref_logps/chosen": -80.81484985351562, "ref_logps/rejected": -106.03266143798828, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3427804708480835, "rewards/margins": 0.466111421585083, "rewards/rejected": -0.8088918924331665, "step": 1292 }, { "epoch": 0.9672713671217505, "grad_norm": 82.97982763559824, "learning_rate": 1.3589806555869832e-09, "logps/chosen": -105.94963836669922, "logps/rejected": -115.22958374023438, "loss": 0.6479, "losses/dpo": 0.692333996295929, "losses/sft": 0.9786175489425659, "losses/total": 0.692333996295929, "ref_logps/chosen": -102.22892761230469, "ref_logps/rejected": -109.9055404663086, "rewards/accuracies": 0.625, "rewards/chosen": -0.3720705211162567, "rewards/margins": 0.1603328287601471, "rewards/rejected": -0.5324033498764038, "step": 1293 }, { "epoch": 0.9680194501589676, "grad_norm": 61.14918509226529, "learning_rate": 1.2965612922874458e-09, "logps/chosen": -99.99383544921875, "logps/rejected": -111.31785583496094, "loss": 0.6045, "losses/dpo": 0.9459117650985718, "losses/sft": 1.1964493989944458, "losses/total": 0.9459117650985718, "ref_logps/chosen": -95.78019714355469, "ref_logps/rejected": -103.39155578613281, "rewards/accuracies": 0.625, "rewards/chosen": -0.4213640093803406, "rewards/margins": 0.37126603722572327, "rewards/rejected": -0.7926300764083862, "step": 1294 }, { "epoch": 0.9687675331961848, "grad_norm": 114.17747429325247, "learning_rate": 1.2356055943175404e-09, "logps/chosen": -78.83253479003906, "logps/rejected": -69.7047348022461, "loss": 0.7482, "losses/dpo": 0.6715818047523499, "losses/sft": 0.6023010611534119, "losses/total": 0.6715818047523499, "ref_logps/chosen": -74.82839965820312, "ref_logps/rejected": -66.21331024169922, "rewards/accuracies": 0.40625, "rewards/chosen": -0.4004148244857788, "rewards/margins": -0.05127238482236862, "rewards/rejected": -0.3491424322128296, "step": 1295 }, { "epoch": 0.9695156162334019, "grad_norm": 72.21463405050235, "learning_rate": 1.1761139204126968e-09, "logps/chosen": -82.67105102539062, "logps/rejected": -97.55622100830078, "loss": 0.6283, "losses/dpo": 0.508527934551239, "losses/sft": 0.751926600933075, "losses/total": 0.508527934551239, "ref_logps/chosen": -79.39065551757812, "ref_logps/rejected": -92.26617431640625, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3280394375324249, "rewards/margins": 0.200965017080307, "rewards/rejected": -0.5290044546127319, "step": 1296 }, { "epoch": 0.9702636992706191, "grad_norm": 63.38572325265337, "learning_rate": 1.1180866206923767e-09, "logps/chosen": -87.83694458007812, "logps/rejected": -97.29962158203125, "loss": 0.6672, "losses/dpo": 0.5804587602615356, "losses/sft": 0.45879143476486206, "losses/total": 0.5804587602615356, "ref_logps/chosen": -83.86869049072266, "ref_logps/rejected": -91.91288757324219, "rewards/accuracies": 0.59375, "rewards/chosen": -0.39682549238204956, "rewards/margins": 0.14184804260730743, "rewards/rejected": -0.5386735200881958, "step": 1297 }, { "epoch": 0.9710117823078361, "grad_norm": 64.10364779300686, "learning_rate": 1.0615240366578238e-09, "logps/chosen": -104.44111633300781, "logps/rejected": -109.71001434326172, "loss": 0.6823, "losses/dpo": 0.6608184576034546, "losses/sft": 0.8007652759552002, "losses/total": 0.6608184576034546, "ref_logps/chosen": -99.48912048339844, "ref_logps/rejected": -103.3502197265625, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4951990842819214, "rewards/margins": 0.1407795399427414, "rewards/rejected": -0.635978639125824, "step": 1298 }, { "epoch": 0.9717598653450533, "grad_norm": 68.79766584639682, "learning_rate": 1.0064265011902328e-09, "logps/chosen": -87.8827896118164, "logps/rejected": -87.71543884277344, "loss": 0.6801, "losses/dpo": 0.7329565286636353, "losses/sft": 0.8390571475028992, "losses/total": 0.7329565286636353, "ref_logps/chosen": -83.08531188964844, "ref_logps/rejected": -81.80152893066406, "rewards/accuracies": 0.5, "rewards/chosen": -0.4797479212284088, "rewards/margins": 0.11164234578609467, "rewards/rejected": -0.5913902521133423, "step": 1299 }, { "epoch": 0.9725079483822704, "grad_norm": 67.98294216672238, "learning_rate": 9.527943385487237e-10, "logps/chosen": -100.54510498046875, "logps/rejected": -113.80298614501953, "loss": 0.6279, "losses/dpo": 0.6512966156005859, "losses/sft": 0.5200973749160767, "losses/total": 0.6512966156005859, "ref_logps/chosen": -96.70021057128906, "ref_logps/rejected": -107.7091293334961, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3844892382621765, "rewards/margins": 0.22489586472511292, "rewards/rejected": -0.6093851327896118, "step": 1300 }, { "epoch": 0.9732560314194876, "grad_norm": 96.61259248531424, "learning_rate": 9.006278643683696e-10, "logps/chosen": -82.07158660888672, "logps/rejected": -100.61835479736328, "loss": 0.6332, "losses/dpo": 0.34201544523239136, "losses/sft": 0.931149423122406, "losses/total": 0.34201544523239136, "ref_logps/chosen": -78.21710968017578, "ref_logps/rejected": -93.56779479980469, "rewards/accuracies": 0.59375, "rewards/chosen": -0.38544726371765137, "rewards/margins": 0.31960880756378174, "rewards/rejected": -0.7050560712814331, "step": 1301 }, { "epoch": 0.9740041144567047, "grad_norm": 79.18269131217586, "learning_rate": 8.499273856584499e-10, "logps/chosen": -105.54087829589844, "logps/rejected": -103.38949584960938, "loss": 0.6305, "losses/dpo": 0.8206950426101685, "losses/sft": 0.5264465808868408, "losses/total": 0.8206950426101685, "ref_logps/chosen": -102.09628295898438, "ref_logps/rejected": -97.83584594726562, "rewards/accuracies": 0.625, "rewards/chosen": -0.344458669424057, "rewards/margins": 0.2109055072069168, "rewards/rejected": -0.555364191532135, "step": 1302 }, { "epoch": 0.9747521974939218, "grad_norm": 70.18013434685594, "learning_rate": 8.006932008005896e-10, "logps/chosen": -103.04061126708984, "logps/rejected": -109.25037384033203, "loss": 0.555, "losses/dpo": 0.5650319457054138, "losses/sft": 0.2586176097393036, "losses/total": 0.5650319457054138, "ref_logps/chosen": -99.7252197265625, "ref_logps/rejected": -101.79660034179688, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3315396308898926, "rewards/margins": 0.41383635997772217, "rewards/rejected": -0.7453759908676147, "step": 1303 }, { "epoch": 0.9755002805311389, "grad_norm": 68.52295177784681, "learning_rate": 7.52925599546983e-10, "logps/chosen": -96.091552734375, "logps/rejected": -104.15351867675781, "loss": 0.5939, "losses/dpo": 0.6620636582374573, "losses/sft": 1.141943097114563, "losses/total": 0.6620636582374573, "ref_logps/chosen": -91.49202728271484, "ref_logps/rejected": -96.4551010131836, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4599514603614807, "rewards/margins": 0.30989035964012146, "rewards/rejected": -0.7698418498039246, "step": 1304 }, { "epoch": 0.9762483635683561, "grad_norm": 79.2925865720013, "learning_rate": 7.066248630187011e-10, "logps/chosen": -110.80089569091797, "logps/rejected": -102.37039184570312, "loss": 0.6791, "losses/dpo": 0.8017555475234985, "losses/sft": 0.49836814403533936, "losses/total": 0.8017555475234985, "ref_logps/chosen": -105.42405700683594, "ref_logps/rejected": -95.51387786865234, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5376837849617004, "rewards/margins": 0.14796766638755798, "rewards/rejected": -0.685651421546936, "step": 1305 }, { "epoch": 0.9769964466055732, "grad_norm": 118.2549219699941, "learning_rate": 6.617912637040813e-10, "logps/chosen": -111.16111755371094, "logps/rejected": -120.75802612304688, "loss": 0.7095, "losses/dpo": 0.4439311623573303, "losses/sft": 0.7588082551956177, "losses/total": 0.4439311623573303, "ref_logps/chosen": -106.41072082519531, "ref_logps/rejected": -114.44053649902344, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4750409722328186, "rewards/margins": 0.15670835971832275, "rewards/rejected": -0.6317492723464966, "step": 1306 }, { "epoch": 0.9777445296427903, "grad_norm": 72.84712500099538, "learning_rate": 6.184250654570622e-10, "logps/chosen": -87.40279388427734, "logps/rejected": -85.7686767578125, "loss": 0.7255, "losses/dpo": 0.7928342819213867, "losses/sft": 0.3035460114479065, "losses/total": 0.7928342819213867, "ref_logps/chosen": -82.33531188964844, "ref_logps/rejected": -79.92073059082031, "rewards/accuracies": 0.53125, "rewards/chosen": -0.5067480802536011, "rewards/margins": 0.07804706692695618, "rewards/rejected": -0.5847952365875244, "step": 1307 }, { "epoch": 0.9784926126800075, "grad_norm": 78.58732216488616, "learning_rate": 5.765265234957128e-10, "logps/chosen": -102.76844787597656, "logps/rejected": -100.03876495361328, "loss": 0.7062, "losses/dpo": 0.7405925989151001, "losses/sft": 1.0902342796325684, "losses/total": 0.7405925989151001, "ref_logps/chosen": -98.72447204589844, "ref_logps/rejected": -95.41537475585938, "rewards/accuracies": 0.5, "rewards/chosen": -0.404397577047348, "rewards/margins": 0.05794185772538185, "rewards/rejected": -0.46233946084976196, "step": 1308 }, { "epoch": 0.9792406957172246, "grad_norm": 81.89293999430382, "learning_rate": 5.360958844005947e-10, "logps/chosen": -94.035400390625, "logps/rejected": -97.40431213378906, "loss": 0.7302, "losses/dpo": 0.7635016441345215, "losses/sft": 1.041053056716919, "losses/total": 0.7635016441345215, "ref_logps/chosen": -89.22223663330078, "ref_logps/rejected": -92.01594543457031, "rewards/accuracies": 0.46875, "rewards/chosen": -0.4813160300254822, "rewards/margins": 0.05752033367753029, "rewards/rejected": -0.5388363599777222, "step": 1309 }, { "epoch": 0.9799887787544418, "grad_norm": 132.68217123348722, "learning_rate": 4.971333861134297e-10, "logps/chosen": -77.8878402709961, "logps/rejected": -81.98336029052734, "loss": 0.651, "losses/dpo": 0.6782549023628235, "losses/sft": 0.6017647385597229, "losses/total": 0.6782549023628235, "ref_logps/chosen": -73.04715728759766, "ref_logps/rejected": -75.16873168945312, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4840693473815918, "rewards/margins": 0.19739362597465515, "rewards/rejected": -0.6814630031585693, "step": 1310 }, { "epoch": 0.9807368617916589, "grad_norm": 80.66783861091871, "learning_rate": 4.596392579356845e-10, "logps/chosen": -81.34965515136719, "logps/rejected": -92.84780883789062, "loss": 0.6379, "losses/dpo": 0.5027711987495422, "losses/sft": 0.20820239186286926, "losses/total": 0.5027711987495422, "ref_logps/chosen": -76.31106567382812, "ref_logps/rejected": -85.27054595947266, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5038579702377319, "rewards/margins": 0.2538681626319885, "rewards/rejected": -0.7577261328697205, "step": 1311 }, { "epoch": 0.981484944828876, "grad_norm": 64.86082912313884, "learning_rate": 4.2361372052715504e-10, "logps/chosen": -90.80941772460938, "logps/rejected": -106.20515441894531, "loss": 0.6766, "losses/dpo": 0.8756207823753357, "losses/sft": 1.301910638809204, "losses/total": 0.8756207823753357, "ref_logps/chosen": -87.08192443847656, "ref_logps/rejected": -100.92549133300781, "rewards/accuracies": 0.625, "rewards/chosen": -0.3727492392063141, "rewards/margins": 0.1552160531282425, "rewards/rejected": -0.5279653072357178, "step": 1312 }, { "epoch": 0.9822330278660931, "grad_norm": 53.40856735710734, "learning_rate": 3.8905698590474545e-10, "logps/chosen": -81.74081420898438, "logps/rejected": -85.55778503417969, "loss": 0.6487, "losses/dpo": 0.7015489339828491, "losses/sft": 0.6425892114639282, "losses/total": 0.7015489339828491, "ref_logps/chosen": -77.84809112548828, "ref_logps/rejected": -79.47443389892578, "rewards/accuracies": 0.59375, "rewards/chosen": -0.38927268981933594, "rewards/margins": 0.21906207501888275, "rewards/rejected": -0.6083347797393799, "step": 1313 }, { "epoch": 0.9829811109033103, "grad_norm": 93.05723838714245, "learning_rate": 3.559692574411632e-10, "logps/chosen": -96.05827331542969, "logps/rejected": -112.90453338623047, "loss": 0.5307, "losses/dpo": 0.5287984013557434, "losses/sft": 0.21483367681503296, "losses/total": 0.5287984013557434, "ref_logps/chosen": -93.19007873535156, "ref_logps/rejected": -105.27076721191406, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2868192791938782, "rewards/margins": 0.47655636072158813, "rewards/rejected": -0.7633756399154663, "step": 1314 }, { "epoch": 0.9837291939405274, "grad_norm": 69.04917667348315, "learning_rate": 3.243507298637815e-10, "logps/chosen": -89.03345489501953, "logps/rejected": -97.64852905273438, "loss": 0.6033, "losses/dpo": 0.5143626928329468, "losses/sft": 0.6602886915206909, "losses/total": 0.5143626928329468, "ref_logps/chosen": -85.19662475585938, "ref_logps/rejected": -90.61492919921875, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3836819529533386, "rewards/margins": 0.31967759132385254, "rewards/rejected": -0.7033595442771912, "step": 1315 }, { "epoch": 0.9844772769777446, "grad_norm": 57.45169265617375, "learning_rate": 2.942015892534178e-10, "logps/chosen": -83.8006591796875, "logps/rejected": -103.95746612548828, "loss": 0.5131, "losses/dpo": 0.4938807189464569, "losses/sft": 0.8930040597915649, "losses/total": 0.4938807189464569, "ref_logps/chosen": -81.08343505859375, "ref_logps/rejected": -96.26600646972656, "rewards/accuracies": 0.8125, "rewards/chosen": -0.27172327041625977, "rewards/margins": 0.4974238872528076, "rewards/rejected": -0.7691471576690674, "step": 1316 }, { "epoch": 0.9852253600149616, "grad_norm": 60.54642347991924, "learning_rate": 2.6552201304327894e-10, "logps/chosen": -96.80669403076172, "logps/rejected": -112.01838684082031, "loss": 0.6376, "losses/dpo": 0.566399872303009, "losses/sft": 0.7315203547477722, "losses/total": 0.566399872303009, "ref_logps/chosen": -91.74836730957031, "ref_logps/rejected": -104.8538818359375, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5058324337005615, "rewards/margins": 0.21061666309833527, "rewards/rejected": -0.7164491415023804, "step": 1317 }, { "epoch": 0.9859734430521788, "grad_norm": 64.5226468833284, "learning_rate": 2.383121700179347e-10, "logps/chosen": -101.3904800415039, "logps/rejected": -111.77705383300781, "loss": 0.6253, "losses/dpo": 0.560895562171936, "losses/sft": 0.9854695796966553, "losses/total": 0.560895562171936, "ref_logps/chosen": -97.60802459716797, "ref_logps/rejected": -105.31426239013672, "rewards/accuracies": 0.71875, "rewards/chosen": -0.378245085477829, "rewards/margins": 0.268033891916275, "rewards/rejected": -0.646278977394104, "step": 1318 }, { "epoch": 0.9867215260893959, "grad_norm": 63.513857388270196, "learning_rate": 2.1257222031231813e-10, "logps/chosen": -88.48165130615234, "logps/rejected": -104.06352233886719, "loss": 0.5476, "losses/dpo": 0.5441508293151855, "losses/sft": 1.1057021617889404, "losses/total": 0.5441508293151855, "ref_logps/chosen": -85.66667938232422, "ref_logps/rejected": -96.83711242675781, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2814970314502716, "rewards/margins": 0.4411444067955017, "rewards/rejected": -0.7226414680480957, "step": 1319 }, { "epoch": 0.9874696091266131, "grad_norm": 81.94856472655385, "learning_rate": 1.8830231541072662e-10, "logps/chosen": -96.00165557861328, "logps/rejected": -96.39358520507812, "loss": 0.7362, "losses/dpo": 0.9954726099967957, "losses/sft": 0.9909012317657471, "losses/total": 0.9954726099967957, "ref_logps/chosen": -90.91478729248047, "ref_logps/rejected": -90.70841979980469, "rewards/accuracies": 0.5625, "rewards/chosen": -0.508686363697052, "rewards/margins": 0.059830471873283386, "rewards/rejected": -0.5685168504714966, "step": 1320 }, { "epoch": 0.9882176921638302, "grad_norm": 65.3281869548147, "learning_rate": 1.6550259814601675e-10, "logps/chosen": -82.9664306640625, "logps/rejected": -93.52983093261719, "loss": 0.5977, "losses/dpo": 0.41060230135917664, "losses/sft": 0.8095760941505432, "losses/total": 0.41060230135917664, "ref_logps/chosen": -79.28773498535156, "ref_logps/rejected": -86.7964859008789, "rewards/accuracies": 0.59375, "rewards/chosen": -0.36786991357803345, "rewards/margins": 0.30546486377716064, "rewards/rejected": -0.6733347773551941, "step": 1321 }, { "epoch": 0.9889657752010473, "grad_norm": 60.12987414487873, "learning_rate": 1.441732026986886e-10, "logps/chosen": -81.60006713867188, "logps/rejected": -86.21080017089844, "loss": 0.6497, "losses/dpo": 0.7147899866104126, "losses/sft": 0.5420001149177551, "losses/total": 0.7147899866104126, "ref_logps/chosen": -77.06005096435547, "ref_logps/rejected": -80.02911376953125, "rewards/accuracies": 0.5, "rewards/chosen": -0.4540022015571594, "rewards/margins": 0.1641664206981659, "rewards/rejected": -0.6181685924530029, "step": 1322 }, { "epoch": 0.9897138582382644, "grad_norm": 83.6500654531566, "learning_rate": 1.2431425459616395e-10, "logps/chosen": -86.39167785644531, "logps/rejected": -93.16567993164062, "loss": 0.6357, "losses/dpo": 0.5824193358421326, "losses/sft": 0.3098785877227783, "losses/total": 0.5824193358421326, "ref_logps/chosen": -82.73713684082031, "ref_logps/rejected": -87.11656188964844, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3654547929763794, "rewards/margins": 0.23945724964141846, "rewards/rejected": -0.6049120426177979, "step": 1323 }, { "epoch": 0.9904619412754816, "grad_norm": 64.47456485265684, "learning_rate": 1.0592587071195369e-10, "logps/chosen": -97.70494842529297, "logps/rejected": -118.50333404541016, "loss": 0.5966, "losses/dpo": 0.49155423045158386, "losses/sft": 1.0202101469039917, "losses/total": 0.49155423045158386, "ref_logps/chosen": -93.34709930419922, "ref_logps/rejected": -111.3034439086914, "rewards/accuracies": 0.6875, "rewards/chosen": -0.43578553199768066, "rewards/margins": 0.28420308232307434, "rewards/rejected": -0.7199886441230774, "step": 1324 }, { "epoch": 0.9912100243126987, "grad_norm": 64.6173964552279, "learning_rate": 8.900815926513039e-11, "logps/chosen": -84.8451156616211, "logps/rejected": -91.66860961914062, "loss": 0.6565, "losses/dpo": 0.5068252682685852, "losses/sft": 1.3847575187683105, "losses/total": 0.5068252682685852, "ref_logps/chosen": -80.75450134277344, "ref_logps/rejected": -85.83480834960938, "rewards/accuracies": 0.59375, "rewards/chosen": -0.40906113386154175, "rewards/margins": 0.1743195801973343, "rewards/rejected": -0.5833806991577148, "step": 1325 }, { "epoch": 0.9919581073499159, "grad_norm": 60.349395606389116, "learning_rate": 7.356121981946794e-11, "logps/chosen": -87.0782241821289, "logps/rejected": -98.17304992675781, "loss": 0.6488, "losses/dpo": 0.6069216728210449, "losses/sft": 1.3851189613342285, "losses/total": 0.6069216728210449, "ref_logps/chosen": -82.87784576416016, "ref_logps/rejected": -92.3551025390625, "rewards/accuracies": 0.65625, "rewards/chosen": -0.42003822326660156, "rewards/margins": 0.16175636649131775, "rewards/rejected": -0.5817945599555969, "step": 1326 }, { "epoch": 0.9927061903871329, "grad_norm": 61.259771649703794, "learning_rate": 5.958514328308073e-11, "logps/chosen": -98.26195526123047, "logps/rejected": -130.71095275878906, "loss": 0.5388, "losses/dpo": 0.4185025095939636, "losses/sft": 0.8679402470588684, "losses/total": 0.4185025095939636, "ref_logps/chosen": -95.21104431152344, "ref_logps/rejected": -123.1772689819336, "rewards/accuracies": 0.78125, "rewards/chosen": -0.30509164929389954, "rewards/margins": 0.4482765793800354, "rewards/rejected": -0.7533682584762573, "step": 1327 }, { "epoch": 0.9934542734243501, "grad_norm": 61.76036790921024, "learning_rate": 4.70800119077297e-11, "logps/chosen": -95.34400177001953, "logps/rejected": -117.6018295288086, "loss": 0.5417, "losses/dpo": 0.39317822456359863, "losses/sft": 1.2295933961868286, "losses/total": 0.39317822456359863, "ref_logps/chosen": -92.21922302246094, "ref_logps/rejected": -109.87973022460938, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3124782145023346, "rewards/margins": 0.4597315788269043, "rewards/rejected": -0.7722097635269165, "step": 1328 }, { "epoch": 0.9942023564615672, "grad_norm": 69.50306524748665, "learning_rate": 3.6045899288378315e-11, "logps/chosen": -100.78227233886719, "logps/rejected": -102.17129516601562, "loss": 0.6147, "losses/dpo": 0.789271354675293, "losses/sft": 0.6767371296882629, "losses/total": 0.789271354675293, "ref_logps/chosen": -97.66545104980469, "ref_logps/rejected": -96.53266143798828, "rewards/accuracies": 0.78125, "rewards/chosen": -0.31168216466903687, "rewards/margins": 0.2521807849407196, "rewards/rejected": -0.5638629794120789, "step": 1329 }, { "epoch": 0.9949504394987844, "grad_norm": 69.87262824167044, "learning_rate": 2.64828703628317e-11, "logps/chosen": -94.74053192138672, "logps/rejected": -109.99113464355469, "loss": 0.6068, "losses/dpo": 0.7316064834594727, "losses/sft": 0.8411780595779419, "losses/total": 0.7316064834594727, "ref_logps/chosen": -91.41384887695312, "ref_logps/rejected": -104.03785705566406, "rewards/accuracies": 0.65625, "rewards/chosen": -0.33266735076904297, "rewards/margins": 0.26266032457351685, "rewards/rejected": -0.5953276753425598, "step": 1330 }, { "epoch": 0.9956985225360016, "grad_norm": 68.42335725038146, "learning_rate": 1.8390981411264828e-11, "logps/chosen": -92.83131408691406, "logps/rejected": -99.8111572265625, "loss": 0.6347, "losses/dpo": 0.6106313467025757, "losses/sft": 1.2885255813598633, "losses/total": 0.6106313467025757, "ref_logps/chosen": -88.61311340332031, "ref_logps/rejected": -93.46829223632812, "rewards/accuracies": 0.71875, "rewards/chosen": -0.42181915044784546, "rewards/margins": 0.21246755123138428, "rewards/rejected": -0.6342867016792297, "step": 1331 }, { "epoch": 0.9964466055732186, "grad_norm": 78.47366711376876, "learning_rate": 1.1770280055917182e-11, "logps/chosen": -103.89241790771484, "logps/rejected": -114.40785217285156, "loss": 0.6217, "losses/dpo": 0.43720531463623047, "losses/sft": 0.603500485420227, "losses/total": 0.43720531463623047, "ref_logps/chosen": -99.35454559326172, "ref_logps/rejected": -107.19087219238281, "rewards/accuracies": 0.53125, "rewards/chosen": -0.45378783345222473, "rewards/margins": 0.26790985465049744, "rewards/rejected": -0.7216977477073669, "step": 1332 }, { "epoch": 0.9971946886104358, "grad_norm": 72.48824201803683, "learning_rate": 6.6208052608429654e-12, "logps/chosen": -87.69355773925781, "logps/rejected": -87.1852798461914, "loss": 0.6518, "losses/dpo": 0.9078832864761353, "losses/sft": 1.1307129859924316, "losses/total": 0.9078832864761353, "ref_logps/chosen": -83.3745346069336, "ref_logps/rejected": -80.61248779296875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4319019317626953, "rewards/margins": 0.22537687420845032, "rewards/rejected": -0.6572787761688232, "step": 1333 }, { "epoch": 0.9979427716476529, "grad_norm": 67.58068988144235, "learning_rate": 2.942587331605795e-12, "logps/chosen": -86.97212982177734, "logps/rejected": -95.52812194824219, "loss": 0.6279, "losses/dpo": 0.7165204286575317, "losses/sft": 0.5361387729644775, "losses/total": 0.7165204286575317, "ref_logps/chosen": -82.60633087158203, "ref_logps/rejected": -89.1469497680664, "rewards/accuracies": 0.65625, "rewards/chosen": -0.43657994270324707, "rewards/margins": 0.20153796672821045, "rewards/rejected": -0.6381179094314575, "step": 1334 }, { "epoch": 0.9986908546848701, "grad_norm": 55.124565589905835, "learning_rate": 7.356479152509365e-13, "logps/chosen": -91.50282287597656, "logps/rejected": -104.527587890625, "loss": 0.625, "losses/dpo": 0.61297607421875, "losses/sft": 0.9364376068115234, "losses/total": 0.61297607421875, "ref_logps/chosen": -87.8662109375, "ref_logps/rejected": -98.70712280273438, "rewards/accuracies": 0.625, "rewards/chosen": -0.36366093158721924, "rewards/margins": 0.21838600933551788, "rewards/rejected": -0.5820469260215759, "step": 1335 }, { "epoch": 0.9994389377220871, "grad_norm": 75.97489913727874, "learning_rate": 0.0, "logps/chosen": -106.40237426757812, "logps/rejected": -117.79166412353516, "loss": 0.6194, "losses/dpo": 0.4642177224159241, "losses/sft": 1.143139123916626, "losses/total": 0.4642177224159241, "ref_logps/chosen": -102.15181732177734, "ref_logps/rejected": -110.46540832519531, "rewards/accuracies": 0.75, "rewards/chosen": -0.42505496740341187, "rewards/margins": 0.3075697422027588, "rewards/rejected": -0.7326247096061707, "step": 1336 }, { "epoch": 0.9994389377220871, "step": 1336, "total_flos": 0.0, "train_loss": 0.6421700569014707, "train_runtime": 8077.4671, "train_samples_per_second": 5.295, "train_steps_per_second": 0.165 } ], "logging_steps": 1.0, "max_steps": 1336, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }