diff --git "a/trainer_state.json" "b/trainer_state.json" deleted file mode 100644--- "a/trainer_state.json" +++ /dev/null @@ -1,11728 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.9957215091404122, - "eval_steps": 64, - "global_step": 480, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.002074419810709192, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5625, - "grad_norm": 30.445291710986226, - "learning_rate": 0.0, - "logits/chosen": 1.3143655061721802, - "logits/rejected": 1.334812045097351, - "logps/accuracies": 0.4375, - "logps/chosen": -329.3199157714844, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -329.3199157714844, - "logps/ref_rejected": -308.284912109375, - "logps/rejected": -308.284912109375, - "loss": 1.0, - "rewards/accuracies": 0.0, - "rewards/chosen": 0.0, - "rewards/grad_term": 0.05000000447034836, - "rewards/margins": 0.0, - "rewards/rejected": 0.0, - "step": 1 - }, - { - "epoch": 0.004148839621418384, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.4375, - "grad_norm": 31.539643174909184, - "learning_rate": 1.5151715240963886e-07, - "logits/chosen": 1.136220932006836, - "logits/rejected": 1.1561778783798218, - "logps/accuracies": 0.5625, - "logps/chosen": -280.4060363769531, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -280.13531494140625, - "logps/ref_rejected": -287.2406005859375, - "logps/rejected": -287.34637451171875, - "loss": 0.9925, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.027070851996541023, - "rewards/grad_term": 0.05042332783341408, - "rewards/margins": -0.01649157702922821, - "rewards/rejected": -0.010579276829957962, - "step": 2 - }, - { - "epoch": 0.006223259432127577, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.6875, - "grad_norm": 35.669754591330424, - "learning_rate": 2.401490047853298e-07, - "logits/chosen": 1.6139692068099976, - "logits/rejected": 1.5537246465682983, - "logps/accuracies": 0.3125, - "logps/chosen": -279.83502197265625, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -279.268310546875, - "logps/ref_rejected": -258.850341796875, - "logps/rejected": -259.1755065917969, - "loss": 0.9854, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.056671928614377975, - "rewards/grad_term": 0.050606753677129745, - "rewards/margins": -0.024156270548701286, - "rewards/rejected": -0.03251565620303154, - "step": 3 - }, - { - "epoch": 0.008297679242836769, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5625, - "grad_norm": 35.57252658535935, - "learning_rate": 3.030343048192777e-07, - "logits/chosen": 1.7025470733642578, - "logits/rejected": 1.6247684955596924, - "logps/accuracies": 0.4375, - "logps/chosen": -321.3578796386719, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -321.1121826171875, - "logps/ref_rejected": -316.1632995605469, - "logps/rejected": -316.3089599609375, - "loss": 0.9885, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.024570418521761894, - "rewards/grad_term": 0.050244007259607315, - "rewards/margins": -0.010007334873080254, - "rewards/rejected": -0.014563081786036491, - "step": 4 - }, - { - "epoch": 0.010372099053545962, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 40.58061807061268, - "learning_rate": 3.5181193303727093e-07, - "logits/chosen": 1.4226707220077515, - "logits/rejected": 1.505796194076538, - "logps/accuracies": 0.625, - "logps/chosen": -245.84014892578125, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -245.4801483154297, - "logps/ref_rejected": -251.10232543945312, - "logps/rejected": -251.73736572265625, - "loss": 0.9812, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.03600040823221207, - "rewards/grad_term": 0.049336254596710205, - "rewards/margins": 0.02750583179295063, - "rewards/rejected": -0.06350623816251755, - "step": 5 - }, - { - "epoch": 0.012446518864255154, - "flips/correct->correct": 0.6875, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 27.99149170411343, - "learning_rate": 3.9166615719496866e-07, - "logits/chosen": 1.3876930475234985, - "logits/rejected": 1.4264953136444092, - "logps/accuracies": 0.6875, - "logps/chosen": -291.3964538574219, - "logps/ref_accuracies": 0.6875, - "logps/ref_chosen": -291.29791259765625, - "logps/ref_rejected": -307.63433837890625, - "logps/rejected": -309.51678466796875, - "loss": 0.9803, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.009854471310973167, - "rewards/grad_term": 0.04629334807395935, - "rewards/margins": 0.17839080095291138, - "rewards/rejected": -0.1882452815771103, - "step": 6 - }, - { - "epoch": 0.014520938674964345, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.625, - "grad_norm": 35.07512813691069, - "learning_rate": 4.253624235933518e-07, - "logits/chosen": 1.3303760290145874, - "logits/rejected": 1.41872239112854, - "logps/accuracies": 0.375, - "logps/chosen": -256.7828369140625, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -256.71258544921875, - "logps/ref_rejected": -252.4455108642578, - "logps/rejected": -255.78366088867188, - "loss": 0.9451, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.007027318701148033, - "rewards/grad_term": 0.04277125000953674, - "rewards/margins": 0.3267865777015686, - "rewards/rejected": -0.3338139057159424, - "step": 7 - }, - { - "epoch": 0.016595358485673537, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 53.4690693475631, - "learning_rate": 4.545514572289166e-07, - "logits/chosen": 1.4601320028305054, - "logits/rejected": 1.5025103092193604, - "logps/accuracies": 0.25, - "logps/chosen": -328.91387939453125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -328.650634765625, - "logps/ref_rejected": -320.340576171875, - "logps/rejected": -322.7126159667969, - "loss": 0.9334, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.026328086853027344, - "rewards/grad_term": 0.04583045467734337, - "rewards/margins": 0.2108786404132843, - "rewards/rejected": -0.23720674216747284, - "step": 8 - }, - { - "epoch": 0.01866977829638273, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 50.206274648941296, - "learning_rate": 4.802980095706596e-07, - "logits/chosen": 1.516817331314087, - "logits/rejected": 1.5115327835083008, - "logps/accuracies": 0.625, - "logps/chosen": -271.8262634277344, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -272.4455261230469, - "logps/ref_rejected": -264.29541015625, - "logps/rejected": -272.98419189453125, - "loss": 0.9138, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.0619237944483757, - "rewards/grad_term": 0.037300530821084976, - "rewards/margins": 0.930805504322052, - "rewards/rejected": -0.8688817620277405, - "step": 9 - }, - { - "epoch": 0.020744198107091924, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.4375, - "grad_norm": 68.87775211865134, - "learning_rate": 5.033290854469099e-07, - "logits/chosen": 1.1356251239776611, - "logits/rejected": 1.1563141345977783, - "logps/accuracies": 0.5625, - "logps/chosen": -303.23114013671875, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -303.0604553222656, - "logps/ref_rejected": -303.90673828125, - "logps/rejected": -306.86865234375, - "loss": 0.9189, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.017069505527615547, - "rewards/grad_term": 0.04398500546813011, - "rewards/margins": 0.27912360429763794, - "rewards/rejected": -0.29619312286376953, - "step": 10 - }, - { - "epoch": 0.022818617917801116, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 57.71463843312805, - "learning_rate": 5.241632278117911e-07, - "logits/chosen": 1.4097551107406616, - "logits/rejected": 1.5242267847061157, - "logps/accuracies": 0.6875, - "logps/chosen": -328.52459716796875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -328.664306640625, - "logps/ref_rejected": -354.8746032714844, - "logps/rejected": -370.66998291015625, - "loss": 0.874, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.013973474502563477, - "rewards/grad_term": 0.029512763023376465, - "rewards/margins": 1.5935115814208984, - "rewards/rejected": -1.579538106918335, - "step": 11 - }, - { - "epoch": 0.024893037728510307, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 19.34405027942017, - "learning_rate": 5.431833096046075e-07, - "logits/chosen": 1.2825186252593994, - "logits/rejected": 1.4041016101837158, - "logps/accuracies": 0.6875, - "logps/chosen": -339.60736083984375, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -332.350341796875, - "logps/ref_rejected": -342.4404602050781, - "logps/rejected": -393.659423828125, - "loss": 0.7647, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.7256991863250732, - "rewards/grad_term": 0.022119037806987762, - "rewards/margins": 4.3961944580078125, - "rewards/rejected": -5.121893405914307, - "step": 12 - }, - { - "epoch": 0.0269674575392195, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 18.068273844723095, - "learning_rate": 5.606800887562651e-07, - "logits/chosen": 1.3912986516952515, - "logits/rejected": 1.4100464582443237, - "logps/accuracies": 0.625, - "logps/chosen": -360.9014587402344, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -349.53863525390625, - "logps/ref_rejected": -341.5389709472656, - "logps/rejected": -384.8885498046875, - "loss": 0.7998, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.136283040046692, - "rewards/grad_term": 0.016487201675772667, - "rewards/margins": 3.198676347732544, - "rewards/rejected": -4.334959506988525, - "step": 13 - }, - { - "epoch": 0.02904187734992869, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.4375, - "grad_norm": 27.645843878628447, - "learning_rate": 5.768795760029907e-07, - "logits/chosen": 1.4637022018432617, - "logits/rejected": 1.4576082229614258, - "logps/accuracies": 0.5625, - "logps/chosen": -319.2772216796875, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -297.5706787109375, - "logps/ref_rejected": -267.2304382324219, - "logps/rejected": -324.9382629394531, - "loss": 0.7958, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.1706531047821045, - "rewards/grad_term": 0.016141919419169426, - "rewards/margins": 3.600131034851074, - "rewards/rejected": -5.770784378051758, - "step": 14 - }, - { - "epoch": 0.031116297160637883, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 24.62720014572735, - "learning_rate": 5.919609378226007e-07, - "logits/chosen": 1.4202252626419067, - "logits/rejected": 1.5137040615081787, - "logps/accuracies": 0.8125, - "logps/chosen": -354.2460632324219, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -320.8343200683594, - "logps/ref_rejected": -324.75347900390625, - "logps/rejected": -385.1374816894531, - "loss": 0.8253, - "rewards/accuracies": 0.875, - "rewards/chosen": -3.341172933578491, - "rewards/grad_term": 0.024179620668292046, - "rewards/margins": 2.697230815887451, - "rewards/rejected": -6.0384039878845215, - "step": 15 - }, - { - "epoch": 0.033190716971347074, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 24.048376194470723, - "learning_rate": 6.060686096385554e-07, - "logits/chosen": 1.3625872135162354, - "logits/rejected": 1.5674644708633423, - "logps/accuracies": 0.75, - "logps/chosen": -321.19952392578125, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -300.4372253417969, - "logps/ref_rejected": -299.97760009765625, - "logps/rejected": -369.1448974609375, - "loss": 0.7917, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.0762338638305664, - "rewards/grad_term": 0.01863047480583191, - "rewards/margins": 4.8404951095581055, - "rewards/rejected": -6.916728973388672, - "step": 16 - }, - { - "epoch": 0.03526513678205627, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 45.28780856236718, - "learning_rate": 6.193207302864632e-07, - "logits/chosen": 1.3500301837921143, - "logits/rejected": 1.3641669750213623, - "logps/accuracies": 0.75, - "logps/chosen": -257.4446105957031, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -241.6280517578125, - "logps/ref_rejected": -237.22329711914062, - "logps/rejected": -294.7928466796875, - "loss": 0.7765, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.5816560983657837, - "rewards/grad_term": 0.019907817244529724, - "rewards/margins": 4.175297737121582, - "rewards/rejected": -5.756953239440918, - "step": 17 - }, - { - "epoch": 0.03733955659276546, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 33.38191088083146, - "learning_rate": 6.318151619802984e-07, - "logits/chosen": 1.1816256046295166, - "logits/rejected": 1.2595932483673096, - "logps/accuracies": 0.75, - "logps/chosen": -317.13775634765625, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -302.21380615234375, - "logps/ref_rejected": -337.5311279296875, - "logps/rejected": -381.22796630859375, - "loss": 0.87, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.4923958778381348, - "rewards/grad_term": 0.024127114564180374, - "rewards/margins": 2.877284288406372, - "rewards/rejected": -4.369679927825928, - "step": 18 - }, - { - "epoch": 0.03941397640347465, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 40.949935957102895, - "learning_rate": 6.436338804795301e-07, - "logits/chosen": 1.4050649404525757, - "logits/rejected": 1.4994277954101562, - "logps/accuracies": 0.875, - "logps/chosen": -292.9769287109375, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -263.2729187011719, - "logps/ref_rejected": -291.3985290527344, - "logps/rejected": -359.13031005859375, - "loss": 0.8089, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.9704017639160156, - "rewards/grad_term": 0.02432025596499443, - "rewards/margins": 3.802779197692871, - "rewards/rejected": -6.773180961608887, - "step": 19 - }, - { - "epoch": 0.04148839621418385, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 30.4272452916532, - "learning_rate": 6.548462378565487e-07, - "logits/chosen": 1.5738377571105957, - "logits/rejected": 1.5692741870880127, - "logps/accuracies": 0.8125, - "logps/chosen": -281.9122619628906, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -266.5782775878906, - "logps/ref_rejected": -265.3857421875, - "logps/rejected": -338.1681213378906, - "loss": 0.8287, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.5333983898162842, - "rewards/grad_term": 0.01711239479482174, - "rewards/margins": 5.744836807250977, - "rewards/rejected": -7.278235912322998, - "step": 20 - }, - { - "epoch": 0.043562816024893036, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 21.481613213914592, - "learning_rate": 6.655114283786817e-07, - "logits/chosen": 1.495798945426941, - "logits/rejected": 1.575231909751892, - "logps/accuracies": 0.5625, - "logps/chosen": -329.2530517578125, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -307.7621154785156, - "logps/ref_rejected": -305.4177551269531, - "logps/rejected": -353.7918395996094, - "loss": 0.7972, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.1490931510925293, - "rewards/grad_term": 0.026765087619423866, - "rewards/margins": 2.6883203983306885, - "rewards/rejected": -4.8374128341674805, - "step": 21 - }, - { - "epoch": 0.04563723583560223, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 43.410701214740534, - "learning_rate": 6.7568038022143e-07, - "logits/chosen": 1.3690290451049805, - "logits/rejected": 1.4588748216629028, - "logps/accuracies": 0.75, - "logps/chosen": -303.6971740722656, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -295.21514892578125, - "logps/ref_rejected": -289.0858459472656, - "logps/rejected": -341.2356262207031, - "loss": 0.756, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.8482051491737366, - "rewards/grad_term": 0.01651611179113388, - "rewards/margins": 4.366776466369629, - "rewards/rejected": -5.214981555938721, - "step": 22 - }, - { - "epoch": 0.04771165564631142, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 40.534862406872996, - "learning_rate": 6.853972263303346e-07, - "logits/chosen": 1.460001826286316, - "logits/rejected": 1.4567062854766846, - "logps/accuracies": 0.625, - "logps/chosen": -357.6313171386719, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -350.6695251464844, - "logps/ref_rejected": -331.9695129394531, - "logps/rejected": -388.808349609375, - "loss": 0.7408, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.6961804628372192, - "rewards/grad_term": 0.018778638914227486, - "rewards/margins": 4.987700939178467, - "rewards/rejected": -5.683881759643555, - "step": 23 - }, - { - "epoch": 0.049786075457020615, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 25.219410545582974, - "learning_rate": 6.947004620142464e-07, - "logits/chosen": 1.494457483291626, - "logits/rejected": 1.583849310874939, - "logps/accuracies": 0.8125, - "logps/chosen": -303.1878662109375, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -287.615234375, - "logps/ref_rejected": -305.63421630859375, - "logps/rejected": -363.3654479980469, - "loss": 0.741, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.5572607517242432, - "rewards/grad_term": 0.01411459967494011, - "rewards/margins": 4.215861797332764, - "rewards/rejected": -5.773122787475586, - "step": 24 - }, - { - "epoch": 0.05186049526772981, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 21.34830172085934, - "learning_rate": 7.036238660745419e-07, - "logits/chosen": 1.2966912984848022, - "logits/rejected": 1.3436973094940186, - "logps/accuracies": 0.8125, - "logps/chosen": -318.70855712890625, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -309.7275390625, - "logps/ref_rejected": -321.34222412109375, - "logps/rejected": -374.23724365234375, - "loss": 0.7473, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.8981032371520996, - "rewards/grad_term": 0.030619274824857712, - "rewards/margins": 4.391395092010498, - "rewards/rejected": -5.289497375488281, - "step": 25 - }, - { - "epoch": 0.053934915078439, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 24.70213159620223, - "learning_rate": 7.121972411659039e-07, - "logits/chosen": 1.527209997177124, - "logits/rejected": 1.5188902616500854, - "logps/accuracies": 0.75, - "logps/chosen": -311.9977722167969, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -312.7909240722656, - "logps/ref_rejected": -319.8678283691406, - "logps/rejected": -361.7353515625, - "loss": 0.7263, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.07931968569755554, - "rewards/grad_term": 0.0128245297819376, - "rewards/margins": 4.26607084274292, - "rewards/rejected": -4.186751365661621, - "step": 26 - }, - { - "epoch": 0.056009334889148193, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 21.657435932051584, - "learning_rate": 7.204470143559894e-07, - "logits/chosen": 1.0803958177566528, - "logits/rejected": 1.182570219039917, - "logps/accuracies": 0.625, - "logps/chosen": -305.4505310058594, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -292.47894287109375, - "logps/ref_rejected": -291.8714904785156, - "logps/rejected": -341.31640625, - "loss": 0.708, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.29715895652771, - "rewards/grad_term": 0.02386583387851715, - "rewards/margins": 3.6473331451416016, - "rewards/rejected": -4.944491863250732, - "step": 27 - }, - { - "epoch": 0.05808375469985738, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 43.751484729145034, - "learning_rate": 7.283967284126295e-07, - "logits/chosen": 1.5686805248260498, - "logits/rejected": 1.6045427322387695, - "logps/accuracies": 0.5625, - "logps/chosen": -274.0627136230469, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -275.16888427734375, - "logps/ref_rejected": -261.067138671875, - "logps/rejected": -293.029052734375, - "loss": 0.7257, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.11061999201774597, - "rewards/grad_term": 0.01599438488483429, - "rewards/margins": 3.306811571121216, - "rewards/rejected": -3.1961915493011475, - "step": 28 - }, - { - "epoch": 0.06015817451056658, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0625, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 31.4830012847814, - "learning_rate": 7.360674468418735e-07, - "logits/chosen": 1.3757277727127075, - "logits/rejected": 1.405045747756958, - "logps/accuracies": 0.6875, - "logps/chosen": -325.8724365234375, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -329.0152893066406, - "logps/ref_rejected": -316.9151611328125, - "logps/rejected": -348.4771423339844, - "loss": 0.7116, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.3142804205417633, - "rewards/grad_term": 0.01970498077571392, - "rewards/margins": 3.47047758102417, - "rewards/rejected": -3.1561975479125977, - "step": 29 - }, - { - "epoch": 0.062232594321275765, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 43.903525037448524, - "learning_rate": 7.434780902322396e-07, - "logits/chosen": 1.2318979501724243, - "logits/rejected": 1.2543270587921143, - "logps/accuracies": 0.9375, - "logps/chosen": -299.3183288574219, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -293.3425598144531, - "logps/ref_rejected": -326.46270751953125, - "logps/rejected": -357.6190185546875, - "loss": 0.714, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.5975769758224487, - "rewards/grad_term": 0.024707140401005745, - "rewards/margins": 2.5180513858795166, - "rewards/rejected": -3.115628242492676, - "step": 30 - }, - { - "epoch": 0.06430701413198496, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 34.75148622057771, - "learning_rate": 7.506457174281587e-07, - "logits/chosen": 1.2120341062545776, - "logits/rejected": 1.220780849456787, - "logps/accuracies": 0.8125, - "logps/chosen": -327.4204406738281, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -317.1983337402344, - "logps/ref_rejected": -326.3907165527344, - "logps/rejected": -362.292724609375, - "loss": 0.7512, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.0222113132476807, - "rewards/grad_term": 0.028988810256123543, - "rewards/margins": 2.5679914951324463, - "rewards/rejected": -3.590202808380127, - "step": 31 - }, - { - "epoch": 0.06638143394269415, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 24.42268750796852, - "learning_rate": 7.575857620481944e-07, - "logits/chosen": 1.2759184837341309, - "logits/rejected": 1.330209732055664, - "logps/accuracies": 0.75, - "logps/chosen": -352.6253967285156, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -354.62640380859375, - "logps/ref_rejected": -366.90618896484375, - "logps/rejected": -401.5226135253906, - "loss": 0.6974, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.20009994506835938, - "rewards/grad_term": 0.021008620038628578, - "rewards/margins": 3.6617395877838135, - "rewards/rejected": -3.461639404296875, - "step": 32 - }, - { - "epoch": 0.06845585375340335, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 24.880719455281493, - "learning_rate": 7.643122325971209e-07, - "logits/chosen": 1.103371262550354, - "logits/rejected": 1.1177877187728882, - "logps/accuracies": 0.875, - "logps/chosen": -307.4759826660156, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -311.5048828125, - "logps/ref_rejected": -314.81597900390625, - "logps/rejected": -364.4283447265625, - "loss": 0.7078, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.4028913974761963, - "rewards/grad_term": 0.007870044559240341, - "rewards/margins": 5.364123821258545, - "rewards/rejected": -4.961232662200928, - "step": 33 - }, - { - "epoch": 0.07053027356411254, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 19.643933011479398, - "learning_rate": 7.708378826961021e-07, - "logits/chosen": 1.102717399597168, - "logits/rejected": 1.2488610744476318, - "logps/accuracies": 0.75, - "logps/chosen": -334.2718200683594, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -332.568603515625, - "logps/ref_rejected": -440.8373718261719, - "logps/rejected": -474.8569030761719, - "loss": 0.6622, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.170322984457016, - "rewards/grad_term": 0.02305561862885952, - "rewards/margins": 3.231626510620117, - "rewards/rejected": -3.401949882507324, - "step": 34 - }, - { - "epoch": 0.07260469337482173, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 22.64669553529777, - "learning_rate": 7.771743566306228e-07, - "logits/chosen": 1.0699303150177002, - "logits/rejected": 1.0104891061782837, - "logps/accuracies": 0.5625, - "logps/chosen": -356.24755859375, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -349.8689880371094, - "logps/ref_rejected": -338.50445556640625, - "logps/rejected": -369.7318115234375, - "loss": 0.7305, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.6378520131111145, - "rewards/grad_term": 0.030929066240787506, - "rewards/margins": 2.4848828315734863, - "rewards/rejected": -3.1227352619171143, - "step": 35 - }, - { - "epoch": 0.07467911318553092, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 27.940317128258794, - "learning_rate": 7.833323143899373e-07, - "logits/chosen": 0.8235185146331787, - "logits/rejected": 0.8534129858016968, - "logps/accuracies": 0.75, - "logps/chosen": -311.9441833496094, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -311.42791748046875, - "logps/ref_rejected": -301.2667541503906, - "logps/rejected": -360.82537841796875, - "loss": 0.6602, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.05162731558084488, - "rewards/grad_term": 0.01031492929905653, - "rewards/margins": 5.904232025146484, - "rewards/rejected": -5.955859661102295, - "step": 36 - }, - { - "epoch": 0.07675353299624012, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 59.21163950114599, - "learning_rate": 7.893215395709077e-07, - "logits/chosen": 0.6561946868896484, - "logits/rejected": 0.7136399745941162, - "logps/accuracies": 0.875, - "logps/chosen": -280.95098876953125, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -277.3229064941406, - "logps/ref_rejected": -273.8558349609375, - "logps/rejected": -336.818115234375, - "loss": 0.6433, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.36280834674835205, - "rewards/grad_term": 0.008878666907548904, - "rewards/margins": 5.933422088623047, - "rewards/rejected": -6.296230316162109, - "step": 37 - }, - { - "epoch": 0.0788279528069493, - "flips/correct->correct": 0.125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 49.21026179278685, - "learning_rate": 7.951510328891689e-07, - "logits/chosen": 1.0223195552825928, - "logits/rejected": 0.9707791209220886, - "logps/accuracies": 0.5, - "logps/chosen": -251.95631408691406, - "logps/ref_accuracies": 0.125, - "logps/ref_chosen": -241.30072021484375, - "logps/ref_rejected": -225.43099975585938, - "logps/rejected": -282.074951171875, - "loss": 0.6902, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.0655571222305298, - "rewards/grad_term": 0.019987476989626884, - "rewards/margins": 4.59883975982666, - "rewards/rejected": -5.664397239685059, - "step": 38 - }, - { - "epoch": 0.0809023726176585, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 27.79357547798861, - "learning_rate": 8.008290935415948e-07, - "logits/chosen": 0.7353692650794983, - "logits/rejected": 0.8016875386238098, - "logps/accuracies": 0.8125, - "logps/chosen": -287.4935607910156, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -283.6936950683594, - "logps/ref_rejected": -283.30487060546875, - "logps/rejected": -337.9640197753906, - "loss": 0.6843, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.3799862861633301, - "rewards/grad_term": 0.01074125524610281, - "rewards/margins": 5.085926055908203, - "rewards/rejected": -5.465912818908691, - "step": 39 - }, - { - "epoch": 0.0829767924283677, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 52.06939047981281, - "learning_rate": 8.063633902661875e-07, - "logits/chosen": 0.9137625694274902, - "logits/rejected": 0.9001289010047913, - "logps/accuracies": 0.8125, - "logps/chosen": -308.0149841308594, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -298.7401428222656, - "logps/ref_rejected": -289.8734436035156, - "logps/rejected": -362.60211181640625, - "loss": 0.6861, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.9274865984916687, - "rewards/grad_term": 0.015340002253651619, - "rewards/margins": 6.3453826904296875, - "rewards/rejected": -7.272869110107422, - "step": 40 - }, - { - "epoch": 0.08505121223907688, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 30.550457494843638, - "learning_rate": 8.117610236262845e-07, - "logits/chosen": 0.7508188486099243, - "logits/rejected": 0.8092616200447083, - "logps/accuracies": 0.8125, - "logps/chosen": -344.6716613769531, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -332.699462890625, - "logps/ref_rejected": -344.08209228515625, - "logps/rejected": -386.7120056152344, - "loss": 0.7017, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.1972177028656006, - "rewards/grad_term": 0.02788725309073925, - "rewards/margins": 3.065772771835327, - "rewards/rejected": -4.262990474700928, - "step": 41 - }, - { - "epoch": 0.08712563204978607, - "flips/correct->correct": 0.6875, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 25.64415652049263, - "learning_rate": 8.170285807883206e-07, - "logits/chosen": 0.6477910876274109, - "logits/rejected": 0.8107466697692871, - "logps/accuracies": 0.8125, - "logps/chosen": -261.5460205078125, - "logps/ref_accuracies": 0.6875, - "logps/ref_chosen": -256.37158203125, - "logps/ref_rejected": -280.9100646972656, - "logps/rejected": -316.6261901855469, - "loss": 0.6765, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.5174452066421509, - "rewards/grad_term": 0.02346464805305004, - "rewards/margins": 3.054164409637451, - "rewards/rejected": -3.5716099739074707, - "step": 42 - }, - { - "epoch": 0.08920005186049526, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 35.35935314371997, - "learning_rate": 8.221721838532495e-07, - "logits/chosen": 0.6840221285820007, - "logits/rejected": 0.6609375476837158, - "logps/accuracies": 0.6875, - "logps/chosen": -291.4024353027344, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -296.0429992675781, - "logps/ref_rejected": -282.0372009277344, - "logps/rejected": -319.9362487792969, - "loss": 0.5996, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.4640587866306305, - "rewards/grad_term": 0.011877249926328659, - "rewards/margins": 4.253963470458984, - "rewards/rejected": -3.7899045944213867, - "step": 43 - }, - { - "epoch": 0.09127447167120446, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.4375, - "grad_norm": 28.87938199876403, - "learning_rate": 8.271975326310688e-07, - "logits/chosen": 0.8031829595565796, - "logits/rejected": 0.7779420614242554, - "logps/accuracies": 0.5625, - "logps/chosen": -308.4298095703125, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -299.1800537109375, - "logps/ref_rejected": -301.2965087890625, - "logps/rejected": -333.80767822265625, - "loss": 0.65, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.9249745607376099, - "rewards/grad_term": 0.028043199330568314, - "rewards/margins": 2.326139450073242, - "rewards/rejected": -3.2511138916015625, - "step": 44 - }, - { - "epoch": 0.09334889148191365, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 27.45541909883828, - "learning_rate": 8.321099426079305e-07, - "logits/chosen": 0.6578277349472046, - "logits/rejected": 0.7826619148254395, - "logps/accuracies": 0.875, - "logps/chosen": -284.1718444824219, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -278.0632019042969, - "logps/ref_rejected": -310.51953125, - "logps/rejected": -362.188720703125, - "loss": 0.6724, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.6108630895614624, - "rewards/grad_term": 0.015516946092247963, - "rewards/margins": 4.556060791015625, - "rewards/rejected": -5.1669230461120605, - "step": 45 - }, - { - "epoch": 0.09542331129262284, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 59.28118284951873, - "learning_rate": 8.369143787399735e-07, - "logits/chosen": 0.9397487044334412, - "logits/rejected": 0.9460306167602539, - "logps/accuracies": 0.75, - "logps/chosen": -250.52760314941406, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -249.87916564941406, - "logps/ref_rejected": -253.21328735351562, - "logps/rejected": -280.88421630859375, - "loss": 0.7086, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.0648447573184967, - "rewards/grad_term": 0.024975696578621864, - "rewards/margins": 2.702247142791748, - "rewards/rejected": -2.7670915126800537, - "step": 46 - }, - { - "epoch": 0.09749773110333204, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 27.055341319651774, - "learning_rate": 8.416154856125216e-07, - "logits/chosen": 0.8418172597885132, - "logits/rejected": 0.8614631295204163, - "logps/accuracies": 0.6875, - "logps/chosen": -289.743408203125, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -288.4799499511719, - "logps/ref_rejected": -295.8186340332031, - "logps/rejected": -333.06170654296875, - "loss": 0.648, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.12634362280368805, - "rewards/grad_term": 0.015468433499336243, - "rewards/margins": 3.597963571548462, - "rewards/rejected": -3.724307060241699, - "step": 47 - }, - { - "epoch": 0.09957215091404123, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 29.320128400390026, - "learning_rate": 8.462176144238853e-07, - "logits/chosen": 1.0445611476898193, - "logits/rejected": 1.080256700515747, - "logps/accuracies": 0.875, - "logps/chosen": -277.6840515136719, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -284.16644287109375, - "logps/ref_rejected": -313.25799560546875, - "logps/rejected": -363.92913818359375, - "loss": 0.6148, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.648241400718689, - "rewards/grad_term": 0.01066309679299593, - "rewards/margins": 5.71535062789917, - "rewards/rejected": -5.067109107971191, - "step": 48 - }, - { - "epoch": 0.10164657072475042, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 30.36633688848658, - "learning_rate": 8.507248471867036e-07, - "logits/chosen": 1.0277738571166992, - "logits/rejected": 0.9966739416122437, - "logps/accuracies": 0.625, - "logps/chosen": -354.4734191894531, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -353.09027099609375, - "logps/ref_rejected": -352.4345397949219, - "logps/rejected": -385.0770263671875, - "loss": 0.6356, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.13831399381160736, - "rewards/grad_term": 0.021235931664705276, - "rewards/margins": 3.1259407997131348, - "rewards/rejected": -3.264254570007324, - "step": 49 - }, - { - "epoch": 0.10372099053545962, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 34.234438693372084, - "learning_rate": 8.551410184841808e-07, - "logits/chosen": 0.8633083701133728, - "logits/rejected": 0.860028088092804, - "logps/accuracies": 0.6875, - "logps/chosen": -252.79647827148438, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -252.6006317138672, - "logps/ref_rejected": -258.76251220703125, - "logps/rejected": -295.47882080078125, - "loss": 0.65, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.01958458498120308, - "rewards/grad_term": 0.01410377025604248, - "rewards/margins": 3.6520471572875977, - "rewards/rejected": -3.6716315746307373, - "step": 50 - }, - { - "epoch": 0.10579541034616881, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 29.660260139951582, - "learning_rate": 8.59469735071793e-07, - "logits/chosen": 0.38166430592536926, - "logits/rejected": 0.4328911304473877, - "logps/accuracies": 0.625, - "logps/chosen": -296.5737609863281, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -290.32611083984375, - "logps/ref_rejected": -293.0495910644531, - "logps/rejected": -349.3868408203125, - "loss": 0.6285, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.6247656941413879, - "rewards/grad_term": 0.017583010718226433, - "rewards/margins": 5.008961200714111, - "rewards/rejected": -5.633727073669434, - "step": 51 - }, - { - "epoch": 0.107869830156878, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 25.949283947333324, - "learning_rate": 8.637143935755428e-07, - "logits/chosen": 0.727641224861145, - "logits/rejected": 0.7505197525024414, - "logps/accuracies": 0.625, - "logps/chosen": -288.1712646484375, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -278.9561767578125, - "logps/ref_rejected": -265.6705322265625, - "logps/rejected": -308.8449401855469, - "loss": 0.6146, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.9215071201324463, - "rewards/grad_term": 0.022838197648525238, - "rewards/margins": 3.395932912826538, - "rewards/rejected": -4.317440032958984, - "step": 52 - }, - { - "epoch": 0.10994424996758718, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 24.134309233597588, - "learning_rate": 8.678781965043402e-07, - "logits/chosen": 0.7036612033843994, - "logits/rejected": 0.6557080745697021, - "logps/accuracies": 0.875, - "logps/chosen": -358.0240478515625, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -348.71356201171875, - "logps/ref_rejected": -355.6617431640625, - "logps/rejected": -404.3010559082031, - "loss": 0.6916, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.9310531616210938, - "rewards/grad_term": 0.02438879944384098, - "rewards/margins": 3.93287992477417, - "rewards/rejected": -4.863933086395264, - "step": 53 - }, - { - "epoch": 0.11201866977829639, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 53.64626463678004, - "learning_rate": 8.719641667656282e-07, - "logits/chosen": 0.6714786887168884, - "logits/rejected": 0.5823845863342285, - "logps/accuracies": 0.625, - "logps/chosen": -376.20220947265625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -369.01507568359375, - "logps/ref_rejected": -328.4320373535156, - "logps/rejected": -383.90655517578125, - "loss": 0.6963, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.7187104821205139, - "rewards/grad_term": 0.011453664861619473, - "rewards/margins": 4.828742980957031, - "rewards/rejected": -5.5474534034729, - "step": 54 - }, - { - "epoch": 0.11409308958900558, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 70.929778345069, - "learning_rate": 8.759751608490621e-07, - "logits/chosen": 0.44098129868507385, - "logits/rejected": 0.5110803842544556, - "logps/accuracies": 0.8125, - "logps/chosen": -307.63323974609375, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -303.2783203125, - "logps/ref_rejected": -305.9503173828125, - "logps/rejected": -365.0198669433594, - "loss": 0.6622, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.4354937672615051, - "rewards/grad_term": 0.016174456104636192, - "rewards/margins": 5.471461296081543, - "rewards/rejected": -5.906955242156982, - "step": 55 - }, - { - "epoch": 0.11616750939971476, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 26.619296281410804, - "learning_rate": 8.799138808222686e-07, - "logits/chosen": 0.7330751419067383, - "logits/rejected": 0.9024415016174316, - "logps/accuracies": 0.75, - "logps/chosen": -235.89239501953125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -231.52980041503906, - "logps/ref_rejected": -262.5042724609375, - "logps/rejected": -304.2793273925781, - "loss": 0.6587, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.43625733256340027, - "rewards/grad_term": 0.020225245505571365, - "rewards/margins": 3.7412445545196533, - "rewards/rejected": -4.177502155303955, - "step": 56 - }, - { - "epoch": 0.11824192921042397, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 27.602722271896948, - "learning_rate": 8.837828852648599e-07, - "logits/chosen": 0.5326017737388611, - "logits/rejected": 0.6437039971351624, - "logps/accuracies": 0.8125, - "logps/chosen": -301.2654113769531, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -306.17083740234375, - "logps/ref_rejected": -299.8456115722656, - "logps/rejected": -362.2225036621094, - "loss": 0.6253, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.4905431568622589, - "rewards/grad_term": 0.014911260455846786, - "rewards/margins": 6.728227615356445, - "rewards/rejected": -6.2376837730407715, - "step": 57 - }, - { - "epoch": 0.12031634902113315, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 28.670654744865576, - "learning_rate": 8.875845992515123e-07, - "logits/chosen": 0.38607218861579895, - "logits/rejected": 0.414478600025177, - "logps/accuracies": 0.625, - "logps/chosen": -328.02618408203125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -322.28265380859375, - "logps/ref_rejected": -297.2394104003906, - "logps/rejected": -336.27459716796875, - "loss": 0.6757, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.5743532180786133, - "rewards/grad_term": 0.023728108033537865, - "rewards/margins": 3.329164505004883, - "rewards/rejected": -3.903517961502075, - "step": 58 - }, - { - "epoch": 0.12239076883184234, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 31.46030833209825, - "learning_rate": 8.91321323481661e-07, - "logits/chosen": 0.6807994246482849, - "logits/rejected": 0.7104217410087585, - "logps/accuracies": 0.8125, - "logps/chosen": -331.4752502441406, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -335.2398986816406, - "logps/ref_rejected": -332.1490173339844, - "logps/rejected": -374.0831298828125, - "loss": 0.6771, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.37646305561065674, - "rewards/grad_term": 0.010923169553279877, - "rewards/margins": 4.56987190246582, - "rewards/rejected": -4.193408966064453, - "step": 59 - }, - { - "epoch": 0.12446518864255153, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 23.984091347684366, - "learning_rate": 8.949952426418784e-07, - "logits/chosen": 0.568733811378479, - "logits/rejected": 0.635265052318573, - "logps/accuracies": 0.6875, - "logps/chosen": -397.9205322265625, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -402.82952880859375, - "logps/ref_rejected": -363.6296691894531, - "logps/rejected": -400.825439453125, - "loss": 0.6358, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.4909006357192993, - "rewards/grad_term": 0.015072712674736977, - "rewards/margins": 4.210475921630859, - "rewards/rejected": -3.7195756435394287, - "step": 60 - }, - { - "epoch": 0.12653960845326073, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.125, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 73.64774037078915, - "learning_rate": 8.986084330770518e-07, - "logits/chosen": 0.7834938764572144, - "logits/rejected": 0.8703972101211548, - "logps/accuracies": 0.6875, - "logps/chosen": -256.0115661621094, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -239.9134521484375, - "logps/ref_rejected": -261.8016662597656, - "logps/rejected": -313.2479248046875, - "loss": 0.6474, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.6098082065582275, - "rewards/grad_term": 0.01827179826796055, - "rewards/margins": 3.534818172454834, - "rewards/rejected": -5.144626140594482, - "step": 61 - }, - { - "epoch": 0.12861402826396992, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 25.70529599001306, - "learning_rate": 9.021628698377976e-07, - "logits/chosen": 0.5873112082481384, - "logits/rejected": 0.6506080627441406, - "logps/accuracies": 0.75, - "logps/chosen": -274.6400451660156, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -276.3179016113281, - "logps/ref_rejected": -279.714599609375, - "logps/rejected": -340.3433532714844, - "loss": 0.6359, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.16778606176376343, - "rewards/grad_term": 0.014593811705708504, - "rewards/margins": 6.230656623840332, - "rewards/rejected": -6.062870979309082, - "step": 62 - }, - { - "epoch": 0.1306884480746791, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 29.57488992234054, - "learning_rate": 9.056604331640114e-07, - "logits/chosen": 0.511448323726654, - "logits/rejected": 0.4164316654205322, - "logps/accuracies": 0.75, - "logps/chosen": -254.2750244140625, - "logps/ref_accuracies": 0.6875, - "logps/ref_chosen": -253.3539276123047, - "logps/ref_rejected": -264.7762145996094, - "logps/rejected": -299.82305908203125, - "loss": 0.6508, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.09211038053035736, - "rewards/grad_term": 0.026033716276288033, - "rewards/margins": 3.4125752449035645, - "rewards/rejected": -3.504685401916504, - "step": 63 - }, - { - "epoch": 0.1327628678853883, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 55.6766766699201, - "learning_rate": 9.091029144578332e-07, - "logits/chosen": 0.5473611354827881, - "logits/rejected": 0.6334167122840881, - "logps/accuracies": 0.8125, - "logps/chosen": -307.349365234375, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -308.5955505371094, - "logps/ref_rejected": -327.8055725097656, - "logps/rejected": -374.07574462890625, - "loss": 0.6354, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.12462212890386581, - "rewards/grad_term": 0.011786059476435184, - "rewards/margins": 4.751638412475586, - "rewards/rejected": -4.627016067504883, - "step": 64 - }, - { - "epoch": 0.1327628678853883, - "eval_flips/correct->correct": 0.43842363357543945, - "eval_flips/correct->incorrect": 0.004926108289510012, - "eval_flips/incorrect->correct": 0.2660098373889923, - "eval_flips/incorrect->incorrect": 0.29064038395881653, - "eval_logits/chosen": 0.5654913783073425, - "eval_logits/rejected": 0.6160324215888977, - "eval_logps/accuracies": 0.7044335007667542, - "eval_logps/chosen": -288.4407958984375, - "eval_logps/ref_accuracies": 0.4433497488498688, - "eval_logps/ref_chosen": -287.3511047363281, - "eval_logps/ref_rejected": -289.0460205078125, - "eval_logps/rejected": -328.46038818359375, - "eval_loss": 0.6570103168487549, - "eval_rewards/accuracies": 0.8325123190879822, - "eval_rewards/chosen": -0.10896830260753632, - "eval_rewards/grad_term": 0.021043213084340096, - "eval_rewards/margins": 3.8324687480926514, - "eval_rewards/rejected": -3.9414374828338623, - "eval_runtime": 786.9931, - "eval_samples_per_second": 2.056, - "eval_steps_per_second": 0.258, - "step": 64 - }, - { - "epoch": 0.13483728769609749, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 24.735517627215668, - "learning_rate": 9.124920217935358e-07, - "logits/chosen": 0.40278834104537964, - "logits/rejected": 0.4163047969341278, - "logps/accuracies": 0.875, - "logps/chosen": -353.63824462890625, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -357.9703369140625, - "logps/ref_rejected": -365.9423522949219, - "logps/rejected": -425.2349853515625, - "loss": 0.6043, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.4332119822502136, - "rewards/grad_term": 0.007745261769741774, - "rewards/margins": 6.362478256225586, - "rewards/rejected": -5.929266452789307, - "step": 65 - }, - { - "epoch": 0.1369117075068067, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 26.834165183727563, - "learning_rate": 9.158293850067597e-07, - "logits/chosen": 0.387469083070755, - "logits/rejected": 0.4058898091316223, - "logps/accuracies": 0.8125, - "logps/chosen": -252.04205322265625, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -252.20950317382812, - "logps/ref_rejected": -263.31280517578125, - "logps/rejected": -316.500244140625, - "loss": 0.6308, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.01674594357609749, - "rewards/grad_term": 0.014994516968727112, - "rewards/margins": 5.335488319396973, - "rewards/rejected": -5.318742275238037, - "step": 66 - }, - { - "epoch": 0.1389861273175159, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 31.0308823025867, - "learning_rate": 9.191165604010531e-07, - "logits/chosen": 0.3395693302154541, - "logits/rejected": 0.34473684430122375, - "logps/accuracies": 0.75, - "logps/chosen": -325.09197998046875, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -328.00286865234375, - "logps/ref_rejected": -305.96258544921875, - "logps/rejected": -359.0819396972656, - "loss": 0.6403, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.2910885214805603, - "rewards/grad_term": 0.009055268950760365, - "rewards/margins": 5.603026390075684, - "rewards/rejected": -5.3119378089904785, - "step": 67 - }, - { - "epoch": 0.14106054712822508, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 26.519671253661027, - "learning_rate": 9.22355035105741e-07, - "logits/chosen": 0.4188442528247833, - "logits/rejected": 0.4437766969203949, - "logps/accuracies": 0.6875, - "logps/chosen": -293.8087463378906, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -280.9673156738281, - "logps/ref_rejected": -302.37518310546875, - "logps/rejected": -354.1598815917969, - "loss": 0.619, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.284143090248108, - "rewards/grad_term": 0.02901587449014187, - "rewards/margins": 3.8943264484405518, - "rewards/rejected": -5.178469657897949, - "step": 68 - }, - { - "epoch": 0.14313496693893427, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.4375, - "grad_norm": 30.502044803594153, - "learning_rate": 9.255462311156644e-07, - "logits/chosen": 0.5335452556610107, - "logits/rejected": 0.5705280303955078, - "logps/accuracies": 0.5625, - "logps/chosen": -320.91192626953125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -304.9333190917969, - "logps/ref_rejected": -281.81768798828125, - "logps/rejected": -346.9627685546875, - "loss": 0.6755, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.5978612899780273, - "rewards/grad_term": 0.018111273646354675, - "rewards/margins": 4.916650295257568, - "rewards/rejected": -6.514511585235596, - "step": 69 - }, - { - "epoch": 0.14520938674964345, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 39.33816705000978, - "learning_rate": 9.286915090402617e-07, - "logits/chosen": 0.4920622706413269, - "logits/rejected": 0.5008682012557983, - "logps/accuracies": 0.8125, - "logps/chosen": -302.3096618652344, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -291.40972900390625, - "logps/ref_rejected": -286.4915771484375, - "logps/rejected": -359.8939514160156, - "loss": 0.6369, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.0899897813796997, - "rewards/grad_term": 0.0171764325350523, - "rewards/margins": 6.250240325927734, - "rewards/rejected": -7.340230464935303, - "step": 70 - }, - { - "epoch": 0.14728380656035264, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 32.47379922437437, - "learning_rate": 9.317921715867286e-07, - "logits/chosen": 0.5690668225288391, - "logits/rejected": 0.6497770547866821, - "logps/accuracies": 0.75, - "logps/chosen": -300.4138488769531, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -293.0126647949219, - "logps/ref_rejected": -293.5539855957031, - "logps/rejected": -361.0144348144531, - "loss": 0.6126, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.7401193976402283, - "rewards/grad_term": 0.012420150451362133, - "rewards/margins": 6.005928039550781, - "rewards/rejected": -6.746047019958496, - "step": 71 - }, - { - "epoch": 0.14935822637106183, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 35.11526252181687, - "learning_rate": 9.348494667995762e-07, - "logits/chosen": 0.5223222970962524, - "logits/rejected": 0.6166201829910278, - "logps/accuracies": 0.875, - "logps/chosen": -262.4486083984375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -247.41668701171875, - "logps/ref_rejected": -251.63619995117188, - "logps/rejected": -323.4156494140625, - "loss": 0.6372, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.503194808959961, - "rewards/grad_term": 0.016301069408655167, - "rewards/margins": 5.674752712249756, - "rewards/rejected": -7.177947521209717, - "step": 72 - }, - { - "epoch": 0.15143264618177105, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 33.25143057906705, - "learning_rate": 9.378645910767493e-07, - "logits/chosen": 0.5215972065925598, - "logits/rejected": 0.4775215685367584, - "logps/accuracies": 0.6875, - "logps/chosen": -257.8221435546875, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -257.3147888183594, - "logps/ref_rejected": -245.8674774169922, - "logps/rejected": -302.9472961425781, - "loss": 0.6342, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.050737857818603516, - "rewards/grad_term": 0.01016687136143446, - "rewards/margins": 5.657248497009277, - "rewards/rejected": -5.707986831665039, - "step": 73 - }, - { - "epoch": 0.15350706599248023, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.6875, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 52.55742381973749, - "learning_rate": 9.408386919805467e-07, - "logits/chosen": 0.7360602021217346, - "logits/rejected": 0.70041424036026, - "logps/accuracies": 0.9375, - "logps/chosen": -317.7826843261719, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -302.98712158203125, - "logps/ref_rejected": -267.1181945800781, - "logps/rejected": -356.98870849609375, - "loss": 0.6432, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.4795535802841187, - "rewards/grad_term": 0.008950343355536461, - "rewards/margins": 7.507498741149902, - "rewards/rejected": -8.987051963806152, - "step": 74 - }, - { - "epoch": 0.15558148580318942, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 26.35632884945645, - "learning_rate": 9.437728708598716e-07, - "logits/chosen": 0.3639271855354309, - "logits/rejected": 0.38472047448158264, - "logps/accuracies": 0.875, - "logps/chosen": -278.147216796875, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -264.1582336425781, - "logps/ref_rejected": -274.158203125, - "logps/rejected": -352.51422119140625, - "loss": 0.6529, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.3989008665084839, - "rewards/grad_term": 0.011926427483558655, - "rewards/margins": 6.436697959899902, - "rewards/rejected": -7.835598945617676, - "step": 75 - }, - { - "epoch": 0.1576559056138986, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 43.67163684751143, - "learning_rate": 9.466681852988078e-07, - "logits/chosen": 0.6780661344528198, - "logits/rejected": 0.7738847732543945, - "logps/accuracies": 0.75, - "logps/chosen": -286.3451843261719, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -271.2528991699219, - "logps/ref_rejected": -271.053955078125, - "logps/rejected": -328.85772705078125, - "loss": 0.6067, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.5092324018478394, - "rewards/grad_term": 0.023612529039382935, - "rewards/margins": 4.271145820617676, - "rewards/rejected": -5.780378341674805, - "step": 76 - }, - { - "epoch": 0.1597303254246078, - "flips/correct->correct": 0.875, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 22.36636463977758, - "learning_rate": 9.495256514051431e-07, - "logits/chosen": 0.4788045287132263, - "logits/rejected": 0.549846887588501, - "logps/accuracies": 1.0, - "logps/chosen": -222.5209197998047, - "logps/ref_accuracies": 0.875, - "logps/ref_chosen": -207.42868041992188, - "logps/ref_rejected": -236.1974334716797, - "logps/rejected": -293.5431823730469, - "loss": 0.6448, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.5092250108718872, - "rewards/grad_term": 0.0173178743571043, - "rewards/margins": 4.225347995758057, - "rewards/rejected": -5.734574317932129, - "step": 77 - }, - { - "epoch": 0.161804745235317, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 25.267712307372168, - "learning_rate": 9.523462459512337e-07, - "logits/chosen": 0.5372971892356873, - "logits/rejected": 0.6544579863548279, - "logps/accuracies": 0.9375, - "logps/chosen": -278.4732360839844, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -275.07958984375, - "logps/ref_rejected": -292.14898681640625, - "logps/rejected": -352.7454833984375, - "loss": 0.6166, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.3393683433532715, - "rewards/grad_term": 0.022649819031357765, - "rewards/margins": 5.720274925231934, - "rewards/rejected": -6.059643745422363, - "step": 78 - }, - { - "epoch": 0.16387916504602618, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 57.52603839415039, - "learning_rate": 9.551309083784976e-07, - "logits/chosen": 0.6397267580032349, - "logits/rejected": 0.7187516093254089, - "logps/accuracies": 0.9375, - "logps/chosen": -273.272705078125, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -278.8054504394531, - "logps/ref_rejected": -292.9872741699219, - "logps/rejected": -340.0445861816406, - "loss": 0.6701, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.5532730221748352, - "rewards/grad_term": 0.014312355779111385, - "rewards/margins": 5.259001731872559, - "rewards/rejected": -4.705729007720947, - "step": 79 - }, - { - "epoch": 0.1659535848567354, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 50.777119418812084, - "learning_rate": 9.578805426758263e-07, - "logits/chosen": 0.4606146216392517, - "logits/rejected": 0.46222275495529175, - "logps/accuracies": 0.8125, - "logps/chosen": -292.800537109375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -291.5415954589844, - "logps/ref_rejected": -313.3748474121094, - "logps/rejected": -364.9443054199219, - "loss": 0.6283, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.1258973628282547, - "rewards/grad_term": 0.018451694399118423, - "rewards/margins": 5.031045436859131, - "rewards/rejected": -5.156942367553711, - "step": 80 - }, - { - "epoch": 0.16802800466744458, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 25.904014404983347, - "learning_rate": 9.605960191413192e-07, - "logits/chosen": 0.5609871745109558, - "logits/rejected": 0.646887481212616, - "logps/accuracies": 0.6875, - "logps/chosen": -388.10205078125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -396.9491271972656, - "logps/ref_rejected": -395.2713928222656, - "logps/rejected": -423.269287109375, - "loss": 0.5963, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.8847097158432007, - "rewards/grad_term": 0.024479346349835396, - "rewards/margins": 3.6844961643218994, - "rewards/rejected": -2.7997865676879883, - "step": 81 - }, - { - "epoch": 0.17010242447815377, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 18.071356978636363, - "learning_rate": 9.632781760359235e-07, - "logits/chosen": 0.2946923077106476, - "logits/rejected": 0.26006707549095154, - "logps/accuracies": 0.6875, - "logps/chosen": -222.20687866210938, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -218.695068359375, - "logps/ref_rejected": -223.76553344726562, - "logps/rejected": -264.6587829589844, - "loss": 0.6335, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.3511809706687927, - "rewards/grad_term": 0.025286730378866196, - "rewards/margins": 3.7381458282470703, - "rewards/rejected": -4.08932638168335, - "step": 82 - }, - { - "epoch": 0.17217684428886296, - "flips/correct->correct": 0.6875, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 29.580870064695095, - "learning_rate": 9.659278211368498e-07, - "logits/chosen": 0.653415322303772, - "logits/rejected": 0.7497892379760742, - "logps/accuracies": 0.875, - "logps/chosen": -334.653564453125, - "logps/ref_accuracies": 0.6875, - "logps/ref_chosen": -324.0084533691406, - "logps/ref_rejected": -340.58624267578125, - "logps/rejected": -422.7427978515625, - "loss": 0.6484, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.0645086765289307, - "rewards/grad_term": 0.018634023144841194, - "rewards/margins": 7.15114688873291, - "rewards/rejected": -8.215656280517578, - "step": 83 - }, - { - "epoch": 0.17425126409957215, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 37.56856528542604, - "learning_rate": 9.685457331979593e-07, - "logits/chosen": 0.7688320875167847, - "logits/rejected": 0.913873553276062, - "logps/accuracies": 0.8125, - "logps/chosen": -252.0431671142578, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -241.2303466796875, - "logps/ref_rejected": -278.7004699707031, - "logps/rejected": -341.7118835449219, - "loss": 0.6808, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.0812804698944092, - "rewards/grad_term": 0.022220587357878685, - "rewards/margins": 5.219861030578613, - "rewards/rejected": -6.301141738891602, - "step": 84 - }, - { - "epoch": 0.17632568391028133, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 25.08015845081592, - "learning_rate": 9.711326633237342e-07, - "logits/chosen": 0.6746060252189636, - "logits/rejected": 0.6128141283988953, - "logps/accuracies": 0.8125, - "logps/chosen": -324.61865234375, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -314.5938415527344, - "logps/ref_rejected": -327.64666748046875, - "logps/rejected": -388.7850036621094, - "loss": 0.58, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.002484679222107, - "rewards/grad_term": 0.017657626420259476, - "rewards/margins": 5.111349582672119, - "rewards/rejected": -6.113834857940674, - "step": 85 - }, - { - "epoch": 0.17840010372099052, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 28.420792350466737, - "learning_rate": 9.736893362628883e-07, - "logits/chosen": 0.49216994643211365, - "logits/rejected": 0.5920721888542175, - "logps/accuracies": 0.9375, - "logps/chosen": -299.5179443359375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -294.0077209472656, - "logps/ref_rejected": -302.6850280761719, - "logps/rejected": -385.48388671875, - "loss": 0.6414, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.5510200262069702, - "rewards/grad_term": 0.004014983773231506, - "rewards/margins": 7.728863716125488, - "rewards/rejected": -8.279884338378906, - "step": 86 - }, - { - "epoch": 0.18047452353169974, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 32.17317573293341, - "learning_rate": 9.762164516272033e-07, - "logits/chosen": 0.7234176397323608, - "logits/rejected": 0.7146831154823303, - "logps/accuracies": 0.9375, - "logps/chosen": -299.3135681152344, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -294.986083984375, - "logps/ref_rejected": -306.23895263671875, - "logps/rejected": -362.81964111328125, - "loss": 0.6571, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.43274781107902527, - "rewards/grad_term": 0.012466475367546082, - "rewards/margins": 5.2253193855285645, - "rewards/rejected": -5.658066749572754, - "step": 87 - }, - { - "epoch": 0.18254894334240893, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 26.897654394769035, - "learning_rate": 9.787146850407078e-07, - "logits/chosen": 0.47364750504493713, - "logits/rejected": 0.5636922717094421, - "logps/accuracies": 0.75, - "logps/chosen": -264.1487121582031, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -270.39495849609375, - "logps/ref_rejected": -258.71246337890625, - "logps/rejected": -319.8270263671875, - "loss": 0.6117, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.6246242523193359, - "rewards/grad_term": 0.01352761872112751, - "rewards/margins": 6.736079216003418, - "rewards/rejected": -6.111454963684082, - "step": 88 - }, - { - "epoch": 0.18462336315311811, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 36.70610652642651, - "learning_rate": 9.811846892239293e-07, - "logits/chosen": 0.1739477515220642, - "logits/rejected": 0.20079316198825836, - "logps/accuracies": 0.875, - "logps/chosen": -334.56201171875, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -319.6775207519531, - "logps/ref_rejected": -328.6192626953125, - "logps/rejected": -389.33929443359375, - "loss": 0.5805, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.4884480237960815, - "rewards/grad_term": 0.028538305312395096, - "rewards/margins": 4.583554267883301, - "rewards/rejected": -6.072002410888672, - "step": 89 - }, - { - "epoch": 0.1866977829638273, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 38.62477740343346, - "learning_rate": 9.836270950175693e-07, - "logits/chosen": 0.5048727989196777, - "logits/rejected": 0.5224493741989136, - "logps/accuracies": 0.875, - "logps/chosen": -265.7325439453125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -250.5098419189453, - "logps/ref_rejected": -255.43650817871094, - "logps/rejected": -315.4735107421875, - "loss": 0.6476, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.5222673416137695, - "rewards/grad_term": 0.023190699517726898, - "rewards/margins": 4.481435298919678, - "rewards/rejected": -6.003702640533447, - "step": 90 - }, - { - "epoch": 0.1887722027745365, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 36.7027529146051, - "learning_rate": 9.860425123496167e-07, - "logits/chosen": 0.5219244360923767, - "logits/rejected": 0.5849474668502808, - "logps/accuracies": 0.9375, - "logps/chosen": -240.11685180664062, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -234.85340881347656, - "logps/ref_rejected": -262.1112060546875, - "logps/rejected": -327.15716552734375, - "loss": 0.6069, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.5263462662696838, - "rewards/grad_term": 0.004026439506560564, - "rewards/margins": 5.978251934051514, - "rewards/rejected": -6.504598140716553, - "step": 91 - }, - { - "epoch": 0.19084662258524568, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 34.422228673025685, - "learning_rate": 9.884315311496123e-07, - "logits/chosen": 0.5342029929161072, - "logits/rejected": 0.5386108160018921, - "logps/accuracies": 0.75, - "logps/chosen": -340.9605407714844, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -338.02606201171875, - "logps/ref_rejected": -346.3376770019531, - "logps/rejected": -377.7868347167969, - "loss": 0.5815, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.29344886541366577, - "rewards/grad_term": 0.020996563136577606, - "rewards/margins": 2.8514671325683594, - "rewards/rejected": -3.1449155807495117, - "step": 92 - }, - { - "epoch": 0.1929210423959549, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 25.326369311886236, - "learning_rate": 9.907947222134885e-07, - "logits/chosen": 0.4443345069885254, - "logits/rejected": 0.4642353653907776, - "logps/accuracies": 0.875, - "logps/chosen": -346.2325744628906, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -345.8390197753906, - "logps/ref_rejected": -357.72564697265625, - "logps/rejected": -413.6025085449219, - "loss": 0.5793, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.03935527801513672, - "rewards/grad_term": 0.00771428644657135, - "rewards/margins": 5.548335552215576, - "rewards/rejected": -5.587691783905029, - "step": 93 - }, - { - "epoch": 0.19499546220666408, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 21.376114840937195, - "learning_rate": 9.931326380221604e-07, - "logits/chosen": 0.6561794281005859, - "logits/rejected": 0.7463537454605103, - "logps/accuracies": 0.8125, - "logps/chosen": -254.1697540283203, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -258.75799560546875, - "logps/ref_rejected": -282.442138671875, - "logps/rejected": -320.4227600097656, - "loss": 0.5967, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.4588264226913452, - "rewards/grad_term": 0.02241508476436138, - "rewards/margins": 4.256890296936035, - "rewards/rejected": -3.7980637550354004, - "step": 94 - }, - { - "epoch": 0.19706988201737327, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 28.87335565570023, - "learning_rate": 9.95445813516801e-07, - "logits/chosen": 0.31641554832458496, - "logits/rejected": 0.4115113914012909, - "logps/accuracies": 0.8125, - "logps/chosen": -305.4784240722656, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -301.1734619140625, - "logps/ref_rejected": -309.7505187988281, - "logps/rejected": -370.07855224609375, - "loss": 0.595, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.43049633502960205, - "rewards/grad_term": 0.007062141317874193, - "rewards/margins": 5.602307319641113, - "rewards/rejected": -6.032803535461426, - "step": 95 - }, - { - "epoch": 0.19914430182808246, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 30.52259082669545, - "learning_rate": 9.977347668335242e-07, - "logits/chosen": 0.5447170734405518, - "logits/rejected": 0.6960605978965759, - "logps/accuracies": 0.8125, - "logps/chosen": -320.6680603027344, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -323.8368225097656, - "logps/ref_rejected": -339.37957763671875, - "logps/rejected": -400.385009765625, - "loss": 0.6261, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.3168814182281494, - "rewards/grad_term": 0.006425461731851101, - "rewards/margins": 6.417423248291016, - "rewards/rejected": -6.100542068481445, - "step": 96 - }, - { - "epoch": 0.20121872163879165, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 51.93086544682398, - "learning_rate": 1e-06, - "logits/chosen": 0.6568098068237305, - "logits/rejected": 0.6733189225196838, - "logps/accuracies": 0.875, - "logps/chosen": -286.9490966796875, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -284.9871826171875, - "logps/ref_rejected": -301.6272888183594, - "logps/rejected": -357.9549560546875, - "loss": 0.5764, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.19619154930114746, - "rewards/grad_term": 0.018322059884667397, - "rewards/margins": 5.436576843261719, - "rewards/rejected": -5.632768630981445, - "step": 97 - }, - { - "epoch": 0.20329314144950084, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 28.258232668290642, - "learning_rate": 1e-06, - "logits/chosen": 0.41130974888801575, - "logits/rejected": 0.47705498337745667, - "logps/accuracies": 0.75, - "logps/chosen": -322.64227294921875, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -310.24725341796875, - "logps/ref_rejected": -321.1720275878906, - "logps/rejected": -375.3896484375, - "loss": 0.6226, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.239502191543579, - "rewards/grad_term": 0.022803550586104393, - "rewards/margins": 4.182260990142822, - "rewards/rejected": -5.4217634201049805, - "step": 98 - }, - { - "epoch": 0.20536756126021002, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 27.738502482797642, - "learning_rate": 9.988465974625143e-07, - "logits/chosen": 0.4429183602333069, - "logits/rejected": 0.5393229126930237, - "logps/accuracies": 0.75, - "logps/chosen": -272.3836975097656, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -278.6350402832031, - "logps/ref_rejected": -277.7386169433594, - "logps/rejected": -315.6932373046875, - "loss": 0.6369, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.6251335740089417, - "rewards/grad_term": 0.01758977398276329, - "rewards/margins": 4.420593738555908, - "rewards/rejected": -3.7954602241516113, - "step": 99 - }, - { - "epoch": 0.20744198107091924, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 49.56019663548194, - "learning_rate": 9.976931949250289e-07, - "logits/chosen": 0.5111449956893921, - "logits/rejected": 0.4637998640537262, - "logps/accuracies": 0.8125, - "logps/chosen": -305.05950927734375, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -303.1130065917969, - "logps/ref_rejected": -295.25433349609375, - "logps/rejected": -367.4417724609375, - "loss": 0.635, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.19465157389640808, - "rewards/grad_term": 0.006312578916549683, - "rewards/margins": 7.024093151092529, - "rewards/rejected": -7.218744277954102, - "step": 100 - }, - { - "epoch": 0.20951640088162843, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 17.980236189059163, - "learning_rate": 9.965397923875432e-07, - "logits/chosen": 0.5828474760055542, - "logits/rejected": 0.6235547661781311, - "logps/accuracies": 0.8125, - "logps/chosen": -270.784912109375, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -273.26043701171875, - "logps/ref_rejected": -269.11077880859375, - "logps/rejected": -325.9720458984375, - "loss": 0.6338, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.24755248427391052, - "rewards/grad_term": 0.01740310713648796, - "rewards/margins": 5.933681488037109, - "rewards/rejected": -5.686128616333008, - "step": 101 - }, - { - "epoch": 0.21159082069233762, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 51.922995548635456, - "learning_rate": 9.953863898500576e-07, - "logits/chosen": 0.18250882625579834, - "logits/rejected": 0.20775896310806274, - "logps/accuracies": 0.8125, - "logps/chosen": -266.9601745605469, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -265.5532531738281, - "logps/ref_rejected": -263.236328125, - "logps/rejected": -325.6028747558594, - "loss": 0.5857, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.14069411158561707, - "rewards/grad_term": 0.010405524633824825, - "rewards/margins": 6.095961570739746, - "rewards/rejected": -6.236655235290527, - "step": 102 - }, - { - "epoch": 0.2136652405030468, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 58.49656496183437, - "learning_rate": 9.94232987312572e-07, - "logits/chosen": 0.24150438606739044, - "logits/rejected": 0.23409827053546906, - "logps/accuracies": 0.6875, - "logps/chosen": -275.4272155761719, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -267.8497009277344, - "logps/ref_rejected": -259.2445068359375, - "logps/rejected": -302.8243103027344, - "loss": 0.654, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.7577495574951172, - "rewards/grad_term": 0.027012387290596962, - "rewards/margins": 3.6002304553985596, - "rewards/rejected": -4.357979774475098, - "step": 103 - }, - { - "epoch": 0.215739660313756, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 33.69936760710596, - "learning_rate": 9.930795847750865e-07, - "logits/chosen": 0.37147602438926697, - "logits/rejected": 0.5065699219703674, - "logps/accuracies": 0.8125, - "logps/chosen": -246.92913818359375, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -244.8133544921875, - "logps/ref_rejected": -273.55145263671875, - "logps/rejected": -323.93560791015625, - "loss": 0.5903, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.211576446890831, - "rewards/grad_term": 0.01827353984117508, - "rewards/margins": 4.82683801651001, - "rewards/rejected": -5.038414001464844, - "step": 104 - }, - { - "epoch": 0.21781408012446518, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 30.373799785451364, - "learning_rate": 9.919261822376009e-07, - "logits/chosen": 0.650319516658783, - "logits/rejected": 0.6357383728027344, - "logps/accuracies": 0.6875, - "logps/chosen": -262.9386901855469, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -259.9366760253906, - "logps/ref_rejected": -256.1930236816406, - "logps/rejected": -282.44952392578125, - "loss": 0.5851, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3002018928527832, - "rewards/grad_term": 0.028288275003433228, - "rewards/margins": 2.3254497051239014, - "rewards/rejected": -2.6256518363952637, - "step": 105 - }, - { - "epoch": 0.21988849993517437, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 24.848196690851, - "learning_rate": 9.907727797001152e-07, - "logits/chosen": 0.35767537355422974, - "logits/rejected": 0.42828047275543213, - "logps/accuracies": 0.9375, - "logps/chosen": -260.8604736328125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -265.82379150390625, - "logps/ref_rejected": -287.0606689453125, - "logps/rejected": -353.9102478027344, - "loss": 0.5905, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.4963342547416687, - "rewards/grad_term": 0.011764682829380035, - "rewards/margins": 7.18129301071167, - "rewards/rejected": -6.684958457946777, - "step": 106 - }, - { - "epoch": 0.22196291974588359, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 50.26567742873495, - "learning_rate": 9.896193771626296e-07, - "logits/chosen": 0.24374046921730042, - "logits/rejected": 0.2071159929037094, - "logps/accuracies": 0.8125, - "logps/chosen": -322.9122009277344, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -321.7509765625, - "logps/ref_rejected": -327.6671142578125, - "logps/rejected": -379.50433349609375, - "loss": 0.5947, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.11612237989902496, - "rewards/grad_term": 0.013888241723179817, - "rewards/margins": 5.06759786605835, - "rewards/rejected": -5.183720588684082, - "step": 107 - }, - { - "epoch": 0.22403733955659277, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 24.60066728160171, - "learning_rate": 9.884659746251442e-07, - "logits/chosen": 0.28119832277297974, - "logits/rejected": 0.4410630464553833, - "logps/accuracies": 0.8125, - "logps/chosen": -263.49688720703125, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -259.3692626953125, - "logps/ref_rejected": -296.3498229980469, - "logps/rejected": -338.8854064941406, - "loss": 0.6482, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.41276171803474426, - "rewards/grad_term": 0.030529310926795006, - "rewards/margins": 3.840797185897827, - "rewards/rejected": -4.253559112548828, - "step": 108 - }, - { - "epoch": 0.22611175936730196, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 29.88832833016055, - "learning_rate": 9.873125720876585e-07, - "logits/chosen": 0.4925777018070221, - "logits/rejected": 0.39786702394485474, - "logps/accuracies": 0.625, - "logps/chosen": -288.4710693359375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -291.87298583984375, - "logps/ref_rejected": -257.73553466796875, - "logps/rejected": -322.43603515625, - "loss": 0.6003, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.3401949405670166, - "rewards/grad_term": 0.005394600797444582, - "rewards/margins": 6.810248851776123, - "rewards/rejected": -6.4700541496276855, - "step": 109 - }, - { - "epoch": 0.22818617917801115, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 23.915753236950135, - "learning_rate": 9.861591695501729e-07, - "logits/chosen": 0.2205159217119217, - "logits/rejected": 0.19697824120521545, - "logps/accuracies": 0.8125, - "logps/chosen": -352.7537841796875, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -355.77337646484375, - "logps/ref_rejected": -356.9278564453125, - "logps/rejected": -400.95245361328125, - "loss": 0.5934, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.30196261405944824, - "rewards/grad_term": 0.017759006470441818, - "rewards/margins": 4.704426288604736, - "rewards/rejected": -4.402463436126709, - "step": 110 - }, - { - "epoch": 0.23026059898872034, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 27.27299967581395, - "learning_rate": 9.850057670126874e-07, - "logits/chosen": 0.37821733951568604, - "logits/rejected": 0.4970583915710449, - "logps/accuracies": 0.75, - "logps/chosen": -237.38504028320312, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -230.9459228515625, - "logps/ref_rejected": -249.4907684326172, - "logps/rejected": -298.8011169433594, - "loss": 0.633, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.6439133286476135, - "rewards/grad_term": 0.018191155046224594, - "rewards/margins": 4.287120819091797, - "rewards/rejected": -4.931033134460449, - "step": 111 - }, - { - "epoch": 0.23233501879942953, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 39.145128913057, - "learning_rate": 9.838523644752018e-07, - "logits/chosen": 0.1512741595506668, - "logits/rejected": 0.32822132110595703, - "logps/accuracies": 0.75, - "logps/chosen": -267.9004821777344, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -264.21697998046875, - "logps/ref_rejected": -307.68572998046875, - "logps/rejected": -361.28033447265625, - "loss": 0.5966, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.3683478534221649, - "rewards/grad_term": 0.017407521605491638, - "rewards/margins": 4.9911088943481445, - "rewards/rejected": -5.359456539154053, - "step": 112 - }, - { - "epoch": 0.23440943861013872, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 27.5954679159607, - "learning_rate": 9.826989619377162e-07, - "logits/chosen": 0.5426469445228577, - "logits/rejected": 0.5697547197341919, - "logps/accuracies": 0.75, - "logps/chosen": -312.826904296875, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -306.6598815917969, - "logps/ref_rejected": -277.28387451171875, - "logps/rejected": -353.48114013671875, - "loss": 0.6009, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.616702675819397, - "rewards/grad_term": 0.01106889545917511, - "rewards/margins": 7.003021240234375, - "rewards/rejected": -7.619723320007324, - "step": 113 - }, - { - "epoch": 0.23648385842084793, - "flips/correct->correct": 0.6875, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 40.98809530673688, - "learning_rate": 9.815455594002307e-07, - "logits/chosen": 0.39876848459243774, - "logits/rejected": 0.3462454378604889, - "logps/accuracies": 0.8125, - "logps/chosen": -294.205078125, - "logps/ref_accuracies": 0.6875, - "logps/ref_chosen": -285.1748046875, - "logps/ref_rejected": -286.44140625, - "logps/rejected": -345.4891662597656, - "loss": 0.6112, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.9030314087867737, - "rewards/grad_term": 0.011187486350536346, - "rewards/margins": 5.001744747161865, - "rewards/rejected": -5.904776573181152, - "step": 114 - }, - { - "epoch": 0.23855827823155712, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 40.11356790900559, - "learning_rate": 9.80392156862745e-07, - "logits/chosen": 0.5221942067146301, - "logits/rejected": 0.4882541298866272, - "logps/accuracies": 0.8125, - "logps/chosen": -260.0171203613281, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -251.60545349121094, - "logps/ref_rejected": -259.59515380859375, - "logps/rejected": -315.2181701660156, - "loss": 0.6161, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.8411648869514465, - "rewards/grad_term": 0.021061977371573448, - "rewards/margins": 4.721133232116699, - "rewards/rejected": -5.56229829788208, - "step": 115 - }, - { - "epoch": 0.2406326980422663, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 31.171420280298065, - "learning_rate": 9.792387543252594e-07, - "logits/chosen": 0.23254762589931488, - "logits/rejected": 0.2675570845603943, - "logps/accuracies": 0.9375, - "logps/chosen": -289.489501953125, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -289.00616455078125, - "logps/ref_rejected": -302.8209533691406, - "logps/rejected": -371.5626220703125, - "loss": 0.5818, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.048332199454307556, - "rewards/grad_term": 0.007328622043132782, - "rewards/margins": 6.825834274291992, - "rewards/rejected": -6.874166488647461, - "step": 116 - }, - { - "epoch": 0.2427071178529755, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 25.865643815270218, - "learning_rate": 9.780853517877738e-07, - "logits/chosen": 0.5108106136322021, - "logits/rejected": 0.5345089435577393, - "logps/accuracies": 0.875, - "logps/chosen": -284.67791748046875, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -285.08941650390625, - "logps/ref_rejected": -308.15838623046875, - "logps/rejected": -370.12554931640625, - "loss": 0.5606, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.04114929586648941, - "rewards/grad_term": 0.009833071380853653, - "rewards/margins": 6.237868785858154, - "rewards/rejected": -6.196719646453857, - "step": 117 - }, - { - "epoch": 0.24478153766368468, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 23.22657914288563, - "learning_rate": 9.769319492502884e-07, - "logits/chosen": 0.23898278176784515, - "logits/rejected": 0.2838956415653229, - "logps/accuracies": 0.9375, - "logps/chosen": -317.9300231933594, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -322.7581787109375, - "logps/ref_rejected": -333.2860107421875, - "logps/rejected": -404.1823425292969, - "loss": 0.5433, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.4828159809112549, - "rewards/grad_term": 0.002703046426177025, - "rewards/margins": 7.572445869445801, - "rewards/rejected": -7.089630126953125, - "step": 118 - }, - { - "epoch": 0.24685595747439387, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 20.20109483182801, - "learning_rate": 9.757785467128027e-07, - "logits/chosen": 0.6895065307617188, - "logits/rejected": 0.7345404624938965, - "logps/accuracies": 0.8125, - "logps/chosen": -298.1529846191406, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -294.2225341796875, - "logps/ref_rejected": -282.6121520996094, - "logps/rejected": -337.44476318359375, - "loss": 0.5809, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.3930422067642212, - "rewards/grad_term": 0.017846597358584404, - "rewards/margins": 5.090217113494873, - "rewards/rejected": -5.483259677886963, - "step": 119 - }, - { - "epoch": 0.24893037728510306, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 33.31398261005609, - "learning_rate": 9.74625144175317e-07, - "logits/chosen": 0.37160423398017883, - "logits/rejected": 0.3335186839103699, - "logps/accuracies": 0.75, - "logps/chosen": -276.2862548828125, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -279.1397399902344, - "logps/ref_rejected": -279.9727478027344, - "logps/rejected": -327.0055847167969, - "loss": 0.62, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.28534770011901855, - "rewards/grad_term": 0.01826310157775879, - "rewards/margins": 4.9886322021484375, - "rewards/rejected": -4.70328426361084, - "step": 120 - }, - { - "epoch": 0.25100479709581225, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0625, - "flips/incorrect->incorrect": 0.4375, - "grad_norm": 24.34104219676861, - "learning_rate": 9.734717416378314e-07, - "logits/chosen": 0.47060269117355347, - "logits/rejected": 0.533828854560852, - "logps/accuracies": 0.5625, - "logps/chosen": -250.86029052734375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -249.06607055664062, - "logps/ref_rejected": -266.4825439453125, - "logps/rejected": -304.35400390625, - "loss": 0.6245, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.17942330241203308, - "rewards/grad_term": 0.023510945960879326, - "rewards/margins": 3.6077194213867188, - "rewards/rejected": -3.787142515182495, - "step": 121 - }, - { - "epoch": 0.25307921690652146, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 32.38649249036862, - "learning_rate": 9.72318339100346e-07, - "logits/chosen": 0.058825843036174774, - "logits/rejected": 0.1310182362794876, - "logps/accuracies": 0.625, - "logps/chosen": -307.8884582519531, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -298.2418212890625, - "logps/ref_rejected": -289.80157470703125, - "logps/rejected": -332.2846984863281, - "loss": 0.6421, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.9646634459495544, - "rewards/grad_term": 0.028044363483786583, - "rewards/margins": 3.283651828765869, - "rewards/rejected": -4.248315334320068, - "step": 122 - }, - { - "epoch": 0.2551536367172306, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 25.328252823752003, - "learning_rate": 9.711649365628604e-07, - "logits/chosen": 0.4595690667629242, - "logits/rejected": 0.4828678071498871, - "logps/accuracies": 0.8125, - "logps/chosen": -319.6042785644531, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -314.05908203125, - "logps/ref_rejected": -309.8699645996094, - "logps/rejected": -373.5873718261719, - "loss": 0.638, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.5545214414596558, - "rewards/grad_term": 0.0088451923802495, - "rewards/margins": 5.817216396331787, - "rewards/rejected": -6.371737480163574, - "step": 123 - }, - { - "epoch": 0.25722805652793984, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 58.14409414314697, - "learning_rate": 9.70011534025375e-07, - "logits/chosen": 0.16532814502716064, - "logits/rejected": 0.1864890158176422, - "logps/accuracies": 0.6875, - "logps/chosen": -322.5909729003906, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -328.1080017089844, - "logps/ref_rejected": -314.7974548339844, - "logps/rejected": -369.16583251953125, - "loss": 0.6333, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.551704466342926, - "rewards/grad_term": 0.014467663131654263, - "rewards/margins": 5.9885406494140625, - "rewards/rejected": -5.4368367195129395, - "step": 124 - }, - { - "epoch": 0.25930247633864906, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 29.577490597996505, - "learning_rate": 9.688581314878893e-07, - "logits/chosen": 0.27106067538261414, - "logits/rejected": 0.28159230947494507, - "logps/accuracies": 0.875, - "logps/chosen": -324.951904296875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -315.4916687011719, - "logps/ref_rejected": -310.50274658203125, - "logps/rejected": -384.7645568847656, - "loss": 0.6228, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.9460303783416748, - "rewards/grad_term": 0.01042198482900858, - "rewards/margins": 6.480146884918213, - "rewards/rejected": -7.426177024841309, - "step": 125 - }, - { - "epoch": 0.2613768961493582, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 28.275175168662855, - "learning_rate": 9.677047289504036e-07, - "logits/chosen": 0.16482499241828918, - "logits/rejected": 0.13334128260612488, - "logps/accuracies": 0.8125, - "logps/chosen": -395.91998291015625, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -403.248046875, - "logps/ref_rejected": -382.96343994140625, - "logps/rejected": -467.2958679199219, - "loss": 0.586, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.7328065633773804, - "rewards/grad_term": 0.002520698821172118, - "rewards/margins": 9.166044235229492, - "rewards/rejected": -8.43323802947998, - "step": 126 - }, - { - "epoch": 0.26345131596006743, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 31.309734628094976, - "learning_rate": 9.66551326412918e-07, - "logits/chosen": 0.07952776551246643, - "logits/rejected": 0.1613186150789261, - "logps/accuracies": 0.875, - "logps/chosen": -320.70037841796875, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -327.677490234375, - "logps/ref_rejected": -339.91748046875, - "logps/rejected": -401.8999328613281, - "loss": 0.6011, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.6977108120918274, - "rewards/grad_term": 0.010999541729688644, - "rewards/margins": 6.89595890045166, - "rewards/rejected": -6.198247909545898, - "step": 127 - }, - { - "epoch": 0.2655257357707766, - "flips/correct->correct": 0.6875, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 31.741083253136384, - "learning_rate": 9.653979238754326e-07, - "logits/chosen": 0.36669662594795227, - "logits/rejected": 0.40633296966552734, - "logps/accuracies": 0.8125, - "logps/chosen": -352.07159423828125, - "logps/ref_accuracies": 0.6875, - "logps/ref_chosen": -343.3804016113281, - "logps/ref_rejected": -353.6275939941406, - "logps/rejected": -414.04425048828125, - "loss": 0.6335, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.8691204190254211, - "rewards/grad_term": 0.015956774353981018, - "rewards/margins": 5.172546863555908, - "rewards/rejected": -6.0416669845581055, - "step": 128 - }, - { - "epoch": 0.2655257357707766, - "eval_flips/correct->correct": 0.4236453175544739, - "eval_flips/correct->incorrect": 0.019704433158040047, - "eval_flips/incorrect->correct": 0.3300492465496063, - "eval_flips/incorrect->incorrect": 0.2266009896993637, - "eval_logits/chosen": 0.3016127645969391, - "eval_logits/rejected": 0.34773820638656616, - "eval_logps/accuracies": 0.7536945939064026, - "eval_logps/chosen": -294.51837158203125, - "eval_logps/ref_accuracies": 0.4433497488498688, - "eval_logps/ref_chosen": -287.3511047363281, - "eval_logps/ref_rejected": -289.0460205078125, - "eval_logps/rejected": -349.0025329589844, - "eval_loss": 0.6313375234603882, - "eval_rewards/accuracies": 0.8866994976997375, - "eval_rewards/chosen": -0.7167255878448486, - "eval_rewards/grad_term": 0.016497639939188957, - "eval_rewards/margins": 5.278923511505127, - "eval_rewards/rejected": -5.995649337768555, - "eval_runtime": 785.8607, - "eval_samples_per_second": 2.059, - "eval_steps_per_second": 0.258, - "step": 128 - }, - { - "epoch": 0.2676001555814858, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 35.58302765361098, - "learning_rate": 9.64244521337947e-07, - "logits/chosen": 0.3390696048736572, - "logits/rejected": 0.3560726046562195, - "logps/accuracies": 0.8125, - "logps/chosen": -322.2768249511719, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -310.5375061035156, - "logps/ref_rejected": -317.2485046386719, - "logps/rejected": -383.12200927734375, - "loss": 0.6209, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.1739336252212524, - "rewards/grad_term": 0.013801316730678082, - "rewards/margins": 5.413419246673584, - "rewards/rejected": -6.5873517990112305, - "step": 129 - }, - { - "epoch": 0.26967457539219497, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 29.188493789977155, - "learning_rate": 9.630911188004613e-07, - "logits/chosen": 0.4089130163192749, - "logits/rejected": 0.3992210626602173, - "logps/accuracies": 0.625, - "logps/chosen": -246.3241729736328, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -250.2421417236328, - "logps/ref_rejected": -233.44342041015625, - "logps/rejected": -285.1191711425781, - "loss": 0.6077, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.391795814037323, - "rewards/grad_term": 0.014157270081341267, - "rewards/margins": 5.559370040893555, - "rewards/rejected": -5.167574405670166, - "step": 130 - }, - { - "epoch": 0.2717489952029042, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 36.02900591013536, - "learning_rate": 9.619377162629756e-07, - "logits/chosen": 0.32642504572868347, - "logits/rejected": 0.34259384870529175, - "logps/accuracies": 0.875, - "logps/chosen": -331.6784973144531, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -329.0844421386719, - "logps/ref_rejected": -341.3873596191406, - "logps/rejected": -407.9965515136719, - "loss": 0.632, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.2594057619571686, - "rewards/grad_term": 0.006255139596760273, - "rewards/margins": 6.4015092849731445, - "rewards/rejected": -6.660915374755859, - "step": 131 - }, - { - "epoch": 0.2738234150136134, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 24.659869882606102, - "learning_rate": 9.607843137254902e-07, - "logits/chosen": 0.25219637155532837, - "logits/rejected": 0.21175454556941986, - "logps/accuracies": 0.6875, - "logps/chosen": -320.4070129394531, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -320.45111083984375, - "logps/ref_rejected": -291.7745056152344, - "logps/rejected": -371.3065490722656, - "loss": 0.5938, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.0044078826904296875, - "rewards/grad_term": 0.007969305850565434, - "rewards/margins": 7.957607269287109, - "rewards/rejected": -7.953199863433838, - "step": 132 - }, - { - "epoch": 0.27589783482432256, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 66.69910712869152, - "learning_rate": 9.596309111880046e-07, - "logits/chosen": 0.4336986243724823, - "logits/rejected": 0.4323787987232208, - "logps/accuracies": 0.6875, - "logps/chosen": -302.4508361816406, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -285.8231201171875, - "logps/ref_rejected": -284.0436706542969, - "logps/rejected": -342.44122314453125, - "loss": 0.6008, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.6627693176269531, - "rewards/grad_term": 0.020465871319174767, - "rewards/margins": 4.176986217498779, - "rewards/rejected": -5.839755535125732, - "step": 133 - }, - { - "epoch": 0.2779722546350318, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 35.23241127542349, - "learning_rate": 9.58477508650519e-07, - "logits/chosen": 0.502811074256897, - "logits/rejected": 0.5239925980567932, - "logps/accuracies": 0.75, - "logps/chosen": -317.7173156738281, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -317.0647888183594, - "logps/ref_rejected": -297.7698669433594, - "logps/rejected": -358.817626953125, - "loss": 0.6362, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.06524688005447388, - "rewards/grad_term": 0.008810807019472122, - "rewards/margins": 6.039529800415039, - "rewards/rejected": -6.1047773361206055, - "step": 134 - }, - { - "epoch": 0.28004667444574094, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 58.595789612615945, - "learning_rate": 9.573241061130333e-07, - "logits/chosen": 0.36474430561065674, - "logits/rejected": 0.35197287797927856, - "logps/accuracies": 0.75, - "logps/chosen": -326.8201904296875, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -325.84783935546875, - "logps/ref_rejected": -326.2251892089844, - "logps/rejected": -372.5774841308594, - "loss": 0.6008, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.09723645448684692, - "rewards/grad_term": 0.020627174526453018, - "rewards/margins": 4.537996768951416, - "rewards/rejected": -4.6352338790893555, - "step": 135 - }, - { - "epoch": 0.28212109425645016, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 25.14456310024547, - "learning_rate": 9.561707035755479e-07, - "logits/chosen": 0.26421457529067993, - "logits/rejected": 0.33900099992752075, - "logps/accuracies": 0.875, - "logps/chosen": -260.8269348144531, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -258.1986999511719, - "logps/ref_rejected": -283.28680419921875, - "logps/rejected": -322.0076904296875, - "loss": 0.6207, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.26282617449760437, - "rewards/grad_term": 0.02707597427070141, - "rewards/margins": 3.609261989593506, - "rewards/rejected": -3.8720884323120117, - "step": 136 - }, - { - "epoch": 0.2841955140671593, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.4375, - "grad_norm": 42.8867860973982, - "learning_rate": 9.550173010380622e-07, - "logits/chosen": 0.09971302002668381, - "logits/rejected": 0.12542912364006042, - "logps/accuracies": 0.5625, - "logps/chosen": -322.331787109375, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -321.5882873535156, - "logps/ref_rejected": -315.2976379394531, - "logps/rejected": -352.12066650390625, - "loss": 0.6771, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.07434892654418945, - "rewards/grad_term": 0.01898660883307457, - "rewards/margins": 3.607954978942871, - "rewards/rejected": -3.6823039054870605, - "step": 137 - }, - { - "epoch": 0.28626993387786853, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 67.6595700226082, - "learning_rate": 9.538638985005768e-07, - "logits/chosen": 0.23438116908073425, - "logits/rejected": 0.3342619240283966, - "logps/accuracies": 0.8125, - "logps/chosen": -300.3291015625, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -296.4096984863281, - "logps/ref_rejected": -310.85064697265625, - "logps/rejected": -363.2022399902344, - "loss": 0.6381, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.39193806052207947, - "rewards/grad_term": 0.017197635024785995, - "rewards/margins": 4.84321928024292, - "rewards/rejected": -5.235157489776611, - "step": 138 - }, - { - "epoch": 0.28834435368857775, - "flips/correct->correct": 0.1875, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 46.44298957827346, - "learning_rate": 9.52710495963091e-07, - "logits/chosen": 0.1467391550540924, - "logits/rejected": 0.11830101907253265, - "logps/accuracies": 0.625, - "logps/chosen": -321.1262512207031, - "logps/ref_accuracies": 0.1875, - "logps/ref_chosen": -325.08428955078125, - "logps/ref_rejected": -281.5567321777344, - "logps/rejected": -340.51849365234375, - "loss": 0.6354, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.3958081305027008, - "rewards/grad_term": 0.013231747783720493, - "rewards/margins": 6.291983604431152, - "rewards/rejected": -5.896175384521484, - "step": 139 - }, - { - "epoch": 0.2904187734992869, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 36.89267412980749, - "learning_rate": 9.515570934256055e-07, - "logits/chosen": 0.3458084762096405, - "logits/rejected": 0.37101224064826965, - "logps/accuracies": 0.875, - "logps/chosen": -278.9690246582031, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -275.2301940917969, - "logps/ref_rejected": -293.14935302734375, - "logps/rejected": -343.7322998046875, - "loss": 0.6516, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.37388384342193604, - "rewards/grad_term": 0.020328430458903313, - "rewards/margins": 4.684409141540527, - "rewards/rejected": -5.058292865753174, - "step": 140 - }, - { - "epoch": 0.2924931933099961, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0625, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 29.520183041657447, - "learning_rate": 9.504036908881198e-07, - "logits/chosen": 0.2588088810443878, - "logits/rejected": 0.35400643944740295, - "logps/accuracies": 0.6875, - "logps/chosen": -329.9474182128906, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -309.6986999511719, - "logps/ref_rejected": -333.968994140625, - "logps/rejected": -376.42169189453125, - "loss": 0.6512, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.024869918823242, - "rewards/grad_term": 0.03422696888446808, - "rewards/margins": 2.220407247543335, - "rewards/rejected": -4.24527645111084, - "step": 141 - }, - { - "epoch": 0.2945676131207053, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 20.090341392606742, - "learning_rate": 9.492502883506344e-07, - "logits/chosen": 0.17465892434120178, - "logits/rejected": 0.19804833829402924, - "logps/accuracies": 0.6875, - "logps/chosen": -313.5783386230469, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -307.25164794921875, - "logps/ref_rejected": -290.28948974609375, - "logps/rejected": -353.5581359863281, - "loss": 0.5813, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.6326678991317749, - "rewards/grad_term": 0.017216186970472336, - "rewards/margins": 5.694197177886963, - "rewards/rejected": -6.3268656730651855, - "step": 142 - }, - { - "epoch": 0.2966420329314145, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 38.2122061812, - "learning_rate": 9.480968858131488e-07, - "logits/chosen": 0.28784969449043274, - "logits/rejected": 0.38434553146362305, - "logps/accuracies": 0.8125, - "logps/chosen": -337.89697265625, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -307.4181823730469, - "logps/ref_rejected": -347.0847473144531, - "logps/rejected": -420.6956787109375, - "loss": 0.5784, - "rewards/accuracies": 0.875, - "rewards/chosen": -3.047877788543701, - "rewards/grad_term": 0.021118801087141037, - "rewards/margins": 4.3132147789001465, - "rewards/rejected": -7.361092567443848, - "step": 143 - }, - { - "epoch": 0.29871645274212366, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5625, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 37.993499093491, - "learning_rate": 9.469434832756632e-07, - "logits/chosen": 0.3818073570728302, - "logits/rejected": 0.4472813010215759, - "logps/accuracies": 1.0, - "logps/chosen": -309.6822509765625, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -295.15576171875, - "logps/ref_rejected": -316.4786071777344, - "logps/rejected": -395.452392578125, - "loss": 0.6367, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.4526524543762207, - "rewards/grad_term": 0.005610838998109102, - "rewards/margins": 6.4447221755981445, - "rewards/rejected": -7.897374629974365, - "step": 144 - }, - { - "epoch": 0.3007908725528329, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 54.21342872299693, - "learning_rate": 9.457900807381776e-07, - "logits/chosen": 0.10580252856016159, - "logits/rejected": 0.1295485496520996, - "logps/accuracies": 0.875, - "logps/chosen": -312.27178955078125, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -306.3640441894531, - "logps/ref_rejected": -344.1884765625, - "logps/rejected": -413.30572509765625, - "loss": 0.5971, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.5907725095748901, - "rewards/grad_term": 0.014945675618946552, - "rewards/margins": 6.320951461791992, - "rewards/rejected": -6.911723613739014, - "step": 145 - }, - { - "epoch": 0.3028652923635421, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 49.62895603709547, - "learning_rate": 9.446366782006921e-07, - "logits/chosen": 0.496852844953537, - "logits/rejected": 0.49739354848861694, - "logps/accuracies": 0.75, - "logps/chosen": -254.4556121826172, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -253.5081787109375, - "logps/ref_rejected": -243.6640625, - "logps/rejected": -306.6009826660156, - "loss": 0.6307, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.09474316239356995, - "rewards/grad_term": 0.014840014278888702, - "rewards/margins": 6.198947906494141, - "rewards/rejected": -6.2936906814575195, - "step": 146 - }, - { - "epoch": 0.30493971217425125, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5625, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 33.16438726233068, - "learning_rate": 9.434832756632064e-07, - "logits/chosen": 0.3227022588253021, - "logits/rejected": 0.29622456431388855, - "logps/accuracies": 0.9375, - "logps/chosen": -315.11383056640625, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -323.7686462402344, - "logps/ref_rejected": -318.28631591796875, - "logps/rejected": -397.4606628417969, - "loss": 0.598, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.865482747554779, - "rewards/grad_term": 0.0047454568557441235, - "rewards/margins": 8.782920837402344, - "rewards/rejected": -7.917438507080078, - "step": 147 - }, - { - "epoch": 0.30701413198496047, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 32.98244501476046, - "learning_rate": 9.423298731257209e-07, - "logits/chosen": 0.26352736353874207, - "logits/rejected": 0.2875834107398987, - "logps/accuracies": 0.6875, - "logps/chosen": -268.8025817871094, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -260.4774169921875, - "logps/ref_rejected": -263.6733703613281, - "logps/rejected": -314.69268798828125, - "loss": 0.6464, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.8325148820877075, - "rewards/grad_term": 0.02254444733262062, - "rewards/margins": 4.269417762756348, - "rewards/rejected": -5.101933002471924, - "step": 148 - }, - { - "epoch": 0.30908855179566963, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 33.32776229173226, - "learning_rate": 9.411764705882352e-07, - "logits/chosen": 0.11156149208545685, - "logits/rejected": 0.24737051129341125, - "logps/accuracies": 0.75, - "logps/chosen": -303.48388671875, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -298.459228515625, - "logps/ref_rejected": -314.2269592285156, - "logps/rejected": -370.69622802734375, - "loss": 0.5583, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.5024658441543579, - "rewards/grad_term": 0.022268792614340782, - "rewards/margins": 5.144461631774902, - "rewards/rejected": -5.646927356719971, - "step": 149 - }, - { - "epoch": 0.31116297160637885, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0625, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 21.82859690680754, - "learning_rate": 9.400230680507497e-07, - "logits/chosen": 0.29909923672676086, - "logits/rejected": 0.33298757672309875, - "logps/accuracies": 0.625, - "logps/chosen": -245.38111877441406, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -240.87034606933594, - "logps/ref_rejected": -240.8376922607422, - "logps/rejected": -289.25067138671875, - "loss": 0.5837, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.45107802748680115, - "rewards/grad_term": 0.022424593567848206, - "rewards/margins": 4.390218257904053, - "rewards/rejected": -4.841296195983887, - "step": 150 - }, - { - "epoch": 0.313237391417088, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 32.35074274110825, - "learning_rate": 9.38869665513264e-07, - "logits/chosen": 0.1612250953912735, - "logits/rejected": 0.15677325427532196, - "logps/accuracies": 0.8125, - "logps/chosen": -291.77081298828125, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -292.1325988769531, - "logps/ref_rejected": -287.9341735839844, - "logps/rejected": -358.65057373046875, - "loss": 0.6135, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.03617708384990692, - "rewards/grad_term": 0.011941466480493546, - "rewards/margins": 7.107817649841309, - "rewards/rejected": -7.071640968322754, - "step": 151 - }, - { - "epoch": 0.3153118112277972, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 42.437628926037625, - "learning_rate": 9.377162629757785e-07, - "logits/chosen": 0.21432383358478546, - "logits/rejected": 0.2382117211818695, - "logps/accuracies": 0.9375, - "logps/chosen": -270.55712890625, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -275.3125305175781, - "logps/ref_rejected": -276.75384521484375, - "logps/rejected": -355.29779052734375, - "loss": 0.5223, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.4755399823188782, - "rewards/grad_term": 0.004575583152472973, - "rewards/margins": 8.329938888549805, - "rewards/rejected": -7.854398250579834, - "step": 152 - }, - { - "epoch": 0.31738623103850644, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5625, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 41.66344734527698, - "learning_rate": 9.365628604382929e-07, - "logits/chosen": -0.07189223915338516, - "logits/rejected": -0.08959042280912399, - "logps/accuracies": 0.875, - "logps/chosen": -328.5618896484375, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -327.5036926269531, - "logps/ref_rejected": -321.0348205566406, - "logps/rejected": -393.354248046875, - "loss": 0.5755, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.10582125186920166, - "rewards/grad_term": 0.014492910355329514, - "rewards/margins": 7.126119613647461, - "rewards/rejected": -7.231941223144531, - "step": 153 - }, - { - "epoch": 0.3194606508492156, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 29.74794689137638, - "learning_rate": 9.354094579008073e-07, - "logits/chosen": 0.35974666476249695, - "logits/rejected": 0.3688337206840515, - "logps/accuracies": 0.8125, - "logps/chosen": -313.2078857421875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -305.1917419433594, - "logps/ref_rejected": -320.38128662109375, - "logps/rejected": -390.50933837890625, - "loss": 0.5928, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8016154766082764, - "rewards/grad_term": 0.008705828338861465, - "rewards/margins": 6.2111945152282715, - "rewards/rejected": -7.012809753417969, - "step": 154 - }, - { - "epoch": 0.3215350706599248, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 28.687378270489237, - "learning_rate": 9.342560553633218e-07, - "logits/chosen": 0.18269102275371552, - "logits/rejected": 0.1776474416255951, - "logps/accuracies": 0.875, - "logps/chosen": -275.4834289550781, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -272.5428771972656, - "logps/ref_rejected": -261.2928161621094, - "logps/rejected": -336.526611328125, - "loss": 0.5897, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.2940564453601837, - "rewards/grad_term": 0.006705356761813164, - "rewards/margins": 7.2293267250061035, - "rewards/rejected": -7.523382186889648, - "step": 155 - }, - { - "epoch": 0.323609490470634, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 36.335486366302646, - "learning_rate": 9.331026528258363e-07, - "logits/chosen": 0.07444247603416443, - "logits/rejected": 0.20133280754089355, - "logps/accuracies": 0.75, - "logps/chosen": -329.2906188964844, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -316.47515869140625, - "logps/ref_rejected": -332.9421081542969, - "logps/rejected": -394.68560791015625, - "loss": 0.5576, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.2815489768981934, - "rewards/grad_term": 0.0174331646412611, - "rewards/margins": 4.892797946929932, - "rewards/rejected": -6.174346923828125, - "step": 156 - }, - { - "epoch": 0.3256839102813432, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 27.29416495658281, - "learning_rate": 9.319492502883506e-07, - "logits/chosen": 0.4256312847137451, - "logits/rejected": 0.4740726053714752, - "logps/accuracies": 0.9375, - "logps/chosen": -321.6173095703125, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -303.39617919921875, - "logps/ref_rejected": -309.72113037109375, - "logps/rejected": -375.21429443359375, - "loss": 0.5708, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.822115182876587, - "rewards/grad_term": 0.014289310202002525, - "rewards/margins": 4.727199077606201, - "rewards/rejected": -6.549314022064209, - "step": 157 - }, - { - "epoch": 0.32775833009205235, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 32.223889847251904, - "learning_rate": 9.307958477508651e-07, - "logits/chosen": 0.3213425576686859, - "logits/rejected": 0.35512280464172363, - "logps/accuracies": 0.8125, - "logps/chosen": -299.30364990234375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -305.61181640625, - "logps/ref_rejected": -306.4347839355469, - "logps/rejected": -362.4326171875, - "loss": 0.573, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.6308162212371826, - "rewards/grad_term": 0.008797680027782917, - "rewards/margins": 6.230600357055664, - "rewards/rejected": -5.5997843742370605, - "step": 158 - }, - { - "epoch": 0.32983274990276157, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 42.213863271098234, - "learning_rate": 9.296424452133794e-07, - "logits/chosen": 0.377382755279541, - "logits/rejected": 0.456988126039505, - "logps/accuracies": 1.0, - "logps/chosen": -288.3867492675781, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -293.0536804199219, - "logps/ref_rejected": -317.74163818359375, - "logps/rejected": -391.38153076171875, - "loss": 0.546, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.4666934609413147, - "rewards/grad_term": 0.0034745843149721622, - "rewards/margins": 7.8306803703308105, - "rewards/rejected": -7.363986968994141, - "step": 159 - }, - { - "epoch": 0.3319071697134708, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 71.4121450161779, - "learning_rate": 9.284890426758939e-07, - "logits/chosen": 0.31267380714416504, - "logits/rejected": 0.33250027894973755, - "logps/accuracies": 0.75, - "logps/chosen": -327.6630554199219, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -322.4392395019531, - "logps/ref_rejected": -329.0008544921875, - "logps/rejected": -386.5987548828125, - "loss": 0.6503, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.5223828554153442, - "rewards/grad_term": 0.015317104756832123, - "rewards/margins": 5.23740291595459, - "rewards/rejected": -5.759785175323486, - "step": 160 - }, - { - "epoch": 0.33398158952417994, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 90.86602072589874, - "learning_rate": 9.273356401384083e-07, - "logits/chosen": 0.16347447037696838, - "logits/rejected": 0.2435542643070221, - "logps/accuracies": 0.8125, - "logps/chosen": -309.9219665527344, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -309.3226013183594, - "logps/ref_rejected": -317.44964599609375, - "logps/rejected": -393.472900390625, - "loss": 0.5695, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.05993741750717163, - "rewards/grad_term": 0.009122053161263466, - "rewards/margins": 7.542388916015625, - "rewards/rejected": -7.602326393127441, - "step": 161 - }, - { - "epoch": 0.33605600933488916, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 63.38287747184822, - "learning_rate": 9.261822376009227e-07, - "logits/chosen": 0.03160097077488899, - "logits/rejected": 0.15727761387825012, - "logps/accuracies": 0.75, - "logps/chosen": -304.5788269042969, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -295.0198669433594, - "logps/ref_rejected": -315.7266540527344, - "logps/rejected": -362.4599304199219, - "loss": 0.669, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.9558972716331482, - "rewards/grad_term": 0.02149307169020176, - "rewards/margins": 3.7174317836761475, - "rewards/rejected": -4.673328876495361, - "step": 162 - }, - { - "epoch": 0.3381304291455983, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 37.24982276954982, - "learning_rate": 9.250288350634371e-07, - "logits/chosen": 0.24977634847164154, - "logits/rejected": 0.2465619146823883, - "logps/accuracies": 0.875, - "logps/chosen": -300.94000244140625, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -296.2314453125, - "logps/ref_rejected": -290.3169250488281, - "logps/rejected": -364.783447265625, - "loss": 0.5658, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.47085797786712646, - "rewards/grad_term": 0.00792708620429039, - "rewards/margins": 6.97579288482666, - "rewards/rejected": -7.446650981903076, - "step": 163 - }, - { - "epoch": 0.34020484895630754, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 41.17406855028247, - "learning_rate": 9.238754325259515e-07, - "logits/chosen": 0.27946552634239197, - "logits/rejected": 0.2818312346935272, - "logps/accuracies": 0.75, - "logps/chosen": -334.7640075683594, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -326.3661804199219, - "logps/ref_rejected": -322.5880126953125, - "logps/rejected": -379.8989562988281, - "loss": 0.5503, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.8397865295410156, - "rewards/grad_term": 0.01920832134783268, - "rewards/margins": 4.89130973815918, - "rewards/rejected": -5.731095790863037, - "step": 164 - }, - { - "epoch": 0.3422792687670167, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 30.140573016986096, - "learning_rate": 9.227220299884659e-07, - "logits/chosen": 0.106672503054142, - "logits/rejected": 0.20751769840717316, - "logps/accuracies": 0.75, - "logps/chosen": -289.3790283203125, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -279.317138671875, - "logps/ref_rejected": -285.1666564941406, - "logps/rejected": -346.164794921875, - "loss": 0.5963, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.0061873197555542, - "rewards/grad_term": 0.0124615877866745, - "rewards/margins": 5.093625068664551, - "rewards/rejected": -6.099812030792236, - "step": 165 - }, - { - "epoch": 0.3443536885777259, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 33.13491495817512, - "learning_rate": 9.215686274509803e-07, - "logits/chosen": 0.49837812781333923, - "logits/rejected": 0.5220686793327332, - "logps/accuracies": 0.8125, - "logps/chosen": -289.36871337890625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -290.96624755859375, - "logps/ref_rejected": -274.1662292480469, - "logps/rejected": -336.1552734375, - "loss": 0.5496, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.15975357592105865, - "rewards/grad_term": 0.012641198933124542, - "rewards/margins": 6.358658790588379, - "rewards/rejected": -6.1989054679870605, - "step": 166 - }, - { - "epoch": 0.34642810838843513, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 54.11732274527683, - "learning_rate": 9.204152249134947e-07, - "logits/chosen": 0.015494227409362793, - "logits/rejected": 0.016678210347890854, - "logps/accuracies": 0.8125, - "logps/chosen": -325.68914794921875, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -331.82000732421875, - "logps/ref_rejected": -319.7809143066406, - "logps/rejected": -382.5654296875, - "loss": 0.5618, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.6130860447883606, - "rewards/grad_term": 0.009663441218435764, - "rewards/margins": 6.891541004180908, - "rewards/rejected": -6.278454780578613, - "step": 167 - }, - { - "epoch": 0.3485025281991443, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 35.17938428087587, - "learning_rate": 9.192618223760092e-07, - "logits/chosen": 0.1498590111732483, - "logits/rejected": 0.03451567143201828, - "logps/accuracies": 0.75, - "logps/chosen": -318.3073425292969, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -324.92156982421875, - "logps/ref_rejected": -279.1431884765625, - "logps/rejected": -343.2935791015625, - "loss": 0.5567, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.6614212989807129, - "rewards/grad_term": 0.004435483831912279, - "rewards/margins": 7.076463222503662, - "rewards/rejected": -6.415041923522949, - "step": 168 - }, - { - "epoch": 0.3505769480098535, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.4375, - "grad_norm": 29.79744425397875, - "learning_rate": 9.181084198385236e-07, - "logits/chosen": 0.4352983832359314, - "logits/rejected": 0.42166027426719666, - "logps/accuracies": 0.5625, - "logps/chosen": -218.41848754882812, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -218.35841369628906, - "logps/ref_rejected": -203.54812622070312, - "logps/rejected": -244.30548095703125, - "loss": 0.6235, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.0060057491064071655, - "rewards/grad_term": 0.020719772204756737, - "rewards/margins": 4.069727897644043, - "rewards/rejected": -4.0757341384887695, - "step": 169 - }, - { - "epoch": 0.35265136782056267, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.4375, - "grad_norm": 37.443318352494714, - "learning_rate": 9.16955017301038e-07, - "logits/chosen": 0.34023189544677734, - "logits/rejected": 0.36414065957069397, - "logps/accuracies": 0.5625, - "logps/chosen": -341.10162353515625, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -348.6609191894531, - "logps/ref_rejected": -313.69305419921875, - "logps/rejected": -365.22552490234375, - "loss": 0.5449, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.7559297680854797, - "rewards/grad_term": 0.012427425011992455, - "rewards/margins": 5.909174919128418, - "rewards/rejected": -5.153245449066162, - "step": 170 - }, - { - "epoch": 0.3547257876312719, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 38.974124690109655, - "learning_rate": 9.158016147635525e-07, - "logits/chosen": 0.2296074777841568, - "logits/rejected": 0.22281108796596527, - "logps/accuracies": 0.75, - "logps/chosen": -266.399658203125, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -266.3810729980469, - "logps/ref_rejected": -264.22479248046875, - "logps/rejected": -312.05328369140625, - "loss": 0.5639, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.00185690401121974, - "rewards/grad_term": 0.021189574152231216, - "rewards/margins": 4.780992031097412, - "rewards/rejected": -4.782848358154297, - "step": 171 - }, - { - "epoch": 0.35680020744198104, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 40.615225671584604, - "learning_rate": 9.146482122260668e-07, - "logits/chosen": -0.058992840349674225, - "logits/rejected": 0.13406533002853394, - "logps/accuracies": 0.8125, - "logps/chosen": -248.48712158203125, - "logps/ref_accuracies": 0.6875, - "logps/ref_chosen": -245.46426391601562, - "logps/ref_rejected": -322.3562316894531, - "logps/rejected": -359.47808837890625, - "loss": 0.6297, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.3022858798503876, - "rewards/grad_term": 0.02431631274521351, - "rewards/margins": 3.4098992347717285, - "rewards/rejected": -3.7121849060058594, - "step": 172 - }, - { - "epoch": 0.35887462725269026, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 48.38748944944743, - "learning_rate": 9.134948096885813e-07, - "logits/chosen": 0.17907698452472687, - "logits/rejected": 0.2532532811164856, - "logps/accuracies": 0.75, - "logps/chosen": -268.450927734375, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -272.5251770019531, - "logps/ref_rejected": -259.0370178222656, - "logps/rejected": -306.13800048828125, - "loss": 0.6268, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.40742844343185425, - "rewards/grad_term": 0.01490036677569151, - "rewards/margins": 5.117522716522217, - "rewards/rejected": -4.710094451904297, - "step": 173 - }, - { - "epoch": 0.3609490470633995, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 31.921038664510473, - "learning_rate": 9.123414071510956e-07, - "logits/chosen": 0.24931451678276062, - "logits/rejected": 0.32089927792549133, - "logps/accuracies": 0.875, - "logps/chosen": -327.949951171875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -325.4226379394531, - "logps/ref_rejected": -319.7037353515625, - "logps/rejected": -401.667724609375, - "loss": 0.5455, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.25273361802101135, - "rewards/grad_term": 0.007721267640590668, - "rewards/margins": 7.943665504455566, - "rewards/rejected": -8.196398735046387, - "step": 174 - }, - { - "epoch": 0.36302346687410864, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 32.597119731399815, - "learning_rate": 9.111880046136101e-07, - "logits/chosen": 0.14227242767810822, - "logits/rejected": 0.14696185290813446, - "logps/accuracies": 0.9375, - "logps/chosen": -302.6449890136719, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -304.1968078613281, - "logps/ref_rejected": -312.4093322753906, - "logps/rejected": -365.310791015625, - "loss": 0.5941, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.15517938137054443, - "rewards/grad_term": 0.017512062564492226, - "rewards/margins": 5.4453277587890625, - "rewards/rejected": -5.2901482582092285, - "step": 175 - }, - { - "epoch": 0.36509788668481785, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 40.38991325463123, - "learning_rate": 9.100346020761245e-07, - "logits/chosen": 0.4459385275840759, - "logits/rejected": 0.48317578434944153, - "logps/accuracies": 0.9375, - "logps/chosen": -375.50152587890625, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -361.5687255859375, - "logps/ref_rejected": -390.73028564453125, - "logps/rejected": -472.79315185546875, - "loss": 0.6096, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.3932788372039795, - "rewards/grad_term": 0.012904556468129158, - "rewards/margins": 6.81300163269043, - "rewards/rejected": -8.206280708312988, - "step": 176 - }, - { - "epoch": 0.367172306495527, - "flips/correct->correct": 0.6875, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 40.714548146552936, - "learning_rate": 9.088811995386389e-07, - "logits/chosen": 0.1182754784822464, - "logits/rejected": 0.10860362648963928, - "logps/accuracies": 0.8125, - "logps/chosen": -277.8997802734375, - "logps/ref_accuracies": 0.6875, - "logps/ref_chosen": -265.0371398925781, - "logps/ref_rejected": -266.34906005859375, - "logps/rejected": -348.8620300292969, - "loss": 0.5457, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.286261796951294, - "rewards/grad_term": 0.013155965134501457, - "rewards/margins": 6.965037822723389, - "rewards/rejected": -8.251298904418945, - "step": 177 - }, - { - "epoch": 0.36924672630623623, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 23.925847924727726, - "learning_rate": 9.077277970011533e-07, - "logits/chosen": 0.19493117928504944, - "logits/rejected": 0.1747354418039322, - "logps/accuracies": 0.6875, - "logps/chosen": -253.36224365234375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -253.14541625976562, - "logps/ref_rejected": -253.6729736328125, - "logps/rejected": -327.2242736816406, - "loss": 0.5883, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.02168332040309906, - "rewards/grad_term": 0.010116681456565857, - "rewards/margins": 7.33344841003418, - "rewards/rejected": -7.35513162612915, - "step": 178 - }, - { - "epoch": 0.37132114611694544, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 41.54057148552365, - "learning_rate": 9.065743944636677e-07, - "logits/chosen": 0.09981651604175568, - "logits/rejected": 0.060021985322237015, - "logps/accuracies": 0.8125, - "logps/chosen": -319.4670104980469, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -305.79840087890625, - "logps/ref_rejected": -288.304443359375, - "logps/rejected": -342.7301940917969, - "loss": 0.6653, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.3668627738952637, - "rewards/grad_term": 0.01719023287296295, - "rewards/margins": 4.075715065002441, - "rewards/rejected": -5.442577838897705, - "step": 179 - }, - { - "epoch": 0.3733955659276546, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 53.53086030452953, - "learning_rate": 9.054209919261822e-07, - "logits/chosen": 0.32655516266822815, - "logits/rejected": 0.4233202338218689, - "logps/accuracies": 0.75, - "logps/chosen": -231.95205688476562, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -222.77748107910156, - "logps/ref_rejected": -249.9750213623047, - "logps/rejected": -298.7811584472656, - "loss": 0.616, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.9174575805664062, - "rewards/grad_term": 0.03101710043847561, - "rewards/margins": 3.9631576538085938, - "rewards/rejected": -4.880615234375, - "step": 180 - }, - { - "epoch": 0.3754699857383638, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 32.474177233879054, - "learning_rate": 9.042675893886967e-07, - "logits/chosen": 0.15671122074127197, - "logits/rejected": 0.15824642777442932, - "logps/accuracies": 0.875, - "logps/chosen": -331.437255859375, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -329.1837158203125, - "logps/ref_rejected": -332.14324951171875, - "logps/rejected": -419.30963134765625, - "loss": 0.585, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.2253548800945282, - "rewards/grad_term": 0.006346164736896753, - "rewards/margins": 8.491281509399414, - "rewards/rejected": -8.716635704040527, - "step": 181 - }, - { - "epoch": 0.377544405549073, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 32.49282958416918, - "learning_rate": 9.03114186851211e-07, - "logits/chosen": 0.18260034918785095, - "logits/rejected": 0.14178498089313507, - "logps/accuracies": 0.8125, - "logps/chosen": -295.148193359375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -288.6500244140625, - "logps/ref_rejected": -278.32867431640625, - "logps/rejected": -359.5013427734375, - "loss": 0.5916, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.6498188972473145, - "rewards/grad_term": 0.008202875964343548, - "rewards/margins": 7.467443466186523, - "rewards/rejected": -8.11726188659668, - "step": 182 - }, - { - "epoch": 0.3796188253597822, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 29.4740035475264, - "learning_rate": 9.019607843137255e-07, - "logits/chosen": 0.26660820841789246, - "logits/rejected": 0.36798760294914246, - "logps/accuracies": 0.75, - "logps/chosen": -304.508056640625, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -300.42181396484375, - "logps/ref_rejected": -271.1709899902344, - "logps/rejected": -343.62274169921875, - "loss": 0.5723, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.40862417221069336, - "rewards/grad_term": 0.009963629767298698, - "rewards/margins": 6.836550712585449, - "rewards/rejected": -7.245175361633301, - "step": 183 - }, - { - "epoch": 0.38169324517049136, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 39.92370508801508, - "learning_rate": 9.008073817762398e-07, - "logits/chosen": 0.11014918982982635, - "logits/rejected": 0.12970136106014252, - "logps/accuracies": 0.875, - "logps/chosen": -311.4981689453125, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -316.7832336425781, - "logps/ref_rejected": -304.4905700683594, - "logps/rejected": -379.767333984375, - "loss": 0.5724, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5285077095031738, - "rewards/grad_term": 0.0013775170082226396, - "rewards/margins": 8.056180953979492, - "rewards/rejected": -7.527673721313477, - "step": 184 - }, - { - "epoch": 0.3837676649812006, - "flips/correct->correct": 0.1875, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 77.26727734044364, - "learning_rate": 8.996539792387543e-07, - "logits/chosen": 0.32915353775024414, - "logits/rejected": 0.36334753036499023, - "logps/accuracies": 0.625, - "logps/chosen": -317.26904296875, - "logps/ref_accuracies": 0.1875, - "logps/ref_chosen": -323.752685546875, - "logps/ref_rejected": -283.16058349609375, - "logps/rejected": -342.1800537109375, - "loss": 0.5702, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.6483669281005859, - "rewards/grad_term": 0.011216258630156517, - "rewards/margins": 6.550315856933594, - "rewards/rejected": -5.901949405670166, - "step": 185 - }, - { - "epoch": 0.3858420847919098, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 44.46292399532884, - "learning_rate": 8.985005767012687e-07, - "logits/chosen": 0.250750333070755, - "logits/rejected": 0.2685026228427887, - "logps/accuracies": 0.625, - "logps/chosen": -308.99078369140625, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -318.26324462890625, - "logps/ref_rejected": -362.1750793457031, - "logps/rejected": -410.8619384765625, - "loss": 0.6081, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.9272449612617493, - "rewards/grad_term": 0.01049741543829441, - "rewards/margins": 5.795932769775391, - "rewards/rejected": -4.868687152862549, - "step": 186 - }, - { - "epoch": 0.38791650460261895, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 60.58056206089815, - "learning_rate": 8.973471741637831e-07, - "logits/chosen": 0.16933155059814453, - "logits/rejected": 0.23612166941165924, - "logps/accuracies": 0.8125, - "logps/chosen": -347.870849609375, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -355.09906005859375, - "logps/ref_rejected": -377.2695007324219, - "logps/rejected": -426.260009765625, - "loss": 0.6342, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.7228207588195801, - "rewards/grad_term": 0.016279596835374832, - "rewards/margins": 5.6218695640563965, - "rewards/rejected": -4.899049282073975, - "step": 187 - }, - { - "epoch": 0.38999092441332817, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 38.184653723145104, - "learning_rate": 8.961937716262975e-07, - "logits/chosen": 0.19383230805397034, - "logits/rejected": 0.27799373865127563, - "logps/accuracies": 0.875, - "logps/chosen": -237.73406982421875, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -243.88479614257812, - "logps/ref_rejected": -270.0097961425781, - "logps/rejected": -308.8341064453125, - "loss": 0.6516, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.6150741577148438, - "rewards/grad_term": 0.016990307718515396, - "rewards/margins": 4.497503280639648, - "rewards/rejected": -3.8824288845062256, - "step": 188 - }, - { - "epoch": 0.3920653442240373, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 94.03754868556166, - "learning_rate": 8.95040369088812e-07, - "logits/chosen": 0.18277229368686676, - "logits/rejected": 0.32301703095436096, - "logps/accuracies": 0.75, - "logps/chosen": -297.812744140625, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -297.7760314941406, - "logps/ref_rejected": -351.8489990234375, - "logps/rejected": -395.95452880859375, - "loss": 0.5906, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.003670990467071533, - "rewards/grad_term": 0.01534755527973175, - "rewards/margins": 4.406883716583252, - "rewards/rejected": -4.4105544090271, - "step": 189 - }, - { - "epoch": 0.39413976403474654, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 36.637550653396346, - "learning_rate": 8.938869665513263e-07, - "logits/chosen": 0.3476504683494568, - "logits/rejected": 0.33958810567855835, - "logps/accuracies": 0.8125, - "logps/chosen": -227.98374938964844, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -220.88394165039062, - "logps/ref_rejected": -224.73675537109375, - "logps/rejected": -278.4228515625, - "loss": 0.6248, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.7099814414978027, - "rewards/grad_term": 0.017782405018806458, - "rewards/margins": 4.658628463745117, - "rewards/rejected": -5.36860990524292, - "step": 190 - }, - { - "epoch": 0.3962141838454557, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 34.6180474435158, - "learning_rate": 8.927335640138408e-07, - "logits/chosen": 0.29524171352386475, - "logits/rejected": 0.249376118183136, - "logps/accuracies": 0.75, - "logps/chosen": -340.69873046875, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -323.4943542480469, - "logps/ref_rejected": -289.3078308105469, - "logps/rejected": -365.9657287597656, - "loss": 0.5815, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.7204382419586182, - "rewards/grad_term": 0.016098525375127792, - "rewards/margins": 5.945353031158447, - "rewards/rejected": -7.6657915115356445, - "step": 191 - }, - { - "epoch": 0.3982886036561649, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 38.65115109290824, - "learning_rate": 8.915801614763551e-07, - "logits/chosen": 0.10618914663791656, - "logits/rejected": 0.20010565221309662, - "logps/accuracies": 0.8125, - "logps/chosen": -255.34237670898438, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -254.41799926757812, - "logps/ref_rejected": -255.06790161132812, - "logps/rejected": -335.4033203125, - "loss": 0.5252, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.09243983030319214, - "rewards/grad_term": 0.008064459078013897, - "rewards/margins": 7.941101551055908, - "rewards/rejected": -8.033540725708008, - "step": 192 - }, - { - "epoch": 0.3982886036561649, - "eval_flips/correct->correct": 0.4334975481033325, - "eval_flips/correct->incorrect": 0.009852216579020023, - "eval_flips/incorrect->correct": 0.35960590839385986, - "eval_flips/incorrect->incorrect": 0.19704432785511017, - "eval_logits/chosen": 0.20541736483573914, - "eval_logits/rejected": 0.25030994415283203, - "eval_logps/accuracies": 0.7931034564971924, - "eval_logps/chosen": -291.723388671875, - "eval_logps/ref_accuracies": 0.4433497488498688, - "eval_logps/ref_chosen": -287.3511047363281, - "eval_logps/ref_rejected": -289.0460205078125, - "eval_logps/rejected": -350.8753356933594, - "eval_loss": 0.6111010313034058, - "eval_rewards/accuracies": 0.8620689511299133, - "eval_rewards/chosen": -0.4372285008430481, - "eval_rewards/grad_term": 0.016007939353585243, - "eval_rewards/margins": 5.745702743530273, - "eval_rewards/rejected": -6.182931900024414, - "eval_runtime": 791.2188, - "eval_samples_per_second": 2.045, - "eval_steps_per_second": 0.257, - "step": 192 - }, - { - "epoch": 0.40036302346687414, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 29.785884187336258, - "learning_rate": 8.904267589388697e-07, - "logits/chosen": 0.4344290494918823, - "logits/rejected": 0.4853968620300293, - "logps/accuracies": 0.75, - "logps/chosen": -235.33804321289062, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -224.7119903564453, - "logps/ref_rejected": -247.52615356445312, - "logps/rejected": -317.22064208984375, - "loss": 0.5751, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.0626037120819092, - "rewards/grad_term": 0.017159339040517807, - "rewards/margins": 5.906847953796387, - "rewards/rejected": -6.969451904296875, - "step": 193 - }, - { - "epoch": 0.4024374432775833, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 33.823622203078585, - "learning_rate": 8.89273356401384e-07, - "logits/chosen": -0.00733010470867157, - "logits/rejected": 0.039806053042411804, - "logps/accuracies": 0.8125, - "logps/chosen": -298.87469482421875, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -278.5909423828125, - "logps/ref_rejected": -274.5445251464844, - "logps/rejected": -354.7998352050781, - "loss": 0.6322, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.028369426727295, - "rewards/grad_term": 0.014807065948843956, - "rewards/margins": 5.997160911560059, - "rewards/rejected": -8.025529861450195, - "step": 194 - }, - { - "epoch": 0.4045118630882925, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 71.82502886035329, - "learning_rate": 8.881199538638985e-07, - "logits/chosen": 0.06032078340649605, - "logits/rejected": 0.08065234869718552, - "logps/accuracies": 0.8125, - "logps/chosen": -305.660888671875, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -292.9607238769531, - "logps/ref_rejected": -284.1155090332031, - "logps/rejected": -348.9861755371094, - "loss": 0.6122, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2700166702270508, - "rewards/grad_term": 0.022815629839897156, - "rewards/margins": 5.2170515060424805, - "rewards/rejected": -6.487068176269531, - "step": 195 - }, - { - "epoch": 0.40658628289900167, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 47.96509016237826, - "learning_rate": 8.869665513264129e-07, - "logits/chosen": 0.5451265573501587, - "logits/rejected": 0.6818545460700989, - "logps/accuracies": 0.8125, - "logps/chosen": -262.90087890625, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -249.34771728515625, - "logps/ref_rejected": -265.0631408691406, - "logps/rejected": -335.938232421875, - "loss": 0.5831, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.3553152084350586, - "rewards/grad_term": 0.012808618135750294, - "rewards/margins": 5.732193946838379, - "rewards/rejected": -7.087508201599121, - "step": 196 - }, - { - "epoch": 0.4086607027097109, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 21.697323150316873, - "learning_rate": 8.858131487889273e-07, - "logits/chosen": 0.11935015022754669, - "logits/rejected": 0.2006658911705017, - "logps/accuracies": 0.875, - "logps/chosen": -336.0437927246094, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -329.3843078613281, - "logps/ref_rejected": -327.40045166015625, - "logps/rejected": -413.8304748535156, - "loss": 0.6046, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6659499406814575, - "rewards/grad_term": 0.00346172577701509, - "rewards/margins": 7.977048873901367, - "rewards/rejected": -8.642998695373535, - "step": 197 - }, - { - "epoch": 0.41073512252042005, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 40.358779912668254, - "learning_rate": 8.846597462514417e-07, - "logits/chosen": 0.23720747232437134, - "logits/rejected": 0.2754895091056824, - "logps/accuracies": 0.875, - "logps/chosen": -296.7972106933594, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -265.8367614746094, - "logps/ref_rejected": -281.0025939941406, - "logps/rejected": -351.4774475097656, - "loss": 0.625, - "rewards/accuracies": 0.8125, - "rewards/chosen": -3.0960447788238525, - "rewards/grad_term": 0.024012045934796333, - "rewards/margins": 3.9514381885528564, - "rewards/rejected": -7.047482967376709, - "step": 198 - }, - { - "epoch": 0.41280954233112926, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 92.31220771810881, - "learning_rate": 8.835063437139562e-07, - "logits/chosen": 0.4791252911090851, - "logits/rejected": 0.5575248599052429, - "logps/accuracies": 0.8125, - "logps/chosen": -268.31036376953125, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -276.9249572753906, - "logps/ref_rejected": -266.7868957519531, - "logps/rejected": -348.34381103515625, - "loss": 0.5736, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.861458420753479, - "rewards/grad_term": 0.004701174795627594, - "rewards/margins": 9.0171537399292, - "rewards/rejected": -8.155694961547852, - "step": 199 - }, - { - "epoch": 0.4148839621418385, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 45.77620831393305, - "learning_rate": 8.823529411764705e-07, - "logits/chosen": 0.12551181018352509, - "logits/rejected": 0.14135059714317322, - "logps/accuracies": 0.9375, - "logps/chosen": -315.8223571777344, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -310.73651123046875, - "logps/ref_rejected": -292.3979797363281, - "logps/rejected": -374.2222900390625, - "loss": 0.6131, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.5085856318473816, - "rewards/grad_term": 0.004745251964777708, - "rewards/margins": 7.673846244812012, - "rewards/rejected": -8.182432174682617, - "step": 200 - }, - { - "epoch": 0.41695838195254764, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 40.635916417828575, - "learning_rate": 8.81199538638985e-07, - "logits/chosen": 0.06241011992096901, - "logits/rejected": 0.11283601820468903, - "logps/accuracies": 0.875, - "logps/chosen": -293.8592834472656, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -296.7002258300781, - "logps/ref_rejected": -303.21331787109375, - "logps/rejected": -367.14044189453125, - "loss": 0.5767, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.28409475088119507, - "rewards/grad_term": 0.00995566789060831, - "rewards/margins": 6.67680549621582, - "rewards/rejected": -6.392710208892822, - "step": 201 - }, - { - "epoch": 0.41903280176325686, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 31.417005116540565, - "learning_rate": 8.800461361014993e-07, - "logits/chosen": 0.08436602354049683, - "logits/rejected": 0.06602154672145844, - "logps/accuracies": 0.625, - "logps/chosen": -324.9912109375, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -327.7398681640625, - "logps/ref_rejected": -292.74981689453125, - "logps/rejected": -361.5688171386719, - "loss": 0.5621, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.2748683989048004, - "rewards/grad_term": 0.007309483364224434, - "rewards/margins": 7.1567702293396, - "rewards/rejected": -6.88190221786499, - "step": 202 - }, - { - "epoch": 0.421107221573966, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 35.05900906163333, - "learning_rate": 8.788927335640138e-07, - "logits/chosen": 0.20211604237556458, - "logits/rejected": 0.21109752357006073, - "logps/accuracies": 0.8125, - "logps/chosen": -333.8841552734375, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -335.36810302734375, - "logps/ref_rejected": -322.5255126953125, - "logps/rejected": -367.34234619140625, - "loss": 0.5631, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.14839425683021545, - "rewards/grad_term": 0.019571855664253235, - "rewards/margins": 4.630078315734863, - "rewards/rejected": -4.481683731079102, - "step": 203 - }, - { - "epoch": 0.42318164138467523, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 32.259107415010234, - "learning_rate": 8.777393310265282e-07, - "logits/chosen": 0.27740195393562317, - "logits/rejected": 0.37459149956703186, - "logps/accuracies": 0.8125, - "logps/chosen": -259.053466796875, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -246.73666381835938, - "logps/ref_rejected": -271.6988525390625, - "logps/rejected": -328.5376281738281, - "loss": 0.6065, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.2316789627075195, - "rewards/grad_term": 0.0200329702347517, - "rewards/margins": 4.452197551727295, - "rewards/rejected": -5.6838765144348145, - "step": 204 - }, - { - "epoch": 0.4252560611953844, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 32.608539229670114, - "learning_rate": 8.765859284890427e-07, - "logits/chosen": 0.16907714307308197, - "logits/rejected": 0.20513200759887695, - "logps/accuracies": 0.875, - "logps/chosen": -244.8258056640625, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -226.30160522460938, - "logps/ref_rejected": -258.55389404296875, - "logps/rejected": -322.78662109375, - "loss": 0.5794, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.8524205684661865, - "rewards/grad_term": 0.019479090347886086, - "rewards/margins": 4.570858001708984, - "rewards/rejected": -6.42327880859375, - "step": 205 - }, - { - "epoch": 0.4273304810060936, - "flips/correct->correct": 0.6875, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 42.93969277035671, - "learning_rate": 8.754325259515571e-07, - "logits/chosen": 0.19578887522220612, - "logits/rejected": 0.24128146469593048, - "logps/accuracies": 0.8125, - "logps/chosen": -270.4194030761719, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -261.4314270019531, - "logps/ref_rejected": -278.7731628417969, - "logps/rejected": -349.106201171875, - "loss": 0.5817, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.8987985849380493, - "rewards/grad_term": 0.01616433635354042, - "rewards/margins": 6.134509086608887, - "rewards/rejected": -7.0333075523376465, - "step": 206 - }, - { - "epoch": 0.4294049008168028, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 40.26953156215366, - "learning_rate": 8.742791234140715e-07, - "logits/chosen": 0.2927509844303131, - "logits/rejected": 0.4132809340953827, - "logps/accuracies": 0.75, - "logps/chosen": -298.0523986816406, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -286.35516357421875, - "logps/ref_rejected": -299.4516296386719, - "logps/rejected": -371.3787841796875, - "loss": 0.5984, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.169724702835083, - "rewards/grad_term": 0.015548234805464745, - "rewards/margins": 6.022989749908447, - "rewards/rejected": -7.192714214324951, - "step": 207 - }, - { - "epoch": 0.431479320627512, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 44.42958011244542, - "learning_rate": 8.731257208765859e-07, - "logits/chosen": 0.15052379667758942, - "logits/rejected": 0.13466718792915344, - "logps/accuracies": 0.75, - "logps/chosen": -344.1072692871094, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -348.62744140625, - "logps/ref_rejected": -322.577880859375, - "logps/rejected": -371.2545471191406, - "loss": 0.5599, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.45202189683914185, - "rewards/grad_term": 0.014142685569822788, - "rewards/margins": 5.319693565368652, - "rewards/rejected": -4.867671966552734, - "step": 208 - }, - { - "epoch": 0.4335537404382212, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5625, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 55.08631188728783, - "learning_rate": 8.719723183391004e-07, - "logits/chosen": 0.2708834409713745, - "logits/rejected": 0.3263266980648041, - "logps/accuracies": 0.875, - "logps/chosen": -266.4635009765625, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -266.25146484375, - "logps/ref_rejected": -271.44537353515625, - "logps/rejected": -334.1143798828125, - "loss": 0.5873, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.021202266216278076, - "rewards/grad_term": 0.010078574530780315, - "rewards/margins": 6.245699882507324, - "rewards/rejected": -6.266901969909668, - "step": 209 - }, - { - "epoch": 0.43562816024893036, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 42.60799619320707, - "learning_rate": 8.708189158016147e-07, - "logits/chosen": 0.3283449113368988, - "logits/rejected": 0.3115319013595581, - "logps/accuracies": 0.875, - "logps/chosen": -309.02783203125, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -306.033935546875, - "logps/ref_rejected": -304.4499816894531, - "logps/rejected": -374.966064453125, - "loss": 0.5595, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.2993917167186737, - "rewards/grad_term": 0.011198869906365871, - "rewards/margins": 6.752218246459961, - "rewards/rejected": -7.051610469818115, - "step": 210 - }, - { - "epoch": 0.4377025800596396, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 74.62504339808369, - "learning_rate": 8.696655132641292e-07, - "logits/chosen": 0.04295940697193146, - "logits/rejected": 0.1364721655845642, - "logps/accuracies": 0.75, - "logps/chosen": -253.16937255859375, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -249.6319580078125, - "logps/ref_rejected": -297.3905029296875, - "logps/rejected": -369.6263427734375, - "loss": 0.5717, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.3537403643131256, - "rewards/grad_term": 0.011611053720116615, - "rewards/margins": 6.869847297668457, - "rewards/rejected": -7.223587989807129, - "step": 211 - }, - { - "epoch": 0.43977699987034874, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.125, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 42.048425807585254, - "learning_rate": 8.685121107266435e-07, - "logits/chosen": 0.2111242413520813, - "logits/rejected": 0.26101890206336975, - "logps/accuracies": 0.625, - "logps/chosen": -333.13372802734375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -298.3570556640625, - "logps/ref_rejected": -285.5286865234375, - "logps/rejected": -340.77008056640625, - "loss": 0.59, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.4776673316955566, - "rewards/grad_term": 0.029041055589914322, - "rewards/margins": 2.046469211578369, - "rewards/rejected": -5.524137020111084, - "step": 212 - }, - { - "epoch": 0.44185141968105796, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 30.70565260692096, - "learning_rate": 8.67358708189158e-07, - "logits/chosen": 0.24488027393817902, - "logits/rejected": 0.3360682725906372, - "logps/accuracies": 0.75, - "logps/chosen": -306.7230529785156, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -302.4961242675781, - "logps/ref_rejected": -303.74755859375, - "logps/rejected": -375.0382080078125, - "loss": 0.6463, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.4226952791213989, - "rewards/grad_term": 0.006055990234017372, - "rewards/margins": 6.706371784210205, - "rewards/rejected": -7.1290669441223145, - "step": 213 - }, - { - "epoch": 0.44392583949176717, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 58.908823582347665, - "learning_rate": 8.662053056516724e-07, - "logits/chosen": 0.2656205892562866, - "logits/rejected": 0.29711484909057617, - "logps/accuracies": 0.8125, - "logps/chosen": -269.3460693359375, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -255.48500061035156, - "logps/ref_rejected": -259.42156982421875, - "logps/rejected": -320.4120178222656, - "loss": 0.5785, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.3861079216003418, - "rewards/grad_term": 0.017718670889735222, - "rewards/margins": 4.71293830871582, - "rewards/rejected": -6.099046230316162, - "step": 214 - }, - { - "epoch": 0.44600025930247633, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 56.61591688580251, - "learning_rate": 8.650519031141868e-07, - "logits/chosen": 0.35960566997528076, - "logits/rejected": 0.3392384648323059, - "logps/accuracies": 0.8125, - "logps/chosen": -299.29644775390625, - "logps/ref_accuracies": 0.6875, - "logps/ref_chosen": -293.4299011230469, - "logps/ref_rejected": -299.168701171875, - "logps/rejected": -368.6275939941406, - "loss": 0.5517, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.5866526961326599, - "rewards/grad_term": 0.011029000394046307, - "rewards/margins": 6.359241008758545, - "rewards/rejected": -6.94589376449585, - "step": 215 - }, - { - "epoch": 0.44807467911318555, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 35.14556178232318, - "learning_rate": 8.638985005767012e-07, - "logits/chosen": 0.11206863820552826, - "logits/rejected": 0.19429105520248413, - "logps/accuracies": 0.875, - "logps/chosen": -280.0439453125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -273.8160705566406, - "logps/ref_rejected": -276.4722595214844, - "logps/rejected": -337.29913330078125, - "loss": 0.6022, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6227847933769226, - "rewards/grad_term": 0.008503232151269913, - "rewards/margins": 5.459905624389648, - "rewards/rejected": -6.082690715789795, - "step": 216 - }, - { - "epoch": 0.4501490989238947, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 36.20806400718805, - "learning_rate": 8.627450980392156e-07, - "logits/chosen": -0.1600431501865387, - "logits/rejected": -0.10942815244197845, - "logps/accuracies": 0.875, - "logps/chosen": -301.800048828125, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -299.7960205078125, - "logps/ref_rejected": -275.73529052734375, - "logps/rejected": -364.4363098144531, - "loss": 0.5609, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.20040588080883026, - "rewards/grad_term": 0.004247845150530338, - "rewards/margins": 8.669699668884277, - "rewards/rejected": -8.870105743408203, - "step": 217 - }, - { - "epoch": 0.4522235187346039, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.125, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 45.446157769285975, - "learning_rate": 8.615916955017301e-07, - "logits/chosen": 0.24637356400489807, - "logits/rejected": 0.284047931432724, - "logps/accuracies": 0.625, - "logps/chosen": -249.30892944335938, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -239.27142333984375, - "logps/ref_rejected": -260.403076171875, - "logps/rejected": -308.0331726074219, - "loss": 0.6314, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.0037511587142944, - "rewards/grad_term": 0.02335098199546337, - "rewards/margins": 3.759258508682251, - "rewards/rejected": -4.763010025024414, - "step": 218 - }, - { - "epoch": 0.4542979385453131, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 30.72090456802453, - "learning_rate": 8.604382929642446e-07, - "logits/chosen": 0.30798035860061646, - "logits/rejected": 0.3721332848072052, - "logps/accuracies": 0.875, - "logps/chosen": -270.7388000488281, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -256.83319091796875, - "logps/ref_rejected": -253.60997009277344, - "logps/rejected": -320.3992004394531, - "loss": 0.5356, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3905625343322754, - "rewards/grad_term": 0.02029426395893097, - "rewards/margins": 5.288358688354492, - "rewards/rejected": -6.678920745849609, - "step": 219 - }, - { - "epoch": 0.4563723583560223, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 51.870714437465466, - "learning_rate": 8.592848904267589e-07, - "logits/chosen": 0.03684063255786896, - "logits/rejected": 0.16017459332942963, - "logps/accuracies": 0.75, - "logps/chosen": -253.1938934326172, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -256.5488586425781, - "logps/ref_rejected": -286.39129638671875, - "logps/rejected": -353.81689453125, - "loss": 0.5563, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.3354969322681427, - "rewards/grad_term": 0.007589938119053841, - "rewards/margins": 7.078057765960693, - "rewards/rejected": -6.742560386657715, - "step": 220 - }, - { - "epoch": 0.4584467781667315, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.625, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 35.42772905894603, - "learning_rate": 8.581314878892734e-07, - "logits/chosen": 0.2873913049697876, - "logits/rejected": 0.26383453607559204, - "logps/accuracies": 0.875, - "logps/chosen": -360.1090087890625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -353.25018310546875, - "logps/ref_rejected": -333.6820068359375, - "logps/rejected": -418.5933532714844, - "loss": 0.5623, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6858816146850586, - "rewards/grad_term": 0.0032467113342136145, - "rewards/margins": 7.8052544593811035, - "rewards/rejected": -8.491135597229004, - "step": 221 - }, - { - "epoch": 0.4605211979774407, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 84.34900463866556, - "learning_rate": 8.569780853517877e-07, - "logits/chosen": 0.1874726116657257, - "logits/rejected": 0.2251831591129303, - "logps/accuracies": 0.875, - "logps/chosen": -264.17974853515625, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -255.59912109375, - "logps/ref_rejected": -261.2677001953125, - "logps/rejected": -335.05792236328125, - "loss": 0.6218, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.858064591884613, - "rewards/grad_term": 0.008338917046785355, - "rewards/margins": 6.520959377288818, - "rewards/rejected": -7.379024028778076, - "step": 222 - }, - { - "epoch": 0.4625956177881499, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 27.170194985611396, - "learning_rate": 8.558246828143022e-07, - "logits/chosen": 0.26863163709640503, - "logits/rejected": 0.2670744061470032, - "logps/accuracies": 0.875, - "logps/chosen": -277.2497863769531, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -275.13916015625, - "logps/ref_rejected": -272.11383056640625, - "logps/rejected": -328.0150146484375, - "loss": 0.5319, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.21106398105621338, - "rewards/grad_term": 0.012204117141664028, - "rewards/margins": 5.379053115844727, - "rewards/rejected": -5.590117931365967, - "step": 223 - }, - { - "epoch": 0.46467003759885905, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 79.27462179097967, - "learning_rate": 8.546712802768166e-07, - "logits/chosen": 0.18783439695835114, - "logits/rejected": 0.19196242094039917, - "logps/accuracies": 0.875, - "logps/chosen": -347.10906982421875, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -339.8036193847656, - "logps/ref_rejected": -325.4422912597656, - "logps/rejected": -407.8504333496094, - "loss": 0.6027, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.7305458784103394, - "rewards/grad_term": 0.007087053265422583, - "rewards/margins": 7.510266304016113, - "rewards/rejected": -8.240811347961426, - "step": 224 - }, - { - "epoch": 0.46674445740956827, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.0625, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 35.29759314617677, - "learning_rate": 8.53517877739331e-07, - "logits/chosen": -0.19355978071689606, - "logits/rejected": -0.016655761748552322, - "logps/accuracies": 0.6875, - "logps/chosen": -308.5700378417969, - "logps/ref_accuracies": 0.6875, - "logps/ref_chosen": -295.40545654296875, - "logps/ref_rejected": -343.7320251464844, - "logps/rejected": -400.0100402832031, - "loss": 0.5589, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.3164560794830322, - "rewards/grad_term": 0.02172619104385376, - "rewards/margins": 4.311344146728516, - "rewards/rejected": -5.627799987792969, - "step": 225 - }, - { - "epoch": 0.46881887722027743, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 61.84351353540357, - "learning_rate": 8.523644752018454e-07, - "logits/chosen": 0.20059531927108765, - "logits/rejected": 0.1843167543411255, - "logps/accuracies": 0.75, - "logps/chosen": -261.94525146484375, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -262.80419921875, - "logps/ref_rejected": -244.80892944335938, - "logps/rejected": -298.7880554199219, - "loss": 0.5996, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.08589661121368408, - "rewards/grad_term": 0.018526069819927216, - "rewards/margins": 5.4838104248046875, - "rewards/rejected": -5.397914409637451, - "step": 226 - }, - { - "epoch": 0.47089329703098665, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 26.516761004669306, - "learning_rate": 8.512110726643598e-07, - "logits/chosen": 0.08376497030258179, - "logits/rejected": 0.1468810886144638, - "logps/accuracies": 0.9375, - "logps/chosen": -255.428466796875, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -253.26193237304688, - "logps/ref_rejected": -286.236572265625, - "logps/rejected": -325.91131591796875, - "loss": 0.6258, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.21665439009666443, - "rewards/grad_term": 0.02595067396759987, - "rewards/margins": 3.75081729888916, - "rewards/rejected": -3.9674713611602783, - "step": 227 - }, - { - "epoch": 0.47296771684169586, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 33.01960140111705, - "learning_rate": 8.500576701268742e-07, - "logits/chosen": 0.20910394191741943, - "logits/rejected": 0.21387630701065063, - "logps/accuracies": 0.8125, - "logps/chosen": -304.03131103515625, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -293.5747985839844, - "logps/ref_rejected": -301.56024169921875, - "logps/rejected": -366.596435546875, - "loss": 0.5344, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.0456496477127075, - "rewards/grad_term": 0.01662050373852253, - "rewards/margins": 5.4579668045043945, - "rewards/rejected": -6.5036163330078125, - "step": 228 - }, - { - "epoch": 0.475042136652405, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 29.434714310320842, - "learning_rate": 8.489042675893887e-07, - "logits/chosen": 0.15450771152973175, - "logits/rejected": 0.19930016994476318, - "logps/accuracies": 0.8125, - "logps/chosen": -294.2052917480469, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -294.3157653808594, - "logps/ref_rejected": -289.5907897949219, - "logps/rejected": -365.57635498046875, - "loss": 0.5452, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.011044904589653015, - "rewards/grad_term": 0.0027202588971704245, - "rewards/margins": 7.609601974487305, - "rewards/rejected": -7.598557949066162, - "step": 229 - }, - { - "epoch": 0.47711655646311424, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 38.213053143905036, - "learning_rate": 8.477508650519031e-07, - "logits/chosen": 0.2588901221752167, - "logits/rejected": 0.44516509771347046, - "logps/accuracies": 1.0, - "logps/chosen": -291.5958251953125, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -285.3277282714844, - "logps/ref_rejected": -350.38787841796875, - "logps/rejected": -426.9537353515625, - "loss": 0.5646, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.6268075704574585, - "rewards/grad_term": 0.007204078137874603, - "rewards/margins": 7.029778480529785, - "rewards/rejected": -7.656586170196533, - "step": 230 - }, - { - "epoch": 0.4791909762738234, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 25.865551411840645, - "learning_rate": 8.465974625144176e-07, - "logits/chosen": 0.2644757926464081, - "logits/rejected": 0.29192623496055603, - "logps/accuracies": 0.875, - "logps/chosen": -313.8361511230469, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -314.16973876953125, - "logps/ref_rejected": -297.2124938964844, - "logps/rejected": -363.6337890625, - "loss": 0.5635, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.03335915505886078, - "rewards/grad_term": 0.01406506821513176, - "rewards/margins": 6.675488471984863, - "rewards/rejected": -6.642129421234131, - "step": 231 - }, - { - "epoch": 0.4812653960845326, - "flips/correct->correct": 0.6875, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 20.4318067473308, - "learning_rate": 8.454440599769319e-07, - "logits/chosen": 0.18658952414989471, - "logits/rejected": 0.25278547406196594, - "logps/accuracies": 0.875, - "logps/chosen": -266.6859436035156, - "logps/ref_accuracies": 0.6875, - "logps/ref_chosen": -272.0683288574219, - "logps/ref_rejected": -291.2773132324219, - "logps/rejected": -373.94342041015625, - "loss": 0.5679, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5382405519485474, - "rewards/grad_term": 0.002569821197539568, - "rewards/margins": 8.804851531982422, - "rewards/rejected": -8.266611099243164, - "step": 232 - }, - { - "epoch": 0.4833398158952418, - "flips/correct->correct": 0.6875, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 26.814576411742852, - "learning_rate": 8.442906574394463e-07, - "logits/chosen": 0.22701847553253174, - "logits/rejected": 0.5475070476531982, - "logps/accuracies": 0.875, - "logps/chosen": -319.60491943359375, - "logps/ref_accuracies": 0.6875, - "logps/ref_chosen": -312.2276611328125, - "logps/ref_rejected": -340.19671630859375, - "logps/rejected": -403.5230712890625, - "loss": 0.5561, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.7377276420593262, - "rewards/grad_term": 0.011820271611213684, - "rewards/margins": 5.594909191131592, - "rewards/rejected": -6.33263635635376, - "step": 233 - }, - { - "epoch": 0.485414235705951, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 42.80927565779465, - "learning_rate": 8.431372549019608e-07, - "logits/chosen": 0.03597773611545563, - "logits/rejected": 0.07471846044063568, - "logps/accuracies": 0.9375, - "logps/chosen": -313.0096435546875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -303.0143737792969, - "logps/ref_rejected": -296.0675354003906, - "logps/rejected": -380.6692199707031, - "loss": 0.5752, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.9995289444923401, - "rewards/grad_term": 0.003610477549955249, - "rewards/margins": 7.460636615753174, - "rewards/rejected": -8.460165977478027, - "step": 234 - }, - { - "epoch": 0.4874886555166602, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 36.409560354500066, - "learning_rate": 8.419838523644751e-07, - "logits/chosen": 0.33159953355789185, - "logits/rejected": 0.3636232912540436, - "logps/accuracies": 0.75, - "logps/chosen": -399.6930847167969, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -377.0238037109375, - "logps/ref_rejected": -366.9959411621094, - "logps/rejected": -440.5434875488281, - "loss": 0.6419, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.2669270038604736, - "rewards/grad_term": 0.01446828804910183, - "rewards/margins": 5.087828159332275, - "rewards/rejected": -7.354754447937012, - "step": 235 - }, - { - "epoch": 0.48956307532736937, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 33.58443547525811, - "learning_rate": 8.408304498269896e-07, - "logits/chosen": 0.41071584820747375, - "logits/rejected": 0.46553879976272583, - "logps/accuracies": 0.875, - "logps/chosen": -272.25848388671875, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -277.6263427734375, - "logps/ref_rejected": -272.1466369628906, - "logps/rejected": -331.35101318359375, - "loss": 0.5812, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5367855429649353, - "rewards/grad_term": 0.007947854697704315, - "rewards/margins": 6.457221031188965, - "rewards/rejected": -5.920435905456543, - "step": 236 - }, - { - "epoch": 0.4916374951380786, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 32.073065465479466, - "learning_rate": 8.396770472895039e-07, - "logits/chosen": 0.35054367780685425, - "logits/rejected": 0.38058048486709595, - "logps/accuracies": 0.875, - "logps/chosen": -266.0827331542969, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -259.3359375, - "logps/ref_rejected": -264.76947021484375, - "logps/rejected": -329.091552734375, - "loss": 0.5934, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.6746789216995239, - "rewards/grad_term": 0.01457288395613432, - "rewards/margins": 5.75752592086792, - "rewards/rejected": -6.432204246520996, - "step": 237 - }, - { - "epoch": 0.49371191494878774, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 45.30830579519183, - "learning_rate": 8.385236447520184e-07, - "logits/chosen": 0.4467751979827881, - "logits/rejected": 0.4529315233230591, - "logps/accuracies": 0.8125, - "logps/chosen": -294.876953125, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -297.973876953125, - "logps/ref_rejected": -294.9307861328125, - "logps/rejected": -360.9711608886719, - "loss": 0.559, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.30968841910362244, - "rewards/grad_term": 0.012534530833363533, - "rewards/margins": 6.913724899291992, - "rewards/rejected": -6.604036808013916, - "step": 238 - }, - { - "epoch": 0.49578633475949696, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 42.12849078368493, - "learning_rate": 8.373702422145328e-07, - "logits/chosen": 0.23043927550315857, - "logits/rejected": 0.4118332862854004, - "logps/accuracies": 0.75, - "logps/chosen": -323.7980041503906, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -326.3075866699219, - "logps/ref_rejected": -390.9226379394531, - "logps/rejected": -446.6671142578125, - "loss": 0.5826, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.2509579062461853, - "rewards/grad_term": 0.013378635980188847, - "rewards/margins": 5.825405597686768, - "rewards/rejected": -5.5744476318359375, - "step": 239 - }, - { - "epoch": 0.4978607545702061, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 31.909536036533762, - "learning_rate": 8.362168396770472e-07, - "logits/chosen": 0.16586509346961975, - "logits/rejected": 0.25248032808303833, - "logps/accuracies": 0.875, - "logps/chosen": -282.19866943359375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -285.1270751953125, - "logps/ref_rejected": -276.7912902832031, - "logps/rejected": -345.7266845703125, - "loss": 0.5355, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.29284125566482544, - "rewards/grad_term": 0.007976886816322803, - "rewards/margins": 7.186383247375488, - "rewards/rejected": -6.89354133605957, - "step": 240 - }, - { - "epoch": 0.49993517438091534, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 30.525809924473112, - "learning_rate": 8.350634371395616e-07, - "logits/chosen": 0.3237009048461914, - "logits/rejected": 0.40990960597991943, - "logps/accuracies": 0.875, - "logps/chosen": -287.2841796875, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -285.5251159667969, - "logps/ref_rejected": -302.3129577636719, - "logps/rejected": -370.07891845703125, - "loss": 0.5961, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.1759052574634552, - "rewards/grad_term": 0.010726590640842915, - "rewards/margins": 6.6006951332092285, - "rewards/rejected": -6.776600360870361, - "step": 241 - }, - { - "epoch": 0.5020095941916245, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 31.349241798619, - "learning_rate": 8.33910034602076e-07, - "logits/chosen": 0.05717964842915535, - "logits/rejected": 0.0796816349029541, - "logps/accuracies": 0.8125, - "logps/chosen": -295.54229736328125, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -297.224365234375, - "logps/ref_rejected": -284.146728515625, - "logps/rejected": -344.3078918457031, - "loss": 0.578, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.16821032762527466, - "rewards/grad_term": 0.014522448182106018, - "rewards/margins": 6.184324264526367, - "rewards/rejected": -6.016113758087158, - "step": 242 - }, - { - "epoch": 0.5040840140023337, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 33.030305222542474, - "learning_rate": 8.327566320645905e-07, - "logits/chosen": 0.06713651120662689, - "logits/rejected": 0.08016189187765121, - "logps/accuracies": 0.9375, - "logps/chosen": -268.2383117675781, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -267.01324462890625, - "logps/ref_rejected": -280.7307434082031, - "logps/rejected": -350.9720458984375, - "loss": 0.557, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.12250781059265137, - "rewards/grad_term": 0.006276478059589863, - "rewards/margins": 6.901622772216797, - "rewards/rejected": -7.024130344390869, - "step": 243 - }, - { - "epoch": 0.5061584338130429, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 27.004780238727747, - "learning_rate": 8.31603229527105e-07, - "logits/chosen": 0.04987862706184387, - "logits/rejected": 0.018930042162537575, - "logps/accuracies": 0.75, - "logps/chosen": -304.90655517578125, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -304.6758728027344, - "logps/ref_rejected": -300.8800048828125, - "logps/rejected": -368.5686950683594, - "loss": 0.5892, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.023071274161338806, - "rewards/grad_term": 0.01271775085479021, - "rewards/margins": 6.745797157287598, - "rewards/rejected": -6.768868923187256, - "step": 244 - }, - { - "epoch": 0.5082328536237521, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 39.398222375890796, - "learning_rate": 8.304498269896193e-07, - "logits/chosen": 0.15408623218536377, - "logits/rejected": 0.16406217217445374, - "logps/accuracies": 0.8125, - "logps/chosen": -318.6749572753906, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -316.9960632324219, - "logps/ref_rejected": -343.3831787109375, - "logps/rejected": -391.7142333984375, - "loss": 0.5758, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.16788998246192932, - "rewards/grad_term": 0.01882031187415123, - "rewards/margins": 4.6652140617370605, - "rewards/rejected": -4.833104133605957, - "step": 245 - }, - { - "epoch": 0.5103072734344613, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 57.065723685654525, - "learning_rate": 8.292964244521338e-07, - "logits/chosen": 0.21645879745483398, - "logits/rejected": 0.2783927619457245, - "logps/accuracies": 0.75, - "logps/chosen": -304.6559143066406, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -307.0755615234375, - "logps/ref_rejected": -296.75225830078125, - "logps/rejected": -353.28765869140625, - "loss": 0.5619, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.241965651512146, - "rewards/grad_term": 0.016664672642946243, - "rewards/margins": 5.895508289337158, - "rewards/rejected": -5.6535420417785645, - "step": 246 - }, - { - "epoch": 0.5123816932451705, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 32.148990652138416, - "learning_rate": 8.281430219146481e-07, - "logits/chosen": 0.19208469986915588, - "logits/rejected": 0.09641852974891663, - "logps/accuracies": 0.6875, - "logps/chosen": -406.9360046386719, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -399.0845947265625, - "logps/ref_rejected": -377.1170349121094, - "logps/rejected": -460.0271911621094, - "loss": 0.5832, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.7851426005363464, - "rewards/grad_term": 0.0035166891757398844, - "rewards/margins": 7.505876064300537, - "rewards/rejected": -8.29101848602295, - "step": 247 - }, - { - "epoch": 0.5144561130558797, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 44.560082669866084, - "learning_rate": 8.269896193771626e-07, - "logits/chosen": 0.4188195765018463, - "logits/rejected": 0.4568687975406647, - "logps/accuracies": 0.8125, - "logps/chosen": -319.69805908203125, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -304.82037353515625, - "logps/ref_rejected": -328.6085205078125, - "logps/rejected": -397.77398681640625, - "loss": 0.5775, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.4877678155899048, - "rewards/grad_term": 0.007945088669657707, - "rewards/margins": 5.428779602050781, - "rewards/rejected": -6.916546821594238, - "step": 248 - }, - { - "epoch": 0.5165305328665889, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 49.37389001042154, - "learning_rate": 8.25836216839677e-07, - "logits/chosen": 0.0985247939825058, - "logits/rejected": 0.13652461767196655, - "logps/accuracies": 0.875, - "logps/chosen": -316.8545837402344, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -325.44244384765625, - "logps/ref_rejected": -335.2078857421875, - "logps/rejected": -413.33251953125, - "loss": 0.5037, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.8587861657142639, - "rewards/grad_term": 0.004567756317555904, - "rewards/margins": 8.671252250671387, - "rewards/rejected": -7.812466144561768, - "step": 249 - }, - { - "epoch": 0.5186049526772981, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5625, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 55.51867340899016, - "learning_rate": 8.246828143021914e-07, - "logits/chosen": 0.28938692808151245, - "logits/rejected": 0.27797943353652954, - "logps/accuracies": 1.0, - "logps/chosen": -322.72845458984375, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -325.44219970703125, - "logps/ref_rejected": -326.6589660644531, - "logps/rejected": -402.5311279296875, - "loss": 0.5775, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.27137255668640137, - "rewards/grad_term": 0.01095657143741846, - "rewards/margins": 7.858592510223389, - "rewards/rejected": -7.587219715118408, - "step": 250 - }, - { - "epoch": 0.5206793724880072, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 56.0035451408766, - "learning_rate": 8.235294117647058e-07, - "logits/chosen": 0.2637563943862915, - "logits/rejected": 0.33145201206207275, - "logps/accuracies": 0.8125, - "logps/chosen": -395.2080078125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -379.0206604003906, - "logps/ref_rejected": -411.3179931640625, - "logps/rejected": -493.62921142578125, - "loss": 0.5508, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.6187384128570557, - "rewards/grad_term": 0.009982087649405003, - "rewards/margins": 6.612382411956787, - "rewards/rejected": -8.231120109558105, - "step": 251 - }, - { - "epoch": 0.5227537922987164, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 49.66999137567622, - "learning_rate": 8.223760092272203e-07, - "logits/chosen": 0.15164095163345337, - "logits/rejected": 0.20300878584384918, - "logps/accuracies": 0.75, - "logps/chosen": -314.79412841796875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -317.7497253417969, - "logps/ref_rejected": -307.1460876464844, - "logps/rejected": -389.70745849609375, - "loss": 0.5436, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.2955569326877594, - "rewards/grad_term": 0.00618112925440073, - "rewards/margins": 8.551695823669434, - "rewards/rejected": -8.256139755249023, - "step": 252 - }, - { - "epoch": 0.5248282121094257, - "flips/correct->correct": 0.6875, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 39.0728876643306, - "learning_rate": 8.212226066897346e-07, - "logits/chosen": 0.4079715609550476, - "logits/rejected": 0.5913187861442566, - "logps/accuracies": 0.875, - "logps/chosen": -299.94525146484375, - "logps/ref_accuracies": 0.6875, - "logps/ref_chosen": -300.0797119140625, - "logps/ref_rejected": -358.2881164550781, - "logps/rejected": -424.64324951171875, - "loss": 0.494, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.01344829797744751, - "rewards/grad_term": 0.008727732114493847, - "rewards/margins": 6.648958683013916, - "rewards/rejected": -6.635509967803955, - "step": 253 - }, - { - "epoch": 0.5269026319201349, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 31.821418285963002, - "learning_rate": 8.200692041522491e-07, - "logits/chosen": 0.4014374613761902, - "logits/rejected": 0.4343331456184387, - "logps/accuracies": 0.9375, - "logps/chosen": -231.1729278564453, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -230.2028045654297, - "logps/ref_rejected": -237.1446075439453, - "logps/rejected": -303.7888488769531, - "loss": 0.5901, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.09701316803693771, - "rewards/grad_term": 0.006798036862164736, - "rewards/margins": 6.56741189956665, - "rewards/rejected": -6.664424896240234, - "step": 254 - }, - { - "epoch": 0.5289770517308441, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 49.53309810118567, - "learning_rate": 8.189158016147634e-07, - "logits/chosen": 0.10699253529310226, - "logits/rejected": 0.11342119425535202, - "logps/accuracies": 0.75, - "logps/chosen": -301.25408935546875, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -310.416748046875, - "logps/ref_rejected": -280.7598876953125, - "logps/rejected": -345.6063537597656, - "loss": 0.6471, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.9162629842758179, - "rewards/grad_term": 0.009691519662737846, - "rewards/margins": 7.400913715362549, - "rewards/rejected": -6.484650611877441, - "step": 255 - }, - { - "epoch": 0.5310514715415532, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 33.11473943842344, - "learning_rate": 8.17762399077278e-07, - "logits/chosen": 0.09529374539852142, - "logits/rejected": 0.2966251075267792, - "logps/accuracies": 0.6875, - "logps/chosen": -278.4207763671875, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -268.88885498046875, - "logps/ref_rejected": -287.082275390625, - "logps/rejected": -344.5219421386719, - "loss": 0.5951, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.9531930088996887, - "rewards/grad_term": 0.021204093471169472, - "rewards/margins": 4.790775775909424, - "rewards/rejected": -5.743968963623047, - "step": 256 - }, - { - "epoch": 0.5310514715415532, - "eval_flips/correct->correct": 0.4334975481033325, - "eval_flips/correct->incorrect": 0.009852216579020023, - "eval_flips/incorrect->correct": 0.3300492465496063, - "eval_flips/incorrect->incorrect": 0.2266009896993637, - "eval_logits/chosen": 0.20908966660499573, - "eval_logits/rejected": 0.25232627987861633, - "eval_logps/accuracies": 0.7635468244552612, - "eval_logps/chosen": -291.91790771484375, - "eval_logps/ref_accuracies": 0.4433497488498688, - "eval_logps/ref_chosen": -287.3511047363281, - "eval_logps/ref_rejected": -289.0460205078125, - "eval_logps/rejected": -345.9736328125, - "eval_loss": 0.6100751161575317, - "eval_rewards/accuracies": 0.8768472671508789, - "eval_rewards/chosen": -0.4566830098628998, - "eval_rewards/grad_term": 0.015678314492106438, - "eval_rewards/margins": 5.236079216003418, - "eval_rewards/rejected": -5.69276237487793, - "eval_runtime": 803.7781, - "eval_samples_per_second": 2.013, - "eval_steps_per_second": 0.253, - "step": 256 - }, - { - "epoch": 0.5331258913522624, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 37.54918040748534, - "learning_rate": 8.166089965397924e-07, - "logits/chosen": 0.14258748292922974, - "logits/rejected": 0.1967656910419464, - "logps/accuracies": 0.875, - "logps/chosen": -307.8832702636719, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -313.5946350097656, - "logps/ref_rejected": -304.44818115234375, - "logps/rejected": -381.94158935546875, - "loss": 0.5998, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5711380243301392, - "rewards/grad_term": 0.004750548396259546, - "rewards/margins": 8.320480346679688, - "rewards/rejected": -7.749342918395996, - "step": 257 - }, - { - "epoch": 0.5352003111629716, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 66.7330393421814, - "learning_rate": 8.154555940023068e-07, - "logits/chosen": 0.3797518014907837, - "logits/rejected": 0.36165231466293335, - "logps/accuracies": 0.8125, - "logps/chosen": -305.75604248046875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -297.5378112792969, - "logps/ref_rejected": -278.42840576171875, - "logps/rejected": -340.3268127441406, - "loss": 0.5622, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.8218250274658203, - "rewards/grad_term": 0.01640998013317585, - "rewards/margins": 5.368016242980957, - "rewards/rejected": -6.189841270446777, - "step": 258 - }, - { - "epoch": 0.5372747309736808, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 38.0243266757412, - "learning_rate": 8.143021914648212e-07, - "logits/chosen": 0.31451526284217834, - "logits/rejected": 0.3426423668861389, - "logps/accuracies": 0.75, - "logps/chosen": -214.24871826171875, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -212.14849853515625, - "logps/ref_rejected": -200.25625610351562, - "logps/rejected": -251.4813232421875, - "loss": 0.5821, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.210022434592247, - "rewards/grad_term": 0.01893402822315693, - "rewards/margins": 4.912485122680664, - "rewards/rejected": -5.1225080490112305, - "step": 259 - }, - { - "epoch": 0.5393491507843899, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 27.658939795121892, - "learning_rate": 8.131487889273356e-07, - "logits/chosen": 0.1209304928779602, - "logits/rejected": 0.1580743044614792, - "logps/accuracies": 0.6875, - "logps/chosen": -291.9943542480469, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -296.20867919921875, - "logps/ref_rejected": -299.74664306640625, - "logps/rejected": -365.398193359375, - "loss": 0.5422, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.4214297831058502, - "rewards/grad_term": 0.009202235378324986, - "rewards/margins": 6.986582279205322, - "rewards/rejected": -6.565152645111084, - "step": 260 - }, - { - "epoch": 0.5414235705950992, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 75.72204106158672, - "learning_rate": 8.1199538638985e-07, - "logits/chosen": 0.16947351396083832, - "logits/rejected": 0.18022188544273376, - "logps/accuracies": 0.75, - "logps/chosen": -281.2515869140625, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -267.09417724609375, - "logps/ref_rejected": -272.4349060058594, - "logps/rejected": -333.783203125, - "loss": 0.606, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.4157400131225586, - "rewards/grad_term": 0.01815151423215866, - "rewards/margins": 4.719089984893799, - "rewards/rejected": -6.134829998016357, - "step": 261 - }, - { - "epoch": 0.5434979904058084, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 58.5042523870511, - "learning_rate": 8.108419838523645e-07, - "logits/chosen": 0.18119366466999054, - "logits/rejected": 0.3171493113040924, - "logps/accuracies": 0.875, - "logps/chosen": -298.76116943359375, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -292.3468017578125, - "logps/ref_rejected": -334.3692321777344, - "logps/rejected": -419.0423278808594, - "loss": 0.5495, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6414406299591064, - "rewards/grad_term": 0.005858146119862795, - "rewards/margins": 7.825870990753174, - "rewards/rejected": -8.467310905456543, - "step": 262 - }, - { - "epoch": 0.5455724102165176, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 16.445083432164537, - "learning_rate": 8.096885813148788e-07, - "logits/chosen": 0.37694644927978516, - "logits/rejected": 0.43579670786857605, - "logps/accuracies": 0.875, - "logps/chosen": -358.7106018066406, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -354.29962158203125, - "logps/ref_rejected": -385.1765441894531, - "logps/rejected": -465.8778381347656, - "loss": 0.5401, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.44109994173049927, - "rewards/grad_term": 0.005547558423131704, - "rewards/margins": 7.6290283203125, - "rewards/rejected": -8.070128440856934, - "step": 263 - }, - { - "epoch": 0.5476468300272268, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 43.5572613048707, - "learning_rate": 8.085351787773933e-07, - "logits/chosen": 0.3072161078453064, - "logits/rejected": 0.2626444697380066, - "logps/accuracies": 0.8125, - "logps/chosen": -259.9176330566406, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -245.10406494140625, - "logps/ref_rejected": -250.30921936035156, - "logps/rejected": -325.20831298828125, - "loss": 0.5928, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.4813560247421265, - "rewards/grad_term": 0.01034230925142765, - "rewards/margins": 6.008551597595215, - "rewards/rejected": -7.489907264709473, - "step": 264 - }, - { - "epoch": 0.5497212498379359, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 42.82203006051057, - "learning_rate": 8.073817762399076e-07, - "logits/chosen": 0.13615286350250244, - "logits/rejected": 0.20001475512981415, - "logps/accuracies": 0.75, - "logps/chosen": -338.6938171386719, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -319.03753662109375, - "logps/ref_rejected": -324.310791015625, - "logps/rejected": -421.2921447753906, - "loss": 0.5756, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.9656240940093994, - "rewards/grad_term": 0.006605319678783417, - "rewards/margins": 7.732507705688477, - "rewards/rejected": -9.698131561279297, - "step": 265 - }, - { - "epoch": 0.5517956696486451, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 52.44815820061085, - "learning_rate": 8.062283737024221e-07, - "logits/chosen": 0.21208931505680084, - "logits/rejected": 0.25778982043266296, - "logps/accuracies": 0.875, - "logps/chosen": -344.6730651855469, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -331.10345458984375, - "logps/ref_rejected": -351.76409912109375, - "logps/rejected": -448.85931396484375, - "loss": 0.613, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.3569657802581787, - "rewards/grad_term": 0.00555332051590085, - "rewards/margins": 8.352553367614746, - "rewards/rejected": -9.70952033996582, - "step": 266 - }, - { - "epoch": 0.5538700894593543, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5625, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 41.602018473752594, - "learning_rate": 8.050749711649365e-07, - "logits/chosen": 0.028799353167414665, - "logits/rejected": 0.013060306198894978, - "logps/accuracies": 0.875, - "logps/chosen": -331.1357727050781, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -303.6278076171875, - "logps/ref_rejected": -292.3013610839844, - "logps/rejected": -394.4818115234375, - "loss": 0.6173, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.750793933868408, - "rewards/grad_term": 0.004837782587856054, - "rewards/margins": 7.467255115509033, - "rewards/rejected": -10.218048095703125, - "step": 267 - }, - { - "epoch": 0.5559445092700636, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 113.12904360596185, - "learning_rate": 8.03921568627451e-07, - "logits/chosen": 0.02135728858411312, - "logits/rejected": 0.08749254792928696, - "logps/accuracies": 0.8125, - "logps/chosen": -336.35760498046875, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -308.16162109375, - "logps/ref_rejected": -309.57598876953125, - "logps/rejected": -400.8465576171875, - "loss": 0.6239, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.8195974826812744, - "rewards/grad_term": 0.018539071083068848, - "rewards/margins": 6.307459831237793, - "rewards/rejected": -9.127056121826172, - "step": 268 - }, - { - "epoch": 0.5580189290807728, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 33.96227826132554, - "learning_rate": 8.027681660899654e-07, - "logits/chosen": 0.5619252324104309, - "logits/rejected": 0.5743327736854553, - "logps/accuracies": 0.9375, - "logps/chosen": -255.11105346679688, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -252.5300750732422, - "logps/ref_rejected": -270.99432373046875, - "logps/rejected": -342.1889953613281, - "loss": 0.5774, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.2580994963645935, - "rewards/grad_term": 0.01020655408501625, - "rewards/margins": 6.861366271972656, - "rewards/rejected": -7.1194658279418945, - "step": 269 - }, - { - "epoch": 0.5600933488914819, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 24.02468529781505, - "learning_rate": 8.016147635524798e-07, - "logits/chosen": 0.27724260091781616, - "logits/rejected": 0.2910709083080292, - "logps/accuracies": 0.8125, - "logps/chosen": -300.6127014160156, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -277.4232482910156, - "logps/ref_rejected": -294.59234619140625, - "logps/rejected": -373.547607421875, - "loss": 0.5739, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.3189470767974854, - "rewards/grad_term": 0.014720053412020206, - "rewards/margins": 5.57658052444458, - "rewards/rejected": -7.895526885986328, - "step": 270 - }, - { - "epoch": 0.5621677687021911, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 28.444387072372734, - "learning_rate": 8.004613610149942e-07, - "logits/chosen": 0.05013295263051987, - "logits/rejected": 0.06607392430305481, - "logps/accuracies": 0.9375, - "logps/chosen": -278.4317626953125, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -279.4646301269531, - "logps/ref_rejected": -289.01959228515625, - "logps/rejected": -344.6971130371094, - "loss": 0.5915, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.10328565537929535, - "rewards/grad_term": 0.00966467522084713, - "rewards/margins": 5.671037673950195, - "rewards/rejected": -5.567751884460449, - "step": 271 - }, - { - "epoch": 0.5642421885129003, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 51.22016296104716, - "learning_rate": 7.993079584775087e-07, - "logits/chosen": 0.37125492095947266, - "logits/rejected": 0.3817085325717926, - "logps/accuracies": 0.75, - "logps/chosen": -308.3736572265625, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -300.4655456542969, - "logps/ref_rejected": -302.07696533203125, - "logps/rejected": -366.42919921875, - "loss": 0.5656, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.7908135652542114, - "rewards/grad_term": 0.007430646568536758, - "rewards/margins": 5.644411563873291, - "rewards/rejected": -6.435225009918213, - "step": 272 - }, - { - "epoch": 0.5663166083236095, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 32.46457915793264, - "learning_rate": 7.98154555940023e-07, - "logits/chosen": 0.17831876873970032, - "logits/rejected": 0.1457141786813736, - "logps/accuracies": 0.75, - "logps/chosen": -282.64410400390625, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -274.8734130859375, - "logps/ref_rejected": -274.3963317871094, - "logps/rejected": -331.91021728515625, - "loss": 0.5866, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.7770657539367676, - "rewards/grad_term": 0.016797857359051704, - "rewards/margins": 4.974320411682129, - "rewards/rejected": -5.751385688781738, - "step": 273 - }, - { - "epoch": 0.5683910281343186, - "flips/correct->correct": 0.6875, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 32.9471484951805, - "learning_rate": 7.970011534025375e-07, - "logits/chosen": 0.5048956871032715, - "logits/rejected": 0.496670126914978, - "logps/accuracies": 0.8125, - "logps/chosen": -301.3432922363281, - "logps/ref_accuracies": 0.6875, - "logps/ref_chosen": -304.7401123046875, - "logps/ref_rejected": -310.8699645996094, - "logps/rejected": -359.89007568359375, - "loss": 0.617, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.33968228101730347, - "rewards/grad_term": 0.01532800029963255, - "rewards/margins": 5.24169397354126, - "rewards/rejected": -4.902011871337891, - "step": 274 - }, - { - "epoch": 0.5704654479450278, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 26.58072307751858, - "learning_rate": 7.958477508650518e-07, - "logits/chosen": 0.1677144318819046, - "logits/rejected": 0.2207137495279312, - "logps/accuracies": 0.75, - "logps/chosen": -240.13754272460938, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -248.08505249023438, - "logps/ref_rejected": -233.90797424316406, - "logps/rejected": -271.0197448730469, - "loss": 0.6325, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.7947514653205872, - "rewards/grad_term": 0.017878375947475433, - "rewards/margins": 4.505929946899414, - "rewards/rejected": -3.711178779602051, - "step": 275 - }, - { - "epoch": 0.5725398677557371, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 41.27546406513188, - "learning_rate": 7.946943483275663e-07, - "logits/chosen": 0.36588138341903687, - "logits/rejected": 0.4112645983695984, - "logps/accuracies": 0.75, - "logps/chosen": -253.30035400390625, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -247.00111389160156, - "logps/ref_rejected": -257.2711486816406, - "logps/rejected": -302.595703125, - "loss": 0.6566, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6299245357513428, - "rewards/grad_term": 0.020320266485214233, - "rewards/margins": 3.9025347232818604, - "rewards/rejected": -4.532459259033203, - "step": 276 - }, - { - "epoch": 0.5746142875664463, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 65.27110995983777, - "learning_rate": 7.935409457900807e-07, - "logits/chosen": -0.01693597435951233, - "logits/rejected": 0.046872012317180634, - "logps/accuracies": 0.625, - "logps/chosen": -275.9945373535156, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -282.8183898925781, - "logps/ref_rejected": -310.315673828125, - "logps/rejected": -344.1127014160156, - "loss": 0.6913, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.682384192943573, - "rewards/grad_term": 0.023220881819725037, - "rewards/margins": 4.062088489532471, - "rewards/rejected": -3.379704475402832, - "step": 277 - }, - { - "epoch": 0.5766887073771555, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 51.19472036277012, - "learning_rate": 7.923875432525951e-07, - "logits/chosen": 0.19429253041744232, - "logits/rejected": 0.19109566509723663, - "logps/accuracies": 0.6875, - "logps/chosen": -299.841796875, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -306.51116943359375, - "logps/ref_rejected": -324.4086608886719, - "logps/rejected": -347.66070556640625, - "loss": 0.7188, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.666935920715332, - "rewards/grad_term": 0.025834256783127785, - "rewards/margins": 2.9921374320983887, - "rewards/rejected": -2.3252012729644775, - "step": 278 - }, - { - "epoch": 0.5787631271878646, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.4375, - "grad_norm": 111.63212030223526, - "learning_rate": 7.912341407151095e-07, - "logits/chosen": 0.026315703988075256, - "logits/rejected": 0.05185367166996002, - "logps/accuracies": 0.5625, - "logps/chosen": -312.5485534667969, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -322.3186340332031, - "logps/ref_rejected": -310.2381286621094, - "logps/rejected": -346.5958251953125, - "loss": 0.6961, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.9770085215568542, - "rewards/grad_term": 0.019868649542331696, - "rewards/margins": 4.612778663635254, - "rewards/rejected": -3.635770082473755, - "step": 279 - }, - { - "epoch": 0.5808375469985738, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 40.5130776185035, - "learning_rate": 7.90080738177624e-07, - "logits/chosen": 0.32091373205184937, - "logits/rejected": 0.4111550450325012, - "logps/accuracies": 0.75, - "logps/chosen": -203.61636352539062, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -205.00054931640625, - "logps/ref_rejected": -283.1324462890625, - "logps/rejected": -327.993896484375, - "loss": 0.6669, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.1384190022945404, - "rewards/grad_term": 0.016185998916625977, - "rewards/margins": 4.624567985534668, - "rewards/rejected": -4.486148357391357, - "step": 280 - }, - { - "epoch": 0.582911966809283, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 39.877534822937, - "learning_rate": 7.889273356401384e-07, - "logits/chosen": 0.3415575325489044, - "logits/rejected": 0.360428124666214, - "logps/accuracies": 0.75, - "logps/chosen": -321.03564453125, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -328.7980651855469, - "logps/ref_rejected": -312.2154541015625, - "logps/rejected": -357.23773193359375, - "loss": 0.6027, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.7762415409088135, - "rewards/grad_term": 0.020651506260037422, - "rewards/margins": 5.278467655181885, - "rewards/rejected": -4.50222635269165, - "step": 281 - }, - { - "epoch": 0.5849863866199922, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 20.370696721525093, - "learning_rate": 7.877739331026529e-07, - "logits/chosen": -0.10976716130971909, - "logits/rejected": 0.04267115890979767, - "logps/accuracies": 0.5625, - "logps/chosen": -316.794189453125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -321.988525390625, - "logps/ref_rejected": -346.69873046875, - "logps/rejected": -393.5243225097656, - "loss": 0.5422, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.519432783126831, - "rewards/grad_term": 0.019725538790225983, - "rewards/margins": 5.201993942260742, - "rewards/rejected": -4.682560920715332, - "step": 282 - }, - { - "epoch": 0.5870608064307015, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 38.36199057696001, - "learning_rate": 7.866205305651672e-07, - "logits/chosen": 0.11121785640716553, - "logits/rejected": 0.21377022564411163, - "logps/accuracies": 0.9375, - "logps/chosen": -260.2328186035156, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -243.456298828125, - "logps/ref_rejected": -277.0938720703125, - "logps/rejected": -323.23675537109375, - "loss": 0.5608, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.6776524782180786, - "rewards/grad_term": 0.029680585488677025, - "rewards/margins": 2.9366343021392822, - "rewards/rejected": -4.61428689956665, - "step": 283 - }, - { - "epoch": 0.5891352262414106, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 18.32697126494198, - "learning_rate": 7.854671280276817e-07, - "logits/chosen": 0.09643738716840744, - "logits/rejected": 0.13956782221794128, - "logps/accuracies": 0.6875, - "logps/chosen": -354.23077392578125, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -343.4710388183594, - "logps/ref_rejected": -336.4991760253906, - "logps/rejected": -402.7376403808594, - "loss": 0.6159, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.075973629951477, - "rewards/grad_term": 0.011539540253579617, - "rewards/margins": 5.547872543334961, - "rewards/rejected": -6.623846054077148, - "step": 284 - }, - { - "epoch": 0.5912096460521198, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 52.40142941235436, - "learning_rate": 7.84313725490196e-07, - "logits/chosen": 0.11267786473035812, - "logits/rejected": 0.16389338672161102, - "logps/accuracies": 0.875, - "logps/chosen": -330.61627197265625, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -335.897216796875, - "logps/ref_rejected": -348.2872619628906, - "logps/rejected": -430.4730224609375, - "loss": 0.5283, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5280970931053162, - "rewards/grad_term": 0.00416451646015048, - "rewards/margins": 8.746676445007324, - "rewards/rejected": -8.218579292297363, - "step": 285 - }, - { - "epoch": 0.593284065862829, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 47.45081130082864, - "learning_rate": 7.831603229527105e-07, - "logits/chosen": -0.08656018227338791, - "logits/rejected": -0.05061071738600731, - "logps/accuracies": 0.875, - "logps/chosen": -304.7203369140625, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -296.37933349609375, - "logps/ref_rejected": -303.2038269042969, - "logps/rejected": -393.2415771484375, - "loss": 0.5669, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.8340997695922852, - "rewards/grad_term": 0.011894619092345238, - "rewards/margins": 8.169673919677734, - "rewards/rejected": -9.003772735595703, - "step": 286 - }, - { - "epoch": 0.5953584856735382, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 21.830125532823576, - "learning_rate": 7.820069204152249e-07, - "logits/chosen": 0.16923511028289795, - "logits/rejected": 0.16749337315559387, - "logps/accuracies": 0.8125, - "logps/chosen": -300.83929443359375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -280.837158203125, - "logps/ref_rejected": -279.3500671386719, - "logps/rejected": -371.1993103027344, - "loss": 0.5992, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.000213623046875, - "rewards/grad_term": 0.007134607993066311, - "rewards/margins": 7.1847124099731445, - "rewards/rejected": -9.184926986694336, - "step": 287 - }, - { - "epoch": 0.5974329054842473, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 29.122328000376466, - "learning_rate": 7.808535178777393e-07, - "logits/chosen": 0.027427153661847115, - "logits/rejected": 0.04789198189973831, - "logps/accuracies": 0.8125, - "logps/chosen": -390.447509765625, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -367.87188720703125, - "logps/ref_rejected": -355.06640625, - "logps/rejected": -450.7095031738281, - "loss": 0.5902, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.2575621604919434, - "rewards/grad_term": 0.009585607796907425, - "rewards/margins": 7.306746482849121, - "rewards/rejected": -9.564309120178223, - "step": 288 - }, - { - "epoch": 0.5995073252949565, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 41.33190625329714, - "learning_rate": 7.797001153402537e-07, - "logits/chosen": 0.07080674171447754, - "logits/rejected": 0.10809577256441116, - "logps/accuracies": 0.75, - "logps/chosen": -280.20654296875, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -284.79046630859375, - "logps/ref_rejected": -268.11834716796875, - "logps/rejected": -343.5045166015625, - "loss": 0.5622, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.45839110016822815, - "rewards/grad_term": 0.006786561571061611, - "rewards/margins": 7.997011661529541, - "rewards/rejected": -7.5386199951171875, - "step": 289 - }, - { - "epoch": 0.6015817451056658, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 22.37995830872316, - "learning_rate": 7.785467128027681e-07, - "logits/chosen": 0.046341296285390854, - "logits/rejected": 0.08901657164096832, - "logps/accuracies": 1.0, - "logps/chosen": -315.9812927246094, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -314.4349365234375, - "logps/ref_rejected": -321.4173583984375, - "logps/rejected": -420.6228942871094, - "loss": 0.5426, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.15463726222515106, - "rewards/grad_term": 0.0006936362478882074, - "rewards/margins": 9.76591682434082, - "rewards/rejected": -9.920555114746094, - "step": 290 - }, - { - "epoch": 0.603656164916375, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 31.611576893068335, - "learning_rate": 7.773933102652825e-07, - "logits/chosen": 0.1491805762052536, - "logits/rejected": 0.1622200310230255, - "logps/accuracies": 0.875, - "logps/chosen": -324.14556884765625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -316.1439208984375, - "logps/ref_rejected": -310.8943176269531, - "logps/rejected": -400.73760986328125, - "loss": 0.5327, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.8001646995544434, - "rewards/grad_term": 0.008960644714534283, - "rewards/margins": 8.184164047241211, - "rewards/rejected": -8.984328269958496, - "step": 291 - }, - { - "epoch": 0.6057305847270842, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 40.66113218146233, - "learning_rate": 7.76239907727797e-07, - "logits/chosen": 0.1667100340127945, - "logits/rejected": 0.1031753420829773, - "logps/accuracies": 0.875, - "logps/chosen": -257.600830078125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -251.26223754882812, - "logps/ref_rejected": -256.9619445800781, - "logps/rejected": -332.1715393066406, - "loss": 0.5644, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.6338596940040588, - "rewards/grad_term": 0.00917502585798502, - "rewards/margins": 6.887094974517822, - "rewards/rejected": -7.5209550857543945, - "step": 292 - }, - { - "epoch": 0.6078050045377933, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 57.97079310383917, - "learning_rate": 7.750865051903114e-07, - "logits/chosen": -0.08806827664375305, - "logits/rejected": -0.045274168252944946, - "logps/accuracies": 0.75, - "logps/chosen": -308.90509033203125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -310.9958190917969, - "logps/ref_rejected": -308.5406494140625, - "logps/rejected": -372.4405212402344, - "loss": 0.54, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.20907306671142578, - "rewards/grad_term": 0.0065223718993365765, - "rewards/margins": 6.599061489105225, - "rewards/rejected": -6.389988899230957, - "step": 293 - }, - { - "epoch": 0.6098794243485025, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 49.04536258498517, - "learning_rate": 7.739331026528259e-07, - "logits/chosen": 0.17086654901504517, - "logits/rejected": 0.19536878168582916, - "logps/accuracies": 0.6875, - "logps/chosen": -318.17401123046875, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -322.79205322265625, - "logps/ref_rejected": -298.66278076171875, - "logps/rejected": -354.1503601074219, - "loss": 0.5665, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.4618016183376312, - "rewards/grad_term": 0.015101278200745583, - "rewards/margins": 6.010561466217041, - "rewards/rejected": -5.548760414123535, - "step": 294 - }, - { - "epoch": 0.6119538441592117, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 29.110328186799425, - "learning_rate": 7.727797001153403e-07, - "logits/chosen": 0.2920646667480469, - "logits/rejected": 0.3339766263961792, - "logps/accuracies": 0.875, - "logps/chosen": -287.7585144042969, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -291.96600341796875, - "logps/ref_rejected": -315.6741027832031, - "logps/rejected": -371.0801696777344, - "loss": 0.5743, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.4207479953765869, - "rewards/grad_term": 0.013844680972397327, - "rewards/margins": 5.96135139465332, - "rewards/rejected": -5.540602684020996, - "step": 295 - }, - { - "epoch": 0.6140282639699209, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 53.230756957534005, - "learning_rate": 7.716262975778547e-07, - "logits/chosen": 0.12378720194101334, - "logits/rejected": 0.16028910875320435, - "logps/accuracies": 0.8125, - "logps/chosen": -291.1141357421875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -294.4344482421875, - "logps/ref_rejected": -296.4821472167969, - "logps/rejected": -359.0552062988281, - "loss": 0.5617, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.33203190565109253, - "rewards/grad_term": 0.005090603604912758, - "rewards/margins": 6.58933687210083, - "rewards/rejected": -6.257304668426514, - "step": 296 - }, - { - "epoch": 0.6161026837806302, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 49.27245845537937, - "learning_rate": 7.704728950403691e-07, - "logits/chosen": 0.07910759747028351, - "logits/rejected": 0.08938741683959961, - "logps/accuracies": 0.8125, - "logps/chosen": -336.2572021484375, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -347.0129089355469, - "logps/ref_rejected": -345.33203125, - "logps/rejected": -390.4123229980469, - "loss": 0.5452, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.0755696296691895, - "rewards/grad_term": 0.01720615103840828, - "rewards/margins": 5.583600044250488, - "rewards/rejected": -4.508030414581299, - "step": 297 - }, - { - "epoch": 0.6181771035913393, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 52.587110731751046, - "learning_rate": 7.693194925028835e-07, - "logits/chosen": 0.09930308163166046, - "logits/rejected": 0.21927960216999054, - "logps/accuracies": 0.75, - "logps/chosen": -221.76551818847656, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -217.18284606933594, - "logps/ref_rejected": -224.5638427734375, - "logps/rejected": -282.2660217285156, - "loss": 0.5868, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.4582689702510834, - "rewards/grad_term": 0.015134723857045174, - "rewards/margins": 5.311949253082275, - "rewards/rejected": -5.7702178955078125, - "step": 298 - }, - { - "epoch": 0.6202515234020485, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 50.13841731685137, - "learning_rate": 7.681660899653979e-07, - "logits/chosen": 0.05825243890285492, - "logits/rejected": 0.1010754331946373, - "logps/accuracies": 0.875, - "logps/chosen": -372.16961669921875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -380.7333679199219, - "logps/ref_rejected": -376.558349609375, - "logps/rejected": -447.0321350097656, - "loss": 0.4912, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.8563790321350098, - "rewards/grad_term": 0.006374266929924488, - "rewards/margins": 7.903756141662598, - "rewards/rejected": -7.047377586364746, - "step": 299 - }, - { - "epoch": 0.6223259432127577, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 18.47274167497265, - "learning_rate": 7.670126874279122e-07, - "logits/chosen": 0.01922018826007843, - "logits/rejected": 0.10939830541610718, - "logps/accuracies": 0.6875, - "logps/chosen": -290.3101806640625, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -292.36676025390625, - "logps/ref_rejected": -279.2781066894531, - "logps/rejected": -335.0788269042969, - "loss": 0.5304, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.20565718412399292, - "rewards/grad_term": 0.012530253268778324, - "rewards/margins": 5.785726547241211, - "rewards/rejected": -5.580069541931152, - "step": 300 - }, - { - "epoch": 0.6244003630234669, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 35.40752217083433, - "learning_rate": 7.658592848904267e-07, - "logits/chosen": 0.2818371653556824, - "logits/rejected": 0.41613805294036865, - "logps/accuracies": 0.9375, - "logps/chosen": -254.01771545410156, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -242.12171936035156, - "logps/ref_rejected": -286.4274597167969, - "logps/rejected": -363.48065185546875, - "loss": 0.5697, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.189597725868225, - "rewards/grad_term": 0.010368636809289455, - "rewards/margins": 6.51572322845459, - "rewards/rejected": -7.705321311950684, - "step": 301 - }, - { - "epoch": 0.626474782834176, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 56.807968171838894, - "learning_rate": 7.647058823529411e-07, - "logits/chosen": 0.23013733327388763, - "logits/rejected": 0.2862010598182678, - "logps/accuracies": 0.9375, - "logps/chosen": -326.96173095703125, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -329.29400634765625, - "logps/ref_rejected": -335.8630676269531, - "logps/rejected": -414.28753662109375, - "loss": 0.4896, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.23322616517543793, - "rewards/grad_term": 0.0038325442001223564, - "rewards/margins": 8.075675010681152, - "rewards/rejected": -7.842449188232422, - "step": 302 - }, - { - "epoch": 0.6285492026448852, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 30.936866332774525, - "learning_rate": 7.635524798154555e-07, - "logits/chosen": 0.38763344287872314, - "logits/rejected": 0.42555707693099976, - "logps/accuracies": 0.8125, - "logps/chosen": -309.457763671875, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -298.310791015625, - "logps/ref_rejected": -305.3328857421875, - "logps/rejected": -386.14520263671875, - "loss": 0.5817, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.1147011518478394, - "rewards/grad_term": 0.008538071066141129, - "rewards/margins": 6.966533660888672, - "rewards/rejected": -8.0812349319458, - "step": 303 - }, - { - "epoch": 0.6306236224555944, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 39.0549516486327, - "learning_rate": 7.623990772779699e-07, - "logits/chosen": 0.3378972113132477, - "logits/rejected": 0.33350008726119995, - "logps/accuracies": 0.75, - "logps/chosen": -296.42120361328125, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -286.9034423828125, - "logps/ref_rejected": -254.8472137451172, - "logps/rejected": -342.2765808105469, - "loss": 0.5692, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.9517745971679688, - "rewards/grad_term": 0.009382160380482674, - "rewards/margins": 7.791163921356201, - "rewards/rejected": -8.742938995361328, - "step": 304 - }, - { - "epoch": 0.6326980422663037, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 25.487656661381987, - "learning_rate": 7.612456747404843e-07, - "logits/chosen": -0.0026643723249435425, - "logits/rejected": 0.13260780274868011, - "logps/accuracies": 0.8125, - "logps/chosen": -337.88592529296875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -335.2144775390625, - "logps/ref_rejected": -367.57647705078125, - "logps/rejected": -445.0696716308594, - "loss": 0.5336, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.2671446204185486, - "rewards/grad_term": 0.009766257368028164, - "rewards/margins": 7.482178211212158, - "rewards/rejected": -7.749322891235352, - "step": 305 - }, - { - "epoch": 0.6347724620770129, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 53.78629399737712, - "learning_rate": 7.600922722029988e-07, - "logits/chosen": 0.17397280037403107, - "logits/rejected": 0.13213837146759033, - "logps/accuracies": 0.875, - "logps/chosen": -296.3286437988281, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -288.9657897949219, - "logps/ref_rejected": -271.7525329589844, - "logps/rejected": -347.7431640625, - "loss": 0.6198, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.7362870573997498, - "rewards/grad_term": 0.008981076069176197, - "rewards/margins": 6.862776756286621, - "rewards/rejected": -7.599064826965332, - "step": 306 - }, - { - "epoch": 0.636846881887722, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 38.57273088962716, - "learning_rate": 7.589388696655133e-07, - "logits/chosen": 0.16268330812454224, - "logits/rejected": 0.30625462532043457, - "logps/accuracies": 1.0, - "logps/chosen": -300.149169921875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -308.031982421875, - "logps/ref_rejected": -308.461181640625, - "logps/rejected": -392.94122314453125, - "loss": 0.52, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.7882769703865051, - "rewards/grad_term": 0.0005324217490851879, - "rewards/margins": 9.236281394958496, - "rewards/rejected": -8.448005676269531, - "step": 307 - }, - { - "epoch": 0.6389213016984312, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 14.679022976693457, - "learning_rate": 7.577854671280276e-07, - "logits/chosen": 0.11276095360517502, - "logits/rejected": 0.15488047897815704, - "logps/accuracies": 0.75, - "logps/chosen": -320.9672546386719, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -317.51580810546875, - "logps/ref_rejected": -312.4486999511719, - "logps/rejected": -392.2602844238281, - "loss": 0.5461, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3451465368270874, - "rewards/grad_term": 0.005722560919821262, - "rewards/margins": 7.636013507843018, - "rewards/rejected": -7.9811601638793945, - "step": 308 - }, - { - "epoch": 0.6409957215091404, - "flips/correct->correct": 0.6875, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 44.76031279155951, - "learning_rate": 7.566320645905421e-07, - "logits/chosen": 0.15411929786205292, - "logits/rejected": 0.17396250367164612, - "logps/accuracies": 1.0, - "logps/chosen": -271.82232666015625, - "logps/ref_accuracies": 0.6875, - "logps/ref_chosen": -267.814697265625, - "logps/ref_rejected": -285.9453430175781, - "logps/rejected": -360.15850830078125, - "loss": 0.5586, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.40076228976249695, - "rewards/grad_term": 0.009672937914729118, - "rewards/margins": 7.020550727844238, - "rewards/rejected": -7.421313762664795, - "step": 309 - }, - { - "epoch": 0.6430701413198496, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 53.09216498794514, - "learning_rate": 7.554786620530565e-07, - "logits/chosen": 0.1644693911075592, - "logits/rejected": 0.23783330619335175, - "logps/accuracies": 0.9375, - "logps/chosen": -335.7210693359375, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -341.6274719238281, - "logps/ref_rejected": -344.434814453125, - "logps/rejected": -419.43768310546875, - "loss": 0.5501, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5906396508216858, - "rewards/grad_term": 0.006133922841399908, - "rewards/margins": 8.090925216674805, - "rewards/rejected": -7.500285625457764, - "step": 310 - }, - { - "epoch": 0.6451445611305588, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0625, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 41.39136172238844, - "learning_rate": 7.543252595155709e-07, - "logits/chosen": 0.06381943821907043, - "logits/rejected": 0.08010812848806381, - "logps/accuracies": 0.8125, - "logps/chosen": -213.8368682861328, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -210.74017333984375, - "logps/ref_rejected": -222.75961303710938, - "logps/rejected": -290.5465087890625, - "loss": 0.5381, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.30966925621032715, - "rewards/grad_term": 0.008784075267612934, - "rewards/margins": 6.469019889831543, - "rewards/rejected": -6.778688907623291, - "step": 311 - }, - { - "epoch": 0.647218980941268, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 65.96838712092502, - "learning_rate": 7.531718569780853e-07, - "logits/chosen": 0.14972330629825592, - "logits/rejected": 0.19796700775623322, - "logps/accuracies": 0.8125, - "logps/chosen": -271.4801940917969, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -271.1293029785156, - "logps/ref_rejected": -286.2263488769531, - "logps/rejected": -364.43609619140625, - "loss": 0.4888, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.035090282559394836, - "rewards/grad_term": 0.005572815891355276, - "rewards/margins": 7.785881042480469, - "rewards/rejected": -7.8209710121154785, - "step": 312 - }, - { - "epoch": 0.6492934007519772, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 65.98088553626904, - "learning_rate": 7.520184544405997e-07, - "logits/chosen": 0.14395220577716827, - "logits/rejected": 0.09385178238153458, - "logps/accuracies": 0.8125, - "logps/chosen": -357.7359924316406, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -338.0129089355469, - "logps/ref_rejected": -343.6756896972656, - "logps/rejected": -418.16510009765625, - "loss": 0.563, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.972308874130249, - "rewards/grad_term": 0.013005997985601425, - "rewards/margins": 5.476626873016357, - "rewards/rejected": -7.4489359855651855, - "step": 313 - }, - { - "epoch": 0.6513678205626864, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 56.93501034330218, - "learning_rate": 7.508650519031141e-07, - "logits/chosen": 0.13935233652591705, - "logits/rejected": 0.19025281071662903, - "logps/accuracies": 0.8125, - "logps/chosen": -225.3142547607422, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -217.11978149414062, - "logps/ref_rejected": -215.02268981933594, - "logps/rejected": -280.1806640625, - "loss": 0.5613, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8194477558135986, - "rewards/grad_term": 0.01098605990409851, - "rewards/margins": 5.696350574493408, - "rewards/rejected": -6.5157976150512695, - "step": 314 - }, - { - "epoch": 0.6534422403733956, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5625, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 47.21672943404337, - "learning_rate": 7.497116493656286e-07, - "logits/chosen": 0.12312566488981247, - "logits/rejected": 0.11346716433763504, - "logps/accuracies": 0.9375, - "logps/chosen": -278.6199951171875, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -275.3497009277344, - "logps/ref_rejected": -268.13726806640625, - "logps/rejected": -360.0812072753906, - "loss": 0.627, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.32702964544296265, - "rewards/grad_term": 0.00423807417973876, - "rewards/margins": 8.867365837097168, - "rewards/rejected": -9.194396018981934, - "step": 315 - }, - { - "epoch": 0.6555166601841047, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 42.73491649478954, - "learning_rate": 7.485582468281429e-07, - "logits/chosen": 0.12578445672988892, - "logits/rejected": 0.11325564980506897, - "logps/accuracies": 0.9375, - "logps/chosen": -312.9777526855469, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -294.9630432128906, - "logps/ref_rejected": -309.66436767578125, - "logps/rejected": -403.28887939453125, - "loss": 0.5566, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.8014687299728394, - "rewards/grad_term": 0.006151386070996523, - "rewards/margins": 7.560985088348389, - "rewards/rejected": -9.36245346069336, - "step": 316 - }, - { - "epoch": 0.6575910799948139, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 26.036325870194347, - "learning_rate": 7.474048442906574e-07, - "logits/chosen": 0.2790083587169647, - "logits/rejected": 0.30802425742149353, - "logps/accuracies": 0.875, - "logps/chosen": -284.39532470703125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -284.0159606933594, - "logps/ref_rejected": -297.21063232421875, - "logps/rejected": -367.1194763183594, - "loss": 0.5027, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.03793831914663315, - "rewards/grad_term": 0.00753421988338232, - "rewards/margins": 6.952947616577148, - "rewards/rejected": -6.9908857345581055, - "step": 317 - }, - { - "epoch": 0.6596654998055231, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5625, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 48.848208501371566, - "learning_rate": 7.462514417531717e-07, - "logits/chosen": 0.23110151290893555, - "logits/rejected": 0.23169729113578796, - "logps/accuracies": 0.8125, - "logps/chosen": -345.29351806640625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -346.4335632324219, - "logps/ref_rejected": -320.21112060546875, - "logps/rejected": -389.7298889160156, - "loss": 0.5338, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.11400166153907776, - "rewards/grad_term": 0.005073026288300753, - "rewards/margins": 7.065882205963135, - "rewards/rejected": -6.951880931854248, - "step": 318 - }, - { - "epoch": 0.6617399196162324, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.4375, - "grad_norm": 37.43721667408988, - "learning_rate": 7.450980392156863e-07, - "logits/chosen": -0.11392828822135925, - "logits/rejected": -0.1736583262681961, - "logps/accuracies": 0.5625, - "logps/chosen": -349.2891540527344, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -354.2510986328125, - "logps/ref_rejected": -316.1826477050781, - "logps/rejected": -374.9449768066406, - "loss": 0.5794, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.4961950182914734, - "rewards/grad_term": 0.01609090529382229, - "rewards/margins": 6.372428894042969, - "rewards/rejected": -5.87623405456543, - "step": 319 - }, - { - "epoch": 0.6638143394269416, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 48.93249687471918, - "learning_rate": 7.439446366782007e-07, - "logits/chosen": 0.22672495245933533, - "logits/rejected": 0.23626157641410828, - "logps/accuracies": 0.875, - "logps/chosen": -300.6910095214844, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -306.4937744140625, - "logps/ref_rejected": -316.8069763183594, - "logps/rejected": -371.29150390625, - "loss": 0.579, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.5802759528160095, - "rewards/grad_term": 0.011157616972923279, - "rewards/margins": 6.02873420715332, - "rewards/rejected": -5.448458194732666, - "step": 320 - }, - { - "epoch": 0.6638143394269416, - "eval_flips/correct->correct": 0.43842363357543945, - "eval_flips/correct->incorrect": 0.004926108289510012, - "eval_flips/incorrect->correct": 0.30049261450767517, - "eval_flips/incorrect->incorrect": 0.25615763664245605, - "eval_logits/chosen": 0.15680116415023804, - "eval_logits/rejected": 0.20004509389400482, - "eval_logps/accuracies": 0.738916277885437, - "eval_logps/chosen": -288.21343994140625, - "eval_logps/ref_accuracies": 0.4433497488498688, - "eval_logps/ref_chosen": -287.3511047363281, - "eval_logps/ref_rejected": -289.0460205078125, - "eval_logps/rejected": -336.36444091796875, - "eval_loss": 0.6191994547843933, - "eval_rewards/accuracies": 0.871921181678772, - "eval_rewards/chosen": -0.08623380959033966, - "eval_rewards/grad_term": 0.017411047592759132, - "eval_rewards/margins": 4.645606994628906, - "eval_rewards/rejected": -4.731841087341309, - "eval_runtime": 800.1629, - "eval_samples_per_second": 2.022, - "eval_steps_per_second": 0.254, - "step": 320 - }, - { - "epoch": 0.6658887592376507, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 89.14243856965115, - "learning_rate": 7.427912341407151e-07, - "logits/chosen": 0.25495031476020813, - "logits/rejected": 0.36095547676086426, - "logps/accuracies": 0.875, - "logps/chosen": -296.5644836425781, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -300.39422607421875, - "logps/ref_rejected": -350.05474853515625, - "logps/rejected": -397.4334716796875, - "loss": 0.6239, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.38297611474990845, - "rewards/grad_term": 0.014660445041954517, - "rewards/margins": 5.120844841003418, - "rewards/rejected": -4.7378692626953125, - "step": 321 - }, - { - "epoch": 0.6679631790483599, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 16.638863764418918, - "learning_rate": 7.416378316032295e-07, - "logits/chosen": -0.062116291373968124, - "logits/rejected": 0.08867709338665009, - "logps/accuracies": 0.625, - "logps/chosen": -348.11456298828125, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -352.7536315917969, - "logps/ref_rejected": -350.36370849609375, - "logps/rejected": -394.8106689453125, - "loss": 0.5791, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.4639059007167816, - "rewards/grad_term": 0.016094159334897995, - "rewards/margins": 4.908601760864258, - "rewards/rejected": -4.444696426391602, - "step": 322 - }, - { - "epoch": 0.6700375988590691, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 69.32056292827173, - "learning_rate": 7.404844290657439e-07, - "logits/chosen": 0.2704838514328003, - "logits/rejected": 0.2706650495529175, - "logps/accuracies": 0.75, - "logps/chosen": -313.2464599609375, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -315.1943664550781, - "logps/ref_rejected": -296.2995300292969, - "logps/rejected": -356.75299072265625, - "loss": 0.5689, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.19478662312030792, - "rewards/grad_term": 0.016283176839351654, - "rewards/margins": 6.240136623382568, - "rewards/rejected": -6.045351028442383, - "step": 323 - }, - { - "epoch": 0.6721120186697783, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 53.57049601603343, - "learning_rate": 7.393310265282583e-07, - "logits/chosen": 0.24967102706432343, - "logits/rejected": 0.2552967667579651, - "logps/accuracies": 0.8125, - "logps/chosen": -270.054443359375, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -260.4380187988281, - "logps/ref_rejected": -254.31671142578125, - "logps/rejected": -320.3769836425781, - "loss": 0.5592, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.9616467952728271, - "rewards/grad_term": 0.017603037878870964, - "rewards/margins": 5.644383430480957, - "rewards/rejected": -6.606029987335205, - "step": 324 - }, - { - "epoch": 0.6741864384804875, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 54.49892498689122, - "learning_rate": 7.381776239907728e-07, - "logits/chosen": 0.22812658548355103, - "logits/rejected": 0.2515120506286621, - "logps/accuracies": 0.9375, - "logps/chosen": -324.39599609375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -317.1215515136719, - "logps/ref_rejected": -327.36962890625, - "logps/rejected": -387.77032470703125, - "loss": 0.5858, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.7274415493011475, - "rewards/grad_term": 0.015195751562714577, - "rewards/margins": 5.312624931335449, - "rewards/rejected": -6.040066242218018, - "step": 325 - }, - { - "epoch": 0.6762608582911966, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 48.53308556944907, - "learning_rate": 7.370242214532871e-07, - "logits/chosen": -0.008343299850821495, - "logits/rejected": -0.03129954636096954, - "logps/accuracies": 0.75, - "logps/chosen": -356.82958984375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -344.5155029296875, - "logps/ref_rejected": -331.8990783691406, - "logps/rejected": -418.02227783203125, - "loss": 0.5789, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.2314121723175049, - "rewards/grad_term": 0.005676737520843744, - "rewards/margins": 7.380904197692871, - "rewards/rejected": -8.612316131591797, - "step": 326 - }, - { - "epoch": 0.6783352781019059, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 98.7994855905736, - "learning_rate": 7.358708189158016e-07, - "logits/chosen": 0.005753070116043091, - "logits/rejected": 0.01671770215034485, - "logps/accuracies": 0.625, - "logps/chosen": -313.5362243652344, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -289.57489013671875, - "logps/ref_rejected": -292.51513671875, - "logps/rejected": -378.51617431640625, - "loss": 0.595, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.3961288928985596, - "rewards/grad_term": 0.017515743151307106, - "rewards/margins": 6.203976631164551, - "rewards/rejected": -8.600106239318848, - "step": 327 - }, - { - "epoch": 0.6804096979126151, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 46.182216579421194, - "learning_rate": 7.347174163783159e-07, - "logits/chosen": 0.44519540667533875, - "logits/rejected": 0.4516918659210205, - "logps/accuracies": 1.0, - "logps/chosen": -261.4315490722656, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -263.67852783203125, - "logps/ref_rejected": -263.0929260253906, - "logps/rejected": -354.0087890625, - "loss": 0.5502, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.22469913959503174, - "rewards/grad_term": 0.00040459661977365613, - "rewards/margins": 9.316282272338867, - "rewards/rejected": -9.091583251953125, - "step": 328 - }, - { - "epoch": 0.6824841177233243, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 57.57752345437062, - "learning_rate": 7.335640138408304e-07, - "logits/chosen": 0.35632041096687317, - "logits/rejected": 0.3070759177207947, - "logps/accuracies": 0.8125, - "logps/chosen": -300.12353515625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -303.2857360839844, - "logps/ref_rejected": -288.00970458984375, - "logps/rejected": -365.837646484375, - "loss": 0.6026, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.3162227272987366, - "rewards/grad_term": 0.007446423638612032, - "rewards/margins": 8.099015235900879, - "rewards/rejected": -7.782792568206787, - "step": 329 - }, - { - "epoch": 0.6845585375340334, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 53.06794587438548, - "learning_rate": 7.324106113033448e-07, - "logits/chosen": 0.08217829465866089, - "logits/rejected": 0.2244112640619278, - "logps/accuracies": 0.9375, - "logps/chosen": -315.64288330078125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -322.9136962890625, - "logps/ref_rejected": -385.41436767578125, - "logps/rejected": -465.26690673828125, - "loss": 0.5569, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.7270775437355042, - "rewards/grad_term": 0.004581013694405556, - "rewards/margins": 8.71232795715332, - "rewards/rejected": -7.985250473022461, - "step": 330 - }, - { - "epoch": 0.6866329573447426, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 19.260377458337114, - "learning_rate": 7.312572087658593e-07, - "logits/chosen": 0.07186198234558105, - "logits/rejected": 0.11489441245794296, - "logps/accuracies": 0.8125, - "logps/chosen": -319.80755615234375, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -325.8714599609375, - "logps/ref_rejected": -329.986572265625, - "logps/rejected": -402.0074462890625, - "loss": 0.5508, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.6063953042030334, - "rewards/grad_term": 0.006578431464731693, - "rewards/margins": 7.808480739593506, - "rewards/rejected": -7.202085971832275, - "step": 331 - }, - { - "epoch": 0.6887073771554518, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 17.756201694843515, - "learning_rate": 7.301038062283737e-07, - "logits/chosen": 0.20243048667907715, - "logits/rejected": 0.28428226709365845, - "logps/accuracies": 1.0, - "logps/chosen": -302.6923828125, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -292.7161865234375, - "logps/ref_rejected": -302.3647766113281, - "logps/rejected": -387.39251708984375, - "loss": 0.5204, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.9976207613945007, - "rewards/grad_term": 0.008672392927110195, - "rewards/margins": 7.505157470703125, - "rewards/rejected": -8.502777099609375, - "step": 332 - }, - { - "epoch": 0.690781796966161, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 83.21722843296402, - "learning_rate": 7.289504036908881e-07, - "logits/chosen": 0.18854957818984985, - "logits/rejected": 0.13426542282104492, - "logps/accuracies": 0.6875, - "logps/chosen": -330.0719909667969, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -328.91522216796875, - "logps/ref_rejected": -307.397705078125, - "logps/rejected": -382.39984130859375, - "loss": 0.549, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.11567914485931396, - "rewards/grad_term": 0.004070833325386047, - "rewards/margins": 7.384533882141113, - "rewards/rejected": -7.500212669372559, - "step": 333 - }, - { - "epoch": 0.6928562167768703, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 41.93819396079496, - "learning_rate": 7.277970011534025e-07, - "logits/chosen": 0.0038331379182636738, - "logits/rejected": 0.06832897663116455, - "logps/accuracies": 0.9375, - "logps/chosen": -271.9588317871094, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -270.6300964355469, - "logps/ref_rejected": -264.27239990234375, - "logps/rejected": -333.4945373535156, - "loss": 0.5229, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.13287392258644104, - "rewards/grad_term": 0.004513449501246214, - "rewards/margins": 6.789344787597656, - "rewards/rejected": -6.922219276428223, - "step": 334 - }, - { - "epoch": 0.6949306365875794, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 36.44772897953967, - "learning_rate": 7.26643598615917e-07, - "logits/chosen": 0.08206385374069214, - "logits/rejected": 0.15132063627243042, - "logps/accuracies": 0.8125, - "logps/chosen": -281.35186767578125, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -291.0823669433594, - "logps/ref_rejected": -298.27886962890625, - "logps/rejected": -367.3419494628906, - "loss": 0.5405, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.9730521440505981, - "rewards/grad_term": 0.002131945453584194, - "rewards/margins": 7.879360198974609, - "rewards/rejected": -6.906307697296143, - "step": 335 - }, - { - "epoch": 0.6970050563982886, - "flips/correct->correct": 0.8125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 28.51646796947631, - "learning_rate": 7.254901960784313e-07, - "logits/chosen": 0.020516425371170044, - "logits/rejected": 0.06293690204620361, - "logps/accuracies": 0.9375, - "logps/chosen": -308.943359375, - "logps/ref_accuracies": 0.8125, - "logps/ref_chosen": -308.57928466796875, - "logps/ref_rejected": -345.8021545410156, - "logps/rejected": -399.0782470703125, - "loss": 0.5161, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.03641030192375183, - "rewards/grad_term": 0.012177910655736923, - "rewards/margins": 5.291202068328857, - "rewards/rejected": -5.327611923217773, - "step": 336 - }, - { - "epoch": 0.6990794762089978, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 34.67482984022363, - "learning_rate": 7.243367935409458e-07, - "logits/chosen": 0.09479643404483795, - "logits/rejected": 0.13485944271087646, - "logps/accuracies": 0.8125, - "logps/chosen": -350.71783447265625, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -344.1441955566406, - "logps/ref_rejected": -334.01727294921875, - "logps/rejected": -425.11920166015625, - "loss": 0.5386, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6573646068572998, - "rewards/grad_term": 0.005361511372029781, - "rewards/margins": 8.452826499938965, - "rewards/rejected": -9.110189437866211, - "step": 337 - }, - { - "epoch": 0.701153896019707, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 92.66931260479595, - "learning_rate": 7.231833910034601e-07, - "logits/chosen": 0.16966593265533447, - "logits/rejected": 0.13631996512413025, - "logps/accuracies": 0.875, - "logps/chosen": -345.20770263671875, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -337.16741943359375, - "logps/ref_rejected": -317.2330017089844, - "logps/rejected": -405.1679992675781, - "loss": 0.5231, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8040257096290588, - "rewards/grad_term": 0.003560500219464302, - "rewards/margins": 7.989476203918457, - "rewards/rejected": -8.793501853942871, - "step": 338 - }, - { - "epoch": 0.7032283158304162, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 28.389402918089036, - "learning_rate": 7.220299884659746e-07, - "logits/chosen": 0.17944695055484772, - "logits/rejected": 0.2976837456226349, - "logps/accuracies": 0.875, - "logps/chosen": -262.23980712890625, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -265.7380676269531, - "logps/ref_rejected": -294.07598876953125, - "logps/rejected": -336.75543212890625, - "loss": 0.5853, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.34982502460479736, - "rewards/grad_term": 0.018309494480490685, - "rewards/margins": 4.617773532867432, - "rewards/rejected": -4.267948150634766, - "step": 339 - }, - { - "epoch": 0.7053027356411253, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 33.29343285253659, - "learning_rate": 7.20876585928489e-07, - "logits/chosen": 0.042180366814136505, - "logits/rejected": 0.008254090324044228, - "logps/accuracies": 0.75, - "logps/chosen": -366.08233642578125, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -362.78900146484375, - "logps/ref_rejected": -327.56585693359375, - "logps/rejected": -406.1036071777344, - "loss": 0.5219, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3293338716030121, - "rewards/grad_term": 0.006429283879697323, - "rewards/margins": 7.524442672729492, - "rewards/rejected": -7.8537774085998535, - "step": 340 - }, - { - "epoch": 0.7073771554518345, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 38.22067938349734, - "learning_rate": 7.197231833910034e-07, - "logits/chosen": 0.1745857149362564, - "logits/rejected": 0.22055479884147644, - "logps/accuracies": 0.75, - "logps/chosen": -223.20620727539062, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -214.50759887695312, - "logps/ref_rejected": -225.8865509033203, - "logps/rejected": -280.5146179199219, - "loss": 0.5904, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.8698611259460449, - "rewards/grad_term": 0.020344514399766922, - "rewards/margins": 4.592945098876953, - "rewards/rejected": -5.46280574798584, - "step": 341 - }, - { - "epoch": 0.7094515752625438, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 29.038636096202257, - "learning_rate": 7.185697808535178e-07, - "logits/chosen": 0.14953972399234772, - "logits/rejected": 0.15320980548858643, - "logps/accuracies": 0.9375, - "logps/chosen": -289.46533203125, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -288.3252868652344, - "logps/ref_rejected": -298.48101806640625, - "logps/rejected": -359.2103576660156, - "loss": 0.6004, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.11400707066059113, - "rewards/grad_term": 0.022022824734449387, - "rewards/margins": 5.958928108215332, - "rewards/rejected": -6.072935104370117, - "step": 342 - }, - { - "epoch": 0.711525995073253, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 33.00355372540172, - "learning_rate": 7.174163783160324e-07, - "logits/chosen": -0.06829185783863068, - "logits/rejected": -0.009546427056193352, - "logps/accuracies": 0.8125, - "logps/chosen": -352.6844177246094, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -354.55364990234375, - "logps/ref_rejected": -351.40203857421875, - "logps/rejected": -413.4746398925781, - "loss": 0.5504, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.18692292273044586, - "rewards/grad_term": 0.01500310655683279, - "rewards/margins": 6.394184112548828, - "rewards/rejected": -6.207261085510254, - "step": 343 - }, - { - "epoch": 0.7136004148839621, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 29.545604358917398, - "learning_rate": 7.162629757785467e-07, - "logits/chosen": 0.21840424835681915, - "logits/rejected": 0.3245609402656555, - "logps/accuracies": 0.9375, - "logps/chosen": -289.7184143066406, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -281.7314758300781, - "logps/ref_rejected": -305.2322082519531, - "logps/rejected": -371.3037414550781, - "loss": 0.5546, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.7986934185028076, - "rewards/grad_term": 0.02135634422302246, - "rewards/margins": 5.8084611892700195, - "rewards/rejected": -6.607154369354248, - "step": 344 - }, - { - "epoch": 0.7156748346946713, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 18.685102213495963, - "learning_rate": 7.151095732410612e-07, - "logits/chosen": 0.33599621057510376, - "logits/rejected": 0.259542852640152, - "logps/accuracies": 0.8125, - "logps/chosen": -297.86285400390625, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -309.85284423828125, - "logps/ref_rejected": -319.9841613769531, - "logps/rejected": -364.2496643066406, - "loss": 0.5428, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.1990000009536743, - "rewards/grad_term": 0.012635907158255577, - "rewards/margins": 5.625548839569092, - "rewards/rejected": -4.426548957824707, - "step": 345 - }, - { - "epoch": 0.7177492545053805, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 51.73399077401343, - "learning_rate": 7.139561707035755e-07, - "logits/chosen": 0.19686900079250336, - "logits/rejected": 0.2250552475452423, - "logps/accuracies": 0.875, - "logps/chosen": -334.487548828125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -344.34234619140625, - "logps/ref_rejected": -351.153564453125, - "logps/rejected": -423.1881103515625, - "loss": 0.5425, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.9854794144630432, - "rewards/grad_term": 0.0029634374659508467, - "rewards/margins": 8.188934326171875, - "rewards/rejected": -7.203455924987793, - "step": 346 - }, - { - "epoch": 0.7198236743160897, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 36.992429397717665, - "learning_rate": 7.1280276816609e-07, - "logits/chosen": 0.3227195143699646, - "logits/rejected": 0.3476618230342865, - "logps/accuracies": 0.8125, - "logps/chosen": -283.4656066894531, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -281.3516540527344, - "logps/ref_rejected": -273.4134826660156, - "logps/rejected": -333.3074645996094, - "loss": 0.5596, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.21139609813690186, - "rewards/grad_term": 0.013419999741017818, - "rewards/margins": 5.778001308441162, - "rewards/rejected": -5.9893975257873535, - "step": 347 - }, - { - "epoch": 0.721898094126799, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 41.16834515003933, - "learning_rate": 7.116493656286043e-07, - "logits/chosen": 0.125450000166893, - "logits/rejected": 0.17324930429458618, - "logps/accuracies": 0.8125, - "logps/chosen": -296.0699157714844, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -281.47979736328125, - "logps/ref_rejected": -311.7028503417969, - "logps/rejected": -370.5085144042969, - "loss": 0.5877, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.4590116739273071, - "rewards/grad_term": 0.020582564175128937, - "rewards/margins": 4.421552658081055, - "rewards/rejected": -5.8805646896362305, - "step": 348 - }, - { - "epoch": 0.723972513937508, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 28.04679377044801, - "learning_rate": 7.104959630911188e-07, - "logits/chosen": 0.02426442876458168, - "logits/rejected": 0.030082188546657562, - "logps/accuracies": 0.875, - "logps/chosen": -329.852294921875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -335.4429016113281, - "logps/ref_rejected": -336.506103515625, - "logps/rejected": -406.5826416015625, - "loss": 0.5545, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.5590592622756958, - "rewards/grad_term": 0.007747030816972256, - "rewards/margins": 7.566709995269775, - "rewards/rejected": -7.007650852203369, - "step": 349 - }, - { - "epoch": 0.7260469337482173, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 33.4741646395165, - "learning_rate": 7.093425605536332e-07, - "logits/chosen": 0.014736661687493324, - "logits/rejected": 0.02637672983109951, - "logps/accuracies": 1.0, - "logps/chosen": -317.66864013671875, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -317.7265930175781, - "logps/ref_rejected": -342.7004699707031, - "logps/rejected": -423.1243896484375, - "loss": 0.5797, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.005797684192657471, - "rewards/grad_term": 0.006446592975407839, - "rewards/margins": 8.048192977905273, - "rewards/rejected": -8.04239559173584, - "step": 350 - }, - { - "epoch": 0.7281213535589265, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 26.26791550365828, - "learning_rate": 7.081891580161476e-07, - "logits/chosen": -0.018278811126947403, - "logits/rejected": -0.055383071303367615, - "logps/accuracies": 0.875, - "logps/chosen": -334.0198974609375, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -332.0812683105469, - "logps/ref_rejected": -322.0040588378906, - "logps/rejected": -413.3955078125, - "loss": 0.5075, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.19386102259159088, - "rewards/grad_term": 0.009265870787203312, - "rewards/margins": 8.945282936096191, - "rewards/rejected": -9.139144897460938, - "step": 351 - }, - { - "epoch": 0.7301957733696357, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 31.547111088022394, - "learning_rate": 7.07035755478662e-07, - "logits/chosen": 0.07085268199443817, - "logits/rejected": 0.11351241916418076, - "logps/accuracies": 0.875, - "logps/chosen": -323.64910888671875, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -324.082275390625, - "logps/ref_rejected": -336.0316467285156, - "logps/rejected": -416.46917724609375, - "loss": 0.5288, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.04331676661968231, - "rewards/grad_term": 0.003617448266595602, - "rewards/margins": 8.087069511413574, - "rewards/rejected": -8.043752670288086, - "step": 352 - }, - { - "epoch": 0.7322701931803449, - "flips/correct->correct": 0.1875, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5625, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 44.5849999823685, - "learning_rate": 7.058823529411765e-07, - "logits/chosen": 0.3111583888530731, - "logits/rejected": 0.2975752055644989, - "logps/accuracies": 0.75, - "logps/chosen": -315.43798828125, - "logps/ref_accuracies": 0.1875, - "logps/ref_chosen": -307.661865234375, - "logps/ref_rejected": -282.1680908203125, - "logps/rejected": -370.3470153808594, - "loss": 0.5453, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.777613639831543, - "rewards/grad_term": 0.010531319305300713, - "rewards/margins": 8.040277481079102, - "rewards/rejected": -8.817892074584961, - "step": 353 - }, - { - "epoch": 0.734344612991054, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 42.86231892921175, - "learning_rate": 7.047289504036908e-07, - "logits/chosen": 0.2585771083831787, - "logits/rejected": 0.3336886465549469, - "logps/accuracies": 0.9375, - "logps/chosen": -327.9022216796875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -332.4975891113281, - "logps/ref_rejected": -343.6222839355469, - "logps/rejected": -417.87591552734375, - "loss": 0.5632, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.459539532661438, - "rewards/grad_term": 0.006328054238110781, - "rewards/margins": 7.8848958015441895, - "rewards/rejected": -7.425356864929199, - "step": 354 - }, - { - "epoch": 0.7364190328017632, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 38.37684555724554, - "learning_rate": 7.035755478662053e-07, - "logits/chosen": 0.07023249566555023, - "logits/rejected": 0.09015891700983047, - "logps/accuracies": 0.8125, - "logps/chosen": -336.48968505859375, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -324.7945861816406, - "logps/ref_rejected": -318.7522888183594, - "logps/rejected": -398.0233154296875, - "loss": 0.6105, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.1695095300674438, - "rewards/grad_term": 0.01756344363093376, - "rewards/margins": 6.757594585418701, - "rewards/rejected": -7.927104473114014, - "step": 355 - }, - { - "epoch": 0.7384934526124725, - "flips/correct->correct": 0.6875, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 35.08769051586877, - "learning_rate": 7.024221453287197e-07, - "logits/chosen": 0.07610762119293213, - "logits/rejected": 0.18170149624347687, - "logps/accuracies": 0.875, - "logps/chosen": -259.91943359375, - "logps/ref_accuracies": 0.6875, - "logps/ref_chosen": -262.623779296875, - "logps/ref_rejected": -292.6954040527344, - "logps/rejected": -350.150146484375, - "loss": 0.5485, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.27043378353118896, - "rewards/grad_term": 0.011945006437599659, - "rewards/margins": 6.015911102294922, - "rewards/rejected": -5.745476722717285, - "step": 356 - }, - { - "epoch": 0.7405678724231817, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 26.590182989230307, - "learning_rate": 7.012687427912342e-07, - "logits/chosen": -0.002248242497444153, - "logits/rejected": 0.0742294117808342, - "logps/accuracies": 0.8125, - "logps/chosen": -248.39425659179688, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -245.6069793701172, - "logps/ref_rejected": -279.5992736816406, - "logps/rejected": -343.59112548828125, - "loss": 0.522, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.2787279188632965, - "rewards/grad_term": 0.017660701647400856, - "rewards/margins": 6.120457172393799, - "rewards/rejected": -6.399184703826904, - "step": 357 - }, - { - "epoch": 0.7426422922338909, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 29.97128814142722, - "learning_rate": 7.001153402537486e-07, - "logits/chosen": 0.05706937611103058, - "logits/rejected": 0.20207172632217407, - "logps/accuracies": 0.75, - "logps/chosen": -336.1678771972656, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -342.0132751464844, - "logps/ref_rejected": -353.97802734375, - "logps/rejected": -400.2590637207031, - "loss": 0.4873, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.5845356583595276, - "rewards/grad_term": 0.012262849137187004, - "rewards/margins": 5.212644577026367, - "rewards/rejected": -4.628108978271484, - "step": 358 - }, - { - "epoch": 0.7447167120446, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 63.41932463356418, - "learning_rate": 6.98961937716263e-07, - "logits/chosen": 0.21743306517601013, - "logits/rejected": 0.2811052203178406, - "logps/accuracies": 0.75, - "logps/chosen": -293.9018249511719, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -303.6950988769531, - "logps/ref_rejected": -292.83917236328125, - "logps/rejected": -354.6083068847656, - "loss": 0.5279, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.9793245792388916, - "rewards/grad_term": 0.009044105187058449, - "rewards/margins": 7.156236171722412, - "rewards/rejected": -6.176911354064941, - "step": 359 - }, - { - "epoch": 0.7467911318553092, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 89.9026831204858, - "learning_rate": 6.978085351787774e-07, - "logits/chosen": 0.40088769793510437, - "logits/rejected": 0.40543943643569946, - "logps/accuracies": 0.8125, - "logps/chosen": -255.18496704101562, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -259.64080810546875, - "logps/ref_rejected": -265.6492919921875, - "logps/rejected": -336.0693359375, - "loss": 0.5575, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.44558367133140564, - "rewards/grad_term": 0.01041356474161148, - "rewards/margins": 7.487587928771973, - "rewards/rejected": -7.042004108428955, - "step": 360 - }, - { - "epoch": 0.7488655516660184, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 22.748796947859415, - "learning_rate": 6.966551326412918e-07, - "logits/chosen": 0.31769663095474243, - "logits/rejected": 0.3735862970352173, - "logps/accuracies": 0.8125, - "logps/chosen": -305.60565185546875, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -306.97216796875, - "logps/ref_rejected": -295.53271484375, - "logps/rejected": -375.6974182128906, - "loss": 0.5312, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.1366504728794098, - "rewards/grad_term": 0.003756206249818206, - "rewards/margins": 8.153119087219238, - "rewards/rejected": -8.016468048095703, - "step": 361 - }, - { - "epoch": 0.7509399714767276, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 48.649828270109175, - "learning_rate": 6.955017301038062e-07, - "logits/chosen": -0.11286991089582443, - "logits/rejected": -0.08522382378578186, - "logps/accuracies": 0.6875, - "logps/chosen": -306.65692138671875, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -299.77130126953125, - "logps/ref_rejected": -305.17205810546875, - "logps/rejected": -366.79888916015625, - "loss": 0.563, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.688561201095581, - "rewards/grad_term": 0.015329258516430855, - "rewards/margins": 5.474117279052734, - "rewards/rejected": -6.162679195404053, - "step": 362 - }, - { - "epoch": 0.7530143912874367, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5625, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 44.582697450921174, - "learning_rate": 6.943483275663207e-07, - "logits/chosen": 0.16607432067394257, - "logits/rejected": 0.19322986900806427, - "logps/accuracies": 0.9375, - "logps/chosen": -250.20355224609375, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -252.611572265625, - "logps/ref_rejected": -272.4234924316406, - "logps/rejected": -353.1186218261719, - "loss": 0.5006, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.24080336093902588, - "rewards/grad_term": 0.006381358951330185, - "rewards/margins": 8.310314178466797, - "rewards/rejected": -8.069511413574219, - "step": 363 - }, - { - "epoch": 0.755088811098146, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 34.80591547675486, - "learning_rate": 6.93194925028835e-07, - "logits/chosen": 0.19473493099212646, - "logits/rejected": 0.18632598221302032, - "logps/accuracies": 0.8125, - "logps/chosen": -256.6549987792969, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -255.31813049316406, - "logps/ref_rejected": -249.58592224121094, - "logps/rejected": -313.6597900390625, - "loss": 0.5498, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.13368618488311768, - "rewards/grad_term": 0.011986999772489071, - "rewards/margins": 6.273699760437012, - "rewards/rejected": -6.407385349273682, - "step": 364 - }, - { - "epoch": 0.7571632309088552, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 41.34316036796179, - "learning_rate": 6.920415224913494e-07, - "logits/chosen": 0.1397414356470108, - "logits/rejected": 0.23811021447181702, - "logps/accuracies": 0.9375, - "logps/chosen": -315.8439636230469, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -314.4014892578125, - "logps/ref_rejected": -363.3650817871094, - "logps/rejected": -422.4331359863281, - "loss": 0.5638, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.14424461126327515, - "rewards/grad_term": 0.012570216320455074, - "rewards/margins": 5.762563228607178, - "rewards/rejected": -5.906806945800781, - "step": 365 - }, - { - "epoch": 0.7592376507195644, - "flips/correct->correct": 0.1875, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 23.772256648413755, - "learning_rate": 6.908881199538638e-07, - "logits/chosen": 0.05051745846867561, - "logits/rejected": -0.02119167149066925, - "logps/accuracies": 0.9375, - "logps/chosen": -253.32345581054688, - "logps/ref_accuracies": 0.1875, - "logps/ref_chosen": -254.56048583984375, - "logps/ref_rejected": -240.97332763671875, - "logps/rejected": -324.1744079589844, - "loss": 0.5553, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.12370094656944275, - "rewards/grad_term": 0.0012932950630784035, - "rewards/margins": 8.443807601928711, - "rewards/rejected": -8.320106506347656, - "step": 366 - }, - { - "epoch": 0.7613120705302736, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 38.69831479632815, - "learning_rate": 6.897347174163782e-07, - "logits/chosen": 0.12788856029510498, - "logits/rejected": 0.16543559730052948, - "logps/accuracies": 0.9375, - "logps/chosen": -277.533447265625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -259.1481628417969, - "logps/ref_rejected": -260.858154296875, - "logps/rejected": -343.05718994140625, - "loss": 0.5869, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.838526725769043, - "rewards/grad_term": 0.012527183629572392, - "rewards/margins": 6.381375789642334, - "rewards/rejected": -8.219902992248535, - "step": 367 - }, - { - "epoch": 0.7633864903409827, - "flips/correct->correct": 0.6875, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 35.55521896514245, - "learning_rate": 6.885813148788927e-07, - "logits/chosen": 0.15278108417987823, - "logits/rejected": 0.14702126383781433, - "logps/accuracies": 1.0, - "logps/chosen": -257.01214599609375, - "logps/ref_accuracies": 0.6875, - "logps/ref_chosen": -247.90902709960938, - "logps/ref_rejected": -265.93560791015625, - "logps/rejected": -346.7325134277344, - "loss": 0.5451, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.9103094339370728, - "rewards/grad_term": 0.011162678711116314, - "rewards/margins": 7.169381618499756, - "rewards/rejected": -8.079690933227539, - "step": 368 - }, - { - "epoch": 0.7654609101516919, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 30.230401166275122, - "learning_rate": 6.874279123414071e-07, - "logits/chosen": 0.086149662733078, - "logits/rejected": 0.22511181235313416, - "logps/accuracies": 0.875, - "logps/chosen": -233.25259399414062, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -216.24533081054688, - "logps/ref_rejected": -227.90420532226562, - "logps/rejected": -305.6214599609375, - "loss": 0.569, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.7007248401641846, - "rewards/grad_term": 0.013076627627015114, - "rewards/margins": 6.071000099182129, - "rewards/rejected": -7.771725177764893, - "step": 369 - }, - { - "epoch": 0.7675353299624011, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 52.22387916464415, - "learning_rate": 6.862745098039216e-07, - "logits/chosen": 0.2996848225593567, - "logits/rejected": 0.31599316000938416, - "logps/accuracies": 0.875, - "logps/chosen": -269.8756408691406, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -261.0709533691406, - "logps/ref_rejected": -246.64968872070312, - "logps/rejected": -322.8077392578125, - "loss": 0.5479, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.8804708123207092, - "rewards/grad_term": 0.01271949615329504, - "rewards/margins": 6.735333442687988, - "rewards/rejected": -7.615804195404053, - "step": 370 - }, - { - "epoch": 0.7696097497731104, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 63.7741156365506, - "learning_rate": 6.851211072664359e-07, - "logits/chosen": -0.01846727915108204, - "logits/rejected": -0.008946547284722328, - "logps/accuracies": 0.875, - "logps/chosen": -305.61639404296875, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -300.2716064453125, - "logps/ref_rejected": -331.55859375, - "logps/rejected": -386.4937438964844, - "loss": 0.5752, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.5344789624214172, - "rewards/grad_term": 0.02246464043855667, - "rewards/margins": 4.959036350250244, - "rewards/rejected": -5.4935150146484375, - "step": 371 - }, - { - "epoch": 0.7716841695838196, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 64.45685095510555, - "learning_rate": 6.839677047289504e-07, - "logits/chosen": 0.08011619746685028, - "logits/rejected": 0.09146730601787567, - "logps/accuracies": 0.8125, - "logps/chosen": -345.562255859375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -346.65069580078125, - "logps/ref_rejected": -352.12396240234375, - "logps/rejected": -403.227294921875, - "loss": 0.572, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.10884669423103333, - "rewards/grad_term": 0.013191865757107735, - "rewards/margins": 5.219181537628174, - "rewards/rejected": -5.110335350036621, - "step": 372 - }, - { - "epoch": 0.7737585893945287, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 39.59306558130555, - "learning_rate": 6.828143021914648e-07, - "logits/chosen": -0.12652695178985596, - "logits/rejected": -0.07933872938156128, - "logps/accuracies": 0.8125, - "logps/chosen": -299.42108154296875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -306.9295654296875, - "logps/ref_rejected": -296.8434143066406, - "logps/rejected": -359.88623046875, - "loss": 0.5548, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.7508491277694702, - "rewards/grad_term": 0.005514204967767, - "rewards/margins": 7.0551300048828125, - "rewards/rejected": -6.304280757904053, - "step": 373 - }, - { - "epoch": 0.7758330092052379, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 76.23798525574978, - "learning_rate": 6.816608996539792e-07, - "logits/chosen": 0.12230158597230911, - "logits/rejected": 0.12023597955703735, - "logps/accuracies": 0.75, - "logps/chosen": -281.0087890625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -282.221923828125, - "logps/ref_rejected": -290.686767578125, - "logps/rejected": -355.41357421875, - "loss": 0.603, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.1213146299123764, - "rewards/grad_term": 0.011165942065417767, - "rewards/margins": 6.593995571136475, - "rewards/rejected": -6.472680568695068, - "step": 374 - }, - { - "epoch": 0.7779074290159471, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 34.99314645928879, - "learning_rate": 6.805074971164936e-07, - "logits/chosen": 0.15410278737545013, - "logits/rejected": 0.2643253803253174, - "logps/accuracies": 0.875, - "logps/chosen": -305.5322265625, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -313.14080810546875, - "logps/ref_rejected": -341.4920654296875, - "logps/rejected": -399.58892822265625, - "loss": 0.6245, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.7608582377433777, - "rewards/grad_term": 0.008731910958886147, - "rewards/margins": 6.570548057556152, - "rewards/rejected": -5.809689998626709, - "step": 375 - }, - { - "epoch": 0.7799818488266563, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 54.1532317962664, - "learning_rate": 6.79354094579008e-07, - "logits/chosen": 0.3877769708633423, - "logits/rejected": 0.35827726125717163, - "logps/accuracies": 0.6875, - "logps/chosen": -262.630126953125, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -267.2193298339844, - "logps/ref_rejected": -227.17079162597656, - "logps/rejected": -268.9840087890625, - "loss": 0.6305, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.45892155170440674, - "rewards/grad_term": 0.014158925041556358, - "rewards/margins": 4.640246391296387, - "rewards/rejected": -4.1813249588012695, - "step": 376 - }, - { - "epoch": 0.7820562686373654, - "flips/correct->correct": 0.1875, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 29.1706749528385, - "learning_rate": 6.782006920415224e-07, - "logits/chosen": 0.33914873003959656, - "logits/rejected": 0.2858618199825287, - "logps/accuracies": 0.6875, - "logps/chosen": -333.1984558105469, - "logps/ref_accuracies": 0.1875, - "logps/ref_chosen": -334.16839599609375, - "logps/ref_rejected": -299.3583679199219, - "logps/rejected": -365.6182556152344, - "loss": 0.6119, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.0969928503036499, - "rewards/grad_term": 0.011358851566910744, - "rewards/margins": 6.722982883453369, - "rewards/rejected": -6.62598991394043, - "step": 377 - }, - { - "epoch": 0.7841306884480747, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 84.54685393998759, - "learning_rate": 6.770472895040369e-07, - "logits/chosen": 0.24667781591415405, - "logits/rejected": 0.2804810106754303, - "logps/accuracies": 0.6875, - "logps/chosen": -298.77734375, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -290.1251220703125, - "logps/ref_rejected": -294.4027099609375, - "logps/rejected": -353.4877014160156, - "loss": 0.5833, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.8652223348617554, - "rewards/grad_term": 0.013081303797662258, - "rewards/margins": 5.043279647827148, - "rewards/rejected": -5.908501625061035, - "step": 378 - }, - { - "epoch": 0.7862051082587839, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 70.28746618838973, - "learning_rate": 6.758938869665512e-07, - "logits/chosen": 0.13181552290916443, - "logits/rejected": 0.20094197988510132, - "logps/accuracies": 0.875, - "logps/chosen": -299.0951843261719, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -298.9969177246094, - "logps/ref_rejected": -297.56201171875, - "logps/rejected": -363.7420349121094, - "loss": 0.5688, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.009829364717006683, - "rewards/grad_term": 0.010300719179213047, - "rewards/margins": 6.60817289352417, - "rewards/rejected": -6.618002414703369, - "step": 379 - }, - { - "epoch": 0.7882795280694931, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 36.661344739750994, - "learning_rate": 6.747404844290657e-07, - "logits/chosen": 0.11381202936172485, - "logits/rejected": 0.27916306257247925, - "logps/accuracies": 0.75, - "logps/chosen": -311.8406982421875, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -300.5757141113281, - "logps/ref_rejected": -303.3941345214844, - "logps/rejected": -374.7430419921875, - "loss": 0.5351, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.1265006065368652, - "rewards/grad_term": 0.013736705295741558, - "rewards/margins": 6.008389949798584, - "rewards/rejected": -7.134890556335449, - "step": 380 - }, - { - "epoch": 0.7903539478802023, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 52.40416785212294, - "learning_rate": 6.735870818915801e-07, - "logits/chosen": 0.28354066610336304, - "logits/rejected": 0.3793669044971466, - "logps/accuracies": 0.9375, - "logps/chosen": -251.4017333984375, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -241.32391357421875, - "logps/ref_rejected": -268.19512939453125, - "logps/rejected": -346.08892822265625, - "loss": 0.5578, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.0077815055847168, - "rewards/grad_term": 0.0052658445201814175, - "rewards/margins": 6.781601905822754, - "rewards/rejected": -7.789383411407471, - "step": 381 - }, - { - "epoch": 0.7924283676909114, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 30.49757789797264, - "learning_rate": 6.724336793540946e-07, - "logits/chosen": 0.4262790381908417, - "logits/rejected": 0.44936031103134155, - "logps/accuracies": 0.875, - "logps/chosen": -276.0723571777344, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -259.67919921875, - "logps/ref_rejected": -269.11407470703125, - "logps/rejected": -344.3211364746094, - "loss": 0.5833, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.639316201210022, - "rewards/grad_term": 0.017718428745865822, - "rewards/margins": 5.881390571594238, - "rewards/rejected": -7.520707130432129, - "step": 382 - }, - { - "epoch": 0.7945027875016206, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 32.850204291988575, - "learning_rate": 6.71280276816609e-07, - "logits/chosen": 0.41091158986091614, - "logits/rejected": 0.46820542216300964, - "logps/accuracies": 0.75, - "logps/chosen": -324.7238464355469, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -312.0603942871094, - "logps/ref_rejected": -325.2768859863281, - "logps/rejected": -401.4980773925781, - "loss": 0.6727, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.2663447856903076, - "rewards/grad_term": 0.01295191328972578, - "rewards/margins": 6.35577392578125, - "rewards/rejected": -7.622118949890137, - "step": 383 - }, - { - "epoch": 0.7965772073123298, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.4375, - "grad_norm": 40.68382714512231, - "learning_rate": 6.701268742791234e-07, - "logits/chosen": -0.04504679515957832, - "logits/rejected": -0.05939174070954323, - "logps/accuracies": 0.5, - "logps/chosen": -364.9970703125, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -330.1788635253906, - "logps/ref_rejected": -310.0872497558594, - "logps/rejected": -393.5850830078125, - "loss": 0.6752, - "rewards/accuracies": 0.9375, - "rewards/chosen": -3.481823444366455, - "rewards/grad_term": 0.014739114791154861, - "rewards/margins": 4.8679633140563965, - "rewards/rejected": -8.349786758422852, - "step": 384 - }, - { - "epoch": 0.7965772073123298, - "eval_flips/correct->correct": 0.4433497488498688, - "eval_flips/correct->incorrect": 0.0, - "eval_flips/incorrect->correct": 0.3497537076473236, - "eval_flips/incorrect->incorrect": 0.2068965584039688, - "eval_logits/chosen": 0.1350509524345398, - "eval_logits/rejected": 0.17706024646759033, - "eval_logps/accuracies": 0.7931034564971924, - "eval_logps/chosen": -310.3870544433594, - "eval_logps/ref_accuracies": 0.4433497488498688, - "eval_logps/ref_chosen": -287.3511047363281, - "eval_logps/ref_rejected": -289.0460205078125, - "eval_logps/rejected": -369.9229431152344, - "eval_loss": 0.6723487973213196, - "eval_rewards/accuracies": 0.9261083602905273, - "eval_rewards/chosen": -2.3035953044891357, - "eval_rewards/grad_term": 0.011555198580026627, - "eval_rewards/margins": 5.784095287322998, - "eval_rewards/rejected": -8.087691307067871, - "eval_runtime": 804.6111, - "eval_samples_per_second": 2.011, - "eval_steps_per_second": 0.252, - "step": 384 - }, - { - "epoch": 0.798651627123039, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 50.762610162394786, - "learning_rate": 6.689734717416378e-07, - "logits/chosen": -0.012154202908277512, - "logits/rejected": 0.0032455138862133026, - "logps/accuracies": 0.8125, - "logps/chosen": -352.6640930175781, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -328.68597412109375, - "logps/ref_rejected": -300.6054992675781, - "logps/rejected": -397.1305847167969, - "loss": 0.6399, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.3978145122528076, - "rewards/grad_term": 0.0043524750508368015, - "rewards/margins": 7.254694938659668, - "rewards/rejected": -9.652509689331055, - "step": 385 - }, - { - "epoch": 0.8007260469337483, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 95.05673688341686, - "learning_rate": 6.678200692041522e-07, - "logits/chosen": 0.21177135407924652, - "logits/rejected": 0.23154297471046448, - "logps/accuracies": 0.875, - "logps/chosen": -336.6234130859375, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -302.63336181640625, - "logps/ref_rejected": -325.64202880859375, - "logps/rejected": -405.45782470703125, - "loss": 0.6303, - "rewards/accuracies": 0.9375, - "rewards/chosen": -3.399005174636841, - "rewards/grad_term": 0.015561016276478767, - "rewards/margins": 4.582573413848877, - "rewards/rejected": -7.981578826904297, - "step": 386 - }, - { - "epoch": 0.8028004667444574, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.4375, - "grad_norm": 98.7595748405997, - "learning_rate": 6.666666666666666e-07, - "logits/chosen": -0.1825534999370575, - "logits/rejected": -0.13728323578834534, - "logps/accuracies": 0.5625, - "logps/chosen": -279.4841613769531, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -266.027099609375, - "logps/ref_rejected": -238.2875518798828, - "logps/rejected": -310.189453125, - "loss": 0.6502, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.345707893371582, - "rewards/grad_term": 0.013503390364348888, - "rewards/margins": 5.844482421875, - "rewards/rejected": -7.19019079208374, - "step": 387 - }, - { - "epoch": 0.8048748865551666, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 58.11377216377975, - "learning_rate": 6.655132641291811e-07, - "logits/chosen": 0.20901203155517578, - "logits/rejected": 0.19806969165802002, - "logps/accuracies": 0.9375, - "logps/chosen": -327.6300354003906, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -298.32843017578125, - "logps/ref_rejected": -294.7974853515625, - "logps/rejected": -393.97967529296875, - "loss": 0.6463, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.9301586151123047, - "rewards/grad_term": 0.0037229093722999096, - "rewards/margins": 6.988059997558594, - "rewards/rejected": -9.918218612670898, - "step": 388 - }, - { - "epoch": 0.8069493063658758, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 23.205827934491982, - "learning_rate": 6.643598615916954e-07, - "logits/chosen": 0.18306072056293488, - "logits/rejected": 0.23532596230506897, - "logps/accuracies": 0.875, - "logps/chosen": -248.78665161132812, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -254.37738037109375, - "logps/ref_rejected": -251.6569061279297, - "logps/rejected": -319.8973388671875, - "loss": 0.5127, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5590727925300598, - "rewards/grad_term": 0.001746954396367073, - "rewards/margins": 7.383120536804199, - "rewards/rejected": -6.824047088623047, - "step": 389 - }, - { - "epoch": 0.809023726176585, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 46.61067138619315, - "learning_rate": 6.632064590542099e-07, - "logits/chosen": 0.16985514760017395, - "logits/rejected": 0.16642533242702484, - "logps/accuracies": 0.9375, - "logps/chosen": -340.10723876953125, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -344.4974670410156, - "logps/ref_rejected": -371.35791015625, - "logps/rejected": -436.1505432128906, - "loss": 0.5613, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.4390239119529724, - "rewards/grad_term": 0.007462616544216871, - "rewards/margins": 6.918284893035889, - "rewards/rejected": -6.47926139831543, - "step": 390 - }, - { - "epoch": 0.8110981459872941, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 57.80966589693944, - "learning_rate": 6.620530565167242e-07, - "logits/chosen": -0.10273560136556625, - "logits/rejected": -0.0613471083343029, - "logps/accuracies": 0.75, - "logps/chosen": -216.7875518798828, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -226.03538513183594, - "logps/ref_rejected": -221.71621704101562, - "logps/rejected": -261.552978515625, - "loss": 0.5161, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.9247859120368958, - "rewards/grad_term": 0.0170612595975399, - "rewards/margins": 4.9084649085998535, - "rewards/rejected": -3.9836790561676025, - "step": 391 - }, - { - "epoch": 0.8131725657980033, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 62.6241872293078, - "learning_rate": 6.608996539792387e-07, - "logits/chosen": 0.22374431788921356, - "logits/rejected": 0.22435928881168365, - "logps/accuracies": 0.625, - "logps/chosen": -285.15252685546875, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -289.751220703125, - "logps/ref_rejected": -287.3389587402344, - "logps/rejected": -332.70794677734375, - "loss": 0.5729, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.4598711133003235, - "rewards/grad_term": 0.01684059388935566, - "rewards/margins": 4.996764659881592, - "rewards/rejected": -4.536893844604492, - "step": 392 - }, - { - "epoch": 0.8152469856087126, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 104.53799642819773, - "learning_rate": 6.597462514417531e-07, - "logits/chosen": 0.1355782002210617, - "logits/rejected": 0.15115031599998474, - "logps/accuracies": 0.6875, - "logps/chosen": -267.6117858886719, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -278.65167236328125, - "logps/ref_rejected": -268.8196105957031, - "logps/rejected": -310.05419921875, - "loss": 0.655, - "rewards/accuracies": 0.875, - "rewards/chosen": 1.1039892435073853, - "rewards/grad_term": 0.0200988557189703, - "rewards/margins": 5.227451801300049, - "rewards/rejected": -4.123462677001953, - "step": 393 - }, - { - "epoch": 0.8173214054194218, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5625, - "grad_norm": 91.5589589431653, - "learning_rate": 6.585928489042676e-07, - "logits/chosen": 0.05219127982854843, - "logits/rejected": 0.1293550282716751, - "logps/accuracies": 0.4375, - "logps/chosen": -287.01702880859375, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -297.4679870605469, - "logps/ref_rejected": -308.44219970703125, - "logps/rejected": -331.50775146484375, - "loss": 0.6269, - "rewards/accuracies": 0.8125, - "rewards/chosen": 1.0450924634933472, - "rewards/grad_term": 0.0215632114559412, - "rewards/margins": 3.3516530990600586, - "rewards/rejected": -2.306560516357422, - "step": 394 - }, - { - "epoch": 0.819395825230131, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 80.60637705363116, - "learning_rate": 6.57439446366782e-07, - "logits/chosen": -0.04064434394240379, - "logits/rejected": -0.011088773608207703, - "logps/accuracies": 0.75, - "logps/chosen": -245.4797821044922, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -250.2657928466797, - "logps/ref_rejected": -278.16424560546875, - "logps/rejected": -307.46978759765625, - "loss": 0.6575, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.4786025285720825, - "rewards/grad_term": 0.023240692913532257, - "rewards/margins": 3.4091572761535645, - "rewards/rejected": -2.9305543899536133, - "step": 395 - }, - { - "epoch": 0.8214702450408401, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 37.32684776835478, - "learning_rate": 6.562860438292964e-07, - "logits/chosen": 0.10297183692455292, - "logits/rejected": 0.11840492486953735, - "logps/accuracies": 0.625, - "logps/chosen": -298.5022888183594, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -309.21044921875, - "logps/ref_rejected": -305.84539794921875, - "logps/rejected": -328.775390625, - "loss": 0.6662, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.0708208084106445, - "rewards/grad_term": 0.022083457559347153, - "rewards/margins": 3.363819122314453, - "rewards/rejected": -2.2929983139038086, - "step": 396 - }, - { - "epoch": 0.8235446648515493, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 23.575297499080293, - "learning_rate": 6.551326412918108e-07, - "logits/chosen": 0.11109241843223572, - "logits/rejected": 0.11409325897693634, - "logps/accuracies": 0.9375, - "logps/chosen": -276.38092041015625, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -285.5079345703125, - "logps/ref_rejected": -288.4066162109375, - "logps/rejected": -335.4505920410156, - "loss": 0.5911, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.9127010107040405, - "rewards/grad_term": 0.014403178356587887, - "rewards/margins": 5.617100715637207, - "rewards/rejected": -4.704399585723877, - "step": 397 - }, - { - "epoch": 0.8256190846622585, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 46.15593222404285, - "learning_rate": 6.539792387543253e-07, - "logits/chosen": 0.09000806510448456, - "logits/rejected": 0.09876266866922379, - "logps/accuracies": 0.75, - "logps/chosen": -270.1310119628906, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -274.2320556640625, - "logps/ref_rejected": -265.1705322265625, - "logps/rejected": -315.0960693359375, - "loss": 0.5498, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.41010797023773193, - "rewards/grad_term": 0.010970378294587135, - "rewards/margins": 5.402661323547363, - "rewards/rejected": -4.992552757263184, - "step": 398 - }, - { - "epoch": 0.8276935044729677, - "flips/correct->correct": 0.6875, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 44.74586930025993, - "learning_rate": 6.528258362168396e-07, - "logits/chosen": 0.26735085248947144, - "logits/rejected": 0.30994755029678345, - "logps/accuracies": 0.875, - "logps/chosen": -246.11720275878906, - "logps/ref_accuracies": 0.6875, - "logps/ref_chosen": -253.5211181640625, - "logps/ref_rejected": -260.9595031738281, - "logps/rejected": -306.35498046875, - "loss": 0.5491, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.7403918504714966, - "rewards/grad_term": 0.01072466466575861, - "rewards/margins": 5.279941558837891, - "rewards/rejected": -4.539549827575684, - "step": 399 - }, - { - "epoch": 0.829767924283677, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 52.75963999398655, - "learning_rate": 6.516724336793541e-07, - "logits/chosen": 0.40581169724464417, - "logits/rejected": 0.43723931908607483, - "logps/accuracies": 0.8125, - "logps/chosen": -302.2900085449219, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -299.8786926269531, - "logps/ref_rejected": -311.3181457519531, - "logps/rejected": -364.59326171875, - "loss": 0.5293, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.24113261699676514, - "rewards/grad_term": 0.017243320122361183, - "rewards/margins": 5.086377143859863, - "rewards/rejected": -5.32750940322876, - "step": 400 - }, - { - "epoch": 0.8318423440943861, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 75.29899473684678, - "learning_rate": 6.505190311418684e-07, - "logits/chosen": -0.09569695591926575, - "logits/rejected": -0.0767926424741745, - "logps/accuracies": 0.75, - "logps/chosen": -301.2300109863281, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -292.5848388671875, - "logps/ref_rejected": -290.6342468261719, - "logps/rejected": -361.4559326171875, - "loss": 0.5411, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.8645120859146118, - "rewards/grad_term": 0.014016557484865189, - "rewards/margins": 6.21765661239624, - "rewards/rejected": -7.0821685791015625, - "step": 401 - }, - { - "epoch": 0.8339167639050953, - "flips/correct->correct": 0.1875, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.625, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 16.000137065905, - "learning_rate": 6.493656286043829e-07, - "logits/chosen": 0.13318368792533875, - "logits/rejected": 0.1401294469833374, - "logps/accuracies": 0.8125, - "logps/chosen": -306.74395751953125, - "logps/ref_accuracies": 0.1875, - "logps/ref_chosen": -315.7019958496094, - "logps/ref_rejected": -279.975341796875, - "logps/rejected": -362.96795654296875, - "loss": 0.5658, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.8958021998405457, - "rewards/grad_term": 0.0026785405352711678, - "rewards/margins": 9.195062637329102, - "rewards/rejected": -8.299260139465332, - "step": 402 - }, - { - "epoch": 0.8359911837158045, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 39.13628890660203, - "learning_rate": 6.482122260668973e-07, - "logits/chosen": 0.4807916283607483, - "logits/rejected": 0.6536089181900024, - "logps/accuracies": 0.8125, - "logps/chosen": -300.46356201171875, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -272.149658203125, - "logps/ref_rejected": -331.09649658203125, - "logps/rejected": -420.2705078125, - "loss": 0.5903, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.831390142440796, - "rewards/grad_term": 0.010690869763493538, - "rewards/margins": 6.086010932922363, - "rewards/rejected": -8.917401313781738, - "step": 403 - }, - { - "epoch": 0.8380656035265137, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 30.654540085811046, - "learning_rate": 6.470588235294117e-07, - "logits/chosen": 0.22139021754264832, - "logits/rejected": 0.2569182515144348, - "logps/accuracies": 0.75, - "logps/chosen": -349.45458984375, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -313.68267822265625, - "logps/ref_rejected": -315.2803955078125, - "logps/rejected": -405.5434265136719, - "loss": 0.6411, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.5771892070770264, - "rewards/grad_term": 0.005928752478212118, - "rewards/margins": 5.449113368988037, - "rewards/rejected": -9.0263032913208, - "step": 404 - }, - { - "epoch": 0.8401400233372228, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 45.0761906012067, - "learning_rate": 6.459054209919261e-07, - "logits/chosen": 0.35096606612205505, - "logits/rejected": 0.44806602597236633, - "logps/accuracies": 0.8125, - "logps/chosen": -223.75851440429688, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -213.4886474609375, - "logps/ref_rejected": -226.67689514160156, - "logps/rejected": -300.3026428222656, - "loss": 0.7055, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.026986002922058, - "rewards/grad_term": 0.006491546984761953, - "rewards/margins": 6.3355865478515625, - "rewards/rejected": -7.36257266998291, - "step": 405 - }, - { - "epoch": 0.842214443147932, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 26.11429132364704, - "learning_rate": 6.447520184544407e-07, - "logits/chosen": 0.051110029220581055, - "logits/rejected": 0.10619683563709259, - "logps/accuracies": 0.875, - "logps/chosen": -313.3158874511719, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -292.2518310546875, - "logps/ref_rejected": -291.42193603515625, - "logps/rejected": -382.94781494140625, - "loss": 0.7203, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.1064045429229736, - "rewards/grad_term": 0.006023161578923464, - "rewards/margins": 7.046186447143555, - "rewards/rejected": -9.15259075164795, - "step": 406 - }, - { - "epoch": 0.8442888629586413, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 49.70258596298242, - "learning_rate": 6.43598615916955e-07, - "logits/chosen": 0.2751619219779968, - "logits/rejected": 0.2619101107120514, - "logps/accuracies": 0.875, - "logps/chosen": -298.23443603515625, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -270.52728271484375, - "logps/ref_rejected": -262.9530029296875, - "logps/rejected": -359.8721618652344, - "loss": 0.6965, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.7707149982452393, - "rewards/grad_term": 0.008775541558861732, - "rewards/margins": 6.921198844909668, - "rewards/rejected": -9.691913604736328, - "step": 407 - }, - { - "epoch": 0.8463632827693505, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.375, - "grad_norm": 76.86823523264724, - "learning_rate": 6.424452133794695e-07, - "logits/chosen": 0.10307708382606506, - "logits/rejected": 0.0942949503660202, - "logps/accuracies": 0.625, - "logps/chosen": -341.54669189453125, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -313.2756652832031, - "logps/ref_rejected": -308.5162658691406, - "logps/rejected": -385.50421142578125, - "loss": 0.7027, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.827104330062866, - "rewards/grad_term": 0.01655164361000061, - "rewards/margins": 4.871689796447754, - "rewards/rejected": -7.698794364929199, - "step": 408 - }, - { - "epoch": 0.8484377025800597, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 59.950272206470274, - "learning_rate": 6.412918108419838e-07, - "logits/chosen": 0.019011177122592926, - "logits/rejected": 0.09038500487804413, - "logps/accuracies": 0.9375, - "logps/chosen": -308.01776123046875, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -263.8279113769531, - "logps/ref_rejected": -277.70489501953125, - "logps/rejected": -373.0634765625, - "loss": 0.6458, - "rewards/accuracies": 0.9375, - "rewards/chosen": -4.418985366821289, - "rewards/grad_term": 0.01191724929958582, - "rewards/margins": 5.116873741149902, - "rewards/rejected": -9.535858154296875, - "step": 409 - }, - { - "epoch": 0.8505121223907688, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5625, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 60.54011532087722, - "learning_rate": 6.401384083044983e-07, - "logits/chosen": 0.105913445353508, - "logits/rejected": 0.05668123438954353, - "logps/accuracies": 0.9375, - "logps/chosen": -325.8265380859375, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -328.54022216796875, - "logps/ref_rejected": -309.3156433105469, - "logps/rejected": -414.94476318359375, - "loss": 0.5845, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.27136293053627014, - "rewards/grad_term": 6.003907765261829e-05, - "rewards/margins": 10.834280014038086, - "rewards/rejected": -10.56291675567627, - "step": 410 - }, - { - "epoch": 0.852586542201478, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 47.192528321568695, - "learning_rate": 6.389850057670127e-07, - "logits/chosen": 0.24323594570159912, - "logits/rejected": 0.2931632995605469, - "logps/accuracies": 0.75, - "logps/chosen": -285.09918212890625, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -271.9576721191406, - "logps/ref_rejected": -269.5434875488281, - "logps/rejected": -350.0928955078125, - "loss": 0.6445, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.3141498565673828, - "rewards/grad_term": 0.012051810510456562, - "rewards/margins": 6.740789413452148, - "rewards/rejected": -8.054939270019531, - "step": 411 - }, - { - "epoch": 0.8546609620121872, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 58.581221597188936, - "learning_rate": 6.378316032295271e-07, - "logits/chosen": -0.11936801671981812, - "logits/rejected": -0.12492658197879791, - "logps/accuracies": 0.9375, - "logps/chosen": -338.5908508300781, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -339.50506591796875, - "logps/ref_rejected": -328.51324462890625, - "logps/rejected": -415.5992126464844, - "loss": 0.5409, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.0914229154586792, - "rewards/grad_term": 0.006874611601233482, - "rewards/margins": 8.800016403198242, - "rewards/rejected": -8.708593368530273, - "step": 412 - }, - { - "epoch": 0.8567353818228964, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 31.837832566947164, - "learning_rate": 6.366782006920415e-07, - "logits/chosen": 0.43914633989334106, - "logits/rejected": 0.5609852075576782, - "logps/accuracies": 0.875, - "logps/chosen": -258.90252685546875, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -257.27178955078125, - "logps/ref_rejected": -292.46502685546875, - "logps/rejected": -361.5235900878906, - "loss": 0.5556, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.16307339072227478, - "rewards/grad_term": 0.015529593452811241, - "rewards/margins": 6.742788791656494, - "rewards/rejected": -6.905861854553223, - "step": 413 - }, - { - "epoch": 0.8588098016336057, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 130.10385493391817, - "learning_rate": 6.355247981545559e-07, - "logits/chosen": 0.33855926990509033, - "logits/rejected": 0.37952950596809387, - "logps/accuracies": 0.875, - "logps/chosen": -364.90386962890625, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -364.53131103515625, - "logps/ref_rejected": -361.88043212890625, - "logps/rejected": -423.7973937988281, - "loss": 0.5026, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.03725364804267883, - "rewards/grad_term": 0.014217305928468704, - "rewards/margins": 6.1544389724731445, - "rewards/rejected": -6.191693305969238, - "step": 414 - }, - { - "epoch": 0.8608842214443148, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0625, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 72.11770229856765, - "learning_rate": 6.343713956170703e-07, - "logits/chosen": 0.40989670157432556, - "logits/rejected": 0.4901154935359955, - "logps/accuracies": 0.6875, - "logps/chosen": -218.7454833984375, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -221.65101623535156, - "logps/ref_rejected": -227.98141479492188, - "logps/rejected": -275.6513366699219, - "loss": 0.5736, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.29055318236351013, - "rewards/grad_term": 0.013483730144798756, - "rewards/margins": 5.057545185089111, - "rewards/rejected": -4.766992092132568, - "step": 415 - }, - { - "epoch": 0.862958641255024, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 51.89358785836724, - "learning_rate": 6.332179930795848e-07, - "logits/chosen": -0.05566471815109253, - "logits/rejected": 0.030616842210292816, - "logps/accuracies": 0.6875, - "logps/chosen": -316.4223327636719, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -327.1333312988281, - "logps/ref_rejected": -320.0599670410156, - "logps/rejected": -366.65240478515625, - "loss": 0.6136, - "rewards/accuracies": 0.8125, - "rewards/chosen": 1.0711019039154053, - "rewards/grad_term": 0.016874371096491814, - "rewards/margins": 5.730344295501709, - "rewards/rejected": -4.659242153167725, - "step": 416 - }, - { - "epoch": 0.8650330610657332, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 38.31866291679492, - "learning_rate": 6.320645905420991e-07, - "logits/chosen": 0.1870647817850113, - "logits/rejected": 0.18267706036567688, - "logps/accuracies": 0.75, - "logps/chosen": -360.9079895019531, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -376.5132751464844, - "logps/ref_rejected": -365.2701721191406, - "logps/rejected": -400.93572998046875, - "loss": 0.591, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.56052565574646, - "rewards/grad_term": 0.011596291325986385, - "rewards/margins": 5.127077579498291, - "rewards/rejected": -3.5665524005889893, - "step": 417 - }, - { - "epoch": 0.8671074808764424, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 92.98916391894235, - "learning_rate": 6.309111880046136e-07, - "logits/chosen": 0.03672199696302414, - "logits/rejected": 0.061092860996723175, - "logps/accuracies": 0.8125, - "logps/chosen": -270.1746520996094, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -281.4916076660156, - "logps/ref_rejected": -294.692626953125, - "logps/rejected": -329.16668701171875, - "loss": 0.6474, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.1316967010498047, - "rewards/grad_term": 0.014774792827665806, - "rewards/margins": 4.579105377197266, - "rewards/rejected": -3.447408437728882, - "step": 418 - }, - { - "epoch": 0.8691819006871515, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 23.063566639192924, - "learning_rate": 6.29757785467128e-07, - "logits/chosen": 0.3395993113517761, - "logits/rejected": 0.38536539673805237, - "logps/accuracies": 0.8125, - "logps/chosen": -286.3059997558594, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -296.46038818359375, - "logps/ref_rejected": -286.7039489746094, - "logps/rejected": -349.8249206542969, - "loss": 0.6349, - "rewards/accuracies": 0.875, - "rewards/chosen": 1.0154372453689575, - "rewards/grad_term": 0.009252113290131092, - "rewards/margins": 7.3275322914123535, - "rewards/rejected": -6.312095642089844, - "step": 419 - }, - { - "epoch": 0.8712563204978607, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 44.948117340095266, - "learning_rate": 6.286043829296425e-07, - "logits/chosen": -0.0751362144947052, - "logits/rejected": -0.003658019006252289, - "logps/accuracies": 0.75, - "logps/chosen": -268.517333984375, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -265.34674072265625, - "logps/ref_rejected": -274.8442077636719, - "logps/rejected": -334.3121643066406, - "loss": 0.6457, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.3170568645000458, - "rewards/grad_term": 0.017749693244695663, - "rewards/margins": 5.629739284515381, - "rewards/rejected": -5.94679594039917, - "step": 420 - }, - { - "epoch": 0.8733307403085699, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 34.9886652815262, - "learning_rate": 6.274509803921569e-07, - "logits/chosen": 0.3326599597930908, - "logits/rejected": 0.3828299343585968, - "logps/accuracies": 0.875, - "logps/chosen": -316.2264404296875, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -327.22039794921875, - "logps/ref_rejected": -343.5686950683594, - "logps/rejected": -394.07080078125, - "loss": 0.5868, - "rewards/accuracies": 0.875, - "rewards/chosen": 1.0993949174880981, - "rewards/grad_term": 0.012708110734820366, - "rewards/margins": 6.149601936340332, - "rewards/rejected": -5.050206661224365, - "step": 421 - }, - { - "epoch": 0.8754051601192792, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5625, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 17.44095726007292, - "learning_rate": 6.262975778546713e-07, - "logits/chosen": 0.06078142672777176, - "logits/rejected": -0.015550296753644943, - "logps/accuracies": 0.9375, - "logps/chosen": -322.6217346191406, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -336.0628356933594, - "logps/ref_rejected": -319.92767333984375, - "logps/rejected": -387.3755798339844, - "loss": 0.5532, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.344112515449524, - "rewards/grad_term": 0.007527807727456093, - "rewards/margins": 8.088907241821289, - "rewards/rejected": -6.7447943687438965, - "step": 422 - }, - { - "epoch": 0.8774795799299884, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.625, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 65.08075817623737, - "learning_rate": 6.251441753171857e-07, - "logits/chosen": 0.06982388347387314, - "logits/rejected": 0.007165290415287018, - "logps/accuracies": 1.0, - "logps/chosen": -300.95513916015625, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -306.9526062011719, - "logps/ref_rejected": -288.5630798339844, - "logps/rejected": -370.71649169921875, - "loss": 0.5573, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.5997495651245117, - "rewards/grad_term": 0.006430043373256922, - "rewards/margins": 8.815089225769043, - "rewards/rejected": -8.215339660644531, - "step": 423 - }, - { - "epoch": 0.8795539997406975, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 24.805163857445233, - "learning_rate": 6.239907727797001e-07, - "logits/chosen": 0.22340461611747742, - "logits/rejected": 0.21236705780029297, - "logps/accuracies": 0.75, - "logps/chosen": -313.9115295410156, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -315.0428161621094, - "logps/ref_rejected": -296.20806884765625, - "logps/rejected": -360.9374694824219, - "loss": 0.5634, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.11312799155712128, - "rewards/grad_term": 0.00796779990196228, - "rewards/margins": 6.586068153381348, - "rewards/rejected": -6.472940444946289, - "step": 424 - }, - { - "epoch": 0.8816284195514067, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 76.72911496113723, - "learning_rate": 6.228373702422145e-07, - "logits/chosen": 0.14682908356189728, - "logits/rejected": 0.15023761987686157, - "logps/accuracies": 0.75, - "logps/chosen": -277.3023681640625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -261.1825256347656, - "logps/ref_rejected": -256.98846435546875, - "logps/rejected": -323.0633239746094, - "loss": 0.5797, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.6119821071624756, - "rewards/grad_term": 0.017880305647850037, - "rewards/margins": 4.995503902435303, - "rewards/rejected": -6.607484817504883, - "step": 425 - }, - { - "epoch": 0.8837028393621159, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 59.32877504186551, - "learning_rate": 6.21683967704729e-07, - "logits/chosen": 0.11907504498958588, - "logits/rejected": 0.10917734354734421, - "logps/accuracies": 0.8125, - "logps/chosen": -280.1270446777344, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -282.9510192871094, - "logps/ref_rejected": -269.74761962890625, - "logps/rejected": -354.8128662109375, - "loss": 0.535, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.28239575028419495, - "rewards/grad_term": 0.011113264597952366, - "rewards/margins": 8.788921356201172, - "rewards/rejected": -8.506525993347168, - "step": 426 - }, - { - "epoch": 0.8857772591728251, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 127.10327511506759, - "learning_rate": 6.205305651672433e-07, - "logits/chosen": 0.17048220336437225, - "logits/rejected": 0.16041475534439087, - "logps/accuracies": 0.9375, - "logps/chosen": -294.7939758300781, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -294.1711730957031, - "logps/ref_rejected": -291.6072082519531, - "logps/rejected": -370.58660888671875, - "loss": 0.5691, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.062279731035232544, - "rewards/grad_term": 0.0018367553129792213, - "rewards/margins": 7.835660934448242, - "rewards/rejected": -7.8979411125183105, - "step": 427 - }, - { - "epoch": 0.8878516789835343, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 54.660911786291244, - "learning_rate": 6.193771626297578e-07, - "logits/chosen": 0.036782991141080856, - "logits/rejected": 0.06632021814584732, - "logps/accuracies": 0.8125, - "logps/chosen": -297.9986572265625, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -289.60797119140625, - "logps/ref_rejected": -254.0615234375, - "logps/rejected": -336.6078796386719, - "loss": 0.5262, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.8390707969665527, - "rewards/grad_term": 0.00769506860524416, - "rewards/margins": 7.415563583374023, - "rewards/rejected": -8.254634857177734, - "step": 428 - }, - { - "epoch": 0.8899260987942434, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5625, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 50.959026978469176, - "learning_rate": 6.182237600922721e-07, - "logits/chosen": 0.2548729181289673, - "logits/rejected": 0.24063560366630554, - "logps/accuracies": 0.9375, - "logps/chosen": -358.7782287597656, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -354.93792724609375, - "logps/ref_rejected": -339.633544921875, - "logps/rejected": -428.53302001953125, - "loss": 0.5172, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3840335011482239, - "rewards/grad_term": 0.003113335929811001, - "rewards/margins": 8.505916595458984, - "rewards/rejected": -8.8899507522583, - "step": 429 - }, - { - "epoch": 0.8920005186049527, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 29.304522529635086, - "learning_rate": 6.170703575547866e-07, - "logits/chosen": 0.11386538296937943, - "logits/rejected": 0.1519451141357422, - "logps/accuracies": 0.8125, - "logps/chosen": -247.40721130371094, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -234.81825256347656, - "logps/ref_rejected": -226.5369110107422, - "logps/rejected": -298.44622802734375, - "loss": 0.6175, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.2588945627212524, - "rewards/grad_term": 0.014798032119870186, - "rewards/margins": 5.93203592300415, - "rewards/rejected": -7.1909308433532715, - "step": 430 - }, - { - "epoch": 0.8940749384156619, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 33.20014491905227, - "learning_rate": 6.159169550173011e-07, - "logits/chosen": 0.29805174469947815, - "logits/rejected": 0.33232244849205017, - "logps/accuracies": 0.8125, - "logps/chosen": -336.54693603515625, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -328.70526123046875, - "logps/ref_rejected": -330.7543029785156, - "logps/rejected": -422.9801330566406, - "loss": 0.5606, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.7841659784317017, - "rewards/grad_term": 0.006449039559811354, - "rewards/margins": 8.438421249389648, - "rewards/rejected": -9.222586631774902, - "step": 431 - }, - { - "epoch": 0.8961493582263711, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5625, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 30.864555897425223, - "learning_rate": 6.147635524798154e-07, - "logits/chosen": 0.12058807164430618, - "logits/rejected": 0.13604214787483215, - "logps/accuracies": 0.9375, - "logps/chosen": -287.39691162109375, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -291.1800842285156, - "logps/ref_rejected": -289.5397644042969, - "logps/rejected": -376.7043151855469, - "loss": 0.5595, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.37831762433052063, - "rewards/grad_term": 0.0001301583251915872, - "rewards/margins": 9.094771385192871, - "rewards/rejected": -8.716453552246094, - "step": 432 - }, - { - "epoch": 0.8982237780370802, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 48.56405217889427, - "learning_rate": 6.136101499423299e-07, - "logits/chosen": 0.3865184783935547, - "logits/rejected": 0.4680458903312683, - "logps/accuracies": 0.75, - "logps/chosen": -285.0791015625, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -283.8902587890625, - "logps/ref_rejected": -293.55670166015625, - "logps/rejected": -369.1327209472656, - "loss": 0.5467, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.1188843846321106, - "rewards/grad_term": 0.006384614389389753, - "rewards/margins": 7.438718795776367, - "rewards/rejected": -7.557602882385254, - "step": 433 - }, - { - "epoch": 0.9002981978477894, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 41.55010194963713, - "learning_rate": 6.124567474048442e-07, - "logits/chosen": 0.37142789363861084, - "logits/rejected": 0.4116554856300354, - "logps/accuracies": 0.6875, - "logps/chosen": -264.35888671875, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -255.59439086914062, - "logps/ref_rejected": -254.5562744140625, - "logps/rejected": -320.098388671875, - "loss": 0.5228, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8764491081237793, - "rewards/grad_term": 0.009346509352326393, - "rewards/margins": 5.6777663230896, - "rewards/rejected": -6.5542144775390625, - "step": 434 - }, - { - "epoch": 0.9023726176584986, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 41.02709525636894, - "learning_rate": 6.113033448673587e-07, - "logits/chosen": -0.03402477130293846, - "logits/rejected": 0.10666719824075699, - "logps/accuracies": 0.9375, - "logps/chosen": -330.32745361328125, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -331.1935729980469, - "logps/ref_rejected": -342.39508056640625, - "logps/rejected": -409.2434997558594, - "loss": 0.5445, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.08661463856697083, - "rewards/grad_term": 0.011018088087439537, - "rewards/margins": 6.771457672119141, - "rewards/rejected": -6.684843063354492, - "step": 435 - }, - { - "epoch": 0.9044470374692078, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 46.61949118855806, - "learning_rate": 6.101499423298731e-07, - "logits/chosen": 0.033953070640563965, - "logits/rejected": 0.005475502926856279, - "logps/accuracies": 0.6875, - "logps/chosen": -304.1283874511719, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -310.3152160644531, - "logps/ref_rejected": -276.940185546875, - "logps/rejected": -349.3773193359375, - "loss": 0.5265, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.6186816692352295, - "rewards/grad_term": 0.0051316795870661736, - "rewards/margins": 7.862398147583008, - "rewards/rejected": -7.243716239929199, - "step": 436 - }, - { - "epoch": 0.9065214572799171, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 55.50940525398979, - "learning_rate": 6.089965397923875e-07, - "logits/chosen": 0.14566786587238312, - "logits/rejected": 0.14087940752506256, - "logps/accuracies": 0.875, - "logps/chosen": -307.6397705078125, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -306.0394592285156, - "logps/ref_rejected": -290.89068603515625, - "logps/rejected": -374.4580078125, - "loss": 0.5268, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.1600308120250702, - "rewards/grad_term": 0.003625539131462574, - "rewards/margins": 8.196700096130371, - "rewards/rejected": -8.356730461120605, - "step": 437 - }, - { - "epoch": 0.9085958770906262, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5625, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 36.429819834561926, - "learning_rate": 6.078431372549019e-07, - "logits/chosen": 0.17382624745368958, - "logits/rejected": 0.17122478783130646, - "logps/accuracies": 0.9375, - "logps/chosen": -268.6485595703125, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -274.4229736328125, - "logps/ref_rejected": -260.6642761230469, - "logps/rejected": -344.1253662109375, - "loss": 0.5025, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5774391889572144, - "rewards/grad_term": 0.0027016454841941595, - "rewards/margins": 8.923548698425293, - "rewards/rejected": -8.346110343933105, - "step": 438 - }, - { - "epoch": 0.9106702969013354, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 33.56713984591113, - "learning_rate": 6.066897347174163e-07, - "logits/chosen": 0.09694240987300873, - "logits/rejected": 0.22976186871528625, - "logps/accuracies": 0.9375, - "logps/chosen": -258.59161376953125, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -264.9013977050781, - "logps/ref_rejected": -288.00958251953125, - "logps/rejected": -353.9190673828125, - "loss": 0.5424, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.6309794187545776, - "rewards/grad_term": 0.008919828571379185, - "rewards/margins": 7.221925258636475, - "rewards/rejected": -6.590945720672607, - "step": 439 - }, - { - "epoch": 0.9127447167120446, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.125, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 22.84688441463557, - "learning_rate": 6.055363321799307e-07, - "logits/chosen": -0.12110434472560883, - "logits/rejected": -0.06775850802659988, - "logps/accuracies": 0.625, - "logps/chosen": -252.79736328125, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -241.74078369140625, - "logps/ref_rejected": -250.46945190429688, - "logps/rejected": -318.03790283203125, - "loss": 0.5855, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.1056597232818604, - "rewards/grad_term": 0.011472761631011963, - "rewards/margins": 5.651185989379883, - "rewards/rejected": -6.756845951080322, - "step": 440 - }, - { - "epoch": 0.9148191365227538, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 37.379916915616754, - "learning_rate": 6.043829296424452e-07, - "logits/chosen": 0.10848057270050049, - "logits/rejected": 0.11391180008649826, - "logps/accuracies": 0.8125, - "logps/chosen": -311.4754943847656, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -313.4561462402344, - "logps/ref_rejected": -308.49609375, - "logps/rejected": -389.3127136230469, - "loss": 0.5174, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.19806542992591858, - "rewards/grad_term": 0.007313254754990339, - "rewards/margins": 8.279730796813965, - "rewards/rejected": -8.0816650390625, - "step": 441 - }, - { - "epoch": 0.916893556333463, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 29.537088633715857, - "learning_rate": 6.032295271049595e-07, - "logits/chosen": -0.04151641204953194, - "logits/rejected": -0.03607035428285599, - "logps/accuracies": 0.875, - "logps/chosen": -309.4384765625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -318.02105712890625, - "logps/ref_rejected": -330.68695068359375, - "logps/rejected": -403.2144775390625, - "loss": 0.5329, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.8582614064216614, - "rewards/grad_term": 0.007493661250919104, - "rewards/margins": 8.111011505126953, - "rewards/rejected": -7.252751350402832, - "step": 442 - }, - { - "epoch": 0.9189679761441721, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 27.777171215232052, - "learning_rate": 6.02076124567474e-07, - "logits/chosen": 0.2931877374649048, - "logits/rejected": 0.2972795367240906, - "logps/accuracies": 0.75, - "logps/chosen": -301.7099609375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -301.0013122558594, - "logps/ref_rejected": -281.2795104980469, - "logps/rejected": -338.0366516113281, - "loss": 0.564, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.07086822390556335, - "rewards/grad_term": 0.012654304504394531, - "rewards/margins": 5.604846954345703, - "rewards/rejected": -5.67571496963501, - "step": 443 - }, - { - "epoch": 0.9210423959548814, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 26.705457998840874, - "learning_rate": 6.009227220299884e-07, - "logits/chosen": 0.24231280386447906, - "logits/rejected": 0.25109824538230896, - "logps/accuracies": 0.8125, - "logps/chosen": -255.10946655273438, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -262.9153747558594, - "logps/ref_rejected": -290.76556396484375, - "logps/rejected": -346.1953430175781, - "loss": 0.5406, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.7805902361869812, - "rewards/grad_term": 0.00831932295113802, - "rewards/margins": 6.32357120513916, - "rewards/rejected": -5.542980670928955, - "step": 444 - }, - { - "epoch": 0.9231168157655906, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 45.863702043985306, - "learning_rate": 5.997693194925029e-07, - "logits/chosen": 0.30657637119293213, - "logits/rejected": 0.40025636553764343, - "logps/accuracies": 0.6875, - "logps/chosen": -378.9405517578125, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -375.9232482910156, - "logps/ref_rejected": -406.2908935546875, - "logps/rejected": -476.7547912597656, - "loss": 0.4917, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.30173051357269287, - "rewards/grad_term": 0.007800333667546511, - "rewards/margins": 6.744661808013916, - "rewards/rejected": -7.046392440795898, - "step": 445 - }, - { - "epoch": 0.9251912355762998, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5625, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 58.839145890605785, - "learning_rate": 5.986159169550173e-07, - "logits/chosen": 0.2454492151737213, - "logits/rejected": 0.2175568789243698, - "logps/accuracies": 0.875, - "logps/chosen": -282.2199401855469, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -279.5404968261719, - "logps/ref_rejected": -255.2272186279297, - "logps/rejected": -328.5665283203125, - "loss": 0.4951, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.2679447829723358, - "rewards/grad_term": 0.009870468638837337, - "rewards/margins": 7.065983295440674, - "rewards/rejected": -7.333928108215332, - "step": 446 - }, - { - "epoch": 0.9272656553870089, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 45.27476270388639, - "learning_rate": 5.974625144175317e-07, - "logits/chosen": 0.25637272000312805, - "logits/rejected": 0.284266859292984, - "logps/accuracies": 0.8125, - "logps/chosen": -308.41693115234375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -299.62353515625, - "logps/ref_rejected": -297.4239807128906, - "logps/rejected": -373.3579406738281, - "loss": 0.5676, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.8793376088142395, - "rewards/grad_term": 0.01210443302989006, - "rewards/margins": 6.714059829711914, - "rewards/rejected": -7.5933966636657715, - "step": 447 - }, - { - "epoch": 0.9293400751977181, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 24.468747270838374, - "learning_rate": 5.963091118800461e-07, - "logits/chosen": -0.047158196568489075, - "logits/rejected": -0.004086131229996681, - "logps/accuracies": 0.8125, - "logps/chosen": -363.8712158203125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -358.2686462402344, - "logps/ref_rejected": -370.7213439941406, - "logps/rejected": -445.895263671875, - "loss": 0.5564, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.5602600574493408, - "rewards/grad_term": 0.010198243893682957, - "rewards/margins": 6.957134246826172, - "rewards/rejected": -7.517394065856934, - "step": 448 - }, - { - "epoch": 0.9293400751977181, - "eval_flips/correct->correct": 0.4433497488498688, - "eval_flips/correct->incorrect": 0.0, - "eval_flips/incorrect->correct": 0.37438422441482544, - "eval_flips/incorrect->incorrect": 0.1822660118341446, - "eval_logits/chosen": 0.12454497069120407, - "eval_logits/rejected": 0.16565194725990295, - "eval_logps/accuracies": 0.8177340030670166, - "eval_logps/chosen": -297.1930847167969, - "eval_logps/ref_accuracies": 0.4433497488498688, - "eval_logps/ref_chosen": -287.3511047363281, - "eval_logps/ref_rejected": -289.0460205078125, - "eval_logps/rejected": -360.1458740234375, - "eval_loss": 0.5838693976402283, - "eval_rewards/accuracies": 0.9113300442695618, - "eval_rewards/chosen": -0.9841962456703186, - "eval_rewards/grad_term": 0.01190107874572277, - "eval_rewards/margins": 6.125789165496826, - "eval_rewards/rejected": -7.1099853515625, - "eval_runtime": 804.5696, - "eval_samples_per_second": 2.011, - "eval_steps_per_second": 0.252, - "step": 448 - }, - { - "epoch": 0.9314144950084273, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 42.40430094029457, - "learning_rate": 5.951557093425605e-07, - "logits/chosen": 0.2953071594238281, - "logits/rejected": 0.32737353444099426, - "logps/accuracies": 0.75, - "logps/chosen": -246.87020874023438, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -241.10800170898438, - "logps/ref_rejected": -247.88864135742188, - "logps/rejected": -314.1338806152344, - "loss": 0.5558, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.5762210488319397, - "rewards/grad_term": 0.016563208773732185, - "rewards/margins": 6.0483012199401855, - "rewards/rejected": -6.6245222091674805, - "step": 449 - }, - { - "epoch": 0.9334889148191365, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 55.56936800158885, - "learning_rate": 5.940023068050749e-07, - "logits/chosen": -0.24604183435440063, - "logits/rejected": -0.20258383452892303, - "logps/accuracies": 0.9375, - "logps/chosen": -280.2922058105469, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -271.5796813964844, - "logps/ref_rejected": -273.0106506347656, - "logps/rejected": -346.3686828613281, - "loss": 0.5873, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.8712505102157593, - "rewards/grad_term": 0.010869830846786499, - "rewards/margins": 6.4645562171936035, - "rewards/rejected": -7.335805892944336, - "step": 450 - }, - { - "epoch": 0.9355633346298458, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.625, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 104.79125724883293, - "learning_rate": 5.928489042675894e-07, - "logits/chosen": 0.24167490005493164, - "logits/rejected": 0.2851963937282562, - "logps/accuracies": 0.9375, - "logps/chosen": -315.26214599609375, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -313.26177978515625, - "logps/ref_rejected": -285.7829895019531, - "logps/rejected": -376.867919921875, - "loss": 0.5462, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.20003750920295715, - "rewards/grad_term": 0.0005122160073369741, - "rewards/margins": 8.908455848693848, - "rewards/rejected": -9.10849380493164, - "step": 451 - }, - { - "epoch": 0.9376377544405549, - "flips/correct->correct": 0.6875, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 29.758835328305054, - "learning_rate": 5.916955017301037e-07, - "logits/chosen": 0.2663220167160034, - "logits/rejected": 0.3874686658382416, - "logps/accuracies": 0.875, - "logps/chosen": -262.48992919921875, - "logps/ref_accuracies": 0.6875, - "logps/ref_chosen": -257.1368408203125, - "logps/ref_rejected": -272.6324462890625, - "logps/rejected": -335.6940612792969, - "loss": 0.5535, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.5353102684020996, - "rewards/grad_term": 0.008673110976815224, - "rewards/margins": 5.770854473114014, - "rewards/rejected": -6.3061652183532715, - "step": 452 - }, - { - "epoch": 0.9397121742512641, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 17.42639637388175, - "learning_rate": 5.905420991926182e-07, - "logits/chosen": 0.2760721743106842, - "logits/rejected": 0.3189687430858612, - "logps/accuracies": 0.75, - "logps/chosen": -275.108642578125, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -274.26934814453125, - "logps/ref_rejected": -267.088134765625, - "logps/rejected": -331.2291564941406, - "loss": 0.4576, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.0839340090751648, - "rewards/grad_term": 0.01929977536201477, - "rewards/margins": 6.330172538757324, - "rewards/rejected": -6.414106369018555, - "step": 453 - }, - { - "epoch": 0.9417865940619733, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 43.82809699832689, - "learning_rate": 5.893886966551325e-07, - "logits/chosen": 0.21667756140232086, - "logits/rejected": 0.20864138007164001, - "logps/accuracies": 0.6875, - "logps/chosen": -275.2607727050781, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -276.90924072265625, - "logps/ref_rejected": -287.8323974609375, - "logps/rejected": -336.0874328613281, - "loss": 0.5327, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.16484564542770386, - "rewards/grad_term": 0.018737200647592545, - "rewards/margins": 4.990347862243652, - "rewards/rejected": -4.825502395629883, - "step": 454 - }, - { - "epoch": 0.9438610138726825, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 38.23253448586923, - "learning_rate": 5.88235294117647e-07, - "logits/chosen": 0.11684215813875198, - "logits/rejected": 0.24031777679920197, - "logps/accuracies": 0.9375, - "logps/chosen": -313.59710693359375, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -317.1141052246094, - "logps/ref_rejected": -372.68316650390625, - "logps/rejected": -439.81866455078125, - "loss": 0.5429, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.35169944167137146, - "rewards/grad_term": 0.00929531641304493, - "rewards/margins": 7.065249919891357, - "rewards/rejected": -6.713550567626953, - "step": 455 - }, - { - "epoch": 0.9459354336833917, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5625, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 56.76327899023629, - "learning_rate": 5.870818915801614e-07, - "logits/chosen": 0.10899796336889267, - "logits/rejected": 0.17427489161491394, - "logps/accuracies": 0.9375, - "logps/chosen": -296.0990295410156, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -301.2550048828125, - "logps/ref_rejected": -292.7115173339844, - "logps/rejected": -371.61187744140625, - "loss": 0.5333, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.5155962705612183, - "rewards/grad_term": 0.00969706755131483, - "rewards/margins": 8.405632972717285, - "rewards/rejected": -7.890036582946777, - "step": 456 - }, - { - "epoch": 0.9480098534941008, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 55.72280578194114, - "learning_rate": 5.859284890426759e-07, - "logits/chosen": -0.003691728226840496, - "logits/rejected": 0.0035413503646850586, - "logps/accuracies": 0.8125, - "logps/chosen": -304.0773010253906, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -317.5519104003906, - "logps/ref_rejected": -319.302978515625, - "logps/rejected": -378.9814453125, - "loss": 0.5376, - "rewards/accuracies": 0.875, - "rewards/chosen": 1.347464919090271, - "rewards/grad_term": 0.011599891819059849, - "rewards/margins": 7.315312385559082, - "rewards/rejected": -5.9678473472595215, - "step": 457 - }, - { - "epoch": 0.95008427330481, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 63.66588058556034, - "learning_rate": 5.847750865051903e-07, - "logits/chosen": -0.25413990020751953, - "logits/rejected": -0.17748790979385376, - "logps/accuracies": 0.9375, - "logps/chosen": -318.7485656738281, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -321.310546875, - "logps/ref_rejected": -325.7279052734375, - "logps/rejected": -377.45843505859375, - "loss": 0.5057, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.2561964988708496, - "rewards/grad_term": 0.013258688151836395, - "rewards/margins": 5.429249286651611, - "rewards/rejected": -5.173052787780762, - "step": 458 - }, - { - "epoch": 0.9521586931155193, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 29.584209944027126, - "learning_rate": 5.836216839677048e-07, - "logits/chosen": -0.028666552156209946, - "logits/rejected": 0.027672436088323593, - "logps/accuracies": 0.75, - "logps/chosen": -297.3187561035156, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -300.38385009765625, - "logps/ref_rejected": -299.2121276855469, - "logps/rejected": -364.8226623535156, - "loss": 0.464, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.3065095543861389, - "rewards/grad_term": 0.009040933102369308, - "rewards/margins": 6.867563247680664, - "rewards/rejected": -6.561053276062012, - "step": 459 - }, - { - "epoch": 0.9542331129262285, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 36.21237401329387, - "learning_rate": 5.824682814302191e-07, - "logits/chosen": 0.010993116535246372, - "logits/rejected": 0.10759762674570084, - "logps/accuracies": 0.8125, - "logps/chosen": -247.78744506835938, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -243.45068359375, - "logps/ref_rejected": -238.94656372070312, - "logps/rejected": -306.70025634765625, - "loss": 0.5576, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.4336736798286438, - "rewards/grad_term": 0.010049809701740742, - "rewards/margins": 6.341697692871094, - "rewards/rejected": -6.775371551513672, - "step": 460 - }, - { - "epoch": 0.9563075327369377, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 67.07396455833435, - "learning_rate": 5.813148788927336e-07, - "logits/chosen": 0.2826724350452423, - "logits/rejected": 0.30516886711120605, - "logps/accuracies": 0.9375, - "logps/chosen": -303.901611328125, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -310.4884948730469, - "logps/ref_rejected": -321.95538330078125, - "logps/rejected": -394.9017639160156, - "loss": 0.5389, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.6586897373199463, - "rewards/grad_term": 0.007842399179935455, - "rewards/margins": 7.953330993652344, - "rewards/rejected": -7.294641971588135, - "step": 461 - }, - { - "epoch": 0.9583819525476468, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 29.22489915607507, - "learning_rate": 5.801614763552479e-07, - "logits/chosen": 0.12667125463485718, - "logits/rejected": 0.24145105481147766, - "logps/accuracies": 1.0, - "logps/chosen": -228.75924682617188, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -230.16534423828125, - "logps/ref_rejected": -276.22015380859375, - "logps/rejected": -355.9960632324219, - "loss": 0.5172, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.14060965180397034, - "rewards/grad_term": 0.003057720372453332, - "rewards/margins": 8.118200302124023, - "rewards/rejected": -7.977591037750244, - "step": 462 - }, - { - "epoch": 0.960456372358356, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 46.6463409683984, - "learning_rate": 5.790080738177624e-07, - "logits/chosen": 0.22849154472351074, - "logits/rejected": 0.2621627748012543, - "logps/accuracies": 0.875, - "logps/chosen": -352.41827392578125, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -345.7464294433594, - "logps/ref_rejected": -329.25946044921875, - "logps/rejected": -404.5802001953125, - "loss": 0.4936, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.6671811938285828, - "rewards/grad_term": 0.010429211892187595, - "rewards/margins": 6.864894866943359, - "rewards/rejected": -7.532076835632324, - "step": 463 - }, - { - "epoch": 0.9625307921690652, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 23.56747059673345, - "learning_rate": 5.778546712802767e-07, - "logits/chosen": 0.07257233560085297, - "logits/rejected": 0.10124337673187256, - "logps/accuracies": 0.9375, - "logps/chosen": -296.21209716796875, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -301.79473876953125, - "logps/ref_rejected": -295.86785888671875, - "logps/rejected": -370.70001220703125, - "loss": 0.5419, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.558262288570404, - "rewards/grad_term": 0.007962905801832676, - "rewards/margins": 8.041479110717773, - "rewards/rejected": -7.483217239379883, - "step": 464 - }, - { - "epoch": 0.9646052119797744, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 32.887206960447294, - "learning_rate": 5.767012687427912e-07, - "logits/chosen": -0.02765033021569252, - "logits/rejected": -0.04159718379378319, - "logps/accuracies": 0.875, - "logps/chosen": -301.87750244140625, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -294.6784973144531, - "logps/ref_rejected": -315.06695556640625, - "logps/rejected": -396.1573486328125, - "loss": 0.5015, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.7198995351791382, - "rewards/grad_term": 0.013939508236944675, - "rewards/margins": 7.389136791229248, - "rewards/rejected": -8.109036445617676, - "step": 465 - }, - { - "epoch": 0.9666796317904836, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 31.89130120157542, - "learning_rate": 5.755478662053056e-07, - "logits/chosen": 0.06474259495735168, - "logits/rejected": 0.1507914811372757, - "logps/accuracies": 0.75, - "logps/chosen": -362.8743896484375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -358.84747314453125, - "logps/ref_rejected": -336.5288391113281, - "logps/rejected": -405.94775390625, - "loss": 0.4804, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.4026913344860077, - "rewards/grad_term": 0.012680365703999996, - "rewards/margins": 6.539196014404297, - "rewards/rejected": -6.941887378692627, - "step": 466 - }, - { - "epoch": 0.9687540516011928, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.0625, - "grad_norm": 24.005282908584242, - "learning_rate": 5.7439446366782e-07, - "logits/chosen": 0.46883174777030945, - "logits/rejected": 0.5026016235351562, - "logps/accuracies": 0.9375, - "logps/chosen": -274.5486145019531, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -275.224853515625, - "logps/ref_rejected": -294.5133361816406, - "logps/rejected": -363.38482666015625, - "loss": 0.5302, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.06762228161096573, - "rewards/grad_term": 0.0132514713332057, - "rewards/margins": 6.954771041870117, - "rewards/rejected": -6.887148857116699, - "step": 467 - }, - { - "epoch": 0.970828471411902, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 93.90250048693143, - "learning_rate": 5.732410611303344e-07, - "logits/chosen": 0.13799475133419037, - "logits/rejected": 0.10960017144680023, - "logps/accuracies": 0.6875, - "logps/chosen": -306.63336181640625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -298.33892822265625, - "logps/ref_rejected": -294.1246032714844, - "logps/rejected": -364.4803466796875, - "loss": 0.558, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.8294415473937988, - "rewards/grad_term": 0.011318499222397804, - "rewards/margins": 6.206131458282471, - "rewards/rejected": -7.0355730056762695, - "step": 468 - }, - { - "epoch": 0.9729028912226112, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 33.22451982924039, - "learning_rate": 5.72087658592849e-07, - "logits/chosen": 0.2051057368516922, - "logits/rejected": 0.39599326252937317, - "logps/accuracies": 0.875, - "logps/chosen": -297.212890625, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -296.35797119140625, - "logps/ref_rejected": -341.98760986328125, - "logps/rejected": -415.61602783203125, - "loss": 0.577, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.08549117296934128, - "rewards/grad_term": 0.008653431199491024, - "rewards/margins": 7.277350902557373, - "rewards/rejected": -7.362841606140137, - "step": 469 - }, - { - "epoch": 0.9749773110333204, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 55.27297145431258, - "learning_rate": 5.709342560553633e-07, - "logits/chosen": 0.406170129776001, - "logits/rejected": 0.4695666432380676, - "logps/accuracies": 0.75, - "logps/chosen": -267.87518310546875, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -264.7390441894531, - "logps/ref_rejected": -286.3910217285156, - "logps/rejected": -358.07012939453125, - "loss": 0.4829, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.3136145770549774, - "rewards/grad_term": 0.010346844792366028, - "rewards/margins": 6.854294776916504, - "rewards/rejected": -7.167908668518066, - "step": 470 - }, - { - "epoch": 0.9770517308440295, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 46.76169337104463, - "learning_rate": 5.697808535178778e-07, - "logits/chosen": 0.021602880209684372, - "logits/rejected": 0.10054953396320343, - "logps/accuracies": 0.4375, - "logps/chosen": -277.87420654296875, - "logps/ref_accuracies": 0.3125, - "logps/ref_chosen": -278.6906433105469, - "logps/ref_rejected": -260.7660827636719, - "logps/rejected": -313.3102722167969, - "loss": 0.523, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.08164243400096893, - "rewards/grad_term": 0.015251345932483673, - "rewards/margins": 5.336061954498291, - "rewards/rejected": -5.254419326782227, - "step": 471 - }, - { - "epoch": 0.9791261506547387, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.3125, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 22.784427030938893, - "learning_rate": 5.686274509803921e-07, - "logits/chosen": 0.40202221274375916, - "logits/rejected": 0.47440534830093384, - "logps/accuracies": 0.875, - "logps/chosen": -262.98046875, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -269.6217956542969, - "logps/ref_rejected": -294.6548767089844, - "logps/rejected": -360.1872253417969, - "loss": 0.5311, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.6641333103179932, - "rewards/grad_term": 0.008491192013025284, - "rewards/margins": 7.217367172241211, - "rewards/rejected": -6.553234100341797, - "step": 472 - }, - { - "epoch": 0.981200570465448, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.3125, - "grad_norm": 17.893894514454058, - "learning_rate": 5.674740484429066e-07, - "logits/chosen": 0.2951053977012634, - "logits/rejected": 0.31693655252456665, - "logps/accuracies": 0.6875, - "logps/chosen": -262.5206298828125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -264.88336181640625, - "logps/ref_rejected": -253.90045166015625, - "logps/rejected": -300.7421875, - "loss": 0.5814, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.23627310991287231, - "rewards/grad_term": 0.016665775328874588, - "rewards/margins": 4.920448303222656, - "rewards/rejected": -4.684175491333008, - "step": 473 - }, - { - "epoch": 0.9832749902761572, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 43.55914775192543, - "learning_rate": 5.66320645905421e-07, - "logits/chosen": -0.024594342336058617, - "logits/rejected": 0.07602076232433319, - "logps/accuracies": 0.75, - "logps/chosen": -239.41683959960938, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -248.70411682128906, - "logps/ref_rejected": -289.31231689453125, - "logps/rejected": -342.78070068359375, - "loss": 0.6143, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.9287264347076416, - "rewards/grad_term": 0.012531589716672897, - "rewards/margins": 6.275561332702637, - "rewards/rejected": -5.346835136413574, - "step": 474 - }, - { - "epoch": 0.9853494100868664, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 62.70114024432645, - "learning_rate": 5.651672433679354e-07, - "logits/chosen": 0.10585808008909225, - "logits/rejected": 0.11864355206489563, - "logps/accuracies": 0.8125, - "logps/chosen": -296.0154724121094, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -307.2314453125, - "logps/ref_rejected": -307.9908752441406, - "logps/rejected": -372.58465576171875, - "loss": 0.5211, - "rewards/accuracies": 0.875, - "rewards/chosen": 1.1215964555740356, - "rewards/grad_term": 0.0117443036288023, - "rewards/margins": 7.5809736251831055, - "rewards/rejected": -6.459376811981201, - "step": 475 - }, - { - "epoch": 0.9874238298975755, - "flips/correct->correct": 0.375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.1875, - "grad_norm": 49.433246174946895, - "learning_rate": 5.640138408304498e-07, - "logits/chosen": -0.0011881794780492783, - "logits/rejected": 0.056654639542102814, - "logps/accuracies": 0.8125, - "logps/chosen": -287.03497314453125, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -299.6839904785156, - "logps/ref_rejected": -300.8826904296875, - "logps/rejected": -363.1770935058594, - "loss": 0.5397, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.2649037837982178, - "rewards/grad_term": 0.011040883138775826, - "rewards/margins": 7.494347095489502, - "rewards/rejected": -6.229443550109863, - "step": 476 - }, - { - "epoch": 0.9894982497082847, - "flips/correct->correct": 0.5625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.1875, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 37.13699226629539, - "learning_rate": 5.628604382929642e-07, - "logits/chosen": 0.22437655925750732, - "logits/rejected": 0.2689896821975708, - "logps/accuracies": 0.75, - "logps/chosen": -257.7532958984375, - "logps/ref_accuracies": 0.5625, - "logps/ref_chosen": -264.4325256347656, - "logps/ref_rejected": -272.7551574707031, - "logps/rejected": -302.99462890625, - "loss": 0.56, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.667922854423523, - "rewards/grad_term": 0.02357163466513157, - "rewards/margins": 3.6918697357177734, - "rewards/rejected": -3.02394700050354, - "step": 477 - }, - { - "epoch": 0.9915726695189939, - "flips/correct->correct": 0.4375, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.4375, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 58.00284238737268, - "learning_rate": 5.617070357554786e-07, - "logits/chosen": 0.23218779265880585, - "logits/rejected": 0.20929032564163208, - "logps/accuracies": 0.875, - "logps/chosen": -320.3446350097656, - "logps/ref_accuracies": 0.4375, - "logps/ref_chosen": -321.2319641113281, - "logps/ref_rejected": -327.994140625, - "logps/rejected": -400.4293212890625, - "loss": 0.5079, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.08873284608125687, - "rewards/grad_term": 0.011475574225187302, - "rewards/margins": 7.332255840301514, - "rewards/rejected": -7.243522644042969, - "step": 478 - }, - { - "epoch": 0.9936470893297031, - "flips/correct->correct": 0.3125, - "flips/correct->incorrect": 0.0625, - "flips/incorrect->correct": 0.375, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 30.26855049657639, - "learning_rate": 5.605536332179931e-07, - "logits/chosen": 0.20646262168884277, - "logits/rejected": 0.18982850015163422, - "logps/accuracies": 0.6875, - "logps/chosen": -338.36700439453125, - "logps/ref_accuracies": 0.375, - "logps/ref_chosen": -340.2660217285156, - "logps/ref_rejected": -332.7535705566406, - "logps/rejected": -394.439697265625, - "loss": 0.535, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.1898985505104065, - "rewards/grad_term": 0.012958088889718056, - "rewards/margins": 6.358510971069336, - "rewards/rejected": -6.168612957000732, - "step": 479 - }, - { - "epoch": 0.9957215091404122, - "flips/correct->correct": 0.625, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.125, - "grad_norm": 19.142226432349567, - "learning_rate": 5.594002306805074e-07, - "logits/chosen": 0.26075422763824463, - "logits/rejected": 0.3115725517272949, - "logps/accuracies": 0.875, - "logps/chosen": -263.19952392578125, - "logps/ref_accuracies": 0.625, - "logps/ref_chosen": -252.8601531982422, - "logps/ref_rejected": -255.4573516845703, - "logps/rejected": -332.5611267089844, - "loss": 0.5524, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.033937931060791, - "rewards/grad_term": 0.014926022849977016, - "rewards/margins": 6.676440715789795, - "rewards/rejected": -7.710378170013428, - "step": 480 - } - ], - "logging_steps": 1, - "max_steps": 964, - "num_input_tokens_seen": 0, - "num_train_epochs": 2, - "save_steps": 96, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 0.0, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -}