{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997810698387214, "eval_steps": 100, "global_step": 3425, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00029190688170473617, "grad_norm": 0.013427734375, "learning_rate": 1.457725947521866e-08, "logits/chosen": -2.4752657413482666, "logits/rejected": -2.4752657413482666, "logps/chosen": -328.9035949707031, "logps/rejected": -328.9035949707031, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.002919068817047362, "grad_norm": 0.0147705078125, "learning_rate": 1.457725947521866e-07, "logits/chosen": -2.395798683166504, "logits/rejected": -2.395798683166504, "logps/chosen": -317.85565185546875, "logps/rejected": -317.85565185546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0003186435205861926, "rewards/margins": 0.0, "rewards/rejected": -0.0003186435205861926, "step": 10 }, { "epoch": 0.005838137634094724, "grad_norm": 0.01318359375, "learning_rate": 2.915451895043732e-07, "logits/chosen": -2.4440758228302, "logits/rejected": -2.4440758228302, "logps/chosen": -301.12921142578125, "logps/rejected": -301.12921142578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -7.847430242691189e-05, "rewards/margins": 0.0, "rewards/rejected": -7.847430242691189e-05, "step": 20 }, { "epoch": 0.008757206451142086, "grad_norm": 0.01177978515625, "learning_rate": 4.373177842565598e-07, "logits/chosen": -2.441359519958496, "logits/rejected": -2.441359519958496, "logps/chosen": -317.1576843261719, "logps/rejected": -317.1576843261719, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.00025945488596335053, "rewards/margins": 0.0, "rewards/rejected": -0.00025945488596335053, "step": 30 }, { "epoch": 0.011676275268189448, "grad_norm": 0.0167236328125, "learning_rate": 5.830903790087464e-07, "logits/chosen": -2.455430269241333, "logits/rejected": -2.455430269241333, "logps/chosen": -328.7832946777344, "logps/rejected": -328.7832946777344, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.00034936360316351056, "rewards/margins": 0.0, "rewards/rejected": -0.00034936360316351056, "step": 40 }, { "epoch": 0.014595344085236809, "grad_norm": 0.012939453125, "learning_rate": 7.288629737609331e-07, "logits/chosen": -2.406463384628296, "logits/rejected": -2.406463384628296, "logps/chosen": -303.563232421875, "logps/rejected": -303.563232421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0001031260471791029, "rewards/margins": 0.0, "rewards/rejected": 0.0001031260471791029, "step": 50 }, { "epoch": 0.01751441290228417, "grad_norm": 0.016357421875, "learning_rate": 8.746355685131196e-07, "logits/chosen": -2.4401960372924805, "logits/rejected": -2.4401960372924805, "logps/chosen": -284.1253967285156, "logps/rejected": -284.1253967285156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.00043843849562108517, "rewards/margins": 0.0, "rewards/rejected": -0.00043843849562108517, "step": 60 }, { "epoch": 0.02043348171933153, "grad_norm": 0.01153564453125, "learning_rate": 1.0204081632653063e-06, "logits/chosen": -2.423875093460083, "logits/rejected": -2.423875093460083, "logps/chosen": -280.09442138671875, "logps/rejected": -280.09442138671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.00031216375646181405, "rewards/margins": 0.0, "rewards/rejected": -0.00031216375646181405, "step": 70 }, { "epoch": 0.023352550536378896, "grad_norm": 0.01214599609375, "learning_rate": 1.1661807580174927e-06, "logits/chosen": -2.404435396194458, "logits/rejected": -2.404435396194458, "logps/chosen": -267.2549743652344, "logps/rejected": -267.2549743652344, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0006922121392562985, "rewards/margins": 0.0, "rewards/rejected": 0.0006922121392562985, "step": 80 }, { "epoch": 0.026271619353426257, "grad_norm": 0.0146484375, "learning_rate": 1.3119533527696792e-06, "logits/chosen": -2.416917324066162, "logits/rejected": -2.416917324066162, "logps/chosen": -333.58563232421875, "logps/rejected": -333.58563232421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0005137195694260299, "rewards/margins": 0.0, "rewards/rejected": 0.0005137195694260299, "step": 90 }, { "epoch": 0.029190688170473617, "grad_norm": 0.0189208984375, "learning_rate": 1.4577259475218661e-06, "logits/chosen": -2.4351730346679688, "logits/rejected": -2.4351730346679688, "logps/chosen": -339.3778381347656, "logps/rejected": -339.3778381347656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0005722829955630004, "rewards/margins": 0.0, "rewards/rejected": 0.0005722829955630004, "step": 100 }, { "epoch": 0.029190688170473617, "eval_logits/chosen": -2.394068479537964, "eval_logits/rejected": -2.394068479537964, "eval_logps/chosen": -306.389892578125, "eval_logps/rejected": -306.389892578125, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": 0.0008870832389220595, "eval_rewards/margins": 0.0, "eval_rewards/rejected": 0.0008870832389220595, "eval_runtime": 2666.9983, "eval_samples_per_second": 2.283, "eval_steps_per_second": 0.286, "step": 100 }, { "epoch": 0.03210975698752098, "grad_norm": 0.015869140625, "learning_rate": 1.6034985422740526e-06, "logits/chosen": -2.420276165008545, "logits/rejected": -2.420276165008545, "logps/chosen": -306.0760803222656, "logps/rejected": -306.0760803222656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0014700460014864802, "rewards/margins": 0.0, "rewards/rejected": 0.0014700460014864802, "step": 110 }, { "epoch": 0.03502882580456834, "grad_norm": 0.01544189453125, "learning_rate": 1.7492711370262391e-06, "logits/chosen": -2.4616119861602783, "logits/rejected": -2.4616119861602783, "logps/chosen": -328.64129638671875, "logps/rejected": -328.64129638671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.001054848893545568, "rewards/margins": 0.0, "rewards/rejected": 0.001054848893545568, "step": 120 }, { "epoch": 0.037947894621615706, "grad_norm": 0.0250244140625, "learning_rate": 1.895043731778426e-06, "logits/chosen": -2.404423236846924, "logits/rejected": -2.404423236846924, "logps/chosen": -339.0644836425781, "logps/rejected": -339.0644836425781, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0002889078459702432, "rewards/margins": 0.0, "rewards/rejected": 0.0002889078459702432, "step": 130 }, { "epoch": 0.04086696343866306, "grad_norm": 0.0137939453125, "learning_rate": 2.0408163265306125e-06, "logits/chosen": -2.4294090270996094, "logits/rejected": -2.4294090270996094, "logps/chosen": -299.0234375, "logps/rejected": -299.0234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0003939162997994572, "rewards/margins": 0.0, "rewards/rejected": 0.0003939162997994572, "step": 140 }, { "epoch": 0.04378603225571043, "grad_norm": 0.01470947265625, "learning_rate": 2.1865889212827988e-06, "logits/chosen": -2.4415223598480225, "logits/rejected": -2.4415223598480225, "logps/chosen": -317.4403991699219, "logps/rejected": -317.4403991699219, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0011137222172692418, "rewards/margins": 0.0, "rewards/rejected": 0.0011137222172692418, "step": 150 }, { "epoch": 0.04670510107275779, "grad_norm": 0.01202392578125, "learning_rate": 2.3323615160349855e-06, "logits/chosen": -2.433961868286133, "logits/rejected": -2.433961868286133, "logps/chosen": -315.8016662597656, "logps/rejected": -315.8016662597656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.00034690109896473587, "rewards/margins": 0.0, "rewards/rejected": 0.00034690109896473587, "step": 160 }, { "epoch": 0.04962416988980515, "grad_norm": 0.01226806640625, "learning_rate": 2.478134110787172e-06, "logits/chosen": -2.4214272499084473, "logits/rejected": -2.4214272499084473, "logps/chosen": -304.0071105957031, "logps/rejected": -304.0071105957031, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.00021128072694409639, "rewards/margins": 0.0, "rewards/rejected": 0.00021128072694409639, "step": 170 }, { "epoch": 0.05254323870685251, "grad_norm": 0.01318359375, "learning_rate": 2.6239067055393585e-06, "logits/chosen": -2.410125255584717, "logits/rejected": -2.410125255584717, "logps/chosen": -329.052978515625, "logps/rejected": -329.052978515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -8.667710062582046e-05, "rewards/margins": 0.0, "rewards/rejected": -8.667710062582046e-05, "step": 180 }, { "epoch": 0.05546230752389988, "grad_norm": 0.0111083984375, "learning_rate": 2.7696793002915456e-06, "logits/chosen": -2.412470579147339, "logits/rejected": -2.412470579147339, "logps/chosen": -302.9618225097656, "logps/rejected": -302.9618225097656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0005764733068645, "rewards/margins": 0.0, "rewards/rejected": 0.0005764733068645, "step": 190 }, { "epoch": 0.058381376340947234, "grad_norm": 0.013671875, "learning_rate": 2.9154518950437323e-06, "logits/chosen": -2.3948373794555664, "logits/rejected": -2.3948373794555664, "logps/chosen": -312.7694091796875, "logps/rejected": -312.7694091796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0006709109293296933, "rewards/margins": 0.0, "rewards/rejected": -0.0006709109293296933, "step": 200 }, { "epoch": 0.058381376340947234, "eval_logits/chosen": -2.3945627212524414, "eval_logits/rejected": -2.3945627212524414, "eval_logps/chosen": -306.5539245605469, "eval_logps/rejected": -306.5539245605469, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.0007532919407822192, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.0007532919407822192, "eval_runtime": 2667.9395, "eval_samples_per_second": 2.283, "eval_steps_per_second": 0.286, "step": 200 }, { "epoch": 0.0613004451579946, "grad_norm": 0.0120849609375, "learning_rate": 3.0612244897959185e-06, "logits/chosen": -2.445885181427002, "logits/rejected": -2.445885181427002, "logps/chosen": -316.7839050292969, "logps/rejected": -316.7839050292969, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0010994909098371863, "rewards/margins": 0.0, "rewards/rejected": -0.0010994909098371863, "step": 210 }, { "epoch": 0.06421951397504196, "grad_norm": 0.011962890625, "learning_rate": 3.2069970845481052e-06, "logits/chosen": -2.4333603382110596, "logits/rejected": -2.4333603382110596, "logps/chosen": -277.94915771484375, "logps/rejected": -277.94915771484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.00041724619222804904, "rewards/margins": 0.0, "rewards/rejected": -0.00041724619222804904, "step": 220 }, { "epoch": 0.06713858279208933, "grad_norm": 0.0145263671875, "learning_rate": 3.352769679300292e-06, "logits/chosen": -2.4338879585266113, "logits/rejected": -2.4338879585266113, "logps/chosen": -325.23455810546875, "logps/rejected": -325.23455810546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0012703577522188425, "rewards/margins": 0.0, "rewards/rejected": -0.0012703577522188425, "step": 230 }, { "epoch": 0.07005765160913668, "grad_norm": 0.0157470703125, "learning_rate": 3.4985422740524782e-06, "logits/chosen": -2.413400173187256, "logits/rejected": -2.413400173187256, "logps/chosen": -309.69403076171875, "logps/rejected": -309.69403076171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.001970961457118392, "rewards/margins": 0.0, "rewards/rejected": -0.001970961457118392, "step": 240 }, { "epoch": 0.07297672042618404, "grad_norm": 0.01422119140625, "learning_rate": 3.644314868804665e-06, "logits/chosen": -2.4458959102630615, "logits/rejected": -2.4458959102630615, "logps/chosen": -304.130615234375, "logps/rejected": -304.130615234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0047234781086444855, "rewards/margins": 0.0, "rewards/rejected": -0.0047234781086444855, "step": 250 }, { "epoch": 0.07589578924323141, "grad_norm": 0.01324462890625, "learning_rate": 3.790087463556852e-06, "logits/chosen": -2.4266982078552246, "logits/rejected": -2.4266982078552246, "logps/chosen": -286.97076416015625, "logps/rejected": -286.97076416015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.006642763502895832, "rewards/margins": 0.0, "rewards/rejected": -0.006642763502895832, "step": 260 }, { "epoch": 0.07881485806027877, "grad_norm": 0.015625, "learning_rate": 3.935860058309039e-06, "logits/chosen": -2.436506748199463, "logits/rejected": -2.436506748199463, "logps/chosen": -310.330322265625, "logps/rejected": -310.330322265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.007435324601829052, "rewards/margins": 0.0, "rewards/rejected": -0.007435324601829052, "step": 270 }, { "epoch": 0.08173392687732613, "grad_norm": 0.01495361328125, "learning_rate": 4.081632653061225e-06, "logits/chosen": -2.394254446029663, "logits/rejected": -2.394254446029663, "logps/chosen": -304.8192443847656, "logps/rejected": -304.8192443847656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.00737042585387826, "rewards/margins": 0.0, "rewards/rejected": -0.00737042585387826, "step": 280 }, { "epoch": 0.0846529956943735, "grad_norm": 0.0130615234375, "learning_rate": 4.227405247813411e-06, "logits/chosen": -2.4005939960479736, "logits/rejected": -2.4005939960479736, "logps/chosen": -288.9790954589844, "logps/rejected": -288.9790954589844, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0066338046453893185, "rewards/margins": 0.0, "rewards/rejected": -0.0066338046453893185, "step": 290 }, { "epoch": 0.08757206451142086, "grad_norm": 0.01458740234375, "learning_rate": 4.3731778425655976e-06, "logits/chosen": -2.4416656494140625, "logits/rejected": -2.4416656494140625, "logps/chosen": -288.1855773925781, "logps/rejected": -288.1855773925781, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0070955632254481316, "rewards/margins": 0.0, "rewards/rejected": -0.0070955632254481316, "step": 300 }, { "epoch": 0.08757206451142086, "eval_logits/chosen": -2.3941876888275146, "eval_logits/rejected": -2.3941876888275146, "eval_logps/chosen": -307.0490417480469, "eval_logps/rejected": -307.0490417480469, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.005704815499484539, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.005704815499484539, "eval_runtime": 2667.7916, "eval_samples_per_second": 2.283, "eval_steps_per_second": 0.286, "step": 300 }, { "epoch": 0.09049113332846821, "grad_norm": 0.01153564453125, "learning_rate": 4.518950437317785e-06, "logits/chosen": -2.420503854751587, "logits/rejected": -2.420503854751587, "logps/chosen": -276.64093017578125, "logps/rejected": -276.64093017578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.007363935001194477, "rewards/margins": 0.0, "rewards/rejected": -0.007363935001194477, "step": 310 }, { "epoch": 0.09341020214551558, "grad_norm": 0.0185546875, "learning_rate": 4.664723032069971e-06, "logits/chosen": -2.4066500663757324, "logits/rejected": -2.4066500663757324, "logps/chosen": -315.653076171875, "logps/rejected": -315.653076171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.006720393896102905, "rewards/margins": 0.0, "rewards/rejected": -0.006720393896102905, "step": 320 }, { "epoch": 0.09632927096256294, "grad_norm": 0.015625, "learning_rate": 4.810495626822158e-06, "logits/chosen": -2.445965528488159, "logits/rejected": -2.445965528488159, "logps/chosen": -324.6703796386719, "logps/rejected": -324.6703796386719, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.007294761948287487, "rewards/margins": 0.0, "rewards/rejected": -0.007294761948287487, "step": 330 }, { "epoch": 0.0992483397796103, "grad_norm": 0.01446533203125, "learning_rate": 4.956268221574344e-06, "logits/chosen": -2.4288485050201416, "logits/rejected": -2.4288485050201416, "logps/chosen": -323.6286926269531, "logps/rejected": -323.6286926269531, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.009048479609191418, "rewards/margins": 0.0, "rewards/rejected": -0.009048479609191418, "step": 340 }, { "epoch": 0.10216740859665767, "grad_norm": 0.01458740234375, "learning_rate": 4.999936358746211e-06, "logits/chosen": -2.4309639930725098, "logits/rejected": -2.4309639930725098, "logps/chosen": -271.655029296875, "logps/rejected": -271.655029296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.009400355629622936, "rewards/margins": 0.0, "rewards/rejected": -0.009400355629622936, "step": 350 }, { "epoch": 0.10508647741370503, "grad_norm": 0.0152587890625, "learning_rate": 4.99962465428288e-06, "logits/chosen": -2.4447290897369385, "logits/rejected": -2.4447290897369385, "logps/chosen": -303.4416198730469, "logps/rejected": -303.4416198730469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.013941009528934956, "rewards/margins": 0.0, "rewards/rejected": -0.013941009528934956, "step": 360 }, { "epoch": 0.10800554623075238, "grad_norm": 0.0185546875, "learning_rate": 4.999053229746866e-06, "logits/chosen": -2.440117359161377, "logits/rejected": -2.440117359161377, "logps/chosen": -290.806884765625, "logps/rejected": -290.806884765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.01759205386042595, "rewards/margins": 0.0, "rewards/rejected": -0.01759205386042595, "step": 370 }, { "epoch": 0.11092461504779975, "grad_norm": 0.01263427734375, "learning_rate": 4.9982221445112535e-06, "logits/chosen": -2.4275150299072266, "logits/rejected": -2.4275150299072266, "logps/chosen": -320.67938232421875, "logps/rejected": -320.67938232421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.018790820613503456, "rewards/margins": 0.0, "rewards/rejected": -0.018790820613503456, "step": 380 }, { "epoch": 0.11384368386484711, "grad_norm": 0.01397705078125, "learning_rate": 4.997131484928813e-06, "logits/chosen": -2.414685010910034, "logits/rejected": -2.414685010910034, "logps/chosen": -301.1441650390625, "logps/rejected": -301.1441650390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.015089067630469799, "rewards/margins": 0.0, "rewards/rejected": -0.015089067630469799, "step": 390 }, { "epoch": 0.11676275268189447, "grad_norm": 0.01458740234375, "learning_rate": 4.995781364323035e-06, "logits/chosen": -2.391239643096924, "logits/rejected": -2.391239643096924, "logps/chosen": -285.70941162109375, "logps/rejected": -285.70941162109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.010374903678894043, "rewards/margins": 0.0, "rewards/rejected": -0.010374903678894043, "step": 400 }, { "epoch": 0.11676275268189447, "eval_logits/chosen": -2.393982172012329, "eval_logits/rejected": -2.393982172012329, "eval_logps/chosen": -307.3796081542969, "eval_logps/rejected": -307.3796081542969, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.009010241366922855, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.009010241366922855, "eval_runtime": 2667.3233, "eval_samples_per_second": 2.283, "eval_steps_per_second": 0.286, "step": 400 }, { "epoch": 0.11968182149894184, "grad_norm": 0.01300048828125, "learning_rate": 4.994171922976349e-06, "logits/chosen": -2.4642019271850586, "logits/rejected": -2.4642019271850586, "logps/chosen": -298.46978759765625, "logps/rejected": -298.46978759765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.009510824456810951, "rewards/margins": 0.0, "rewards/rejected": -0.009510824456810951, "step": 410 }, { "epoch": 0.1226008903159892, "grad_norm": 0.0159912109375, "learning_rate": 4.992303328115551e-06, "logits/chosen": -2.420297145843506, "logits/rejected": -2.420297145843506, "logps/chosen": -306.69610595703125, "logps/rejected": -306.69610595703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0014959282707422972, "rewards/margins": 0.0, "rewards/rejected": -0.0014959282707422972, "step": 420 }, { "epoch": 0.12551995913303657, "grad_norm": 0.0159912109375, "learning_rate": 4.990175773894428e-06, "logits/chosen": -2.46386981010437, "logits/rejected": -2.46386981010437, "logps/chosen": -281.81097412109375, "logps/rejected": -281.81097412109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.008724676445126534, "rewards/margins": 0.0, "rewards/rejected": -0.008724676445126534, "step": 430 }, { "epoch": 0.1284390279500839, "grad_norm": 0.01287841796875, "learning_rate": 4.987789481373586e-06, "logits/chosen": -2.406324625015259, "logits/rejected": -2.406324625015259, "logps/chosen": -297.7574157714844, "logps/rejected": -297.7574157714844, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.006952273193746805, "rewards/margins": 0.0, "rewards/rejected": -0.006952273193746805, "step": 440 }, { "epoch": 0.13135809676713128, "grad_norm": 0.015869140625, "learning_rate": 4.985144698497477e-06, "logits/chosen": -2.4094862937927246, "logits/rejected": -2.4094862937927246, "logps/chosen": -294.4402160644531, "logps/rejected": -294.4402160644531, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.009783747605979443, "rewards/margins": 0.0, "rewards/rejected": -0.009783747605979443, "step": 450 }, { "epoch": 0.13427716558417865, "grad_norm": 0.015625, "learning_rate": 4.982241700068639e-06, "logits/chosen": -2.448880434036255, "logits/rejected": -2.448880434036255, "logps/chosen": -312.9103088378906, "logps/rejected": -312.9103088378906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.010099256411194801, "rewards/margins": 0.0, "rewards/rejected": -0.010099256411194801, "step": 460 }, { "epoch": 0.137196234401226, "grad_norm": 0.014404296875, "learning_rate": 4.979080787719144e-06, "logits/chosen": -2.4513556957244873, "logits/rejected": -2.4513556957244873, "logps/chosen": -330.3889465332031, "logps/rejected": -330.3889465332031, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.012815428897738457, "rewards/margins": 0.0, "rewards/rejected": -0.012815428897738457, "step": 470 }, { "epoch": 0.14011530321827337, "grad_norm": 0.013427734375, "learning_rate": 4.975662289879257e-06, "logits/chosen": -2.3824195861816406, "logits/rejected": -2.3824195861816406, "logps/chosen": -324.45654296875, "logps/rejected": -324.45654296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.010385606437921524, "rewards/margins": 0.0, "rewards/rejected": -0.010385606437921524, "step": 480 }, { "epoch": 0.14303437203532074, "grad_norm": 0.016845703125, "learning_rate": 4.971986561743308e-06, "logits/chosen": -2.388378620147705, "logits/rejected": -2.388378620147705, "logps/chosen": -292.9872131347656, "logps/rejected": -292.9872131347656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.00819515809416771, "rewards/margins": 0.0, "rewards/rejected": -0.00819515809416771, "step": 490 }, { "epoch": 0.14595344085236808, "grad_norm": 0.01348876953125, "learning_rate": 4.96805398523279e-06, "logits/chosen": -2.438722610473633, "logits/rejected": -2.438722610473633, "logps/chosen": -333.7470397949219, "logps/rejected": -333.7470397949219, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.007836517877876759, "rewards/margins": 0.0, "rewards/rejected": -0.007836517877876759, "step": 500 }, { "epoch": 0.14595344085236808, "eval_logits/chosen": -2.3937265872955322, "eval_logits/rejected": -2.3937265872955322, "eval_logps/chosen": -307.1580505371094, "eval_logps/rejected": -307.1580505371094, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.006794503424316645, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.006794503424316645, "eval_runtime": 2668.7964, "eval_samples_per_second": 2.282, "eval_steps_per_second": 0.286, "step": 500 }, { "epoch": 0.14887250966941545, "grad_norm": 0.0146484375, "learning_rate": 4.963864968956674e-06, "logits/chosen": -2.4363291263580322, "logits/rejected": -2.4363291263580322, "logps/chosen": -295.4735412597656, "logps/rejected": -295.4735412597656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.008334552869200706, "rewards/margins": 0.0, "rewards/rejected": -0.008334552869200706, "step": 510 }, { "epoch": 0.15179157848646282, "grad_norm": 0.0113525390625, "learning_rate": 4.959419948168952e-06, "logits/chosen": -2.4209957122802734, "logits/rejected": -2.4209957122802734, "logps/chosen": -252.09475708007812, "logps/rejected": -252.09475708007812, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.005244333762675524, "rewards/margins": 0.0, "rewards/rejected": -0.005244333762675524, "step": 520 }, { "epoch": 0.15471064730351017, "grad_norm": 0.011962890625, "learning_rate": 4.954719384723416e-06, "logits/chosen": -2.4421539306640625, "logits/rejected": -2.4421539306640625, "logps/chosen": -290.62939453125, "logps/rejected": -290.62939453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.006143758539110422, "rewards/margins": 0.0, "rewards/rejected": -0.006143758539110422, "step": 530 }, { "epoch": 0.15762971612055754, "grad_norm": 0.0155029296875, "learning_rate": 4.949763767025665e-06, "logits/chosen": -2.433292865753174, "logits/rejected": -2.433292865753174, "logps/chosen": -301.56488037109375, "logps/rejected": -301.56488037109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.007085380610078573, "rewards/margins": 0.0, "rewards/rejected": -0.007085380610078573, "step": 540 }, { "epoch": 0.1605487849376049, "grad_norm": 0.01513671875, "learning_rate": 4.944553609982363e-06, "logits/chosen": -2.397106647491455, "logits/rejected": -2.397106647491455, "logps/chosen": -274.3099670410156, "logps/rejected": -274.3099670410156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.002214896958321333, "rewards/margins": 0.0, "rewards/rejected": -0.002214896958321333, "step": 550 }, { "epoch": 0.16346785375465225, "grad_norm": 0.0152587890625, "learning_rate": 4.939089454947734e-06, "logits/chosen": -2.417797088623047, "logits/rejected": -2.417797088623047, "logps/chosen": -299.5130615234375, "logps/rejected": -299.5130615234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.005161653272807598, "rewards/margins": 0.0, "rewards/rejected": -0.005161653272807598, "step": 560 }, { "epoch": 0.16638692257169962, "grad_norm": 0.01507568359375, "learning_rate": 4.933371869667315e-06, "logits/chosen": -2.4109036922454834, "logits/rejected": -2.4109036922454834, "logps/chosen": -279.4015808105469, "logps/rejected": -279.4015808105469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0024168032687157393, "rewards/margins": 0.0, "rewards/rejected": -0.0024168032687157393, "step": 570 }, { "epoch": 0.169305991388747, "grad_norm": 0.00885009765625, "learning_rate": 4.9274014482189654e-06, "logits/chosen": -2.4315690994262695, "logits/rejected": -2.4315690994262695, "logps/chosen": -309.34234619140625, "logps/rejected": -309.34234619140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.004016817547380924, "rewards/margins": 0.0, "rewards/rejected": -0.004016817547380924, "step": 580 }, { "epoch": 0.17222506020579434, "grad_norm": 0.017578125, "learning_rate": 4.9211788109511405e-06, "logits/chosen": -2.460508108139038, "logits/rejected": -2.460508108139038, "logps/chosen": -334.00933837890625, "logps/rejected": -334.00933837890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.005641533527523279, "rewards/margins": 0.0, "rewards/rejected": -0.005641533527523279, "step": 590 }, { "epoch": 0.1751441290228417, "grad_norm": 0.016845703125, "learning_rate": 4.914704604418435e-06, "logits/chosen": -2.4566855430603027, "logits/rejected": -2.4566855430603027, "logps/chosen": -307.21331787109375, "logps/rejected": -307.21331787109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0077440254390239716, "rewards/margins": 0.0, "rewards/rejected": -0.0077440254390239716, "step": 600 }, { "epoch": 0.1751441290228417, "eval_logits/chosen": -2.394993782043457, "eval_logits/rejected": -2.394993782043457, "eval_logps/chosen": -306.9631042480469, "eval_logps/rejected": -306.9631042480469, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.004845078103244305, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.004845078103244305, "eval_runtime": 2667.3075, "eval_samples_per_second": 2.283, "eval_steps_per_second": 0.286, "step": 600 }, { "epoch": 0.17806319783988908, "grad_norm": 0.01312255859375, "learning_rate": 4.907979501314402e-06, "logits/chosen": -2.452761173248291, "logits/rejected": -2.452761173248291, "logps/chosen": -293.330078125, "logps/rejected": -293.330078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.005413960665464401, "rewards/margins": 0.0, "rewards/rejected": -0.005413960665464401, "step": 610 }, { "epoch": 0.18098226665693642, "grad_norm": 0.013427734375, "learning_rate": 4.901004200401659e-06, "logits/chosen": -2.415590763092041, "logits/rejected": -2.415590763092041, "logps/chosen": -316.59185791015625, "logps/rejected": -316.59185791015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.009168794378638268, "rewards/margins": 0.0, "rewards/rejected": -0.009168794378638268, "step": 620 }, { "epoch": 0.1839013354739838, "grad_norm": 0.017333984375, "learning_rate": 4.893779426439285e-06, "logits/chosen": -2.4269957542419434, "logits/rejected": -2.4269957542419434, "logps/chosen": -330.297607421875, "logps/rejected": -330.297607421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.008635496720671654, "rewards/margins": 0.0, "rewards/rejected": -0.008635496720671654, "step": 630 }, { "epoch": 0.18682040429103117, "grad_norm": 0.0137939453125, "learning_rate": 4.886305930107512e-06, "logits/chosen": -2.4132332801818848, "logits/rejected": -2.4132332801818848, "logps/chosen": -334.0628967285156, "logps/rejected": -334.0628967285156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.00843154825270176, "rewards/margins": 0.0, "rewards/rejected": -0.00843154825270176, "step": 640 }, { "epoch": 0.1897394731080785, "grad_norm": 0.0162353515625, "learning_rate": 4.878584487929731e-06, "logits/chosen": -2.393531084060669, "logits/rejected": -2.393531084060669, "logps/chosen": -312.2678527832031, "logps/rejected": -312.2678527832031, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.008157333359122276, "rewards/margins": 0.0, "rewards/rejected": -0.008157333359122276, "step": 650 }, { "epoch": 0.19265854192512588, "grad_norm": 0.01141357421875, "learning_rate": 4.8706159021918046e-06, "logits/chosen": -2.4334394931793213, "logits/rejected": -2.4334394931793213, "logps/chosen": -313.9178466796875, "logps/rejected": -313.9178466796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.010157248005270958, "rewards/margins": 0.0, "rewards/rejected": -0.010157248005270958, "step": 660 }, { "epoch": 0.19557761074217325, "grad_norm": 0.01446533203125, "learning_rate": 4.86240100085871e-06, "logits/chosen": -2.4123024940490723, "logits/rejected": -2.4123024940490723, "logps/chosen": -330.71856689453125, "logps/rejected": -330.71856689453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.01049681194126606, "rewards/margins": 0.0, "rewards/rejected": -0.01049681194126606, "step": 670 }, { "epoch": 0.1984966795592206, "grad_norm": 0.0145263671875, "learning_rate": 4.853940637488505e-06, "logits/chosen": -2.4219470024108887, "logits/rejected": -2.4219470024108887, "logps/chosen": -347.1614990234375, "logps/rejected": -347.1614990234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.010124921798706055, "rewards/margins": 0.0, "rewards/rejected": -0.010124921798706055, "step": 680 }, { "epoch": 0.20141574837626797, "grad_norm": 0.0140380859375, "learning_rate": 4.84523569114365e-06, "logits/chosen": -2.441845417022705, "logits/rejected": -2.441845417022705, "logps/chosen": -268.2397766113281, "logps/rejected": -268.2397766113281, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.013552245683968067, "rewards/margins": 0.0, "rewards/rejected": -0.013552245683968067, "step": 690 }, { "epoch": 0.20433481719331534, "grad_norm": 0.020751953125, "learning_rate": 4.8362870662996574e-06, "logits/chosen": -2.408205509185791, "logits/rejected": -2.408205509185791, "logps/chosen": -313.0887756347656, "logps/rejected": -313.0887756347656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.01138794980943203, "rewards/margins": 0.0, "rewards/rejected": -0.01138794980943203, "step": 700 }, { "epoch": 0.20433481719331534, "eval_logits/chosen": -2.394869565963745, "eval_logits/rejected": -2.394869565963745, "eval_logps/chosen": -307.6349182128906, "eval_logps/rejected": -307.6349182128906, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.011563203297555447, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.011563203297555447, "eval_runtime": 2685.1829, "eval_samples_per_second": 2.268, "eval_steps_per_second": 0.284, "step": 700 }, { "epoch": 0.20725388601036268, "grad_norm": 0.015380859375, "learning_rate": 4.827095692751124e-06, "logits/chosen": -2.4306788444519043, "logits/rejected": -2.4306788444519043, "logps/chosen": -295.8254089355469, "logps/rejected": -295.8254089355469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.009687040001153946, "rewards/margins": 0.0, "rewards/rejected": -0.009687040001153946, "step": 710 }, { "epoch": 0.21017295482741005, "grad_norm": 0.0135498046875, "learning_rate": 4.817662525515116e-06, "logits/chosen": -2.399963855743408, "logits/rejected": -2.399963855743408, "logps/chosen": -285.0207824707031, "logps/rejected": -285.0207824707031, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.010509965009987354, "rewards/margins": 0.0, "rewards/rejected": -0.010509965009987354, "step": 720 }, { "epoch": 0.21309202364445742, "grad_norm": 0.01275634765625, "learning_rate": 4.807988544731944e-06, "logits/chosen": -2.4015610218048096, "logits/rejected": -2.4015610218048096, "logps/chosen": -301.6191711425781, "logps/rejected": -301.6191711425781, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0023958988022059202, "rewards/margins": 0.0, "rewards/rejected": -0.0023958988022059202, "step": 730 }, { "epoch": 0.21601109246150477, "grad_norm": 0.0120849609375, "learning_rate": 4.7980747555633174e-06, "logits/chosen": -2.421522617340088, "logits/rejected": -2.421522617340088, "logps/chosen": -300.5765380859375, "logps/rejected": -300.5765380859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0066505610011518, "rewards/margins": 0.0, "rewards/rejected": -0.0066505610011518, "step": 740 }, { "epoch": 0.21893016127855214, "grad_norm": 0.0167236328125, "learning_rate": 4.787922188087907e-06, "logits/chosen": -2.3898696899414062, "logits/rejected": -2.3898696899414062, "logps/chosen": -312.099853515625, "logps/rejected": -312.099853515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.009563307277858257, "rewards/margins": 0.0, "rewards/rejected": -0.009563307277858257, "step": 750 }, { "epoch": 0.2218492300955995, "grad_norm": 0.0185546875, "learning_rate": 4.7775318971943165e-06, "logits/chosen": -2.368053674697876, "logits/rejected": -2.368053674697876, "logps/chosen": -280.77703857421875, "logps/rejected": -280.77703857421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.008711813017725945, "rewards/margins": 0.0, "rewards/rejected": -0.008711813017725945, "step": 760 }, { "epoch": 0.22476829891264685, "grad_norm": 0.01434326171875, "learning_rate": 4.766904962471477e-06, "logits/chosen": -2.428321361541748, "logits/rejected": -2.428321361541748, "logps/chosen": -283.40704345703125, "logps/rejected": -283.40704345703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0074463835917413235, "rewards/margins": 0.0, "rewards/rejected": -0.0074463835917413235, "step": 770 }, { "epoch": 0.22768736772969422, "grad_norm": 0.020751953125, "learning_rate": 4.756042488096472e-06, "logits/chosen": -2.421441078186035, "logits/rejected": -2.421441078186035, "logps/chosen": -283.1347961425781, "logps/rejected": -283.1347961425781, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.009277506731450558, "rewards/margins": 0.0, "rewards/rejected": -0.009277506731450558, "step": 780 }, { "epoch": 0.2306064365467416, "grad_norm": 0.0169677734375, "learning_rate": 4.744945602719806e-06, "logits/chosen": -2.4225807189941406, "logits/rejected": -2.4225807189941406, "logps/chosen": -296.5173645019531, "logps/rejected": -296.5173645019531, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.009408360347151756, "rewards/margins": 0.0, "rewards/rejected": -0.009408360347151756, "step": 790 }, { "epoch": 0.23352550536378894, "grad_norm": 0.01495361328125, "learning_rate": 4.733615459348143e-06, "logits/chosen": -2.3777918815612793, "logits/rejected": -2.3777918815612793, "logps/chosen": -337.0318298339844, "logps/rejected": -337.0318298339844, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.012588550336658955, "rewards/margins": 0.0, "rewards/rejected": -0.012588550336658955, "step": 800 }, { "epoch": 0.23352550536378894, "eval_logits/chosen": -2.394713878631592, "eval_logits/rejected": -2.394713878631592, "eval_logps/chosen": -307.6956787109375, "eval_logps/rejected": -307.6956787109375, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.012170875445008278, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.012170875445008278, "eval_runtime": 2762.1462, "eval_samples_per_second": 2.205, "eval_steps_per_second": 0.276, "step": 800 }, { "epoch": 0.2364445741808363, "grad_norm": 0.0145263671875, "learning_rate": 4.722053235224495e-06, "logits/chosen": -2.4402616024017334, "logits/rejected": -2.4402616024017334, "logps/chosen": -333.5353698730469, "logps/rejected": -333.5353698730469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.008296088315546513, "rewards/margins": 0.0, "rewards/rejected": -0.008296088315546513, "step": 810 }, { "epoch": 0.23936364299788368, "grad_norm": 0.0128173828125, "learning_rate": 4.710260131705908e-06, "logits/chosen": -2.411567211151123, "logits/rejected": -2.411567211151123, "logps/chosen": -274.9350280761719, "logps/rejected": -274.9350280761719, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.015997527167201042, "rewards/margins": 0.0, "rewards/rejected": -0.015997527167201042, "step": 820 }, { "epoch": 0.24228271181493102, "grad_norm": 0.01531982421875, "learning_rate": 4.698237374138634e-06, "logits/chosen": -2.420203447341919, "logits/rejected": -2.420203447341919, "logps/chosen": -312.3550720214844, "logps/rejected": -312.3550720214844, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.015846502035856247, "rewards/margins": 0.0, "rewards/rejected": -0.015846502035856247, "step": 830 }, { "epoch": 0.2452017806319784, "grad_norm": 0.01513671875, "learning_rate": 4.685986211730816e-06, "logits/chosen": -2.3960068225860596, "logits/rejected": -2.3960068225860596, "logps/chosen": -331.6641845703125, "logps/rejected": -331.6641845703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.01894356682896614, "rewards/margins": 0.0, "rewards/rejected": -0.01894356682896614, "step": 840 }, { "epoch": 0.24812084944902577, "grad_norm": 0.01165771484375, "learning_rate": 4.6735079174226864e-06, "logits/chosen": -2.408433198928833, "logits/rejected": -2.408433198928833, "logps/chosen": -269.3624572753906, "logps/rejected": -269.3624572753906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.009970271959900856, "rewards/margins": 0.0, "rewards/rejected": -0.009970271959900856, "step": 850 }, { "epoch": 0.25103991826607314, "grad_norm": 0.01483154296875, "learning_rate": 4.660803787754306e-06, "logits/chosen": -2.416790723800659, "logits/rejected": -2.416790723800659, "logps/chosen": -302.0819396972656, "logps/rejected": -302.0819396972656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.010707431472837925, "rewards/margins": 0.0, "rewards/rejected": -0.010707431472837925, "step": 860 }, { "epoch": 0.2539589870831205, "grad_norm": 0.0157470703125, "learning_rate": 4.647875142730853e-06, "logits/chosen": -2.3868987560272217, "logits/rejected": -2.3868987560272217, "logps/chosen": -299.74444580078125, "logps/rejected": -299.74444580078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.012594198808073997, "rewards/margins": 0.0, "rewards/rejected": -0.012594198808073997, "step": 870 }, { "epoch": 0.2568780559001678, "grad_norm": 0.0140380859375, "learning_rate": 4.634723325685462e-06, "logits/chosen": -2.442610263824463, "logits/rejected": -2.442610263824463, "logps/chosen": -308.396240234375, "logps/rejected": -308.396240234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.011057281866669655, "rewards/margins": 0.0, "rewards/rejected": -0.011057281866669655, "step": 880 }, { "epoch": 0.2597971247172152, "grad_norm": 0.0157470703125, "learning_rate": 4.621349703139651e-06, "logits/chosen": -2.4502758979797363, "logits/rejected": -2.4502758979797363, "logps/chosen": -327.5845031738281, "logps/rejected": -327.5845031738281, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.012428502552211285, "rewards/margins": 0.0, "rewards/rejected": -0.012428502552211285, "step": 890 }, { "epoch": 0.26271619353426257, "grad_norm": 0.01519775390625, "learning_rate": 4.6077556646613365e-06, "logits/chosen": -2.4429335594177246, "logits/rejected": -2.4429335594177246, "logps/chosen": -309.44598388671875, "logps/rejected": -309.44598388671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.008300786837935448, "rewards/margins": 0.0, "rewards/rejected": -0.008300786837935448, "step": 900 }, { "epoch": 0.26271619353426257, "eval_logits/chosen": -2.396768093109131, "eval_logits/rejected": -2.396768093109131, "eval_logps/chosen": -307.1708068847656, "eval_logps/rejected": -307.1708068847656, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.0069224112667143345, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.0069224112667143345, "eval_runtime": 2667.1988, "eval_samples_per_second": 2.283, "eval_steps_per_second": 0.286, "step": 900 }, { "epoch": 0.2656352623513099, "grad_norm": 0.0150146484375, "learning_rate": 4.593942622720449e-06, "logits/chosen": -2.431570529937744, "logits/rejected": -2.431570529937744, "logps/chosen": -333.9033203125, "logps/rejected": -333.9033203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.008790754713118076, "rewards/margins": 0.0, "rewards/rejected": -0.008790754713118076, "step": 910 }, { "epoch": 0.2685543311683573, "grad_norm": 0.011474609375, "learning_rate": 4.579912012542172e-06, "logits/chosen": -2.4538259506225586, "logits/rejected": -2.4538259506225586, "logps/chosen": -330.14776611328125, "logps/rejected": -330.14776611328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.010161884129047394, "rewards/margins": 0.0, "rewards/rejected": -0.010161884129047394, "step": 920 }, { "epoch": 0.27147339998540465, "grad_norm": 0.0164794921875, "learning_rate": 4.565665291957821e-06, "logits/chosen": -2.412051200866699, "logits/rejected": -2.412051200866699, "logps/chosen": -300.0600891113281, "logps/rejected": -300.0600891113281, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.010114507749676704, "rewards/margins": 0.0, "rewards/rejected": -0.010114507749676704, "step": 930 }, { "epoch": 0.274392468802452, "grad_norm": 0.0125732421875, "learning_rate": 4.551203941253367e-06, "logits/chosen": -2.4353108406066895, "logits/rejected": -2.4353108406066895, "logps/chosen": -288.15032958984375, "logps/rejected": -288.15032958984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.01036372222006321, "rewards/margins": 0.0, "rewards/rejected": -0.01036372222006321, "step": 940 }, { "epoch": 0.2773115376194994, "grad_norm": 0.01434326171875, "learning_rate": 4.5365294630156264e-06, "logits/chosen": -2.4350383281707764, "logits/rejected": -2.4350383281707764, "logps/chosen": -319.06195068359375, "logps/rejected": -319.06195068359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.011402562260627747, "rewards/margins": 0.0, "rewards/rejected": -0.011402562260627747, "step": 950 }, { "epoch": 0.28023060643654674, "grad_norm": 0.012451171875, "learning_rate": 4.521643381976142e-06, "logits/chosen": -2.428330898284912, "logits/rejected": -2.428330898284912, "logps/chosen": -322.0547790527344, "logps/rejected": -322.0547790527344, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.012878289446234703, "rewards/margins": 0.0, "rewards/rejected": -0.012878289446234703, "step": 960 }, { "epoch": 0.2831496752535941, "grad_norm": 0.013671875, "learning_rate": 4.506547244852756e-06, "logits/chosen": -2.4220213890075684, "logits/rejected": -2.4220213890075684, "logps/chosen": -298.77056884765625, "logps/rejected": -298.77056884765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.012091143056750298, "rewards/margins": 0.0, "rewards/rejected": -0.012091143056750298, "step": 970 }, { "epoch": 0.2860687440706415, "grad_norm": 0.0145263671875, "learning_rate": 4.491242620188898e-06, "logits/chosen": -2.400778293609619, "logits/rejected": -2.400778293609619, "logps/chosen": -302.7762756347656, "logps/rejected": -302.7762756347656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.01696743816137314, "rewards/margins": 0.0, "rewards/rejected": -0.01696743816137314, "step": 980 }, { "epoch": 0.2889878128876888, "grad_norm": 0.012451171875, "learning_rate": 4.475731098190611e-06, "logits/chosen": -2.4159862995147705, "logits/rejected": -2.4159862995147705, "logps/chosen": -278.34356689453125, "logps/rejected": -278.34356689453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.014010600745677948, "rewards/margins": 0.0, "rewards/rejected": -0.014010600745677948, "step": 990 }, { "epoch": 0.29190688170473617, "grad_norm": 0.0145263671875, "learning_rate": 4.4600142905613216e-06, "logits/chosen": -2.416891098022461, "logits/rejected": -2.416891098022461, "logps/chosen": -310.4523620605469, "logps/rejected": -310.4523620605469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.01909947767853737, "rewards/margins": 0.0, "rewards/rejected": -0.01909947767853737, "step": 1000 }, { "epoch": 0.29190688170473617, "eval_logits/chosen": -2.3967111110687256, "eval_logits/rejected": -2.3967111110687256, "eval_logps/chosen": -308.2130432128906, "eval_logps/rejected": -308.2130432128906, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.017344659194350243, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.017344659194350243, "eval_runtime": 2668.0913, "eval_samples_per_second": 2.283, "eval_steps_per_second": 0.286, "step": 1000 }, { "epoch": 0.29482595052178356, "grad_norm": 0.029296875, "learning_rate": 4.444093830334381e-06, "logits/chosen": -2.395017147064209, "logits/rejected": -2.395017147064209, "logps/chosen": -330.1224670410156, "logps/rejected": -330.1224670410156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.013958754017949104, "rewards/margins": 0.0, "rewards/rejected": -0.013958754017949104, "step": 1010 }, { "epoch": 0.2977450193388309, "grad_norm": 0.01611328125, "learning_rate": 4.427971371703378e-06, "logits/chosen": -2.4404492378234863, "logits/rejected": -2.4404492378234863, "logps/chosen": -314.79888916015625, "logps/rejected": -314.79888916015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.027685949578881264, "rewards/margins": 0.0, "rewards/rejected": -0.027685949578881264, "step": 1020 }, { "epoch": 0.30066408815587825, "grad_norm": 0.01263427734375, "learning_rate": 4.411648589850276e-06, "logits/chosen": -2.4368889331817627, "logits/rejected": -2.4368889331817627, "logps/chosen": -299.6970520019531, "logps/rejected": -299.6970520019531, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.01648426428437233, "rewards/margins": 0.0, "rewards/rejected": -0.01648426428437233, "step": 1030 }, { "epoch": 0.30358315697292565, "grad_norm": 0.01416015625, "learning_rate": 4.395127180771342e-06, "logits/chosen": -2.4541175365448, "logits/rejected": -2.4541175365448, "logps/chosen": -326.87841796875, "logps/rejected": -326.87841796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.020237499848008156, "rewards/margins": 0.0, "rewards/rejected": -0.020237499848008156, "step": 1040 }, { "epoch": 0.306502225789973, "grad_norm": 0.01318359375, "learning_rate": 4.378408861100937e-06, "logits/chosen": -2.415283203125, "logits/rejected": -2.415283203125, "logps/chosen": -261.1552429199219, "logps/rejected": -261.1552429199219, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.01741962879896164, "rewards/margins": 0.0, "rewards/rejected": -0.01741962879896164, "step": 1050 }, { "epoch": 0.30942129460702034, "grad_norm": 0.01416015625, "learning_rate": 4.361495367933144e-06, "logits/chosen": -2.396031141281128, "logits/rejected": -2.396031141281128, "logps/chosen": -322.30377197265625, "logps/rejected": -322.30377197265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.014474359340965748, "rewards/margins": 0.0, "rewards/rejected": -0.014474359340965748, "step": 1060 }, { "epoch": 0.31234036342406774, "grad_norm": 0.0181884765625, "learning_rate": 4.344388458641283e-06, "logits/chosen": -2.4288814067840576, "logits/rejected": -2.4288814067840576, "logps/chosen": -324.64501953125, "logps/rejected": -324.64501953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.025701653212308884, "rewards/margins": 0.0, "rewards/rejected": -0.025701653212308884, "step": 1070 }, { "epoch": 0.3152594322411151, "grad_norm": 0.0164794921875, "learning_rate": 4.32708991069531e-06, "logits/chosen": -2.411003589630127, "logits/rejected": -2.411003589630127, "logps/chosen": -318.289794921875, "logps/rejected": -318.289794921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.02725202962756157, "rewards/margins": 0.0, "rewards/rejected": -0.02725202962756157, "step": 1080 }, { "epoch": 0.3181785010581624, "grad_norm": 0.01312255859375, "learning_rate": 4.309601521477134e-06, "logits/chosen": -2.437730550765991, "logits/rejected": -2.437730550765991, "logps/chosen": -318.1125793457031, "logps/rejected": -318.1125793457031, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.035508617758750916, "rewards/margins": 0.0, "rewards/rejected": -0.035508617758750916, "step": 1090 }, { "epoch": 0.3210975698752098, "grad_norm": 0.01373291015625, "learning_rate": 4.291925108093856e-06, "logits/chosen": -2.4134514331817627, "logits/rejected": -2.4134514331817627, "logps/chosen": -306.98712158203125, "logps/rejected": -306.98712158203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.02741456963121891, "rewards/margins": 0.0, "rewards/rejected": -0.02741456963121891, "step": 1100 }, { "epoch": 0.3210975698752098, "eval_logits/chosen": -2.3970751762390137, "eval_logits/rejected": -2.3970751762390137, "eval_logps/chosen": -309.472412109375, "eval_logps/rejected": -309.472412109375, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.029938040301203728, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.029938040301203728, "eval_runtime": 2667.8688, "eval_samples_per_second": 2.283, "eval_steps_per_second": 0.286, "step": 1100 }, { "epoch": 0.32401663869225716, "grad_norm": 0.0120849609375, "learning_rate": 4.274062507188978e-06, "logits/chosen": -2.413846492767334, "logits/rejected": -2.413846492767334, "logps/chosen": -319.53887939453125, "logps/rejected": -319.53887939453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03637847676873207, "rewards/margins": 0.0, "rewards/rejected": -0.03637847676873207, "step": 1110 }, { "epoch": 0.3269357075093045, "grad_norm": 0.0130615234375, "learning_rate": 4.256015574751555e-06, "logits/chosen": -2.443239212036133, "logits/rejected": -2.443239212036133, "logps/chosen": -302.9671630859375, "logps/rejected": -302.9671630859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.032804206013679504, "rewards/margins": 0.0, "rewards/rejected": -0.032804206013679504, "step": 1120 }, { "epoch": 0.3298547763263519, "grad_norm": 0.0159912109375, "learning_rate": 4.2377861859233604e-06, "logits/chosen": -2.4368813037872314, "logits/rejected": -2.4368813037872314, "logps/chosen": -277.4005126953125, "logps/rejected": -277.4005126953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.030909573659300804, "rewards/margins": 0.0, "rewards/rejected": -0.030909573659300804, "step": 1130 }, { "epoch": 0.33277384514339925, "grad_norm": 0.01263427734375, "learning_rate": 4.219376234804047e-06, "logits/chosen": -2.4358789920806885, "logits/rejected": -2.4358789920806885, "logps/chosen": -294.87567138671875, "logps/rejected": -294.87567138671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.033531349152326584, "rewards/margins": 0.0, "rewards/rejected": -0.033531349152326584, "step": 1140 }, { "epoch": 0.3356929139604466, "grad_norm": 0.01519775390625, "learning_rate": 4.200787634254345e-06, "logits/chosen": -2.458458662033081, "logits/rejected": -2.458458662033081, "logps/chosen": -284.5567321777344, "logps/rejected": -284.5567321777344, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.029438916593790054, "rewards/margins": 0.0, "rewards/rejected": -0.029438916593790054, "step": 1150 }, { "epoch": 0.338611982777494, "grad_norm": 0.0157470703125, "learning_rate": 4.18202231569731e-06, "logits/chosen": -2.465770721435547, "logits/rejected": -2.465770721435547, "logps/chosen": -325.60443115234375, "logps/rejected": -325.60443115234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03488076478242874, "rewards/margins": 0.0, "rewards/rejected": -0.03488076478242874, "step": 1160 }, { "epoch": 0.34153105159454133, "grad_norm": 0.0194091796875, "learning_rate": 4.163082228917639e-06, "logits/chosen": -2.42230224609375, "logits/rejected": -2.42230224609375, "logps/chosen": -332.96807861328125, "logps/rejected": -332.96807861328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03761008754372597, "rewards/margins": 0.0, "rewards/rejected": -0.03761008754372597, "step": 1170 }, { "epoch": 0.3444501204115887, "grad_norm": 0.01519775390625, "learning_rate": 4.143969341859083e-06, "logits/chosen": -2.4006218910217285, "logits/rejected": -2.4006218910217285, "logps/chosen": -298.38372802734375, "logps/rejected": -298.38372802734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.027944693341851234, "rewards/margins": 0.0, "rewards/rejected": -0.027944693341851234, "step": 1180 }, { "epoch": 0.3473691892286361, "grad_norm": 0.0167236328125, "learning_rate": 4.124685640419967e-06, "logits/chosen": -2.4376044273376465, "logits/rejected": -2.4376044273376465, "logps/chosen": -339.3370666503906, "logps/rejected": -339.3370666503906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04471305012702942, "rewards/margins": 0.0, "rewards/rejected": -0.04471305012702942, "step": 1190 }, { "epoch": 0.3502882580456834, "grad_norm": 0.015625, "learning_rate": 4.105233128246849e-06, "logits/chosen": -2.4307379722595215, "logits/rejected": -2.4307379722595215, "logps/chosen": -314.7157287597656, "logps/rejected": -314.7157287597656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04377968981862068, "rewards/margins": 0.0, "rewards/rejected": -0.04377968981862068, "step": 1200 }, { "epoch": 0.3502882580456834, "eval_logits/chosen": -2.3975985050201416, "eval_logits/rejected": -2.3975985050201416, "eval_logps/chosen": -310.0194091796875, "eval_logps/rejected": -310.0194091796875, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.035407647490501404, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.035407647490501404, "eval_runtime": 2667.8638, "eval_samples_per_second": 2.283, "eval_steps_per_second": 0.286, "step": 1200 }, { "epoch": 0.35320732686273076, "grad_norm": 0.01373291015625, "learning_rate": 4.085613826526338e-06, "logits/chosen": -2.4104952812194824, "logits/rejected": -2.4104952812194824, "logps/chosen": -307.89056396484375, "logps/rejected": -307.89056396484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.034878071397542953, "rewards/margins": 0.0, "rewards/rejected": -0.034878071397542953, "step": 1210 }, { "epoch": 0.35612639567977816, "grad_norm": 0.0130615234375, "learning_rate": 4.065829773775082e-06, "logits/chosen": -2.454697847366333, "logits/rejected": -2.454697847366333, "logps/chosen": -331.95556640625, "logps/rejected": -331.95556640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.035688284784555435, "rewards/margins": 0.0, "rewards/rejected": -0.035688284784555435, "step": 1220 }, { "epoch": 0.3590454644968255, "grad_norm": 0.01318359375, "learning_rate": 4.045883025627957e-06, "logits/chosen": -2.416503429412842, "logits/rejected": -2.416503429412842, "logps/chosen": -317.5516662597656, "logps/rejected": -317.5516662597656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.036794569343328476, "rewards/margins": 0.0, "rewards/rejected": -0.036794569343328476, "step": 1230 }, { "epoch": 0.36196453331387285, "grad_norm": 0.0159912109375, "learning_rate": 4.025775654624481e-06, "logits/chosen": -2.431762218475342, "logits/rejected": -2.431762218475342, "logps/chosen": -286.4144592285156, "logps/rejected": -286.4144592285156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0327475443482399, "rewards/margins": 0.0, "rewards/rejected": -0.0327475443482399, "step": 1240 }, { "epoch": 0.36488360213092025, "grad_norm": 0.01373291015625, "learning_rate": 4.005509749993471e-06, "logits/chosen": -2.4348835945129395, "logits/rejected": -2.4348835945129395, "logps/chosen": -264.43670654296875, "logps/rejected": -264.43670654296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03447514772415161, "rewards/margins": 0.0, "rewards/rejected": -0.03447514772415161, "step": 1250 }, { "epoch": 0.3678026709479676, "grad_norm": 0.01544189453125, "learning_rate": 3.985087417435964e-06, "logits/chosen": -2.4379494190216064, "logits/rejected": -2.4379494190216064, "logps/chosen": -306.0783386230469, "logps/rejected": -306.0783386230469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03243451565504074, "rewards/margins": 0.0, "rewards/rejected": -0.03243451565504074, "step": 1260 }, { "epoch": 0.37072173976501493, "grad_norm": 0.01318359375, "learning_rate": 3.964510778906425e-06, "logits/chosen": -2.434380292892456, "logits/rejected": -2.434380292892456, "logps/chosen": -316.9388427734375, "logps/rejected": -316.9388427734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.038867734372615814, "rewards/margins": 0.0, "rewards/rejected": -0.038867734372615814, "step": 1270 }, { "epoch": 0.37364080858206233, "grad_norm": 0.0142822265625, "learning_rate": 3.943781972392269e-06, "logits/chosen": -2.4212710857391357, "logits/rejected": -2.4212710857391357, "logps/chosen": -326.74237060546875, "logps/rejected": -326.74237060546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.038525618612766266, "rewards/margins": 0.0, "rewards/rejected": -0.038525618612766266, "step": 1280 }, { "epoch": 0.3765598773991097, "grad_norm": 0.016357421875, "learning_rate": 3.922903151691716e-06, "logits/chosen": -2.450032949447632, "logits/rejected": -2.450032949447632, "logps/chosen": -329.82073974609375, "logps/rejected": -329.82073974609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.037473224103450775, "rewards/margins": 0.0, "rewards/rejected": -0.037473224103450775, "step": 1290 }, { "epoch": 0.379478946216157, "grad_norm": 0.018310546875, "learning_rate": 3.901876486190008e-06, "logits/chosen": -2.4351401329040527, "logits/rejected": -2.4351401329040527, "logps/chosen": -315.5516662597656, "logps/rejected": -315.5516662597656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03452508896589279, "rewards/margins": 0.0, "rewards/rejected": -0.03452508896589279, "step": 1300 }, { "epoch": 0.379478946216157, "eval_logits/chosen": -2.3963370323181152, "eval_logits/rejected": -2.3963370323181152, "eval_logps/chosen": -309.5113525390625, "eval_logps/rejected": -309.5113525390625, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.030327608808875084, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.030327608808875084, "eval_runtime": 2666.8225, "eval_samples_per_second": 2.284, "eval_steps_per_second": 0.286, "step": 1300 }, { "epoch": 0.3823980150332044, "grad_norm": 0.018798828125, "learning_rate": 3.880704160633995e-06, "logits/chosen": -2.4444994926452637, "logits/rejected": -2.4444994926452637, "logps/chosen": -295.88348388671875, "logps/rejected": -295.88348388671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.036420173943042755, "rewards/margins": 0.0, "rewards/rejected": -0.036420173943042755, "step": 1310 }, { "epoch": 0.38531708385025176, "grad_norm": 0.013671875, "learning_rate": 3.859388374905136e-06, "logits/chosen": -2.41549015045166, "logits/rejected": -2.41549015045166, "logps/chosen": -291.2346496582031, "logps/rejected": -291.2346496582031, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03046022728085518, "rewards/margins": 0.0, "rewards/rejected": -0.03046022728085518, "step": 1320 }, { "epoch": 0.3882361526672991, "grad_norm": 0.0152587890625, "learning_rate": 3.837931343790924e-06, "logits/chosen": -2.4401891231536865, "logits/rejected": -2.4401891231536865, "logps/chosen": -297.060791015625, "logps/rejected": -297.060791015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.02374189719557762, "rewards/margins": 0.0, "rewards/rejected": -0.02374189719557762, "step": 1330 }, { "epoch": 0.3911552214843465, "grad_norm": 0.0152587890625, "learning_rate": 3.8163352967547575e-06, "logits/chosen": -2.4282491207122803, "logits/rejected": -2.4282491207122803, "logps/chosen": -350.7884216308594, "logps/rejected": -350.7884216308594, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.031809043139219284, "rewards/margins": 0.0, "rewards/rejected": -0.031809043139219284, "step": 1340 }, { "epoch": 0.39407429030139385, "grad_norm": 0.01190185546875, "learning_rate": 3.7946024777042974e-06, "logits/chosen": -2.423346996307373, "logits/rejected": -2.423346996307373, "logps/chosen": -300.26800537109375, "logps/rejected": -300.26800537109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.029370862990617752, "rewards/margins": 0.0, "rewards/rejected": -0.029370862990617752, "step": 1350 }, { "epoch": 0.3969933591184412, "grad_norm": 0.01953125, "learning_rate": 3.7727351447583095e-06, "logits/chosen": -2.397026538848877, "logits/rejected": -2.397026538848877, "logps/chosen": -318.9501647949219, "logps/rejected": -318.9501647949219, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.030634000897407532, "rewards/margins": 0.0, "rewards/rejected": -0.030634000897407532, "step": 1360 }, { "epoch": 0.3999124279354886, "grad_norm": 0.01385498046875, "learning_rate": 3.750735570012043e-06, "logits/chosen": -2.438441276550293, "logits/rejected": -2.438441276550293, "logps/chosen": -330.5710754394531, "logps/rejected": -330.5710754394531, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03722671791911125, "rewards/margins": 0.0, "rewards/rejected": -0.03722671791911125, "step": 1370 }, { "epoch": 0.40283149675253593, "grad_norm": 0.01806640625, "learning_rate": 3.7286060393011513e-06, "logits/chosen": -2.419067144393921, "logits/rejected": -2.419067144393921, "logps/chosen": -314.528564453125, "logps/rejected": -314.528564453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.032639987766742706, "rewards/margins": 0.0, "rewards/rejected": -0.032639987766742706, "step": 1380 }, { "epoch": 0.4057505655695833, "grad_norm": 0.01904296875, "learning_rate": 3.7063488519641825e-06, "logits/chosen": -2.4223015308380127, "logits/rejected": -2.4223015308380127, "logps/chosen": -329.4114685058594, "logps/rejected": -329.4114685058594, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03504698723554611, "rewards/margins": 0.0, "rewards/rejected": -0.03504698723554611, "step": 1390 }, { "epoch": 0.4086696343866307, "grad_norm": 0.0162353515625, "learning_rate": 3.6839663206036715e-06, "logits/chosen": -2.4432168006896973, "logits/rejected": -2.4432168006896973, "logps/chosen": -293.8369445800781, "logps/rejected": -293.8369445800781, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.031177738681435585, "rewards/margins": 0.0, "rewards/rejected": -0.031177738681435585, "step": 1400 }, { "epoch": 0.4086696343866307, "eval_logits/chosen": -2.395508050918579, "eval_logits/rejected": -2.395508050918579, "eval_logps/chosen": -309.2061462402344, "eval_logps/rejected": -309.2061462402344, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.02727527543902397, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.02727527543902397, "eval_runtime": 2667.2175, "eval_samples_per_second": 2.283, "eval_steps_per_second": 0.286, "step": 1400 }, { "epoch": 0.411588703203678, "grad_norm": 0.01239013671875, "learning_rate": 3.6614607708458532e-06, "logits/chosen": -2.418804883956909, "logits/rejected": -2.418804883956909, "logps/chosen": -295.696533203125, "logps/rejected": -295.696533203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.024944758042693138, "rewards/margins": 0.0, "rewards/rejected": -0.024944758042693138, "step": 1410 }, { "epoch": 0.41450777202072536, "grad_norm": 0.0146484375, "learning_rate": 3.6388345410990195e-06, "logits/chosen": -2.4199652671813965, "logits/rejected": -2.4199652671813965, "logps/chosen": -341.0202331542969, "logps/rejected": -341.0202331542969, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.030235985293984413, "rewards/margins": 0.0, "rewards/rejected": -0.030235985293984413, "step": 1420 }, { "epoch": 0.41742684083777276, "grad_norm": 0.01141357421875, "learning_rate": 3.6160899823105518e-06, "logits/chosen": -2.4291069507598877, "logits/rejected": -2.4291069507598877, "logps/chosen": -287.2336730957031, "logps/rejected": -287.2336730957031, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0277925543487072, "rewards/margins": 0.0, "rewards/rejected": -0.0277925543487072, "step": 1430 }, { "epoch": 0.4203459096548201, "grad_norm": 0.0140380859375, "learning_rate": 3.5932294577226468e-06, "logits/chosen": -2.440561532974243, "logits/rejected": -2.440561532974243, "logps/chosen": -276.7684020996094, "logps/rejected": -276.7684020996094, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.017870336771011353, "rewards/margins": 0.0, "rewards/rejected": -0.017870336771011353, "step": 1440 }, { "epoch": 0.42326497847186745, "grad_norm": 0.0118408203125, "learning_rate": 3.5702553426267704e-06, "logits/chosen": -2.449218988418579, "logits/rejected": -2.449218988418579, "logps/chosen": -305.78814697265625, "logps/rejected": -305.78814697265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.024231892079114914, "rewards/margins": 0.0, "rewards/rejected": -0.024231892079114914, "step": 1450 }, { "epoch": 0.42618404728891485, "grad_norm": 0.015625, "learning_rate": 3.547170024116854e-06, "logits/chosen": -2.4015636444091797, "logits/rejected": -2.4015636444091797, "logps/chosen": -281.1402893066406, "logps/rejected": -281.1402893066406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.027333328500390053, "rewards/margins": 0.0, "rewards/rejected": -0.027333328500390053, "step": 1460 }, { "epoch": 0.4291031161059622, "grad_norm": 0.0164794921875, "learning_rate": 3.5239759008412666e-06, "logits/chosen": -2.461341381072998, "logits/rejected": -2.461341381072998, "logps/chosen": -315.0804443359375, "logps/rejected": -315.0804443359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.026240995153784752, "rewards/margins": 0.0, "rewards/rejected": -0.026240995153784752, "step": 1470 }, { "epoch": 0.43202218492300953, "grad_norm": 0.0164794921875, "learning_rate": 3.500675382753588e-06, "logits/chosen": -2.420381784439087, "logits/rejected": -2.420381784439087, "logps/chosen": -310.7515563964844, "logps/rejected": -310.7515563964844, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.023152858018875122, "rewards/margins": 0.0, "rewards/rejected": -0.023152858018875122, "step": 1480 }, { "epoch": 0.43494125374005693, "grad_norm": 0.01336669921875, "learning_rate": 3.477270890862204e-06, "logits/chosen": -2.3881866931915283, "logits/rejected": -2.3881866931915283, "logps/chosen": -318.3128356933594, "logps/rejected": -318.3128356933594, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.030725980177521706, "rewards/margins": 0.0, "rewards/rejected": -0.030725980177521706, "step": 1490 }, { "epoch": 0.4378603225571043, "grad_norm": 0.0140380859375, "learning_rate": 3.453764856978758e-06, "logits/chosen": -2.409209728240967, "logits/rejected": -2.409209728240967, "logps/chosen": -331.4593200683594, "logps/rejected": -331.4593200683594, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.022285277023911476, "rewards/margins": 0.0, "rewards/rejected": -0.022285277023911476, "step": 1500 }, { "epoch": 0.4378603225571043, "eval_logits/chosen": -2.394321918487549, "eval_logits/rejected": -2.394321918487549, "eval_logps/chosen": -308.9651794433594, "eval_logps/rejected": -308.9651794433594, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.024866018444299698, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.024866018444299698, "eval_runtime": 2666.7789, "eval_samples_per_second": 2.284, "eval_steps_per_second": 0.286, "step": 1500 }, { "epoch": 0.4407793913741516, "grad_norm": 0.01312255859375, "learning_rate": 3.4301597234654733e-06, "logits/chosen": -2.4193215370178223, "logits/rejected": -2.4193215370178223, "logps/chosen": -304.951171875, "logps/rejected": -304.951171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.02852988801896572, "rewards/margins": 0.0, "rewards/rejected": -0.02852988801896572, "step": 1510 }, { "epoch": 0.443698460191199, "grad_norm": 0.0177001953125, "learning_rate": 3.406457942981384e-06, "logits/chosen": -2.430614948272705, "logits/rejected": -2.430614948272705, "logps/chosen": -333.06988525390625, "logps/rejected": -333.06988525390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.024759288877248764, "rewards/margins": 0.0, "rewards/rejected": -0.024759288877248764, "step": 1520 }, { "epoch": 0.44661752900824636, "grad_norm": 0.0133056640625, "learning_rate": 3.3826619782274954e-06, "logits/chosen": -2.43021559715271, "logits/rejected": -2.43021559715271, "logps/chosen": -284.0345153808594, "logps/rejected": -284.0345153808594, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.025433775037527084, "rewards/margins": 0.0, "rewards/rejected": -0.025433775037527084, "step": 1530 }, { "epoch": 0.4495365978252937, "grad_norm": 0.0142822265625, "learning_rate": 3.3587743016909013e-06, "logits/chosen": -2.439312219619751, "logits/rejected": -2.439312219619751, "logps/chosen": -320.015380859375, "logps/rejected": -320.015380859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.02944205328822136, "rewards/margins": 0.0, "rewards/rejected": -0.02944205328822136, "step": 1540 }, { "epoch": 0.4524556666423411, "grad_norm": 0.044677734375, "learning_rate": 3.334797395387882e-06, "logits/chosen": -2.4262938499450684, "logits/rejected": -2.4262938499450684, "logps/chosen": -329.60504150390625, "logps/rejected": -329.60504150390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.027106398716568947, "rewards/margins": 0.0, "rewards/rejected": -0.027106398716568947, "step": 1550 }, { "epoch": 0.45537473545938845, "grad_norm": 0.01226806640625, "learning_rate": 3.3107337506060145e-06, "logits/chosen": -2.4414420127868652, "logits/rejected": -2.4414420127868652, "logps/chosen": -289.9877014160156, "logps/rejected": -289.9877014160156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.028158003464341164, "rewards/margins": 0.0, "rewards/rejected": -0.028158003464341164, "step": 1560 }, { "epoch": 0.4582938042764358, "grad_norm": 0.0301513671875, "learning_rate": 3.2865858676453172e-06, "logits/chosen": -2.434182643890381, "logits/rejected": -2.434182643890381, "logps/chosen": -306.0428466796875, "logps/rejected": -306.0428466796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.02475564181804657, "rewards/margins": 0.0, "rewards/rejected": -0.02475564181804657, "step": 1570 }, { "epoch": 0.4612128730934832, "grad_norm": 0.0098876953125, "learning_rate": 3.2623562555584633e-06, "logits/chosen": -2.430816411972046, "logits/rejected": -2.430816411972046, "logps/chosen": -281.2196960449219, "logps/rejected": -281.2196960449219, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.02929893136024475, "rewards/margins": 0.0, "rewards/rejected": -0.02929893136024475, "step": 1580 }, { "epoch": 0.46413194191053053, "grad_norm": 0.024658203125, "learning_rate": 3.2380474318900766e-06, "logits/chosen": -2.4165406227111816, "logits/rejected": -2.4165406227111816, "logps/chosen": -310.68511962890625, "logps/rejected": -310.68511962890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03376628831028938, "rewards/margins": 0.0, "rewards/rejected": -0.03376628831028938, "step": 1590 }, { "epoch": 0.4670510107275779, "grad_norm": 0.016845703125, "learning_rate": 3.2136619224151533e-06, "logits/chosen": -2.4508678913116455, "logits/rejected": -2.4508678913116455, "logps/chosen": -327.84619140625, "logps/rejected": -327.84619140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03426826745271683, "rewards/margins": 0.0, "rewards/rejected": -0.03426826745271683, "step": 1600 }, { "epoch": 0.4670510107275779, "eval_logits/chosen": -2.3953943252563477, "eval_logits/rejected": -2.3953943252563477, "eval_logps/chosen": -309.15863037109375, "eval_logps/rejected": -309.15863037109375, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.026800233870744705, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.026800233870744705, "eval_runtime": 2666.9806, "eval_samples_per_second": 2.283, "eval_steps_per_second": 0.286, "step": 1600 }, { "epoch": 0.4699700795446253, "grad_norm": 0.014892578125, "learning_rate": 3.1892022608766215e-06, "logits/chosen": -2.361971378326416, "logits/rejected": -2.361971378326416, "logps/chosen": -299.3944396972656, "logps/rejected": -299.3944396972656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0262086633592844, "rewards/margins": 0.0, "rewards/rejected": -0.0262086633592844, "step": 1610 }, { "epoch": 0.4728891483616726, "grad_norm": 0.01422119140625, "learning_rate": 3.16467098872208e-06, "logits/chosen": -2.4706971645355225, "logits/rejected": -2.4706971645355225, "logps/chosen": -332.5861511230469, "logps/rejected": -332.5861511230469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.034811943769454956, "rewards/margins": 0.0, "rewards/rejected": -0.034811943769454956, "step": 1620 }, { "epoch": 0.47580821717871996, "grad_norm": 0.032470703125, "learning_rate": 3.140070654839728e-06, "logits/chosen": -2.4026148319244385, "logits/rejected": -2.4026148319244385, "logps/chosen": -296.76605224609375, "logps/rejected": -296.76605224609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.02368254028260708, "rewards/margins": 0.0, "rewards/rejected": -0.02368254028260708, "step": 1630 }, { "epoch": 0.47872728599576736, "grad_norm": 0.0242919921875, "learning_rate": 3.115403815293532e-06, "logits/chosen": -2.43617582321167, "logits/rejected": -2.43617582321167, "logps/chosen": -342.2427062988281, "logps/rejected": -342.2427062988281, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.036105576902627945, "rewards/margins": 0.0, "rewards/rejected": -0.036105576902627945, "step": 1640 }, { "epoch": 0.4816463548128147, "grad_norm": 0.0113525390625, "learning_rate": 3.0906730330576345e-06, "logits/chosen": -2.4739155769348145, "logits/rejected": -2.4739155769348145, "logps/chosen": -332.26678466796875, "logps/rejected": -332.26678466796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.028261488303542137, "rewards/margins": 0.0, "rewards/rejected": -0.028261488303542137, "step": 1650 }, { "epoch": 0.48456542362986205, "grad_norm": 0.017333984375, "learning_rate": 3.065880877750059e-06, "logits/chosen": -2.427436351776123, "logits/rejected": -2.427436351776123, "logps/chosen": -304.4495544433594, "logps/rejected": -304.4495544433594, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03417867794632912, "rewards/margins": 0.0, "rewards/rejected": -0.03417867794632912, "step": 1660 }, { "epoch": 0.48748449244690945, "grad_norm": 0.01226806640625, "learning_rate": 3.041029925365711e-06, "logits/chosen": -2.4058425426483154, "logits/rejected": -2.4058425426483154, "logps/chosen": -308.30072021484375, "logps/rejected": -308.30072021484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.035630594938993454, "rewards/margins": 0.0, "rewards/rejected": -0.035630594938993454, "step": 1670 }, { "epoch": 0.4904035612639568, "grad_norm": 0.0126953125, "learning_rate": 3.0161227580087282e-06, "logits/chosen": -2.433281421661377, "logits/rejected": -2.433281421661377, "logps/chosen": -342.0614013671875, "logps/rejected": -342.0614013671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03289630264043808, "rewards/margins": 0.0, "rewards/rejected": -0.03289630264043808, "step": 1680 }, { "epoch": 0.49332263008100413, "grad_norm": 0.0123291015625, "learning_rate": 2.9911619636241862e-06, "logits/chosen": -2.4333884716033936, "logits/rejected": -2.4333884716033936, "logps/chosen": -322.1616516113281, "logps/rejected": -322.1616516113281, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.034327663481235504, "rewards/margins": 0.0, "rewards/rejected": -0.034327663481235504, "step": 1690 }, { "epoch": 0.49624169889805153, "grad_norm": 0.01275634765625, "learning_rate": 2.966150135729203e-06, "logits/chosen": -2.38623046875, "logits/rejected": -2.38623046875, "logps/chosen": -335.8984680175781, "logps/rejected": -335.8984680175781, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03050742670893669, "rewards/margins": 0.0, "rewards/rejected": -0.03050742670893669, "step": 1700 }, { "epoch": 0.49624169889805153, "eval_logits/chosen": -2.3913044929504395, "eval_logits/rejected": -2.3913044929504395, "eval_logps/chosen": -309.405517578125, "eval_logps/rejected": -309.405517578125, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.029269486665725708, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.029269486665725708, "eval_runtime": 2669.612, "eval_samples_per_second": 2.281, "eval_steps_per_second": 0.285, "step": 1700 }, { "epoch": 0.4991607677150989, "grad_norm": 0.01324462890625, "learning_rate": 2.9410898731434667e-06, "logits/chosen": -2.41214919090271, "logits/rejected": -2.41214919090271, "logps/chosen": -302.40887451171875, "logps/rejected": -302.40887451171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.028926188126206398, "rewards/margins": 0.0, "rewards/rejected": -0.028926188126206398, "step": 1710 }, { "epoch": 0.5020798365321463, "grad_norm": 0.0152587890625, "learning_rate": 2.9159837797192003e-06, "logits/chosen": -2.415527820587158, "logits/rejected": -2.415527820587158, "logps/chosen": -329.7999267578125, "logps/rejected": -329.7999267578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03768650442361832, "rewards/margins": 0.0, "rewards/rejected": -0.03768650442361832, "step": 1720 }, { "epoch": 0.5049989053491936, "grad_norm": 0.014404296875, "learning_rate": 2.890834464070623e-06, "logits/chosen": -2.4205574989318848, "logits/rejected": -2.4205574989318848, "logps/chosen": -309.94329833984375, "logps/rejected": -309.94329833984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03702525794506073, "rewards/margins": 0.0, "rewards/rejected": -0.03702525794506073, "step": 1730 }, { "epoch": 0.507917974166241, "grad_norm": 0.013671875, "learning_rate": 2.865644539302896e-06, "logits/chosen": -2.389092206954956, "logits/rejected": -2.389092206954956, "logps/chosen": -339.6660461425781, "logps/rejected": -339.6660461425781, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.029835382476449013, "rewards/margins": 0.0, "rewards/rejected": -0.029835382476449013, "step": 1740 }, { "epoch": 0.5108370429832884, "grad_norm": 0.01300048828125, "learning_rate": 2.840416622740617e-06, "logits/chosen": -2.444392681121826, "logits/rejected": -2.444392681121826, "logps/chosen": -318.47296142578125, "logps/rejected": -318.47296142578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03087993524968624, "rewards/margins": 0.0, "rewards/rejected": -0.03087993524968624, "step": 1750 }, { "epoch": 0.5137561118003356, "grad_norm": 0.01263427734375, "learning_rate": 2.8151533356558673e-06, "logits/chosen": -2.4179341793060303, "logits/rejected": -2.4179341793060303, "logps/chosen": -295.8548889160156, "logps/rejected": -295.8548889160156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.032246123999357224, "rewards/margins": 0.0, "rewards/rejected": -0.032246123999357224, "step": 1760 }, { "epoch": 0.516675180617383, "grad_norm": 0.014892578125, "learning_rate": 2.7898573029958563e-06, "logits/chosen": -2.377382516860962, "logits/rejected": -2.377382516860962, "logps/chosen": -305.41656494140625, "logps/rejected": -305.41656494140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03165289759635925, "rewards/margins": 0.0, "rewards/rejected": -0.03165289759635925, "step": 1770 }, { "epoch": 0.5195942494344304, "grad_norm": 0.0103759765625, "learning_rate": 2.7645311531101763e-06, "logits/chosen": -2.412802219390869, "logits/rejected": -2.412802219390869, "logps/chosen": -312.50067138671875, "logps/rejected": -312.50067138671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.034763775765895844, "rewards/margins": 0.0, "rewards/rejected": -0.034763775765895844, "step": 1780 }, { "epoch": 0.5225133182514777, "grad_norm": 0.0135498046875, "learning_rate": 2.7391775174777084e-06, "logits/chosen": -2.419868230819702, "logits/rejected": -2.419868230819702, "logps/chosen": -310.26922607421875, "logps/rejected": -310.26922607421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.035530101507902145, "rewards/margins": 0.0, "rewards/rejected": -0.035530101507902145, "step": 1790 }, { "epoch": 0.5254323870685251, "grad_norm": 0.0167236328125, "learning_rate": 2.713799030433203e-06, "logits/chosen": -2.423767566680908, "logits/rejected": -2.423767566680908, "logps/chosen": -308.0718688964844, "logps/rejected": -308.0718688964844, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03834807127714157, "rewards/margins": 0.0, "rewards/rejected": -0.03834807127714157, "step": 1800 }, { "epoch": 0.5254323870685251, "eval_logits/chosen": -2.392709732055664, "eval_logits/rejected": -2.392709732055664, "eval_logps/chosen": -310.26434326171875, "eval_logps/rejected": -310.26434326171875, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.037857454270124435, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.037857454270124435, "eval_runtime": 2669.2359, "eval_samples_per_second": 2.282, "eval_steps_per_second": 0.285, "step": 1800 }, { "epoch": 0.5283514558855725, "grad_norm": 0.01373291015625, "learning_rate": 2.688398328893561e-06, "logits/chosen": -2.4216887950897217, "logits/rejected": -2.4216887950897217, "logps/chosen": -307.491455078125, "logps/rejected": -307.491455078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03987189009785652, "rewards/margins": 0.0, "rewards/rejected": -0.03987189009785652, "step": 1810 }, { "epoch": 0.5312705247026198, "grad_norm": 0.013916015625, "learning_rate": 2.6629780520838526e-06, "logits/chosen": -2.389004945755005, "logits/rejected": -2.389004945755005, "logps/chosen": -314.912353515625, "logps/rejected": -314.912353515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03697946295142174, "rewards/margins": 0.0, "rewards/rejected": -0.03697946295142174, "step": 1820 }, { "epoch": 0.5341895935196672, "grad_norm": 0.016845703125, "learning_rate": 2.637540841263088e-06, "logits/chosen": -2.4251251220703125, "logits/rejected": -2.4251251220703125, "logps/chosen": -309.82611083984375, "logps/rejected": -309.82611083984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.041506171226501465, "rewards/margins": 0.0, "rewards/rejected": -0.041506171226501465, "step": 1830 }, { "epoch": 0.5371086623367146, "grad_norm": 0.0130615234375, "learning_rate": 2.6120893394497825e-06, "logits/chosen": -2.4095826148986816, "logits/rejected": -2.4095826148986816, "logps/chosen": -290.29876708984375, "logps/rejected": -290.29876708984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03885159641504288, "rewards/margins": 0.0, "rewards/rejected": -0.03885159641504288, "step": 1840 }, { "epoch": 0.5400277311537619, "grad_norm": 0.0203857421875, "learning_rate": 2.586626191147337e-06, "logits/chosen": -2.414461612701416, "logits/rejected": -2.414461612701416, "logps/chosen": -298.74444580078125, "logps/rejected": -298.74444580078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.035465486347675323, "rewards/margins": 0.0, "rewards/rejected": -0.035465486347675323, "step": 1850 }, { "epoch": 0.5429467999708093, "grad_norm": 0.0142822265625, "learning_rate": 2.5611540420692666e-06, "logits/chosen": -2.4189705848693848, "logits/rejected": -2.4189705848693848, "logps/chosen": -361.6686706542969, "logps/rejected": -361.6686706542969, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04054202139377594, "rewards/margins": 0.0, "rewards/rejected": -0.04054202139377594, "step": 1860 }, { "epoch": 0.5458658687878567, "grad_norm": 0.01446533203125, "learning_rate": 2.5356755388642973e-06, "logits/chosen": -2.4053876399993896, "logits/rejected": -2.4053876399993896, "logps/chosen": -290.9534606933594, "logps/rejected": -290.9534606933594, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.037081241607666016, "rewards/margins": 0.0, "rewards/rejected": -0.037081241607666016, "step": 1870 }, { "epoch": 0.548784937604904, "grad_norm": 0.01611328125, "learning_rate": 2.510193328841375e-06, "logits/chosen": -2.4209909439086914, "logits/rejected": -2.4209909439086914, "logps/chosen": -304.0765075683594, "logps/rejected": -304.0765075683594, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03415703400969505, "rewards/margins": 0.0, "rewards/rejected": -0.03415703400969505, "step": 1880 }, { "epoch": 0.5517040064219514, "grad_norm": 0.0164794921875, "learning_rate": 2.484710059694594e-06, "logits/chosen": -2.4459662437438965, "logits/rejected": -2.4459662437438965, "logps/chosen": -274.7349548339844, "logps/rejected": -274.7349548339844, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03464942425489426, "rewards/margins": 0.0, "rewards/rejected": -0.03464942425489426, "step": 1890 }, { "epoch": 0.5546230752389988, "grad_norm": 0.01348876953125, "learning_rate": 2.4592283792280977e-06, "logits/chosen": -2.384141206741333, "logits/rejected": -2.384141206741333, "logps/chosen": -293.96533203125, "logps/rejected": -293.96533203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04034542292356491, "rewards/margins": 0.0, "rewards/rejected": -0.04034542292356491, "step": 1900 }, { "epoch": 0.5546230752389988, "eval_logits/chosen": -2.3927481174468994, "eval_logits/rejected": -2.3927481174468994, "eval_logps/chosen": -310.4163818359375, "eval_logps/rejected": -310.4163818359375, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.03937768191099167, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.03937768191099167, "eval_runtime": 2717.9911, "eval_samples_per_second": 2.241, "eval_steps_per_second": 0.28, "step": 1900 }, { "epoch": 0.5575421440560461, "grad_norm": 0.01123046875, "learning_rate": 2.433750935080959e-06, "logits/chosen": -2.438390016555786, "logits/rejected": -2.438390016555786, "logps/chosen": -282.78106689453125, "logps/rejected": -282.78106689453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.05149908736348152, "rewards/margins": 0.0, "rewards/rejected": -0.05149908736348152, "step": 1910 }, { "epoch": 0.5604612128730935, "grad_norm": 0.011962890625, "learning_rate": 2.408280374452083e-06, "logits/chosen": -2.4534342288970947, "logits/rejected": -2.4534342288970947, "logps/chosen": -306.63946533203125, "logps/rejected": -306.63946533203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04204695671796799, "rewards/margins": 0.0, "rewards/rejected": -0.04204695671796799, "step": 1920 }, { "epoch": 0.5633802816901409, "grad_norm": 0.01385498046875, "learning_rate": 2.3828193438251497e-06, "logits/chosen": -2.4302496910095215, "logits/rejected": -2.4302496910095215, "logps/chosen": -328.1105651855469, "logps/rejected": -328.1105651855469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03839876502752304, "rewards/margins": 0.0, "rewards/rejected": -0.03839876502752304, "step": 1930 }, { "epoch": 0.5662993505071882, "grad_norm": 0.01513671875, "learning_rate": 2.3573704886936414e-06, "logits/chosen": -2.4566609859466553, "logits/rejected": -2.4566609859466553, "logps/chosen": -314.76910400390625, "logps/rejected": -314.76910400390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04071163386106491, "rewards/margins": 0.0, "rewards/rejected": -0.04071163386106491, "step": 1940 }, { "epoch": 0.5692184193242356, "grad_norm": 0.01397705078125, "learning_rate": 2.331936453285957e-06, "logits/chosen": -2.414055109024048, "logits/rejected": -2.414055109024048, "logps/chosen": -346.7576904296875, "logps/rejected": -346.7576904296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.036326270550489426, "rewards/margins": 0.0, "rewards/rejected": -0.036326270550489426, "step": 1950 }, { "epoch": 0.572137488141283, "grad_norm": 0.0157470703125, "learning_rate": 2.3065198802906767e-06, "logits/chosen": -2.4286112785339355, "logits/rejected": -2.4286112785339355, "logps/chosen": -339.60064697265625, "logps/rejected": -339.60064697265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04548191279172897, "rewards/margins": 0.0, "rewards/rejected": -0.04548191279172897, "step": 1960 }, { "epoch": 0.5750565569583302, "grad_norm": 0.01141357421875, "learning_rate": 2.2811234105819714e-06, "logits/chosen": -2.4342637062072754, "logits/rejected": -2.4342637062072754, "logps/chosen": -314.4915771484375, "logps/rejected": -314.4915771484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03697306662797928, "rewards/margins": 0.0, "rewards/rejected": -0.03697306662797928, "step": 1970 }, { "epoch": 0.5779756257753776, "grad_norm": 0.01495361328125, "learning_rate": 2.2557496829452056e-06, "logits/chosen": -2.387324810028076, "logits/rejected": -2.387324810028076, "logps/chosen": -349.37835693359375, "logps/rejected": -349.37835693359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04250973090529442, "rewards/margins": 0.0, "rewards/rejected": -0.04250973090529442, "step": 1980 }, { "epoch": 0.580894694592425, "grad_norm": 0.0152587890625, "learning_rate": 2.230401333802763e-06, "logits/chosen": -2.412137985229492, "logits/rejected": -2.412137985229492, "logps/chosen": -310.9895324707031, "logps/rejected": -310.9895324707031, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.040226660668849945, "rewards/margins": 0.0, "rewards/rejected": -0.040226660668849945, "step": 1990 }, { "epoch": 0.5838137634094723, "grad_norm": 0.01483154296875, "learning_rate": 2.205080996940108e-06, "logits/chosen": -2.4124810695648193, "logits/rejected": -2.4124810695648193, "logps/chosen": -273.5890197753906, "logps/rejected": -273.5890197753906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04197770729660988, "rewards/margins": 0.0, "rewards/rejected": -0.04197770729660988, "step": 2000 }, { "epoch": 0.5838137634094723, "eval_logits/chosen": -2.392037868499756, "eval_logits/rejected": -2.392037868499756, "eval_logps/chosen": -310.4427185058594, "eval_logps/rejected": -310.4427185058594, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.039641354233026505, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.039641354233026505, "eval_runtime": 2711.551, "eval_samples_per_second": 2.246, "eval_steps_per_second": 0.281, "step": 2000 }, { "epoch": 0.5867328322265197, "grad_norm": 0.01214599609375, "learning_rate": 2.1797913032321283e-06, "logits/chosen": -2.420572519302368, "logits/rejected": -2.420572519302368, "logps/chosen": -277.4279479980469, "logps/rejected": -277.4279479980469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03539072722196579, "rewards/margins": 0.0, "rewards/rejected": -0.03539072722196579, "step": 2010 }, { "epoch": 0.5896519010435671, "grad_norm": 0.0157470703125, "learning_rate": 2.1545348803697745e-06, "logits/chosen": -2.4433321952819824, "logits/rejected": -2.4433321952819824, "logps/chosen": -281.5128479003906, "logps/rejected": -281.5128479003906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.016543438658118248, "rewards/margins": 0.0, "rewards/rejected": 0.016543438658118248, "step": 2020 }, { "epoch": 0.5925709698606144, "grad_norm": 0.015869140625, "learning_rate": 2.1293143525870396e-06, "logits/chosen": -2.435228109359741, "logits/rejected": -2.435228109359741, "logps/chosen": -315.1198425292969, "logps/rejected": -315.1198425292969, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04324204847216606, "rewards/margins": 0.0, "rewards/rejected": -0.04324204847216606, "step": 2030 }, { "epoch": 0.5954900386776618, "grad_norm": 0.0133056640625, "learning_rate": 2.1041323403882836e-06, "logits/chosen": -2.458317995071411, "logits/rejected": -2.458317995071411, "logps/chosen": -314.63482666015625, "logps/rejected": -314.63482666015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.039002105593681335, "rewards/margins": 0.0, "rewards/rejected": -0.039002105593681335, "step": 2040 }, { "epoch": 0.5984091074947092, "grad_norm": 0.0164794921875, "learning_rate": 2.078991460275958e-06, "logits/chosen": -2.4496326446533203, "logits/rejected": -2.4496326446533203, "logps/chosen": -295.86199951171875, "logps/rejected": -295.86199951171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03856682404875755, "rewards/margins": 0.0, "rewards/rejected": -0.03856682404875755, "step": 2050 }, { "epoch": 0.6013281763117565, "grad_norm": 0.01409912109375, "learning_rate": 2.0538943244787452e-06, "logits/chosen": -2.440256118774414, "logits/rejected": -2.440256118774414, "logps/chosen": -302.68463134765625, "logps/rejected": -302.68463134765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.040991030633449554, "rewards/margins": 0.0, "rewards/rejected": -0.040991030633449554, "step": 2060 }, { "epoch": 0.6042472451288039, "grad_norm": 0.01226806640625, "learning_rate": 2.0288435406801293e-06, "logits/chosen": -2.4207422733306885, "logits/rejected": -2.4207422733306885, "logps/chosen": -347.23297119140625, "logps/rejected": -347.23297119140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03826383873820305, "rewards/margins": 0.0, "rewards/rejected": -0.03826383873820305, "step": 2070 }, { "epoch": 0.6071663139458513, "grad_norm": 0.01275634765625, "learning_rate": 2.0038417117474574e-06, "logits/chosen": -2.4277267456054688, "logits/rejected": -2.4277267456054688, "logps/chosen": -314.09674072265625, "logps/rejected": -314.09674072265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.05563684552907944, "rewards/margins": 0.0, "rewards/rejected": -0.05563684552907944, "step": 2080 }, { "epoch": 0.6100853827628986, "grad_norm": 0.01251220703125, "learning_rate": 1.9788914354614853e-06, "logits/chosen": -2.4430274963378906, "logits/rejected": -2.4430274963378906, "logps/chosen": -280.791015625, "logps/rejected": -280.791015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.039162371307611465, "rewards/margins": 0.0, "rewards/rejected": -0.039162371307611465, "step": 2090 }, { "epoch": 0.613004451579946, "grad_norm": 0.0159912109375, "learning_rate": 1.9539953042464656e-06, "logits/chosen": -2.4126973152160645, "logits/rejected": -2.4126973152160645, "logps/chosen": -341.8514709472656, "logps/rejected": -341.8514709472656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04444648697972298, "rewards/margins": 0.0, "rewards/rejected": -0.04444648697972298, "step": 2100 }, { "epoch": 0.613004451579946, "eval_logits/chosen": -2.390094041824341, "eval_logits/rejected": -2.390094041824341, "eval_logps/chosen": -310.71502685546875, "eval_logps/rejected": -310.71502685546875, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.042364299297332764, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.042364299297332764, "eval_runtime": 2698.9127, "eval_samples_per_second": 2.256, "eval_steps_per_second": 0.282, "step": 2100 }, { "epoch": 0.6159235203969934, "grad_norm": 0.0126953125, "learning_rate": 1.929155904900778e-06, "logits/chosen": -2.442920207977295, "logits/rejected": -2.442920207977295, "logps/chosen": -336.13153076171875, "logps/rejected": -336.13153076171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04605261981487274, "rewards/margins": 0.0, "rewards/rejected": -0.04605261981487274, "step": 2110 }, { "epoch": 0.6188425892140407, "grad_norm": 0.0128173828125, "learning_rate": 1.9043758183281548e-06, "logits/chosen": -2.398139476776123, "logits/rejected": -2.398139476776123, "logps/chosen": -297.93353271484375, "logps/rejected": -297.93353271484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03661734238266945, "rewards/margins": 0.0, "rewards/rejected": -0.03661734238266945, "step": 2120 }, { "epoch": 0.6217616580310881, "grad_norm": 0.0162353515625, "learning_rate": 1.8796576192695198e-06, "logits/chosen": -2.4115586280822754, "logits/rejected": -2.4115586280822754, "logps/chosen": -283.5032653808594, "logps/rejected": -283.5032653808594, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.05024952441453934, "rewards/margins": 0.0, "rewards/rejected": -0.05024952441453934, "step": 2130 }, { "epoch": 0.6246807268481355, "grad_norm": 0.01611328125, "learning_rate": 1.8550038760354559e-06, "logits/chosen": -2.4140570163726807, "logits/rejected": -2.4140570163726807, "logps/chosen": -328.29241943359375, "logps/rejected": -328.29241943359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03783569857478142, "rewards/margins": 0.0, "rewards/rejected": -0.03783569857478142, "step": 2140 }, { "epoch": 0.6275997956651828, "grad_norm": 0.01470947265625, "learning_rate": 1.8304171502393542e-06, "logits/chosen": -2.4498252868652344, "logits/rejected": -2.4498252868652344, "logps/chosen": -333.46807861328125, "logps/rejected": -333.46807861328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.05009561777114868, "rewards/margins": 0.0, "rewards/rejected": -0.05009561777114868, "step": 2150 }, { "epoch": 0.6305188644822302, "grad_norm": 0.0198974609375, "learning_rate": 1.8058999965312484e-06, "logits/chosen": -2.3965957164764404, "logits/rejected": -2.3965957164764404, "logps/chosen": -306.3211669921875, "logps/rejected": -306.3211669921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04475449398159981, "rewards/margins": 0.0, "rewards/rejected": -0.04475449398159981, "step": 2160 }, { "epoch": 0.6334379332992776, "grad_norm": 0.016357421875, "learning_rate": 1.7814549623323828e-06, "logits/chosen": -2.400684356689453, "logits/rejected": -2.400684356689453, "logps/chosen": -286.625, "logps/rejected": -286.625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.043368883430957794, "rewards/margins": 0.0, "rewards/rejected": -0.043368883430957794, "step": 2170 }, { "epoch": 0.6363570021163248, "grad_norm": 0.01531982421875, "learning_rate": 1.7570845875705205e-06, "logits/chosen": -2.4366753101348877, "logits/rejected": -2.4366753101348877, "logps/chosen": -338.27679443359375, "logps/rejected": -338.27679443359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.05562018230557442, "rewards/margins": 0.0, "rewards/rejected": -0.05562018230557442, "step": 2180 }, { "epoch": 0.6392760709333722, "grad_norm": 0.0162353515625, "learning_rate": 1.7327914044160388e-06, "logits/chosen": -2.449612617492676, "logits/rejected": -2.449612617492676, "logps/chosen": -316.91766357421875, "logps/rejected": -316.91766357421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04444233328104019, "rewards/margins": 0.0, "rewards/rejected": -0.04444233328104019, "step": 2190 }, { "epoch": 0.6421951397504196, "grad_norm": 0.0145263671875, "learning_rate": 1.7085779370188276e-06, "logits/chosen": -2.3980746269226074, "logits/rejected": -2.3980746269226074, "logps/chosen": -308.85906982421875, "logps/rejected": -308.85906982421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0481327660381794, "rewards/margins": 0.0, "rewards/rejected": -0.0481327660381794, "step": 2200 }, { "epoch": 0.6421951397504196, "eval_logits/chosen": -2.3910679817199707, "eval_logits/rejected": -2.3910679817199707, "eval_logps/chosen": -311.0310363769531, "eval_logps/rejected": -311.0310363769531, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.04552413523197174, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.04552413523197174, "eval_runtime": 2707.3295, "eval_samples_per_second": 2.249, "eval_steps_per_second": 0.281, "step": 2200 }, { "epoch": 0.6451142085674669, "grad_norm": 0.016845703125, "learning_rate": 1.6844467012460193e-06, "logits/chosen": -2.429086446762085, "logits/rejected": -2.429086446762085, "logps/chosen": -306.8155822753906, "logps/rejected": -306.8155822753906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04282836988568306, "rewards/margins": 0.0, "rewards/rejected": -0.04282836988568306, "step": 2210 }, { "epoch": 0.6480332773845143, "grad_norm": 0.014404296875, "learning_rate": 1.6604002044205825e-06, "logits/chosen": -2.4325811862945557, "logits/rejected": -2.4325811862945557, "logps/chosen": -337.0578308105469, "logps/rejected": -337.0578308105469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04457175359129906, "rewards/margins": 0.0, "rewards/rejected": -0.04457175359129906, "step": 2220 }, { "epoch": 0.6509523462015617, "grad_norm": 0.01397705078125, "learning_rate": 1.6364409450608018e-06, "logits/chosen": -2.4428985118865967, "logits/rejected": -2.4428985118865967, "logps/chosen": -308.55657958984375, "logps/rejected": -308.55657958984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04731472209095955, "rewards/margins": 0.0, "rewards/rejected": -0.04731472209095955, "step": 2230 }, { "epoch": 0.653871415018609, "grad_norm": 0.013427734375, "learning_rate": 1.6125714126206736e-06, "logits/chosen": -2.4196009635925293, "logits/rejected": -2.4196009635925293, "logps/chosen": -348.8056335449219, "logps/rejected": -348.8056335449219, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.049455009400844574, "rewards/margins": 0.0, "rewards/rejected": -0.049455009400844574, "step": 2240 }, { "epoch": 0.6567904838356564, "grad_norm": 0.01556396484375, "learning_rate": 1.5887940872312391e-06, "logits/chosen": -2.4100897312164307, "logits/rejected": -2.4100897312164307, "logps/chosen": -320.3233642578125, "logps/rejected": -320.3233642578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04676957428455353, "rewards/margins": 0.0, "rewards/rejected": -0.04676957428455353, "step": 2250 }, { "epoch": 0.6597095526527038, "grad_norm": 0.0147705078125, "learning_rate": 1.5651114394428955e-06, "logits/chosen": -2.4624266624450684, "logits/rejected": -2.4624266624450684, "logps/chosen": -344.6718444824219, "logps/rejected": -344.6718444824219, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0538489893078804, "rewards/margins": 0.0, "rewards/rejected": -0.0538489893078804, "step": 2260 }, { "epoch": 0.6626286214697511, "grad_norm": 0.01251220703125, "learning_rate": 1.5415259299686903e-06, "logits/chosen": -2.4147191047668457, "logits/rejected": -2.4147191047668457, "logps/chosen": -316.6529235839844, "logps/rejected": -316.6529235839844, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.041484713554382324, "rewards/margins": 0.0, "rewards/rejected": -0.041484713554382324, "step": 2270 }, { "epoch": 0.6655476902867985, "grad_norm": 0.01348876953125, "learning_rate": 1.5180400094286496e-06, "logits/chosen": -2.440053939819336, "logits/rejected": -2.440053939819336, "logps/chosen": -309.5370178222656, "logps/rejected": -309.5370178222656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04570756107568741, "rewards/margins": 0.0, "rewards/rejected": -0.04570756107568741, "step": 2280 }, { "epoch": 0.6684667591038459, "grad_norm": 0.017822265625, "learning_rate": 1.494656118095149e-06, "logits/chosen": -2.407764434814453, "logits/rejected": -2.407764434814453, "logps/chosen": -320.51263427734375, "logps/rejected": -320.51263427734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04706931859254837, "rewards/margins": 0.0, "rewards/rejected": -0.04706931859254837, "step": 2290 }, { "epoch": 0.6713858279208932, "grad_norm": 0.0120849609375, "learning_rate": 1.4713766856393557e-06, "logits/chosen": -2.420919895172119, "logits/rejected": -2.420919895172119, "logps/chosen": -295.04547119140625, "logps/rejected": -295.04547119140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.05071335285902023, "rewards/margins": 0.0, "rewards/rejected": -0.05071335285902023, "step": 2300 }, { "epoch": 0.6713858279208932, "eval_logits/chosen": -2.391244411468506, "eval_logits/rejected": -2.391244411468506, "eval_logps/chosen": -310.7880554199219, "eval_logps/rejected": -310.7880554199219, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.04309455305337906, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.04309455305337906, "eval_runtime": 2670.058, "eval_samples_per_second": 2.281, "eval_steps_per_second": 0.285, "step": 2300 }, { "epoch": 0.6743048967379406, "grad_norm": 0.0198974609375, "learning_rate": 1.448204130878785e-06, "logits/chosen": -2.3968968391418457, "logits/rejected": -2.3968968391418457, "logps/chosen": -287.2406005859375, "logps/rejected": -287.2406005859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04974811524152756, "rewards/margins": 0.0, "rewards/rejected": -0.04974811524152756, "step": 2310 }, { "epoch": 0.677223965554988, "grad_norm": 0.013916015625, "learning_rate": 1.425140861525967e-06, "logits/chosen": -2.407982587814331, "logits/rejected": -2.407982587814331, "logps/chosen": -346.8302307128906, "logps/rejected": -346.8302307128906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.045040082186460495, "rewards/margins": 0.0, "rewards/rejected": -0.045040082186460495, "step": 2320 }, { "epoch": 0.6801430343720353, "grad_norm": 0.01531982421875, "learning_rate": 1.4021892739382853e-06, "logits/chosen": -2.4366557598114014, "logits/rejected": -2.4366557598114014, "logps/chosen": -315.5507507324219, "logps/rejected": -315.5507507324219, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.053034014999866486, "rewards/margins": 0.0, "rewards/rejected": -0.053034014999866486, "step": 2330 }, { "epoch": 0.6830621031890827, "grad_norm": 0.013916015625, "learning_rate": 1.3793517528689804e-06, "logits/chosen": -2.40993070602417, "logits/rejected": -2.40993070602417, "logps/chosen": -322.5754699707031, "logps/rejected": -322.5754699707031, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04859765246510506, "rewards/margins": 0.0, "rewards/rejected": -0.04859765246510506, "step": 2340 }, { "epoch": 0.6859811720061301, "grad_norm": 0.0167236328125, "learning_rate": 1.3566306712193704e-06, "logits/chosen": -2.4204134941101074, "logits/rejected": -2.4204134941101074, "logps/chosen": -349.4993896484375, "logps/rejected": -349.4993896484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.05103806406259537, "rewards/margins": 0.0, "rewards/rejected": -0.05103806406259537, "step": 2350 }, { "epoch": 0.6889002408231774, "grad_norm": 0.01531982421875, "learning_rate": 1.3340283897922911e-06, "logits/chosen": -2.4295237064361572, "logits/rejected": -2.4295237064361572, "logps/chosen": -330.99005126953125, "logps/rejected": -330.99005126953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04879484325647354, "rewards/margins": 0.0, "rewards/rejected": -0.04879484325647354, "step": 2360 }, { "epoch": 0.6918193096402248, "grad_norm": 0.0146484375, "learning_rate": 1.3115472570468058e-06, "logits/chosen": -2.4285712242126465, "logits/rejected": -2.4285712242126465, "logps/chosen": -336.67364501953125, "logps/rejected": -336.67364501953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04440216347575188, "rewards/margins": 0.0, "rewards/rejected": -0.04440216347575188, "step": 2370 }, { "epoch": 0.6947383784572722, "grad_norm": 0.0162353515625, "learning_rate": 1.2891896088541928e-06, "logits/chosen": -2.405956745147705, "logits/rejected": -2.405956745147705, "logps/chosen": -338.88739013671875, "logps/rejected": -338.88739013671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.047105275094509125, "rewards/margins": 0.0, "rewards/rejected": -0.047105275094509125, "step": 2380 }, { "epoch": 0.6976574472743194, "grad_norm": 0.0169677734375, "learning_rate": 1.266957768255232e-06, "logits/chosen": -2.422194719314575, "logits/rejected": -2.422194719314575, "logps/chosen": -318.286865234375, "logps/rejected": -318.286865234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04666005074977875, "rewards/margins": 0.0, "rewards/rejected": -0.04666005074977875, "step": 2390 }, { "epoch": 0.7005765160913668, "grad_norm": 0.0142822265625, "learning_rate": 1.2448540452188432e-06, "logits/chosen": -2.3955206871032715, "logits/rejected": -2.3955206871032715, "logps/chosen": -314.3586120605469, "logps/rejected": -314.3586120605469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04044215753674507, "rewards/margins": 0.0, "rewards/rejected": -0.04044215753674507, "step": 2400 }, { "epoch": 0.7005765160913668, "eval_logits/chosen": -2.3899266719818115, "eval_logits/rejected": -2.3899266719818115, "eval_logps/chosen": -310.6455078125, "eval_logps/rejected": -310.6455078125, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.04166920483112335, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.04166920483112335, "eval_runtime": 2668.7744, "eval_samples_per_second": 2.282, "eval_steps_per_second": 0.286, "step": 2400 }, { "epoch": 0.7034955849084142, "grad_norm": 0.01446533203125, "learning_rate": 1.2228807364020617e-06, "logits/chosen": -2.4090027809143066, "logits/rejected": -2.4090027809143066, "logps/chosen": -268.48944091796875, "logps/rejected": -268.48944091796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.037643421441316605, "rewards/margins": 0.0, "rewards/rejected": -0.037643421441316605, "step": 2410 }, { "epoch": 0.7064146537254615, "grad_norm": 0.012451171875, "learning_rate": 1.2010401249114166e-06, "logits/chosen": -2.4060184955596924, "logits/rejected": -2.4060184955596924, "logps/chosen": -338.2677001953125, "logps/rejected": -338.2677001953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.035085879266262054, "rewards/margins": 0.0, "rewards/rejected": -0.035085879266262054, "step": 2420 }, { "epoch": 0.7093337225425089, "grad_norm": 0.0206298828125, "learning_rate": 1.1793344800656995e-06, "logits/chosen": -2.3857572078704834, "logits/rejected": -2.3857572078704834, "logps/chosen": -325.4837646484375, "logps/rejected": -325.4837646484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03704181686043739, "rewards/margins": 0.0, "rewards/rejected": -0.03704181686043739, "step": 2430 }, { "epoch": 0.7122527913595563, "grad_norm": 0.01544189453125, "learning_rate": 1.1577660571601796e-06, "logits/chosen": -2.396127223968506, "logits/rejected": -2.396127223968506, "logps/chosen": -321.38897705078125, "logps/rejected": -321.38897705078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0452260822057724, "rewards/margins": 0.0, "rewards/rejected": -0.0452260822057724, "step": 2440 }, { "epoch": 0.7151718601766036, "grad_norm": 0.0137939453125, "learning_rate": 1.1363370972322694e-06, "logits/chosen": -2.4177489280700684, "logits/rejected": -2.4177489280700684, "logps/chosen": -296.6512756347656, "logps/rejected": -296.6512756347656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04763947054743767, "rewards/margins": 0.0, "rewards/rejected": -0.04763947054743767, "step": 2450 }, { "epoch": 0.718090928993651, "grad_norm": 0.0142822265625, "learning_rate": 1.115049826828669e-06, "logits/chosen": -2.4321625232696533, "logits/rejected": -2.4321625232696533, "logps/chosen": -306.14141845703125, "logps/rejected": -306.14141845703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04333222657442093, "rewards/margins": 0.0, "rewards/rejected": -0.04333222657442093, "step": 2460 }, { "epoch": 0.7210099978106984, "grad_norm": 0.01483154296875, "learning_rate": 1.0939064577740266e-06, "logits/chosen": -2.4054694175720215, "logits/rejected": -2.4054694175720215, "logps/chosen": -301.36334228515625, "logps/rejected": -301.36334228515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.041240572929382324, "rewards/margins": 0.0, "rewards/rejected": -0.041240572929382324, "step": 2470 }, { "epoch": 0.7239290666277457, "grad_norm": 0.0159912109375, "learning_rate": 1.0729091869411137e-06, "logits/chosen": -2.4020252227783203, "logits/rejected": -2.4020252227783203, "logps/chosen": -332.1387023925781, "logps/rejected": -332.1387023925781, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.043921731412410736, "rewards/margins": 0.0, "rewards/rejected": -0.043921731412410736, "step": 2480 }, { "epoch": 0.7268481354447931, "grad_norm": 0.013427734375, "learning_rate": 1.0520601960225708e-06, "logits/chosen": -2.421534299850464, "logits/rejected": -2.421534299850464, "logps/chosen": -314.00311279296875, "logps/rejected": -314.00311279296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.01183334831148386, "rewards/margins": 0.0, "rewards/rejected": -0.01183334831148386, "step": 2490 }, { "epoch": 0.7297672042618405, "grad_norm": 0.020751953125, "learning_rate": 1.0313616513042133e-06, "logits/chosen": -2.4747350215911865, "logits/rejected": -2.4747350215911865, "logps/chosen": -319.47918701171875, "logps/rejected": -319.47918701171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.055976878851652145, "rewards/margins": 0.0, "rewards/rejected": -0.055976878851652145, "step": 2500 }, { "epoch": 0.7297672042618405, "eval_logits/chosen": -2.3914709091186523, "eval_logits/rejected": -2.3914709091186523, "eval_logps/chosen": -310.819580078125, "eval_logps/rejected": -310.819580078125, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.04341000318527222, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.04341000318527222, "eval_runtime": 2669.3967, "eval_samples_per_second": 2.281, "eval_steps_per_second": 0.285, "step": 2500 }, { "epoch": 0.7326862730788878, "grad_norm": 0.0145263671875, "learning_rate": 1.0108157034399532e-06, "logits/chosen": -2.4052977561950684, "logits/rejected": -2.4052977561950684, "logps/chosen": -298.67474365234375, "logps/rejected": -298.67474365234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04686864838004112, "rewards/margins": 0.0, "rewards/rejected": -0.04686864838004112, "step": 2510 }, { "epoch": 0.7356053418959352, "grad_norm": 0.0179443359375, "learning_rate": 9.90424487228334e-07, "logits/chosen": -2.411712646484375, "logits/rejected": -2.411712646484375, "logps/chosen": -322.70428466796875, "logps/rejected": -322.70428466796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.043312918394804, "rewards/margins": 0.0, "rewards/rejected": -0.043312918394804, "step": 2520 }, { "epoch": 0.7385244107129826, "grad_norm": 0.01611328125, "learning_rate": 9.701901213907192e-07, "logits/chosen": -2.4330382347106934, "logits/rejected": -2.4330382347106934, "logps/chosen": -324.5224609375, "logps/rejected": -324.5224609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.05447854846715927, "rewards/margins": 0.0, "rewards/rejected": -0.05447854846715927, "step": 2530 }, { "epoch": 0.7414434795300299, "grad_norm": 0.01416015625, "learning_rate": 9.501147083511511e-07, "logits/chosen": -2.45332407951355, "logits/rejected": -2.45332407951355, "logps/chosen": -321.7140808105469, "logps/rejected": -321.7140808105469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0516216978430748, "rewards/margins": 0.0, "rewards/rejected": -0.0516216978430748, "step": 2540 }, { "epoch": 0.7443625483470773, "grad_norm": 0.015625, "learning_rate": 9.302003340178962e-07, "logits/chosen": -2.417236804962158, "logits/rejected": -2.417236804962158, "logps/chosen": -333.95574951171875, "logps/rejected": -333.95574951171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0465201810002327, "rewards/margins": 0.0, "rewards/rejected": -0.0465201810002327, "step": 2550 }, { "epoch": 0.7472816171641247, "grad_norm": 0.01422119140625, "learning_rate": 9.10449067566718e-07, "logits/chosen": -2.459394931793213, "logits/rejected": -2.459394931793213, "logps/chosen": -303.9725646972656, "logps/rejected": -303.9725646972656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04736841470003128, "rewards/margins": 0.0, "rewards/rejected": -0.04736841470003128, "step": 2560 }, { "epoch": 0.750200685981172, "grad_norm": 0.01513671875, "learning_rate": 8.908629612258765e-07, "logits/chosen": -2.435121774673462, "logits/rejected": -2.435121774673462, "logps/chosen": -300.51055908203125, "logps/rejected": -300.51055908203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04969844967126846, "rewards/margins": 0.0, "rewards/rejected": -0.04969844967126846, "step": 2570 }, { "epoch": 0.7531197547982194, "grad_norm": 0.0145263671875, "learning_rate": 8.714440500628999e-07, "logits/chosen": -2.393557071685791, "logits/rejected": -2.393557071685791, "logps/chosen": -305.946044921875, "logps/rejected": -305.946044921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.035433102399110794, "rewards/margins": 0.0, "rewards/rejected": -0.035433102399110794, "step": 2580 }, { "epoch": 0.7560388236152668, "grad_norm": 0.01385498046875, "learning_rate": 8.521943517731276e-07, "logits/chosen": -2.394944667816162, "logits/rejected": -2.394944667816162, "logps/chosen": -329.5417175292969, "logps/rejected": -329.5417175292969, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.029879886656999588, "rewards/margins": 0.0, "rewards/rejected": -0.029879886656999588, "step": 2590 }, { "epoch": 0.758957892432314, "grad_norm": 0.01513671875, "learning_rate": 8.33115866470069e-07, "logits/chosen": -2.3986093997955322, "logits/rejected": -2.3986093997955322, "logps/chosen": -297.0606994628906, "logps/rejected": -297.0606994628906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04173046723008156, "rewards/margins": 0.0, "rewards/rejected": -0.04173046723008156, "step": 2600 }, { "epoch": 0.758957892432314, "eval_logits/chosen": -2.3918919563293457, "eval_logits/rejected": -2.3918919563293457, "eval_logps/chosen": -310.8546447753906, "eval_logps/rejected": -310.8546447753906, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.0437602661550045, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.0437602661550045, "eval_runtime": 2682.4018, "eval_samples_per_second": 2.27, "eval_steps_per_second": 0.284, "step": 2600 }, { "epoch": 0.7618769612493614, "grad_norm": 0.01544189453125, "learning_rate": 8.142105764775824e-07, "logits/chosen": -2.384005546569824, "logits/rejected": -2.384005546569824, "logps/chosen": -327.1615295410156, "logps/rejected": -327.1615295410156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.051810719072818756, "rewards/margins": 0.0, "rewards/rejected": -0.051810719072818756, "step": 2610 }, { "epoch": 0.7647960300664088, "grad_norm": 0.01458740234375, "learning_rate": 7.954804461239054e-07, "logits/chosen": -2.444282054901123, "logits/rejected": -2.444282054901123, "logps/chosen": -314.5889587402344, "logps/rejected": -314.5889587402344, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04727676510810852, "rewards/margins": 0.0, "rewards/rejected": -0.04727676510810852, "step": 2620 }, { "epoch": 0.7677150988834561, "grad_norm": 0.016357421875, "learning_rate": 7.769274215375544e-07, "logits/chosen": -2.432978391647339, "logits/rejected": -2.432978391647339, "logps/chosen": -293.0484924316406, "logps/rejected": -293.0484924316406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0413900688290596, "rewards/margins": 0.0, "rewards/rejected": -0.0413900688290596, "step": 2630 }, { "epoch": 0.7706341677005035, "grad_norm": 0.01446533203125, "learning_rate": 7.585534304451103e-07, "logits/chosen": -2.444913387298584, "logits/rejected": -2.444913387298584, "logps/chosen": -330.8976135253906, "logps/rejected": -330.8976135253906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.043237775564193726, "rewards/margins": 0.0, "rewards/rejected": -0.043237775564193726, "step": 2640 }, { "epoch": 0.7735532365175509, "grad_norm": 0.01312255859375, "learning_rate": 7.403603819709288e-07, "logits/chosen": -2.4194247722625732, "logits/rejected": -2.4194247722625732, "logps/chosen": -302.08465576171875, "logps/rejected": -302.08465576171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04694979637861252, "rewards/margins": 0.0, "rewards/rejected": -0.04694979637861252, "step": 2650 }, { "epoch": 0.7764723053345982, "grad_norm": 0.014404296875, "learning_rate": 7.223501664387664e-07, "logits/chosen": -2.440764904022217, "logits/rejected": -2.440764904022217, "logps/chosen": -280.7825622558594, "logps/rejected": -280.7825622558594, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.036083877086639404, "rewards/margins": 0.0, "rewards/rejected": -0.036083877086639404, "step": 2660 }, { "epoch": 0.7793913741516456, "grad_norm": 0.01458740234375, "learning_rate": 7.045246551753779e-07, "logits/chosen": -2.4197888374328613, "logits/rejected": -2.4197888374328613, "logps/chosen": -323.67938232421875, "logps/rejected": -323.67938232421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.043979812413454056, "rewards/margins": 0.0, "rewards/rejected": -0.043979812413454056, "step": 2670 }, { "epoch": 0.782310442968693, "grad_norm": 0.0142822265625, "learning_rate": 6.868857003160709e-07, "logits/chosen": -2.470567226409912, "logits/rejected": -2.470567226409912, "logps/chosen": -356.6578369140625, "logps/rejected": -356.6578369140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.05309978872537613, "rewards/margins": 0.0, "rewards/rejected": -0.05309978872537613, "step": 2680 }, { "epoch": 0.7852295117857403, "grad_norm": 0.0150146484375, "learning_rate": 6.69435134612266e-07, "logits/chosen": -2.4125561714172363, "logits/rejected": -2.4125561714172363, "logps/chosen": -302.1919250488281, "logps/rejected": -302.1919250488281, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04638701677322388, "rewards/margins": 0.0, "rewards/rejected": -0.04638701677322388, "step": 2690 }, { "epoch": 0.7881485806027877, "grad_norm": 0.013427734375, "learning_rate": 6.521747712410687e-07, "logits/chosen": -2.431802988052368, "logits/rejected": -2.431802988052368, "logps/chosen": -319.6323547363281, "logps/rejected": -319.6323547363281, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04552530124783516, "rewards/margins": 0.0, "rewards/rejected": -0.04552530124783516, "step": 2700 }, { "epoch": 0.7881485806027877, "eval_logits/chosen": -2.3916165828704834, "eval_logits/rejected": -2.3916165828704834, "eval_logps/chosen": -310.8406677246094, "eval_logps/rejected": -310.8406677246094, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.04362065717577934, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.04362065717577934, "eval_runtime": 2682.1268, "eval_samples_per_second": 2.271, "eval_steps_per_second": 0.284, "step": 2700 }, { "epoch": 0.7910676494198351, "grad_norm": 0.0250244140625, "learning_rate": 6.351064036168708e-07, "logits/chosen": -2.4238877296447754, "logits/rejected": -2.4238877296447754, "logps/chosen": -338.21759033203125, "logps/rejected": -338.21759033203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.048554692417383194, "rewards/margins": 0.0, "rewards/rejected": -0.048554692417383194, "step": 2710 }, { "epoch": 0.7939867182368824, "grad_norm": 0.01416015625, "learning_rate": 6.182318052050102e-07, "logits/chosen": -2.398974895477295, "logits/rejected": -2.398974895477295, "logps/chosen": -329.53106689453125, "logps/rejected": -329.53106689453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.05249527841806412, "rewards/margins": 0.0, "rewards/rejected": -0.05249527841806412, "step": 2720 }, { "epoch": 0.7969057870539298, "grad_norm": 0.019287109375, "learning_rate": 6.015527293374979e-07, "logits/chosen": -2.4338581562042236, "logits/rejected": -2.4338581562042236, "logps/chosen": -334.1202087402344, "logps/rejected": -334.1202087402344, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04841463267803192, "rewards/margins": 0.0, "rewards/rejected": -0.04841463267803192, "step": 2730 }, { "epoch": 0.7998248558709772, "grad_norm": 0.014404296875, "learning_rate": 5.850709090308459e-07, "logits/chosen": -2.4255330562591553, "logits/rejected": -2.4255330562591553, "logps/chosen": -295.30523681640625, "logps/rejected": -295.30523681640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0433431938290596, "rewards/margins": 0.0, "rewards/rejected": -0.0433431938290596, "step": 2740 }, { "epoch": 0.8027439246880245, "grad_norm": 0.0133056640625, "learning_rate": 5.687880568059961e-07, "logits/chosen": -2.3997416496276855, "logits/rejected": -2.3997416496276855, "logps/chosen": -314.76361083984375, "logps/rejected": -314.76361083984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04872073233127594, "rewards/margins": 0.0, "rewards/rejected": -0.04872073233127594, "step": 2750 }, { "epoch": 0.8056629935050719, "grad_norm": 0.01422119140625, "learning_rate": 5.527058645103842e-07, "logits/chosen": -2.3996376991271973, "logits/rejected": -2.3996376991271973, "logps/chosen": -376.6802673339844, "logps/rejected": -376.6802673339844, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0522351935505867, "rewards/margins": 0.0, "rewards/rejected": -0.0522351935505867, "step": 2760 }, { "epoch": 0.8085820623221193, "grad_norm": 0.0159912109375, "learning_rate": 5.368260031421526e-07, "logits/chosen": -2.4533755779266357, "logits/rejected": -2.4533755779266357, "logps/chosen": -338.7648010253906, "logps/rejected": -338.7648010253906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04105439782142639, "rewards/margins": 0.0, "rewards/rejected": -0.04105439782142639, "step": 2770 }, { "epoch": 0.8115011311391666, "grad_norm": 0.01263427734375, "learning_rate": 5.211501226765242e-07, "logits/chosen": -2.43373441696167, "logits/rejected": -2.43373441696167, "logps/chosen": -285.7012023925781, "logps/rejected": -285.7012023925781, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.02375207468867302, "rewards/margins": 0.0, "rewards/rejected": -0.02375207468867302, "step": 2780 }, { "epoch": 0.814420199956214, "grad_norm": 0.0184326171875, "learning_rate": 5.056798518943678e-07, "logits/chosen": -2.4133718013763428, "logits/rejected": -2.4133718013763428, "logps/chosen": -315.09210205078125, "logps/rejected": -315.09210205078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.053018081933259964, "rewards/margins": 0.0, "rewards/rejected": -0.053018081933259964, "step": 2790 }, { "epoch": 0.8173392687732614, "grad_norm": 0.01397705078125, "learning_rate": 4.904167982129591e-07, "logits/chosen": -2.423839569091797, "logits/rejected": -2.423839569091797, "logps/chosen": -294.44683837890625, "logps/rejected": -294.44683837890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04831403121352196, "rewards/margins": 0.0, "rewards/rejected": -0.04831403121352196, "step": 2800 }, { "epoch": 0.8173392687732614, "eval_logits/chosen": -2.3914895057678223, "eval_logits/rejected": -2.3914895057678223, "eval_logps/chosen": -310.798095703125, "eval_logps/rejected": -310.798095703125, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.04319505766034126, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.04319505766034126, "eval_runtime": 2682.0962, "eval_samples_per_second": 2.271, "eval_steps_per_second": 0.284, "step": 2800 }, { "epoch": 0.8202583375903086, "grad_norm": 0.0126953125, "learning_rate": 4.7536254751896493e-07, "logits/chosen": -2.4333229064941406, "logits/rejected": -2.4333229064941406, "logps/chosen": -315.96234130859375, "logps/rejected": -315.96234130859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.05122748017311096, "rewards/margins": 0.0, "rewards/rejected": -0.05122748017311096, "step": 2810 }, { "epoch": 0.823177406407356, "grad_norm": 0.0167236328125, "learning_rate": 4.6051866400366354e-07, "logits/chosen": -2.4289793968200684, "logits/rejected": -2.4289793968200684, "logps/chosen": -344.29608154296875, "logps/rejected": -344.29608154296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04764155298471451, "rewards/margins": 0.0, "rewards/rejected": -0.04764155298471451, "step": 2820 }, { "epoch": 0.8260964752244034, "grad_norm": 0.0166015625, "learning_rate": 4.4588669000042133e-07, "logits/chosen": -2.4046084880828857, "logits/rejected": -2.4046084880828857, "logps/chosen": -325.74957275390625, "logps/rejected": -325.74957275390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.035486068576574326, "rewards/margins": 0.0, "rewards/rejected": -0.035486068576574326, "step": 2830 }, { "epoch": 0.8290155440414507, "grad_norm": 0.016845703125, "learning_rate": 4.3146814582443605e-07, "logits/chosen": -2.418729066848755, "logits/rejected": -2.418729066848755, "logps/chosen": -327.8818359375, "logps/rejected": -327.8818359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.046850480139255524, "rewards/margins": 0.0, "rewards/rejected": -0.046850480139255524, "step": 2840 }, { "epoch": 0.8319346128584981, "grad_norm": 0.0135498046875, "learning_rate": 4.1726452961477147e-07, "logits/chosen": -2.416329860687256, "logits/rejected": -2.416329860687256, "logps/chosen": -319.5370178222656, "logps/rejected": -319.5370178222656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0506584569811821, "rewards/margins": 0.0, "rewards/rejected": -0.0506584569811821, "step": 2850 }, { "epoch": 0.8348536816755455, "grad_norm": 0.0146484375, "learning_rate": 4.0327731717869775e-07, "logits/chosen": -2.4376559257507324, "logits/rejected": -2.4376559257507324, "logps/chosen": -272.7819519042969, "logps/rejected": -272.7819519042969, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.05310596153140068, "rewards/margins": 0.0, "rewards/rejected": -0.05310596153140068, "step": 2860 }, { "epoch": 0.8377727504925928, "grad_norm": 0.0191650390625, "learning_rate": 3.8950796183834516e-07, "logits/chosen": -2.4388468265533447, "logits/rejected": -2.4388468265533447, "logps/chosen": -345.3861389160156, "logps/rejected": -345.3861389160156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.05019260570406914, "rewards/margins": 0.0, "rewards/rejected": -0.05019260570406914, "step": 2870 }, { "epoch": 0.8406918193096402, "grad_norm": 0.01495361328125, "learning_rate": 3.759578942797029e-07, "logits/chosen": -2.4550201892852783, "logits/rejected": -2.4550201892852783, "logps/chosen": -306.2907409667969, "logps/rejected": -306.2907409667969, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.046652454882860184, "rewards/margins": 0.0, "rewards/rejected": -0.046652454882860184, "step": 2880 }, { "epoch": 0.8436108881266876, "grad_norm": 0.0126953125, "learning_rate": 3.6262852240396356e-07, "logits/chosen": -2.446690082550049, "logits/rejected": -2.446690082550049, "logps/chosen": -310.69708251953125, "logps/rejected": -310.69708251953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04568660259246826, "rewards/margins": 0.0, "rewards/rejected": -0.04568660259246826, "step": 2890 }, { "epoch": 0.8465299569437349, "grad_norm": 0.014404296875, "learning_rate": 3.4952123118123735e-07, "logits/chosen": -2.402627468109131, "logits/rejected": -2.402627468109131, "logps/chosen": -312.1624755859375, "logps/rejected": -312.1624755859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.045014895498752594, "rewards/margins": 0.0, "rewards/rejected": -0.045014895498752594, "step": 2900 }, { "epoch": 0.8465299569437349, "eval_logits/chosen": -2.391954183578491, "eval_logits/rejected": -2.391954183578491, "eval_logps/chosen": -310.79425048828125, "eval_logps/rejected": -310.79425048828125, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.043156567960977554, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.043156567960977554, "eval_runtime": 2682.3759, "eval_samples_per_second": 2.27, "eval_steps_per_second": 0.284, "step": 2900 }, { "epoch": 0.8494490257607823, "grad_norm": 0.01470947265625, "learning_rate": 3.3663738250664853e-07, "logits/chosen": -2.416839122772217, "logits/rejected": -2.416839122772217, "logps/chosen": -342.91815185546875, "logps/rejected": -342.91815185546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.051371246576309204, "rewards/margins": 0.0, "rewards/rejected": -0.051371246576309204, "step": 2910 }, { "epoch": 0.8523680945778297, "grad_norm": 0.0159912109375, "learning_rate": 3.239783150588283e-07, "logits/chosen": -2.3476662635803223, "logits/rejected": -2.3476662635803223, "logps/chosen": -304.71368408203125, "logps/rejected": -304.71368408203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.005339882802218199, "rewards/margins": 0.0, "rewards/rejected": 0.005339882802218199, "step": 2920 }, { "epoch": 0.855287163394877, "grad_norm": 0.01409912109375, "learning_rate": 3.1154534416082573e-07, "logits/chosen": -2.416965961456299, "logits/rejected": -2.416965961456299, "logps/chosen": -299.3199157714844, "logps/rejected": -299.3199157714844, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04180007427930832, "rewards/margins": 0.0, "rewards/rejected": -0.04180007427930832, "step": 2930 }, { "epoch": 0.8582062322119244, "grad_norm": 0.01055908203125, "learning_rate": 2.9933976164343514e-07, "logits/chosen": -2.4285387992858887, "logits/rejected": -2.4285387992858887, "logps/chosen": -303.32183837890625, "logps/rejected": -303.32183837890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04410778731107712, "rewards/margins": 0.0, "rewards/rejected": -0.04410778731107712, "step": 2940 }, { "epoch": 0.8611253010289718, "grad_norm": 0.0162353515625, "learning_rate": 2.873628357109745e-07, "logits/chosen": -2.4083211421966553, "logits/rejected": -2.4083211421966553, "logps/chosen": -326.7142028808594, "logps/rejected": -326.7142028808594, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.046554580330848694, "rewards/margins": 0.0, "rewards/rejected": -0.046554580330848694, "step": 2950 }, { "epoch": 0.8640443698460191, "grad_norm": 0.01324462890625, "learning_rate": 2.7561581080951195e-07, "logits/chosen": -2.4226157665252686, "logits/rejected": -2.4226157665252686, "logps/chosen": -292.55767822265625, "logps/rejected": -292.55767822265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04246639460325241, "rewards/margins": 0.0, "rewards/rejected": -0.04246639460325241, "step": 2960 }, { "epoch": 0.8669634386630665, "grad_norm": 0.01361083984375, "learning_rate": 2.640999074975645e-07, "logits/chosen": -2.43457293510437, "logits/rejected": -2.43457293510437, "logps/chosen": -298.2882385253906, "logps/rejected": -298.2882385253906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04169774800539017, "rewards/margins": 0.0, "rewards/rejected": -0.04169774800539017, "step": 2970 }, { "epoch": 0.8698825074801139, "grad_norm": 0.01708984375, "learning_rate": 2.5281632231927786e-07, "logits/chosen": -2.473017930984497, "logits/rejected": -2.473017930984497, "logps/chosen": -307.8494567871094, "logps/rejected": -307.8494567871094, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.045904386788606644, "rewards/margins": 0.0, "rewards/rejected": -0.045904386788606644, "step": 2980 }, { "epoch": 0.8728015762971612, "grad_norm": 0.014404296875, "learning_rate": 2.417662276800997e-07, "logits/chosen": -2.4377925395965576, "logits/rejected": -2.4377925395965576, "logps/chosen": -329.8043518066406, "logps/rejected": -329.8043518066406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.052566416561603546, "rewards/margins": 0.0, "rewards/rejected": -0.052566416561603546, "step": 2990 }, { "epoch": 0.8757206451142086, "grad_norm": 0.01226806640625, "learning_rate": 2.30950771724964e-07, "logits/chosen": -2.4452061653137207, "logits/rejected": -2.4452061653137207, "logps/chosen": -316.7723388671875, "logps/rejected": -316.7723388671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.048340607434511185, "rewards/margins": 0.0, "rewards/rejected": -0.048340607434511185, "step": 3000 }, { "epoch": 0.8757206451142086, "eval_logits/chosen": -2.3918232917785645, "eval_logits/rejected": -2.3918232917785645, "eval_logps/chosen": -310.78662109375, "eval_logps/rejected": -310.78662109375, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.04308019578456879, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.04308019578456879, "eval_runtime": 2681.8561, "eval_samples_per_second": 2.271, "eval_steps_per_second": 0.284, "step": 3000 }, { "epoch": 0.878639713931256, "grad_norm": 0.017822265625, "learning_rate": 2.2037107821899272e-07, "logits/chosen": -2.414727210998535, "logits/rejected": -2.414727210998535, "logps/chosen": -343.22796630859375, "logps/rejected": -343.22796630859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.047485075891017914, "rewards/margins": 0.0, "rewards/rejected": -0.047485075891017914, "step": 3010 }, { "epoch": 0.8815587827483032, "grad_norm": 0.01708984375, "learning_rate": 2.100282464307357e-07, "logits/chosen": -2.4386258125305176, "logits/rejected": -2.4386258125305176, "logps/chosen": -305.25250244140625, "logps/rejected": -305.25250244140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04185379669070244, "rewards/margins": 0.0, "rewards/rejected": -0.04185379669070244, "step": 3020 }, { "epoch": 0.8844778515653506, "grad_norm": 0.016357421875, "learning_rate": 1.999233510179488e-07, "logits/chosen": -2.4112370014190674, "logits/rejected": -2.4112370014190674, "logps/chosen": -339.65093994140625, "logps/rejected": -339.65093994140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.044059764593839645, "rewards/margins": 0.0, "rewards/rejected": -0.044059764593839645, "step": 3030 }, { "epoch": 0.887396920382398, "grad_norm": 0.012939453125, "learning_rate": 1.9005744191593678e-07, "logits/chosen": -2.4179887771606445, "logits/rejected": -2.4179887771606445, "logps/chosen": -297.5303649902344, "logps/rejected": -297.5303649902344, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03384246677160263, "rewards/margins": 0.0, "rewards/rejected": -0.03384246677160263, "step": 3040 }, { "epoch": 0.8903159891994453, "grad_norm": 0.0120849609375, "learning_rate": 1.8043154422845794e-07, "logits/chosen": -2.4646730422973633, "logits/rejected": -2.4646730422973633, "logps/chosen": -295.91790771484375, "logps/rejected": -295.91790771484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04882458597421646, "rewards/margins": 0.0, "rewards/rejected": -0.04882458597421646, "step": 3050 }, { "epoch": 0.8932350580164927, "grad_norm": 0.0186767578125, "learning_rate": 1.7104665812121445e-07, "logits/chosen": -2.423285961151123, "logits/rejected": -2.423285961151123, "logps/chosen": -297.9593505859375, "logps/rejected": -297.9593505859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04218859225511551, "rewards/margins": 0.0, "rewards/rejected": -0.04218859225511551, "step": 3060 }, { "epoch": 0.8961541268335401, "grad_norm": 0.0164794921875, "learning_rate": 1.619037587179309e-07, "logits/chosen": -2.3985249996185303, "logits/rejected": -2.3985249996185303, "logps/chosen": -332.85809326171875, "logps/rejected": -332.85809326171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.048540227115154266, "rewards/margins": 0.0, "rewards/rejected": -0.048540227115154266, "step": 3070 }, { "epoch": 0.8990731956505874, "grad_norm": 0.0172119140625, "learning_rate": 1.5300379599903408e-07, "logits/chosen": -2.4070308208465576, "logits/rejected": -2.4070308208465576, "logps/chosen": -310.7314147949219, "logps/rejected": -310.7314147949219, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03895800933241844, "rewards/margins": 0.0, "rewards/rejected": -0.03895800933241844, "step": 3080 }, { "epoch": 0.9019922644676348, "grad_norm": 0.013671875, "learning_rate": 1.44347694702949e-07, "logits/chosen": -2.3916313648223877, "logits/rejected": -2.3916313648223877, "logps/chosen": -288.28106689453125, "logps/rejected": -288.28106689453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03293871134519577, "rewards/margins": 0.0, "rewards/rejected": -0.03293871134519577, "step": 3090 }, { "epoch": 0.9049113332846822, "grad_norm": 0.017822265625, "learning_rate": 1.359363542300124e-07, "logits/chosen": -2.4147801399230957, "logits/rejected": -2.4147801399230957, "logps/chosen": -295.56768798828125, "logps/rejected": -295.56768798828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04364749416708946, "rewards/margins": 0.0, "rewards/rejected": -0.04364749416708946, "step": 3100 }, { "epoch": 0.9049113332846822, "eval_logits/chosen": -2.390821933746338, "eval_logits/rejected": -2.390821933746338, "eval_logps/chosen": -310.7793884277344, "eval_logps/rejected": -310.7793884277344, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.04300786182284355, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.04300786182284355, "eval_runtime": 2681.9583, "eval_samples_per_second": 2.271, "eval_steps_per_second": 0.284, "step": 3100 }, { "epoch": 0.9078304021017295, "grad_norm": 0.0152587890625, "learning_rate": 1.2777064854902487e-07, "logits/chosen": -2.44869065284729, "logits/rejected": -2.44869065284729, "logps/chosen": -324.82257080078125, "logps/rejected": -324.82257080078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04247719421982765, "rewards/margins": 0.0, "rewards/rejected": -0.04247719421982765, "step": 3110 }, { "epoch": 0.9107494709187769, "grad_norm": 0.023681640625, "learning_rate": 1.1985142610643902e-07, "logits/chosen": -2.4080257415771484, "logits/rejected": -2.4080257415771484, "logps/chosen": -321.1974792480469, "logps/rejected": -321.1974792480469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.05160089209675789, "rewards/margins": 0.0, "rewards/rejected": -0.05160089209675789, "step": 3120 }, { "epoch": 0.9136685397358243, "grad_norm": 0.01275634765625, "learning_rate": 1.121795097382064e-07, "logits/chosen": -2.422560691833496, "logits/rejected": -2.422560691833496, "logps/chosen": -335.0086975097656, "logps/rejected": -335.0086975097656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04736005887389183, "rewards/margins": 0.0, "rewards/rejected": -0.04736005887389183, "step": 3130 }, { "epoch": 0.9165876085528716, "grad_norm": 0.0169677734375, "learning_rate": 1.0475569658427803e-07, "logits/chosen": -2.438781261444092, "logits/rejected": -2.438781261444092, "logps/chosen": -311.33868408203125, "logps/rejected": -311.33868408203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03844516724348068, "rewards/margins": 0.0, "rewards/rejected": -0.03844516724348068, "step": 3140 }, { "epoch": 0.919506677369919, "grad_norm": 0.02001953125, "learning_rate": 9.758075800578193e-08, "logits/chosen": -2.4374260902404785, "logits/rejected": -2.4374260902404785, "logps/chosen": -300.9288635253906, "logps/rejected": -300.9288635253906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0434752032160759, "rewards/margins": 0.0, "rewards/rejected": -0.0434752032160759, "step": 3150 }, { "epoch": 0.9224257461869664, "grad_norm": 0.01544189453125, "learning_rate": 9.06554395048742e-08, "logits/chosen": -2.4104561805725098, "logits/rejected": -2.4104561805725098, "logps/chosen": -310.27789306640625, "logps/rejected": -310.27789306640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04003220796585083, "rewards/margins": 0.0, "rewards/rejected": -0.04003220796585083, "step": 3160 }, { "epoch": 0.9253448150040137, "grad_norm": 0.01416015625, "learning_rate": 8.398046064727855e-08, "logits/chosen": -2.448122262954712, "logits/rejected": -2.448122262954712, "logps/chosen": -303.9940185546875, "logps/rejected": -303.9940185546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04497329518198967, "rewards/margins": 0.0, "rewards/rejected": -0.04497329518198967, "step": 3170 }, { "epoch": 0.9282638838210611, "grad_norm": 0.0140380859375, "learning_rate": 7.755651498752265e-08, "logits/chosen": -2.4395852088928223, "logits/rejected": -2.4395852088928223, "logps/chosen": -292.140380859375, "logps/rejected": -292.140380859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04522908851504326, "rewards/margins": 0.0, "rewards/rejected": -0.04522908851504326, "step": 3180 }, { "epoch": 0.9311829526381085, "grad_norm": 0.016357421875, "learning_rate": 7.138426999687171e-08, "logits/chosen": -2.4227964878082275, "logits/rejected": -2.4227964878082275, "logps/chosen": -333.205810546875, "logps/rejected": -333.205810546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04331531375646591, "rewards/margins": 0.0, "rewards/rejected": -0.04331531375646591, "step": 3190 }, { "epoch": 0.9341020214551558, "grad_norm": 0.0177001953125, "learning_rate": 6.546436699398029e-08, "logits/chosen": -2.4100470542907715, "logits/rejected": -2.4100470542907715, "logps/chosen": -334.2508850097656, "logps/rejected": -334.2508850097656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.043238040059804916, "rewards/margins": 0.0, "rewards/rejected": -0.043238040059804916, "step": 3200 }, { "epoch": 0.9341020214551558, "eval_logits/chosen": -2.391075849533081, "eval_logits/rejected": -2.391075849533081, "eval_logps/chosen": -310.7811584472656, "eval_logps/rejected": -310.7811584472656, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.043025679886341095, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.043025679886341095, "eval_runtime": 2682.606, "eval_samples_per_second": 2.27, "eval_steps_per_second": 0.284, "step": 3200 }, { "epoch": 0.9370210902722031, "grad_norm": 0.0150146484375, "learning_rate": 5.979742107825287e-08, "logits/chosen": -2.3894600868225098, "logits/rejected": -2.3894600868225098, "logps/chosen": -313.91131591796875, "logps/rejected": -313.91131591796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04388252645730972, "rewards/margins": 0.0, "rewards/rejected": -0.04388252645730972, "step": 3210 }, { "epoch": 0.9399401590892505, "grad_norm": 0.01446533203125, "learning_rate": 5.4384021065936045e-08, "logits/chosen": -2.408024549484253, "logits/rejected": -2.408024549484253, "logps/chosen": -288.5419006347656, "logps/rejected": -288.5419006347656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0435100793838501, "rewards/margins": 0.0, "rewards/rejected": -0.0435100793838501, "step": 3220 }, { "epoch": 0.9428592279062978, "grad_norm": 0.033447265625, "learning_rate": 4.9224729428935806e-08, "logits/chosen": -2.423318862915039, "logits/rejected": -2.423318862915039, "logps/chosen": -309.74176025390625, "logps/rejected": -309.74176025390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04652589559555054, "rewards/margins": 0.0, "rewards/rejected": -0.04652589559555054, "step": 3230 }, { "epoch": 0.9457782967233452, "grad_norm": 0.011962890625, "learning_rate": 4.432008223637596e-08, "logits/chosen": -2.4209766387939453, "logits/rejected": -2.4209766387939453, "logps/chosen": -299.3330078125, "logps/rejected": -299.3330078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04555036872625351, "rewards/margins": 0.0, "rewards/rejected": -0.04555036872625351, "step": 3240 }, { "epoch": 0.9486973655403926, "grad_norm": 0.01953125, "learning_rate": 3.967058909889937e-08, "logits/chosen": -2.397352457046509, "logits/rejected": -2.397352457046509, "logps/chosen": -313.8124694824219, "logps/rejected": -313.8124694824219, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03916158154606819, "rewards/margins": 0.0, "rewards/rejected": -0.03916158154606819, "step": 3250 }, { "epoch": 0.9516164343574399, "grad_norm": 0.014404296875, "learning_rate": 3.5276733115715556e-08, "logits/chosen": -2.448172092437744, "logits/rejected": -2.448172092437744, "logps/chosen": -305.734130859375, "logps/rejected": -305.734130859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.050101179629564285, "rewards/margins": 0.0, "rewards/rejected": -0.050101179629564285, "step": 3260 }, { "epoch": 0.9545355031744873, "grad_norm": 0.01397705078125, "learning_rate": 3.11389708244067e-08, "logits/chosen": -2.4387991428375244, "logits/rejected": -2.4387991428375244, "logps/chosen": -325.77374267578125, "logps/rejected": -325.77374267578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04632676765322685, "rewards/margins": 0.0, "rewards/rejected": -0.04632676765322685, "step": 3270 }, { "epoch": 0.9574545719915347, "grad_norm": 0.0130615234375, "learning_rate": 2.7257732153490313e-08, "logits/chosen": -2.3997585773468018, "logits/rejected": -2.3997585773468018, "logps/chosen": -323.36962890625, "logps/rejected": -323.36962890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04246259480714798, "rewards/margins": 0.0, "rewards/rejected": -0.04246259480714798, "step": 3280 }, { "epoch": 0.960373640808582, "grad_norm": 0.01226806640625, "learning_rate": 2.3633420377749684e-08, "logits/chosen": -2.404913902282715, "logits/rejected": -2.404913902282715, "logps/chosen": -309.89715576171875, "logps/rejected": -309.89715576171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.05698453634977341, "rewards/margins": 0.0, "rewards/rejected": -0.05698453634977341, "step": 3290 }, { "epoch": 0.9632927096256294, "grad_norm": 0.013671875, "learning_rate": 2.0266412076330457e-08, "logits/chosen": -2.431570529937744, "logits/rejected": -2.431570529937744, "logps/chosen": -297.8599548339844, "logps/rejected": -297.8599548339844, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.052738986909389496, "rewards/margins": 0.0, "rewards/rejected": -0.052738986909389496, "step": 3300 }, { "epoch": 0.9632927096256294, "eval_logits/chosen": -2.3914639949798584, "eval_logits/rejected": -2.3914639949798584, "eval_logps/chosen": -310.7767333984375, "eval_logps/rejected": -310.7767333984375, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.0429811105132103, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.0429811105132103, "eval_runtime": 2682.6978, "eval_samples_per_second": 2.27, "eval_steps_per_second": 0.284, "step": 3300 }, { "epoch": 0.9662117784426768, "grad_norm": 0.01361083984375, "learning_rate": 1.7157057093614704e-08, "logits/chosen": -2.452519178390503, "logits/rejected": -2.452519178390503, "logps/chosen": -296.8190002441406, "logps/rejected": -296.8190002441406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.041969865560531616, "rewards/margins": 0.0, "rewards/rejected": -0.041969865560531616, "step": 3310 }, { "epoch": 0.9691308472597241, "grad_norm": 0.01422119140625, "learning_rate": 1.430567850286807e-08, "logits/chosen": -2.4390811920166016, "logits/rejected": -2.4390811920166016, "logps/chosen": -339.14801025390625, "logps/rejected": -339.14801025390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0495796874165535, "rewards/margins": 0.0, "rewards/rejected": -0.0495796874165535, "step": 3320 }, { "epoch": 0.9720499160767715, "grad_norm": 0.017333984375, "learning_rate": 1.1712572572674386e-08, "logits/chosen": -2.3779425621032715, "logits/rejected": -2.3779425621032715, "logps/chosen": -342.52117919921875, "logps/rejected": -342.52117919921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03777734562754631, "rewards/margins": 0.0, "rewards/rejected": -0.03777734562754631, "step": 3330 }, { "epoch": 0.9749689848938189, "grad_norm": 0.0166015625, "learning_rate": 9.378008736149746e-09, "logits/chosen": -2.408357620239258, "logits/rejected": -2.408357620239258, "logps/chosen": -321.5648193359375, "logps/rejected": -321.5648193359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03947510942816734, "rewards/margins": 0.0, "rewards/rejected": -0.03947510942816734, "step": 3340 }, { "epoch": 0.9778880537108662, "grad_norm": 0.01275634765625, "learning_rate": 7.30222956294907e-09, "logits/chosen": -2.456228733062744, "logits/rejected": -2.456228733062744, "logps/chosen": -322.9805603027344, "logps/rejected": -322.9805603027344, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04836040362715721, "rewards/margins": 0.0, "rewards/rejected": -0.04836040362715721, "step": 3350 }, { "epoch": 0.9808071225279136, "grad_norm": 0.015380859375, "learning_rate": 5.485450734061259e-09, "logits/chosen": -2.395473003387451, "logits/rejected": -2.395473003387451, "logps/chosen": -292.87994384765625, "logps/rejected": -292.87994384765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0445740707218647, "rewards/margins": 0.0, "rewards/rejected": -0.0445740707218647, "step": 3360 }, { "epoch": 0.983726191344961, "grad_norm": 0.01544189453125, "learning_rate": 3.927861019399903e-09, "logits/chosen": -2.406294345855713, "logits/rejected": -2.406294345855713, "logps/chosen": -288.55987548828125, "logps/rejected": -288.55987548828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03885042294859886, "rewards/margins": 0.0, "rewards/rejected": -0.03885042294859886, "step": 3370 }, { "epoch": 0.9866452601620083, "grad_norm": 0.0155029296875, "learning_rate": 2.629622258188691e-09, "logits/chosen": -2.4149577617645264, "logits/rejected": -2.4149577617645264, "logps/chosen": -282.56585693359375, "logps/rejected": -282.56585693359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.0403311550617218, "rewards/margins": 0.0, "rewards/rejected": -0.0403311550617218, "step": 3380 }, { "epoch": 0.9895643289790557, "grad_norm": 0.01544189453125, "learning_rate": 1.5908693421465282e-09, "logits/chosen": -2.4097559452056885, "logits/rejected": -2.4097559452056885, "logps/chosen": -284.1174011230469, "logps/rejected": -284.1174011230469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04213656857609749, "rewards/margins": 0.0, "rewards/rejected": -0.04213656857609749, "step": 3390 }, { "epoch": 0.9924833977961031, "grad_norm": 0.0137939453125, "learning_rate": 8.11710201470417e-10, "logits/chosen": -2.4348714351654053, "logits/rejected": -2.4348714351654053, "logps/chosen": -325.41339111328125, "logps/rejected": -325.41339111328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04575268179178238, "rewards/margins": 0.0, "rewards/rejected": -0.04575268179178238, "step": 3400 }, { "epoch": 0.9924833977961031, "eval_logits/chosen": -2.3908708095550537, "eval_logits/rejected": -2.3908708095550537, "eval_logps/chosen": -310.7832336425781, "eval_logps/rejected": -310.7832336425781, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.04304642230272293, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.04304642230272293, "eval_runtime": 2747.5083, "eval_samples_per_second": 2.217, "eval_steps_per_second": 0.277, "step": 3400 }, { "epoch": 0.9954024666131503, "grad_norm": 0.01165771484375, "learning_rate": 2.922257936230355e-10, "logits/chosen": -2.409545421600342, "logits/rejected": -2.409545421600342, "logps/chosen": -264.7913818359375, "logps/rejected": -264.7913818359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.038275159895420074, "rewards/margins": 0.0, "rewards/rejected": -0.038275159895420074, "step": 3410 }, { "epoch": 0.9983215354301977, "grad_norm": 0.0167236328125, "learning_rate": 3.247009491946784e-11, "logits/chosen": -2.429719924926758, "logits/rejected": -2.429719924926758, "logps/chosen": -340.23126220703125, "logps/rejected": -340.23126220703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04289505258202553, "rewards/margins": 0.0, "rewards/rejected": -0.04289505258202553, "step": 3420 }, { "epoch": 0.9997810698387214, "step": 3425, "total_flos": 0.0, "train_loss": 0.18720042388804636, "train_runtime": 41876.0871, "train_samples_per_second": 1.309, "train_steps_per_second": 0.082 } ], "logging_steps": 10, "max_steps": 3425, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }