diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5719 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9997810698387214, + "eval_steps": 100, + "global_step": 3425, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00029190688170473617, + "grad_norm": 0.013427734375, + "learning_rate": 1.457725947521866e-08, + "logits/chosen": -2.4752657413482666, + "logits/rejected": -2.4752657413482666, + "logps/chosen": -328.9035949707031, + "logps/rejected": -328.9035949707031, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.002919068817047362, + "grad_norm": 0.0147705078125, + "learning_rate": 1.457725947521866e-07, + "logits/chosen": -2.395798683166504, + "logits/rejected": -2.395798683166504, + "logps/chosen": -317.85565185546875, + "logps/rejected": -317.85565185546875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0003186435205861926, + "rewards/margins": 0.0, + "rewards/rejected": -0.0003186435205861926, + "step": 10 + }, + { + "epoch": 0.005838137634094724, + "grad_norm": 0.01318359375, + "learning_rate": 2.915451895043732e-07, + "logits/chosen": -2.4440758228302, + "logits/rejected": -2.4440758228302, + "logps/chosen": -301.12921142578125, + "logps/rejected": -301.12921142578125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -7.847430242691189e-05, + "rewards/margins": 0.0, + "rewards/rejected": -7.847430242691189e-05, + "step": 20 + }, + { + "epoch": 0.008757206451142086, + "grad_norm": 0.01177978515625, + "learning_rate": 4.373177842565598e-07, + "logits/chosen": -2.441359519958496, + "logits/rejected": -2.441359519958496, + "logps/chosen": -317.1576843261719, + "logps/rejected": -317.1576843261719, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.00025945488596335053, + "rewards/margins": 0.0, + "rewards/rejected": -0.00025945488596335053, + "step": 30 + }, + { + "epoch": 0.011676275268189448, + "grad_norm": 0.0167236328125, + "learning_rate": 5.830903790087464e-07, + "logits/chosen": -2.455430269241333, + "logits/rejected": -2.455430269241333, + "logps/chosen": -328.7832946777344, + "logps/rejected": -328.7832946777344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.00034936360316351056, + "rewards/margins": 0.0, + "rewards/rejected": -0.00034936360316351056, + "step": 40 + }, + { + "epoch": 0.014595344085236809, + "grad_norm": 0.012939453125, + "learning_rate": 7.288629737609331e-07, + "logits/chosen": -2.406463384628296, + "logits/rejected": -2.406463384628296, + "logps/chosen": -303.563232421875, + "logps/rejected": -303.563232421875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0001031260471791029, + "rewards/margins": 0.0, + "rewards/rejected": 0.0001031260471791029, + "step": 50 + }, + { + "epoch": 0.01751441290228417, + "grad_norm": 0.016357421875, + "learning_rate": 8.746355685131196e-07, + "logits/chosen": -2.4401960372924805, + "logits/rejected": -2.4401960372924805, + "logps/chosen": -284.1253967285156, + "logps/rejected": -284.1253967285156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.00043843849562108517, + "rewards/margins": 0.0, + "rewards/rejected": -0.00043843849562108517, + "step": 60 + }, + { + "epoch": 0.02043348171933153, + "grad_norm": 0.01153564453125, + "learning_rate": 1.0204081632653063e-06, + "logits/chosen": -2.423875093460083, + "logits/rejected": -2.423875093460083, + "logps/chosen": -280.09442138671875, + "logps/rejected": -280.09442138671875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.00031216375646181405, + "rewards/margins": 0.0, + "rewards/rejected": -0.00031216375646181405, + "step": 70 + }, + { + "epoch": 0.023352550536378896, + "grad_norm": 0.01214599609375, + "learning_rate": 1.1661807580174927e-06, + "logits/chosen": -2.404435396194458, + "logits/rejected": -2.404435396194458, + "logps/chosen": -267.2549743652344, + "logps/rejected": -267.2549743652344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0006922121392562985, + "rewards/margins": 0.0, + "rewards/rejected": 0.0006922121392562985, + "step": 80 + }, + { + "epoch": 0.026271619353426257, + "grad_norm": 0.0146484375, + "learning_rate": 1.3119533527696792e-06, + "logits/chosen": -2.416917324066162, + "logits/rejected": -2.416917324066162, + "logps/chosen": -333.58563232421875, + "logps/rejected": -333.58563232421875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0005137195694260299, + "rewards/margins": 0.0, + "rewards/rejected": 0.0005137195694260299, + "step": 90 + }, + { + "epoch": 0.029190688170473617, + "grad_norm": 0.0189208984375, + "learning_rate": 1.4577259475218661e-06, + "logits/chosen": -2.4351730346679688, + "logits/rejected": -2.4351730346679688, + "logps/chosen": -339.3778381347656, + "logps/rejected": -339.3778381347656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0005722829955630004, + "rewards/margins": 0.0, + "rewards/rejected": 0.0005722829955630004, + "step": 100 + }, + { + "epoch": 0.029190688170473617, + "eval_logits/chosen": -2.394068479537964, + "eval_logits/rejected": -2.394068479537964, + "eval_logps/chosen": -306.389892578125, + "eval_logps/rejected": -306.389892578125, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": 0.0008870832389220595, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": 0.0008870832389220595, + "eval_runtime": 2666.9983, + "eval_samples_per_second": 2.283, + "eval_steps_per_second": 0.286, + "step": 100 + }, + { + "epoch": 0.03210975698752098, + "grad_norm": 0.015869140625, + "learning_rate": 1.6034985422740526e-06, + "logits/chosen": -2.420276165008545, + "logits/rejected": -2.420276165008545, + "logps/chosen": -306.0760803222656, + "logps/rejected": -306.0760803222656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0014700460014864802, + "rewards/margins": 0.0, + "rewards/rejected": 0.0014700460014864802, + "step": 110 + }, + { + "epoch": 0.03502882580456834, + "grad_norm": 0.01544189453125, + "learning_rate": 1.7492711370262391e-06, + "logits/chosen": -2.4616119861602783, + "logits/rejected": -2.4616119861602783, + "logps/chosen": -328.64129638671875, + "logps/rejected": -328.64129638671875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.001054848893545568, + "rewards/margins": 0.0, + "rewards/rejected": 0.001054848893545568, + "step": 120 + }, + { + "epoch": 0.037947894621615706, + "grad_norm": 0.0250244140625, + "learning_rate": 1.895043731778426e-06, + "logits/chosen": -2.404423236846924, + "logits/rejected": -2.404423236846924, + "logps/chosen": -339.0644836425781, + "logps/rejected": -339.0644836425781, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0002889078459702432, + "rewards/margins": 0.0, + "rewards/rejected": 0.0002889078459702432, + "step": 130 + }, + { + "epoch": 0.04086696343866306, + "grad_norm": 0.0137939453125, + "learning_rate": 2.0408163265306125e-06, + "logits/chosen": -2.4294090270996094, + "logits/rejected": -2.4294090270996094, + "logps/chosen": -299.0234375, + "logps/rejected": -299.0234375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0003939162997994572, + "rewards/margins": 0.0, + "rewards/rejected": 0.0003939162997994572, + "step": 140 + }, + { + "epoch": 0.04378603225571043, + "grad_norm": 0.01470947265625, + "learning_rate": 2.1865889212827988e-06, + "logits/chosen": -2.4415223598480225, + "logits/rejected": -2.4415223598480225, + "logps/chosen": -317.4403991699219, + "logps/rejected": -317.4403991699219, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0011137222172692418, + "rewards/margins": 0.0, + "rewards/rejected": 0.0011137222172692418, + "step": 150 + }, + { + "epoch": 0.04670510107275779, + "grad_norm": 0.01202392578125, + "learning_rate": 2.3323615160349855e-06, + "logits/chosen": -2.433961868286133, + "logits/rejected": -2.433961868286133, + "logps/chosen": -315.8016662597656, + "logps/rejected": -315.8016662597656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.00034690109896473587, + "rewards/margins": 0.0, + "rewards/rejected": 0.00034690109896473587, + "step": 160 + }, + { + "epoch": 0.04962416988980515, + "grad_norm": 0.01226806640625, + "learning_rate": 2.478134110787172e-06, + "logits/chosen": -2.4214272499084473, + "logits/rejected": -2.4214272499084473, + "logps/chosen": -304.0071105957031, + "logps/rejected": -304.0071105957031, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.00021128072694409639, + "rewards/margins": 0.0, + "rewards/rejected": 0.00021128072694409639, + "step": 170 + }, + { + "epoch": 0.05254323870685251, + "grad_norm": 0.01318359375, + "learning_rate": 2.6239067055393585e-06, + "logits/chosen": -2.410125255584717, + "logits/rejected": -2.410125255584717, + "logps/chosen": -329.052978515625, + "logps/rejected": -329.052978515625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -8.667710062582046e-05, + "rewards/margins": 0.0, + "rewards/rejected": -8.667710062582046e-05, + "step": 180 + }, + { + "epoch": 0.05546230752389988, + "grad_norm": 0.0111083984375, + "learning_rate": 2.7696793002915456e-06, + "logits/chosen": -2.412470579147339, + "logits/rejected": -2.412470579147339, + "logps/chosen": -302.9618225097656, + "logps/rejected": -302.9618225097656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0005764733068645, + "rewards/margins": 0.0, + "rewards/rejected": 0.0005764733068645, + "step": 190 + }, + { + "epoch": 0.058381376340947234, + "grad_norm": 0.013671875, + "learning_rate": 2.9154518950437323e-06, + "logits/chosen": -2.3948373794555664, + "logits/rejected": -2.3948373794555664, + "logps/chosen": -312.7694091796875, + "logps/rejected": -312.7694091796875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0006709109293296933, + "rewards/margins": 0.0, + "rewards/rejected": -0.0006709109293296933, + "step": 200 + }, + { + "epoch": 0.058381376340947234, + "eval_logits/chosen": -2.3945627212524414, + "eval_logits/rejected": -2.3945627212524414, + "eval_logps/chosen": -306.5539245605469, + "eval_logps/rejected": -306.5539245605469, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.0007532919407822192, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.0007532919407822192, + "eval_runtime": 2667.9395, + "eval_samples_per_second": 2.283, + "eval_steps_per_second": 0.286, + "step": 200 + }, + { + "epoch": 0.0613004451579946, + "grad_norm": 0.0120849609375, + "learning_rate": 3.0612244897959185e-06, + "logits/chosen": -2.445885181427002, + "logits/rejected": -2.445885181427002, + "logps/chosen": -316.7839050292969, + "logps/rejected": -316.7839050292969, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0010994909098371863, + "rewards/margins": 0.0, + "rewards/rejected": -0.0010994909098371863, + "step": 210 + }, + { + "epoch": 0.06421951397504196, + "grad_norm": 0.011962890625, + "learning_rate": 3.2069970845481052e-06, + "logits/chosen": -2.4333603382110596, + "logits/rejected": -2.4333603382110596, + "logps/chosen": -277.94915771484375, + "logps/rejected": -277.94915771484375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.00041724619222804904, + "rewards/margins": 0.0, + "rewards/rejected": -0.00041724619222804904, + "step": 220 + }, + { + "epoch": 0.06713858279208933, + "grad_norm": 0.0145263671875, + "learning_rate": 3.352769679300292e-06, + "logits/chosen": -2.4338879585266113, + "logits/rejected": -2.4338879585266113, + "logps/chosen": -325.23455810546875, + "logps/rejected": -325.23455810546875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0012703577522188425, + "rewards/margins": 0.0, + "rewards/rejected": -0.0012703577522188425, + "step": 230 + }, + { + "epoch": 0.07005765160913668, + "grad_norm": 0.0157470703125, + "learning_rate": 3.4985422740524782e-06, + "logits/chosen": -2.413400173187256, + "logits/rejected": -2.413400173187256, + "logps/chosen": -309.69403076171875, + "logps/rejected": -309.69403076171875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.001970961457118392, + "rewards/margins": 0.0, + "rewards/rejected": -0.001970961457118392, + "step": 240 + }, + { + "epoch": 0.07297672042618404, + "grad_norm": 0.01422119140625, + "learning_rate": 3.644314868804665e-06, + "logits/chosen": -2.4458959102630615, + "logits/rejected": -2.4458959102630615, + "logps/chosen": -304.130615234375, + "logps/rejected": -304.130615234375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0047234781086444855, + "rewards/margins": 0.0, + "rewards/rejected": -0.0047234781086444855, + "step": 250 + }, + { + "epoch": 0.07589578924323141, + "grad_norm": 0.01324462890625, + "learning_rate": 3.790087463556852e-06, + "logits/chosen": -2.4266982078552246, + "logits/rejected": -2.4266982078552246, + "logps/chosen": -286.97076416015625, + "logps/rejected": -286.97076416015625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.006642763502895832, + "rewards/margins": 0.0, + "rewards/rejected": -0.006642763502895832, + "step": 260 + }, + { + "epoch": 0.07881485806027877, + "grad_norm": 0.015625, + "learning_rate": 3.935860058309039e-06, + "logits/chosen": -2.436506748199463, + "logits/rejected": -2.436506748199463, + "logps/chosen": -310.330322265625, + "logps/rejected": -310.330322265625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.007435324601829052, + "rewards/margins": 0.0, + "rewards/rejected": -0.007435324601829052, + "step": 270 + }, + { + "epoch": 0.08173392687732613, + "grad_norm": 0.01495361328125, + "learning_rate": 4.081632653061225e-06, + "logits/chosen": -2.394254446029663, + "logits/rejected": -2.394254446029663, + "logps/chosen": -304.8192443847656, + "logps/rejected": -304.8192443847656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.00737042585387826, + "rewards/margins": 0.0, + "rewards/rejected": -0.00737042585387826, + "step": 280 + }, + { + "epoch": 0.0846529956943735, + "grad_norm": 0.0130615234375, + "learning_rate": 4.227405247813411e-06, + "logits/chosen": -2.4005939960479736, + "logits/rejected": -2.4005939960479736, + "logps/chosen": -288.9790954589844, + "logps/rejected": -288.9790954589844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0066338046453893185, + "rewards/margins": 0.0, + "rewards/rejected": -0.0066338046453893185, + "step": 290 + }, + { + "epoch": 0.08757206451142086, + "grad_norm": 0.01458740234375, + "learning_rate": 4.3731778425655976e-06, + "logits/chosen": -2.4416656494140625, + "logits/rejected": -2.4416656494140625, + "logps/chosen": -288.1855773925781, + "logps/rejected": -288.1855773925781, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0070955632254481316, + "rewards/margins": 0.0, + "rewards/rejected": -0.0070955632254481316, + "step": 300 + }, + { + "epoch": 0.08757206451142086, + "eval_logits/chosen": -2.3941876888275146, + "eval_logits/rejected": -2.3941876888275146, + "eval_logps/chosen": -307.0490417480469, + "eval_logps/rejected": -307.0490417480469, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.005704815499484539, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.005704815499484539, + "eval_runtime": 2667.7916, + "eval_samples_per_second": 2.283, + "eval_steps_per_second": 0.286, + "step": 300 + }, + { + "epoch": 0.09049113332846821, + "grad_norm": 0.01153564453125, + "learning_rate": 4.518950437317785e-06, + "logits/chosen": -2.420503854751587, + "logits/rejected": -2.420503854751587, + "logps/chosen": -276.64093017578125, + "logps/rejected": -276.64093017578125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.007363935001194477, + "rewards/margins": 0.0, + "rewards/rejected": -0.007363935001194477, + "step": 310 + }, + { + "epoch": 0.09341020214551558, + "grad_norm": 0.0185546875, + "learning_rate": 4.664723032069971e-06, + "logits/chosen": -2.4066500663757324, + "logits/rejected": -2.4066500663757324, + "logps/chosen": -315.653076171875, + "logps/rejected": -315.653076171875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.006720393896102905, + "rewards/margins": 0.0, + "rewards/rejected": -0.006720393896102905, + "step": 320 + }, + { + "epoch": 0.09632927096256294, + "grad_norm": 0.015625, + "learning_rate": 4.810495626822158e-06, + "logits/chosen": -2.445965528488159, + "logits/rejected": -2.445965528488159, + "logps/chosen": -324.6703796386719, + "logps/rejected": -324.6703796386719, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.007294761948287487, + "rewards/margins": 0.0, + "rewards/rejected": -0.007294761948287487, + "step": 330 + }, + { + "epoch": 0.0992483397796103, + "grad_norm": 0.01446533203125, + "learning_rate": 4.956268221574344e-06, + "logits/chosen": -2.4288485050201416, + "logits/rejected": -2.4288485050201416, + "logps/chosen": -323.6286926269531, + "logps/rejected": -323.6286926269531, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.009048479609191418, + "rewards/margins": 0.0, + "rewards/rejected": -0.009048479609191418, + "step": 340 + }, + { + "epoch": 0.10216740859665767, + "grad_norm": 0.01458740234375, + "learning_rate": 4.999936358746211e-06, + "logits/chosen": -2.4309639930725098, + "logits/rejected": -2.4309639930725098, + "logps/chosen": -271.655029296875, + "logps/rejected": -271.655029296875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.009400355629622936, + "rewards/margins": 0.0, + "rewards/rejected": -0.009400355629622936, + "step": 350 + }, + { + "epoch": 0.10508647741370503, + "grad_norm": 0.0152587890625, + "learning_rate": 4.99962465428288e-06, + "logits/chosen": -2.4447290897369385, + "logits/rejected": -2.4447290897369385, + "logps/chosen": -303.4416198730469, + "logps/rejected": -303.4416198730469, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.013941009528934956, + "rewards/margins": 0.0, + "rewards/rejected": -0.013941009528934956, + "step": 360 + }, + { + "epoch": 0.10800554623075238, + "grad_norm": 0.0185546875, + "learning_rate": 4.999053229746866e-06, + "logits/chosen": -2.440117359161377, + "logits/rejected": -2.440117359161377, + "logps/chosen": -290.806884765625, + "logps/rejected": -290.806884765625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.01759205386042595, + "rewards/margins": 0.0, + "rewards/rejected": -0.01759205386042595, + "step": 370 + }, + { + "epoch": 0.11092461504779975, + "grad_norm": 0.01263427734375, + "learning_rate": 4.9982221445112535e-06, + "logits/chosen": -2.4275150299072266, + "logits/rejected": -2.4275150299072266, + "logps/chosen": -320.67938232421875, + "logps/rejected": -320.67938232421875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.018790820613503456, + "rewards/margins": 0.0, + "rewards/rejected": -0.018790820613503456, + "step": 380 + }, + { + "epoch": 0.11384368386484711, + "grad_norm": 0.01397705078125, + "learning_rate": 4.997131484928813e-06, + "logits/chosen": -2.414685010910034, + "logits/rejected": -2.414685010910034, + "logps/chosen": -301.1441650390625, + "logps/rejected": -301.1441650390625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.015089067630469799, + "rewards/margins": 0.0, + "rewards/rejected": -0.015089067630469799, + "step": 390 + }, + { + "epoch": 0.11676275268189447, + "grad_norm": 0.01458740234375, + "learning_rate": 4.995781364323035e-06, + "logits/chosen": -2.391239643096924, + "logits/rejected": -2.391239643096924, + "logps/chosen": -285.70941162109375, + "logps/rejected": -285.70941162109375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.010374903678894043, + "rewards/margins": 0.0, + "rewards/rejected": -0.010374903678894043, + "step": 400 + }, + { + "epoch": 0.11676275268189447, + "eval_logits/chosen": -2.393982172012329, + "eval_logits/rejected": -2.393982172012329, + "eval_logps/chosen": -307.3796081542969, + "eval_logps/rejected": -307.3796081542969, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.009010241366922855, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.009010241366922855, + "eval_runtime": 2667.3233, + "eval_samples_per_second": 2.283, + "eval_steps_per_second": 0.286, + "step": 400 + }, + { + "epoch": 0.11968182149894184, + "grad_norm": 0.01300048828125, + "learning_rate": 4.994171922976349e-06, + "logits/chosen": -2.4642019271850586, + "logits/rejected": -2.4642019271850586, + "logps/chosen": -298.46978759765625, + "logps/rejected": -298.46978759765625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.009510824456810951, + "rewards/margins": 0.0, + "rewards/rejected": -0.009510824456810951, + "step": 410 + }, + { + "epoch": 0.1226008903159892, + "grad_norm": 0.0159912109375, + "learning_rate": 4.992303328115551e-06, + "logits/chosen": -2.420297145843506, + "logits/rejected": -2.420297145843506, + "logps/chosen": -306.69610595703125, + "logps/rejected": -306.69610595703125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0014959282707422972, + "rewards/margins": 0.0, + "rewards/rejected": -0.0014959282707422972, + "step": 420 + }, + { + "epoch": 0.12551995913303657, + "grad_norm": 0.0159912109375, + "learning_rate": 4.990175773894428e-06, + "logits/chosen": -2.46386981010437, + "logits/rejected": -2.46386981010437, + "logps/chosen": -281.81097412109375, + "logps/rejected": -281.81097412109375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.008724676445126534, + "rewards/margins": 0.0, + "rewards/rejected": -0.008724676445126534, + "step": 430 + }, + { + "epoch": 0.1284390279500839, + "grad_norm": 0.01287841796875, + "learning_rate": 4.987789481373586e-06, + "logits/chosen": -2.406324625015259, + "logits/rejected": -2.406324625015259, + "logps/chosen": -297.7574157714844, + "logps/rejected": -297.7574157714844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.006952273193746805, + "rewards/margins": 0.0, + "rewards/rejected": -0.006952273193746805, + "step": 440 + }, + { + "epoch": 0.13135809676713128, + "grad_norm": 0.015869140625, + "learning_rate": 4.985144698497477e-06, + "logits/chosen": -2.4094862937927246, + "logits/rejected": -2.4094862937927246, + "logps/chosen": -294.4402160644531, + "logps/rejected": -294.4402160644531, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.009783747605979443, + "rewards/margins": 0.0, + "rewards/rejected": -0.009783747605979443, + "step": 450 + }, + { + "epoch": 0.13427716558417865, + "grad_norm": 0.015625, + "learning_rate": 4.982241700068639e-06, + "logits/chosen": -2.448880434036255, + "logits/rejected": -2.448880434036255, + "logps/chosen": -312.9103088378906, + "logps/rejected": -312.9103088378906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.010099256411194801, + "rewards/margins": 0.0, + "rewards/rejected": -0.010099256411194801, + "step": 460 + }, + { + "epoch": 0.137196234401226, + "grad_norm": 0.014404296875, + "learning_rate": 4.979080787719144e-06, + "logits/chosen": -2.4513556957244873, + "logits/rejected": -2.4513556957244873, + "logps/chosen": -330.3889465332031, + "logps/rejected": -330.3889465332031, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.012815428897738457, + "rewards/margins": 0.0, + "rewards/rejected": -0.012815428897738457, + "step": 470 + }, + { + "epoch": 0.14011530321827337, + "grad_norm": 0.013427734375, + "learning_rate": 4.975662289879257e-06, + "logits/chosen": -2.3824195861816406, + "logits/rejected": -2.3824195861816406, + "logps/chosen": -324.45654296875, + "logps/rejected": -324.45654296875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.010385606437921524, + "rewards/margins": 0.0, + "rewards/rejected": -0.010385606437921524, + "step": 480 + }, + { + "epoch": 0.14303437203532074, + "grad_norm": 0.016845703125, + "learning_rate": 4.971986561743308e-06, + "logits/chosen": -2.388378620147705, + "logits/rejected": -2.388378620147705, + "logps/chosen": -292.9872131347656, + "logps/rejected": -292.9872131347656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.00819515809416771, + "rewards/margins": 0.0, + "rewards/rejected": -0.00819515809416771, + "step": 490 + }, + { + "epoch": 0.14595344085236808, + "grad_norm": 0.01348876953125, + "learning_rate": 4.96805398523279e-06, + "logits/chosen": -2.438722610473633, + "logits/rejected": -2.438722610473633, + "logps/chosen": -333.7470397949219, + "logps/rejected": -333.7470397949219, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.007836517877876759, + "rewards/margins": 0.0, + "rewards/rejected": -0.007836517877876759, + "step": 500 + }, + { + "epoch": 0.14595344085236808, + "eval_logits/chosen": -2.3937265872955322, + "eval_logits/rejected": -2.3937265872955322, + "eval_logps/chosen": -307.1580505371094, + "eval_logps/rejected": -307.1580505371094, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.006794503424316645, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.006794503424316645, + "eval_runtime": 2668.7964, + "eval_samples_per_second": 2.282, + "eval_steps_per_second": 0.286, + "step": 500 + }, + { + "epoch": 0.14887250966941545, + "grad_norm": 0.0146484375, + "learning_rate": 4.963864968956674e-06, + "logits/chosen": -2.4363291263580322, + "logits/rejected": -2.4363291263580322, + "logps/chosen": -295.4735412597656, + "logps/rejected": -295.4735412597656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.008334552869200706, + "rewards/margins": 0.0, + "rewards/rejected": -0.008334552869200706, + "step": 510 + }, + { + "epoch": 0.15179157848646282, + "grad_norm": 0.0113525390625, + "learning_rate": 4.959419948168952e-06, + "logits/chosen": -2.4209957122802734, + "logits/rejected": -2.4209957122802734, + "logps/chosen": -252.09475708007812, + "logps/rejected": -252.09475708007812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.005244333762675524, + "rewards/margins": 0.0, + "rewards/rejected": -0.005244333762675524, + "step": 520 + }, + { + "epoch": 0.15471064730351017, + "grad_norm": 0.011962890625, + "learning_rate": 4.954719384723416e-06, + "logits/chosen": -2.4421539306640625, + "logits/rejected": -2.4421539306640625, + "logps/chosen": -290.62939453125, + "logps/rejected": -290.62939453125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.006143758539110422, + "rewards/margins": 0.0, + "rewards/rejected": -0.006143758539110422, + "step": 530 + }, + { + "epoch": 0.15762971612055754, + "grad_norm": 0.0155029296875, + "learning_rate": 4.949763767025665e-06, + "logits/chosen": -2.433292865753174, + "logits/rejected": -2.433292865753174, + "logps/chosen": -301.56488037109375, + "logps/rejected": -301.56488037109375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.007085380610078573, + "rewards/margins": 0.0, + "rewards/rejected": -0.007085380610078573, + "step": 540 + }, + { + "epoch": 0.1605487849376049, + "grad_norm": 0.01513671875, + "learning_rate": 4.944553609982363e-06, + "logits/chosen": -2.397106647491455, + "logits/rejected": -2.397106647491455, + "logps/chosen": -274.3099670410156, + "logps/rejected": -274.3099670410156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.002214896958321333, + "rewards/margins": 0.0, + "rewards/rejected": -0.002214896958321333, + "step": 550 + }, + { + "epoch": 0.16346785375465225, + "grad_norm": 0.0152587890625, + "learning_rate": 4.939089454947734e-06, + "logits/chosen": -2.417797088623047, + "logits/rejected": -2.417797088623047, + "logps/chosen": -299.5130615234375, + "logps/rejected": -299.5130615234375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.005161653272807598, + "rewards/margins": 0.0, + "rewards/rejected": -0.005161653272807598, + "step": 560 + }, + { + "epoch": 0.16638692257169962, + "grad_norm": 0.01507568359375, + "learning_rate": 4.933371869667315e-06, + "logits/chosen": -2.4109036922454834, + "logits/rejected": -2.4109036922454834, + "logps/chosen": -279.4015808105469, + "logps/rejected": -279.4015808105469, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0024168032687157393, + "rewards/margins": 0.0, + "rewards/rejected": -0.0024168032687157393, + "step": 570 + }, + { + "epoch": 0.169305991388747, + "grad_norm": 0.00885009765625, + "learning_rate": 4.9274014482189654e-06, + "logits/chosen": -2.4315690994262695, + "logits/rejected": -2.4315690994262695, + "logps/chosen": -309.34234619140625, + "logps/rejected": -309.34234619140625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.004016817547380924, + "rewards/margins": 0.0, + "rewards/rejected": -0.004016817547380924, + "step": 580 + }, + { + "epoch": 0.17222506020579434, + "grad_norm": 0.017578125, + "learning_rate": 4.9211788109511405e-06, + "logits/chosen": -2.460508108139038, + "logits/rejected": -2.460508108139038, + "logps/chosen": -334.00933837890625, + "logps/rejected": -334.00933837890625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.005641533527523279, + "rewards/margins": 0.0, + "rewards/rejected": -0.005641533527523279, + "step": 590 + }, + { + "epoch": 0.1751441290228417, + "grad_norm": 0.016845703125, + "learning_rate": 4.914704604418435e-06, + "logits/chosen": -2.4566855430603027, + "logits/rejected": -2.4566855430603027, + "logps/chosen": -307.21331787109375, + "logps/rejected": -307.21331787109375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0077440254390239716, + "rewards/margins": 0.0, + "rewards/rejected": -0.0077440254390239716, + "step": 600 + }, + { + "epoch": 0.1751441290228417, + "eval_logits/chosen": -2.394993782043457, + "eval_logits/rejected": -2.394993782043457, + "eval_logps/chosen": -306.9631042480469, + "eval_logps/rejected": -306.9631042480469, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.004845078103244305, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.004845078103244305, + "eval_runtime": 2667.3075, + "eval_samples_per_second": 2.283, + "eval_steps_per_second": 0.286, + "step": 600 + }, + { + "epoch": 0.17806319783988908, + "grad_norm": 0.01312255859375, + "learning_rate": 4.907979501314402e-06, + "logits/chosen": -2.452761173248291, + "logits/rejected": -2.452761173248291, + "logps/chosen": -293.330078125, + "logps/rejected": -293.330078125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.005413960665464401, + "rewards/margins": 0.0, + "rewards/rejected": -0.005413960665464401, + "step": 610 + }, + { + "epoch": 0.18098226665693642, + "grad_norm": 0.013427734375, + "learning_rate": 4.901004200401659e-06, + "logits/chosen": -2.415590763092041, + "logits/rejected": -2.415590763092041, + "logps/chosen": -316.59185791015625, + "logps/rejected": -316.59185791015625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.009168794378638268, + "rewards/margins": 0.0, + "rewards/rejected": -0.009168794378638268, + "step": 620 + }, + { + "epoch": 0.1839013354739838, + "grad_norm": 0.017333984375, + "learning_rate": 4.893779426439285e-06, + "logits/chosen": -2.4269957542419434, + "logits/rejected": -2.4269957542419434, + "logps/chosen": -330.297607421875, + "logps/rejected": -330.297607421875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.008635496720671654, + "rewards/margins": 0.0, + "rewards/rejected": -0.008635496720671654, + "step": 630 + }, + { + "epoch": 0.18682040429103117, + "grad_norm": 0.0137939453125, + "learning_rate": 4.886305930107512e-06, + "logits/chosen": -2.4132332801818848, + "logits/rejected": -2.4132332801818848, + "logps/chosen": -334.0628967285156, + "logps/rejected": -334.0628967285156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.00843154825270176, + "rewards/margins": 0.0, + "rewards/rejected": -0.00843154825270176, + "step": 640 + }, + { + "epoch": 0.1897394731080785, + "grad_norm": 0.0162353515625, + "learning_rate": 4.878584487929731e-06, + "logits/chosen": -2.393531084060669, + "logits/rejected": -2.393531084060669, + "logps/chosen": -312.2678527832031, + "logps/rejected": -312.2678527832031, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.008157333359122276, + "rewards/margins": 0.0, + "rewards/rejected": -0.008157333359122276, + "step": 650 + }, + { + "epoch": 0.19265854192512588, + "grad_norm": 0.01141357421875, + "learning_rate": 4.8706159021918046e-06, + "logits/chosen": -2.4334394931793213, + "logits/rejected": -2.4334394931793213, + "logps/chosen": -313.9178466796875, + "logps/rejected": -313.9178466796875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.010157248005270958, + "rewards/margins": 0.0, + "rewards/rejected": -0.010157248005270958, + "step": 660 + }, + { + "epoch": 0.19557761074217325, + "grad_norm": 0.01446533203125, + "learning_rate": 4.86240100085871e-06, + "logits/chosen": -2.4123024940490723, + "logits/rejected": -2.4123024940490723, + "logps/chosen": -330.71856689453125, + "logps/rejected": -330.71856689453125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.01049681194126606, + "rewards/margins": 0.0, + "rewards/rejected": -0.01049681194126606, + "step": 670 + }, + { + "epoch": 0.1984966795592206, + "grad_norm": 0.0145263671875, + "learning_rate": 4.853940637488505e-06, + "logits/chosen": -2.4219470024108887, + "logits/rejected": -2.4219470024108887, + "logps/chosen": -347.1614990234375, + "logps/rejected": -347.1614990234375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.010124921798706055, + "rewards/margins": 0.0, + "rewards/rejected": -0.010124921798706055, + "step": 680 + }, + { + "epoch": 0.20141574837626797, + "grad_norm": 0.0140380859375, + "learning_rate": 4.84523569114365e-06, + "logits/chosen": -2.441845417022705, + "logits/rejected": -2.441845417022705, + "logps/chosen": -268.2397766113281, + "logps/rejected": -268.2397766113281, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.013552245683968067, + "rewards/margins": 0.0, + "rewards/rejected": -0.013552245683968067, + "step": 690 + }, + { + "epoch": 0.20433481719331534, + "grad_norm": 0.020751953125, + "learning_rate": 4.8362870662996574e-06, + "logits/chosen": -2.408205509185791, + "logits/rejected": -2.408205509185791, + "logps/chosen": -313.0887756347656, + "logps/rejected": -313.0887756347656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.01138794980943203, + "rewards/margins": 0.0, + "rewards/rejected": -0.01138794980943203, + "step": 700 + }, + { + "epoch": 0.20433481719331534, + "eval_logits/chosen": -2.394869565963745, + "eval_logits/rejected": -2.394869565963745, + "eval_logps/chosen": -307.6349182128906, + "eval_logps/rejected": -307.6349182128906, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.011563203297555447, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.011563203297555447, + "eval_runtime": 2685.1829, + "eval_samples_per_second": 2.268, + "eval_steps_per_second": 0.284, + "step": 700 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 0.015380859375, + "learning_rate": 4.827095692751124e-06, + "logits/chosen": -2.4306788444519043, + "logits/rejected": -2.4306788444519043, + "logps/chosen": -295.8254089355469, + "logps/rejected": -295.8254089355469, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.009687040001153946, + "rewards/margins": 0.0, + "rewards/rejected": -0.009687040001153946, + "step": 710 + }, + { + "epoch": 0.21017295482741005, + "grad_norm": 0.0135498046875, + "learning_rate": 4.817662525515116e-06, + "logits/chosen": -2.399963855743408, + "logits/rejected": -2.399963855743408, + "logps/chosen": -285.0207824707031, + "logps/rejected": -285.0207824707031, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.010509965009987354, + "rewards/margins": 0.0, + "rewards/rejected": -0.010509965009987354, + "step": 720 + }, + { + "epoch": 0.21309202364445742, + "grad_norm": 0.01275634765625, + "learning_rate": 4.807988544731944e-06, + "logits/chosen": -2.4015610218048096, + "logits/rejected": -2.4015610218048096, + "logps/chosen": -301.6191711425781, + "logps/rejected": -301.6191711425781, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0023958988022059202, + "rewards/margins": 0.0, + "rewards/rejected": -0.0023958988022059202, + "step": 730 + }, + { + "epoch": 0.21601109246150477, + "grad_norm": 0.0120849609375, + "learning_rate": 4.7980747555633174e-06, + "logits/chosen": -2.421522617340088, + "logits/rejected": -2.421522617340088, + "logps/chosen": -300.5765380859375, + "logps/rejected": -300.5765380859375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0066505610011518, + "rewards/margins": 0.0, + "rewards/rejected": -0.0066505610011518, + "step": 740 + }, + { + "epoch": 0.21893016127855214, + "grad_norm": 0.0167236328125, + "learning_rate": 4.787922188087907e-06, + "logits/chosen": -2.3898696899414062, + "logits/rejected": -2.3898696899414062, + "logps/chosen": -312.099853515625, + "logps/rejected": -312.099853515625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.009563307277858257, + "rewards/margins": 0.0, + "rewards/rejected": -0.009563307277858257, + "step": 750 + }, + { + "epoch": 0.2218492300955995, + "grad_norm": 0.0185546875, + "learning_rate": 4.7775318971943165e-06, + "logits/chosen": -2.368053674697876, + "logits/rejected": -2.368053674697876, + "logps/chosen": -280.77703857421875, + "logps/rejected": -280.77703857421875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.008711813017725945, + "rewards/margins": 0.0, + "rewards/rejected": -0.008711813017725945, + "step": 760 + }, + { + "epoch": 0.22476829891264685, + "grad_norm": 0.01434326171875, + "learning_rate": 4.766904962471477e-06, + "logits/chosen": -2.428321361541748, + "logits/rejected": -2.428321361541748, + "logps/chosen": -283.40704345703125, + "logps/rejected": -283.40704345703125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0074463835917413235, + "rewards/margins": 0.0, + "rewards/rejected": -0.0074463835917413235, + "step": 770 + }, + { + "epoch": 0.22768736772969422, + "grad_norm": 0.020751953125, + "learning_rate": 4.756042488096472e-06, + "logits/chosen": -2.421441078186035, + "logits/rejected": -2.421441078186035, + "logps/chosen": -283.1347961425781, + "logps/rejected": -283.1347961425781, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.009277506731450558, + "rewards/margins": 0.0, + "rewards/rejected": -0.009277506731450558, + "step": 780 + }, + { + "epoch": 0.2306064365467416, + "grad_norm": 0.0169677734375, + "learning_rate": 4.744945602719806e-06, + "logits/chosen": -2.4225807189941406, + "logits/rejected": -2.4225807189941406, + "logps/chosen": -296.5173645019531, + "logps/rejected": -296.5173645019531, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.009408360347151756, + "rewards/margins": 0.0, + "rewards/rejected": -0.009408360347151756, + "step": 790 + }, + { + "epoch": 0.23352550536378894, + "grad_norm": 0.01495361328125, + "learning_rate": 4.733615459348143e-06, + "logits/chosen": -2.3777918815612793, + "logits/rejected": -2.3777918815612793, + "logps/chosen": -337.0318298339844, + "logps/rejected": -337.0318298339844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.012588550336658955, + "rewards/margins": 0.0, + "rewards/rejected": -0.012588550336658955, + "step": 800 + }, + { + "epoch": 0.23352550536378894, + "eval_logits/chosen": -2.394713878631592, + "eval_logits/rejected": -2.394713878631592, + "eval_logps/chosen": -307.6956787109375, + "eval_logps/rejected": -307.6956787109375, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.012170875445008278, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.012170875445008278, + "eval_runtime": 2762.1462, + "eval_samples_per_second": 2.205, + "eval_steps_per_second": 0.276, + "step": 800 + }, + { + "epoch": 0.2364445741808363, + "grad_norm": 0.0145263671875, + "learning_rate": 4.722053235224495e-06, + "logits/chosen": -2.4402616024017334, + "logits/rejected": -2.4402616024017334, + "logps/chosen": -333.5353698730469, + "logps/rejected": -333.5353698730469, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.008296088315546513, + "rewards/margins": 0.0, + "rewards/rejected": -0.008296088315546513, + "step": 810 + }, + { + "epoch": 0.23936364299788368, + "grad_norm": 0.0128173828125, + "learning_rate": 4.710260131705908e-06, + "logits/chosen": -2.411567211151123, + "logits/rejected": -2.411567211151123, + "logps/chosen": -274.9350280761719, + "logps/rejected": -274.9350280761719, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.015997527167201042, + "rewards/margins": 0.0, + "rewards/rejected": -0.015997527167201042, + "step": 820 + }, + { + "epoch": 0.24228271181493102, + "grad_norm": 0.01531982421875, + "learning_rate": 4.698237374138634e-06, + "logits/chosen": -2.420203447341919, + "logits/rejected": -2.420203447341919, + "logps/chosen": -312.3550720214844, + "logps/rejected": -312.3550720214844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.015846502035856247, + "rewards/margins": 0.0, + "rewards/rejected": -0.015846502035856247, + "step": 830 + }, + { + "epoch": 0.2452017806319784, + "grad_norm": 0.01513671875, + "learning_rate": 4.685986211730816e-06, + "logits/chosen": -2.3960068225860596, + "logits/rejected": -2.3960068225860596, + "logps/chosen": -331.6641845703125, + "logps/rejected": -331.6641845703125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.01894356682896614, + "rewards/margins": 0.0, + "rewards/rejected": -0.01894356682896614, + "step": 840 + }, + { + "epoch": 0.24812084944902577, + "grad_norm": 0.01165771484375, + "learning_rate": 4.6735079174226864e-06, + "logits/chosen": -2.408433198928833, + "logits/rejected": -2.408433198928833, + "logps/chosen": -269.3624572753906, + "logps/rejected": -269.3624572753906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.009970271959900856, + "rewards/margins": 0.0, + "rewards/rejected": -0.009970271959900856, + "step": 850 + }, + { + "epoch": 0.25103991826607314, + "grad_norm": 0.01483154296875, + "learning_rate": 4.660803787754306e-06, + "logits/chosen": -2.416790723800659, + "logits/rejected": -2.416790723800659, + "logps/chosen": -302.0819396972656, + "logps/rejected": -302.0819396972656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.010707431472837925, + "rewards/margins": 0.0, + "rewards/rejected": -0.010707431472837925, + "step": 860 + }, + { + "epoch": 0.2539589870831205, + "grad_norm": 0.0157470703125, + "learning_rate": 4.647875142730853e-06, + "logits/chosen": -2.3868987560272217, + "logits/rejected": -2.3868987560272217, + "logps/chosen": -299.74444580078125, + "logps/rejected": -299.74444580078125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.012594198808073997, + "rewards/margins": 0.0, + "rewards/rejected": -0.012594198808073997, + "step": 870 + }, + { + "epoch": 0.2568780559001678, + "grad_norm": 0.0140380859375, + "learning_rate": 4.634723325685462e-06, + "logits/chosen": -2.442610263824463, + "logits/rejected": -2.442610263824463, + "logps/chosen": -308.396240234375, + "logps/rejected": -308.396240234375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.011057281866669655, + "rewards/margins": 0.0, + "rewards/rejected": -0.011057281866669655, + "step": 880 + }, + { + "epoch": 0.2597971247172152, + "grad_norm": 0.0157470703125, + "learning_rate": 4.621349703139651e-06, + "logits/chosen": -2.4502758979797363, + "logits/rejected": -2.4502758979797363, + "logps/chosen": -327.5845031738281, + "logps/rejected": -327.5845031738281, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.012428502552211285, + "rewards/margins": 0.0, + "rewards/rejected": -0.012428502552211285, + "step": 890 + }, + { + "epoch": 0.26271619353426257, + "grad_norm": 0.01519775390625, + "learning_rate": 4.6077556646613365e-06, + "logits/chosen": -2.4429335594177246, + "logits/rejected": -2.4429335594177246, + "logps/chosen": -309.44598388671875, + "logps/rejected": -309.44598388671875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.008300786837935448, + "rewards/margins": 0.0, + "rewards/rejected": -0.008300786837935448, + "step": 900 + }, + { + "epoch": 0.26271619353426257, + "eval_logits/chosen": -2.396768093109131, + "eval_logits/rejected": -2.396768093109131, + "eval_logps/chosen": -307.1708068847656, + "eval_logps/rejected": -307.1708068847656, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.0069224112667143345, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.0069224112667143345, + "eval_runtime": 2667.1988, + "eval_samples_per_second": 2.283, + "eval_steps_per_second": 0.286, + "step": 900 + }, + { + "epoch": 0.2656352623513099, + "grad_norm": 0.0150146484375, + "learning_rate": 4.593942622720449e-06, + "logits/chosen": -2.431570529937744, + "logits/rejected": -2.431570529937744, + "logps/chosen": -333.9033203125, + "logps/rejected": -333.9033203125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.008790754713118076, + "rewards/margins": 0.0, + "rewards/rejected": -0.008790754713118076, + "step": 910 + }, + { + "epoch": 0.2685543311683573, + "grad_norm": 0.011474609375, + "learning_rate": 4.579912012542172e-06, + "logits/chosen": -2.4538259506225586, + "logits/rejected": -2.4538259506225586, + "logps/chosen": -330.14776611328125, + "logps/rejected": -330.14776611328125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.010161884129047394, + "rewards/margins": 0.0, + "rewards/rejected": -0.010161884129047394, + "step": 920 + }, + { + "epoch": 0.27147339998540465, + "grad_norm": 0.0164794921875, + "learning_rate": 4.565665291957821e-06, + "logits/chosen": -2.412051200866699, + "logits/rejected": -2.412051200866699, + "logps/chosen": -300.0600891113281, + "logps/rejected": -300.0600891113281, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.010114507749676704, + "rewards/margins": 0.0, + "rewards/rejected": -0.010114507749676704, + "step": 930 + }, + { + "epoch": 0.274392468802452, + "grad_norm": 0.0125732421875, + "learning_rate": 4.551203941253367e-06, + "logits/chosen": -2.4353108406066895, + "logits/rejected": -2.4353108406066895, + "logps/chosen": -288.15032958984375, + "logps/rejected": -288.15032958984375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.01036372222006321, + "rewards/margins": 0.0, + "rewards/rejected": -0.01036372222006321, + "step": 940 + }, + { + "epoch": 0.2773115376194994, + "grad_norm": 0.01434326171875, + "learning_rate": 4.5365294630156264e-06, + "logits/chosen": -2.4350383281707764, + "logits/rejected": -2.4350383281707764, + "logps/chosen": -319.06195068359375, + "logps/rejected": -319.06195068359375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.011402562260627747, + "rewards/margins": 0.0, + "rewards/rejected": -0.011402562260627747, + "step": 950 + }, + { + "epoch": 0.28023060643654674, + "grad_norm": 0.012451171875, + "learning_rate": 4.521643381976142e-06, + "logits/chosen": -2.428330898284912, + "logits/rejected": -2.428330898284912, + "logps/chosen": -322.0547790527344, + "logps/rejected": -322.0547790527344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.012878289446234703, + "rewards/margins": 0.0, + "rewards/rejected": -0.012878289446234703, + "step": 960 + }, + { + "epoch": 0.2831496752535941, + "grad_norm": 0.013671875, + "learning_rate": 4.506547244852756e-06, + "logits/chosen": -2.4220213890075684, + "logits/rejected": -2.4220213890075684, + "logps/chosen": -298.77056884765625, + "logps/rejected": -298.77056884765625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.012091143056750298, + "rewards/margins": 0.0, + "rewards/rejected": -0.012091143056750298, + "step": 970 + }, + { + "epoch": 0.2860687440706415, + "grad_norm": 0.0145263671875, + "learning_rate": 4.491242620188898e-06, + "logits/chosen": -2.400778293609619, + "logits/rejected": -2.400778293609619, + "logps/chosen": -302.7762756347656, + "logps/rejected": -302.7762756347656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.01696743816137314, + "rewards/margins": 0.0, + "rewards/rejected": -0.01696743816137314, + "step": 980 + }, + { + "epoch": 0.2889878128876888, + "grad_norm": 0.012451171875, + "learning_rate": 4.475731098190611e-06, + "logits/chosen": -2.4159862995147705, + "logits/rejected": -2.4159862995147705, + "logps/chosen": -278.34356689453125, + "logps/rejected": -278.34356689453125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.014010600745677948, + "rewards/margins": 0.0, + "rewards/rejected": -0.014010600745677948, + "step": 990 + }, + { + "epoch": 0.29190688170473617, + "grad_norm": 0.0145263671875, + "learning_rate": 4.4600142905613216e-06, + "logits/chosen": -2.416891098022461, + "logits/rejected": -2.416891098022461, + "logps/chosen": -310.4523620605469, + "logps/rejected": -310.4523620605469, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.01909947767853737, + "rewards/margins": 0.0, + "rewards/rejected": -0.01909947767853737, + "step": 1000 + }, + { + "epoch": 0.29190688170473617, + "eval_logits/chosen": -2.3967111110687256, + "eval_logits/rejected": -2.3967111110687256, + "eval_logps/chosen": -308.2130432128906, + "eval_logps/rejected": -308.2130432128906, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.017344659194350243, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.017344659194350243, + "eval_runtime": 2668.0913, + "eval_samples_per_second": 2.283, + "eval_steps_per_second": 0.286, + "step": 1000 + }, + { + "epoch": 0.29482595052178356, + "grad_norm": 0.029296875, + "learning_rate": 4.444093830334381e-06, + "logits/chosen": -2.395017147064209, + "logits/rejected": -2.395017147064209, + "logps/chosen": -330.1224670410156, + "logps/rejected": -330.1224670410156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.013958754017949104, + "rewards/margins": 0.0, + "rewards/rejected": -0.013958754017949104, + "step": 1010 + }, + { + "epoch": 0.2977450193388309, + "grad_norm": 0.01611328125, + "learning_rate": 4.427971371703378e-06, + "logits/chosen": -2.4404492378234863, + "logits/rejected": -2.4404492378234863, + "logps/chosen": -314.79888916015625, + "logps/rejected": -314.79888916015625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.027685949578881264, + "rewards/margins": 0.0, + "rewards/rejected": -0.027685949578881264, + "step": 1020 + }, + { + "epoch": 0.30066408815587825, + "grad_norm": 0.01263427734375, + "learning_rate": 4.411648589850276e-06, + "logits/chosen": -2.4368889331817627, + "logits/rejected": -2.4368889331817627, + "logps/chosen": -299.6970520019531, + "logps/rejected": -299.6970520019531, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.01648426428437233, + "rewards/margins": 0.0, + "rewards/rejected": -0.01648426428437233, + "step": 1030 + }, + { + "epoch": 0.30358315697292565, + "grad_norm": 0.01416015625, + "learning_rate": 4.395127180771342e-06, + "logits/chosen": -2.4541175365448, + "logits/rejected": -2.4541175365448, + "logps/chosen": -326.87841796875, + "logps/rejected": -326.87841796875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.020237499848008156, + "rewards/margins": 0.0, + "rewards/rejected": -0.020237499848008156, + "step": 1040 + }, + { + "epoch": 0.306502225789973, + "grad_norm": 0.01318359375, + "learning_rate": 4.378408861100937e-06, + "logits/chosen": -2.415283203125, + "logits/rejected": -2.415283203125, + "logps/chosen": -261.1552429199219, + "logps/rejected": -261.1552429199219, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.01741962879896164, + "rewards/margins": 0.0, + "rewards/rejected": -0.01741962879896164, + "step": 1050 + }, + { + "epoch": 0.30942129460702034, + "grad_norm": 0.01416015625, + "learning_rate": 4.361495367933144e-06, + "logits/chosen": -2.396031141281128, + "logits/rejected": -2.396031141281128, + "logps/chosen": -322.30377197265625, + "logps/rejected": -322.30377197265625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.014474359340965748, + "rewards/margins": 0.0, + "rewards/rejected": -0.014474359340965748, + "step": 1060 + }, + { + "epoch": 0.31234036342406774, + "grad_norm": 0.0181884765625, + "learning_rate": 4.344388458641283e-06, + "logits/chosen": -2.4288814067840576, + "logits/rejected": -2.4288814067840576, + "logps/chosen": -324.64501953125, + "logps/rejected": -324.64501953125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.025701653212308884, + "rewards/margins": 0.0, + "rewards/rejected": -0.025701653212308884, + "step": 1070 + }, + { + "epoch": 0.3152594322411151, + "grad_norm": 0.0164794921875, + "learning_rate": 4.32708991069531e-06, + "logits/chosen": -2.411003589630127, + "logits/rejected": -2.411003589630127, + "logps/chosen": -318.289794921875, + "logps/rejected": -318.289794921875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.02725202962756157, + "rewards/margins": 0.0, + "rewards/rejected": -0.02725202962756157, + "step": 1080 + }, + { + "epoch": 0.3181785010581624, + "grad_norm": 0.01312255859375, + "learning_rate": 4.309601521477134e-06, + "logits/chosen": -2.437730550765991, + "logits/rejected": -2.437730550765991, + "logps/chosen": -318.1125793457031, + "logps/rejected": -318.1125793457031, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.035508617758750916, + "rewards/margins": 0.0, + "rewards/rejected": -0.035508617758750916, + "step": 1090 + }, + { + "epoch": 0.3210975698752098, + "grad_norm": 0.01373291015625, + "learning_rate": 4.291925108093856e-06, + "logits/chosen": -2.4134514331817627, + "logits/rejected": -2.4134514331817627, + "logps/chosen": -306.98712158203125, + "logps/rejected": -306.98712158203125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.02741456963121891, + "rewards/margins": 0.0, + "rewards/rejected": -0.02741456963121891, + "step": 1100 + }, + { + "epoch": 0.3210975698752098, + "eval_logits/chosen": -2.3970751762390137, + "eval_logits/rejected": -2.3970751762390137, + "eval_logps/chosen": -309.472412109375, + "eval_logps/rejected": -309.472412109375, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.029938040301203728, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.029938040301203728, + "eval_runtime": 2667.8688, + "eval_samples_per_second": 2.283, + "eval_steps_per_second": 0.286, + "step": 1100 + }, + { + "epoch": 0.32401663869225716, + "grad_norm": 0.0120849609375, + "learning_rate": 4.274062507188978e-06, + "logits/chosen": -2.413846492767334, + "logits/rejected": -2.413846492767334, + "logps/chosen": -319.53887939453125, + "logps/rejected": -319.53887939453125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03637847676873207, + "rewards/margins": 0.0, + "rewards/rejected": -0.03637847676873207, + "step": 1110 + }, + { + "epoch": 0.3269357075093045, + "grad_norm": 0.0130615234375, + "learning_rate": 4.256015574751555e-06, + "logits/chosen": -2.443239212036133, + "logits/rejected": -2.443239212036133, + "logps/chosen": -302.9671630859375, + "logps/rejected": -302.9671630859375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.032804206013679504, + "rewards/margins": 0.0, + "rewards/rejected": -0.032804206013679504, + "step": 1120 + }, + { + "epoch": 0.3298547763263519, + "grad_norm": 0.0159912109375, + "learning_rate": 4.2377861859233604e-06, + "logits/chosen": -2.4368813037872314, + "logits/rejected": -2.4368813037872314, + "logps/chosen": -277.4005126953125, + "logps/rejected": -277.4005126953125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.030909573659300804, + "rewards/margins": 0.0, + "rewards/rejected": -0.030909573659300804, + "step": 1130 + }, + { + "epoch": 0.33277384514339925, + "grad_norm": 0.01263427734375, + "learning_rate": 4.219376234804047e-06, + "logits/chosen": -2.4358789920806885, + "logits/rejected": -2.4358789920806885, + "logps/chosen": -294.87567138671875, + "logps/rejected": -294.87567138671875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.033531349152326584, + "rewards/margins": 0.0, + "rewards/rejected": -0.033531349152326584, + "step": 1140 + }, + { + "epoch": 0.3356929139604466, + "grad_norm": 0.01519775390625, + "learning_rate": 4.200787634254345e-06, + "logits/chosen": -2.458458662033081, + "logits/rejected": -2.458458662033081, + "logps/chosen": -284.5567321777344, + "logps/rejected": -284.5567321777344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.029438916593790054, + "rewards/margins": 0.0, + "rewards/rejected": -0.029438916593790054, + "step": 1150 + }, + { + "epoch": 0.338611982777494, + "grad_norm": 0.0157470703125, + "learning_rate": 4.18202231569731e-06, + "logits/chosen": -2.465770721435547, + "logits/rejected": -2.465770721435547, + "logps/chosen": -325.60443115234375, + "logps/rejected": -325.60443115234375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03488076478242874, + "rewards/margins": 0.0, + "rewards/rejected": -0.03488076478242874, + "step": 1160 + }, + { + "epoch": 0.34153105159454133, + "grad_norm": 0.0194091796875, + "learning_rate": 4.163082228917639e-06, + "logits/chosen": -2.42230224609375, + "logits/rejected": -2.42230224609375, + "logps/chosen": -332.96807861328125, + "logps/rejected": -332.96807861328125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03761008754372597, + "rewards/margins": 0.0, + "rewards/rejected": -0.03761008754372597, + "step": 1170 + }, + { + "epoch": 0.3444501204115887, + "grad_norm": 0.01519775390625, + "learning_rate": 4.143969341859083e-06, + "logits/chosen": -2.4006218910217285, + "logits/rejected": -2.4006218910217285, + "logps/chosen": -298.38372802734375, + "logps/rejected": -298.38372802734375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.027944693341851234, + "rewards/margins": 0.0, + "rewards/rejected": -0.027944693341851234, + "step": 1180 + }, + { + "epoch": 0.3473691892286361, + "grad_norm": 0.0167236328125, + "learning_rate": 4.124685640419967e-06, + "logits/chosen": -2.4376044273376465, + "logits/rejected": -2.4376044273376465, + "logps/chosen": -339.3370666503906, + "logps/rejected": -339.3370666503906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04471305012702942, + "rewards/margins": 0.0, + "rewards/rejected": -0.04471305012702942, + "step": 1190 + }, + { + "epoch": 0.3502882580456834, + "grad_norm": 0.015625, + "learning_rate": 4.105233128246849e-06, + "logits/chosen": -2.4307379722595215, + "logits/rejected": -2.4307379722595215, + "logps/chosen": -314.7157287597656, + "logps/rejected": -314.7157287597656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04377968981862068, + "rewards/margins": 0.0, + "rewards/rejected": -0.04377968981862068, + "step": 1200 + }, + { + "epoch": 0.3502882580456834, + "eval_logits/chosen": -2.3975985050201416, + "eval_logits/rejected": -2.3975985050201416, + "eval_logps/chosen": -310.0194091796875, + "eval_logps/rejected": -310.0194091796875, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.035407647490501404, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.035407647490501404, + "eval_runtime": 2667.8638, + "eval_samples_per_second": 2.283, + "eval_steps_per_second": 0.286, + "step": 1200 + }, + { + "epoch": 0.35320732686273076, + "grad_norm": 0.01373291015625, + "learning_rate": 4.085613826526338e-06, + "logits/chosen": -2.4104952812194824, + "logits/rejected": -2.4104952812194824, + "logps/chosen": -307.89056396484375, + "logps/rejected": -307.89056396484375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.034878071397542953, + "rewards/margins": 0.0, + "rewards/rejected": -0.034878071397542953, + "step": 1210 + }, + { + "epoch": 0.35612639567977816, + "grad_norm": 0.0130615234375, + "learning_rate": 4.065829773775082e-06, + "logits/chosen": -2.454697847366333, + "logits/rejected": -2.454697847366333, + "logps/chosen": -331.95556640625, + "logps/rejected": -331.95556640625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.035688284784555435, + "rewards/margins": 0.0, + "rewards/rejected": -0.035688284784555435, + "step": 1220 + }, + { + "epoch": 0.3590454644968255, + "grad_norm": 0.01318359375, + "learning_rate": 4.045883025627957e-06, + "logits/chosen": -2.416503429412842, + "logits/rejected": -2.416503429412842, + "logps/chosen": -317.5516662597656, + "logps/rejected": -317.5516662597656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.036794569343328476, + "rewards/margins": 0.0, + "rewards/rejected": -0.036794569343328476, + "step": 1230 + }, + { + "epoch": 0.36196453331387285, + "grad_norm": 0.0159912109375, + "learning_rate": 4.025775654624481e-06, + "logits/chosen": -2.431762218475342, + "logits/rejected": -2.431762218475342, + "logps/chosen": -286.4144592285156, + "logps/rejected": -286.4144592285156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0327475443482399, + "rewards/margins": 0.0, + "rewards/rejected": -0.0327475443482399, + "step": 1240 + }, + { + "epoch": 0.36488360213092025, + "grad_norm": 0.01373291015625, + "learning_rate": 4.005509749993471e-06, + "logits/chosen": -2.4348835945129395, + "logits/rejected": -2.4348835945129395, + "logps/chosen": -264.43670654296875, + "logps/rejected": -264.43670654296875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03447514772415161, + "rewards/margins": 0.0, + "rewards/rejected": -0.03447514772415161, + "step": 1250 + }, + { + "epoch": 0.3678026709479676, + "grad_norm": 0.01544189453125, + "learning_rate": 3.985087417435964e-06, + "logits/chosen": -2.4379494190216064, + "logits/rejected": -2.4379494190216064, + "logps/chosen": -306.0783386230469, + "logps/rejected": -306.0783386230469, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03243451565504074, + "rewards/margins": 0.0, + "rewards/rejected": -0.03243451565504074, + "step": 1260 + }, + { + "epoch": 0.37072173976501493, + "grad_norm": 0.01318359375, + "learning_rate": 3.964510778906425e-06, + "logits/chosen": -2.434380292892456, + "logits/rejected": -2.434380292892456, + "logps/chosen": -316.9388427734375, + "logps/rejected": -316.9388427734375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.038867734372615814, + "rewards/margins": 0.0, + "rewards/rejected": -0.038867734372615814, + "step": 1270 + }, + { + "epoch": 0.37364080858206233, + "grad_norm": 0.0142822265625, + "learning_rate": 3.943781972392269e-06, + "logits/chosen": -2.4212710857391357, + "logits/rejected": -2.4212710857391357, + "logps/chosen": -326.74237060546875, + "logps/rejected": -326.74237060546875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.038525618612766266, + "rewards/margins": 0.0, + "rewards/rejected": -0.038525618612766266, + "step": 1280 + }, + { + "epoch": 0.3765598773991097, + "grad_norm": 0.016357421875, + "learning_rate": 3.922903151691716e-06, + "logits/chosen": -2.450032949447632, + "logits/rejected": -2.450032949447632, + "logps/chosen": -329.82073974609375, + "logps/rejected": -329.82073974609375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.037473224103450775, + "rewards/margins": 0.0, + "rewards/rejected": -0.037473224103450775, + "step": 1290 + }, + { + "epoch": 0.379478946216157, + "grad_norm": 0.018310546875, + "learning_rate": 3.901876486190008e-06, + "logits/chosen": -2.4351401329040527, + "logits/rejected": -2.4351401329040527, + "logps/chosen": -315.5516662597656, + "logps/rejected": -315.5516662597656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03452508896589279, + "rewards/margins": 0.0, + "rewards/rejected": -0.03452508896589279, + "step": 1300 + }, + { + "epoch": 0.379478946216157, + "eval_logits/chosen": -2.3963370323181152, + "eval_logits/rejected": -2.3963370323181152, + "eval_logps/chosen": -309.5113525390625, + "eval_logps/rejected": -309.5113525390625, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.030327608808875084, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.030327608808875084, + "eval_runtime": 2666.8225, + "eval_samples_per_second": 2.284, + "eval_steps_per_second": 0.286, + "step": 1300 + }, + { + "epoch": 0.3823980150332044, + "grad_norm": 0.018798828125, + "learning_rate": 3.880704160633995e-06, + "logits/chosen": -2.4444994926452637, + "logits/rejected": -2.4444994926452637, + "logps/chosen": -295.88348388671875, + "logps/rejected": -295.88348388671875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.036420173943042755, + "rewards/margins": 0.0, + "rewards/rejected": -0.036420173943042755, + "step": 1310 + }, + { + "epoch": 0.38531708385025176, + "grad_norm": 0.013671875, + "learning_rate": 3.859388374905136e-06, + "logits/chosen": -2.41549015045166, + "logits/rejected": -2.41549015045166, + "logps/chosen": -291.2346496582031, + "logps/rejected": -291.2346496582031, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03046022728085518, + "rewards/margins": 0.0, + "rewards/rejected": -0.03046022728085518, + "step": 1320 + }, + { + "epoch": 0.3882361526672991, + "grad_norm": 0.0152587890625, + "learning_rate": 3.837931343790924e-06, + "logits/chosen": -2.4401891231536865, + "logits/rejected": -2.4401891231536865, + "logps/chosen": -297.060791015625, + "logps/rejected": -297.060791015625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.02374189719557762, + "rewards/margins": 0.0, + "rewards/rejected": -0.02374189719557762, + "step": 1330 + }, + { + "epoch": 0.3911552214843465, + "grad_norm": 0.0152587890625, + "learning_rate": 3.8163352967547575e-06, + "logits/chosen": -2.4282491207122803, + "logits/rejected": -2.4282491207122803, + "logps/chosen": -350.7884216308594, + "logps/rejected": -350.7884216308594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.031809043139219284, + "rewards/margins": 0.0, + "rewards/rejected": -0.031809043139219284, + "step": 1340 + }, + { + "epoch": 0.39407429030139385, + "grad_norm": 0.01190185546875, + "learning_rate": 3.7946024777042974e-06, + "logits/chosen": -2.423346996307373, + "logits/rejected": -2.423346996307373, + "logps/chosen": -300.26800537109375, + "logps/rejected": -300.26800537109375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.029370862990617752, + "rewards/margins": 0.0, + "rewards/rejected": -0.029370862990617752, + "step": 1350 + }, + { + "epoch": 0.3969933591184412, + "grad_norm": 0.01953125, + "learning_rate": 3.7727351447583095e-06, + "logits/chosen": -2.397026538848877, + "logits/rejected": -2.397026538848877, + "logps/chosen": -318.9501647949219, + "logps/rejected": -318.9501647949219, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.030634000897407532, + "rewards/margins": 0.0, + "rewards/rejected": -0.030634000897407532, + "step": 1360 + }, + { + "epoch": 0.3999124279354886, + "grad_norm": 0.01385498046875, + "learning_rate": 3.750735570012043e-06, + "logits/chosen": -2.438441276550293, + "logits/rejected": -2.438441276550293, + "logps/chosen": -330.5710754394531, + "logps/rejected": -330.5710754394531, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03722671791911125, + "rewards/margins": 0.0, + "rewards/rejected": -0.03722671791911125, + "step": 1370 + }, + { + "epoch": 0.40283149675253593, + "grad_norm": 0.01806640625, + "learning_rate": 3.7286060393011513e-06, + "logits/chosen": -2.419067144393921, + "logits/rejected": -2.419067144393921, + "logps/chosen": -314.528564453125, + "logps/rejected": -314.528564453125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.032639987766742706, + "rewards/margins": 0.0, + "rewards/rejected": -0.032639987766742706, + "step": 1380 + }, + { + "epoch": 0.4057505655695833, + "grad_norm": 0.01904296875, + "learning_rate": 3.7063488519641825e-06, + "logits/chosen": -2.4223015308380127, + "logits/rejected": -2.4223015308380127, + "logps/chosen": -329.4114685058594, + "logps/rejected": -329.4114685058594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03504698723554611, + "rewards/margins": 0.0, + "rewards/rejected": -0.03504698723554611, + "step": 1390 + }, + { + "epoch": 0.4086696343866307, + "grad_norm": 0.0162353515625, + "learning_rate": 3.6839663206036715e-06, + "logits/chosen": -2.4432168006896973, + "logits/rejected": -2.4432168006896973, + "logps/chosen": -293.8369445800781, + "logps/rejected": -293.8369445800781, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.031177738681435585, + "rewards/margins": 0.0, + "rewards/rejected": -0.031177738681435585, + "step": 1400 + }, + { + "epoch": 0.4086696343866307, + "eval_logits/chosen": -2.395508050918579, + "eval_logits/rejected": -2.395508050918579, + "eval_logps/chosen": -309.2061462402344, + "eval_logps/rejected": -309.2061462402344, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.02727527543902397, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.02727527543902397, + "eval_runtime": 2667.2175, + "eval_samples_per_second": 2.283, + "eval_steps_per_second": 0.286, + "step": 1400 + }, + { + "epoch": 0.411588703203678, + "grad_norm": 0.01239013671875, + "learning_rate": 3.6614607708458532e-06, + "logits/chosen": -2.418804883956909, + "logits/rejected": -2.418804883956909, + "logps/chosen": -295.696533203125, + "logps/rejected": -295.696533203125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.024944758042693138, + "rewards/margins": 0.0, + "rewards/rejected": -0.024944758042693138, + "step": 1410 + }, + { + "epoch": 0.41450777202072536, + "grad_norm": 0.0146484375, + "learning_rate": 3.6388345410990195e-06, + "logits/chosen": -2.4199652671813965, + "logits/rejected": -2.4199652671813965, + "logps/chosen": -341.0202331542969, + "logps/rejected": -341.0202331542969, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.030235985293984413, + "rewards/margins": 0.0, + "rewards/rejected": -0.030235985293984413, + "step": 1420 + }, + { + "epoch": 0.41742684083777276, + "grad_norm": 0.01141357421875, + "learning_rate": 3.6160899823105518e-06, + "logits/chosen": -2.4291069507598877, + "logits/rejected": -2.4291069507598877, + "logps/chosen": -287.2336730957031, + "logps/rejected": -287.2336730957031, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0277925543487072, + "rewards/margins": 0.0, + "rewards/rejected": -0.0277925543487072, + "step": 1430 + }, + { + "epoch": 0.4203459096548201, + "grad_norm": 0.0140380859375, + "learning_rate": 3.5932294577226468e-06, + "logits/chosen": -2.440561532974243, + "logits/rejected": -2.440561532974243, + "logps/chosen": -276.7684020996094, + "logps/rejected": -276.7684020996094, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.017870336771011353, + "rewards/margins": 0.0, + "rewards/rejected": -0.017870336771011353, + "step": 1440 + }, + { + "epoch": 0.42326497847186745, + "grad_norm": 0.0118408203125, + "learning_rate": 3.5702553426267704e-06, + "logits/chosen": -2.449218988418579, + "logits/rejected": -2.449218988418579, + "logps/chosen": -305.78814697265625, + "logps/rejected": -305.78814697265625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.024231892079114914, + "rewards/margins": 0.0, + "rewards/rejected": -0.024231892079114914, + "step": 1450 + }, + { + "epoch": 0.42618404728891485, + "grad_norm": 0.015625, + "learning_rate": 3.547170024116854e-06, + "logits/chosen": -2.4015636444091797, + "logits/rejected": -2.4015636444091797, + "logps/chosen": -281.1402893066406, + "logps/rejected": -281.1402893066406, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.027333328500390053, + "rewards/margins": 0.0, + "rewards/rejected": -0.027333328500390053, + "step": 1460 + }, + { + "epoch": 0.4291031161059622, + "grad_norm": 0.0164794921875, + "learning_rate": 3.5239759008412666e-06, + "logits/chosen": -2.461341381072998, + "logits/rejected": -2.461341381072998, + "logps/chosen": -315.0804443359375, + "logps/rejected": -315.0804443359375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.026240995153784752, + "rewards/margins": 0.0, + "rewards/rejected": -0.026240995153784752, + "step": 1470 + }, + { + "epoch": 0.43202218492300953, + "grad_norm": 0.0164794921875, + "learning_rate": 3.500675382753588e-06, + "logits/chosen": -2.420381784439087, + "logits/rejected": -2.420381784439087, + "logps/chosen": -310.7515563964844, + "logps/rejected": -310.7515563964844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.023152858018875122, + "rewards/margins": 0.0, + "rewards/rejected": -0.023152858018875122, + "step": 1480 + }, + { + "epoch": 0.43494125374005693, + "grad_norm": 0.01336669921875, + "learning_rate": 3.477270890862204e-06, + "logits/chosen": -2.3881866931915283, + "logits/rejected": -2.3881866931915283, + "logps/chosen": -318.3128356933594, + "logps/rejected": -318.3128356933594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.030725980177521706, + "rewards/margins": 0.0, + "rewards/rejected": -0.030725980177521706, + "step": 1490 + }, + { + "epoch": 0.4378603225571043, + "grad_norm": 0.0140380859375, + "learning_rate": 3.453764856978758e-06, + "logits/chosen": -2.409209728240967, + "logits/rejected": -2.409209728240967, + "logps/chosen": -331.4593200683594, + "logps/rejected": -331.4593200683594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.022285277023911476, + "rewards/margins": 0.0, + "rewards/rejected": -0.022285277023911476, + "step": 1500 + }, + { + "epoch": 0.4378603225571043, + "eval_logits/chosen": -2.394321918487549, + "eval_logits/rejected": -2.394321918487549, + "eval_logps/chosen": -308.9651794433594, + "eval_logps/rejected": -308.9651794433594, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.024866018444299698, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.024866018444299698, + "eval_runtime": 2666.7789, + "eval_samples_per_second": 2.284, + "eval_steps_per_second": 0.286, + "step": 1500 + }, + { + "epoch": 0.4407793913741516, + "grad_norm": 0.01312255859375, + "learning_rate": 3.4301597234654733e-06, + "logits/chosen": -2.4193215370178223, + "logits/rejected": -2.4193215370178223, + "logps/chosen": -304.951171875, + "logps/rejected": -304.951171875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.02852988801896572, + "rewards/margins": 0.0, + "rewards/rejected": -0.02852988801896572, + "step": 1510 + }, + { + "epoch": 0.443698460191199, + "grad_norm": 0.0177001953125, + "learning_rate": 3.406457942981384e-06, + "logits/chosen": -2.430614948272705, + "logits/rejected": -2.430614948272705, + "logps/chosen": -333.06988525390625, + "logps/rejected": -333.06988525390625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.024759288877248764, + "rewards/margins": 0.0, + "rewards/rejected": -0.024759288877248764, + "step": 1520 + }, + { + "epoch": 0.44661752900824636, + "grad_norm": 0.0133056640625, + "learning_rate": 3.3826619782274954e-06, + "logits/chosen": -2.43021559715271, + "logits/rejected": -2.43021559715271, + "logps/chosen": -284.0345153808594, + "logps/rejected": -284.0345153808594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.025433775037527084, + "rewards/margins": 0.0, + "rewards/rejected": -0.025433775037527084, + "step": 1530 + }, + { + "epoch": 0.4495365978252937, + "grad_norm": 0.0142822265625, + "learning_rate": 3.3587743016909013e-06, + "logits/chosen": -2.439312219619751, + "logits/rejected": -2.439312219619751, + "logps/chosen": -320.015380859375, + "logps/rejected": -320.015380859375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.02944205328822136, + "rewards/margins": 0.0, + "rewards/rejected": -0.02944205328822136, + "step": 1540 + }, + { + "epoch": 0.4524556666423411, + "grad_norm": 0.044677734375, + "learning_rate": 3.334797395387882e-06, + "logits/chosen": -2.4262938499450684, + "logits/rejected": -2.4262938499450684, + "logps/chosen": -329.60504150390625, + "logps/rejected": -329.60504150390625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.027106398716568947, + "rewards/margins": 0.0, + "rewards/rejected": -0.027106398716568947, + "step": 1550 + }, + { + "epoch": 0.45537473545938845, + "grad_norm": 0.01226806640625, + "learning_rate": 3.3107337506060145e-06, + "logits/chosen": -2.4414420127868652, + "logits/rejected": -2.4414420127868652, + "logps/chosen": -289.9877014160156, + "logps/rejected": -289.9877014160156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.028158003464341164, + "rewards/margins": 0.0, + "rewards/rejected": -0.028158003464341164, + "step": 1560 + }, + { + "epoch": 0.4582938042764358, + "grad_norm": 0.0301513671875, + "learning_rate": 3.2865858676453172e-06, + "logits/chosen": -2.434182643890381, + "logits/rejected": -2.434182643890381, + "logps/chosen": -306.0428466796875, + "logps/rejected": -306.0428466796875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.02475564181804657, + "rewards/margins": 0.0, + "rewards/rejected": -0.02475564181804657, + "step": 1570 + }, + { + "epoch": 0.4612128730934832, + "grad_norm": 0.0098876953125, + "learning_rate": 3.2623562555584633e-06, + "logits/chosen": -2.430816411972046, + "logits/rejected": -2.430816411972046, + "logps/chosen": -281.2196960449219, + "logps/rejected": -281.2196960449219, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.02929893136024475, + "rewards/margins": 0.0, + "rewards/rejected": -0.02929893136024475, + "step": 1580 + }, + { + "epoch": 0.46413194191053053, + "grad_norm": 0.024658203125, + "learning_rate": 3.2380474318900766e-06, + "logits/chosen": -2.4165406227111816, + "logits/rejected": -2.4165406227111816, + "logps/chosen": -310.68511962890625, + "logps/rejected": -310.68511962890625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03376628831028938, + "rewards/margins": 0.0, + "rewards/rejected": -0.03376628831028938, + "step": 1590 + }, + { + "epoch": 0.4670510107275779, + "grad_norm": 0.016845703125, + "learning_rate": 3.2136619224151533e-06, + "logits/chosen": -2.4508678913116455, + "logits/rejected": -2.4508678913116455, + "logps/chosen": -327.84619140625, + "logps/rejected": -327.84619140625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03426826745271683, + "rewards/margins": 0.0, + "rewards/rejected": -0.03426826745271683, + "step": 1600 + }, + { + "epoch": 0.4670510107275779, + "eval_logits/chosen": -2.3953943252563477, + "eval_logits/rejected": -2.3953943252563477, + "eval_logps/chosen": -309.15863037109375, + "eval_logps/rejected": -309.15863037109375, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.026800233870744705, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.026800233870744705, + "eval_runtime": 2666.9806, + "eval_samples_per_second": 2.283, + "eval_steps_per_second": 0.286, + "step": 1600 + }, + { + "epoch": 0.4699700795446253, + "grad_norm": 0.014892578125, + "learning_rate": 3.1892022608766215e-06, + "logits/chosen": -2.361971378326416, + "logits/rejected": -2.361971378326416, + "logps/chosen": -299.3944396972656, + "logps/rejected": -299.3944396972656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0262086633592844, + "rewards/margins": 0.0, + "rewards/rejected": -0.0262086633592844, + "step": 1610 + }, + { + "epoch": 0.4728891483616726, + "grad_norm": 0.01422119140625, + "learning_rate": 3.16467098872208e-06, + "logits/chosen": -2.4706971645355225, + "logits/rejected": -2.4706971645355225, + "logps/chosen": -332.5861511230469, + "logps/rejected": -332.5861511230469, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.034811943769454956, + "rewards/margins": 0.0, + "rewards/rejected": -0.034811943769454956, + "step": 1620 + }, + { + "epoch": 0.47580821717871996, + "grad_norm": 0.032470703125, + "learning_rate": 3.140070654839728e-06, + "logits/chosen": -2.4026148319244385, + "logits/rejected": -2.4026148319244385, + "logps/chosen": -296.76605224609375, + "logps/rejected": -296.76605224609375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.02368254028260708, + "rewards/margins": 0.0, + "rewards/rejected": -0.02368254028260708, + "step": 1630 + }, + { + "epoch": 0.47872728599576736, + "grad_norm": 0.0242919921875, + "learning_rate": 3.115403815293532e-06, + "logits/chosen": -2.43617582321167, + "logits/rejected": -2.43617582321167, + "logps/chosen": -342.2427062988281, + "logps/rejected": -342.2427062988281, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.036105576902627945, + "rewards/margins": 0.0, + "rewards/rejected": -0.036105576902627945, + "step": 1640 + }, + { + "epoch": 0.4816463548128147, + "grad_norm": 0.0113525390625, + "learning_rate": 3.0906730330576345e-06, + "logits/chosen": -2.4739155769348145, + "logits/rejected": -2.4739155769348145, + "logps/chosen": -332.26678466796875, + "logps/rejected": -332.26678466796875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.028261488303542137, + "rewards/margins": 0.0, + "rewards/rejected": -0.028261488303542137, + "step": 1650 + }, + { + "epoch": 0.48456542362986205, + "grad_norm": 0.017333984375, + "learning_rate": 3.065880877750059e-06, + "logits/chosen": -2.427436351776123, + "logits/rejected": -2.427436351776123, + "logps/chosen": -304.4495544433594, + "logps/rejected": -304.4495544433594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03417867794632912, + "rewards/margins": 0.0, + "rewards/rejected": -0.03417867794632912, + "step": 1660 + }, + { + "epoch": 0.48748449244690945, + "grad_norm": 0.01226806640625, + "learning_rate": 3.041029925365711e-06, + "logits/chosen": -2.4058425426483154, + "logits/rejected": -2.4058425426483154, + "logps/chosen": -308.30072021484375, + "logps/rejected": -308.30072021484375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.035630594938993454, + "rewards/margins": 0.0, + "rewards/rejected": -0.035630594938993454, + "step": 1670 + }, + { + "epoch": 0.4904035612639568, + "grad_norm": 0.0126953125, + "learning_rate": 3.0161227580087282e-06, + "logits/chosen": -2.433281421661377, + "logits/rejected": -2.433281421661377, + "logps/chosen": -342.0614013671875, + "logps/rejected": -342.0614013671875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03289630264043808, + "rewards/margins": 0.0, + "rewards/rejected": -0.03289630264043808, + "step": 1680 + }, + { + "epoch": 0.49332263008100413, + "grad_norm": 0.0123291015625, + "learning_rate": 2.9911619636241862e-06, + "logits/chosen": -2.4333884716033936, + "logits/rejected": -2.4333884716033936, + "logps/chosen": -322.1616516113281, + "logps/rejected": -322.1616516113281, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.034327663481235504, + "rewards/margins": 0.0, + "rewards/rejected": -0.034327663481235504, + "step": 1690 + }, + { + "epoch": 0.49624169889805153, + "grad_norm": 0.01275634765625, + "learning_rate": 2.966150135729203e-06, + "logits/chosen": -2.38623046875, + "logits/rejected": -2.38623046875, + "logps/chosen": -335.8984680175781, + "logps/rejected": -335.8984680175781, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03050742670893669, + "rewards/margins": 0.0, + "rewards/rejected": -0.03050742670893669, + "step": 1700 + }, + { + "epoch": 0.49624169889805153, + "eval_logits/chosen": -2.3913044929504395, + "eval_logits/rejected": -2.3913044929504395, + "eval_logps/chosen": -309.405517578125, + "eval_logps/rejected": -309.405517578125, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.029269486665725708, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.029269486665725708, + "eval_runtime": 2669.612, + "eval_samples_per_second": 2.281, + "eval_steps_per_second": 0.285, + "step": 1700 + }, + { + "epoch": 0.4991607677150989, + "grad_norm": 0.01324462890625, + "learning_rate": 2.9410898731434667e-06, + "logits/chosen": -2.41214919090271, + "logits/rejected": -2.41214919090271, + "logps/chosen": -302.40887451171875, + "logps/rejected": -302.40887451171875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.028926188126206398, + "rewards/margins": 0.0, + "rewards/rejected": -0.028926188126206398, + "step": 1710 + }, + { + "epoch": 0.5020798365321463, + "grad_norm": 0.0152587890625, + "learning_rate": 2.9159837797192003e-06, + "logits/chosen": -2.415527820587158, + "logits/rejected": -2.415527820587158, + "logps/chosen": -329.7999267578125, + "logps/rejected": -329.7999267578125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03768650442361832, + "rewards/margins": 0.0, + "rewards/rejected": -0.03768650442361832, + "step": 1720 + }, + { + "epoch": 0.5049989053491936, + "grad_norm": 0.014404296875, + "learning_rate": 2.890834464070623e-06, + "logits/chosen": -2.4205574989318848, + "logits/rejected": -2.4205574989318848, + "logps/chosen": -309.94329833984375, + "logps/rejected": -309.94329833984375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03702525794506073, + "rewards/margins": 0.0, + "rewards/rejected": -0.03702525794506073, + "step": 1730 + }, + { + "epoch": 0.507917974166241, + "grad_norm": 0.013671875, + "learning_rate": 2.865644539302896e-06, + "logits/chosen": -2.389092206954956, + "logits/rejected": -2.389092206954956, + "logps/chosen": -339.6660461425781, + "logps/rejected": -339.6660461425781, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.029835382476449013, + "rewards/margins": 0.0, + "rewards/rejected": -0.029835382476449013, + "step": 1740 + }, + { + "epoch": 0.5108370429832884, + "grad_norm": 0.01300048828125, + "learning_rate": 2.840416622740617e-06, + "logits/chosen": -2.444392681121826, + "logits/rejected": -2.444392681121826, + "logps/chosen": -318.47296142578125, + "logps/rejected": -318.47296142578125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03087993524968624, + "rewards/margins": 0.0, + "rewards/rejected": -0.03087993524968624, + "step": 1750 + }, + { + "epoch": 0.5137561118003356, + "grad_norm": 0.01263427734375, + "learning_rate": 2.8151533356558673e-06, + "logits/chosen": -2.4179341793060303, + "logits/rejected": -2.4179341793060303, + "logps/chosen": -295.8548889160156, + "logps/rejected": -295.8548889160156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.032246123999357224, + "rewards/margins": 0.0, + "rewards/rejected": -0.032246123999357224, + "step": 1760 + }, + { + "epoch": 0.516675180617383, + "grad_norm": 0.014892578125, + "learning_rate": 2.7898573029958563e-06, + "logits/chosen": -2.377382516860962, + "logits/rejected": -2.377382516860962, + "logps/chosen": -305.41656494140625, + "logps/rejected": -305.41656494140625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03165289759635925, + "rewards/margins": 0.0, + "rewards/rejected": -0.03165289759635925, + "step": 1770 + }, + { + "epoch": 0.5195942494344304, + "grad_norm": 0.0103759765625, + "learning_rate": 2.7645311531101763e-06, + "logits/chosen": -2.412802219390869, + "logits/rejected": -2.412802219390869, + "logps/chosen": -312.50067138671875, + "logps/rejected": -312.50067138671875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.034763775765895844, + "rewards/margins": 0.0, + "rewards/rejected": -0.034763775765895844, + "step": 1780 + }, + { + "epoch": 0.5225133182514777, + "grad_norm": 0.0135498046875, + "learning_rate": 2.7391775174777084e-06, + "logits/chosen": -2.419868230819702, + "logits/rejected": -2.419868230819702, + "logps/chosen": -310.26922607421875, + "logps/rejected": -310.26922607421875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.035530101507902145, + "rewards/margins": 0.0, + "rewards/rejected": -0.035530101507902145, + "step": 1790 + }, + { + "epoch": 0.5254323870685251, + "grad_norm": 0.0167236328125, + "learning_rate": 2.713799030433203e-06, + "logits/chosen": -2.423767566680908, + "logits/rejected": -2.423767566680908, + "logps/chosen": -308.0718688964844, + "logps/rejected": -308.0718688964844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03834807127714157, + "rewards/margins": 0.0, + "rewards/rejected": -0.03834807127714157, + "step": 1800 + }, + { + "epoch": 0.5254323870685251, + "eval_logits/chosen": -2.392709732055664, + "eval_logits/rejected": -2.392709732055664, + "eval_logps/chosen": -310.26434326171875, + "eval_logps/rejected": -310.26434326171875, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.037857454270124435, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.037857454270124435, + "eval_runtime": 2669.2359, + "eval_samples_per_second": 2.282, + "eval_steps_per_second": 0.285, + "step": 1800 + }, + { + "epoch": 0.5283514558855725, + "grad_norm": 0.01373291015625, + "learning_rate": 2.688398328893561e-06, + "logits/chosen": -2.4216887950897217, + "logits/rejected": -2.4216887950897217, + "logps/chosen": -307.491455078125, + "logps/rejected": -307.491455078125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03987189009785652, + "rewards/margins": 0.0, + "rewards/rejected": -0.03987189009785652, + "step": 1810 + }, + { + "epoch": 0.5312705247026198, + "grad_norm": 0.013916015625, + "learning_rate": 2.6629780520838526e-06, + "logits/chosen": -2.389004945755005, + "logits/rejected": -2.389004945755005, + "logps/chosen": -314.912353515625, + "logps/rejected": -314.912353515625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03697946295142174, + "rewards/margins": 0.0, + "rewards/rejected": -0.03697946295142174, + "step": 1820 + }, + { + "epoch": 0.5341895935196672, + "grad_norm": 0.016845703125, + "learning_rate": 2.637540841263088e-06, + "logits/chosen": -2.4251251220703125, + "logits/rejected": -2.4251251220703125, + "logps/chosen": -309.82611083984375, + "logps/rejected": -309.82611083984375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.041506171226501465, + "rewards/margins": 0.0, + "rewards/rejected": -0.041506171226501465, + "step": 1830 + }, + { + "epoch": 0.5371086623367146, + "grad_norm": 0.0130615234375, + "learning_rate": 2.6120893394497825e-06, + "logits/chosen": -2.4095826148986816, + "logits/rejected": -2.4095826148986816, + "logps/chosen": -290.29876708984375, + "logps/rejected": -290.29876708984375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03885159641504288, + "rewards/margins": 0.0, + "rewards/rejected": -0.03885159641504288, + "step": 1840 + }, + { + "epoch": 0.5400277311537619, + "grad_norm": 0.0203857421875, + "learning_rate": 2.586626191147337e-06, + "logits/chosen": -2.414461612701416, + "logits/rejected": -2.414461612701416, + "logps/chosen": -298.74444580078125, + "logps/rejected": -298.74444580078125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.035465486347675323, + "rewards/margins": 0.0, + "rewards/rejected": -0.035465486347675323, + "step": 1850 + }, + { + "epoch": 0.5429467999708093, + "grad_norm": 0.0142822265625, + "learning_rate": 2.5611540420692666e-06, + "logits/chosen": -2.4189705848693848, + "logits/rejected": -2.4189705848693848, + "logps/chosen": -361.6686706542969, + "logps/rejected": -361.6686706542969, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04054202139377594, + "rewards/margins": 0.0, + "rewards/rejected": -0.04054202139377594, + "step": 1860 + }, + { + "epoch": 0.5458658687878567, + "grad_norm": 0.01446533203125, + "learning_rate": 2.5356755388642973e-06, + "logits/chosen": -2.4053876399993896, + "logits/rejected": -2.4053876399993896, + "logps/chosen": -290.9534606933594, + "logps/rejected": -290.9534606933594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.037081241607666016, + "rewards/margins": 0.0, + "rewards/rejected": -0.037081241607666016, + "step": 1870 + }, + { + "epoch": 0.548784937604904, + "grad_norm": 0.01611328125, + "learning_rate": 2.510193328841375e-06, + "logits/chosen": -2.4209909439086914, + "logits/rejected": -2.4209909439086914, + "logps/chosen": -304.0765075683594, + "logps/rejected": -304.0765075683594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03415703400969505, + "rewards/margins": 0.0, + "rewards/rejected": -0.03415703400969505, + "step": 1880 + }, + { + "epoch": 0.5517040064219514, + "grad_norm": 0.0164794921875, + "learning_rate": 2.484710059694594e-06, + "logits/chosen": -2.4459662437438965, + "logits/rejected": -2.4459662437438965, + "logps/chosen": -274.7349548339844, + "logps/rejected": -274.7349548339844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03464942425489426, + "rewards/margins": 0.0, + "rewards/rejected": -0.03464942425489426, + "step": 1890 + }, + { + "epoch": 0.5546230752389988, + "grad_norm": 0.01348876953125, + "learning_rate": 2.4592283792280977e-06, + "logits/chosen": -2.384141206741333, + "logits/rejected": -2.384141206741333, + "logps/chosen": -293.96533203125, + "logps/rejected": -293.96533203125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04034542292356491, + "rewards/margins": 0.0, + "rewards/rejected": -0.04034542292356491, + "step": 1900 + }, + { + "epoch": 0.5546230752389988, + "eval_logits/chosen": -2.3927481174468994, + "eval_logits/rejected": -2.3927481174468994, + "eval_logps/chosen": -310.4163818359375, + "eval_logps/rejected": -310.4163818359375, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.03937768191099167, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.03937768191099167, + "eval_runtime": 2717.9911, + "eval_samples_per_second": 2.241, + "eval_steps_per_second": 0.28, + "step": 1900 + }, + { + "epoch": 0.5575421440560461, + "grad_norm": 0.01123046875, + "learning_rate": 2.433750935080959e-06, + "logits/chosen": -2.438390016555786, + "logits/rejected": -2.438390016555786, + "logps/chosen": -282.78106689453125, + "logps/rejected": -282.78106689453125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.05149908736348152, + "rewards/margins": 0.0, + "rewards/rejected": -0.05149908736348152, + "step": 1910 + }, + { + "epoch": 0.5604612128730935, + "grad_norm": 0.011962890625, + "learning_rate": 2.408280374452083e-06, + "logits/chosen": -2.4534342288970947, + "logits/rejected": -2.4534342288970947, + "logps/chosen": -306.63946533203125, + "logps/rejected": -306.63946533203125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04204695671796799, + "rewards/margins": 0.0, + "rewards/rejected": -0.04204695671796799, + "step": 1920 + }, + { + "epoch": 0.5633802816901409, + "grad_norm": 0.01385498046875, + "learning_rate": 2.3828193438251497e-06, + "logits/chosen": -2.4302496910095215, + "logits/rejected": -2.4302496910095215, + "logps/chosen": -328.1105651855469, + "logps/rejected": -328.1105651855469, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03839876502752304, + "rewards/margins": 0.0, + "rewards/rejected": -0.03839876502752304, + "step": 1930 + }, + { + "epoch": 0.5662993505071882, + "grad_norm": 0.01513671875, + "learning_rate": 2.3573704886936414e-06, + "logits/chosen": -2.4566609859466553, + "logits/rejected": -2.4566609859466553, + "logps/chosen": -314.76910400390625, + "logps/rejected": -314.76910400390625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04071163386106491, + "rewards/margins": 0.0, + "rewards/rejected": -0.04071163386106491, + "step": 1940 + }, + { + "epoch": 0.5692184193242356, + "grad_norm": 0.01397705078125, + "learning_rate": 2.331936453285957e-06, + "logits/chosen": -2.414055109024048, + "logits/rejected": -2.414055109024048, + "logps/chosen": -346.7576904296875, + "logps/rejected": -346.7576904296875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.036326270550489426, + "rewards/margins": 0.0, + "rewards/rejected": -0.036326270550489426, + "step": 1950 + }, + { + "epoch": 0.572137488141283, + "grad_norm": 0.0157470703125, + "learning_rate": 2.3065198802906767e-06, + "logits/chosen": -2.4286112785339355, + "logits/rejected": -2.4286112785339355, + "logps/chosen": -339.60064697265625, + "logps/rejected": -339.60064697265625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04548191279172897, + "rewards/margins": 0.0, + "rewards/rejected": -0.04548191279172897, + "step": 1960 + }, + { + "epoch": 0.5750565569583302, + "grad_norm": 0.01141357421875, + "learning_rate": 2.2811234105819714e-06, + "logits/chosen": -2.4342637062072754, + "logits/rejected": -2.4342637062072754, + "logps/chosen": -314.4915771484375, + "logps/rejected": -314.4915771484375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03697306662797928, + "rewards/margins": 0.0, + "rewards/rejected": -0.03697306662797928, + "step": 1970 + }, + { + "epoch": 0.5779756257753776, + "grad_norm": 0.01495361328125, + "learning_rate": 2.2557496829452056e-06, + "logits/chosen": -2.387324810028076, + "logits/rejected": -2.387324810028076, + "logps/chosen": -349.37835693359375, + "logps/rejected": -349.37835693359375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04250973090529442, + "rewards/margins": 0.0, + "rewards/rejected": -0.04250973090529442, + "step": 1980 + }, + { + "epoch": 0.580894694592425, + "grad_norm": 0.0152587890625, + "learning_rate": 2.230401333802763e-06, + "logits/chosen": -2.412137985229492, + "logits/rejected": -2.412137985229492, + "logps/chosen": -310.9895324707031, + "logps/rejected": -310.9895324707031, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.040226660668849945, + "rewards/margins": 0.0, + "rewards/rejected": -0.040226660668849945, + "step": 1990 + }, + { + "epoch": 0.5838137634094723, + "grad_norm": 0.01483154296875, + "learning_rate": 2.205080996940108e-06, + "logits/chosen": -2.4124810695648193, + "logits/rejected": -2.4124810695648193, + "logps/chosen": -273.5890197753906, + "logps/rejected": -273.5890197753906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04197770729660988, + "rewards/margins": 0.0, + "rewards/rejected": -0.04197770729660988, + "step": 2000 + }, + { + "epoch": 0.5838137634094723, + "eval_logits/chosen": -2.392037868499756, + "eval_logits/rejected": -2.392037868499756, + "eval_logps/chosen": -310.4427185058594, + "eval_logps/rejected": -310.4427185058594, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.039641354233026505, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.039641354233026505, + "eval_runtime": 2711.551, + "eval_samples_per_second": 2.246, + "eval_steps_per_second": 0.281, + "step": 2000 + }, + { + "epoch": 0.5867328322265197, + "grad_norm": 0.01214599609375, + "learning_rate": 2.1797913032321283e-06, + "logits/chosen": -2.420572519302368, + "logits/rejected": -2.420572519302368, + "logps/chosen": -277.4279479980469, + "logps/rejected": -277.4279479980469, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03539072722196579, + "rewards/margins": 0.0, + "rewards/rejected": -0.03539072722196579, + "step": 2010 + }, + { + "epoch": 0.5896519010435671, + "grad_norm": 0.0157470703125, + "learning_rate": 2.1545348803697745e-06, + "logits/chosen": -2.4433321952819824, + "logits/rejected": -2.4433321952819824, + "logps/chosen": -281.5128479003906, + "logps/rejected": -281.5128479003906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.016543438658118248, + "rewards/margins": 0.0, + "rewards/rejected": 0.016543438658118248, + "step": 2020 + }, + { + "epoch": 0.5925709698606144, + "grad_norm": 0.015869140625, + "learning_rate": 2.1293143525870396e-06, + "logits/chosen": -2.435228109359741, + "logits/rejected": -2.435228109359741, + "logps/chosen": -315.1198425292969, + "logps/rejected": -315.1198425292969, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04324204847216606, + "rewards/margins": 0.0, + "rewards/rejected": -0.04324204847216606, + "step": 2030 + }, + { + "epoch": 0.5954900386776618, + "grad_norm": 0.0133056640625, + "learning_rate": 2.1041323403882836e-06, + "logits/chosen": -2.458317995071411, + "logits/rejected": -2.458317995071411, + "logps/chosen": -314.63482666015625, + "logps/rejected": -314.63482666015625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.039002105593681335, + "rewards/margins": 0.0, + "rewards/rejected": -0.039002105593681335, + "step": 2040 + }, + { + "epoch": 0.5984091074947092, + "grad_norm": 0.0164794921875, + "learning_rate": 2.078991460275958e-06, + "logits/chosen": -2.4496326446533203, + "logits/rejected": -2.4496326446533203, + "logps/chosen": -295.86199951171875, + "logps/rejected": -295.86199951171875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03856682404875755, + "rewards/margins": 0.0, + "rewards/rejected": -0.03856682404875755, + "step": 2050 + }, + { + "epoch": 0.6013281763117565, + "grad_norm": 0.01409912109375, + "learning_rate": 2.0538943244787452e-06, + "logits/chosen": -2.440256118774414, + "logits/rejected": -2.440256118774414, + "logps/chosen": -302.68463134765625, + "logps/rejected": -302.68463134765625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.040991030633449554, + "rewards/margins": 0.0, + "rewards/rejected": -0.040991030633449554, + "step": 2060 + }, + { + "epoch": 0.6042472451288039, + "grad_norm": 0.01226806640625, + "learning_rate": 2.0288435406801293e-06, + "logits/chosen": -2.4207422733306885, + "logits/rejected": -2.4207422733306885, + "logps/chosen": -347.23297119140625, + "logps/rejected": -347.23297119140625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03826383873820305, + "rewards/margins": 0.0, + "rewards/rejected": -0.03826383873820305, + "step": 2070 + }, + { + "epoch": 0.6071663139458513, + "grad_norm": 0.01275634765625, + "learning_rate": 2.0038417117474574e-06, + "logits/chosen": -2.4277267456054688, + "logits/rejected": -2.4277267456054688, + "logps/chosen": -314.09674072265625, + "logps/rejected": -314.09674072265625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.05563684552907944, + "rewards/margins": 0.0, + "rewards/rejected": -0.05563684552907944, + "step": 2080 + }, + { + "epoch": 0.6100853827628986, + "grad_norm": 0.01251220703125, + "learning_rate": 1.9788914354614853e-06, + "logits/chosen": -2.4430274963378906, + "logits/rejected": -2.4430274963378906, + "logps/chosen": -280.791015625, + "logps/rejected": -280.791015625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.039162371307611465, + "rewards/margins": 0.0, + "rewards/rejected": -0.039162371307611465, + "step": 2090 + }, + { + "epoch": 0.613004451579946, + "grad_norm": 0.0159912109375, + "learning_rate": 1.9539953042464656e-06, + "logits/chosen": -2.4126973152160645, + "logits/rejected": -2.4126973152160645, + "logps/chosen": -341.8514709472656, + "logps/rejected": -341.8514709472656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04444648697972298, + "rewards/margins": 0.0, + "rewards/rejected": -0.04444648697972298, + "step": 2100 + }, + { + "epoch": 0.613004451579946, + "eval_logits/chosen": -2.390094041824341, + "eval_logits/rejected": -2.390094041824341, + "eval_logps/chosen": -310.71502685546875, + "eval_logps/rejected": -310.71502685546875, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.042364299297332764, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.042364299297332764, + "eval_runtime": 2698.9127, + "eval_samples_per_second": 2.256, + "eval_steps_per_second": 0.282, + "step": 2100 + }, + { + "epoch": 0.6159235203969934, + "grad_norm": 0.0126953125, + "learning_rate": 1.929155904900778e-06, + "logits/chosen": -2.442920207977295, + "logits/rejected": -2.442920207977295, + "logps/chosen": -336.13153076171875, + "logps/rejected": -336.13153076171875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04605261981487274, + "rewards/margins": 0.0, + "rewards/rejected": -0.04605261981487274, + "step": 2110 + }, + { + "epoch": 0.6188425892140407, + "grad_norm": 0.0128173828125, + "learning_rate": 1.9043758183281548e-06, + "logits/chosen": -2.398139476776123, + "logits/rejected": -2.398139476776123, + "logps/chosen": -297.93353271484375, + "logps/rejected": -297.93353271484375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03661734238266945, + "rewards/margins": 0.0, + "rewards/rejected": -0.03661734238266945, + "step": 2120 + }, + { + "epoch": 0.6217616580310881, + "grad_norm": 0.0162353515625, + "learning_rate": 1.8796576192695198e-06, + "logits/chosen": -2.4115586280822754, + "logits/rejected": -2.4115586280822754, + "logps/chosen": -283.5032653808594, + "logps/rejected": -283.5032653808594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.05024952441453934, + "rewards/margins": 0.0, + "rewards/rejected": -0.05024952441453934, + "step": 2130 + }, + { + "epoch": 0.6246807268481355, + "grad_norm": 0.01611328125, + "learning_rate": 1.8550038760354559e-06, + "logits/chosen": -2.4140570163726807, + "logits/rejected": -2.4140570163726807, + "logps/chosen": -328.29241943359375, + "logps/rejected": -328.29241943359375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03783569857478142, + "rewards/margins": 0.0, + "rewards/rejected": -0.03783569857478142, + "step": 2140 + }, + { + "epoch": 0.6275997956651828, + "grad_norm": 0.01470947265625, + "learning_rate": 1.8304171502393542e-06, + "logits/chosen": -2.4498252868652344, + "logits/rejected": -2.4498252868652344, + "logps/chosen": -333.46807861328125, + "logps/rejected": -333.46807861328125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.05009561777114868, + "rewards/margins": 0.0, + "rewards/rejected": -0.05009561777114868, + "step": 2150 + }, + { + "epoch": 0.6305188644822302, + "grad_norm": 0.0198974609375, + "learning_rate": 1.8058999965312484e-06, + "logits/chosen": -2.3965957164764404, + "logits/rejected": -2.3965957164764404, + "logps/chosen": -306.3211669921875, + "logps/rejected": -306.3211669921875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04475449398159981, + "rewards/margins": 0.0, + "rewards/rejected": -0.04475449398159981, + "step": 2160 + }, + { + "epoch": 0.6334379332992776, + "grad_norm": 0.016357421875, + "learning_rate": 1.7814549623323828e-06, + "logits/chosen": -2.400684356689453, + "logits/rejected": -2.400684356689453, + "logps/chosen": -286.625, + "logps/rejected": -286.625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.043368883430957794, + "rewards/margins": 0.0, + "rewards/rejected": -0.043368883430957794, + "step": 2170 + }, + { + "epoch": 0.6363570021163248, + "grad_norm": 0.01531982421875, + "learning_rate": 1.7570845875705205e-06, + "logits/chosen": -2.4366753101348877, + "logits/rejected": -2.4366753101348877, + "logps/chosen": -338.27679443359375, + "logps/rejected": -338.27679443359375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.05562018230557442, + "rewards/margins": 0.0, + "rewards/rejected": -0.05562018230557442, + "step": 2180 + }, + { + "epoch": 0.6392760709333722, + "grad_norm": 0.0162353515625, + "learning_rate": 1.7327914044160388e-06, + "logits/chosen": -2.449612617492676, + "logits/rejected": -2.449612617492676, + "logps/chosen": -316.91766357421875, + "logps/rejected": -316.91766357421875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04444233328104019, + "rewards/margins": 0.0, + "rewards/rejected": -0.04444233328104019, + "step": 2190 + }, + { + "epoch": 0.6421951397504196, + "grad_norm": 0.0145263671875, + "learning_rate": 1.7085779370188276e-06, + "logits/chosen": -2.3980746269226074, + "logits/rejected": -2.3980746269226074, + "logps/chosen": -308.85906982421875, + "logps/rejected": -308.85906982421875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0481327660381794, + "rewards/margins": 0.0, + "rewards/rejected": -0.0481327660381794, + "step": 2200 + }, + { + "epoch": 0.6421951397504196, + "eval_logits/chosen": -2.3910679817199707, + "eval_logits/rejected": -2.3910679817199707, + "eval_logps/chosen": -311.0310363769531, + "eval_logps/rejected": -311.0310363769531, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.04552413523197174, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.04552413523197174, + "eval_runtime": 2707.3295, + "eval_samples_per_second": 2.249, + "eval_steps_per_second": 0.281, + "step": 2200 + }, + { + "epoch": 0.6451142085674669, + "grad_norm": 0.016845703125, + "learning_rate": 1.6844467012460193e-06, + "logits/chosen": -2.429086446762085, + "logits/rejected": -2.429086446762085, + "logps/chosen": -306.8155822753906, + "logps/rejected": -306.8155822753906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04282836988568306, + "rewards/margins": 0.0, + "rewards/rejected": -0.04282836988568306, + "step": 2210 + }, + { + "epoch": 0.6480332773845143, + "grad_norm": 0.014404296875, + "learning_rate": 1.6604002044205825e-06, + "logits/chosen": -2.4325811862945557, + "logits/rejected": -2.4325811862945557, + "logps/chosen": -337.0578308105469, + "logps/rejected": -337.0578308105469, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04457175359129906, + "rewards/margins": 0.0, + "rewards/rejected": -0.04457175359129906, + "step": 2220 + }, + { + "epoch": 0.6509523462015617, + "grad_norm": 0.01397705078125, + "learning_rate": 1.6364409450608018e-06, + "logits/chosen": -2.4428985118865967, + "logits/rejected": -2.4428985118865967, + "logps/chosen": -308.55657958984375, + "logps/rejected": -308.55657958984375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04731472209095955, + "rewards/margins": 0.0, + "rewards/rejected": -0.04731472209095955, + "step": 2230 + }, + { + "epoch": 0.653871415018609, + "grad_norm": 0.013427734375, + "learning_rate": 1.6125714126206736e-06, + "logits/chosen": -2.4196009635925293, + "logits/rejected": -2.4196009635925293, + "logps/chosen": -348.8056335449219, + "logps/rejected": -348.8056335449219, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.049455009400844574, + "rewards/margins": 0.0, + "rewards/rejected": -0.049455009400844574, + "step": 2240 + }, + { + "epoch": 0.6567904838356564, + "grad_norm": 0.01556396484375, + "learning_rate": 1.5887940872312391e-06, + "logits/chosen": -2.4100897312164307, + "logits/rejected": -2.4100897312164307, + "logps/chosen": -320.3233642578125, + "logps/rejected": -320.3233642578125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04676957428455353, + "rewards/margins": 0.0, + "rewards/rejected": -0.04676957428455353, + "step": 2250 + }, + { + "epoch": 0.6597095526527038, + "grad_norm": 0.0147705078125, + "learning_rate": 1.5651114394428955e-06, + "logits/chosen": -2.4624266624450684, + "logits/rejected": -2.4624266624450684, + "logps/chosen": -344.6718444824219, + "logps/rejected": -344.6718444824219, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0538489893078804, + "rewards/margins": 0.0, + "rewards/rejected": -0.0538489893078804, + "step": 2260 + }, + { + "epoch": 0.6626286214697511, + "grad_norm": 0.01251220703125, + "learning_rate": 1.5415259299686903e-06, + "logits/chosen": -2.4147191047668457, + "logits/rejected": -2.4147191047668457, + "logps/chosen": -316.6529235839844, + "logps/rejected": -316.6529235839844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.041484713554382324, + "rewards/margins": 0.0, + "rewards/rejected": -0.041484713554382324, + "step": 2270 + }, + { + "epoch": 0.6655476902867985, + "grad_norm": 0.01348876953125, + "learning_rate": 1.5180400094286496e-06, + "logits/chosen": -2.440053939819336, + "logits/rejected": -2.440053939819336, + "logps/chosen": -309.5370178222656, + "logps/rejected": -309.5370178222656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04570756107568741, + "rewards/margins": 0.0, + "rewards/rejected": -0.04570756107568741, + "step": 2280 + }, + { + "epoch": 0.6684667591038459, + "grad_norm": 0.017822265625, + "learning_rate": 1.494656118095149e-06, + "logits/chosen": -2.407764434814453, + "logits/rejected": -2.407764434814453, + "logps/chosen": -320.51263427734375, + "logps/rejected": -320.51263427734375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04706931859254837, + "rewards/margins": 0.0, + "rewards/rejected": -0.04706931859254837, + "step": 2290 + }, + { + "epoch": 0.6713858279208932, + "grad_norm": 0.0120849609375, + "learning_rate": 1.4713766856393557e-06, + "logits/chosen": -2.420919895172119, + "logits/rejected": -2.420919895172119, + "logps/chosen": -295.04547119140625, + "logps/rejected": -295.04547119140625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.05071335285902023, + "rewards/margins": 0.0, + "rewards/rejected": -0.05071335285902023, + "step": 2300 + }, + { + "epoch": 0.6713858279208932, + "eval_logits/chosen": -2.391244411468506, + "eval_logits/rejected": -2.391244411468506, + "eval_logps/chosen": -310.7880554199219, + "eval_logps/rejected": -310.7880554199219, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.04309455305337906, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.04309455305337906, + "eval_runtime": 2670.058, + "eval_samples_per_second": 2.281, + "eval_steps_per_second": 0.285, + "step": 2300 + }, + { + "epoch": 0.6743048967379406, + "grad_norm": 0.0198974609375, + "learning_rate": 1.448204130878785e-06, + "logits/chosen": -2.3968968391418457, + "logits/rejected": -2.3968968391418457, + "logps/chosen": -287.2406005859375, + "logps/rejected": -287.2406005859375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04974811524152756, + "rewards/margins": 0.0, + "rewards/rejected": -0.04974811524152756, + "step": 2310 + }, + { + "epoch": 0.677223965554988, + "grad_norm": 0.013916015625, + "learning_rate": 1.425140861525967e-06, + "logits/chosen": -2.407982587814331, + "logits/rejected": -2.407982587814331, + "logps/chosen": -346.8302307128906, + "logps/rejected": -346.8302307128906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.045040082186460495, + "rewards/margins": 0.0, + "rewards/rejected": -0.045040082186460495, + "step": 2320 + }, + { + "epoch": 0.6801430343720353, + "grad_norm": 0.01531982421875, + "learning_rate": 1.4021892739382853e-06, + "logits/chosen": -2.4366557598114014, + "logits/rejected": -2.4366557598114014, + "logps/chosen": -315.5507507324219, + "logps/rejected": -315.5507507324219, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.053034014999866486, + "rewards/margins": 0.0, + "rewards/rejected": -0.053034014999866486, + "step": 2330 + }, + { + "epoch": 0.6830621031890827, + "grad_norm": 0.013916015625, + "learning_rate": 1.3793517528689804e-06, + "logits/chosen": -2.40993070602417, + "logits/rejected": -2.40993070602417, + "logps/chosen": -322.5754699707031, + "logps/rejected": -322.5754699707031, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04859765246510506, + "rewards/margins": 0.0, + "rewards/rejected": -0.04859765246510506, + "step": 2340 + }, + { + "epoch": 0.6859811720061301, + "grad_norm": 0.0167236328125, + "learning_rate": 1.3566306712193704e-06, + "logits/chosen": -2.4204134941101074, + "logits/rejected": -2.4204134941101074, + "logps/chosen": -349.4993896484375, + "logps/rejected": -349.4993896484375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.05103806406259537, + "rewards/margins": 0.0, + "rewards/rejected": -0.05103806406259537, + "step": 2350 + }, + { + "epoch": 0.6889002408231774, + "grad_norm": 0.01531982421875, + "learning_rate": 1.3340283897922911e-06, + "logits/chosen": -2.4295237064361572, + "logits/rejected": -2.4295237064361572, + "logps/chosen": -330.99005126953125, + "logps/rejected": -330.99005126953125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04879484325647354, + "rewards/margins": 0.0, + "rewards/rejected": -0.04879484325647354, + "step": 2360 + }, + { + "epoch": 0.6918193096402248, + "grad_norm": 0.0146484375, + "learning_rate": 1.3115472570468058e-06, + "logits/chosen": -2.4285712242126465, + "logits/rejected": -2.4285712242126465, + "logps/chosen": -336.67364501953125, + "logps/rejected": -336.67364501953125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04440216347575188, + "rewards/margins": 0.0, + "rewards/rejected": -0.04440216347575188, + "step": 2370 + }, + { + "epoch": 0.6947383784572722, + "grad_norm": 0.0162353515625, + "learning_rate": 1.2891896088541928e-06, + "logits/chosen": -2.405956745147705, + "logits/rejected": -2.405956745147705, + "logps/chosen": -338.88739013671875, + "logps/rejected": -338.88739013671875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.047105275094509125, + "rewards/margins": 0.0, + "rewards/rejected": -0.047105275094509125, + "step": 2380 + }, + { + "epoch": 0.6976574472743194, + "grad_norm": 0.0169677734375, + "learning_rate": 1.266957768255232e-06, + "logits/chosen": -2.422194719314575, + "logits/rejected": -2.422194719314575, + "logps/chosen": -318.286865234375, + "logps/rejected": -318.286865234375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04666005074977875, + "rewards/margins": 0.0, + "rewards/rejected": -0.04666005074977875, + "step": 2390 + }, + { + "epoch": 0.7005765160913668, + "grad_norm": 0.0142822265625, + "learning_rate": 1.2448540452188432e-06, + "logits/chosen": -2.3955206871032715, + "logits/rejected": -2.3955206871032715, + "logps/chosen": -314.3586120605469, + "logps/rejected": -314.3586120605469, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04044215753674507, + "rewards/margins": 0.0, + "rewards/rejected": -0.04044215753674507, + "step": 2400 + }, + { + "epoch": 0.7005765160913668, + "eval_logits/chosen": -2.3899266719818115, + "eval_logits/rejected": -2.3899266719818115, + "eval_logps/chosen": -310.6455078125, + "eval_logps/rejected": -310.6455078125, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.04166920483112335, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.04166920483112335, + "eval_runtime": 2668.7744, + "eval_samples_per_second": 2.282, + "eval_steps_per_second": 0.286, + "step": 2400 + }, + { + "epoch": 0.7034955849084142, + "grad_norm": 0.01446533203125, + "learning_rate": 1.2228807364020617e-06, + "logits/chosen": -2.4090027809143066, + "logits/rejected": -2.4090027809143066, + "logps/chosen": -268.48944091796875, + "logps/rejected": -268.48944091796875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.037643421441316605, + "rewards/margins": 0.0, + "rewards/rejected": -0.037643421441316605, + "step": 2410 + }, + { + "epoch": 0.7064146537254615, + "grad_norm": 0.012451171875, + "learning_rate": 1.2010401249114166e-06, + "logits/chosen": -2.4060184955596924, + "logits/rejected": -2.4060184955596924, + "logps/chosen": -338.2677001953125, + "logps/rejected": -338.2677001953125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.035085879266262054, + "rewards/margins": 0.0, + "rewards/rejected": -0.035085879266262054, + "step": 2420 + }, + { + "epoch": 0.7093337225425089, + "grad_norm": 0.0206298828125, + "learning_rate": 1.1793344800656995e-06, + "logits/chosen": -2.3857572078704834, + "logits/rejected": -2.3857572078704834, + "logps/chosen": -325.4837646484375, + "logps/rejected": -325.4837646484375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03704181686043739, + "rewards/margins": 0.0, + "rewards/rejected": -0.03704181686043739, + "step": 2430 + }, + { + "epoch": 0.7122527913595563, + "grad_norm": 0.01544189453125, + "learning_rate": 1.1577660571601796e-06, + "logits/chosen": -2.396127223968506, + "logits/rejected": -2.396127223968506, + "logps/chosen": -321.38897705078125, + "logps/rejected": -321.38897705078125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0452260822057724, + "rewards/margins": 0.0, + "rewards/rejected": -0.0452260822057724, + "step": 2440 + }, + { + "epoch": 0.7151718601766036, + "grad_norm": 0.0137939453125, + "learning_rate": 1.1363370972322694e-06, + "logits/chosen": -2.4177489280700684, + "logits/rejected": -2.4177489280700684, + "logps/chosen": -296.6512756347656, + "logps/rejected": -296.6512756347656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04763947054743767, + "rewards/margins": 0.0, + "rewards/rejected": -0.04763947054743767, + "step": 2450 + }, + { + "epoch": 0.718090928993651, + "grad_norm": 0.0142822265625, + "learning_rate": 1.115049826828669e-06, + "logits/chosen": -2.4321625232696533, + "logits/rejected": -2.4321625232696533, + "logps/chosen": -306.14141845703125, + "logps/rejected": -306.14141845703125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04333222657442093, + "rewards/margins": 0.0, + "rewards/rejected": -0.04333222657442093, + "step": 2460 + }, + { + "epoch": 0.7210099978106984, + "grad_norm": 0.01483154296875, + "learning_rate": 1.0939064577740266e-06, + "logits/chosen": -2.4054694175720215, + "logits/rejected": -2.4054694175720215, + "logps/chosen": -301.36334228515625, + "logps/rejected": -301.36334228515625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.041240572929382324, + "rewards/margins": 0.0, + "rewards/rejected": -0.041240572929382324, + "step": 2470 + }, + { + "epoch": 0.7239290666277457, + "grad_norm": 0.0159912109375, + "learning_rate": 1.0729091869411137e-06, + "logits/chosen": -2.4020252227783203, + "logits/rejected": -2.4020252227783203, + "logps/chosen": -332.1387023925781, + "logps/rejected": -332.1387023925781, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.043921731412410736, + "rewards/margins": 0.0, + "rewards/rejected": -0.043921731412410736, + "step": 2480 + }, + { + "epoch": 0.7268481354447931, + "grad_norm": 0.013427734375, + "learning_rate": 1.0520601960225708e-06, + "logits/chosen": -2.421534299850464, + "logits/rejected": -2.421534299850464, + "logps/chosen": -314.00311279296875, + "logps/rejected": -314.00311279296875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.01183334831148386, + "rewards/margins": 0.0, + "rewards/rejected": -0.01183334831148386, + "step": 2490 + }, + { + "epoch": 0.7297672042618405, + "grad_norm": 0.020751953125, + "learning_rate": 1.0313616513042133e-06, + "logits/chosen": -2.4747350215911865, + "logits/rejected": -2.4747350215911865, + "logps/chosen": -319.47918701171875, + "logps/rejected": -319.47918701171875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.055976878851652145, + "rewards/margins": 0.0, + "rewards/rejected": -0.055976878851652145, + "step": 2500 + }, + { + "epoch": 0.7297672042618405, + "eval_logits/chosen": -2.3914709091186523, + "eval_logits/rejected": -2.3914709091186523, + "eval_logps/chosen": -310.819580078125, + "eval_logps/rejected": -310.819580078125, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.04341000318527222, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.04341000318527222, + "eval_runtime": 2669.3967, + "eval_samples_per_second": 2.281, + "eval_steps_per_second": 0.285, + "step": 2500 + }, + { + "epoch": 0.7326862730788878, + "grad_norm": 0.0145263671875, + "learning_rate": 1.0108157034399532e-06, + "logits/chosen": -2.4052977561950684, + "logits/rejected": -2.4052977561950684, + "logps/chosen": -298.67474365234375, + "logps/rejected": -298.67474365234375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04686864838004112, + "rewards/margins": 0.0, + "rewards/rejected": -0.04686864838004112, + "step": 2510 + }, + { + "epoch": 0.7356053418959352, + "grad_norm": 0.0179443359375, + "learning_rate": 9.90424487228334e-07, + "logits/chosen": -2.411712646484375, + "logits/rejected": -2.411712646484375, + "logps/chosen": -322.70428466796875, + "logps/rejected": -322.70428466796875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.043312918394804, + "rewards/margins": 0.0, + "rewards/rejected": -0.043312918394804, + "step": 2520 + }, + { + "epoch": 0.7385244107129826, + "grad_norm": 0.01611328125, + "learning_rate": 9.701901213907192e-07, + "logits/chosen": -2.4330382347106934, + "logits/rejected": -2.4330382347106934, + "logps/chosen": -324.5224609375, + "logps/rejected": -324.5224609375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.05447854846715927, + "rewards/margins": 0.0, + "rewards/rejected": -0.05447854846715927, + "step": 2530 + }, + { + "epoch": 0.7414434795300299, + "grad_norm": 0.01416015625, + "learning_rate": 9.501147083511511e-07, + "logits/chosen": -2.45332407951355, + "logits/rejected": -2.45332407951355, + "logps/chosen": -321.7140808105469, + "logps/rejected": -321.7140808105469, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0516216978430748, + "rewards/margins": 0.0, + "rewards/rejected": -0.0516216978430748, + "step": 2540 + }, + { + "epoch": 0.7443625483470773, + "grad_norm": 0.015625, + "learning_rate": 9.302003340178962e-07, + "logits/chosen": -2.417236804962158, + "logits/rejected": -2.417236804962158, + "logps/chosen": -333.95574951171875, + "logps/rejected": -333.95574951171875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0465201810002327, + "rewards/margins": 0.0, + "rewards/rejected": -0.0465201810002327, + "step": 2550 + }, + { + "epoch": 0.7472816171641247, + "grad_norm": 0.01422119140625, + "learning_rate": 9.10449067566718e-07, + "logits/chosen": -2.459394931793213, + "logits/rejected": -2.459394931793213, + "logps/chosen": -303.9725646972656, + "logps/rejected": -303.9725646972656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04736841470003128, + "rewards/margins": 0.0, + "rewards/rejected": -0.04736841470003128, + "step": 2560 + }, + { + "epoch": 0.750200685981172, + "grad_norm": 0.01513671875, + "learning_rate": 8.908629612258765e-07, + "logits/chosen": -2.435121774673462, + "logits/rejected": -2.435121774673462, + "logps/chosen": -300.51055908203125, + "logps/rejected": -300.51055908203125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04969844967126846, + "rewards/margins": 0.0, + "rewards/rejected": -0.04969844967126846, + "step": 2570 + }, + { + "epoch": 0.7531197547982194, + "grad_norm": 0.0145263671875, + "learning_rate": 8.714440500628999e-07, + "logits/chosen": -2.393557071685791, + "logits/rejected": -2.393557071685791, + "logps/chosen": -305.946044921875, + "logps/rejected": -305.946044921875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.035433102399110794, + "rewards/margins": 0.0, + "rewards/rejected": -0.035433102399110794, + "step": 2580 + }, + { + "epoch": 0.7560388236152668, + "grad_norm": 0.01385498046875, + "learning_rate": 8.521943517731276e-07, + "logits/chosen": -2.394944667816162, + "logits/rejected": -2.394944667816162, + "logps/chosen": -329.5417175292969, + "logps/rejected": -329.5417175292969, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.029879886656999588, + "rewards/margins": 0.0, + "rewards/rejected": -0.029879886656999588, + "step": 2590 + }, + { + "epoch": 0.758957892432314, + "grad_norm": 0.01513671875, + "learning_rate": 8.33115866470069e-07, + "logits/chosen": -2.3986093997955322, + "logits/rejected": -2.3986093997955322, + "logps/chosen": -297.0606994628906, + "logps/rejected": -297.0606994628906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04173046723008156, + "rewards/margins": 0.0, + "rewards/rejected": -0.04173046723008156, + "step": 2600 + }, + { + "epoch": 0.758957892432314, + "eval_logits/chosen": -2.3918919563293457, + "eval_logits/rejected": -2.3918919563293457, + "eval_logps/chosen": -310.8546447753906, + "eval_logps/rejected": -310.8546447753906, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.0437602661550045, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.0437602661550045, + "eval_runtime": 2682.4018, + "eval_samples_per_second": 2.27, + "eval_steps_per_second": 0.284, + "step": 2600 + }, + { + "epoch": 0.7618769612493614, + "grad_norm": 0.01544189453125, + "learning_rate": 8.142105764775824e-07, + "logits/chosen": -2.384005546569824, + "logits/rejected": -2.384005546569824, + "logps/chosen": -327.1615295410156, + "logps/rejected": -327.1615295410156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.051810719072818756, + "rewards/margins": 0.0, + "rewards/rejected": -0.051810719072818756, + "step": 2610 + }, + { + "epoch": 0.7647960300664088, + "grad_norm": 0.01458740234375, + "learning_rate": 7.954804461239054e-07, + "logits/chosen": -2.444282054901123, + "logits/rejected": -2.444282054901123, + "logps/chosen": -314.5889587402344, + "logps/rejected": -314.5889587402344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04727676510810852, + "rewards/margins": 0.0, + "rewards/rejected": -0.04727676510810852, + "step": 2620 + }, + { + "epoch": 0.7677150988834561, + "grad_norm": 0.016357421875, + "learning_rate": 7.769274215375544e-07, + "logits/chosen": -2.432978391647339, + "logits/rejected": -2.432978391647339, + "logps/chosen": -293.0484924316406, + "logps/rejected": -293.0484924316406, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0413900688290596, + "rewards/margins": 0.0, + "rewards/rejected": -0.0413900688290596, + "step": 2630 + }, + { + "epoch": 0.7706341677005035, + "grad_norm": 0.01446533203125, + "learning_rate": 7.585534304451103e-07, + "logits/chosen": -2.444913387298584, + "logits/rejected": -2.444913387298584, + "logps/chosen": -330.8976135253906, + "logps/rejected": -330.8976135253906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.043237775564193726, + "rewards/margins": 0.0, + "rewards/rejected": -0.043237775564193726, + "step": 2640 + }, + { + "epoch": 0.7735532365175509, + "grad_norm": 0.01312255859375, + "learning_rate": 7.403603819709288e-07, + "logits/chosen": -2.4194247722625732, + "logits/rejected": -2.4194247722625732, + "logps/chosen": -302.08465576171875, + "logps/rejected": -302.08465576171875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04694979637861252, + "rewards/margins": 0.0, + "rewards/rejected": -0.04694979637861252, + "step": 2650 + }, + { + "epoch": 0.7764723053345982, + "grad_norm": 0.014404296875, + "learning_rate": 7.223501664387664e-07, + "logits/chosen": -2.440764904022217, + "logits/rejected": -2.440764904022217, + "logps/chosen": -280.7825622558594, + "logps/rejected": -280.7825622558594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.036083877086639404, + "rewards/margins": 0.0, + "rewards/rejected": -0.036083877086639404, + "step": 2660 + }, + { + "epoch": 0.7793913741516456, + "grad_norm": 0.01458740234375, + "learning_rate": 7.045246551753779e-07, + "logits/chosen": -2.4197888374328613, + "logits/rejected": -2.4197888374328613, + "logps/chosen": -323.67938232421875, + "logps/rejected": -323.67938232421875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.043979812413454056, + "rewards/margins": 0.0, + "rewards/rejected": -0.043979812413454056, + "step": 2670 + }, + { + "epoch": 0.782310442968693, + "grad_norm": 0.0142822265625, + "learning_rate": 6.868857003160709e-07, + "logits/chosen": -2.470567226409912, + "logits/rejected": -2.470567226409912, + "logps/chosen": -356.6578369140625, + "logps/rejected": -356.6578369140625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.05309978872537613, + "rewards/margins": 0.0, + "rewards/rejected": -0.05309978872537613, + "step": 2680 + }, + { + "epoch": 0.7852295117857403, + "grad_norm": 0.0150146484375, + "learning_rate": 6.69435134612266e-07, + "logits/chosen": -2.4125561714172363, + "logits/rejected": -2.4125561714172363, + "logps/chosen": -302.1919250488281, + "logps/rejected": -302.1919250488281, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04638701677322388, + "rewards/margins": 0.0, + "rewards/rejected": -0.04638701677322388, + "step": 2690 + }, + { + "epoch": 0.7881485806027877, + "grad_norm": 0.013427734375, + "learning_rate": 6.521747712410687e-07, + "logits/chosen": -2.431802988052368, + "logits/rejected": -2.431802988052368, + "logps/chosen": -319.6323547363281, + "logps/rejected": -319.6323547363281, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04552530124783516, + "rewards/margins": 0.0, + "rewards/rejected": -0.04552530124783516, + "step": 2700 + }, + { + "epoch": 0.7881485806027877, + "eval_logits/chosen": -2.3916165828704834, + "eval_logits/rejected": -2.3916165828704834, + "eval_logps/chosen": -310.8406677246094, + "eval_logps/rejected": -310.8406677246094, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.04362065717577934, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.04362065717577934, + "eval_runtime": 2682.1268, + "eval_samples_per_second": 2.271, + "eval_steps_per_second": 0.284, + "step": 2700 + }, + { + "epoch": 0.7910676494198351, + "grad_norm": 0.0250244140625, + "learning_rate": 6.351064036168708e-07, + "logits/chosen": -2.4238877296447754, + "logits/rejected": -2.4238877296447754, + "logps/chosen": -338.21759033203125, + "logps/rejected": -338.21759033203125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.048554692417383194, + "rewards/margins": 0.0, + "rewards/rejected": -0.048554692417383194, + "step": 2710 + }, + { + "epoch": 0.7939867182368824, + "grad_norm": 0.01416015625, + "learning_rate": 6.182318052050102e-07, + "logits/chosen": -2.398974895477295, + "logits/rejected": -2.398974895477295, + "logps/chosen": -329.53106689453125, + "logps/rejected": -329.53106689453125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.05249527841806412, + "rewards/margins": 0.0, + "rewards/rejected": -0.05249527841806412, + "step": 2720 + }, + { + "epoch": 0.7969057870539298, + "grad_norm": 0.019287109375, + "learning_rate": 6.015527293374979e-07, + "logits/chosen": -2.4338581562042236, + "logits/rejected": -2.4338581562042236, + "logps/chosen": -334.1202087402344, + "logps/rejected": -334.1202087402344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04841463267803192, + "rewards/margins": 0.0, + "rewards/rejected": -0.04841463267803192, + "step": 2730 + }, + { + "epoch": 0.7998248558709772, + "grad_norm": 0.014404296875, + "learning_rate": 5.850709090308459e-07, + "logits/chosen": -2.4255330562591553, + "logits/rejected": -2.4255330562591553, + "logps/chosen": -295.30523681640625, + "logps/rejected": -295.30523681640625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0433431938290596, + "rewards/margins": 0.0, + "rewards/rejected": -0.0433431938290596, + "step": 2740 + }, + { + "epoch": 0.8027439246880245, + "grad_norm": 0.0133056640625, + "learning_rate": 5.687880568059961e-07, + "logits/chosen": -2.3997416496276855, + "logits/rejected": -2.3997416496276855, + "logps/chosen": -314.76361083984375, + "logps/rejected": -314.76361083984375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04872073233127594, + "rewards/margins": 0.0, + "rewards/rejected": -0.04872073233127594, + "step": 2750 + }, + { + "epoch": 0.8056629935050719, + "grad_norm": 0.01422119140625, + "learning_rate": 5.527058645103842e-07, + "logits/chosen": -2.3996376991271973, + "logits/rejected": -2.3996376991271973, + "logps/chosen": -376.6802673339844, + "logps/rejected": -376.6802673339844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0522351935505867, + "rewards/margins": 0.0, + "rewards/rejected": -0.0522351935505867, + "step": 2760 + }, + { + "epoch": 0.8085820623221193, + "grad_norm": 0.0159912109375, + "learning_rate": 5.368260031421526e-07, + "logits/chosen": -2.4533755779266357, + "logits/rejected": -2.4533755779266357, + "logps/chosen": -338.7648010253906, + "logps/rejected": -338.7648010253906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04105439782142639, + "rewards/margins": 0.0, + "rewards/rejected": -0.04105439782142639, + "step": 2770 + }, + { + "epoch": 0.8115011311391666, + "grad_norm": 0.01263427734375, + "learning_rate": 5.211501226765242e-07, + "logits/chosen": -2.43373441696167, + "logits/rejected": -2.43373441696167, + "logps/chosen": -285.7012023925781, + "logps/rejected": -285.7012023925781, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.02375207468867302, + "rewards/margins": 0.0, + "rewards/rejected": -0.02375207468867302, + "step": 2780 + }, + { + "epoch": 0.814420199956214, + "grad_norm": 0.0184326171875, + "learning_rate": 5.056798518943678e-07, + "logits/chosen": -2.4133718013763428, + "logits/rejected": -2.4133718013763428, + "logps/chosen": -315.09210205078125, + "logps/rejected": -315.09210205078125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.053018081933259964, + "rewards/margins": 0.0, + "rewards/rejected": -0.053018081933259964, + "step": 2790 + }, + { + "epoch": 0.8173392687732614, + "grad_norm": 0.01397705078125, + "learning_rate": 4.904167982129591e-07, + "logits/chosen": -2.423839569091797, + "logits/rejected": -2.423839569091797, + "logps/chosen": -294.44683837890625, + "logps/rejected": -294.44683837890625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04831403121352196, + "rewards/margins": 0.0, + "rewards/rejected": -0.04831403121352196, + "step": 2800 + }, + { + "epoch": 0.8173392687732614, + "eval_logits/chosen": -2.3914895057678223, + "eval_logits/rejected": -2.3914895057678223, + "eval_logps/chosen": -310.798095703125, + "eval_logps/rejected": -310.798095703125, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.04319505766034126, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.04319505766034126, + "eval_runtime": 2682.0962, + "eval_samples_per_second": 2.271, + "eval_steps_per_second": 0.284, + "step": 2800 + }, + { + "epoch": 0.8202583375903086, + "grad_norm": 0.0126953125, + "learning_rate": 4.7536254751896493e-07, + "logits/chosen": -2.4333229064941406, + "logits/rejected": -2.4333229064941406, + "logps/chosen": -315.96234130859375, + "logps/rejected": -315.96234130859375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.05122748017311096, + "rewards/margins": 0.0, + "rewards/rejected": -0.05122748017311096, + "step": 2810 + }, + { + "epoch": 0.823177406407356, + "grad_norm": 0.0167236328125, + "learning_rate": 4.6051866400366354e-07, + "logits/chosen": -2.4289793968200684, + "logits/rejected": -2.4289793968200684, + "logps/chosen": -344.29608154296875, + "logps/rejected": -344.29608154296875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04764155298471451, + "rewards/margins": 0.0, + "rewards/rejected": -0.04764155298471451, + "step": 2820 + }, + { + "epoch": 0.8260964752244034, + "grad_norm": 0.0166015625, + "learning_rate": 4.4588669000042133e-07, + "logits/chosen": -2.4046084880828857, + "logits/rejected": -2.4046084880828857, + "logps/chosen": -325.74957275390625, + "logps/rejected": -325.74957275390625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.035486068576574326, + "rewards/margins": 0.0, + "rewards/rejected": -0.035486068576574326, + "step": 2830 + }, + { + "epoch": 0.8290155440414507, + "grad_norm": 0.016845703125, + "learning_rate": 4.3146814582443605e-07, + "logits/chosen": -2.418729066848755, + "logits/rejected": -2.418729066848755, + "logps/chosen": -327.8818359375, + "logps/rejected": -327.8818359375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.046850480139255524, + "rewards/margins": 0.0, + "rewards/rejected": -0.046850480139255524, + "step": 2840 + }, + { + "epoch": 0.8319346128584981, + "grad_norm": 0.0135498046875, + "learning_rate": 4.1726452961477147e-07, + "logits/chosen": -2.416329860687256, + "logits/rejected": -2.416329860687256, + "logps/chosen": -319.5370178222656, + "logps/rejected": -319.5370178222656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0506584569811821, + "rewards/margins": 0.0, + "rewards/rejected": -0.0506584569811821, + "step": 2850 + }, + { + "epoch": 0.8348536816755455, + "grad_norm": 0.0146484375, + "learning_rate": 4.0327731717869775e-07, + "logits/chosen": -2.4376559257507324, + "logits/rejected": -2.4376559257507324, + "logps/chosen": -272.7819519042969, + "logps/rejected": -272.7819519042969, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.05310596153140068, + "rewards/margins": 0.0, + "rewards/rejected": -0.05310596153140068, + "step": 2860 + }, + { + "epoch": 0.8377727504925928, + "grad_norm": 0.0191650390625, + "learning_rate": 3.8950796183834516e-07, + "logits/chosen": -2.4388468265533447, + "logits/rejected": -2.4388468265533447, + "logps/chosen": -345.3861389160156, + "logps/rejected": -345.3861389160156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.05019260570406914, + "rewards/margins": 0.0, + "rewards/rejected": -0.05019260570406914, + "step": 2870 + }, + { + "epoch": 0.8406918193096402, + "grad_norm": 0.01495361328125, + "learning_rate": 3.759578942797029e-07, + "logits/chosen": -2.4550201892852783, + "logits/rejected": -2.4550201892852783, + "logps/chosen": -306.2907409667969, + "logps/rejected": -306.2907409667969, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.046652454882860184, + "rewards/margins": 0.0, + "rewards/rejected": -0.046652454882860184, + "step": 2880 + }, + { + "epoch": 0.8436108881266876, + "grad_norm": 0.0126953125, + "learning_rate": 3.6262852240396356e-07, + "logits/chosen": -2.446690082550049, + "logits/rejected": -2.446690082550049, + "logps/chosen": -310.69708251953125, + "logps/rejected": -310.69708251953125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04568660259246826, + "rewards/margins": 0.0, + "rewards/rejected": -0.04568660259246826, + "step": 2890 + }, + { + "epoch": 0.8465299569437349, + "grad_norm": 0.014404296875, + "learning_rate": 3.4952123118123735e-07, + "logits/chosen": -2.402627468109131, + "logits/rejected": -2.402627468109131, + "logps/chosen": -312.1624755859375, + "logps/rejected": -312.1624755859375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.045014895498752594, + "rewards/margins": 0.0, + "rewards/rejected": -0.045014895498752594, + "step": 2900 + }, + { + "epoch": 0.8465299569437349, + "eval_logits/chosen": -2.391954183578491, + "eval_logits/rejected": -2.391954183578491, + "eval_logps/chosen": -310.79425048828125, + "eval_logps/rejected": -310.79425048828125, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.043156567960977554, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.043156567960977554, + "eval_runtime": 2682.3759, + "eval_samples_per_second": 2.27, + "eval_steps_per_second": 0.284, + "step": 2900 + }, + { + "epoch": 0.8494490257607823, + "grad_norm": 0.01470947265625, + "learning_rate": 3.3663738250664853e-07, + "logits/chosen": -2.416839122772217, + "logits/rejected": -2.416839122772217, + "logps/chosen": -342.91815185546875, + "logps/rejected": -342.91815185546875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.051371246576309204, + "rewards/margins": 0.0, + "rewards/rejected": -0.051371246576309204, + "step": 2910 + }, + { + "epoch": 0.8523680945778297, + "grad_norm": 0.0159912109375, + "learning_rate": 3.239783150588283e-07, + "logits/chosen": -2.3476662635803223, + "logits/rejected": -2.3476662635803223, + "logps/chosen": -304.71368408203125, + "logps/rejected": -304.71368408203125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.005339882802218199, + "rewards/margins": 0.0, + "rewards/rejected": 0.005339882802218199, + "step": 2920 + }, + { + "epoch": 0.855287163394877, + "grad_norm": 0.01409912109375, + "learning_rate": 3.1154534416082573e-07, + "logits/chosen": -2.416965961456299, + "logits/rejected": -2.416965961456299, + "logps/chosen": -299.3199157714844, + "logps/rejected": -299.3199157714844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04180007427930832, + "rewards/margins": 0.0, + "rewards/rejected": -0.04180007427930832, + "step": 2930 + }, + { + "epoch": 0.8582062322119244, + "grad_norm": 0.01055908203125, + "learning_rate": 2.9933976164343514e-07, + "logits/chosen": -2.4285387992858887, + "logits/rejected": -2.4285387992858887, + "logps/chosen": -303.32183837890625, + "logps/rejected": -303.32183837890625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04410778731107712, + "rewards/margins": 0.0, + "rewards/rejected": -0.04410778731107712, + "step": 2940 + }, + { + "epoch": 0.8611253010289718, + "grad_norm": 0.0162353515625, + "learning_rate": 2.873628357109745e-07, + "logits/chosen": -2.4083211421966553, + "logits/rejected": -2.4083211421966553, + "logps/chosen": -326.7142028808594, + "logps/rejected": -326.7142028808594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.046554580330848694, + "rewards/margins": 0.0, + "rewards/rejected": -0.046554580330848694, + "step": 2950 + }, + { + "epoch": 0.8640443698460191, + "grad_norm": 0.01324462890625, + "learning_rate": 2.7561581080951195e-07, + "logits/chosen": -2.4226157665252686, + "logits/rejected": -2.4226157665252686, + "logps/chosen": -292.55767822265625, + "logps/rejected": -292.55767822265625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04246639460325241, + "rewards/margins": 0.0, + "rewards/rejected": -0.04246639460325241, + "step": 2960 + }, + { + "epoch": 0.8669634386630665, + "grad_norm": 0.01361083984375, + "learning_rate": 2.640999074975645e-07, + "logits/chosen": -2.43457293510437, + "logits/rejected": -2.43457293510437, + "logps/chosen": -298.2882385253906, + "logps/rejected": -298.2882385253906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04169774800539017, + "rewards/margins": 0.0, + "rewards/rejected": -0.04169774800539017, + "step": 2970 + }, + { + "epoch": 0.8698825074801139, + "grad_norm": 0.01708984375, + "learning_rate": 2.5281632231927786e-07, + "logits/chosen": -2.473017930984497, + "logits/rejected": -2.473017930984497, + "logps/chosen": -307.8494567871094, + "logps/rejected": -307.8494567871094, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.045904386788606644, + "rewards/margins": 0.0, + "rewards/rejected": -0.045904386788606644, + "step": 2980 + }, + { + "epoch": 0.8728015762971612, + "grad_norm": 0.014404296875, + "learning_rate": 2.417662276800997e-07, + "logits/chosen": -2.4377925395965576, + "logits/rejected": -2.4377925395965576, + "logps/chosen": -329.8043518066406, + "logps/rejected": -329.8043518066406, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.052566416561603546, + "rewards/margins": 0.0, + "rewards/rejected": -0.052566416561603546, + "step": 2990 + }, + { + "epoch": 0.8757206451142086, + "grad_norm": 0.01226806640625, + "learning_rate": 2.30950771724964e-07, + "logits/chosen": -2.4452061653137207, + "logits/rejected": -2.4452061653137207, + "logps/chosen": -316.7723388671875, + "logps/rejected": -316.7723388671875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.048340607434511185, + "rewards/margins": 0.0, + "rewards/rejected": -0.048340607434511185, + "step": 3000 + }, + { + "epoch": 0.8757206451142086, + "eval_logits/chosen": -2.3918232917785645, + "eval_logits/rejected": -2.3918232917785645, + "eval_logps/chosen": -310.78662109375, + "eval_logps/rejected": -310.78662109375, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.04308019578456879, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.04308019578456879, + "eval_runtime": 2681.8561, + "eval_samples_per_second": 2.271, + "eval_steps_per_second": 0.284, + "step": 3000 + }, + { + "epoch": 0.878639713931256, + "grad_norm": 0.017822265625, + "learning_rate": 2.2037107821899272e-07, + "logits/chosen": -2.414727210998535, + "logits/rejected": -2.414727210998535, + "logps/chosen": -343.22796630859375, + "logps/rejected": -343.22796630859375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.047485075891017914, + "rewards/margins": 0.0, + "rewards/rejected": -0.047485075891017914, + "step": 3010 + }, + { + "epoch": 0.8815587827483032, + "grad_norm": 0.01708984375, + "learning_rate": 2.100282464307357e-07, + "logits/chosen": -2.4386258125305176, + "logits/rejected": -2.4386258125305176, + "logps/chosen": -305.25250244140625, + "logps/rejected": -305.25250244140625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04185379669070244, + "rewards/margins": 0.0, + "rewards/rejected": -0.04185379669070244, + "step": 3020 + }, + { + "epoch": 0.8844778515653506, + "grad_norm": 0.016357421875, + "learning_rate": 1.999233510179488e-07, + "logits/chosen": -2.4112370014190674, + "logits/rejected": -2.4112370014190674, + "logps/chosen": -339.65093994140625, + "logps/rejected": -339.65093994140625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.044059764593839645, + "rewards/margins": 0.0, + "rewards/rejected": -0.044059764593839645, + "step": 3030 + }, + { + "epoch": 0.887396920382398, + "grad_norm": 0.012939453125, + "learning_rate": 1.9005744191593678e-07, + "logits/chosen": -2.4179887771606445, + "logits/rejected": -2.4179887771606445, + "logps/chosen": -297.5303649902344, + "logps/rejected": -297.5303649902344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03384246677160263, + "rewards/margins": 0.0, + "rewards/rejected": -0.03384246677160263, + "step": 3040 + }, + { + "epoch": 0.8903159891994453, + "grad_norm": 0.0120849609375, + "learning_rate": 1.8043154422845794e-07, + "logits/chosen": -2.4646730422973633, + "logits/rejected": -2.4646730422973633, + "logps/chosen": -295.91790771484375, + "logps/rejected": -295.91790771484375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04882458597421646, + "rewards/margins": 0.0, + "rewards/rejected": -0.04882458597421646, + "step": 3050 + }, + { + "epoch": 0.8932350580164927, + "grad_norm": 0.0186767578125, + "learning_rate": 1.7104665812121445e-07, + "logits/chosen": -2.423285961151123, + "logits/rejected": -2.423285961151123, + "logps/chosen": -297.9593505859375, + "logps/rejected": -297.9593505859375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04218859225511551, + "rewards/margins": 0.0, + "rewards/rejected": -0.04218859225511551, + "step": 3060 + }, + { + "epoch": 0.8961541268335401, + "grad_norm": 0.0164794921875, + "learning_rate": 1.619037587179309e-07, + "logits/chosen": -2.3985249996185303, + "logits/rejected": -2.3985249996185303, + "logps/chosen": -332.85809326171875, + "logps/rejected": -332.85809326171875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.048540227115154266, + "rewards/margins": 0.0, + "rewards/rejected": -0.048540227115154266, + "step": 3070 + }, + { + "epoch": 0.8990731956505874, + "grad_norm": 0.0172119140625, + "learning_rate": 1.5300379599903408e-07, + "logits/chosen": -2.4070308208465576, + "logits/rejected": -2.4070308208465576, + "logps/chosen": -310.7314147949219, + "logps/rejected": -310.7314147949219, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03895800933241844, + "rewards/margins": 0.0, + "rewards/rejected": -0.03895800933241844, + "step": 3080 + }, + { + "epoch": 0.9019922644676348, + "grad_norm": 0.013671875, + "learning_rate": 1.44347694702949e-07, + "logits/chosen": -2.3916313648223877, + "logits/rejected": -2.3916313648223877, + "logps/chosen": -288.28106689453125, + "logps/rejected": -288.28106689453125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03293871134519577, + "rewards/margins": 0.0, + "rewards/rejected": -0.03293871134519577, + "step": 3090 + }, + { + "epoch": 0.9049113332846822, + "grad_norm": 0.017822265625, + "learning_rate": 1.359363542300124e-07, + "logits/chosen": -2.4147801399230957, + "logits/rejected": -2.4147801399230957, + "logps/chosen": -295.56768798828125, + "logps/rejected": -295.56768798828125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04364749416708946, + "rewards/margins": 0.0, + "rewards/rejected": -0.04364749416708946, + "step": 3100 + }, + { + "epoch": 0.9049113332846822, + "eval_logits/chosen": -2.390821933746338, + "eval_logits/rejected": -2.390821933746338, + "eval_logps/chosen": -310.7793884277344, + "eval_logps/rejected": -310.7793884277344, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.04300786182284355, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.04300786182284355, + "eval_runtime": 2681.9583, + "eval_samples_per_second": 2.271, + "eval_steps_per_second": 0.284, + "step": 3100 + }, + { + "epoch": 0.9078304021017295, + "grad_norm": 0.0152587890625, + "learning_rate": 1.2777064854902487e-07, + "logits/chosen": -2.44869065284729, + "logits/rejected": -2.44869065284729, + "logps/chosen": -324.82257080078125, + "logps/rejected": -324.82257080078125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04247719421982765, + "rewards/margins": 0.0, + "rewards/rejected": -0.04247719421982765, + "step": 3110 + }, + { + "epoch": 0.9107494709187769, + "grad_norm": 0.023681640625, + "learning_rate": 1.1985142610643902e-07, + "logits/chosen": -2.4080257415771484, + "logits/rejected": -2.4080257415771484, + "logps/chosen": -321.1974792480469, + "logps/rejected": -321.1974792480469, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.05160089209675789, + "rewards/margins": 0.0, + "rewards/rejected": -0.05160089209675789, + "step": 3120 + }, + { + "epoch": 0.9136685397358243, + "grad_norm": 0.01275634765625, + "learning_rate": 1.121795097382064e-07, + "logits/chosen": -2.422560691833496, + "logits/rejected": -2.422560691833496, + "logps/chosen": -335.0086975097656, + "logps/rejected": -335.0086975097656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04736005887389183, + "rewards/margins": 0.0, + "rewards/rejected": -0.04736005887389183, + "step": 3130 + }, + { + "epoch": 0.9165876085528716, + "grad_norm": 0.0169677734375, + "learning_rate": 1.0475569658427803e-07, + "logits/chosen": -2.438781261444092, + "logits/rejected": -2.438781261444092, + "logps/chosen": -311.33868408203125, + "logps/rejected": -311.33868408203125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03844516724348068, + "rewards/margins": 0.0, + "rewards/rejected": -0.03844516724348068, + "step": 3140 + }, + { + "epoch": 0.919506677369919, + "grad_norm": 0.02001953125, + "learning_rate": 9.758075800578193e-08, + "logits/chosen": -2.4374260902404785, + "logits/rejected": -2.4374260902404785, + "logps/chosen": -300.9288635253906, + "logps/rejected": -300.9288635253906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0434752032160759, + "rewards/margins": 0.0, + "rewards/rejected": -0.0434752032160759, + "step": 3150 + }, + { + "epoch": 0.9224257461869664, + "grad_norm": 0.01544189453125, + "learning_rate": 9.06554395048742e-08, + "logits/chosen": -2.4104561805725098, + "logits/rejected": -2.4104561805725098, + "logps/chosen": -310.27789306640625, + "logps/rejected": -310.27789306640625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04003220796585083, + "rewards/margins": 0.0, + "rewards/rejected": -0.04003220796585083, + "step": 3160 + }, + { + "epoch": 0.9253448150040137, + "grad_norm": 0.01416015625, + "learning_rate": 8.398046064727855e-08, + "logits/chosen": -2.448122262954712, + "logits/rejected": -2.448122262954712, + "logps/chosen": -303.9940185546875, + "logps/rejected": -303.9940185546875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04497329518198967, + "rewards/margins": 0.0, + "rewards/rejected": -0.04497329518198967, + "step": 3170 + }, + { + "epoch": 0.9282638838210611, + "grad_norm": 0.0140380859375, + "learning_rate": 7.755651498752265e-08, + "logits/chosen": -2.4395852088928223, + "logits/rejected": -2.4395852088928223, + "logps/chosen": -292.140380859375, + "logps/rejected": -292.140380859375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04522908851504326, + "rewards/margins": 0.0, + "rewards/rejected": -0.04522908851504326, + "step": 3180 + }, + { + "epoch": 0.9311829526381085, + "grad_norm": 0.016357421875, + "learning_rate": 7.138426999687171e-08, + "logits/chosen": -2.4227964878082275, + "logits/rejected": -2.4227964878082275, + "logps/chosen": -333.205810546875, + "logps/rejected": -333.205810546875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04331531375646591, + "rewards/margins": 0.0, + "rewards/rejected": -0.04331531375646591, + "step": 3190 + }, + { + "epoch": 0.9341020214551558, + "grad_norm": 0.0177001953125, + "learning_rate": 6.546436699398029e-08, + "logits/chosen": -2.4100470542907715, + "logits/rejected": -2.4100470542907715, + "logps/chosen": -334.2508850097656, + "logps/rejected": -334.2508850097656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.043238040059804916, + "rewards/margins": 0.0, + "rewards/rejected": -0.043238040059804916, + "step": 3200 + }, + { + "epoch": 0.9341020214551558, + "eval_logits/chosen": -2.391075849533081, + "eval_logits/rejected": -2.391075849533081, + "eval_logps/chosen": -310.7811584472656, + "eval_logps/rejected": -310.7811584472656, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.043025679886341095, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.043025679886341095, + "eval_runtime": 2682.606, + "eval_samples_per_second": 2.27, + "eval_steps_per_second": 0.284, + "step": 3200 + }, + { + "epoch": 0.9370210902722031, + "grad_norm": 0.0150146484375, + "learning_rate": 5.979742107825287e-08, + "logits/chosen": -2.3894600868225098, + "logits/rejected": -2.3894600868225098, + "logps/chosen": -313.91131591796875, + "logps/rejected": -313.91131591796875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04388252645730972, + "rewards/margins": 0.0, + "rewards/rejected": -0.04388252645730972, + "step": 3210 + }, + { + "epoch": 0.9399401590892505, + "grad_norm": 0.01446533203125, + "learning_rate": 5.4384021065936045e-08, + "logits/chosen": -2.408024549484253, + "logits/rejected": -2.408024549484253, + "logps/chosen": -288.5419006347656, + "logps/rejected": -288.5419006347656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0435100793838501, + "rewards/margins": 0.0, + "rewards/rejected": -0.0435100793838501, + "step": 3220 + }, + { + "epoch": 0.9428592279062978, + "grad_norm": 0.033447265625, + "learning_rate": 4.9224729428935806e-08, + "logits/chosen": -2.423318862915039, + "logits/rejected": -2.423318862915039, + "logps/chosen": -309.74176025390625, + "logps/rejected": -309.74176025390625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04652589559555054, + "rewards/margins": 0.0, + "rewards/rejected": -0.04652589559555054, + "step": 3230 + }, + { + "epoch": 0.9457782967233452, + "grad_norm": 0.011962890625, + "learning_rate": 4.432008223637596e-08, + "logits/chosen": -2.4209766387939453, + "logits/rejected": -2.4209766387939453, + "logps/chosen": -299.3330078125, + "logps/rejected": -299.3330078125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04555036872625351, + "rewards/margins": 0.0, + "rewards/rejected": -0.04555036872625351, + "step": 3240 + }, + { + "epoch": 0.9486973655403926, + "grad_norm": 0.01953125, + "learning_rate": 3.967058909889937e-08, + "logits/chosen": -2.397352457046509, + "logits/rejected": -2.397352457046509, + "logps/chosen": -313.8124694824219, + "logps/rejected": -313.8124694824219, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03916158154606819, + "rewards/margins": 0.0, + "rewards/rejected": -0.03916158154606819, + "step": 3250 + }, + { + "epoch": 0.9516164343574399, + "grad_norm": 0.014404296875, + "learning_rate": 3.5276733115715556e-08, + "logits/chosen": -2.448172092437744, + "logits/rejected": -2.448172092437744, + "logps/chosen": -305.734130859375, + "logps/rejected": -305.734130859375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.050101179629564285, + "rewards/margins": 0.0, + "rewards/rejected": -0.050101179629564285, + "step": 3260 + }, + { + "epoch": 0.9545355031744873, + "grad_norm": 0.01397705078125, + "learning_rate": 3.11389708244067e-08, + "logits/chosen": -2.4387991428375244, + "logits/rejected": -2.4387991428375244, + "logps/chosen": -325.77374267578125, + "logps/rejected": -325.77374267578125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04632676765322685, + "rewards/margins": 0.0, + "rewards/rejected": -0.04632676765322685, + "step": 3270 + }, + { + "epoch": 0.9574545719915347, + "grad_norm": 0.0130615234375, + "learning_rate": 2.7257732153490313e-08, + "logits/chosen": -2.3997585773468018, + "logits/rejected": -2.3997585773468018, + "logps/chosen": -323.36962890625, + "logps/rejected": -323.36962890625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04246259480714798, + "rewards/margins": 0.0, + "rewards/rejected": -0.04246259480714798, + "step": 3280 + }, + { + "epoch": 0.960373640808582, + "grad_norm": 0.01226806640625, + "learning_rate": 2.3633420377749684e-08, + "logits/chosen": -2.404913902282715, + "logits/rejected": -2.404913902282715, + "logps/chosen": -309.89715576171875, + "logps/rejected": -309.89715576171875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.05698453634977341, + "rewards/margins": 0.0, + "rewards/rejected": -0.05698453634977341, + "step": 3290 + }, + { + "epoch": 0.9632927096256294, + "grad_norm": 0.013671875, + "learning_rate": 2.0266412076330457e-08, + "logits/chosen": -2.431570529937744, + "logits/rejected": -2.431570529937744, + "logps/chosen": -297.8599548339844, + "logps/rejected": -297.8599548339844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.052738986909389496, + "rewards/margins": 0.0, + "rewards/rejected": -0.052738986909389496, + "step": 3300 + }, + { + "epoch": 0.9632927096256294, + "eval_logits/chosen": -2.3914639949798584, + "eval_logits/rejected": -2.3914639949798584, + "eval_logps/chosen": -310.7767333984375, + "eval_logps/rejected": -310.7767333984375, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.0429811105132103, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.0429811105132103, + "eval_runtime": 2682.6978, + "eval_samples_per_second": 2.27, + "eval_steps_per_second": 0.284, + "step": 3300 + }, + { + "epoch": 0.9662117784426768, + "grad_norm": 0.01361083984375, + "learning_rate": 1.7157057093614704e-08, + "logits/chosen": -2.452519178390503, + "logits/rejected": -2.452519178390503, + "logps/chosen": -296.8190002441406, + "logps/rejected": -296.8190002441406, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.041969865560531616, + "rewards/margins": 0.0, + "rewards/rejected": -0.041969865560531616, + "step": 3310 + }, + { + "epoch": 0.9691308472597241, + "grad_norm": 0.01422119140625, + "learning_rate": 1.430567850286807e-08, + "logits/chosen": -2.4390811920166016, + "logits/rejected": -2.4390811920166016, + "logps/chosen": -339.14801025390625, + "logps/rejected": -339.14801025390625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0495796874165535, + "rewards/margins": 0.0, + "rewards/rejected": -0.0495796874165535, + "step": 3320 + }, + { + "epoch": 0.9720499160767715, + "grad_norm": 0.017333984375, + "learning_rate": 1.1712572572674386e-08, + "logits/chosen": -2.3779425621032715, + "logits/rejected": -2.3779425621032715, + "logps/chosen": -342.52117919921875, + "logps/rejected": -342.52117919921875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03777734562754631, + "rewards/margins": 0.0, + "rewards/rejected": -0.03777734562754631, + "step": 3330 + }, + { + "epoch": 0.9749689848938189, + "grad_norm": 0.0166015625, + "learning_rate": 9.378008736149746e-09, + "logits/chosen": -2.408357620239258, + "logits/rejected": -2.408357620239258, + "logps/chosen": -321.5648193359375, + "logps/rejected": -321.5648193359375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03947510942816734, + "rewards/margins": 0.0, + "rewards/rejected": -0.03947510942816734, + "step": 3340 + }, + { + "epoch": 0.9778880537108662, + "grad_norm": 0.01275634765625, + "learning_rate": 7.30222956294907e-09, + "logits/chosen": -2.456228733062744, + "logits/rejected": -2.456228733062744, + "logps/chosen": -322.9805603027344, + "logps/rejected": -322.9805603027344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04836040362715721, + "rewards/margins": 0.0, + "rewards/rejected": -0.04836040362715721, + "step": 3350 + }, + { + "epoch": 0.9808071225279136, + "grad_norm": 0.015380859375, + "learning_rate": 5.485450734061259e-09, + "logits/chosen": -2.395473003387451, + "logits/rejected": -2.395473003387451, + "logps/chosen": -292.87994384765625, + "logps/rejected": -292.87994384765625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0445740707218647, + "rewards/margins": 0.0, + "rewards/rejected": -0.0445740707218647, + "step": 3360 + }, + { + "epoch": 0.983726191344961, + "grad_norm": 0.01544189453125, + "learning_rate": 3.927861019399903e-09, + "logits/chosen": -2.406294345855713, + "logits/rejected": -2.406294345855713, + "logps/chosen": -288.55987548828125, + "logps/rejected": -288.55987548828125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03885042294859886, + "rewards/margins": 0.0, + "rewards/rejected": -0.03885042294859886, + "step": 3370 + }, + { + "epoch": 0.9866452601620083, + "grad_norm": 0.0155029296875, + "learning_rate": 2.629622258188691e-09, + "logits/chosen": -2.4149577617645264, + "logits/rejected": -2.4149577617645264, + "logps/chosen": -282.56585693359375, + "logps/rejected": -282.56585693359375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0403311550617218, + "rewards/margins": 0.0, + "rewards/rejected": -0.0403311550617218, + "step": 3380 + }, + { + "epoch": 0.9895643289790557, + "grad_norm": 0.01544189453125, + "learning_rate": 1.5908693421465282e-09, + "logits/chosen": -2.4097559452056885, + "logits/rejected": -2.4097559452056885, + "logps/chosen": -284.1174011230469, + "logps/rejected": -284.1174011230469, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04213656857609749, + "rewards/margins": 0.0, + "rewards/rejected": -0.04213656857609749, + "step": 3390 + }, + { + "epoch": 0.9924833977961031, + "grad_norm": 0.0137939453125, + "learning_rate": 8.11710201470417e-10, + "logits/chosen": -2.4348714351654053, + "logits/rejected": -2.4348714351654053, + "logps/chosen": -325.41339111328125, + "logps/rejected": -325.41339111328125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04575268179178238, + "rewards/margins": 0.0, + "rewards/rejected": -0.04575268179178238, + "step": 3400 + }, + { + "epoch": 0.9924833977961031, + "eval_logits/chosen": -2.3908708095550537, + "eval_logits/rejected": -2.3908708095550537, + "eval_logps/chosen": -310.7832336425781, + "eval_logps/rejected": -310.7832336425781, + "eval_loss": 0.6931472420692444, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": -0.04304642230272293, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": -0.04304642230272293, + "eval_runtime": 2747.5083, + "eval_samples_per_second": 2.217, + "eval_steps_per_second": 0.277, + "step": 3400 + }, + { + "epoch": 0.9954024666131503, + "grad_norm": 0.01165771484375, + "learning_rate": 2.922257936230355e-10, + "logits/chosen": -2.409545421600342, + "logits/rejected": -2.409545421600342, + "logps/chosen": -264.7913818359375, + "logps/rejected": -264.7913818359375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.038275159895420074, + "rewards/margins": 0.0, + "rewards/rejected": -0.038275159895420074, + "step": 3410 + }, + { + "epoch": 0.9983215354301977, + "grad_norm": 0.0167236328125, + "learning_rate": 3.247009491946784e-11, + "logits/chosen": -2.429719924926758, + "logits/rejected": -2.429719924926758, + "logps/chosen": -340.23126220703125, + "logps/rejected": -340.23126220703125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04289505258202553, + "rewards/margins": 0.0, + "rewards/rejected": -0.04289505258202553, + "step": 3420 + }, + { + "epoch": 0.9997810698387214, + "step": 3425, + "total_flos": 0.0, + "train_loss": 0.18720042388804636, + "train_runtime": 41876.0871, + "train_samples_per_second": 1.309, + "train_steps_per_second": 0.082 + } + ], + "logging_steps": 10, + "max_steps": 3425, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}