{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1274, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007849293563579278, "grad_norm": 7.6070190011438665, "learning_rate": 3.90625e-09, "logits/chosen": 5881.4375, "logits/rejected": 2834.66162109375, "logps/chosen": -257.5969543457031, "logps/rejected": -120.09489440917969, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.007849293563579277, "grad_norm": 8.292103607244062, "learning_rate": 3.9062499999999997e-08, "logits/chosen": 4946.02587890625, "logits/rejected": 4332.443359375, "logps/chosen": -248.808837890625, "logps/rejected": -214.24110412597656, "loss": 0.6932, "rewards/accuracies": 0.3888889253139496, "rewards/chosen": 0.007352272514253855, "rewards/margins": -0.04363558441400528, "rewards/rejected": 0.0509878545999527, "step": 10 }, { "epoch": 0.015698587127158554, "grad_norm": 7.004971570853845, "learning_rate": 7.812499999999999e-08, "logits/chosen": 6023.1796875, "logits/rejected": 4842.9091796875, "logps/chosen": -266.2599792480469, "logps/rejected": -232.38601684570312, "loss": 0.6933, "rewards/accuracies": 0.5, "rewards/chosen": 0.05332508683204651, "rewards/margins": -0.012739470228552818, "rewards/rejected": 0.06606455147266388, "step": 20 }, { "epoch": 0.023547880690737835, "grad_norm": 6.884207614473328, "learning_rate": 1.1718749999999999e-07, "logits/chosen": 6032.62890625, "logits/rejected": 5038.45556640625, "logps/chosen": -310.3996887207031, "logps/rejected": -262.9111328125, "loss": 0.693, "rewards/accuracies": 0.5750000476837158, "rewards/chosen": 0.20968368649482727, "rewards/margins": 0.08424334973096848, "rewards/rejected": 0.1254403293132782, "step": 30 }, { "epoch": 0.03139717425431711, "grad_norm": 7.259392888061773, "learning_rate": 1.5624999999999999e-07, "logits/chosen": 5241.93896484375, "logits/rejected": 4344.1708984375, "logps/chosen": -263.4868469238281, "logps/rejected": -224.7186737060547, "loss": 0.6923, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5333760976791382, "rewards/margins": 0.18804652988910675, "rewards/rejected": 0.3453296422958374, "step": 40 }, { "epoch": 0.03924646781789639, "grad_norm": 7.738968697847914, "learning_rate": 1.9531249999999998e-07, "logits/chosen": 6364.2314453125, "logits/rejected": 4975.1728515625, "logps/chosen": -324.3128967285156, "logps/rejected": -257.80108642578125, "loss": 0.6913, "rewards/accuracies": 0.6916666626930237, "rewards/chosen": 1.7967116832733154, "rewards/margins": 1.0479546785354614, "rewards/rejected": 0.748757004737854, "step": 50 }, { "epoch": 0.04709576138147567, "grad_norm": 7.908366760507787, "learning_rate": 2.3437499999999998e-07, "logits/chosen": 5382.3125, "logits/rejected": 4503.8955078125, "logps/chosen": -268.0205383300781, "logps/rejected": -261.6384582519531, "loss": 0.6894, "rewards/accuracies": 0.6416667103767395, "rewards/chosen": 1.9065296649932861, "rewards/margins": 1.3541371822357178, "rewards/rejected": 0.5523924827575684, "step": 60 }, { "epoch": 0.054945054945054944, "grad_norm": 6.1363185745056095, "learning_rate": 2.734375e-07, "logits/chosen": 5104.6328125, "logits/rejected": 4838.6162109375, "logps/chosen": -231.5017547607422, "logps/rejected": -233.1387481689453, "loss": 0.6876, "rewards/accuracies": 0.6583333611488342, "rewards/chosen": 1.7562404870986938, "rewards/margins": 2.241947650909424, "rewards/rejected": -0.48570698499679565, "step": 70 }, { "epoch": 0.06279434850863422, "grad_norm": 6.313648303908843, "learning_rate": 3.1249999999999997e-07, "logits/chosen": 5691.92822265625, "logits/rejected": 5161.9208984375, "logps/chosen": -256.8810119628906, "logps/rejected": -246.78268432617188, "loss": 0.6837, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.9025794863700867, "rewards/margins": 3.5677428245544434, "rewards/rejected": -4.470322608947754, "step": 80 }, { "epoch": 0.0706436420722135, "grad_norm": 7.17648924651081, "learning_rate": 3.5156249999999997e-07, "logits/chosen": 5875.908203125, "logits/rejected": 5055.4775390625, "logps/chosen": -287.22882080078125, "logps/rejected": -253.146728515625, "loss": 0.6781, "rewards/accuracies": 0.7416667342185974, "rewards/chosen": -2.037750720977783, "rewards/margins": 7.642518043518066, "rewards/rejected": -9.680269241333008, "step": 90 }, { "epoch": 0.07849293563579278, "grad_norm": 7.7941596557126624, "learning_rate": 3.9062499999999997e-07, "logits/chosen": 5771.35400390625, "logits/rejected": 5554.82470703125, "logps/chosen": -283.1836853027344, "logps/rejected": -282.7068176269531, "loss": 0.6766, "rewards/accuracies": 0.6916667222976685, "rewards/chosen": -4.9085798263549805, "rewards/margins": 9.60995864868164, "rewards/rejected": -14.518537521362305, "step": 100 }, { "epoch": 0.08634222919937205, "grad_norm": 9.345408351017173, "learning_rate": 4.2968749999999996e-07, "logits/chosen": 6213.28173828125, "logits/rejected": 5103.55517578125, "logps/chosen": -261.7968444824219, "logps/rejected": -261.9189453125, "loss": 0.6654, "rewards/accuracies": 0.7083333730697632, "rewards/chosen": -4.658997535705566, "rewards/margins": 13.647018432617188, "rewards/rejected": -18.306015014648438, "step": 110 }, { "epoch": 0.09419152276295134, "grad_norm": 10.145203775900786, "learning_rate": 4.6874999999999996e-07, "logits/chosen": 6057.29052734375, "logits/rejected": 5066.5625, "logps/chosen": -305.77459716796875, "logps/rejected": -314.111572265625, "loss": 0.6665, "rewards/accuracies": 0.7166666984558105, "rewards/chosen": -26.187755584716797, "rewards/margins": 18.321544647216797, "rewards/rejected": -44.50930404663086, "step": 120 }, { "epoch": 0.10204081632653061, "grad_norm": 7.094150089546659, "learning_rate": 4.999962424962166e-07, "logits/chosen": 6036.177734375, "logits/rejected": 5572.85009765625, "logps/chosen": -299.9012145996094, "logps/rejected": -295.4285583496094, "loss": 0.6674, "rewards/accuracies": 0.658333420753479, "rewards/chosen": -9.793970108032227, "rewards/margins": 12.203898429870605, "rewards/rejected": -21.997867584228516, "step": 130 }, { "epoch": 0.10989010989010989, "grad_norm": 8.740739358429577, "learning_rate": 4.998647417232375e-07, "logits/chosen": 5906.3203125, "logits/rejected": 5174.52783203125, "logps/chosen": -266.81195068359375, "logps/rejected": -270.7115173339844, "loss": 0.6712, "rewards/accuracies": 0.6416667103767395, "rewards/chosen": -6.569715976715088, "rewards/margins": 13.785995483398438, "rewards/rejected": -20.355710983276367, "step": 140 }, { "epoch": 0.11773940345368916, "grad_norm": 15.789592358858716, "learning_rate": 4.995454786965036e-07, "logits/chosen": 6020.12939453125, "logits/rejected": 5010.9521484375, "logps/chosen": -284.8111267089844, "logps/rejected": -268.1068420410156, "loss": 0.6656, "rewards/accuracies": 0.6750000715255737, "rewards/chosen": -6.939129829406738, "rewards/margins": 19.278301239013672, "rewards/rejected": -26.21742820739746, "step": 150 }, { "epoch": 0.12558869701726844, "grad_norm": 9.090539317617692, "learning_rate": 4.990386933279972e-07, "logits/chosen": 5993.25341796875, "logits/rejected": 5322.8603515625, "logps/chosen": -292.59307861328125, "logps/rejected": -305.57769775390625, "loss": 0.6599, "rewards/accuracies": 0.7416666746139526, "rewards/chosen": -19.02887535095215, "rewards/margins": 20.415613174438477, "rewards/rejected": -39.444488525390625, "step": 160 }, { "epoch": 0.13343799058084774, "grad_norm": 13.372920007721838, "learning_rate": 4.983447664444096e-07, "logits/chosen": 6258.017578125, "logits/rejected": 5565.6845703125, "logps/chosen": -315.3527526855469, "logps/rejected": -318.45050048828125, "loss": 0.6544, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": -25.55523681640625, "rewards/margins": 23.88918113708496, "rewards/rejected": -49.444419860839844, "step": 170 }, { "epoch": 0.141287284144427, "grad_norm": 14.27741481958782, "learning_rate": 4.97464219500968e-07, "logits/chosen": 5531.6142578125, "logits/rejected": 4802.4638671875, "logps/chosen": -295.49737548828125, "logps/rejected": -306.994384765625, "loss": 0.6564, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -32.053043365478516, "rewards/margins": 27.674352645874023, "rewards/rejected": -59.72739791870117, "step": 180 }, { "epoch": 0.14913657770800628, "grad_norm": 14.09329925384393, "learning_rate": 4.963977141895843e-07, "logits/chosen": 5590.63818359375, "logits/rejected": 4784.70068359375, "logps/chosen": -334.12109375, "logps/rejected": -354.5445556640625, "loss": 0.6353, "rewards/accuracies": 0.7416667342185974, "rewards/chosen": -46.83644485473633, "rewards/margins": 46.24668502807617, "rewards/rejected": -93.0831298828125, "step": 190 }, { "epoch": 0.15698587127158556, "grad_norm": 11.55780859392514, "learning_rate": 4.951460519416227e-07, "logits/chosen": 5547.5302734375, "logits/rejected": 5110.81005859375, "logps/chosen": -287.798095703125, "logps/rejected": -339.49029541015625, "loss": 0.6461, "rewards/accuracies": 0.7083333730697632, "rewards/chosen": -36.548126220703125, "rewards/margins": 39.468544006347656, "rewards/rejected": -76.01667785644531, "step": 200 }, { "epoch": 0.16483516483516483, "grad_norm": 21.82214651560815, "learning_rate": 4.937101733256606e-07, "logits/chosen": 5047.630859375, "logits/rejected": 4442.10498046875, "logps/chosen": -271.02423095703125, "logps/rejected": -302.889404296875, "loss": 0.6477, "rewards/accuracies": 0.625, "rewards/chosen": -46.31133270263672, "rewards/margins": 34.11729431152344, "rewards/rejected": -80.42862701416016, "step": 210 }, { "epoch": 0.1726844583987441, "grad_norm": 29.099460861389012, "learning_rate": 4.920911573406924e-07, "logits/chosen": 6142.0341796875, "logits/rejected": 5178.005859375, "logps/chosen": -333.2041320800781, "logps/rejected": -327.04351806640625, "loss": 0.6432, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -50.571189880371094, "rewards/margins": 42.93184280395508, "rewards/rejected": -93.50303649902344, "step": 220 }, { "epoch": 0.18053375196232338, "grad_norm": 11.33998439372873, "learning_rate": 4.902902206053098e-07, "logits/chosen": 5614.0341796875, "logits/rejected": 5065.51123046875, "logps/chosen": -323.0210266113281, "logps/rejected": -348.66387939453125, "loss": 0.645, "rewards/accuracies": 0.6583333611488342, "rewards/chosen": -52.74376678466797, "rewards/margins": 37.89786148071289, "rewards/rejected": -90.6416244506836, "step": 230 }, { "epoch": 0.18838304552590268, "grad_norm": 19.303454918030365, "learning_rate": 4.883087164434672e-07, "logits/chosen": 5058.9169921875, "logits/rejected": 4021.076904296875, "logps/chosen": -298.27947998046875, "logps/rejected": -326.84698486328125, "loss": 0.6375, "rewards/accuracies": 0.73333340883255, "rewards/chosen": -58.58591842651367, "rewards/margins": 55.33477020263672, "rewards/rejected": -113.9206771850586, "step": 240 }, { "epoch": 0.19623233908948196, "grad_norm": 22.796191821196953, "learning_rate": 4.861481338675183e-07, "logits/chosen": 5985.2314453125, "logits/rejected": 5300.16259765625, "logps/chosen": -288.8643798828125, "logps/rejected": -353.60687255859375, "loss": 0.6467, "rewards/accuracies": 0.7083333730697632, "rewards/chosen": -53.216957092285156, "rewards/margins": 45.99776077270508, "rewards/rejected": -99.21472930908203, "step": 250 }, { "epoch": 0.20408163265306123, "grad_norm": 22.085356937400984, "learning_rate": 4.838100964592904e-07, "logits/chosen": 6164.17724609375, "logits/rejected": 4921.74072265625, "logps/chosen": -344.24920654296875, "logps/rejected": -333.50433349609375, "loss": 0.6499, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -59.49656295776367, "rewards/margins": 35.483604431152344, "rewards/rejected": -94.98017883300781, "step": 260 }, { "epoch": 0.2119309262166405, "grad_norm": 12.489294995446524, "learning_rate": 4.812963611500339e-07, "logits/chosen": 5998.5322265625, "logits/rejected": 5768.5107421875, "logps/chosen": -372.312255859375, "logps/rejected": -408.15008544921875, "loss": 0.6342, "rewards/accuracies": 0.6916667222976685, "rewards/chosen": -97.80362701416016, "rewards/margins": 43.51809310913086, "rewards/rejected": -141.3217315673828, "step": 270 }, { "epoch": 0.21978021978021978, "grad_norm": 16.417376858466337, "learning_rate": 4.786088169001671e-07, "logits/chosen": 5130.7666015625, "logits/rejected": 4434.3466796875, "logps/chosen": -319.5972595214844, "logps/rejected": -379.0505676269531, "loss": 0.6394, "rewards/accuracies": 0.7916666269302368, "rewards/chosen": -90.88194274902344, "rewards/margins": 53.75859451293945, "rewards/rejected": -144.64053344726562, "step": 280 }, { "epoch": 0.22762951334379905, "grad_norm": 11.775433051492161, "learning_rate": 4.7574948327980567e-07, "logits/chosen": 7151.7099609375, "logits/rejected": 5225.49169921875, "logps/chosen": -390.4671325683594, "logps/rejected": -385.64324951171875, "loss": 0.6242, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -65.6766128540039, "rewards/margins": 62.20024490356445, "rewards/rejected": -127.8768539428711, "step": 290 }, { "epoch": 0.23547880690737832, "grad_norm": 20.393847208182954, "learning_rate": 4.727205089511466e-07, "logits/chosen": 5204.720703125, "logits/rejected": 5109.517578125, "logps/chosen": -291.4892272949219, "logps/rejected": -349.64764404296875, "loss": 0.636, "rewards/accuracies": 0.6583333611488342, "rewards/chosen": -54.74866485595703, "rewards/margins": 50.97196960449219, "rewards/rejected": -105.72062683105469, "step": 300 }, { "epoch": 0.24332810047095763, "grad_norm": 12.64896234084034, "learning_rate": 4.6952417005384247e-07, "logits/chosen": 5871.3505859375, "logits/rejected": 5241.853515625, "logps/chosen": -292.93157958984375, "logps/rejected": -314.87689208984375, "loss": 0.6464, "rewards/accuracies": 0.7166666388511658, "rewards/chosen": -37.26299285888672, "rewards/margins": 35.94820022583008, "rewards/rejected": -73.21118927001953, "step": 310 }, { "epoch": 0.25117739403453687, "grad_norm": 11.711611435010754, "learning_rate": 4.661628684945851e-07, "logits/chosen": 5884.6513671875, "logits/rejected": 5080.0869140625, "logps/chosen": -317.3797302246094, "logps/rejected": -366.9216613769531, "loss": 0.6397, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -40.09249496459961, "rewards/margins": 54.90401077270508, "rewards/rejected": -94.99651336669922, "step": 320 }, { "epoch": 0.25902668759811615, "grad_norm": 16.606143175279893, "learning_rate": 4.626391301421782e-07, "logits/chosen": 5659.63232421875, "logits/rejected": 5144.689453125, "logps/chosen": -322.4103088378906, "logps/rejected": -339.0274353027344, "loss": 0.6465, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -52.312225341796875, "rewards/margins": 37.32312774658203, "rewards/rejected": -89.6353530883789, "step": 330 }, { "epoch": 0.2668759811616955, "grad_norm": 11.957399046843578, "learning_rate": 4.5895560292945996e-07, "logits/chosen": 5975.3984375, "logits/rejected": 6048.248046875, "logps/chosen": -306.0935974121094, "logps/rejected": -384.51226806640625, "loss": 0.6382, "rewards/accuracies": 0.6666667461395264, "rewards/chosen": -38.0104866027832, "rewards/margins": 44.62034225463867, "rewards/rejected": -82.63082885742188, "step": 340 }, { "epoch": 0.27472527472527475, "grad_norm": 19.05929325375797, "learning_rate": 4.5511505486349865e-07, "logits/chosen": 6281.623046875, "logits/rejected": 5641.45556640625, "logps/chosen": -324.3663330078125, "logps/rejected": -403.8905944824219, "loss": 0.6352, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -51.458587646484375, "rewards/margins": 68.93352508544922, "rewards/rejected": -120.3921127319336, "step": 350 }, { "epoch": 0.282574568288854, "grad_norm": 22.14740963615657, "learning_rate": 4.5112037194555876e-07, "logits/chosen": 5807.75927734375, "logits/rejected": 5624.56396484375, "logps/chosen": -337.0799255371094, "logps/rejected": -433.28485107421875, "loss": 0.6332, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -93.39493560791016, "rewards/margins": 65.91687774658203, "rewards/rejected": -159.31179809570312, "step": 360 }, { "epoch": 0.2904238618524333, "grad_norm": 26.999582029041648, "learning_rate": 4.4697455600239863e-07, "logits/chosen": 5266.83642578125, "logits/rejected": 4898.1513671875, "logps/chosen": -338.98809814453125, "logps/rejected": -360.06561279296875, "loss": 0.6496, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -80.08573913574219, "rewards/margins": 41.56218719482422, "rewards/rejected": -121.64791107177734, "step": 370 }, { "epoch": 0.29827315541601257, "grad_norm": 10.145481304719103, "learning_rate": 4.426807224305315e-07, "logits/chosen": 6373.15576171875, "logits/rejected": 5213.06591796875, "logps/chosen": -349.0050964355469, "logps/rejected": -351.0923156738281, "loss": 0.6338, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": -41.02812576293945, "rewards/margins": 54.1557731628418, "rewards/rejected": -95.18390655517578, "step": 380 }, { "epoch": 0.30612244897959184, "grad_norm": 15.861363596358661, "learning_rate": 4.3824209785514326e-07, "logits/chosen": 6477.939453125, "logits/rejected": 4905.5, "logps/chosen": -341.7349548339844, "logps/rejected": -361.1855163574219, "loss": 0.6272, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": -50.20849609375, "rewards/margins": 65.48529052734375, "rewards/rejected": -115.69378662109375, "step": 390 }, { "epoch": 0.3139717425431711, "grad_norm": 20.26773681163985, "learning_rate": 4.3366201770542687e-07, "logits/chosen": 5476.63818359375, "logits/rejected": 5291.42138671875, "logps/chosen": -368.3787536621094, "logps/rejected": -435.31072998046875, "loss": 0.6384, "rewards/accuracies": 0.75, "rewards/chosen": -93.76130676269531, "rewards/margins": 54.93895721435547, "rewards/rejected": -148.7002716064453, "step": 400 }, { "epoch": 0.3218210361067504, "grad_norm": 25.748156010376984, "learning_rate": 4.2894392370815567e-07, "logits/chosen": 5894.26708984375, "logits/rejected": 5256.75634765625, "logps/chosen": -404.880126953125, "logps/rejected": -464.7074279785156, "loss": 0.608, "rewards/accuracies": 0.7583333849906921, "rewards/chosen": -113.01127624511719, "rewards/margins": 69.0715560913086, "rewards/rejected": -182.08285522460938, "step": 410 }, { "epoch": 0.32967032967032966, "grad_norm": 21.231993994206345, "learning_rate": 4.2409136130137845e-07, "logits/chosen": 5564.29833984375, "logits/rejected": 4995.458984375, "logps/chosen": -390.73931884765625, "logps/rejected": -428.530517578125, "loss": 0.6319, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -104.84605407714844, "rewards/margins": 70.60201263427734, "rewards/rejected": -175.44808959960938, "step": 420 }, { "epoch": 0.33751962323390894, "grad_norm": 21.61004113876247, "learning_rate": 4.1910797697018017e-07, "logits/chosen": 5397.30712890625, "logits/rejected": 4497.5673828125, "logps/chosen": -340.72528076171875, "logps/rejected": -390.37115478515625, "loss": 0.6261, "rewards/accuracies": 0.8166667222976685, "rewards/chosen": -83.93035888671875, "rewards/margins": 81.43653869628906, "rewards/rejected": -165.36691284179688, "step": 430 }, { "epoch": 0.3453689167974882, "grad_norm": 34.40148362381541, "learning_rate": 4.1399751550651084e-07, "logits/chosen": 5793.47119140625, "logits/rejected": 5737.45263671875, "logps/chosen": -303.27130126953125, "logps/rejected": -359.61187744140625, "loss": 0.6351, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -52.5032844543457, "rewards/margins": 46.893043518066406, "rewards/rejected": -99.39633178710938, "step": 440 }, { "epoch": 0.3532182103610675, "grad_norm": 12.221057520913048, "learning_rate": 4.087638171951401e-07, "logits/chosen": 6695.4169921875, "logits/rejected": 4779.6669921875, "logps/chosen": -343.7985534667969, "logps/rejected": -348.4494934082031, "loss": 0.63, "rewards/accuracies": 0.7416666746139526, "rewards/chosen": -56.74163818359375, "rewards/margins": 65.7086181640625, "rewards/rejected": -122.45025634765625, "step": 450 }, { "epoch": 0.36106750392464676, "grad_norm": 14.928154473572407, "learning_rate": 4.034108149278543e-07, "logits/chosen": 6924.58056640625, "logits/rejected": 5269.8779296875, "logps/chosen": -399.5293273925781, "logps/rejected": -400.42681884765625, "loss": 0.6167, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -66.81822204589844, "rewards/margins": 68.95711517333984, "rewards/rejected": -135.77532958984375, "step": 460 }, { "epoch": 0.36891679748822603, "grad_norm": 33.075423246716404, "learning_rate": 3.979425312480629e-07, "logits/chosen": 5881.1962890625, "logits/rejected": 5122.140625, "logps/chosen": -389.74981689453125, "logps/rejected": -441.1351623535156, "loss": 0.6418, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -101.96400451660156, "rewards/margins": 61.07242965698242, "rewards/rejected": -163.03643798828125, "step": 470 }, { "epoch": 0.37676609105180536, "grad_norm": 14.819747656679679, "learning_rate": 3.923630753280357e-07, "logits/chosen": 6393.35693359375, "logits/rejected": 5423.63671875, "logps/chosen": -363.44342041015625, "logps/rejected": -406.1517028808594, "loss": 0.6263, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": -83.63682556152344, "rewards/margins": 75.72159576416016, "rewards/rejected": -159.3583984375, "step": 480 }, { "epoch": 0.38461538461538464, "grad_norm": 15.214454392153003, "learning_rate": 3.866766398810424e-07, "logits/chosen": 6030.3671875, "logits/rejected": 5667.58935546875, "logps/chosen": -291.21600341796875, "logps/rejected": -404.61065673828125, "loss": 0.6114, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -50.27495574951172, "rewards/margins": 81.7363052368164, "rewards/rejected": -132.01126098632812, "step": 490 }, { "epoch": 0.3924646781789639, "grad_norm": 26.504081081962486, "learning_rate": 3.8088749801071496e-07, "logits/chosen": 6422.83056640625, "logits/rejected": 4957.6240234375, "logps/chosen": -383.7412109375, "logps/rejected": -426.57318115234375, "loss": 0.6265, "rewards/accuracies": 0.7583333253860474, "rewards/chosen": -85.2517318725586, "rewards/margins": 76.8886947631836, "rewards/rejected": -162.1404266357422, "step": 500 }, { "epoch": 0.4003139717425432, "grad_norm": 19.92131685224601, "learning_rate": 3.75e-07, "logits/chosen": 5211.85888671875, "logits/rejected": 4546.10302734375, "logps/chosen": -338.0028076171875, "logps/rejected": -391.8753356933594, "loss": 0.6233, "rewards/accuracies": 0.7750000953674316, "rewards/chosen": -81.1429672241211, "rewards/margins": 81.61470031738281, "rewards/rejected": -162.75765991210938, "step": 510 }, { "epoch": 0.40816326530612246, "grad_norm": 17.195866910495663, "learning_rate": 3.6901857004211443e-07, "logits/chosen": 5492.595703125, "logits/rejected": 5055.5869140625, "logps/chosen": -354.20428466796875, "logps/rejected": -411.8438415527344, "loss": 0.6448, "rewards/accuracies": 0.7166666984558105, "rewards/chosen": -88.03140258789062, "rewards/margins": 54.68613815307617, "rewards/rejected": -142.71755981445312, "step": 520 }, { "epoch": 0.41601255886970173, "grad_norm": 11.497234011291072, "learning_rate": 3.6294770291596076e-07, "logits/chosen": 6242.5810546875, "logits/rejected": 5086.48828125, "logps/chosen": -348.9159240722656, "logps/rejected": -372.79046630859375, "loss": 0.619, "rewards/accuracies": 0.7166666388511658, "rewards/chosen": -64.62379455566406, "rewards/margins": 44.09585189819336, "rewards/rejected": -108.71965026855469, "step": 530 }, { "epoch": 0.423861852433281, "grad_norm": 16.092408213575606, "learning_rate": 3.5679196060850034e-07, "logits/chosen": 5961.3251953125, "logits/rejected": 5273.64794921875, "logps/chosen": -344.6712646484375, "logps/rejected": -383.50048828125, "loss": 0.6307, "rewards/accuracies": 0.75, "rewards/chosen": -67.28762817382812, "rewards/margins": 60.815452575683594, "rewards/rejected": -128.1031036376953, "step": 540 }, { "epoch": 0.4317111459968603, "grad_norm": 22.117577767433108, "learning_rate": 3.505559688866229e-07, "logits/chosen": 5753.71728515625, "logits/rejected": 5307.4189453125, "logps/chosen": -353.93414306640625, "logps/rejected": -433.62823486328125, "loss": 0.6245, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -72.5975112915039, "rewards/margins": 69.84739685058594, "rewards/rejected": -142.44491577148438, "step": 550 }, { "epoch": 0.43956043956043955, "grad_norm": 16.177064459530992, "learning_rate": 3.4424441382108826e-07, "logits/chosen": 5751.61767578125, "logits/rejected": 5381.62939453125, "logps/chosen": -358.9526672363281, "logps/rejected": -400.9219055175781, "loss": 0.6408, "rewards/accuracies": 0.7166666984558105, "rewards/chosen": -82.8250732421875, "rewards/margins": 51.318634033203125, "rewards/rejected": -134.14370727539062, "step": 560 }, { "epoch": 0.4474097331240188, "grad_norm": 22.155261104381104, "learning_rate": 3.378620382651523e-07, "logits/chosen": 6090.033203125, "logits/rejected": 5616.0712890625, "logps/chosen": -389.67449951171875, "logps/rejected": -432.17083740234375, "loss": 0.6186, "rewards/accuracies": 0.7916667461395264, "rewards/chosen": -75.10955810546875, "rewards/margins": 65.12030792236328, "rewards/rejected": -140.22988891601562, "step": 570 }, { "epoch": 0.4552590266875981, "grad_norm": 21.25069883148197, "learning_rate": 3.314136382905234e-07, "logits/chosen": 6104.90234375, "logits/rejected": 5452.4833984375, "logps/chosen": -363.5154724121094, "logps/rejected": -447.11553955078125, "loss": 0.6198, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -76.26863098144531, "rewards/margins": 85.99644470214844, "rewards/rejected": -162.2650909423828, "step": 580 }, { "epoch": 0.4631083202511774, "grad_norm": 17.362372097837717, "learning_rate": 3.249040595833274e-07, "logits/chosen": 6601.0, "logits/rejected": 5511.4287109375, "logps/chosen": -402.6153259277344, "logps/rejected": -423.739501953125, "loss": 0.6132, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -96.46189880371094, "rewards/margins": 81.43740844726562, "rewards/rejected": -177.89932250976562, "step": 590 }, { "epoch": 0.47095761381475665, "grad_norm": 23.17004436475313, "learning_rate": 3.1833819380279023e-07, "logits/chosen": 6238.583984375, "logits/rejected": 5254.064453125, "logps/chosen": -360.0862731933594, "logps/rejected": -433.875, "loss": 0.6181, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": -107.59527587890625, "rewards/margins": 71.31594848632812, "rewards/rejected": -178.91122436523438, "step": 600 }, { "epoch": 0.478806907378336, "grad_norm": 12.736341325859744, "learning_rate": 3.11720974905373e-07, "logits/chosen": 5983.64990234375, "logits/rejected": 5169.474609375, "logps/chosen": -365.88275146484375, "logps/rejected": -418.73504638671875, "loss": 0.6131, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -93.04127502441406, "rewards/margins": 75.20207977294922, "rewards/rejected": -168.2433624267578, "step": 610 }, { "epoch": 0.48665620094191525, "grad_norm": 14.41955430489932, "learning_rate": 3.0505737543712275e-07, "logits/chosen": 5046.630859375, "logits/rejected": 4108.693359375, "logps/chosen": -358.97125244140625, "logps/rejected": -385.05572509765625, "loss": 0.6279, "rewards/accuracies": 0.7083333730697632, "rewards/chosen": -111.14530181884766, "rewards/margins": 60.3331298828125, "rewards/rejected": -171.4784393310547, "step": 620 }, { "epoch": 0.4945054945054945, "grad_norm": 14.568591201789333, "learning_rate": 2.9835240279702513e-07, "logits/chosen": 6596.6220703125, "logits/rejected": 5537.1279296875, "logps/chosen": -393.75347900390625, "logps/rejected": -425.38739013671875, "loss": 0.621, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -77.04191589355469, "rewards/margins": 82.31060028076172, "rewards/rejected": -159.35252380371094, "step": 630 }, { "epoch": 0.5023547880690737, "grad_norm": 13.739673447904634, "learning_rate": 2.9161109547416667e-07, "logits/chosen": 6305.04052734375, "logits/rejected": 5372.958984375, "logps/chosen": -365.43939208984375, "logps/rejected": -413.84014892578125, "loss": 0.6222, "rewards/accuracies": 0.6916667222976685, "rewards/chosen": -86.74894714355469, "rewards/margins": 49.25041580200195, "rewards/rejected": -135.99935913085938, "step": 640 }, { "epoch": 0.5102040816326531, "grad_norm": 15.591649633238864, "learning_rate": 2.848385192615339e-07, "logits/chosen": 5376.2451171875, "logits/rejected": 4345.314453125, "logps/chosen": -336.9205322265625, "logps/rejected": -367.5696105957031, "loss": 0.6219, "rewards/accuracies": 0.7416666746139526, "rewards/chosen": -79.38199615478516, "rewards/margins": 61.08577346801758, "rewards/rejected": -140.46775817871094, "step": 650 }, { "epoch": 0.5180533751962323, "grad_norm": 18.657960252226093, "learning_rate": 2.780397634492949e-07, "logits/chosen": 5987.65380859375, "logits/rejected": 4756.13671875, "logps/chosen": -371.12078857421875, "logps/rejected": -419.0254821777344, "loss": 0.625, "rewards/accuracies": 0.8000000715255737, "rewards/chosen": -80.80254364013672, "rewards/margins": 85.95211791992188, "rewards/rejected": -166.75466918945312, "step": 660 }, { "epoch": 0.5259026687598116, "grad_norm": 20.61975768497305, "learning_rate": 2.71219937000424e-07, "logits/chosen": 5989.3681640625, "logits/rejected": 4901.67138671875, "logps/chosen": -370.17462158203125, "logps/rejected": -413.4952697753906, "loss": 0.6243, "rewards/accuracies": 0.7416666746139526, "rewards/chosen": -93.9641342163086, "rewards/margins": 68.44216918945312, "rewards/rejected": -162.4062957763672, "step": 670 }, { "epoch": 0.533751962323391, "grad_norm": 18.87684601216833, "learning_rate": 2.6438416471154273e-07, "logits/chosen": 5865.7666015625, "logits/rejected": 4891.36669921875, "logps/chosen": -372.4833679199219, "logps/rejected": -396.7239685058594, "loss": 0.6267, "rewards/accuracies": 0.75, "rewards/chosen": -94.6555404663086, "rewards/margins": 67.06285095214844, "rewards/rejected": -161.7183837890625, "step": 680 }, { "epoch": 0.5416012558869702, "grad_norm": 14.822028291771003, "learning_rate": 2.5753758336186326e-07, "logits/chosen": 5776.60498046875, "logits/rejected": 5268.0185546875, "logps/chosen": -350.5502624511719, "logps/rejected": -428.06854248046875, "loss": 0.603, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -75.46115112304688, "rewards/margins": 75.14856719970703, "rewards/rejected": -150.60972595214844, "step": 690 }, { "epoch": 0.5494505494505495, "grad_norm": 13.55417698931459, "learning_rate": 2.5068533785312666e-07, "logits/chosen": 5536.5478515625, "logits/rejected": 5285.6142578125, "logps/chosen": -339.8960876464844, "logps/rejected": -404.1838684082031, "loss": 0.6348, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -89.77982330322266, "rewards/margins": 68.95240783691406, "rewards/rejected": -158.7322235107422, "step": 700 }, { "epoch": 0.5572998430141287, "grad_norm": 13.967254349794798, "learning_rate": 2.4383257734343794e-07, "logits/chosen": 5494.7998046875, "logits/rejected": 5455.1875, "logps/chosen": -332.26739501953125, "logps/rejected": -411.1310119628906, "loss": 0.6169, "rewards/accuracies": 0.7750000953674316, "rewards/chosen": -83.0215072631836, "rewards/margins": 64.02027893066406, "rewards/rejected": -147.04177856445312, "step": 710 }, { "epoch": 0.565149136577708, "grad_norm": 15.383264284785263, "learning_rate": 2.3698445137790258e-07, "logits/chosen": 5931.5546875, "logits/rejected": 5057.220703125, "logps/chosen": -349.0887756347656, "logps/rejected": -400.6289978027344, "loss": 0.6146, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -64.32401275634766, "rewards/margins": 78.18653869628906, "rewards/rejected": -142.51055908203125, "step": 720 }, { "epoch": 0.5729984301412873, "grad_norm": 13.338256084406837, "learning_rate": 2.3014610601897157e-07, "logits/chosen": 6402.5517578125, "logits/rejected": 4874.70361328125, "logps/chosen": -372.53619384765625, "logps/rejected": -397.6921081542969, "loss": 0.6161, "rewards/accuracies": 0.7916667461395264, "rewards/chosen": -77.47715759277344, "rewards/margins": 79.93022918701172, "rewards/rejected": -157.40737915039062, "step": 730 }, { "epoch": 0.5808477237048666, "grad_norm": 17.616655194208075, "learning_rate": 2.2332267997940513e-07, "logits/chosen": 5318.05322265625, "logits/rejected": 4492.63818359375, "logps/chosen": -317.3236999511719, "logps/rejected": -361.37701416015625, "loss": 0.6124, "rewards/accuracies": 0.783333420753479, "rewards/chosen": -69.31564331054688, "rewards/margins": 74.50846862792969, "rewards/rejected": -143.82412719726562, "step": 740 }, { "epoch": 0.5886970172684458, "grad_norm": 17.26484587041811, "learning_rate": 2.1651930076075723e-07, "logits/chosen": 5809.0048828125, "logits/rejected": 5208.9267578125, "logps/chosen": -324.5057373046875, "logps/rejected": -368.41693115234375, "loss": 0.6283, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -84.24357604980469, "rewards/margins": 61.6234130859375, "rewards/rejected": -145.8669891357422, "step": 750 }, { "epoch": 0.5965463108320251, "grad_norm": 18.908183485265077, "learning_rate": 2.0974108080028692e-07, "logits/chosen": 6082.65185546875, "logits/rejected": 4746.8046875, "logps/chosen": -353.37567138671875, "logps/rejected": -395.29595947265625, "loss": 0.6149, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -90.01509094238281, "rewards/margins": 67.36105346679688, "rewards/rejected": -157.3761444091797, "step": 760 }, { "epoch": 0.6043956043956044, "grad_norm": 17.1773340775, "learning_rate": 2.0299311362918773e-07, "logits/chosen": 6311.5419921875, "logits/rejected": 5381.1201171875, "logps/chosen": -386.3832092285156, "logps/rejected": -444.20550537109375, "loss": 0.6297, "rewards/accuracies": 0.7583333849906921, "rewards/chosen": -91.73824310302734, "rewards/margins": 67.6690902709961, "rewards/rejected": -159.40731811523438, "step": 770 }, { "epoch": 0.6122448979591837, "grad_norm": 12.140213576431595, "learning_rate": 1.962804700450265e-07, "logits/chosen": 6080.9736328125, "logits/rejected": 5723.802734375, "logps/chosen": -384.7323303222656, "logps/rejected": -490.9775390625, "loss": 0.6177, "rewards/accuracies": 0.7583333849906921, "rewards/chosen": -101.59971618652344, "rewards/margins": 74.81465148925781, "rewards/rejected": -176.4143829345703, "step": 780 }, { "epoch": 0.6200941915227629, "grad_norm": 23.19008194008529, "learning_rate": 1.8960819430126334e-07, "logits/chosen": 5738.3056640625, "logits/rejected": 5031.94189453125, "logps/chosen": -359.13568115234375, "logps/rejected": -446.73992919921875, "loss": 0.61, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -104.6843032836914, "rewards/margins": 86.70986938476562, "rewards/rejected": -191.39418029785156, "step": 790 }, { "epoch": 0.6279434850863422, "grad_norm": 21.233318483525284, "learning_rate": 1.8298130031671972e-07, "logits/chosen": 5713.6044921875, "logits/rejected": 4974.9345703125, "logps/chosen": -377.11602783203125, "logps/rejected": -432.98297119140625, "loss": 0.6231, "rewards/accuracies": 0.7583333849906921, "rewards/chosen": -103.37955474853516, "rewards/margins": 63.08623504638672, "rewards/rejected": -166.46578979492188, "step": 800 }, { "epoch": 0.6357927786499215, "grad_norm": 18.08127392611742, "learning_rate": 1.7640476790784075e-07, "logits/chosen": 5225.978515625, "logits/rejected": 4710.2734375, "logps/chosen": -369.49896240234375, "logps/rejected": -446.40484619140625, "loss": 0.6149, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -95.6690673828125, "rewards/margins": 76.96039581298828, "rewards/rejected": -172.62945556640625, "step": 810 }, { "epoch": 0.6436420722135008, "grad_norm": 27.28580138938831, "learning_rate": 1.6988353904658492e-07, "logits/chosen": 5757.90625, "logits/rejected": 4414.92041015625, "logps/chosen": -383.70391845703125, "logps/rejected": -388.5174560546875, "loss": 0.6193, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -100.63114166259766, "rewards/margins": 72.06126403808594, "rewards/rejected": -172.69239807128906, "step": 820 }, { "epoch": 0.6514913657770801, "grad_norm": 26.165106071182983, "learning_rate": 1.634225141467513e-07, "logits/chosen": 5647.6025390625, "logits/rejected": 5025.44970703125, "logps/chosen": -368.6767883300781, "logps/rejected": -442.51751708984375, "loss": 0.6168, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": -105.22233581542969, "rewards/margins": 89.50996398925781, "rewards/rejected": -194.73228454589844, "step": 830 }, { "epoch": 0.6593406593406593, "grad_norm": 12.05974686118023, "learning_rate": 1.570265483815364e-07, "logits/chosen": 6232.81005859375, "logits/rejected": 5058.81494140625, "logps/chosen": -383.663818359375, "logps/rejected": -444.72442626953125, "loss": 0.6187, "rewards/accuracies": 0.6916666030883789, "rewards/chosen": -98.13055419921875, "rewards/margins": 72.39865112304688, "rewards/rejected": -170.52920532226562, "step": 840 }, { "epoch": 0.6671899529042387, "grad_norm": 14.238384519302777, "learning_rate": 1.5070044803508691e-07, "logits/chosen": 5735.76953125, "logits/rejected": 5156.517578125, "logps/chosen": -371.0335693359375, "logps/rejected": -424.44903564453125, "loss": 0.6042, "rewards/accuracies": 0.7583333253860474, "rewards/chosen": -98.491455078125, "rewards/margins": 79.99961853027344, "rewards/rejected": -178.49105834960938, "step": 850 }, { "epoch": 0.6750392464678179, "grad_norm": 21.691254923241676, "learning_rate": 1.444489668907914e-07, "logits/chosen": 6212.0146484375, "logits/rejected": 5258.91845703125, "logps/chosen": -396.6695251464844, "logps/rejected": -415.2281799316406, "loss": 0.6366, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -93.80113220214844, "rewards/margins": 67.12193298339844, "rewards/rejected": -160.92306518554688, "step": 860 }, { "epoch": 0.6828885400313972, "grad_norm": 15.830471277855608, "learning_rate": 1.3827680265902232e-07, "logits/chosen": 6164.56201171875, "logits/rejected": 5070.73779296875, "logps/chosen": -370.5370788574219, "logps/rejected": -404.1648254394531, "loss": 0.6199, "rewards/accuracies": 0.7499999403953552, "rewards/chosen": -81.1246109008789, "rewards/margins": 68.8310775756836, "rewards/rejected": -149.9556884765625, "step": 870 }, { "epoch": 0.6907378335949764, "grad_norm": 19.644559397703166, "learning_rate": 1.3218859344701632e-07, "logits/chosen": 5451.4140625, "logits/rejected": 5144.291015625, "logps/chosen": -334.55950927734375, "logps/rejected": -431.77099609375, "loss": 0.6184, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -75.80677032470703, "rewards/margins": 67.65657806396484, "rewards/rejected": -143.46334838867188, "step": 880 }, { "epoch": 0.6985871271585558, "grad_norm": 15.901674866172888, "learning_rate": 1.2618891427354172e-07, "logits/chosen": 6396.1279296875, "logits/rejected": 5205.91162109375, "logps/chosen": -382.37579345703125, "logps/rejected": -417.2171936035156, "loss": 0.6239, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": -69.84732818603516, "rewards/margins": 85.34009552001953, "rewards/rejected": -155.18740844726562, "step": 890 }, { "epoch": 0.706436420722135, "grad_norm": 16.09863720828146, "learning_rate": 1.202822736309758e-07, "logits/chosen": 5450.23681640625, "logits/rejected": 4962.1318359375, "logps/chosen": -340.2064514160156, "logps/rejected": -422.90478515625, "loss": 0.6223, "rewards/accuracies": 0.75, "rewards/chosen": -82.16495513916016, "rewards/margins": 76.31831359863281, "rewards/rejected": -158.4832763671875, "step": 900 }, { "epoch": 0.7142857142857143, "grad_norm": 25.11806872646705, "learning_rate": 1.1447311009737299e-07, "logits/chosen": 5361.8212890625, "logits/rejected": 5049.03173828125, "logps/chosen": -341.89459228515625, "logps/rejected": -418.7533264160156, "loss": 0.631, "rewards/accuracies": 0.7583333849906921, "rewards/chosen": -84.32719421386719, "rewards/margins": 76.45072937011719, "rewards/rejected": -160.77792358398438, "step": 910 }, { "epoch": 0.7221350078492935, "grad_norm": 15.417662364209441, "learning_rate": 1.0876578900107053e-07, "logits/chosen": 5907.921875, "logits/rejected": 4922.83349609375, "logps/chosen": -362.728759765625, "logps/rejected": -393.6321716308594, "loss": 0.6225, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -76.00703430175781, "rewards/margins": 72.67141723632812, "rewards/rejected": -148.67845153808594, "step": 920 }, { "epoch": 0.7299843014128728, "grad_norm": 13.068633060932935, "learning_rate": 1.0316459914033793e-07, "logits/chosen": 5875.71240234375, "logits/rejected": 4352.42041015625, "logps/chosen": -371.8520812988281, "logps/rejected": -387.53839111328125, "loss": 0.618, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -83.89849090576172, "rewards/margins": 70.6930160522461, "rewards/rejected": -154.59152221679688, "step": 930 }, { "epoch": 0.7378335949764521, "grad_norm": 19.777463963280816, "learning_rate": 9.767374956053584e-08, "logits/chosen": 5708.94482421875, "logits/rejected": 4980.36181640625, "logps/chosen": -343.82733154296875, "logps/rejected": -408.415771484375, "loss": 0.6222, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -78.96133422851562, "rewards/margins": 82.29380798339844, "rewards/rejected": -161.25514221191406, "step": 940 }, { "epoch": 0.7456828885400314, "grad_norm": 14.697590410758233, "learning_rate": 9.229736639120561e-08, "logits/chosen": 5883.52978515625, "logits/rejected": 5368.93359375, "logps/chosen": -346.15911865234375, "logps/rejected": -399.55218505859375, "loss": 0.6306, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -82.5989990234375, "rewards/margins": 54.160057067871094, "rewards/rejected": -136.75906372070312, "step": 950 }, { "epoch": 0.7535321821036107, "grad_norm": 23.114265338822328, "learning_rate": 8.70394897454659e-08, "logits/chosen": 5751.0556640625, "logits/rejected": 5106.9912109375, "logps/chosen": -331.0142517089844, "logps/rejected": -385.95538330078125, "loss": 0.6177, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": -67.00068664550781, "rewards/margins": 72.63203430175781, "rewards/rejected": -139.63272094726562, "step": 960 }, { "epoch": 0.7613814756671899, "grad_norm": 18.27070448355251, "learning_rate": 8.19040706840472e-08, "logits/chosen": 5845.2509765625, "logits/rejected": 4865.30810546875, "logps/chosen": -361.5416564941406, "logps/rejected": -396.51556396484375, "loss": 0.6133, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -72.41023254394531, "rewards/margins": 77.77088928222656, "rewards/rejected": -150.18113708496094, "step": 970 }, { "epoch": 0.7692307692307693, "grad_norm": 12.08737263588805, "learning_rate": 7.689496824624525e-08, "logits/chosen": 5548.05224609375, "logits/rejected": 4407.2763671875, "logps/chosen": -346.5332336425781, "logps/rejected": -395.7588806152344, "loss": 0.6106, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -69.89125061035156, "rewards/margins": 87.18049621582031, "rewards/rejected": -157.07174682617188, "step": 980 }, { "epoch": 0.7770800627943485, "grad_norm": 18.79192365654695, "learning_rate": 7.201594655002458e-08, "logits/chosen": 5900.44482421875, "logits/rejected": 4868.0849609375, "logps/chosen": -349.3968811035156, "logps/rejected": -387.33416748046875, "loss": 0.6149, "rewards/accuracies": 0.73333340883255, "rewards/chosen": -81.28948974609375, "rewards/margins": 69.76005554199219, "rewards/rejected": -151.049560546875, "step": 990 }, { "epoch": 0.7849293563579278, "grad_norm": 18.79350845073137, "learning_rate": 6.727067196345099e-08, "logits/chosen": 5536.19873046875, "logits/rejected": 4639.8720703125, "logps/chosen": -336.35577392578125, "logps/rejected": -365.3442077636719, "loss": 0.6212, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -76.03428649902344, "rewards/margins": 63.63414764404297, "rewards/rejected": -139.66844177246094, "step": 1000 }, { "epoch": 0.792778649921507, "grad_norm": 15.409532424479464, "learning_rate": 6.26627103495786e-08, "logits/chosen": 5746.2666015625, "logits/rejected": 4735.36376953125, "logps/chosen": -329.68927001953125, "logps/rejected": -368.9176025390625, "loss": 0.624, "rewards/accuracies": 0.6583333015441895, "rewards/chosen": -70.27388763427734, "rewards/margins": 63.12858963012695, "rewards/rejected": -133.40248107910156, "step": 1010 }, { "epoch": 0.8006279434850864, "grad_norm": 13.475517503615837, "learning_rate": 5.8195524386862374e-08, "logits/chosen": 5811.63671875, "logits/rejected": 5106.11669921875, "logps/chosen": -357.29742431640625, "logps/rejected": -421.65594482421875, "loss": 0.614, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": -59.400840759277344, "rewards/margins": 88.65455627441406, "rewards/rejected": -148.05538940429688, "step": 1020 }, { "epoch": 0.8084772370486656, "grad_norm": 18.09767323128694, "learning_rate": 5.38724709671092e-08, "logits/chosen": 6252.6279296875, "logits/rejected": 5865.5927734375, "logps/chosen": -347.6499328613281, "logps/rejected": -426.4112854003906, "loss": 0.6117, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -69.84974670410156, "rewards/margins": 81.45121765136719, "rewards/rejected": -151.30096435546875, "step": 1030 }, { "epoch": 0.8163265306122449, "grad_norm": 15.841210395276628, "learning_rate": 4.969679867292276e-08, "logits/chosen": 5529.40673828125, "logits/rejected": 4980.63818359375, "logps/chosen": -338.6473388671875, "logps/rejected": -407.2497253417969, "loss": 0.6156, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -73.40410614013672, "rewards/margins": 80.27185821533203, "rewards/rejected": -153.6759490966797, "step": 1040 }, { "epoch": 0.8241758241758241, "grad_norm": 16.846051622731085, "learning_rate": 4.5671645336537416e-08, "logits/chosen": 5591.8466796875, "logits/rejected": 5026.0830078125, "logps/chosen": -353.8576965332031, "logps/rejected": -416.2503967285156, "loss": 0.612, "rewards/accuracies": 0.75, "rewards/chosen": -72.3245620727539, "rewards/margins": 78.2017593383789, "rewards/rejected": -150.5263214111328, "step": 1050 }, { "epoch": 0.8320251177394035, "grad_norm": 45.4415767546636, "learning_rate": 4.180003568187776e-08, "logits/chosen": 6939.8369140625, "logits/rejected": 5414.35107421875, "logps/chosen": -385.07904052734375, "logps/rejected": -401.9789733886719, "loss": 0.6282, "rewards/accuracies": 0.6916666626930237, "rewards/chosen": -77.1563491821289, "rewards/margins": 60.28764724731445, "rewards/rejected": -137.44400024414062, "step": 1060 }, { "epoch": 0.8398744113029827, "grad_norm": 18.747827810509367, "learning_rate": 3.8084879051612144e-08, "logits/chosen": 5741.5634765625, "logits/rejected": 5204.7333984375, "logps/chosen": -347.3013916015625, "logps/rejected": -386.94561767578125, "loss": 0.6164, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -74.4090576171875, "rewards/margins": 75.30126953125, "rewards/rejected": -149.7103271484375, "step": 1070 }, { "epoch": 0.847723704866562, "grad_norm": 12.659356648082168, "learning_rate": 3.452896722091128e-08, "logits/chosen": 6310.4208984375, "logits/rejected": 4834.76171875, "logps/chosen": -383.05096435546875, "logps/rejected": -395.0693359375, "loss": 0.6029, "rewards/accuracies": 0.8083333969116211, "rewards/chosen": -70.56317901611328, "rewards/margins": 83.26502990722656, "rewards/rejected": -153.82821655273438, "step": 1080 }, { "epoch": 0.8555729984301413, "grad_norm": 16.859762113459805, "learning_rate": 3.11349722995527e-08, "logits/chosen": 6297.42236328125, "logits/rejected": 4726.97802734375, "logps/chosen": -357.27203369140625, "logps/rejected": -393.133056640625, "loss": 0.6114, "rewards/accuracies": 0.7166666984558105, "rewards/chosen": -83.47435760498047, "rewards/margins": 59.989906311035156, "rewards/rejected": -143.46426391601562, "step": 1090 }, { "epoch": 0.8634222919937206, "grad_norm": 14.876276968043816, "learning_rate": 2.7905444723949762e-08, "logits/chosen": 6136.54248046875, "logits/rejected": 5038.2431640625, "logps/chosen": -369.5342712402344, "logps/rejected": -407.952392578125, "loss": 0.6182, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -86.31148529052734, "rewards/margins": 89.53190612792969, "rewards/rejected": -175.84341430664062, "step": 1100 }, { "epoch": 0.8712715855572999, "grad_norm": 23.19077581996903, "learning_rate": 2.484281134061142e-08, "logits/chosen": 6493.39794921875, "logits/rejected": 5203.970703125, "logps/chosen": -395.83636474609375, "logps/rejected": -427.874267578125, "loss": 0.616, "rewards/accuracies": 0.7833333015441895, "rewards/chosen": -86.56200408935547, "rewards/margins": 78.59150695800781, "rewards/rejected": -165.1535186767578, "step": 1110 }, { "epoch": 0.8791208791208791, "grad_norm": 20.803959092397093, "learning_rate": 2.194937358247506e-08, "logits/chosen": 6390.1259765625, "logits/rejected": 5163.4169921875, "logps/chosen": -380.3614807128906, "logps/rejected": -423.82916259765625, "loss": 0.613, "rewards/accuracies": 0.7583333253860474, "rewards/chosen": -89.02333068847656, "rewards/margins": 75.32238006591797, "rewards/rejected": -164.34573364257812, "step": 1120 }, { "epoch": 0.8869701726844584, "grad_norm": 18.963504933195512, "learning_rate": 1.9227305739481612e-08, "logits/chosen": 5820.75146484375, "logits/rejected": 4510.7939453125, "logps/chosen": -349.5916442871094, "logps/rejected": -381.07916259765625, "loss": 0.6091, "rewards/accuracies": 0.7833333015441895, "rewards/chosen": -71.02159118652344, "rewards/margins": 85.45536804199219, "rewards/rejected": -156.47695922851562, "step": 1130 }, { "epoch": 0.8948194662480377, "grad_norm": 19.01457656059988, "learning_rate": 1.6678653324693787e-08, "logits/chosen": 6429.29443359375, "logits/rejected": 5148.3701171875, "logps/chosen": -384.94915771484375, "logps/rejected": -422.2733459472656, "loss": 0.6034, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -79.50971984863281, "rewards/margins": 81.15669250488281, "rewards/rejected": -160.66641235351562, "step": 1140 }, { "epoch": 0.902668759811617, "grad_norm": 12.376640085144242, "learning_rate": 1.4305331537183384e-08, "logits/chosen": 5704.4609375, "logits/rejected": 5148.77978515625, "logps/chosen": -352.7065124511719, "logps/rejected": -419.0628356933594, "loss": 0.6087, "rewards/accuracies": 0.6833333373069763, "rewards/chosen": -85.51119232177734, "rewards/margins": 70.08112335205078, "rewards/rejected": -155.59231567382812, "step": 1150 }, { "epoch": 0.9105180533751962, "grad_norm": 17.251075451707592, "learning_rate": 1.2109123822844653e-08, "logits/chosen": 5839.08251953125, "logits/rejected": 4554.01220703125, "logps/chosen": -359.0191955566406, "logps/rejected": -391.0977783203125, "loss": 0.6133, "rewards/accuracies": 0.783333420753479, "rewards/chosen": -88.34132385253906, "rewards/margins": 68.73795318603516, "rewards/rejected": -157.07925415039062, "step": 1160 }, { "epoch": 0.9183673469387755, "grad_norm": 30.790909367368283, "learning_rate": 1.0091680534213387e-08, "logits/chosen": 6360.50146484375, "logits/rejected": 6086.1513671875, "logps/chosen": -374.38970947265625, "logps/rejected": -467.6603088378906, "loss": 0.6196, "rewards/accuracies": 0.7416666746139526, "rewards/chosen": -82.26263427734375, "rewards/margins": 81.5895767211914, "rewards/rejected": -163.85220336914062, "step": 1170 }, { "epoch": 0.9262166405023547, "grad_norm": 20.905568866608686, "learning_rate": 8.254517690300944e-09, "logits/chosen": 5604.05419921875, "logits/rejected": 4998.599609375, "logps/chosen": -350.8424377441406, "logps/rejected": -412.19940185546875, "loss": 0.6124, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -71.8908920288086, "rewards/margins": 81.49540710449219, "rewards/rejected": -153.3863067626953, "step": 1180 }, { "epoch": 0.9340659340659341, "grad_norm": 10.88103318658427, "learning_rate": 6.599015837372907e-09, "logits/chosen": 6126.52734375, "logits/rejected": 5268.48828125, "logps/chosen": -392.8667907714844, "logps/rejected": -439.15972900390625, "loss": 0.6196, "rewards/accuracies": 0.7083333730697632, "rewards/chosen": -100.54813385009766, "rewards/margins": 74.77506256103516, "rewards/rejected": -175.32318115234375, "step": 1190 }, { "epoch": 0.9419152276295133, "grad_norm": 20.539083448288594, "learning_rate": 5.126419011529992e-09, "logits/chosen": 6356.4951171875, "logits/rejected": 5332.37353515625, "logps/chosen": -379.75885009765625, "logps/rejected": -432.0904846191406, "loss": 0.6032, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": -75.01070404052734, "rewards/margins": 98.93077850341797, "rewards/rejected": -173.9414825439453, "step": 1200 }, { "epoch": 0.9497645211930926, "grad_norm": 20.433280187267762, "learning_rate": 3.837833803870177e-09, "logits/chosen": 5961.4970703125, "logits/rejected": 5186.04833984375, "logps/chosen": -368.67034912109375, "logps/rejected": -424.91131591796875, "loss": 0.6183, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -88.86743927001953, "rewards/margins": 83.7270736694336, "rewards/rejected": -172.59451293945312, "step": 1210 }, { "epoch": 0.957613814756672, "grad_norm": 25.913384367594333, "learning_rate": 2.734228528934679e-09, "logits/chosen": 7407.4306640625, "logits/rejected": 5385.08544921875, "logps/chosen": -437.97711181640625, "logps/rejected": -454.0513610839844, "loss": 0.6148, "rewards/accuracies": 0.7416666746139526, "rewards/chosen": -91.56160736083984, "rewards/margins": 77.19084167480469, "rewards/rejected": -168.75247192382812, "step": 1220 }, { "epoch": 0.9654631083202512, "grad_norm": 18.893977958471503, "learning_rate": 1.8164324970625645e-09, "logits/chosen": 6518.923828125, "logits/rejected": 5050.20166015625, "logps/chosen": -388.9313659667969, "logps/rejected": -423.03558349609375, "loss": 0.6311, "rewards/accuracies": 0.7583333253860474, "rewards/chosen": -80.57595825195312, "rewards/margins": 77.39918518066406, "rewards/rejected": -157.9751739501953, "step": 1230 }, { "epoch": 0.9733124018838305, "grad_norm": 16.390918429518635, "learning_rate": 1.0851353912008642e-09, "logits/chosen": 5605.1982421875, "logits/rejected": 5138.8681640625, "logps/chosen": -363.85174560546875, "logps/rejected": -445.0411682128906, "loss": 0.6098, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -90.89818572998047, "rewards/margins": 77.92682647705078, "rewards/rejected": -168.8250274658203, "step": 1240 }, { "epoch": 0.9811616954474097, "grad_norm": 13.229811746138825, "learning_rate": 5.408867486384471e-10, "logits/chosen": 5726.5830078125, "logits/rejected": 4779.26123046875, "logps/chosen": -346.5870361328125, "logps/rejected": -380.5937805175781, "loss": 0.6114, "rewards/accuracies": 0.7833333015441895, "rewards/chosen": -76.1484375, "rewards/margins": 81.09263610839844, "rewards/rejected": -157.24107360839844, "step": 1250 }, { "epoch": 0.989010989010989, "grad_norm": 23.115150939597314, "learning_rate": 1.840955480532924e-10, "logits/chosen": 5498.7197265625, "logits/rejected": 5114.80615234375, "logps/chosen": -348.0476989746094, "logps/rejected": -414.744140625, "loss": 0.6073, "rewards/accuracies": 0.7583333253860474, "rewards/chosen": -81.39493560791016, "rewards/margins": 75.64277648925781, "rewards/rejected": -157.0377197265625, "step": 1260 }, { "epoch": 0.9968602825745683, "grad_norm": 17.978099246101902, "learning_rate": 1.502990218302247e-11, "logits/chosen": 5703.56787109375, "logits/rejected": 4572.5458984375, "logps/chosen": -350.7594299316406, "logps/rejected": -395.73187255859375, "loss": 0.6146, "rewards/accuracies": 0.8083332777023315, "rewards/chosen": -89.28590393066406, "rewards/margins": 78.2876968383789, "rewards/rejected": -167.5736083984375, "step": 1270 }, { "epoch": 1.0, "step": 1274, "total_flos": 0.0, "train_loss": 0.6310614222072919, "train_runtime": 13007.3728, "train_samples_per_second": 4.7, "train_steps_per_second": 0.098 } ], "logging_steps": 10, "max_steps": 1274, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }