{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9991361934926575, "eval_steps": 100, "global_step": 5208, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 10.611208510160006, "learning_rate": 9.596928982725527e-10, "logits/chosen": -2.688382625579834, "logits/rejected": -2.687504768371582, "logps/chosen": -154.15142822265625, "logps/rejected": -119.21998596191406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "grad_norm": 11.275984429980808, "learning_rate": 9.596928982725526e-09, "logits/chosen": -2.693709135055542, "logits/rejected": -2.7062594890594482, "logps/chosen": -203.09437561035156, "logps/rejected": -203.58497619628906, "loss": 0.6932, "rewards/accuracies": 0.4236111044883728, "rewards/chosen": 6.469448999268934e-05, "rewards/margins": 0.00014834105968475342, "rewards/rejected": -8.364655514014885e-05, "step": 10 }, { "epoch": 0.01, "grad_norm": 10.990399197664567, "learning_rate": 1.9193857965451052e-08, "logits/chosen": -2.667926788330078, "logits/rejected": -2.663238286972046, "logps/chosen": -208.18679809570312, "logps/rejected": -195.68704223632812, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.0002055606310022995, "rewards/margins": 0.0006116779404692352, "rewards/rejected": -0.0004061173240188509, "step": 20 }, { "epoch": 0.02, "grad_norm": 10.993794354165766, "learning_rate": 2.8790786948176583e-08, "logits/chosen": -2.622999668121338, "logits/rejected": -2.6279687881469727, "logps/chosen": -179.3690948486328, "logps/rejected": -194.75942993164062, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": 1.1262052794336341e-05, "rewards/margins": -0.0003482321626506746, "rewards/rejected": 0.0003594942099880427, "step": 30 }, { "epoch": 0.02, "grad_norm": 11.865900516603308, "learning_rate": 3.8387715930902104e-08, "logits/chosen": -2.6122729778289795, "logits/rejected": -2.573115825653076, "logps/chosen": -208.74740600585938, "logps/rejected": -187.6232452392578, "loss": 0.6931, "rewards/accuracies": 0.5625, "rewards/chosen": 3.379890767973848e-05, "rewards/margins": 0.0002555200189817697, "rewards/rejected": -0.00022172110038809478, "step": 40 }, { "epoch": 0.03, "grad_norm": 10.421674194226677, "learning_rate": 4.798464491362764e-08, "logits/chosen": -2.6442642211914062, "logits/rejected": -2.669315814971924, "logps/chosen": -236.63430786132812, "logps/rejected": -203.87472534179688, "loss": 0.6931, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0007920139469206333, "rewards/margins": 0.0007887674728408456, "rewards/rejected": 3.246474079787731e-06, "step": 50 }, { "epoch": 0.03, "grad_norm": 10.875571827215992, "learning_rate": 5.7581573896353166e-08, "logits/chosen": -2.653801441192627, "logits/rejected": -2.6664109230041504, "logps/chosen": -232.6512908935547, "logps/rejected": -211.4125518798828, "loss": 0.693, "rewards/accuracies": 0.46875, "rewards/chosen": 0.00018669811834115535, "rewards/margins": 4.591212928062305e-05, "rewards/rejected": 0.00014078602544032037, "step": 60 }, { "epoch": 0.04, "grad_norm": 11.22105852445743, "learning_rate": 6.71785028790787e-08, "logits/chosen": -2.6330461502075195, "logits/rejected": -2.62608003616333, "logps/chosen": -203.0271759033203, "logps/rejected": -205.40737915039062, "loss": 0.6927, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0007560437079519033, "rewards/margins": 0.000800161506049335, "rewards/rejected": -0.0015562052140012383, "step": 70 }, { "epoch": 0.05, "grad_norm": 10.752818890561544, "learning_rate": 7.677543186180421e-08, "logits/chosen": -2.622370719909668, "logits/rejected": -2.6615545749664307, "logps/chosen": -176.02774047851562, "logps/rejected": -185.0570831298828, "loss": 0.6925, "rewards/accuracies": 0.5, "rewards/chosen": -0.0015181910712271929, "rewards/margins": 8.574242383474484e-05, "rewards/rejected": -0.0016039334004744887, "step": 80 }, { "epoch": 0.05, "grad_norm": 11.198317090829164, "learning_rate": 8.637236084452975e-08, "logits/chosen": -2.591407299041748, "logits/rejected": -2.614539384841919, "logps/chosen": -177.1290283203125, "logps/rejected": -206.0966339111328, "loss": 0.6921, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.0009284219704568386, "rewards/margins": 0.003772800788283348, "rewards/rejected": -0.004701223224401474, "step": 90 }, { "epoch": 0.06, "grad_norm": 9.937998864231563, "learning_rate": 9.596928982725528e-08, "logits/chosen": -2.6292388439178467, "logits/rejected": -2.6319985389709473, "logps/chosen": -187.04295349121094, "logps/rejected": -190.42141723632812, "loss": 0.6915, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0021987182553857565, "rewards/margins": 0.0032472398597747087, "rewards/rejected": -0.005445958115160465, "step": 100 }, { "epoch": 0.06, "eval_logits/chosen": -2.570690155029297, "eval_logits/rejected": -2.586681604385376, "eval_logps/chosen": -163.34715270996094, "eval_logps/rejected": -171.195556640625, "eval_loss": 0.691743016242981, "eval_rewards/accuracies": 0.5918949842453003, "eval_rewards/chosen": -0.005904410500079393, "eval_rewards/margins": 0.002785085467621684, "eval_rewards/rejected": -0.008689496666193008, "eval_runtime": 533.8206, "eval_samples_per_second": 13.113, "eval_steps_per_second": 0.41, "step": 100 }, { "epoch": 0.06, "grad_norm": 10.15130917806228, "learning_rate": 1.055662188099808e-07, "logits/chosen": -2.6020331382751465, "logits/rejected": -2.6334145069122314, "logps/chosen": -194.1101837158203, "logps/rejected": -210.51669311523438, "loss": 0.6902, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0016386958304792643, "rewards/margins": 0.005970651283860207, "rewards/rejected": -0.007609347812831402, "step": 110 }, { "epoch": 0.07, "grad_norm": 12.380884784899935, "learning_rate": 1.1516314779270633e-07, "logits/chosen": -2.5880379676818848, "logits/rejected": -2.5663695335388184, "logps/chosen": -182.2163543701172, "logps/rejected": -191.5764617919922, "loss": 0.6895, "rewards/accuracies": 0.625, "rewards/chosen": -0.0032195397652685642, "rewards/margins": 0.0058040558360517025, "rewards/rejected": -0.009023596532642841, "step": 120 }, { "epoch": 0.07, "grad_norm": 10.938069681930758, "learning_rate": 1.2476007677543185e-07, "logits/chosen": -2.648705244064331, "logits/rejected": -2.6452465057373047, "logps/chosen": -207.93997192382812, "logps/rejected": -197.58969116210938, "loss": 0.6877, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0013859583996236324, "rewards/margins": 0.012256019748747349, "rewards/rejected": -0.013641977682709694, "step": 130 }, { "epoch": 0.08, "grad_norm": 10.51839736081575, "learning_rate": 1.343570057581574e-07, "logits/chosen": -2.644381523132324, "logits/rejected": -2.644658327102661, "logps/chosen": -231.73486328125, "logps/rejected": -216.6191864013672, "loss": 0.687, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.006255399435758591, "rewards/margins": 0.009828994050621986, "rewards/rejected": -0.016084393486380577, "step": 140 }, { "epoch": 0.09, "grad_norm": 11.327135775378146, "learning_rate": 1.439539347408829e-07, "logits/chosen": -2.599203109741211, "logits/rejected": -2.607630491256714, "logps/chosen": -207.6547393798828, "logps/rejected": -207.8338165283203, "loss": 0.6857, "rewards/accuracies": 0.5625, "rewards/chosen": -0.013972711749374866, "rewards/margins": 0.013265794143080711, "rewards/rejected": -0.027238503098487854, "step": 150 }, { "epoch": 0.09, "grad_norm": 12.469992787672668, "learning_rate": 1.5355086372360842e-07, "logits/chosen": -2.5646302700042725, "logits/rejected": -2.576869487762451, "logps/chosen": -213.3702850341797, "logps/rejected": -194.04647827148438, "loss": 0.6797, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.019984139129519463, "rewards/margins": 0.028688553720712662, "rewards/rejected": -0.04867268726229668, "step": 160 }, { "epoch": 0.1, "grad_norm": 12.057118780138, "learning_rate": 1.6314779270633396e-07, "logits/chosen": -2.593844413757324, "logits/rejected": -2.586000442504883, "logps/chosen": -200.55960083007812, "logps/rejected": -193.16343688964844, "loss": 0.6793, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.05092308670282364, "rewards/margins": 0.03223235160112381, "rewards/rejected": -0.08315543830394745, "step": 170 }, { "epoch": 0.1, "grad_norm": 11.560646306285946, "learning_rate": 1.727447216890595e-07, "logits/chosen": -2.5619163513183594, "logits/rejected": -2.535081148147583, "logps/chosen": -207.1576385498047, "logps/rejected": -226.71334838867188, "loss": 0.6773, "rewards/accuracies": 0.625, "rewards/chosen": -0.06315551698207855, "rewards/margins": 0.033702537417411804, "rewards/rejected": -0.09685804694890976, "step": 180 }, { "epoch": 0.11, "grad_norm": 12.45877788794243, "learning_rate": 1.8234165067178504e-07, "logits/chosen": -2.5997958183288574, "logits/rejected": -2.595893383026123, "logps/chosen": -222.9274139404297, "logps/rejected": -216.8584747314453, "loss": 0.6701, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.09778516739606857, "rewards/margins": 0.05016006901860237, "rewards/rejected": -0.14794524013996124, "step": 190 }, { "epoch": 0.12, "grad_norm": 13.278965649359508, "learning_rate": 1.9193857965451055e-07, "logits/chosen": -2.5389647483825684, "logits/rejected": -2.5476126670837402, "logps/chosen": -211.9510498046875, "logps/rejected": -232.74020385742188, "loss": 0.6667, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.17205697298049927, "rewards/margins": 0.05315415933728218, "rewards/rejected": -0.22521111369132996, "step": 200 }, { "epoch": 0.12, "eval_logits/chosen": -2.5294487476348877, "eval_logits/rejected": -2.5361499786376953, "eval_logps/chosen": -183.25027465820312, "eval_logps/rejected": -196.4010772705078, "eval_loss": 0.6689916849136353, "eval_rewards/accuracies": 0.6307077407836914, "eval_rewards/chosen": -0.20493555068969727, "eval_rewards/margins": 0.055809177458286285, "eval_rewards/rejected": -0.26074472069740295, "eval_runtime": 530.1996, "eval_samples_per_second": 13.203, "eval_steps_per_second": 0.413, "step": 200 }, { "epoch": 0.12, "grad_norm": 14.075612543341052, "learning_rate": 2.0153550863723607e-07, "logits/chosen": -2.5845346450805664, "logits/rejected": -2.594388723373413, "logps/chosen": -215.8970184326172, "logps/rejected": -244.5321502685547, "loss": 0.6604, "rewards/accuracies": 0.625, "rewards/chosen": -0.17339566349983215, "rewards/margins": 0.08572752773761749, "rewards/rejected": -0.25912320613861084, "step": 210 }, { "epoch": 0.13, "grad_norm": 18.881709224472438, "learning_rate": 2.111324376199616e-07, "logits/chosen": -2.5975959300994873, "logits/rejected": -2.5805118083953857, "logps/chosen": -205.1549072265625, "logps/rejected": -221.88369750976562, "loss": 0.6548, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.2524169981479645, "rewards/margins": 0.0873834416270256, "rewards/rejected": -0.3398004174232483, "step": 220 }, { "epoch": 0.13, "grad_norm": 20.74714118629395, "learning_rate": 2.2072936660268712e-07, "logits/chosen": -2.541008472442627, "logits/rejected": -2.533216714859009, "logps/chosen": -239.1126708984375, "logps/rejected": -257.67034912109375, "loss": 0.6426, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3573434352874756, "rewards/margins": 0.16846100986003876, "rewards/rejected": -0.5258044004440308, "step": 230 }, { "epoch": 0.14, "grad_norm": 26.60560362422598, "learning_rate": 2.3032629558541267e-07, "logits/chosen": -2.5126795768737793, "logits/rejected": -2.504943370819092, "logps/chosen": -251.2681427001953, "logps/rejected": -273.9628601074219, "loss": 0.6239, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4857562184333801, "rewards/margins": 0.21250848472118378, "rewards/rejected": -0.6982647180557251, "step": 240 }, { "epoch": 0.14, "grad_norm": 23.125531993685506, "learning_rate": 2.399232245681382e-07, "logits/chosen": -2.4556612968444824, "logits/rejected": -2.4707584381103516, "logps/chosen": -273.5422058105469, "logps/rejected": -313.97442626953125, "loss": 0.6257, "rewards/accuracies": 0.625, "rewards/chosen": -0.85858154296875, "rewards/margins": 0.24095073342323303, "rewards/rejected": -1.0995322465896606, "step": 250 }, { "epoch": 0.15, "grad_norm": 21.288333197832447, "learning_rate": 2.495201535508637e-07, "logits/chosen": -2.4645373821258545, "logits/rejected": -2.464017152786255, "logps/chosen": -232.4383544921875, "logps/rejected": -268.622802734375, "loss": 0.635, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5463618636131287, "rewards/margins": 0.24907764792442322, "rewards/rejected": -0.7954395413398743, "step": 260 }, { "epoch": 0.16, "grad_norm": 23.100907962408737, "learning_rate": 2.591170825335892e-07, "logits/chosen": -2.4447758197784424, "logits/rejected": -2.4442191123962402, "logps/chosen": -233.79605102539062, "logps/rejected": -270.1409912109375, "loss": 0.6329, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.5584765076637268, "rewards/margins": 0.21696260571479797, "rewards/rejected": -0.7754390835762024, "step": 270 }, { "epoch": 0.16, "grad_norm": 25.487734163305436, "learning_rate": 2.687140115163148e-07, "logits/chosen": -2.4855802059173584, "logits/rejected": -2.465960741043091, "logps/chosen": -287.25970458984375, "logps/rejected": -288.3252258300781, "loss": 0.6281, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6578674912452698, "rewards/margins": 0.217629075050354, "rewards/rejected": -0.875496506690979, "step": 280 }, { "epoch": 0.17, "grad_norm": 30.448618274974507, "learning_rate": 2.783109404990403e-07, "logits/chosen": -2.4952151775360107, "logits/rejected": -2.4795172214508057, "logps/chosen": -264.05792236328125, "logps/rejected": -296.70831298828125, "loss": 0.6288, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6724523901939392, "rewards/margins": 0.2741486132144928, "rewards/rejected": -0.9466010928153992, "step": 290 }, { "epoch": 0.17, "grad_norm": 20.63269054294213, "learning_rate": 2.879078694817658e-07, "logits/chosen": -2.471968650817871, "logits/rejected": -2.4682021141052246, "logps/chosen": -280.3851013183594, "logps/rejected": -314.0673522949219, "loss": 0.6064, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7571940422058105, "rewards/margins": 0.35681623220443726, "rewards/rejected": -1.1140103340148926, "step": 300 }, { "epoch": 0.17, "eval_logits/chosen": -2.382413864135742, "eval_logits/rejected": -2.3765032291412354, "eval_logps/chosen": -271.49920654296875, "eval_logps/rejected": -312.40399169921875, "eval_loss": 0.6130638718605042, "eval_rewards/accuracies": 0.6529680490493774, "eval_rewards/chosen": -1.0874252319335938, "eval_rewards/margins": 0.3333488404750824, "eval_rewards/rejected": -1.420773983001709, "eval_runtime": 534.8347, "eval_samples_per_second": 13.088, "eval_steps_per_second": 0.409, "step": 300 }, { "epoch": 0.18, "grad_norm": 19.396816456346283, "learning_rate": 2.975047984644913e-07, "logits/chosen": -2.391308307647705, "logits/rejected": -2.4180564880371094, "logps/chosen": -291.649169921875, "logps/rejected": -345.5475769042969, "loss": 0.602, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9646072387695312, "rewards/margins": 0.3451134264469147, "rewards/rejected": -1.3097206354141235, "step": 310 }, { "epoch": 0.18, "grad_norm": 28.319986084735305, "learning_rate": 3.0710172744721683e-07, "logits/chosen": -2.3580570220947266, "logits/rejected": -2.3539388179779053, "logps/chosen": -253.64486694335938, "logps/rejected": -304.33270263671875, "loss": 0.5967, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.883933424949646, "rewards/margins": 0.3406381607055664, "rewards/rejected": -1.2245715856552124, "step": 320 }, { "epoch": 0.19, "grad_norm": 47.25368072121605, "learning_rate": 3.166986564299424e-07, "logits/chosen": -2.3519644737243652, "logits/rejected": -2.3475451469421387, "logps/chosen": -306.01275634765625, "logps/rejected": -332.52545166015625, "loss": 0.6249, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0529143810272217, "rewards/margins": 0.3473051190376282, "rewards/rejected": -1.4002195596694946, "step": 330 }, { "epoch": 0.2, "grad_norm": 27.220231031034565, "learning_rate": 3.262955854126679e-07, "logits/chosen": -2.4012954235076904, "logits/rejected": -2.3820691108703613, "logps/chosen": -281.8808898925781, "logps/rejected": -320.50262451171875, "loss": 0.5909, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8584387898445129, "rewards/margins": 0.37528762221336365, "rewards/rejected": -1.2337262630462646, "step": 340 }, { "epoch": 0.2, "grad_norm": 26.815383621294682, "learning_rate": 3.3589251439539343e-07, "logits/chosen": -2.370060443878174, "logits/rejected": -2.3450374603271484, "logps/chosen": -274.343994140625, "logps/rejected": -319.6899719238281, "loss": 0.5985, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8950921297073364, "rewards/margins": 0.4696590304374695, "rewards/rejected": -1.3647511005401611, "step": 350 }, { "epoch": 0.21, "grad_norm": 30.504913493364054, "learning_rate": 3.45489443378119e-07, "logits/chosen": -2.358250379562378, "logits/rejected": -2.3417699337005615, "logps/chosen": -347.94287109375, "logps/rejected": -389.13983154296875, "loss": 0.5645, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1560261249542236, "rewards/margins": 0.5985914468765259, "rewards/rejected": -1.7546173334121704, "step": 360 }, { "epoch": 0.21, "grad_norm": 41.04062377702306, "learning_rate": 3.550863723608445e-07, "logits/chosen": -2.2298038005828857, "logits/rejected": -2.233619213104248, "logps/chosen": -383.6699523925781, "logps/rejected": -436.1602478027344, "loss": 0.5775, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7848291397094727, "rewards/margins": 0.5287401080131531, "rewards/rejected": -2.3135695457458496, "step": 370 }, { "epoch": 0.22, "grad_norm": 32.47503408848043, "learning_rate": 3.646833013435701e-07, "logits/chosen": -2.293248176574707, "logits/rejected": -2.2647204399108887, "logps/chosen": -369.9467468261719, "logps/rejected": -422.0674743652344, "loss": 0.5664, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6948490142822266, "rewards/margins": 0.5455628633499146, "rewards/rejected": -2.2404117584228516, "step": 380 }, { "epoch": 0.22, "grad_norm": 31.479552319145334, "learning_rate": 3.742802303262956e-07, "logits/chosen": -2.223508596420288, "logits/rejected": -2.237217664718628, "logps/chosen": -308.84967041015625, "logps/rejected": -363.0736999511719, "loss": 0.5816, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3651267290115356, "rewards/margins": 0.4378534257411957, "rewards/rejected": -1.8029801845550537, "step": 390 }, { "epoch": 0.23, "grad_norm": 29.593950521242743, "learning_rate": 3.838771593090211e-07, "logits/chosen": -2.242884397506714, "logits/rejected": -2.231152057647705, "logps/chosen": -355.4832458496094, "logps/rejected": -379.53228759765625, "loss": 0.5768, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4919766187667847, "rewards/margins": 0.42949914932250977, "rewards/rejected": -1.9214760065078735, "step": 400 }, { "epoch": 0.23, "eval_logits/chosen": -2.208782434463501, "eval_logits/rejected": -2.2240514755249023, "eval_logps/chosen": -362.949462890625, "eval_logps/rejected": -421.64837646484375, "eval_loss": 0.5797730684280396, "eval_rewards/accuracies": 0.711758017539978, "eval_rewards/chosen": -2.001927137374878, "eval_rewards/margins": 0.5112901329994202, "eval_rewards/rejected": -2.513216972351074, "eval_runtime": 534.2445, "eval_samples_per_second": 13.103, "eval_steps_per_second": 0.41, "step": 400 }, { "epoch": 0.24, "grad_norm": 25.131250342097456, "learning_rate": 3.934740882917466e-07, "logits/chosen": -2.3061180114746094, "logits/rejected": -2.28863525390625, "logps/chosen": -398.7698669433594, "logps/rejected": -430.3541564941406, "loss": 0.5802, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5561281442642212, "rewards/margins": 0.5106848478317261, "rewards/rejected": -2.0668129920959473, "step": 410 }, { "epoch": 0.24, "grad_norm": 55.31594141871013, "learning_rate": 4.0307101727447214e-07, "logits/chosen": -2.1639745235443115, "logits/rejected": -2.1372509002685547, "logps/chosen": -431.411376953125, "logps/rejected": -479.0457458496094, "loss": 0.5711, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.0741934776306152, "rewards/margins": 0.5639596581459045, "rewards/rejected": -2.638153076171875, "step": 420 }, { "epoch": 0.25, "grad_norm": 21.29255724572271, "learning_rate": 4.126679462571977e-07, "logits/chosen": -2.2238850593566895, "logits/rejected": -2.200798511505127, "logps/chosen": -350.1602478027344, "logps/rejected": -393.104248046875, "loss": 0.5808, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5713073015213013, "rewards/margins": 0.4856873154640198, "rewards/rejected": -2.056994676589966, "step": 430 }, { "epoch": 0.25, "grad_norm": 27.95542366410069, "learning_rate": 4.222648752399232e-07, "logits/chosen": -2.212869167327881, "logits/rejected": -2.1920323371887207, "logps/chosen": -368.4070739746094, "logps/rejected": -391.7508544921875, "loss": 0.5629, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5058845281600952, "rewards/margins": 0.5601885318756104, "rewards/rejected": -2.066072940826416, "step": 440 }, { "epoch": 0.26, "grad_norm": 30.9456393047162, "learning_rate": 4.3186180422264873e-07, "logits/chosen": -2.103602170944214, "logits/rejected": -2.0618910789489746, "logps/chosen": -354.42138671875, "logps/rejected": -415.5582580566406, "loss": 0.5678, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5057729482650757, "rewards/margins": 0.6006572842597961, "rewards/rejected": -2.1064305305480957, "step": 450 }, { "epoch": 0.26, "grad_norm": 23.546880144729453, "learning_rate": 4.4145873320537425e-07, "logits/chosen": -2.0609335899353027, "logits/rejected": -2.014543056488037, "logps/chosen": -331.9206237792969, "logps/rejected": -408.72894287109375, "loss": 0.5665, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4313408136367798, "rewards/margins": 0.6059279441833496, "rewards/rejected": -2.037268877029419, "step": 460 }, { "epoch": 0.27, "grad_norm": 31.17005096046491, "learning_rate": 4.5105566218809976e-07, "logits/chosen": -2.0496230125427246, "logits/rejected": -2.0121560096740723, "logps/chosen": -331.5176086425781, "logps/rejected": -382.56927490234375, "loss": 0.5525, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.355355978012085, "rewards/margins": 0.6093749403953552, "rewards/rejected": -1.9647308588027954, "step": 470 }, { "epoch": 0.28, "grad_norm": 33.122068813890245, "learning_rate": 4.6065259117082533e-07, "logits/chosen": -1.9575620889663696, "logits/rejected": -1.934399962425232, "logps/chosen": -388.51416015625, "logps/rejected": -454.73809814453125, "loss": 0.5646, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.839609146118164, "rewards/margins": 0.6481025815010071, "rewards/rejected": -2.4877116680145264, "step": 480 }, { "epoch": 0.28, "grad_norm": 25.302660601139642, "learning_rate": 4.7024952015355085e-07, "logits/chosen": -2.0024499893188477, "logits/rejected": -1.9742482900619507, "logps/chosen": -360.33087158203125, "logps/rejected": -434.51605224609375, "loss": 0.557, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8837835788726807, "rewards/margins": 0.690355658531189, "rewards/rejected": -2.574139356613159, "step": 490 }, { "epoch": 0.29, "grad_norm": 24.541977698363272, "learning_rate": 4.798464491362764e-07, "logits/chosen": -1.9768650531768799, "logits/rejected": -1.9277032613754272, "logps/chosen": -427.0602111816406, "logps/rejected": -486.32281494140625, "loss": 0.5653, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.0266995429992676, "rewards/margins": 0.7445794343948364, "rewards/rejected": -2.7712788581848145, "step": 500 }, { "epoch": 0.29, "eval_logits/chosen": -1.8721399307250977, "eval_logits/rejected": -1.8327217102050781, "eval_logps/chosen": -386.40472412109375, "eval_logps/rejected": -451.00628662109375, "eval_loss": 0.5732461810112, "eval_rewards/accuracies": 0.7037671208381653, "eval_rewards/chosen": -2.2364797592163086, "eval_rewards/margins": 0.5703169703483582, "eval_rewards/rejected": -2.8067967891693115, "eval_runtime": 529.8746, "eval_samples_per_second": 13.211, "eval_steps_per_second": 0.413, "step": 500 }, { "epoch": 0.29, "grad_norm": 26.306690428777948, "learning_rate": 4.894433781190019e-07, "logits/chosen": -1.9715452194213867, "logits/rejected": -1.9792684316635132, "logps/chosen": -378.4935607910156, "logps/rejected": -446.6212463378906, "loss": 0.5284, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7881218194961548, "rewards/margins": 0.691459596157074, "rewards/rejected": -2.479581356048584, "step": 510 }, { "epoch": 0.3, "grad_norm": 31.868668330411023, "learning_rate": 4.990403071017274e-07, "logits/chosen": -1.995591402053833, "logits/rejected": -1.858727216720581, "logps/chosen": -368.2765197753906, "logps/rejected": -416.9309997558594, "loss": 0.5751, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.725762128829956, "rewards/margins": 0.6348416209220886, "rewards/rejected": -2.3606038093566895, "step": 520 }, { "epoch": 0.31, "grad_norm": 21.28327670649449, "learning_rate": 4.9999545112971e-07, "logits/chosen": -2.068380355834961, "logits/rejected": -1.9709415435791016, "logps/chosen": -327.24029541015625, "logps/rejected": -353.082763671875, "loss": 0.5852, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2784364223480225, "rewards/margins": 0.4495919346809387, "rewards/rejected": -1.728028655052185, "step": 530 }, { "epoch": 0.31, "grad_norm": 50.54592515001582, "learning_rate": 4.99979726852344e-07, "logits/chosen": -2.0676636695861816, "logits/rejected": -1.9934663772583008, "logps/chosen": -333.8702087402344, "logps/rejected": -384.93743896484375, "loss": 0.5682, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.387322187423706, "rewards/margins": 0.49411287903785706, "rewards/rejected": -1.8814351558685303, "step": 540 }, { "epoch": 0.32, "grad_norm": 29.42565194990149, "learning_rate": 4.999527717152874e-07, "logits/chosen": -2.007429838180542, "logits/rejected": -1.9727427959442139, "logps/chosen": -343.5440368652344, "logps/rejected": -401.64190673828125, "loss": 0.5604, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.508192777633667, "rewards/margins": 0.5806811451911926, "rewards/rejected": -2.088873863220215, "step": 550 }, { "epoch": 0.32, "grad_norm": 40.083763853282065, "learning_rate": 4.999145869295556e-07, "logits/chosen": -1.9833877086639404, "logits/rejected": -1.9330778121948242, "logps/chosen": -346.67681884765625, "logps/rejected": -415.2813415527344, "loss": 0.5354, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.528934121131897, "rewards/margins": 0.6500917077064514, "rewards/rejected": -2.179025888442993, "step": 560 }, { "epoch": 0.33, "grad_norm": 28.686609634753335, "learning_rate": 4.998651742106798e-07, "logits/chosen": -2.137157917022705, "logits/rejected": -2.1629109382629395, "logps/chosen": -419.7047424316406, "logps/rejected": -466.9361267089844, "loss": 0.5529, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7164846658706665, "rewards/margins": 0.4566303789615631, "rewards/rejected": -2.1731152534484863, "step": 570 }, { "epoch": 0.33, "grad_norm": 22.602204629451293, "learning_rate": 4.998045357786292e-07, "logits/chosen": -2.037132740020752, "logits/rejected": -2.0769224166870117, "logps/chosen": -380.265625, "logps/rejected": -455.3236389160156, "loss": 0.5404, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8322168588638306, "rewards/margins": 0.6844093203544617, "rewards/rejected": -2.5166261196136475, "step": 580 }, { "epoch": 0.34, "grad_norm": 32.654581969816796, "learning_rate": 4.997326743577116e-07, "logits/chosen": -2.0082359313964844, "logits/rejected": -2.0254974365234375, "logps/chosen": -432.3203125, "logps/rejected": -493.34417724609375, "loss": 0.5498, "rewards/accuracies": 0.71875, "rewards/chosen": -1.940434455871582, "rewards/margins": 0.7341729998588562, "rewards/rejected": -2.674607276916504, "step": 590 }, { "epoch": 0.35, "grad_norm": 24.907538238206396, "learning_rate": 4.996495931764509e-07, "logits/chosen": -1.9402776956558228, "logits/rejected": -2.037241220474243, "logps/chosen": -370.3836975097656, "logps/rejected": -452.9737243652344, "loss": 0.5717, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.984858512878418, "rewards/margins": 0.5628988146781921, "rewards/rejected": -2.547757148742676, "step": 600 }, { "epoch": 0.35, "eval_logits/chosen": -1.9233992099761963, "eval_logits/rejected": -1.8751164674758911, "eval_logps/chosen": -365.6780090332031, "eval_logps/rejected": -428.3890075683594, "eval_loss": 0.5686014890670776, "eval_rewards/accuracies": 0.7174657583236694, "eval_rewards/chosen": -2.029212474822998, "eval_rewards/margins": 0.551411509513855, "eval_rewards/rejected": -2.5806243419647217, "eval_runtime": 533.88, "eval_samples_per_second": 13.112, "eval_steps_per_second": 0.41, "step": 600 }, { "epoch": 0.35, "grad_norm": 25.834773370255807, "learning_rate": 4.995552959674423e-07, "logits/chosen": -2.063260555267334, "logits/rejected": -1.9882431030273438, "logps/chosen": -388.81658935546875, "logps/rejected": -438.5895080566406, "loss": 0.5538, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5872763395309448, "rewards/margins": 0.6417847275733948, "rewards/rejected": -2.2290611267089844, "step": 610 }, { "epoch": 0.36, "grad_norm": 41.855417264327855, "learning_rate": 4.99449786967184e-07, "logits/chosen": -1.9198442697525024, "logits/rejected": -1.8154523372650146, "logps/chosen": -363.77349853515625, "logps/rejected": -415.21893310546875, "loss": 0.5453, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5375497341156006, "rewards/margins": 0.5646449327468872, "rewards/rejected": -2.1021947860717773, "step": 620 }, { "epoch": 0.36, "grad_norm": 33.88851283299703, "learning_rate": 4.993330709158879e-07, "logits/chosen": -1.8743531703948975, "logits/rejected": -1.8228861093521118, "logps/chosen": -367.91497802734375, "logps/rejected": -441.2967224121094, "loss": 0.554, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8477656841278076, "rewards/margins": 0.7941902279853821, "rewards/rejected": -2.641955852508545, "step": 630 }, { "epoch": 0.37, "grad_norm": 28.99503052488063, "learning_rate": 4.992051530572652e-07, "logits/chosen": -1.7986034154891968, "logits/rejected": -1.7905712127685547, "logps/chosen": -377.29669189453125, "logps/rejected": -451.7822265625, "loss": 0.5506, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.8193461894989014, "rewards/margins": 0.7828404307365417, "rewards/rejected": -2.602187156677246, "step": 640 }, { "epoch": 0.37, "grad_norm": 27.033769297114613, "learning_rate": 4.990660391382923e-07, "logits/chosen": -1.885871171951294, "logits/rejected": -1.84404718875885, "logps/chosen": -379.4486999511719, "logps/rejected": -429.9854431152344, "loss": 0.5435, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6576058864593506, "rewards/margins": 0.6208540201187134, "rewards/rejected": -2.2784595489501953, "step": 650 }, { "epoch": 0.38, "grad_norm": 28.977810132921764, "learning_rate": 4.989157354089514e-07, "logits/chosen": -1.753136396408081, "logits/rejected": -1.7515590190887451, "logps/chosen": -322.30029296875, "logps/rejected": -386.52978515625, "loss": 0.5629, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.495133638381958, "rewards/margins": 0.5765607953071594, "rewards/rejected": -2.0716946125030518, "step": 660 }, { "epoch": 0.39, "grad_norm": 30.95411356378581, "learning_rate": 4.987542486219507e-07, "logits/chosen": -1.8626670837402344, "logits/rejected": -1.7799171209335327, "logps/chosen": -416.19757080078125, "logps/rejected": -450.8716735839844, "loss": 0.5451, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0000617504119873, "rewards/margins": 0.4827081263065338, "rewards/rejected": -2.48276948928833, "step": 670 }, { "epoch": 0.39, "grad_norm": 24.432687127518562, "learning_rate": 4.985815860324203e-07, "logits/chosen": -1.8531873226165771, "logits/rejected": -1.791940689086914, "logps/chosen": -409.1615905761719, "logps/rejected": -484.7565002441406, "loss": 0.5151, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.1881613731384277, "rewards/margins": 0.7247720956802368, "rewards/rejected": -2.912933349609375, "step": 680 }, { "epoch": 0.4, "grad_norm": 37.60256295675174, "learning_rate": 4.983977553975863e-07, "logits/chosen": -1.825040578842163, "logits/rejected": -1.82125985622406, "logps/chosen": -414.04791259765625, "logps/rejected": -473.1602478027344, "loss": 0.5825, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.0819144248962402, "rewards/margins": 0.6526156663894653, "rewards/rejected": -2.734529972076416, "step": 690 }, { "epoch": 0.4, "grad_norm": 28.830749822993035, "learning_rate": 4.98202764976423e-07, "logits/chosen": -1.8486942052841187, "logits/rejected": -1.8373234272003174, "logps/chosen": -379.6943664550781, "logps/rejected": -449.971435546875, "loss": 0.5752, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6764113903045654, "rewards/margins": 0.6149306297302246, "rewards/rejected": -2.291342258453369, "step": 700 }, { "epoch": 0.4, "eval_logits/chosen": -1.7177659273147583, "eval_logits/rejected": -1.723059892654419, "eval_logps/chosen": -363.1083068847656, "eval_logps/rejected": -426.30914306640625, "eval_loss": 0.5646109580993652, "eval_rewards/accuracies": 0.7151826620101929, "eval_rewards/chosen": -2.0035157203674316, "eval_rewards/margins": 0.5563095808029175, "eval_rewards/rejected": -2.5598254203796387, "eval_runtime": 529.2612, "eval_samples_per_second": 13.226, "eval_steps_per_second": 0.414, "step": 700 }, { "epoch": 0.41, "grad_norm": 30.297649161582697, "learning_rate": 4.979966235292809e-07, "logits/chosen": -1.8499233722686768, "logits/rejected": -1.8541345596313477, "logps/chosen": -377.6393737792969, "logps/rejected": -416.67333984375, "loss": 0.5586, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5448112487792969, "rewards/margins": 0.5460962057113647, "rewards/rejected": -2.090907573699951, "step": 710 }, { "epoch": 0.41, "grad_norm": 39.33034877054217, "learning_rate": 4.977793403174936e-07, "logits/chosen": -1.7733430862426758, "logits/rejected": -1.72952139377594, "logps/chosen": -376.0733642578125, "logps/rejected": -404.24932861328125, "loss": 0.5836, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7145448923110962, "rewards/margins": 0.4573994576931, "rewards/rejected": -2.1719441413879395, "step": 720 }, { "epoch": 0.42, "grad_norm": 40.52355243153651, "learning_rate": 4.97550925102962e-07, "logits/chosen": -1.829652190208435, "logits/rejected": -1.8321421146392822, "logps/chosen": -371.4091796875, "logps/rejected": -420.5167541503906, "loss": 0.5575, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5310615301132202, "rewards/margins": 0.5716816186904907, "rewards/rejected": -2.10274338722229, "step": 730 }, { "epoch": 0.43, "grad_norm": 28.879275582327473, "learning_rate": 4.97311388147715e-07, "logits/chosen": -1.7534255981445312, "logits/rejected": -1.6832650899887085, "logps/chosen": -350.0277099609375, "logps/rejected": -428.2278747558594, "loss": 0.5411, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4738718271255493, "rewards/margins": 0.6937412023544312, "rewards/rejected": -2.1676127910614014, "step": 740 }, { "epoch": 0.43, "grad_norm": 36.56795602130584, "learning_rate": 4.970607402134491e-07, "logits/chosen": -1.624246597290039, "logits/rejected": -1.609897255897522, "logps/chosen": -383.0580139160156, "logps/rejected": -449.1436462402344, "loss": 0.54, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.9522749185562134, "rewards/margins": 0.5463067293167114, "rewards/rejected": -2.498581647872925, "step": 750 }, { "epoch": 0.44, "grad_norm": 25.206966345589485, "learning_rate": 4.967989925610447e-07, "logits/chosen": -1.4244545698165894, "logits/rejected": -1.3874050378799438, "logps/chosen": -446.9930725097656, "logps/rejected": -507.6349182128906, "loss": 0.5283, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.2830395698547363, "rewards/margins": 0.8258479833602905, "rewards/rejected": -3.108887195587158, "step": 760 }, { "epoch": 0.44, "grad_norm": 33.7262882743524, "learning_rate": 4.965261569500599e-07, "logits/chosen": -1.2625118494033813, "logits/rejected": -1.1843106746673584, "logps/chosen": -428.4283752441406, "logps/rejected": -500.03948974609375, "loss": 0.5432, "rewards/accuracies": 0.6875, "rewards/chosen": -1.9767898321151733, "rewards/margins": 0.8348423838615417, "rewards/rejected": -2.8116321563720703, "step": 770 }, { "epoch": 0.45, "grad_norm": 37.801060435115694, "learning_rate": 4.962422456382026e-07, "logits/chosen": -1.0276730060577393, "logits/rejected": -0.9513728022575378, "logps/chosen": -414.26104736328125, "logps/rejected": -461.51641845703125, "loss": 0.5299, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.999508261680603, "rewards/margins": 0.6127742528915405, "rewards/rejected": -2.6122825145721436, "step": 780 }, { "epoch": 0.45, "grad_norm": 26.435562278592414, "learning_rate": 4.959472713807797e-07, "logits/chosen": -0.8660637140274048, "logits/rejected": -0.7904596328735352, "logps/chosen": -390.1083068847656, "logps/rejected": -452.052734375, "loss": 0.564, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9469273090362549, "rewards/margins": 0.6172637939453125, "rewards/rejected": -2.5641913414001465, "step": 790 }, { "epoch": 0.46, "grad_norm": 24.38459415978698, "learning_rate": 4.956412474301236e-07, "logits/chosen": -0.7104419469833374, "logits/rejected": -0.5941707491874695, "logps/chosen": -393.9003601074219, "logps/rejected": -449.86297607421875, "loss": 0.5592, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.9520524740219116, "rewards/margins": 0.6885526180267334, "rewards/rejected": -2.6406052112579346, "step": 800 }, { "epoch": 0.46, "eval_logits/chosen": -0.4635317027568817, "eval_logits/rejected": -0.474114328622818, "eval_logps/chosen": -380.431640625, "eval_logps/rejected": -449.3553771972656, "eval_loss": 0.5594721436500549, "eval_rewards/accuracies": 0.7151826620101929, "eval_rewards/chosen": -2.1767492294311523, "eval_rewards/margins": 0.6135382056236267, "eval_rewards/rejected": -2.790287733078003, "eval_runtime": 532.1277, "eval_samples_per_second": 13.155, "eval_steps_per_second": 0.412, "step": 800 }, { "epoch": 0.47, "grad_norm": 32.46903968564005, "learning_rate": 4.953241875349977e-07, "logits/chosen": -0.6683417558670044, "logits/rejected": -0.5753171443939209, "logps/chosen": -365.70703125, "logps/rejected": -457.50360107421875, "loss": 0.5283, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.769514799118042, "rewards/margins": 0.7628298401832581, "rewards/rejected": -2.5323448181152344, "step": 810 }, { "epoch": 0.47, "grad_norm": 26.49381957324683, "learning_rate": 4.949961059399779e-07, "logits/chosen": -0.7726486325263977, "logits/rejected": -0.5810061097145081, "logps/chosen": -378.6798400878906, "logps/rejected": -455.6795349121094, "loss": 0.5385, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6266822814941406, "rewards/margins": 0.9052413702011108, "rewards/rejected": -2.531923532485962, "step": 820 }, { "epoch": 0.48, "grad_norm": 28.194690683480072, "learning_rate": 4.946570173848127e-07, "logits/chosen": -0.6153978109359741, "logits/rejected": -0.6012212634086609, "logps/chosen": -325.25213623046875, "logps/rejected": -429.82989501953125, "loss": 0.5608, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5520198345184326, "rewards/margins": 0.7548178434371948, "rewards/rejected": -2.306837797164917, "step": 830 }, { "epoch": 0.48, "grad_norm": 40.08115021514806, "learning_rate": 4.943069371037617e-07, "logits/chosen": -0.8705947995185852, "logits/rejected": -0.774438202381134, "logps/chosen": -358.1176452636719, "logps/rejected": -417.9815368652344, "loss": 0.5613, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5859254598617554, "rewards/margins": 0.5456986427307129, "rewards/rejected": -2.1316239833831787, "step": 840 }, { "epoch": 0.49, "grad_norm": 31.399001717422586, "learning_rate": 4.939458808249102e-07, "logits/chosen": -0.5986772775650024, "logits/rejected": -0.5485242009162903, "logps/chosen": -354.3177185058594, "logps/rejected": -432.52130126953125, "loss": 0.5279, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7066242694854736, "rewards/margins": 0.768088698387146, "rewards/rejected": -2.4747133255004883, "step": 850 }, { "epoch": 0.5, "grad_norm": 26.840206379152985, "learning_rate": 4.935738647694632e-07, "logits/chosen": -0.5229077339172363, "logits/rejected": -0.48687559366226196, "logps/chosen": -397.5639953613281, "logps/rejected": -465.96490478515625, "loss": 0.5264, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.9964309930801392, "rewards/margins": 0.8044846653938293, "rewards/rejected": -2.800915479660034, "step": 860 }, { "epoch": 0.5, "grad_norm": 28.876078206455308, "learning_rate": 4.931909056510169e-07, "logits/chosen": -1.0202022790908813, "logits/rejected": -1.041042685508728, "logps/chosen": -408.78125, "logps/rejected": -470.0559997558594, "loss": 0.5555, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.3556597232818604, "rewards/margins": 0.5206300020217896, "rewards/rejected": -2.8762896060943604, "step": 870 }, { "epoch": 0.51, "grad_norm": 29.981362437553457, "learning_rate": 4.927970206748066e-07, "logits/chosen": -1.791886329650879, "logits/rejected": -1.7506214380264282, "logps/chosen": -423.5604553222656, "logps/rejected": -491.63299560546875, "loss": 0.5737, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1132404804229736, "rewards/margins": 0.6096027493476868, "rewards/rejected": -2.7228426933288574, "step": 880 }, { "epoch": 0.51, "grad_norm": 36.744747343915186, "learning_rate": 4.923922275369353e-07, "logits/chosen": -2.036400556564331, "logits/rejected": -2.0184805393218994, "logps/chosen": -389.59637451171875, "logps/rejected": -449.41937255859375, "loss": 0.5621, "rewards/accuracies": 0.71875, "rewards/chosen": -1.729682207107544, "rewards/margins": 0.7165164947509766, "rewards/rejected": -2.4461987018585205, "step": 890 }, { "epoch": 0.52, "grad_norm": 29.705964987034406, "learning_rate": 4.919765444235771e-07, "logits/chosen": -2.1174654960632324, "logits/rejected": -2.0052731037139893, "logps/chosen": -373.59271240234375, "logps/rejected": -452.1903381347656, "loss": 0.5477, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7548086643218994, "rewards/margins": 0.7076038122177124, "rewards/rejected": -2.4624123573303223, "step": 900 }, { "epoch": 0.52, "eval_logits/chosen": -1.9477951526641846, "eval_logits/rejected": -1.8589999675750732, "eval_logps/chosen": -381.2916564941406, "eval_logps/rejected": -447.4022521972656, "eval_loss": 0.5613307952880859, "eval_rewards/accuracies": 0.7243150472640991, "eval_rewards/chosen": -2.185349464416504, "eval_rewards/margins": 0.5854070782661438, "eval_rewards/rejected": -2.770756721496582, "eval_runtime": 529.5207, "eval_samples_per_second": 13.22, "eval_steps_per_second": 0.414, "step": 900 }, { "epoch": 0.52, "grad_norm": 23.999569322440216, "learning_rate": 4.915499900101616e-07, "logits/chosen": -2.120882511138916, "logits/rejected": -2.0439200401306152, "logps/chosen": -358.4585266113281, "logps/rejected": -416.47601318359375, "loss": 0.5373, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.810420274734497, "rewards/margins": 0.5502551794052124, "rewards/rejected": -2.36067533493042, "step": 910 }, { "epoch": 0.53, "grad_norm": 32.69801009914847, "learning_rate": 4.911125834605339e-07, "logits/chosen": -2.153623342514038, "logits/rejected": -2.1145966053009033, "logps/chosen": -352.338134765625, "logps/rejected": -408.64788818359375, "loss": 0.5372, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7173570394515991, "rewards/margins": 0.5304625034332275, "rewards/rejected": -2.247819423675537, "step": 920 }, { "epoch": 0.54, "grad_norm": 27.974487350214723, "learning_rate": 4.906643444260938e-07, "logits/chosen": -2.097022533416748, "logits/rejected": -2.130666494369507, "logps/chosen": -389.94354248046875, "logps/rejected": -466.57647705078125, "loss": 0.5534, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7603232860565186, "rewards/margins": 0.7569082379341125, "rewards/rejected": -2.5172314643859863, "step": 930 }, { "epoch": 0.54, "grad_norm": 28.210659452239277, "learning_rate": 4.902052930449134e-07, "logits/chosen": -2.11311936378479, "logits/rejected": -2.0596041679382324, "logps/chosen": -436.7586975097656, "logps/rejected": -500.83099365234375, "loss": 0.5431, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.0200068950653076, "rewards/margins": 0.7636159658432007, "rewards/rejected": -2.783622980117798, "step": 940 }, { "epoch": 0.55, "grad_norm": 24.141591152145743, "learning_rate": 4.897354499408315e-07, "logits/chosen": -1.9489428997039795, "logits/rejected": -1.847245216369629, "logps/chosen": -382.7989196777344, "logps/rejected": -452.16021728515625, "loss": 0.594, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.1342132091522217, "rewards/margins": 0.5092564225196838, "rewards/rejected": -2.6434695720672607, "step": 950 }, { "epoch": 0.55, "grad_norm": 500.51104620692496, "learning_rate": 4.892548362225279e-07, "logits/chosen": -2.029165506362915, "logits/rejected": -1.8226451873779297, "logps/chosen": -397.0739440917969, "logps/rejected": -438.6582946777344, "loss": 0.5813, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.995446801185608, "rewards/margins": 0.5019537210464478, "rewards/rejected": -2.4974002838134766, "step": 960 }, { "epoch": 0.56, "grad_norm": 25.491055211852228, "learning_rate": 4.887634734825745e-07, "logits/chosen": -2.116440534591675, "logits/rejected": -2.029068946838379, "logps/chosen": -380.0238037109375, "logps/rejected": -430.4603576660156, "loss": 0.5596, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.578967809677124, "rewards/margins": 0.6902409791946411, "rewards/rejected": -2.2692086696624756, "step": 970 }, { "epoch": 0.56, "grad_norm": 33.507944595313134, "learning_rate": 4.882613837964654e-07, "logits/chosen": -1.9031013250350952, "logits/rejected": -1.9423463344573975, "logps/chosen": -378.9342956542969, "logps/rejected": -451.38604736328125, "loss": 0.5589, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7247003316879272, "rewards/margins": 0.6382731199264526, "rewards/rejected": -2.36297345161438, "step": 980 }, { "epoch": 0.57, "grad_norm": 22.851685136495238, "learning_rate": 4.877485897216252e-07, "logits/chosen": -1.9028632640838623, "logits/rejected": -1.821277379989624, "logps/chosen": -398.2716369628906, "logps/rejected": -445.14031982421875, "loss": 0.5581, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.9991661310195923, "rewards/margins": 0.5132952928543091, "rewards/rejected": -2.5124616622924805, "step": 990 }, { "epoch": 0.58, "grad_norm": 33.47895396727276, "learning_rate": 4.872251142963954e-07, "logits/chosen": -1.9217174053192139, "logits/rejected": -1.8421947956085205, "logps/chosen": -427.9972229003906, "logps/rejected": -493.62872314453125, "loss": 0.5136, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.9277302026748657, "rewards/margins": 0.8349655866622925, "rewards/rejected": -2.762695789337158, "step": 1000 }, { "epoch": 0.58, "eval_logits/chosen": -1.717362403869629, "eval_logits/rejected": -1.6491066217422485, "eval_logps/chosen": -380.7241516113281, "eval_logps/rejected": -457.3544616699219, "eval_loss": 0.5532964468002319, "eval_rewards/accuracies": 0.7226027250289917, "eval_rewards/chosen": -2.1796743869781494, "eval_rewards/margins": 0.6906039714813232, "eval_rewards/rejected": -2.8702785968780518, "eval_runtime": 533.1813, "eval_samples_per_second": 13.129, "eval_steps_per_second": 0.411, "step": 1000 }, { "epoch": 0.58, "grad_norm": 41.86993938311435, "learning_rate": 4.866909810389991e-07, "logits/chosen": -1.8943265676498413, "logits/rejected": -1.7857849597930908, "logps/chosen": -399.3082580566406, "logps/rejected": -493.13299560546875, "loss": 0.5326, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8045670986175537, "rewards/margins": 0.9120405912399292, "rewards/rejected": -2.7166075706481934, "step": 1010 }, { "epoch": 0.59, "grad_norm": 35.30276113247114, "learning_rate": 4.861462139464852e-07, "logits/chosen": -1.8069865703582764, "logits/rejected": -1.601157784461975, "logps/chosen": -425.1826171875, "logps/rejected": -471.6793518066406, "loss": 0.532, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.03745174407959, "rewards/margins": 0.8287002444267273, "rewards/rejected": -2.866152286529541, "step": 1020 }, { "epoch": 0.59, "grad_norm": 27.128092999299874, "learning_rate": 4.855908374936494e-07, "logits/chosen": -1.6120271682739258, "logits/rejected": -1.4727498292922974, "logps/chosen": -400.17022705078125, "logps/rejected": -472.9214782714844, "loss": 0.5397, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.922376275062561, "rewards/margins": 0.8765281438827515, "rewards/rejected": -2.7989044189453125, "step": 1030 }, { "epoch": 0.6, "grad_norm": 38.248629229699695, "learning_rate": 4.850248766319352e-07, "logits/chosen": -1.5909291505813599, "logits/rejected": -1.54898202419281, "logps/chosen": -377.24151611328125, "logps/rejected": -437.99664306640625, "loss": 0.5866, "rewards/accuracies": 0.65625, "rewards/chosen": -1.8620752096176147, "rewards/margins": 0.6017982959747314, "rewards/rejected": -2.4638731479644775, "step": 1040 }, { "epoch": 0.6, "grad_norm": 22.54771457644784, "learning_rate": 4.844483567883129e-07, "logits/chosen": -1.6205940246582031, "logits/rejected": -1.5349860191345215, "logps/chosen": -369.62982177734375, "logps/rejected": -428.35498046875, "loss": 0.5349, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6927416324615479, "rewards/margins": 0.7469199299812317, "rewards/rejected": -2.4396615028381348, "step": 1050 }, { "epoch": 0.61, "grad_norm": 30.280561604674833, "learning_rate": 4.838613038641363e-07, "logits/chosen": -1.4774177074432373, "logits/rejected": -1.4157425165176392, "logps/chosen": -383.84716796875, "logps/rejected": -452.05572509765625, "loss": 0.5462, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6795934438705444, "rewards/margins": 0.7229850888252258, "rewards/rejected": -2.402578353881836, "step": 1060 }, { "epoch": 0.62, "grad_norm": 44.480626947823716, "learning_rate": 4.832637442339807e-07, "logits/chosen": -1.2902618646621704, "logits/rejected": -1.140547752380371, "logps/chosen": -405.575439453125, "logps/rejected": -489.24755859375, "loss": 0.5352, "rewards/accuracies": 0.6875, "rewards/chosen": -2.031604290008545, "rewards/margins": 0.7442789673805237, "rewards/rejected": -2.775883197784424, "step": 1070 }, { "epoch": 0.62, "grad_norm": 33.8561942118734, "learning_rate": 4.826557047444563e-07, "logits/chosen": -1.3703272342681885, "logits/rejected": -1.2579128742218018, "logps/chosen": -387.44903564453125, "logps/rejected": -465.34991455078125, "loss": 0.5083, "rewards/accuracies": 0.71875, "rewards/chosen": -1.984887719154358, "rewards/margins": 0.909730076789856, "rewards/rejected": -2.894617795944214, "step": 1080 }, { "epoch": 0.63, "grad_norm": 24.807675157651428, "learning_rate": 4.820372127130033e-07, "logits/chosen": -1.6552495956420898, "logits/rejected": -1.567814588546753, "logps/chosen": -364.27783203125, "logps/rejected": -410.457763671875, "loss": 0.5756, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5754263401031494, "rewards/margins": 0.7061596512794495, "rewards/rejected": -2.281586170196533, "step": 1090 }, { "epoch": 0.63, "grad_norm": 36.85917676427359, "learning_rate": 4.81408295926664e-07, "logits/chosen": -1.6882715225219727, "logits/rejected": -1.5790865421295166, "logps/chosen": -341.84588623046875, "logps/rejected": -383.62445068359375, "loss": 0.5555, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3350101709365845, "rewards/margins": 0.7174500226974487, "rewards/rejected": -2.052460193634033, "step": 1100 }, { "epoch": 0.63, "eval_logits/chosen": -1.5565022230148315, "eval_logits/rejected": -1.5554715394973755, "eval_logps/chosen": -329.304931640625, "eval_logps/rejected": -395.4941101074219, "eval_loss": 0.5572943091392517, "eval_rewards/accuracies": 0.715753436088562, "eval_rewards/chosen": -1.6654820442199707, "eval_rewards/margins": 0.5861930251121521, "eval_rewards/rejected": -2.2516751289367676, "eval_runtime": 529.4696, "eval_samples_per_second": 13.221, "eval_steps_per_second": 0.414, "step": 1100 }, { "epoch": 0.64, "grad_norm": 22.522837769506296, "learning_rate": 4.807689826408344e-07, "logits/chosen": -1.492133617401123, "logits/rejected": -1.3830538988113403, "logps/chosen": -366.74603271484375, "logps/rejected": -421.2555236816406, "loss": 0.5273, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5595042705535889, "rewards/margins": 0.7607065439224243, "rewards/rejected": -2.3202106952667236, "step": 1110 }, { "epoch": 0.64, "grad_norm": 30.87003001015122, "learning_rate": 4.801193015779946e-07, "logits/chosen": -0.9767690896987915, "logits/rejected": -0.9523940086364746, "logps/chosen": -455.9415588378906, "logps/rejected": -527.2810668945312, "loss": 0.5541, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.3384251594543457, "rewards/margins": 0.7473399639129639, "rewards/rejected": -3.0857648849487305, "step": 1120 }, { "epoch": 0.65, "grad_norm": 38.939790058967176, "learning_rate": 4.794592819264193e-07, "logits/chosen": -0.977313220500946, "logits/rejected": -0.7740909457206726, "logps/chosen": -411.4134826660156, "logps/rejected": -508.5184631347656, "loss": 0.5322, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.108002185821533, "rewards/margins": 0.8991624712944031, "rewards/rejected": -3.007164716720581, "step": 1130 }, { "epoch": 0.66, "grad_norm": 39.006855739608525, "learning_rate": 4.787889533388657e-07, "logits/chosen": -1.0269806385040283, "logits/rejected": -0.8434581756591797, "logps/chosen": -433.4561462402344, "logps/rejected": -525.0525512695312, "loss": 0.5186, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.3027870655059814, "rewards/margins": 0.8387452960014343, "rewards/rejected": -3.1415321826934814, "step": 1140 }, { "epoch": 0.66, "grad_norm": 41.21680917128163, "learning_rate": 4.781083459312408e-07, "logits/chosen": -1.0086390972137451, "logits/rejected": -0.7854800820350647, "logps/chosen": -381.638916015625, "logps/rejected": -460.299072265625, "loss": 0.5218, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.8460394144058228, "rewards/margins": 0.8829470872879028, "rewards/rejected": -2.7289862632751465, "step": 1150 }, { "epoch": 0.67, "grad_norm": 30.450209762528484, "learning_rate": 4.774174902812498e-07, "logits/chosen": -0.7466944456100464, "logits/rejected": -0.6347376108169556, "logps/chosen": -405.288330078125, "logps/rejected": -481.5003356933594, "loss": 0.5255, "rewards/accuracies": 0.75, "rewards/chosen": -2.108858108520508, "rewards/margins": 0.8705360293388367, "rewards/rejected": -2.97939395904541, "step": 1160 }, { "epoch": 0.67, "grad_norm": 32.268711565091685, "learning_rate": 4.767164174270208e-07, "logits/chosen": -0.45890140533447266, "logits/rejected": -0.4183754026889801, "logps/chosen": -406.1358337402344, "logps/rejected": -504.0204162597656, "loss": 0.5536, "rewards/accuracies": 0.71875, "rewards/chosen": -1.8925126791000366, "rewards/margins": 0.8980854749679565, "rewards/rejected": -2.7905983924865723, "step": 1170 }, { "epoch": 0.68, "grad_norm": 26.86226178195377, "learning_rate": 4.7600515886571166e-07, "logits/chosen": -0.6243902444839478, "logits/rejected": -0.586922824382782, "logps/chosen": -398.369873046875, "logps/rejected": -473.0923767089844, "loss": 0.5638, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.052614450454712, "rewards/margins": 0.7696317434310913, "rewards/rejected": -2.8222460746765137, "step": 1180 }, { "epoch": 0.69, "grad_norm": 25.407947648577775, "learning_rate": 4.75283746552094e-07, "logits/chosen": -0.6037258505821228, "logits/rejected": -0.5371923446655273, "logps/chosen": -335.8143005371094, "logps/rejected": -384.31915283203125, "loss": 0.5275, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6792389154434204, "rewards/margins": 0.5704421401023865, "rewards/rejected": -2.249680995941162, "step": 1190 }, { "epoch": 0.69, "grad_norm": 30.919598076927073, "learning_rate": 4.745522128971181e-07, "logits/chosen": -0.3959011435508728, "logits/rejected": -0.34749776124954224, "logps/chosen": -376.80841064453125, "logps/rejected": -466.9195251464844, "loss": 0.5044, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7358510494232178, "rewards/margins": 0.6912168264389038, "rewards/rejected": -2.427067995071411, "step": 1200 }, { "epoch": 0.69, "eval_logits/chosen": 0.500892162322998, "eval_logits/rejected": 0.49330607056617737, "eval_logps/chosen": -421.9445495605469, "eval_logps/rejected": -506.9478454589844, "eval_loss": 0.5457122325897217, "eval_rewards/accuracies": 0.7203196287155151, "eval_rewards/chosen": -2.591878652572632, "eval_rewards/margins": 0.7743338942527771, "eval_rewards/rejected": -3.3662126064300537, "eval_runtime": 534.3348, "eval_samples_per_second": 13.1, "eval_steps_per_second": 0.41, "step": 1200 }, { "epoch": 0.7, "grad_norm": 35.57982520102404, "learning_rate": 4.738105907664564e-07, "logits/chosen": 0.7141098380088806, "logits/rejected": 1.0528442859649658, "logps/chosen": -418.5242614746094, "logps/rejected": -531.0250244140625, "loss": 0.4989, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.3081347942352295, "rewards/margins": 1.1190835237503052, "rewards/rejected": -3.427218198776245, "step": 1210 }, { "epoch": 0.7, "grad_norm": 30.643950101647338, "learning_rate": 4.730589134790272e-07, "logits/chosen": 0.8440307378768921, "logits/rejected": 1.0693213939666748, "logps/chosen": -466.4752502441406, "logps/rejected": -534.8712158203125, "loss": 0.5225, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.5758888721466064, "rewards/margins": 0.8178550601005554, "rewards/rejected": -3.3937439918518066, "step": 1220 }, { "epoch": 0.71, "grad_norm": 36.42303884332228, "learning_rate": 4.7229721480549765e-07, "logits/chosen": 0.42027491331100464, "logits/rejected": 0.4065718650817871, "logps/chosen": -453.0296936035156, "logps/rejected": -523.9437255859375, "loss": 0.5404, "rewards/accuracies": 0.71875, "rewards/chosen": -2.3087098598480225, "rewards/margins": 0.7815896272659302, "rewards/rejected": -3.0902998447418213, "step": 1230 }, { "epoch": 0.71, "grad_norm": 34.4780558427113, "learning_rate": 4.715255289667665e-07, "logits/chosen": 0.3246513903141022, "logits/rejected": 0.25698739290237427, "logps/chosen": -371.596923828125, "logps/rejected": -461.18243408203125, "loss": 0.5186, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8430531024932861, "rewards/margins": 0.8264444470405579, "rewards/rejected": -2.669497489929199, "step": 1240 }, { "epoch": 0.72, "grad_norm": 35.10279004413756, "learning_rate": 4.707438906324267e-07, "logits/chosen": 0.3863239884376526, "logits/rejected": 0.4440188407897949, "logps/chosen": -395.8504638671875, "logps/rejected": -487.3111877441406, "loss": 0.5379, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.2548255920410156, "rewards/margins": 0.7340821623802185, "rewards/rejected": -2.9889075756073, "step": 1250 }, { "epoch": 0.73, "grad_norm": 24.70818306021913, "learning_rate": 4.6995233491920793e-07, "logits/chosen": 0.24182286858558655, "logits/rejected": 0.2533157467842102, "logps/chosen": -419.7149353027344, "logps/rejected": -486.6378479003906, "loss": 0.4925, "rewards/accuracies": 0.75, "rewards/chosen": -2.066789150238037, "rewards/margins": 0.8948569297790527, "rewards/rejected": -2.9616458415985107, "step": 1260 }, { "epoch": 0.73, "grad_norm": 30.2912698661642, "learning_rate": 4.6915089738939853e-07, "logits/chosen": 0.46339306235313416, "logits/rejected": 0.6178001165390015, "logps/chosen": -441.19927978515625, "logps/rejected": -537.18896484375, "loss": 0.5228, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.4666507244110107, "rewards/margins": 0.9494393467903137, "rewards/rejected": -3.4160900115966797, "step": 1270 }, { "epoch": 0.74, "grad_norm": 67.36951634153075, "learning_rate": 4.683396140492481e-07, "logits/chosen": 0.720579206943512, "logits/rejected": 0.8675535917282104, "logps/chosen": -476.9345703125, "logps/rejected": -572.6681518554688, "loss": 0.5393, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.6376290321350098, "rewards/margins": 1.0271899700164795, "rewards/rejected": -3.6648192405700684, "step": 1280 }, { "epoch": 0.74, "grad_norm": 29.833772945487596, "learning_rate": 4.6751852134734987e-07, "logits/chosen": 0.5596036911010742, "logits/rejected": 0.6065183281898499, "logps/chosen": -426.705810546875, "logps/rejected": -502.31988525390625, "loss": 0.5323, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.305546522140503, "rewards/margins": 0.8232072591781616, "rewards/rejected": -3.128753662109375, "step": 1290 }, { "epoch": 0.75, "grad_norm": 33.84395465641073, "learning_rate": 4.66687656173003e-07, "logits/chosen": 0.16365781426429749, "logits/rejected": 0.28904014825820923, "logps/chosen": -430.79229736328125, "logps/rejected": -516.2119750976562, "loss": 0.5078, "rewards/accuracies": 0.71875, "rewards/chosen": -2.2299017906188965, "rewards/margins": 0.8503605127334595, "rewards/rejected": -3.0802621841430664, "step": 1300 }, { "epoch": 0.75, "eval_logits/chosen": 0.6094363927841187, "eval_logits/rejected": 0.4823082983493805, "eval_logps/chosen": -399.8520202636719, "eval_logps/rejected": -476.31463623046875, "eval_loss": 0.5505304336547852, "eval_rewards/accuracies": 0.7220319509506226, "eval_rewards/chosen": -2.37095308303833, "eval_rewards/margins": 0.6889274716377258, "eval_rewards/rejected": -3.059880495071411, "eval_runtime": 530.8928, "eval_samples_per_second": 13.185, "eval_steps_per_second": 0.413, "step": 1300 }, { "epoch": 0.75, "grad_norm": 29.52995289392915, "learning_rate": 4.658470558545553e-07, "logits/chosen": 0.5565316081047058, "logits/rejected": 0.4526277482509613, "logps/chosen": -406.7618103027344, "logps/rejected": -470.6244201660156, "loss": 0.5468, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.904358148574829, "rewards/margins": 0.5770174264907837, "rewards/rejected": -2.4813754558563232, "step": 1310 }, { "epoch": 0.76, "grad_norm": 27.919941338159607, "learning_rate": 4.6499675815772626e-07, "logits/chosen": 0.9634499549865723, "logits/rejected": 0.8664857149124146, "logps/chosen": -424.68133544921875, "logps/rejected": -537.3968505859375, "loss": 0.5612, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.261639356613159, "rewards/margins": 1.037082314491272, "rewards/rejected": -3.2987217903137207, "step": 1320 }, { "epoch": 0.77, "grad_norm": 25.584838148016555, "learning_rate": 4.641368012839102e-07, "logits/chosen": 1.0630701780319214, "logits/rejected": 0.838936448097229, "logps/chosen": -424.8299255371094, "logps/rejected": -482.51043701171875, "loss": 0.5382, "rewards/accuracies": 0.71875, "rewards/chosen": -2.242565870285034, "rewards/margins": 0.6874292492866516, "rewards/rejected": -2.929995059967041, "step": 1330 }, { "epoch": 0.77, "grad_norm": 27.418224511021567, "learning_rate": 4.632672238684601e-07, "logits/chosen": 1.0494606494903564, "logits/rejected": 0.7880681753158569, "logps/chosen": -388.66326904296875, "logps/rejected": -462.1800842285156, "loss": 0.5542, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.086183786392212, "rewards/margins": 0.5751680135726929, "rewards/rejected": -2.661351442337036, "step": 1340 }, { "epoch": 0.78, "grad_norm": 30.20780894752679, "learning_rate": 4.623880649789519e-07, "logits/chosen": 1.4114108085632324, "logits/rejected": 1.226702094078064, "logps/chosen": -383.88275146484375, "logps/rejected": -450.13201904296875, "loss": 0.5146, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.872510313987732, "rewards/margins": 0.8399595022201538, "rewards/rejected": -2.7124695777893066, "step": 1350 }, { "epoch": 0.78, "grad_norm": 41.99270421206662, "learning_rate": 4.6149936411342906e-07, "logits/chosen": 1.2934401035308838, "logits/rejected": 1.5587232112884521, "logps/chosen": -407.62451171875, "logps/rejected": -499.6255798339844, "loss": 0.5435, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0264687538146973, "rewards/margins": 0.7698057293891907, "rewards/rejected": -2.796274423599243, "step": 1360 }, { "epoch": 0.79, "grad_norm": 32.42565878881667, "learning_rate": 4.606011611986283e-07, "logits/chosen": 0.8485987782478333, "logits/rejected": 0.7325617074966431, "logps/chosen": -376.6981201171875, "logps/rejected": -460.94085693359375, "loss": 0.5682, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.739357352256775, "rewards/margins": 0.719510555267334, "rewards/rejected": -2.4588677883148193, "step": 1370 }, { "epoch": 0.79, "grad_norm": 33.290075286645255, "learning_rate": 4.596934965881857e-07, "logits/chosen": 0.42897891998291016, "logits/rejected": 0.40408191084861755, "logps/chosen": -381.7453308105469, "logps/rejected": -448.359130859375, "loss": 0.5087, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7363965511322021, "rewards/margins": 0.7646786570549011, "rewards/rejected": -2.501075506210327, "step": 1380 }, { "epoch": 0.8, "grad_norm": 33.82474590615408, "learning_rate": 4.587764110608234e-07, "logits/chosen": 1.1191015243530273, "logits/rejected": 0.9889567494392395, "logps/chosen": -397.16839599609375, "logps/rejected": -467.3800354003906, "loss": 0.5484, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.965309500694275, "rewards/margins": 0.8247777819633484, "rewards/rejected": -2.7900872230529785, "step": 1390 }, { "epoch": 0.81, "grad_norm": 24.843940117517658, "learning_rate": 4.5784994581851843e-07, "logits/chosen": 0.7254992723464966, "logits/rejected": 0.8247386813163757, "logps/chosen": -383.87017822265625, "logps/rejected": -456.92608642578125, "loss": 0.5333, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.032332420349121, "rewards/margins": 0.7764937281608582, "rewards/rejected": -2.808826208114624, "step": 1400 }, { "epoch": 0.81, "eval_logits/chosen": 0.6966539621353149, "eval_logits/rejected": 0.579403817653656, "eval_logps/chosen": -399.0350341796875, "eval_logps/rejected": -475.408203125, "eval_loss": 0.5486109256744385, "eval_rewards/accuracies": 0.7174657583236694, "eval_rewards/chosen": -2.3627829551696777, "eval_rewards/margins": 0.6880332231521606, "eval_rewards/rejected": -3.050816059112549, "eval_runtime": 533.8407, "eval_samples_per_second": 13.113, "eval_steps_per_second": 0.41, "step": 1400 }, { "epoch": 0.81, "grad_norm": 33.62223623391476, "learning_rate": 4.5691414248465057e-07, "logits/chosen": 0.9438153505325317, "logits/rejected": 0.9957318305969238, "logps/chosen": -396.3539123535156, "logps/rejected": -486.55609130859375, "loss": 0.5279, "rewards/accuracies": 0.71875, "rewards/chosen": -2.1211135387420654, "rewards/margins": 0.8390033841133118, "rewards/rejected": -2.9601168632507324, "step": 1410 }, { "epoch": 0.82, "grad_norm": 34.96565372879313, "learning_rate": 4.5596904310213285e-07, "logits/chosen": 1.0015530586242676, "logits/rejected": 1.0689165592193604, "logps/chosen": -418.5540466308594, "logps/rejected": -513.3619384765625, "loss": 0.5323, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.401996374130249, "rewards/margins": 0.8496417999267578, "rewards/rejected": -3.2516379356384277, "step": 1420 }, { "epoch": 0.82, "grad_norm": 32.16930985756854, "learning_rate": 4.5501469013152295e-07, "logits/chosen": 0.9291422963142395, "logits/rejected": 0.9986698031425476, "logps/chosen": -421.0863342285156, "logps/rejected": -519.8434448242188, "loss": 0.5321, "rewards/accuracies": 0.71875, "rewards/chosen": -2.0995755195617676, "rewards/margins": 0.8921242952346802, "rewards/rejected": -2.991699695587158, "step": 1430 }, { "epoch": 0.83, "grad_norm": 41.18004914848615, "learning_rate": 4.5405112644911484e-07, "logits/chosen": 0.8801876306533813, "logits/rejected": 1.0016380548477173, "logps/chosen": -423.07733154296875, "logps/rejected": -470.17657470703125, "loss": 0.5584, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.0800178050994873, "rewards/margins": 0.5193462371826172, "rewards/rejected": -2.5993640422821045, "step": 1440 }, { "epoch": 0.84, "grad_norm": 20.720446672132457, "learning_rate": 4.5307839534501333e-07, "logits/chosen": 1.1375744342803955, "logits/rejected": 1.2430613040924072, "logps/chosen": -421.048583984375, "logps/rejected": -501.53265380859375, "loss": 0.5099, "rewards/accuracies": 0.78125, "rewards/chosen": -2.199385404586792, "rewards/margins": 0.8371283411979675, "rewards/rejected": -3.0365140438079834, "step": 1450 }, { "epoch": 0.84, "grad_norm": 32.36279913209666, "learning_rate": 4.5209654052118835e-07, "logits/chosen": 1.4384114742279053, "logits/rejected": 1.60125732421875, "logps/chosen": -456.86956787109375, "logps/rejected": -500.36590576171875, "loss": 0.5401, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.6214048862457275, "rewards/margins": 0.6292562484741211, "rewards/rejected": -3.2506611347198486, "step": 1460 }, { "epoch": 0.85, "grad_norm": 31.900770076656997, "learning_rate": 4.5110560608951213e-07, "logits/chosen": 1.4086377620697021, "logits/rejected": 1.3671926259994507, "logps/chosen": -443.05316162109375, "logps/rejected": -545.9711303710938, "loss": 0.5231, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.5106449127197266, "rewards/margins": 0.9070757627487183, "rewards/rejected": -3.4177207946777344, "step": 1470 }, { "epoch": 0.85, "grad_norm": 28.84917871820926, "learning_rate": 4.501056365697772e-07, "logits/chosen": 0.9801831245422363, "logits/rejected": 1.0283689498901367, "logps/chosen": -470.3106384277344, "logps/rejected": -530.6856079101562, "loss": 0.524, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.5356154441833496, "rewards/margins": 0.7051264047622681, "rewards/rejected": -3.240741729736328, "step": 1480 }, { "epoch": 0.86, "grad_norm": 28.81946740616171, "learning_rate": 4.4909667688769617e-07, "logits/chosen": 0.896990180015564, "logits/rejected": 0.8845464587211609, "logps/chosen": -418.373046875, "logps/rejected": -472.3260803222656, "loss": 0.5504, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.3188891410827637, "rewards/margins": 0.6009450554847717, "rewards/rejected": -2.9198343753814697, "step": 1490 }, { "epoch": 0.86, "grad_norm": 26.426543607275352, "learning_rate": 4.480787723728834e-07, "logits/chosen": 1.1881037950515747, "logits/rejected": 1.1453741788864136, "logps/chosen": -422.7227478027344, "logps/rejected": -535.7371826171875, "loss": 0.4799, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.244492292404175, "rewards/margins": 0.9496955871582031, "rewards/rejected": -3.194187879562378, "step": 1500 }, { "epoch": 0.86, "eval_logits/chosen": 1.3814072608947754, "eval_logits/rejected": 1.2405532598495483, "eval_logps/chosen": -439.3845520019531, "eval_logps/rejected": -527.0655517578125, "eval_loss": 0.5452077984809875, "eval_rewards/accuracies": 0.7380136847496033, "eval_rewards/chosen": -2.7662782669067383, "eval_rewards/margins": 0.801111102104187, "eval_rewards/rejected": -3.5673892498016357, "eval_runtime": 531.6249, "eval_samples_per_second": 13.167, "eval_steps_per_second": 0.412, "step": 1500 }, { "epoch": 0.87, "grad_norm": 52.10313230287511, "learning_rate": 4.470519687568185e-07, "logits/chosen": 1.192626953125, "logits/rejected": 1.322495698928833, "logps/chosen": -434.76336669921875, "logps/rejected": -517.9180297851562, "loss": 0.5536, "rewards/accuracies": 0.6875, "rewards/chosen": -2.3028738498687744, "rewards/margins": 0.9199673533439636, "rewards/rejected": -3.222841262817383, "step": 1510 }, { "epoch": 0.88, "grad_norm": 34.54332070911601, "learning_rate": 4.4601631217079184e-07, "logits/chosen": 0.88127201795578, "logits/rejected": 1.0698552131652832, "logps/chosen": -424.68896484375, "logps/rejected": -489.09747314453125, "loss": 0.5266, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2943062782287598, "rewards/margins": 0.8990936279296875, "rewards/rejected": -3.1933999061584473, "step": 1520 }, { "epoch": 0.88, "grad_norm": 30.174304313992888, "learning_rate": 4.449718491438317e-07, "logits/chosen": 0.8766907453536987, "logits/rejected": 0.8139283061027527, "logps/chosen": -443.4109802246094, "logps/rejected": -516.8344116210938, "loss": 0.5465, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.47041654586792, "rewards/margins": 0.7574352025985718, "rewards/rejected": -3.2278518676757812, "step": 1530 }, { "epoch": 0.89, "grad_norm": 27.83037929859812, "learning_rate": 4.439186266006142e-07, "logits/chosen": 0.5059286952018738, "logits/rejected": 0.5857676267623901, "logps/chosen": -467.68194580078125, "logps/rejected": -492.71282958984375, "loss": 0.5333, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.380493640899658, "rewards/margins": 0.5767788290977478, "rewards/rejected": -2.957272529602051, "step": 1540 }, { "epoch": 0.89, "grad_norm": 25.056068720457393, "learning_rate": 4.4285669185935494e-07, "logits/chosen": 0.566594660282135, "logits/rejected": 0.641537070274353, "logps/chosen": -401.33465576171875, "logps/rejected": -479.97796630859375, "loss": 0.5367, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.186471462249756, "rewards/margins": 0.7661560773849487, "rewards/rejected": -2.952627658843994, "step": 1550 }, { "epoch": 0.9, "grad_norm": 23.72084694775373, "learning_rate": 4.41786092629683e-07, "logits/chosen": 0.6932346224784851, "logits/rejected": 0.7394161224365234, "logps/chosen": -436.6734924316406, "logps/rejected": -483.3221740722656, "loss": 0.515, "rewards/accuracies": 0.71875, "rewards/chosen": -2.156404495239258, "rewards/margins": 0.6597731113433838, "rewards/rejected": -2.8161776065826416, "step": 1560 }, { "epoch": 0.9, "grad_norm": 30.48402197479478, "learning_rate": 4.40706877010498e-07, "logits/chosen": 1.1334117650985718, "logits/rejected": 1.1996811628341675, "logps/chosen": -469.0484924316406, "logps/rejected": -569.8128051757812, "loss": 0.5405, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.65855073928833, "rewards/margins": 0.9022210836410522, "rewards/rejected": -3.56077241897583, "step": 1570 }, { "epoch": 0.91, "grad_norm": 36.446877434573075, "learning_rate": 4.396190934878084e-07, "logits/chosen": 1.5465054512023926, "logits/rejected": 1.7850452661514282, "logps/chosen": -490.85406494140625, "logps/rejected": -562.577392578125, "loss": 0.5261, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.9315357208251953, "rewards/margins": 0.8616039156913757, "rewards/rejected": -3.793139696121216, "step": 1580 }, { "epoch": 0.92, "grad_norm": 26.97697994050041, "learning_rate": 4.3852279093255384e-07, "logits/chosen": 1.333219289779663, "logits/rejected": 1.6035034656524658, "logps/chosen": -479.027587890625, "logps/rejected": -576.1409301757812, "loss": 0.5087, "rewards/accuracies": 0.78125, "rewards/chosen": -2.7792274951934814, "rewards/margins": 1.2061374187469482, "rewards/rejected": -3.9853649139404297, "step": 1590 }, { "epoch": 0.92, "grad_norm": 31.592649368905366, "learning_rate": 4.374180185984091e-07, "logits/chosen": 0.8321776390075684, "logits/rejected": 0.701606810092926, "logps/chosen": -473.61688232421875, "logps/rejected": -575.8402099609375, "loss": 0.5551, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.5903079509735107, "rewards/margins": 1.0294965505599976, "rewards/rejected": -3.619805097579956, "step": 1600 }, { "epoch": 0.92, "eval_logits/chosen": 0.8497821092605591, "eval_logits/rejected": 0.7892265915870667, "eval_logps/chosen": -431.6922912597656, "eval_logps/rejected": -515.7155151367188, "eval_loss": 0.5455161929130554, "eval_rewards/accuracies": 0.732876718044281, "eval_rewards/chosen": -2.6893556118011475, "eval_rewards/margins": 0.7645328044891357, "eval_rewards/rejected": -3.453888177871704, "eval_runtime": 534.7593, "eval_samples_per_second": 13.09, "eval_steps_per_second": 0.41, "step": 1600 }, { "epoch": 0.93, "grad_norm": 28.37824899120431, "learning_rate": 4.3630482611957135e-07, "logits/chosen": 0.6677711606025696, "logits/rejected": 0.7078309655189514, "logps/chosen": -440.38287353515625, "logps/rejected": -516.6839599609375, "loss": 0.5343, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.5657265186309814, "rewards/margins": 0.7193098664283752, "rewards/rejected": -3.285036087036133, "step": 1610 }, { "epoch": 0.93, "grad_norm": 40.46324248491287, "learning_rate": 4.351832635085306e-07, "logits/chosen": 0.9803969264030457, "logits/rejected": 1.2391095161437988, "logps/chosen": -492.634765625, "logps/rejected": -562.9388427734375, "loss": 0.5362, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.7479710578918457, "rewards/margins": 0.943402886390686, "rewards/rejected": -3.691373825073242, "step": 1620 }, { "epoch": 0.94, "grad_norm": 25.157420051876127, "learning_rate": 4.3405338115382206e-07, "logits/chosen": 0.7604900002479553, "logits/rejected": 0.8891085386276245, "logps/chosen": -506.68072509765625, "logps/rejected": -586.2363891601562, "loss": 0.5299, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.8814549446105957, "rewards/margins": 0.9060016870498657, "rewards/rejected": -3.787456512451172, "step": 1630 }, { "epoch": 0.94, "grad_norm": 37.48800739154778, "learning_rate": 4.329152298177631e-07, "logits/chosen": 0.6950697302818298, "logits/rejected": 0.7056332230567932, "logps/chosen": -503.376220703125, "logps/rejected": -565.3928833007812, "loss": 0.5478, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.7201149463653564, "rewards/margins": 0.6453174352645874, "rewards/rejected": -3.3654322624206543, "step": 1640 }, { "epoch": 0.95, "grad_norm": 34.07224961926381, "learning_rate": 4.31768860634172e-07, "logits/chosen": 0.7638389468193054, "logits/rejected": 0.8961465954780579, "logps/chosen": -462.140625, "logps/rejected": -522.7424926757812, "loss": 0.5234, "rewards/accuracies": 0.75, "rewards/chosen": -2.581357955932617, "rewards/margins": 0.6950909495353699, "rewards/rejected": -3.2764487266540527, "step": 1650 }, { "epoch": 0.96, "grad_norm": 38.18259494942954, "learning_rate": 4.306143251060712e-07, "logits/chosen": 0.7471806406974792, "logits/rejected": 0.8691298365592957, "logps/chosen": -477.937744140625, "logps/rejected": -537.0381469726562, "loss": 0.5121, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.60550594329834, "rewards/margins": 0.833734393119812, "rewards/rejected": -3.4392402172088623, "step": 1660 }, { "epoch": 0.96, "grad_norm": 38.37242965595063, "learning_rate": 4.29451675103373e-07, "logits/chosen": 1.2500221729278564, "logits/rejected": 1.3068102598190308, "logps/chosen": -521.4398803710938, "logps/rejected": -618.3226318359375, "loss": 0.4989, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.14654803276062, "rewards/margins": 1.0083508491516113, "rewards/rejected": -4.154898643493652, "step": 1670 }, { "epoch": 0.97, "grad_norm": 36.78370041351263, "learning_rate": 4.282809628605495e-07, "logits/chosen": 1.7802565097808838, "logits/rejected": 2.1717264652252197, "logps/chosen": -530.1058959960938, "logps/rejected": -631.2371826171875, "loss": 0.4707, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.3198955059051514, "rewards/margins": 0.9894178509712219, "rewards/rejected": -4.3093132972717285, "step": 1680 }, { "epoch": 0.97, "grad_norm": 24.645444077485443, "learning_rate": 4.271022409742856e-07, "logits/chosen": 2.2679412364959717, "logits/rejected": 2.3735575675964355, "logps/chosen": -556.3650512695312, "logps/rejected": -627.4823608398438, "loss": 0.521, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -3.44758677482605, "rewards/margins": 0.9135257005691528, "rewards/rejected": -4.361112117767334, "step": 1690 }, { "epoch": 0.98, "grad_norm": 24.23564969363943, "learning_rate": 4.2591556240111614e-07, "logits/chosen": 2.104585886001587, "logits/rejected": 2.323397636413574, "logps/chosen": -510.76385498046875, "logps/rejected": -601.5013427734375, "loss": 0.4911, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.050355911254883, "rewards/margins": 1.0548977851867676, "rewards/rejected": -4.105254173278809, "step": 1700 }, { "epoch": 0.98, "eval_logits/chosen": 2.262242078781128, "eval_logits/rejected": 2.3144001960754395, "eval_logps/chosen": -495.8297424316406, "eval_logps/rejected": -587.16357421875, "eval_loss": 0.5509106516838074, "eval_rewards/accuracies": 0.7300228476524353, "eval_rewards/chosen": -3.3307297229766846, "eval_rewards/margins": 0.8376395106315613, "eval_rewards/rejected": -4.168369770050049, "eval_runtime": 524.1571, "eval_samples_per_second": 13.355, "eval_steps_per_second": 0.418, "step": 1700 }, { "epoch": 0.98, "grad_norm": 30.099887684226196, "learning_rate": 4.2472098045504676e-07, "logits/chosen": 1.8631641864776611, "logits/rejected": 1.8226597309112549, "logps/chosen": -481.734619140625, "logps/rejected": -559.9337768554688, "loss": 0.5406, "rewards/accuracies": 0.6875, "rewards/chosen": -2.8809375762939453, "rewards/margins": 0.666104793548584, "rewards/rejected": -3.5470423698425293, "step": 1710 }, { "epoch": 0.99, "grad_norm": 27.25031391610716, "learning_rate": 4.235185488051585e-07, "logits/chosen": 1.398980736732483, "logits/rejected": 1.486722469329834, "logps/chosen": -444.60211181640625, "logps/rejected": -529.7652587890625, "loss": 0.5502, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.599151134490967, "rewards/margins": 0.7241733074188232, "rewards/rejected": -3.323324203491211, "step": 1720 }, { "epoch": 1.0, "grad_norm": 22.606150654215448, "learning_rate": 4.223083214731966e-07, "logits/chosen": 1.072293996810913, "logits/rejected": 1.1955945491790771, "logps/chosen": -440.53265380859375, "logps/rejected": -502.02850341796875, "loss": 0.52, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.5479977130889893, "rewards/margins": 0.6766992211341858, "rewards/rejected": -3.224696636199951, "step": 1730 }, { "epoch": 1.0, "grad_norm": 21.70950542230914, "learning_rate": 4.2109035283114385e-07, "logits/chosen": 0.853971004486084, "logits/rejected": 1.0735433101654053, "logps/chosen": -430.94488525390625, "logps/rejected": -516.1624755859375, "loss": 0.4687, "rewards/accuracies": 0.78125, "rewards/chosen": -2.28427791595459, "rewards/margins": 0.9807289242744446, "rewards/rejected": -3.2650065422058105, "step": 1740 }, { "epoch": 1.01, "grad_norm": 27.145751391622372, "learning_rate": 4.1986469759877727e-07, "logits/chosen": 0.9728590250015259, "logits/rejected": 1.52602219581604, "logps/chosen": -496.41998291015625, "logps/rejected": -653.0174560546875, "loss": 0.353, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.7165207862854004, "rewards/margins": 1.827858567237854, "rewards/rejected": -4.544379234313965, "step": 1750 }, { "epoch": 1.01, "grad_norm": 33.836475970310794, "learning_rate": 4.1863141084121e-07, "logits/chosen": 1.9407678842544556, "logits/rejected": 2.4080090522766113, "logps/chosen": -554.0223388671875, "logps/rejected": -737.36376953125, "loss": 0.3472, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.556842088699341, "rewards/margins": 1.8612630367279053, "rewards/rejected": -5.418105125427246, "step": 1760 }, { "epoch": 1.02, "grad_norm": 23.43448554518606, "learning_rate": 4.1739054796641724e-07, "logits/chosen": 2.19294810295105, "logits/rejected": 2.68664813041687, "logps/chosen": -570.4949951171875, "logps/rejected": -787.9290161132812, "loss": 0.3505, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.6935958862304688, "rewards/margins": 2.1476540565490723, "rewards/rejected": -5.841249942779541, "step": 1770 }, { "epoch": 1.03, "grad_norm": 24.48762447318573, "learning_rate": 4.1614216472274725e-07, "logits/chosen": 1.2772367000579834, "logits/rejected": 1.9447253942489624, "logps/chosen": -483.00457763671875, "logps/rejected": -657.90966796875, "loss": 0.3236, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.896470308303833, "rewards/margins": 1.8307437896728516, "rewards/rejected": -4.727214336395264, "step": 1780 }, { "epoch": 1.03, "grad_norm": 26.4477212836452, "learning_rate": 4.148863171964164e-07, "logits/chosen": 1.4851411581039429, "logits/rejected": 1.8088572025299072, "logps/chosen": -499.3896484375, "logps/rejected": -691.7644653320312, "loss": 0.3511, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.0726377964019775, "rewards/margins": 1.8518154621124268, "rewards/rejected": -4.924452781677246, "step": 1790 }, { "epoch": 1.04, "grad_norm": 24.483963502182153, "learning_rate": 4.1362306180898953e-07, "logits/chosen": 1.869778037071228, "logits/rejected": 2.166045904159546, "logps/chosen": -552.2401123046875, "logps/rejected": -755.9622192382812, "loss": 0.3058, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.5332465171813965, "rewards/margins": 2.0943760871887207, "rewards/rejected": -5.627622604370117, "step": 1800 }, { "epoch": 1.04, "eval_logits/chosen": 2.4412858486175537, "eval_logits/rejected": 2.5171053409576416, "eval_logps/chosen": -620.4401245117188, "eval_logps/rejected": -734.1903686523438, "eval_loss": 0.5704112648963928, "eval_rewards/accuracies": 0.7214611768722534, "eval_rewards/chosen": -4.576834201812744, "eval_rewards/margins": 1.0618023872375488, "eval_rewards/rejected": -5.638637542724609, "eval_runtime": 535.8018, "eval_samples_per_second": 13.065, "eval_steps_per_second": 0.409, "step": 1800 }, { "epoch": 1.04, "grad_norm": 48.84021862605712, "learning_rate": 4.12352455314845e-07, "logits/chosen": 1.958675742149353, "logits/rejected": 2.4410579204559326, "logps/chosen": -660.9749755859375, "logps/rejected": -884.8785400390625, "loss": 0.3246, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.166441440582275, "rewards/margins": 2.301835298538208, "rewards/rejected": -6.4682769775390625, "step": 1810 }, { "epoch": 1.05, "grad_norm": 47.174387132108485, "learning_rate": 4.110745547986249e-07, "logits/chosen": 2.0799813270568848, "logits/rejected": 2.464001417160034, "logps/chosen": -520.8151245117188, "logps/rejected": -770.5751342773438, "loss": 0.3372, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.4119770526885986, "rewards/margins": 2.137289524078369, "rewards/rejected": -5.549267768859863, "step": 1820 }, { "epoch": 1.05, "grad_norm": 36.664903294981954, "learning_rate": 4.097894176726706e-07, "logits/chosen": 1.9449886083602905, "logits/rejected": 2.516695261001587, "logps/chosen": -562.2173461914062, "logps/rejected": -787.1259765625, "loss": 0.3145, "rewards/accuracies": 0.875, "rewards/chosen": -3.6568851470947266, "rewards/margins": 2.275550365447998, "rewards/rejected": -5.932435512542725, "step": 1830 }, { "epoch": 1.06, "grad_norm": 32.46754431906868, "learning_rate": 4.0849710167444327e-07, "logits/chosen": 1.8666200637817383, "logits/rejected": 2.1901090145111084, "logps/chosen": -576.4138793945312, "logps/rejected": -804.4579467773438, "loss": 0.3352, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.5906100273132324, "rewards/margins": 2.225919008255005, "rewards/rejected": -5.816529273986816, "step": 1840 }, { "epoch": 1.07, "grad_norm": 28.827469480541847, "learning_rate": 4.071976648639296e-07, "logits/chosen": 2.364532470703125, "logits/rejected": 2.7935092449188232, "logps/chosen": -556.0269165039062, "logps/rejected": -767.2452392578125, "loss": 0.3259, "rewards/accuracies": 0.8125, "rewards/chosen": -3.8972649574279785, "rewards/margins": 1.9918088912963867, "rewards/rejected": -5.889073848724365, "step": 1850 }, { "epoch": 1.07, "grad_norm": 30.331732357466763, "learning_rate": 4.0589116562103374e-07, "logits/chosen": 2.5906758308410645, "logits/rejected": 2.810918092727661, "logps/chosen": -678.5337524414062, "logps/rejected": -870.5076293945312, "loss": 0.321, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.721530437469482, "rewards/margins": 1.985032320022583, "rewards/rejected": -6.7065629959106445, "step": 1860 }, { "epoch": 1.08, "grad_norm": 76.10559093216678, "learning_rate": 4.045776626429545e-07, "logits/chosen": 2.5308313369750977, "logits/rejected": 3.0142946243286133, "logps/chosen": -713.3955078125, "logps/rejected": -934.3165283203125, "loss": 0.3012, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.968944072723389, "rewards/margins": 2.351652145385742, "rewards/rejected": -7.320596218109131, "step": 1870 }, { "epoch": 1.08, "grad_norm": 48.154716391717265, "learning_rate": 4.032572149415477e-07, "logits/chosen": 1.9607959985733032, "logits/rejected": 2.661827802658081, "logps/chosen": -596.4283447265625, "logps/rejected": -837.1942138671875, "loss": 0.3105, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.957590103149414, "rewards/margins": 2.397327423095703, "rewards/rejected": -6.354917049407959, "step": 1880 }, { "epoch": 1.09, "grad_norm": 54.405285038894945, "learning_rate": 4.019298818406758e-07, "logits/chosen": 1.8610776662826538, "logits/rejected": 2.2727320194244385, "logps/chosen": -590.4713134765625, "logps/rejected": -782.579833984375, "loss": 0.3216, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.8373920917510986, "rewards/margins": 2.1050496101379395, "rewards/rejected": -5.942440986633301, "step": 1890 }, { "epoch": 1.09, "grad_norm": 39.641772311800054, "learning_rate": 4.0059572297354157e-07, "logits/chosen": 1.6991007328033447, "logits/rejected": 2.297924041748047, "logps/chosen": -611.2540283203125, "logps/rejected": -839.2840576171875, "loss": 0.3346, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.093662738800049, "rewards/margins": 2.388913869857788, "rewards/rejected": -6.482576847076416, "step": 1900 }, { "epoch": 1.09, "eval_logits/chosen": 1.9195640087127686, "eval_logits/rejected": 2.03857421875, "eval_logps/chosen": -618.0657348632812, "eval_logps/rejected": -727.3204345703125, "eval_loss": 0.5765285491943359, "eval_rewards/accuracies": 0.7151826620101929, "eval_rewards/chosen": -4.553089141845703, "eval_rewards/margins": 1.0168496370315552, "eval_rewards/rejected": -5.569939136505127, "eval_runtime": 523.6712, "eval_samples_per_second": 13.367, "eval_steps_per_second": 0.418, "step": 1900 }, { "epoch": 1.1, "grad_norm": 40.57093533118065, "learning_rate": 3.9925479828000995e-07, "logits/chosen": 1.698224663734436, "logits/rejected": 2.0287933349609375, "logps/chosen": -605.9989624023438, "logps/rejected": -803.6336669921875, "loss": 0.3212, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.150283336639404, "rewards/margins": 1.8559858798980713, "rewards/rejected": -6.0062689781188965, "step": 1910 }, { "epoch": 1.11, "grad_norm": 35.79476959549997, "learning_rate": 3.9790716800391477e-07, "logits/chosen": 1.4426591396331787, "logits/rejected": 2.1712703704833984, "logps/chosen": -656.7230224609375, "logps/rejected": -880.4617309570312, "loss": 0.3316, "rewards/accuracies": 0.84375, "rewards/chosen": -4.328715801239014, "rewards/margins": 2.351356029510498, "rewards/rejected": -6.6800713539123535, "step": 1920 }, { "epoch": 1.11, "grad_norm": 28.490103567555725, "learning_rate": 3.965528926903518e-07, "logits/chosen": 1.1435489654541016, "logits/rejected": 1.554862380027771, "logps/chosen": -563.4754638671875, "logps/rejected": -783.9634399414062, "loss": 0.3462, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.6701748371124268, "rewards/margins": 1.9637409448623657, "rewards/rejected": -5.633915901184082, "step": 1930 }, { "epoch": 1.12, "grad_norm": 29.634023497552814, "learning_rate": 3.951920331829592e-07, "logits/chosen": 1.0635690689086914, "logits/rejected": 1.3002026081085205, "logps/chosen": -497.9590759277344, "logps/rejected": -700.2932739257812, "loss": 0.2765, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.062175750732422, "rewards/margins": 1.9651362895965576, "rewards/rejected": -5.027312278747559, "step": 1940 }, { "epoch": 1.12, "grad_norm": 60.02269313092139, "learning_rate": 3.938246506211836e-07, "logits/chosen": 1.5466588735580444, "logits/rejected": 1.7413294315338135, "logps/chosen": -611.7237548828125, "logps/rejected": -848.703125, "loss": 0.2868, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.080990314483643, "rewards/margins": 2.290266752243042, "rewards/rejected": -6.371257781982422, "step": 1950 }, { "epoch": 1.13, "grad_norm": 31.606090939529228, "learning_rate": 3.9245080643753377e-07, "logits/chosen": 2.254460334777832, "logits/rejected": 2.7855446338653564, "logps/chosen": -735.8612060546875, "logps/rejected": -972.5032348632812, "loss": 0.3076, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -5.193572998046875, "rewards/margins": 2.5406463146209717, "rewards/rejected": -7.734219551086426, "step": 1960 }, { "epoch": 1.13, "grad_norm": 44.8856334569405, "learning_rate": 3.910705623548197e-07, "logits/chosen": 2.116036891937256, "logits/rejected": 2.580667018890381, "logps/chosen": -720.1099853515625, "logps/rejected": -934.330078125, "loss": 0.3303, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -5.059368133544922, "rewards/margins": 2.3587193489074707, "rewards/rejected": -7.418087005615234, "step": 1970 }, { "epoch": 1.14, "grad_norm": 31.093251887228394, "learning_rate": 3.896839803833806e-07, "logits/chosen": 1.5320136547088623, "logits/rejected": 1.823662519454956, "logps/chosen": -598.6609497070312, "logps/rejected": -789.0852661132812, "loss": 0.3362, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.103881359100342, "rewards/margins": 1.8779538869857788, "rewards/rejected": -5.98183536529541, "step": 1980 }, { "epoch": 1.15, "grad_norm": 29.87137364342935, "learning_rate": 3.8829112281829845e-07, "logits/chosen": 1.4936949014663696, "logits/rejected": 1.9990638494491577, "logps/chosen": -576.7727661132812, "logps/rejected": -779.41796875, "loss": 0.3178, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.87956166267395, "rewards/margins": 2.1487553119659424, "rewards/rejected": -6.028317451477051, "step": 1990 }, { "epoch": 1.15, "grad_norm": 31.592949633755786, "learning_rate": 3.868920522365993e-07, "logits/chosen": 1.4371845722198486, "logits/rejected": 1.9529321193695068, "logps/chosen": -628.4710083007812, "logps/rejected": -846.5579833984375, "loss": 0.3186, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.21450662612915, "rewards/margins": 2.314563512802124, "rewards/rejected": -6.5290703773498535, "step": 2000 }, { "epoch": 1.15, "eval_logits/chosen": 1.7607609033584595, "eval_logits/rejected": 1.8747423887252808, "eval_logps/chosen": -678.9231567382812, "eval_logps/rejected": -794.5490112304688, "eval_loss": 0.5843532681465149, "eval_rewards/accuracies": 0.7140411138534546, "eval_rewards/chosen": -5.161664009094238, "eval_rewards/margins": 1.0805598497390747, "eval_rewards/rejected": -6.242224216461182, "eval_runtime": 534.9545, "eval_samples_per_second": 13.085, "eval_steps_per_second": 0.409, "step": 2000 }, { "epoch": 1.16, "grad_norm": 45.603636818029415, "learning_rate": 3.8548683149444197e-07, "logits/chosen": 1.2295787334442139, "logits/rejected": 1.6226072311401367, "logps/chosen": -657.1011962890625, "logps/rejected": -892.6251220703125, "loss": 0.281, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.348919868469238, "rewards/margins": 2.4515390396118164, "rewards/rejected": -6.800459384918213, "step": 2010 }, { "epoch": 1.16, "grad_norm": 39.96815827692319, "learning_rate": 3.840755237242939e-07, "logits/chosen": 0.8963111042976379, "logits/rejected": 1.2681035995483398, "logps/chosen": -618.8529052734375, "logps/rejected": -836.0001220703125, "loss": 0.3168, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.033407688140869, "rewards/margins": 2.260913133621216, "rewards/rejected": -6.294320583343506, "step": 2020 }, { "epoch": 1.17, "grad_norm": 68.65065816143934, "learning_rate": 3.826581923320951e-07, "logits/chosen": 1.1317006349563599, "logits/rejected": 1.4729664325714111, "logps/chosen": -564.8380737304688, "logps/rejected": -776.9664306640625, "loss": 0.3311, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.7947540283203125, "rewards/margins": 2.165470838546753, "rewards/rejected": -5.9602251052856445, "step": 2030 }, { "epoch": 1.17, "grad_norm": 67.93281685760073, "learning_rate": 3.8123490099440924e-07, "logits/chosen": 0.9724197387695312, "logits/rejected": 1.2723339796066284, "logps/chosen": -638.6416625976562, "logps/rejected": -810.9025268554688, "loss": 0.3453, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.178596019744873, "rewards/margins": 1.806787133216858, "rewards/rejected": -5.9853835105896, "step": 2040 }, { "epoch": 1.18, "grad_norm": 31.097720119034104, "learning_rate": 3.79805713655563e-07, "logits/chosen": 0.9377905130386353, "logits/rejected": 1.288569450378418, "logps/chosen": -573.2774658203125, "logps/rejected": -799.2069091796875, "loss": 0.3167, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.6513943672180176, "rewards/margins": 2.1768927574157715, "rewards/rejected": -5.828286647796631, "step": 2050 }, { "epoch": 1.19, "grad_norm": 36.08840405070444, "learning_rate": 3.783706945247732e-07, "logits/chosen": 1.2912366390228271, "logits/rejected": 1.5637694597244263, "logps/chosen": -593.0203247070312, "logps/rejected": -796.7218627929688, "loss": 0.3334, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.0096235275268555, "rewards/margins": 1.96588134765625, "rewards/rejected": -5.9755048751831055, "step": 2060 }, { "epoch": 1.19, "grad_norm": 48.998515468248385, "learning_rate": 3.769299080732619e-07, "logits/chosen": 1.0989503860473633, "logits/rejected": 1.5163471698760986, "logps/chosen": -634.7422485351562, "logps/rejected": -841.5469970703125, "loss": 0.3307, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.1387248039245605, "rewards/margins": 2.287487506866455, "rewards/rejected": -6.426212310791016, "step": 2070 }, { "epoch": 1.2, "grad_norm": 34.718207017526694, "learning_rate": 3.754834190313603e-07, "logits/chosen": 1.5281040668487549, "logits/rejected": 1.564267873764038, "logps/chosen": -534.3562622070312, "logps/rejected": -743.1126098632812, "loss": 0.3387, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.476851224899292, "rewards/margins": 2.219818115234375, "rewards/rejected": -5.696669578552246, "step": 2080 }, { "epoch": 1.2, "grad_norm": 53.50326588949733, "learning_rate": 3.740312923856002e-07, "logits/chosen": 1.177415370941162, "logits/rejected": 1.9115955829620361, "logps/chosen": -589.4149780273438, "logps/rejected": -756.3972778320312, "loss": 0.3557, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.877960205078125, "rewards/margins": 1.8244924545288086, "rewards/rejected": -5.702451705932617, "step": 2090 }, { "epoch": 1.21, "grad_norm": 43.75224226636051, "learning_rate": 3.725735933757946e-07, "logits/chosen": 1.6777915954589844, "logits/rejected": 1.831566572189331, "logps/chosen": -578.42138671875, "logps/rejected": -775.08740234375, "loss": 0.3032, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.7722866535186768, "rewards/margins": 2.1240692138671875, "rewards/rejected": -5.896355628967285, "step": 2100 }, { "epoch": 1.21, "eval_logits/chosen": 1.9375393390655518, "eval_logits/rejected": 1.8097468614578247, "eval_logps/chosen": -613.7317504882812, "eval_logps/rejected": -726.1541748046875, "eval_loss": 0.5746448636054993, "eval_rewards/accuracies": 0.7254565954208374, "eval_rewards/chosen": -4.5097503662109375, "eval_rewards/margins": 1.0485247373580933, "eval_rewards/rejected": -5.5582756996154785, "eval_runtime": 523.6021, "eval_samples_per_second": 13.369, "eval_steps_per_second": 0.418, "step": 2100 }, { "epoch": 1.22, "grad_norm": 39.54711468009802, "learning_rate": 3.7111038749210615e-07, "logits/chosen": 1.8345085382461548, "logits/rejected": 2.1369950771331787, "logps/chosen": -613.9482421875, "logps/rejected": -821.70654296875, "loss": 0.3231, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.226766586303711, "rewards/margins": 2.071512222290039, "rewards/rejected": -6.29827880859375, "step": 2110 }, { "epoch": 1.22, "grad_norm": 41.7802389633882, "learning_rate": 3.6964174047210597e-07, "logits/chosen": 1.4287443161010742, "logits/rejected": 1.8681352138519287, "logps/chosen": -578.2172241210938, "logps/rejected": -815.9376220703125, "loss": 0.3188, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.7338626384735107, "rewards/margins": 2.2969706058502197, "rewards/rejected": -6.0308332443237305, "step": 2120 }, { "epoch": 1.23, "grad_norm": 43.46746858364706, "learning_rate": 3.6816771829781893e-07, "logits/chosen": 1.486219048500061, "logits/rejected": 1.4401004314422607, "logps/chosen": -599.9180908203125, "logps/rejected": -824.6650390625, "loss": 0.3128, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.903005599975586, "rewards/margins": 2.4457881450653076, "rewards/rejected": -6.348793983459473, "step": 2130 }, { "epoch": 1.23, "grad_norm": 32.261951825459974, "learning_rate": 3.666883871927603e-07, "logits/chosen": 1.280306100845337, "logits/rejected": 1.9898639917373657, "logps/chosen": -575.6622314453125, "logps/rejected": -770.5057373046875, "loss": 0.3312, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.600424289703369, "rewards/margins": 2.1685173511505127, "rewards/rejected": -5.7689409255981445, "step": 2140 }, { "epoch": 1.24, "grad_norm": 42.52590289999548, "learning_rate": 3.652038136189599e-07, "logits/chosen": 1.6559574604034424, "logits/rejected": 1.482783317565918, "logps/chosen": -587.6668701171875, "logps/rejected": -756.7838134765625, "loss": 0.3046, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.8027946949005127, "rewards/margins": 1.9600753784179688, "rewards/rejected": -5.762869834899902, "step": 2150 }, { "epoch": 1.24, "grad_norm": 41.29552848532768, "learning_rate": 3.6371406427397634e-07, "logits/chosen": 1.0592231750488281, "logits/rejected": 1.3463618755340576, "logps/chosen": -591.4105224609375, "logps/rejected": -780.130615234375, "loss": 0.3153, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.668306827545166, "rewards/margins": 2.12784481048584, "rewards/rejected": -5.796151638031006, "step": 2160 }, { "epoch": 1.25, "grad_norm": 44.08349659312553, "learning_rate": 3.622192060879008e-07, "logits/chosen": 1.237251877784729, "logits/rejected": 1.3016383647918701, "logps/chosen": -581.1758422851562, "logps/rejected": -786.3738403320312, "loss": 0.3435, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.8551025390625, "rewards/margins": 2.2383615970611572, "rewards/rejected": -6.0934648513793945, "step": 2170 }, { "epoch": 1.26, "grad_norm": 50.290566743874386, "learning_rate": 3.607193062203495e-07, "logits/chosen": 1.6983779668807983, "logits/rejected": 2.060429096221924, "logps/chosen": -569.6126098632812, "logps/rejected": -779.6986694335938, "loss": 0.3147, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.6916840076446533, "rewards/margins": 2.2418124675750732, "rewards/rejected": -5.933497428894043, "step": 2180 }, { "epoch": 1.26, "grad_norm": 143.26673664674934, "learning_rate": 3.59214432057447e-07, "logits/chosen": 1.9989519119262695, "logits/rejected": 2.2683498859405518, "logps/chosen": -617.5166015625, "logps/rejected": -877.4951171875, "loss": 0.2889, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.250457763671875, "rewards/margins": 2.5301175117492676, "rewards/rejected": -6.780574798583984, "step": 2190 }, { "epoch": 1.27, "grad_norm": 37.11430010222166, "learning_rate": 3.577046512087978e-07, "logits/chosen": 1.7908446788787842, "logits/rejected": 2.1658034324645996, "logps/chosen": -618.9979858398438, "logps/rejected": -857.0452880859375, "loss": 0.3192, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.161375999450684, "rewards/margins": 2.483597993850708, "rewards/rejected": -6.6449737548828125, "step": 2200 }, { "epoch": 1.27, "eval_logits/chosen": 1.6380729675292969, "eval_logits/rejected": 1.4935342073440552, "eval_logps/chosen": -618.5480346679688, "eval_logps/rejected": -732.409912109375, "eval_loss": 0.5697108507156372, "eval_rewards/accuracies": 0.7214611768722534, "eval_rewards/chosen": -4.557913780212402, "eval_rewards/margins": 1.062919020652771, "eval_rewards/rejected": -5.620832443237305, "eval_runtime": 536.9654, "eval_samples_per_second": 13.036, "eval_steps_per_second": 0.408, "step": 2200 }, { "epoch": 1.27, "grad_norm": 33.157921467980366, "learning_rate": 3.561900315044504e-07, "logits/chosen": 1.5319061279296875, "logits/rejected": 1.6983131170272827, "logps/chosen": -591.1768798828125, "logps/rejected": -796.5339965820312, "loss": 0.3342, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.112669944763184, "rewards/margins": 1.9184939861297607, "rewards/rejected": -6.031164646148682, "step": 2210 }, { "epoch": 1.28, "grad_norm": 46.23221712636413, "learning_rate": 3.5467064099184824e-07, "logits/chosen": 1.4129496812820435, "logits/rejected": 1.5509659051895142, "logps/chosen": -597.654541015625, "logps/rejected": -784.99365234375, "loss": 0.343, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.8843836784362793, "rewards/margins": 2.136141300201416, "rewards/rejected": -6.020524978637695, "step": 2220 }, { "epoch": 1.28, "grad_norm": 41.87300015399033, "learning_rate": 3.531465479327735e-07, "logits/chosen": 1.6229451894760132, "logits/rejected": 1.832493543624878, "logps/chosen": -582.9141845703125, "logps/rejected": -778.9899291992188, "loss": 0.3305, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.908311367034912, "rewards/margins": 2.1760025024414062, "rewards/rejected": -6.08431339263916, "step": 2230 }, { "epoch": 1.29, "grad_norm": 50.80717666551823, "learning_rate": 3.516178208002801e-07, "logits/chosen": 1.4920746088027954, "logits/rejected": 1.7889947891235352, "logps/chosen": -610.1861572265625, "logps/rejected": -817.4967041015625, "loss": 0.3619, "rewards/accuracies": 0.875, "rewards/chosen": -3.967106342315674, "rewards/margins": 2.107160806655884, "rewards/rejected": -6.0742669105529785, "step": 2240 }, { "epoch": 1.3, "grad_norm": 50.26160167527689, "learning_rate": 3.500845282756173e-07, "logits/chosen": 1.3496488332748413, "logits/rejected": 1.3950871229171753, "logps/chosen": -556.4750366210938, "logps/rejected": -800.4376220703125, "loss": 0.3351, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.9187893867492676, "rewards/margins": 2.169339656829834, "rewards/rejected": -6.08812952041626, "step": 2250 }, { "epoch": 1.3, "grad_norm": 42.72807127532973, "learning_rate": 3.485467392451441e-07, "logits/chosen": 1.2289178371429443, "logits/rejected": 1.2947728633880615, "logps/chosen": -591.5369873046875, "logps/rejected": -774.4034423828125, "loss": 0.3192, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.998089551925659, "rewards/margins": 1.8467918634414673, "rewards/rejected": -5.844881534576416, "step": 2260 }, { "epoch": 1.31, "grad_norm": 27.73863647122098, "learning_rate": 3.4700452279723436e-07, "logits/chosen": 1.30094313621521, "logits/rejected": 1.5681588649749756, "logps/chosen": -605.1574096679688, "logps/rejected": -841.23046875, "loss": 0.2856, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.081181526184082, "rewards/margins": 2.3216888904571533, "rewards/rejected": -6.402871131896973, "step": 2270 }, { "epoch": 1.31, "grad_norm": 36.795184064956594, "learning_rate": 3.4545794821917294e-07, "logits/chosen": 1.7648903131484985, "logits/rejected": 1.6187632083892822, "logps/chosen": -629.5055541992188, "logps/rejected": -840.4461669921875, "loss": 0.3231, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.354361057281494, "rewards/margins": 2.081214427947998, "rewards/rejected": -6.43557596206665, "step": 2280 }, { "epoch": 1.32, "grad_norm": 33.94251351594815, "learning_rate": 3.439070849940427e-07, "logits/chosen": 1.873849868774414, "logits/rejected": 2.150845766067505, "logps/chosen": -612.2574462890625, "logps/rejected": -835.7185668945312, "loss": 0.3426, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.387389183044434, "rewards/margins": 2.225327491760254, "rewards/rejected": -6.6127166748046875, "step": 2290 }, { "epoch": 1.32, "grad_norm": 32.92797002230817, "learning_rate": 3.423520027976031e-07, "logits/chosen": 1.412274718284607, "logits/rejected": 1.824458360671997, "logps/chosen": -654.5724487304688, "logps/rejected": -916.5109252929688, "loss": 0.3047, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.480207920074463, "rewards/margins": 2.563689708709717, "rewards/rejected": -7.043896675109863, "step": 2300 }, { "epoch": 1.32, "eval_logits/chosen": 2.0879855155944824, "eval_logits/rejected": 1.9596179723739624, "eval_logps/chosen": -696.7005615234375, "eval_logps/rejected": -823.0447387695312, "eval_loss": 0.5830277800559998, "eval_rewards/accuracies": 0.7265982031822205, "eval_rewards/chosen": -5.339437484741211, "eval_rewards/margins": 1.1877434253692627, "eval_rewards/rejected": -6.527180194854736, "eval_runtime": 523.768, "eval_samples_per_second": 13.365, "eval_steps_per_second": 0.418, "step": 2300 }, { "epoch": 1.33, "grad_norm": 70.5925809842385, "learning_rate": 3.407927714951595e-07, "logits/chosen": 2.123220920562744, "logits/rejected": 2.0706071853637695, "logps/chosen": -676.2492065429688, "logps/rejected": -879.76416015625, "loss": 0.3239, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.666895866394043, "rewards/margins": 2.2429466247558594, "rewards/rejected": -6.909843444824219, "step": 2310 }, { "epoch": 1.34, "grad_norm": 46.95676569062402, "learning_rate": 3.392294611384243e-07, "logits/chosen": 1.882577896118164, "logits/rejected": 2.2939114570617676, "logps/chosen": -661.1419067382812, "logps/rejected": -878.6154174804688, "loss": 0.3416, "rewards/accuracies": 0.84375, "rewards/chosen": -4.626810550689697, "rewards/margins": 2.2012743949890137, "rewards/rejected": -6.8280839920043945, "step": 2320 }, { "epoch": 1.34, "grad_norm": 35.668943035817854, "learning_rate": 3.376621419623703e-07, "logits/chosen": 2.363044261932373, "logits/rejected": 2.465853452682495, "logps/chosen": -676.5272216796875, "logps/rejected": -892.6278076171875, "loss": 0.3408, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.866837024688721, "rewards/margins": 2.059001922607422, "rewards/rejected": -6.925838470458984, "step": 2330 }, { "epoch": 1.35, "grad_norm": 41.02233778974841, "learning_rate": 3.3609088438207466e-07, "logits/chosen": 2.0729591846466064, "logits/rejected": 2.500368595123291, "logps/chosen": -631.5442504882812, "logps/rejected": -865.3001098632812, "loss": 0.336, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.331040382385254, "rewards/margins": 2.1503758430480957, "rewards/rejected": -6.48141622543335, "step": 2340 }, { "epoch": 1.35, "grad_norm": 37.19620985674034, "learning_rate": 3.3451575898955553e-07, "logits/chosen": 1.643340826034546, "logits/rejected": 1.9104621410369873, "logps/chosen": -634.7784423828125, "logps/rejected": -838.1502685546875, "loss": 0.3014, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.053469657897949, "rewards/margins": 2.192739248275757, "rewards/rejected": -6.246209621429443, "step": 2350 }, { "epoch": 1.36, "grad_norm": 28.110678376021287, "learning_rate": 3.3293683655060056e-07, "logits/chosen": 1.8788731098175049, "logits/rejected": 2.0477216243743896, "logps/chosen": -608.8865966796875, "logps/rejected": -806.3536376953125, "loss": 0.3071, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.354788780212402, "rewards/margins": 1.87799072265625, "rewards/rejected": -6.2327799797058105, "step": 2360 }, { "epoch": 1.36, "grad_norm": 36.65194126805586, "learning_rate": 3.313541880015877e-07, "logits/chosen": 1.879799246788025, "logits/rejected": 2.1894192695617676, "logps/chosen": -630.0236206054688, "logps/rejected": -897.3093872070312, "loss": 0.3053, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.580197334289551, "rewards/margins": 2.4769797325134277, "rewards/rejected": -7.0571770668029785, "step": 2370 }, { "epoch": 1.37, "grad_norm": 27.83901710397797, "learning_rate": 3.297678844462982e-07, "logits/chosen": 1.9561046361923218, "logits/rejected": 1.8269224166870117, "logps/chosen": -626.4075927734375, "logps/rejected": -827.5401611328125, "loss": 0.327, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.305657863616943, "rewards/margins": 2.173861026763916, "rewards/rejected": -6.479518890380859, "step": 2380 }, { "epoch": 1.38, "grad_norm": 28.83939374483748, "learning_rate": 3.2817799715272184e-07, "logits/chosen": 1.9727920293807983, "logits/rejected": 2.11802339553833, "logps/chosen": -586.7623901367188, "logps/rejected": -809.90380859375, "loss": 0.298, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.09929084777832, "rewards/margins": 2.2557644844055176, "rewards/rejected": -6.355055809020996, "step": 2390 }, { "epoch": 1.38, "grad_norm": 38.18984398435181, "learning_rate": 3.265845975498555e-07, "logits/chosen": 1.8293859958648682, "logits/rejected": 2.148918390274048, "logps/chosen": -604.6348876953125, "logps/rejected": -847.4898681640625, "loss": 0.3109, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.113718032836914, "rewards/margins": 2.5669283866882324, "rewards/rejected": -6.680645942687988, "step": 2400 }, { "epoch": 1.38, "eval_logits/chosen": 2.215569257736206, "eval_logits/rejected": 2.043840169906616, "eval_logps/chosen": -651.5050659179688, "eval_logps/rejected": -773.7960815429688, "eval_loss": 0.579673171043396, "eval_rewards/accuracies": 0.7191780805587769, "eval_rewards/chosen": -4.887484073638916, "eval_rewards/margins": 1.147210717201233, "eval_rewards/rejected": -6.034694671630859, "eval_runtime": 534.5894, "eval_samples_per_second": 13.094, "eval_steps_per_second": 0.41, "step": 2400 }, { "epoch": 1.39, "grad_norm": 30.070404151201014, "learning_rate": 3.249877572244939e-07, "logits/chosen": 1.8672001361846924, "logits/rejected": 2.268122434616089, "logps/chosen": -625.9879760742188, "logps/rejected": -855.7373046875, "loss": 0.3164, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.199160099029541, "rewards/margins": 2.3796563148498535, "rewards/rejected": -6.5788164138793945, "step": 2410 }, { "epoch": 1.39, "grad_norm": 30.3539756123522, "learning_rate": 3.233875479180131e-07, "logits/chosen": 1.6018146276474, "logits/rejected": 1.8064720630645752, "logps/chosen": -616.5755004882812, "logps/rejected": -813.0919799804688, "loss": 0.3454, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.166964054107666, "rewards/margins": 1.914471983909607, "rewards/rejected": -6.081435680389404, "step": 2420 }, { "epoch": 1.4, "grad_norm": 41.216509834895945, "learning_rate": 3.217840415231482e-07, "logits/chosen": 1.0864310264587402, "logits/rejected": 1.7743299007415771, "logps/chosen": -599.7847900390625, "logps/rejected": -779.0838012695312, "loss": 0.347, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.85355806350708, "rewards/margins": 1.8017772436141968, "rewards/rejected": -5.655335426330566, "step": 2430 }, { "epoch": 1.41, "grad_norm": 37.70329321186311, "learning_rate": 3.2017731008076224e-07, "logits/chosen": 1.3526211977005005, "logits/rejected": 1.524337887763977, "logps/chosen": -572.71533203125, "logps/rejected": -785.8888549804688, "loss": 0.3074, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.705867290496826, "rewards/margins": 2.2124104499816895, "rewards/rejected": -5.918278694152832, "step": 2440 }, { "epoch": 1.41, "grad_norm": 33.71898312603193, "learning_rate": 3.185674257766107e-07, "logits/chosen": 1.581162452697754, "logits/rejected": 1.8565731048583984, "logps/chosen": -630.492431640625, "logps/rejected": -807.0277099609375, "loss": 0.3549, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.452383995056152, "rewards/margins": 1.792571783065796, "rewards/rejected": -6.2449564933776855, "step": 2450 }, { "epoch": 1.42, "grad_norm": 37.67796290787722, "learning_rate": 3.169544609380975e-07, "logits/chosen": 1.5396718978881836, "logits/rejected": 1.4584314823150635, "logps/chosen": -627.9415893554688, "logps/rejected": -813.0889892578125, "loss": 0.3158, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.286644458770752, "rewards/margins": 2.0124571323394775, "rewards/rejected": -6.299101829528809, "step": 2460 }, { "epoch": 1.42, "grad_norm": 32.48538308781243, "learning_rate": 3.1533848803102643e-07, "logits/chosen": 1.43734610080719, "logits/rejected": 1.5827052593231201, "logps/chosen": -590.2487182617188, "logps/rejected": -799.57861328125, "loss": 0.3458, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.086043357849121, "rewards/margins": 2.132053852081299, "rewards/rejected": -6.218096733093262, "step": 2470 }, { "epoch": 1.43, "grad_norm": 57.410824334184085, "learning_rate": 3.1371957965634475e-07, "logits/chosen": 1.2114673852920532, "logits/rejected": 1.40522038936615, "logps/chosen": -580.3658447265625, "logps/rejected": -760.5119018554688, "loss": 0.3292, "rewards/accuracies": 0.84375, "rewards/chosen": -3.7364070415496826, "rewards/margins": 2.058490753173828, "rewards/rejected": -5.794898509979248, "step": 2480 }, { "epoch": 1.43, "grad_norm": 31.27574855499795, "learning_rate": 3.120978085468818e-07, "logits/chosen": 1.326228380203247, "logits/rejected": 1.3650546073913574, "logps/chosen": -598.9962158203125, "logps/rejected": -784.859130859375, "loss": 0.3166, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.052596569061279, "rewards/margins": 1.9071400165557861, "rewards/rejected": -5.959735870361328, "step": 2490 }, { "epoch": 1.44, "grad_norm": 48.10544926182724, "learning_rate": 3.104732475640812e-07, "logits/chosen": 1.605948805809021, "logits/rejected": 1.9895013570785522, "logps/chosen": -601.33349609375, "logps/rejected": -811.6580200195312, "loss": 0.3165, "rewards/accuracies": 0.84375, "rewards/chosen": -4.234116554260254, "rewards/margins": 1.9731727838516235, "rewards/rejected": -6.207289218902588, "step": 2500 }, { "epoch": 1.44, "eval_logits/chosen": 1.9232186079025269, "eval_logits/rejected": 1.6852322816848755, "eval_logps/chosen": -647.2462768554688, "eval_logps/rejected": -761.4922485351562, "eval_loss": 0.5703960657119751, "eval_rewards/accuracies": 0.7283105254173279, "eval_rewards/chosen": -4.844895839691162, "eval_rewards/margins": 1.0667606592178345, "eval_rewards/rejected": -5.911656379699707, "eval_runtime": 524.0242, "eval_samples_per_second": 13.358, "eval_steps_per_second": 0.418, "step": 2500 }, { "epoch": 1.45, "grad_norm": 53.02808078471569, "learning_rate": 3.0884596969472753e-07, "logits/chosen": 1.9154891967773438, "logits/rejected": 2.1159121990203857, "logps/chosen": -633.3983154296875, "logps/rejected": -854.7918701171875, "loss": 0.3118, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.537432670593262, "rewards/margins": 2.1566994190216064, "rewards/rejected": -6.6941328048706055, "step": 2510 }, { "epoch": 1.45, "grad_norm": 53.179217436788576, "learning_rate": 3.072160480476671e-07, "logits/chosen": 2.0104787349700928, "logits/rejected": 2.2272541522979736, "logps/chosen": -683.6210327148438, "logps/rejected": -908.9900512695312, "loss": 0.3132, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.788651466369629, "rewards/margins": 2.1851444244384766, "rewards/rejected": -6.973796844482422, "step": 2520 }, { "epoch": 1.46, "grad_norm": 40.306079811386546, "learning_rate": 3.055835558505233e-07, "logits/chosen": 1.4535454511642456, "logits/rejected": 1.4773526191711426, "logps/chosen": -669.595947265625, "logps/rejected": -902.9430541992188, "loss": 0.3588, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.544825553894043, "rewards/margins": 2.3575263023376465, "rewards/rejected": -6.902352333068848, "step": 2530 }, { "epoch": 1.46, "grad_norm": 39.004487743874954, "learning_rate": 3.039485664464071e-07, "logits/chosen": 1.6595430374145508, "logits/rejected": 1.7077264785766602, "logps/chosen": -625.1431274414062, "logps/rejected": -820.0090942382812, "loss": 0.3292, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.120484828948975, "rewards/margins": 2.162618398666382, "rewards/rejected": -6.283102989196777, "step": 2540 }, { "epoch": 1.47, "grad_norm": 29.714947443096005, "learning_rate": 3.023111532906214e-07, "logits/chosen": 1.8319562673568726, "logits/rejected": 1.977224349975586, "logps/chosen": -656.2291870117188, "logps/rejected": -846.8419799804688, "loss": 0.3046, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.283766269683838, "rewards/margins": 2.1574952602386475, "rewards/rejected": -6.441261291503906, "step": 2550 }, { "epoch": 1.47, "grad_norm": 44.57325507877033, "learning_rate": 3.006713899473615e-07, "logits/chosen": 2.2192223072052, "logits/rejected": 2.143354892730713, "logps/chosen": -651.9195556640625, "logps/rejected": -893.9539184570312, "loss": 0.3376, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.640814781188965, "rewards/margins": 2.3999695777893066, "rewards/rejected": -7.040783882141113, "step": 2560 }, { "epoch": 1.48, "grad_norm": 72.14355662553861, "learning_rate": 2.9902935008640956e-07, "logits/chosen": 2.124143123626709, "logits/rejected": 2.879894733428955, "logps/chosen": -695.3282470703125, "logps/rejected": -899.8555908203125, "loss": 0.3174, "rewards/accuracies": 0.84375, "rewards/chosen": -4.697274684906006, "rewards/margins": 2.3545241355895996, "rewards/rejected": -7.051799774169922, "step": 2570 }, { "epoch": 1.49, "grad_norm": 37.858678364970324, "learning_rate": 2.973851074798251e-07, "logits/chosen": 2.3803398609161377, "logits/rejected": 2.692211627960205, "logps/chosen": -702.2596435546875, "logps/rejected": -935.8321533203125, "loss": 0.3548, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.91534948348999, "rewards/margins": 2.413865566253662, "rewards/rejected": -7.329214572906494, "step": 2580 }, { "epoch": 1.49, "grad_norm": 63.46695481274794, "learning_rate": 2.9573873599863055e-07, "logits/chosen": 2.295797824859619, "logits/rejected": 2.1648011207580566, "logps/chosen": -622.8031005859375, "logps/rejected": -825.5576171875, "loss": 0.3684, "rewards/accuracies": 0.84375, "rewards/chosen": -4.3777008056640625, "rewards/margins": 1.879507064819336, "rewards/rejected": -6.257207870483398, "step": 2590 }, { "epoch": 1.5, "grad_norm": 27.77597158452685, "learning_rate": 2.9409030960949237e-07, "logits/chosen": 2.0252740383148193, "logits/rejected": 1.735515832901001, "logps/chosen": -611.2247314453125, "logps/rejected": -808.1098022460938, "loss": 0.321, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.0173821449279785, "rewards/margins": 2.089155912399292, "rewards/rejected": -6.10653829574585, "step": 2600 }, { "epoch": 1.5, "eval_logits/chosen": 2.044396162033081, "eval_logits/rejected": 1.809226632118225, "eval_logps/chosen": -605.1918334960938, "eval_logps/rejected": -708.8523559570312, "eval_loss": 0.5705053210258484, "eval_rewards/accuracies": 0.719748854637146, "eval_rewards/chosen": -4.424351215362549, "eval_rewards/margins": 0.9609061479568481, "eval_rewards/rejected": -5.385257244110107, "eval_runtime": 534.0984, "eval_samples_per_second": 13.106, "eval_steps_per_second": 0.41, "step": 2600 }, { "epoch": 1.5, "grad_norm": 54.64455780968803, "learning_rate": 2.9243990237139833e-07, "logits/chosen": 2.061228036880493, "logits/rejected": 2.065051317214966, "logps/chosen": -602.9407958984375, "logps/rejected": -793.8677978515625, "loss": 0.3193, "rewards/accuracies": 0.84375, "rewards/chosen": -4.021689414978027, "rewards/margins": 1.9177577495574951, "rewards/rejected": -5.939446926116943, "step": 2610 }, { "epoch": 1.51, "grad_norm": 45.45684260024722, "learning_rate": 2.9078758843232965e-07, "logits/chosen": 2.2369582653045654, "logits/rejected": 2.736159563064575, "logps/chosen": -637.3463134765625, "logps/rejected": -869.5232543945312, "loss": 0.3525, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.2675371170043945, "rewards/margins": 2.4316446781158447, "rewards/rejected": -6.69918155670166, "step": 2620 }, { "epoch": 1.51, "grad_norm": 42.87828082161847, "learning_rate": 2.891334420259302e-07, "logits/chosen": 2.316720962524414, "logits/rejected": 2.6986539363861084, "logps/chosen": -640.57861328125, "logps/rejected": -841.64599609375, "loss": 0.3698, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.400420665740967, "rewards/margins": 2.0892693996429443, "rewards/rejected": -6.48969030380249, "step": 2630 }, { "epoch": 1.52, "grad_norm": 32.24545884529666, "learning_rate": 2.874775374681712e-07, "logits/chosen": 2.4638264179229736, "logits/rejected": 2.3759593963623047, "logps/chosen": -668.535400390625, "logps/rejected": -867.4671020507812, "loss": 0.3176, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.493716239929199, "rewards/margins": 2.0932061672210693, "rewards/rejected": -6.586923122406006, "step": 2640 }, { "epoch": 1.53, "grad_norm": 41.52327724481713, "learning_rate": 2.858199491540127e-07, "logits/chosen": 2.4140312671661377, "logits/rejected": 2.101113796234131, "logps/chosen": -657.0660400390625, "logps/rejected": -848.8175659179688, "loss": 0.3263, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.539262294769287, "rewards/margins": 2.0102744102478027, "rewards/rejected": -6.54953670501709, "step": 2650 }, { "epoch": 1.53, "grad_norm": 42.338459189711344, "learning_rate": 2.8416075155406076e-07, "logits/chosen": 2.2348203659057617, "logits/rejected": 2.0736632347106934, "logps/chosen": -626.8009033203125, "logps/rejected": -873.6861572265625, "loss": 0.2888, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.369687557220459, "rewards/margins": 2.120250701904297, "rewards/rejected": -6.489938259124756, "step": 2660 }, { "epoch": 1.54, "grad_norm": 42.36658260188185, "learning_rate": 2.82500019211222e-07, "logits/chosen": 2.402601718902588, "logits/rejected": 2.6431448459625244, "logps/chosen": -674.5459594726562, "logps/rejected": -936.0911865234375, "loss": 0.2816, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.5717949867248535, "rewards/margins": 2.706157684326172, "rewards/rejected": -7.277953147888184, "step": 2670 }, { "epoch": 1.54, "grad_norm": 30.120412898523046, "learning_rate": 2.8083782673735454e-07, "logits/chosen": 2.6986870765686035, "logits/rejected": 2.9595091342926025, "logps/chosen": -669.9313354492188, "logps/rejected": -940.4530029296875, "loss": 0.2884, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.7323503494262695, "rewards/margins": 2.5740668773651123, "rewards/rejected": -7.306416988372803, "step": 2680 }, { "epoch": 1.55, "grad_norm": 63.27039717018376, "learning_rate": 2.791742488099161e-07, "logits/chosen": 2.39888334274292, "logits/rejected": 2.465786933898926, "logps/chosen": -683.6512451171875, "logps/rejected": -920.3508911132812, "loss": 0.2968, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.575619697570801, "rewards/margins": 2.6026134490966797, "rewards/rejected": -7.1782331466674805, "step": 2690 }, { "epoch": 1.55, "grad_norm": 53.48275764762997, "learning_rate": 2.7750936016860853e-07, "logits/chosen": 2.1316702365875244, "logits/rejected": 2.8009414672851562, "logps/chosen": -694.2730712890625, "logps/rejected": -903.9749145507812, "loss": 0.3164, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.696066856384277, "rewards/margins": 2.2753114700317383, "rewards/rejected": -6.971378326416016, "step": 2700 }, { "epoch": 1.55, "eval_logits/chosen": 2.6930806636810303, "eval_logits/rejected": 2.3926303386688232, "eval_logps/chosen": -672.1396484375, "eval_logps/rejected": -788.835205078125, "eval_loss": 0.5779096484184265, "eval_rewards/accuracies": 0.7168949842453003, "eval_rewards/chosen": -5.093829154968262, "eval_rewards/margins": 1.0912566184997559, "eval_rewards/rejected": -6.185085296630859, "eval_runtime": 523.9331, "eval_samples_per_second": 13.36, "eval_steps_per_second": 0.418, "step": 2700 }, { "epoch": 1.56, "grad_norm": 39.577488154611885, "learning_rate": 2.758432356120205e-07, "logits/chosen": 2.409973621368408, "logits/rejected": 2.2249832153320312, "logps/chosen": -669.9813232421875, "logps/rejected": -871.96630859375, "loss": 0.3355, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.623051643371582, "rewards/margins": 2.0220675468444824, "rewards/rejected": -6.645118713378906, "step": 2710 }, { "epoch": 1.57, "grad_norm": 30.230922993296016, "learning_rate": 2.7417594999426655e-07, "logits/chosen": 2.500739812850952, "logits/rejected": 2.3166146278381348, "logps/chosen": -635.5529174804688, "logps/rejected": -835.4622802734375, "loss": 0.3327, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.283183574676514, "rewards/margins": 2.143195629119873, "rewards/rejected": -6.4263787269592285, "step": 2720 }, { "epoch": 1.57, "grad_norm": 28.7567638679217, "learning_rate": 2.725075782216244e-07, "logits/chosen": 2.51102876663208, "logits/rejected": 2.3736655712127686, "logps/chosen": -629.8242797851562, "logps/rejected": -844.6765747070312, "loss": 0.3148, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.400738716125488, "rewards/margins": 2.1390271186828613, "rewards/rejected": -6.53976583480835, "step": 2730 }, { "epoch": 1.58, "grad_norm": 45.04655376990736, "learning_rate": 2.708381952491695e-07, "logits/chosen": 2.2213499546051025, "logits/rejected": 2.7950034141540527, "logps/chosen": -654.0599365234375, "logps/rejected": -916.375, "loss": 0.3218, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.603247165679932, "rewards/margins": 2.3657169342041016, "rewards/rejected": -6.968963623046875, "step": 2740 }, { "epoch": 1.58, "grad_norm": 50.77034331196584, "learning_rate": 2.691678760774076e-07, "logits/chosen": 2.589442729949951, "logits/rejected": 2.7616329193115234, "logps/chosen": -605.7033081054688, "logps/rejected": -833.8020629882812, "loss": 0.3318, "rewards/accuracies": 0.84375, "rewards/chosen": -4.299317359924316, "rewards/margins": 2.1774792671203613, "rewards/rejected": -6.476797580718994, "step": 2750 }, { "epoch": 1.59, "grad_norm": 41.812535348060635, "learning_rate": 2.6749669574890504e-07, "logits/chosen": 2.1217856407165527, "logits/rejected": 2.1618785858154297, "logps/chosen": -633.5518798828125, "logps/rejected": -871.1535034179688, "loss": 0.3516, "rewards/accuracies": 0.875, "rewards/chosen": -4.202546119689941, "rewards/margins": 2.450648069381714, "rewards/rejected": -6.653194427490234, "step": 2760 }, { "epoch": 1.6, "grad_norm": 40.8094135117776, "learning_rate": 2.658247293449175e-07, "logits/chosen": 2.1863837242126465, "logits/rejected": 2.355276584625244, "logps/chosen": -607.8322143554688, "logps/rejected": -820.6095581054688, "loss": 0.3153, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.9768433570861816, "rewards/margins": 2.154115676879883, "rewards/rejected": -6.1309590339660645, "step": 2770 }, { "epoch": 1.6, "grad_norm": 37.49546996782932, "learning_rate": 2.641520519820169e-07, "logits/chosen": 2.038665294647217, "logits/rejected": 2.0447142124176025, "logps/chosen": -650.781005859375, "logps/rejected": -864.7796630859375, "loss": 0.3244, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.306005477905273, "rewards/margins": 2.050109624862671, "rewards/rejected": -6.356115341186523, "step": 2780 }, { "epoch": 1.61, "grad_norm": 32.88273613295463, "learning_rate": 2.624787388087161e-07, "logits/chosen": 1.6760154962539673, "logits/rejected": 2.1336731910705566, "logps/chosen": -586.3971557617188, "logps/rejected": -811.5464477539062, "loss": 0.3517, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.8299527168273926, "rewards/margins": 2.2157304286956787, "rewards/rejected": -6.045682907104492, "step": 2790 }, { "epoch": 1.61, "grad_norm": 37.89364145663362, "learning_rate": 2.6080486500209347e-07, "logits/chosen": 2.0527493953704834, "logits/rejected": 2.01056170463562, "logps/chosen": -559.4613037109375, "logps/rejected": -741.4288330078125, "loss": 0.3201, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.570992946624756, "rewards/margins": 2.1016650199890137, "rewards/rejected": -5.6726579666137695, "step": 2800 }, { "epoch": 1.61, "eval_logits/chosen": 2.132537841796875, "eval_logits/rejected": 1.9325625896453857, "eval_logps/chosen": -594.9215087890625, "eval_logps/rejected": -704.46240234375, "eval_loss": 0.563424825668335, "eval_rewards/accuracies": 0.7248858213424683, "eval_rewards/chosen": -4.321647644042969, "eval_rewards/margins": 1.0197103023529053, "eval_rewards/rejected": -5.341357231140137, "eval_runtime": 536.7321, "eval_samples_per_second": 13.042, "eval_steps_per_second": 0.408, "step": 2800 }, { "epoch": 1.62, "grad_norm": 32.54935961466176, "learning_rate": 2.5913050576441473e-07, "logits/chosen": 1.8553149700164795, "logits/rejected": 2.1887240409851074, "logps/chosen": -617.3562622070312, "logps/rejected": -832.6392822265625, "loss": 0.3174, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.149141311645508, "rewards/margins": 2.1361336708068848, "rewards/rejected": -6.285275459289551, "step": 2810 }, { "epoch": 1.62, "grad_norm": 59.51233569226138, "learning_rate": 2.574557363197546e-07, "logits/chosen": 1.8138978481292725, "logits/rejected": 2.5922133922576904, "logps/chosen": -640.2615966796875, "logps/rejected": -834.3948974609375, "loss": 0.3313, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.1352715492248535, "rewards/margins": 2.1803834438323975, "rewards/rejected": -6.31565523147583, "step": 2820 }, { "epoch": 1.63, "grad_norm": 39.38640054515148, "learning_rate": 2.557806319106173e-07, "logits/chosen": 2.1832432746887207, "logits/rejected": 2.625620126724243, "logps/chosen": -676.94677734375, "logps/rejected": -932.50732421875, "loss": 0.3385, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.606096267700195, "rewards/margins": 2.4643807411193848, "rewards/rejected": -7.070477485656738, "step": 2830 }, { "epoch": 1.64, "grad_norm": 37.63242197741332, "learning_rate": 2.5410526779455616e-07, "logits/chosen": 2.4261972904205322, "logits/rejected": 2.269443988800049, "logps/chosen": -627.156494140625, "logps/rejected": -861.25439453125, "loss": 0.3138, "rewards/accuracies": 0.90625, "rewards/chosen": -4.399529933929443, "rewards/margins": 2.3475685119628906, "rewards/rejected": -6.747098445892334, "step": 2840 }, { "epoch": 1.64, "grad_norm": 38.35377185932628, "learning_rate": 2.52429719240792e-07, "logits/chosen": 2.03845477104187, "logits/rejected": 2.460355043411255, "logps/chosen": -622.10205078125, "logps/rejected": -813.9536743164062, "loss": 0.3356, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.155783653259277, "rewards/margins": 2.259232997894287, "rewards/rejected": -6.415016174316406, "step": 2850 }, { "epoch": 1.65, "grad_norm": 55.68041414577995, "learning_rate": 2.5075406152683237e-07, "logits/chosen": 1.7226059436798096, "logits/rejected": 2.0017714500427246, "logps/chosen": -657.0048828125, "logps/rejected": -830.6038818359375, "loss": 0.3503, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.638699531555176, "rewards/margins": 1.8057724237442017, "rewards/rejected": -6.444471836090088, "step": 2860 }, { "epoch": 1.65, "grad_norm": 34.311782737001906, "learning_rate": 2.490783699350885e-07, "logits/chosen": 1.747779130935669, "logits/rejected": 1.5229190587997437, "logps/chosen": -583.0499267578125, "logps/rejected": -791.3631591796875, "loss": 0.3371, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.9629435539245605, "rewards/margins": 2.0389885902404785, "rewards/rejected": -6.001931667327881, "step": 2870 }, { "epoch": 1.66, "grad_norm": 32.927307534293064, "learning_rate": 2.4740271974949427e-07, "logits/chosen": 0.8059943318367004, "logits/rejected": 1.3462636470794678, "logps/chosen": -557.4342041015625, "logps/rejected": -787.9205932617188, "loss": 0.3136, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.492671251296997, "rewards/margins": 2.4179155826568604, "rewards/rejected": -5.910586833953857, "step": 2880 }, { "epoch": 1.66, "grad_norm": 41.06933197718998, "learning_rate": 2.457271862521229e-07, "logits/chosen": 1.1248573064804077, "logits/rejected": 1.3330161571502686, "logps/chosen": -557.9208984375, "logps/rejected": -736.2996826171875, "loss": 0.3455, "rewards/accuracies": 0.8125, "rewards/chosen": -3.641382932662964, "rewards/margins": 1.888704538345337, "rewards/rejected": -5.530087471008301, "step": 2890 }, { "epoch": 1.67, "grad_norm": 47.412808908270925, "learning_rate": 2.440518447198051e-07, "logits/chosen": 1.629214882850647, "logits/rejected": 1.485229253768921, "logps/chosen": -626.759521484375, "logps/rejected": -820.0159912109375, "loss": 0.3367, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.2889909744262695, "rewards/margins": 1.9188477993011475, "rewards/rejected": -6.207839012145996, "step": 2900 }, { "epoch": 1.67, "eval_logits/chosen": 1.6802117824554443, "eval_logits/rejected": 1.4794156551361084, "eval_logps/chosen": -623.8733520507812, "eval_logps/rejected": -732.7039184570312, "eval_loss": 0.5630954504013062, "eval_rewards/accuracies": 0.7254565954208374, "eval_rewards/chosen": -4.611166000366211, "eval_rewards/margins": 1.0126063823699951, "eval_rewards/rejected": -5.623772621154785, "eval_runtime": 523.3749, "eval_samples_per_second": 13.375, "eval_steps_per_second": 0.418, "step": 2900 }, { "epoch": 1.68, "grad_norm": 34.8372232209117, "learning_rate": 2.4237677042074754e-07, "logits/chosen": 1.8085609674453735, "logits/rejected": 1.7269556522369385, "logps/chosen": -633.6416015625, "logps/rejected": -861.7503662109375, "loss": 0.3017, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.476997375488281, "rewards/margins": 2.401869535446167, "rewards/rejected": -6.878867149353027, "step": 2910 }, { "epoch": 1.68, "grad_norm": 32.936954840570316, "learning_rate": 2.407020386111505e-07, "logits/chosen": 2.0895845890045166, "logits/rejected": 2.5565178394317627, "logps/chosen": -700.415283203125, "logps/rejected": -906.5392456054688, "loss": 0.3349, "rewards/accuracies": 0.84375, "rewards/chosen": -5.020167350769043, "rewards/margins": 2.1652398109436035, "rewards/rejected": -7.1854071617126465, "step": 2920 }, { "epoch": 1.69, "grad_norm": 39.92219463279685, "learning_rate": 2.390277245318273e-07, "logits/chosen": 1.9945348501205444, "logits/rejected": 2.243025302886963, "logps/chosen": -685.6157836914062, "logps/rejected": -922.5826416015625, "loss": 0.3265, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.728598117828369, "rewards/margins": 2.3967082500457764, "rewards/rejected": -7.125306606292725, "step": 2930 }, { "epoch": 1.69, "grad_norm": 42.894523913885344, "learning_rate": 2.3735390340482403e-07, "logits/chosen": 2.1892974376678467, "logits/rejected": 2.360760450363159, "logps/chosen": -693.1878662109375, "logps/rejected": -952.4190673828125, "loss": 0.3071, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.82743501663208, "rewards/margins": 2.554232120513916, "rewards/rejected": -7.381666660308838, "step": 2940 }, { "epoch": 1.7, "grad_norm": 43.134737315214224, "learning_rate": 2.3568065043003956e-07, "logits/chosen": 2.021087169647217, "logits/rejected": 2.3437230587005615, "logps/chosen": -668.9281005859375, "logps/rejected": -886.6213989257812, "loss": 0.34, "rewards/accuracies": 0.84375, "rewards/chosen": -4.861583232879639, "rewards/margins": 2.1108720302581787, "rewards/rejected": -6.9724555015563965, "step": 2950 }, { "epoch": 1.7, "grad_norm": 44.43781479227309, "learning_rate": 2.3400804078184775e-07, "logits/chosen": 2.359149217605591, "logits/rejected": 2.1114280223846436, "logps/chosen": -673.1062622070312, "logps/rejected": -903.8165893554688, "loss": 0.3144, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.759329319000244, "rewards/margins": 2.2294793128967285, "rewards/rejected": -6.988807678222656, "step": 2960 }, { "epoch": 1.71, "grad_norm": 40.23168956695321, "learning_rate": 2.3233614960571928e-07, "logits/chosen": 1.8773263692855835, "logits/rejected": 2.1252002716064453, "logps/chosen": -657.8561401367188, "logps/rejected": -893.2589721679688, "loss": 0.3301, "rewards/accuracies": 0.875, "rewards/chosen": -4.6166815757751465, "rewards/margins": 2.185661792755127, "rewards/rejected": -6.802343845367432, "step": 2970 }, { "epoch": 1.72, "grad_norm": 27.3222448523482, "learning_rate": 2.3066505201484625e-07, "logits/chosen": 1.8130661249160767, "logits/rejected": 1.9770088195800781, "logps/chosen": -641.3779296875, "logps/rejected": -828.8336791992188, "loss": 0.2926, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.487464904785156, "rewards/margins": 1.984194040298462, "rewards/rejected": -6.471659183502197, "step": 2980 }, { "epoch": 1.72, "grad_norm": 31.904840169383213, "learning_rate": 2.2899482308676713e-07, "logits/chosen": 1.411312460899353, "logits/rejected": 1.867160439491272, "logps/chosen": -710.8165283203125, "logps/rejected": -908.5579833984375, "loss": 0.3449, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.655442237854004, "rewards/margins": 2.265839099884033, "rewards/rejected": -6.921281337738037, "step": 2990 }, { "epoch": 1.73, "grad_norm": 38.158390051570386, "learning_rate": 2.2732553785999387e-07, "logits/chosen": 1.9053354263305664, "logits/rejected": 2.0263123512268066, "logps/chosen": -601.5404052734375, "logps/rejected": -817.716064453125, "loss": 0.3414, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.208640098571777, "rewards/margins": 2.196655750274658, "rewards/rejected": -6.405296325683594, "step": 3000 }, { "epoch": 1.73, "eval_logits/chosen": 1.9588791131973267, "eval_logits/rejected": 1.691982626914978, "eval_logps/chosen": -623.7572021484375, "eval_logps/rejected": -732.3314819335938, "eval_loss": 0.5697500109672546, "eval_rewards/accuracies": 0.728881299495697, "eval_rewards/chosen": -4.610004425048828, "eval_rewards/margins": 1.0100440979003906, "eval_rewards/rejected": -5.620048522949219, "eval_runtime": 535.1159, "eval_samples_per_second": 13.081, "eval_steps_per_second": 0.409, "step": 3000 }, { "epoch": 1.73, "grad_norm": 22.981428691231773, "learning_rate": 2.2565727133064092e-07, "logits/chosen": 1.7134958505630493, "logits/rejected": 2.2806363105773926, "logps/chosen": -592.9300537109375, "logps/rejected": -780.8262329101562, "loss": 0.3295, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.997809886932373, "rewards/margins": 1.998029112815857, "rewards/rejected": -5.9958391189575195, "step": 3010 }, { "epoch": 1.74, "grad_norm": 42.77443537784261, "learning_rate": 2.2399009844905538e-07, "logits/chosen": 1.7472158670425415, "logits/rejected": 2.2648138999938965, "logps/chosen": -619.320068359375, "logps/rejected": -822.0887451171875, "loss": 0.2849, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.115148067474365, "rewards/margins": 2.1130096912384033, "rewards/rejected": -6.228157997131348, "step": 3020 }, { "epoch": 1.74, "grad_norm": 48.992840233756176, "learning_rate": 2.2232409411645009e-07, "logits/chosen": 1.6787803173065186, "logits/rejected": 2.277024269104004, "logps/chosen": -636.37939453125, "logps/rejected": -873.4612426757812, "loss": 0.3371, "rewards/accuracies": 0.84375, "rewards/chosen": -4.189149379730225, "rewards/margins": 2.2498040199279785, "rewards/rejected": -6.4389543533325195, "step": 3030 }, { "epoch": 1.75, "grad_norm": 46.407059531601256, "learning_rate": 2.206593331815383e-07, "logits/chosen": 1.6435436010360718, "logits/rejected": 2.4788150787353516, "logps/chosen": -638.3565673828125, "logps/rejected": -853.5006713867188, "loss": 0.3475, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.318505764007568, "rewards/margins": 2.298260450363159, "rewards/rejected": -6.616766452789307, "step": 3040 }, { "epoch": 1.76, "grad_norm": 23.79275399994042, "learning_rate": 2.1899589043717116e-07, "logits/chosen": 2.2938549518585205, "logits/rejected": 2.2226691246032715, "logps/chosen": -585.2247314453125, "logps/rejected": -800.9713134765625, "loss": 0.3438, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.076876640319824, "rewards/margins": 2.17448353767395, "rewards/rejected": -6.2513604164123535, "step": 3050 }, { "epoch": 1.76, "grad_norm": 37.63639002537673, "learning_rate": 2.1733384061697706e-07, "logits/chosen": 1.9681453704833984, "logits/rejected": 2.04839825630188, "logps/chosen": -609.6961669921875, "logps/rejected": -833.9690551757812, "loss": 0.3157, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.0492777824401855, "rewards/margins": 2.1225171089172363, "rewards/rejected": -6.171794414520264, "step": 3060 }, { "epoch": 1.77, "grad_norm": 46.07610149018862, "learning_rate": 2.156732583920048e-07, "logits/chosen": 2.278414726257324, "logits/rejected": 2.5662617683410645, "logps/chosen": -579.07177734375, "logps/rejected": -788.9910278320312, "loss": 0.3336, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.146145343780518, "rewards/margins": 2.0838141441345215, "rewards/rejected": -6.229959487915039, "step": 3070 }, { "epoch": 1.77, "grad_norm": 58.1201670531535, "learning_rate": 2.1401421836736803e-07, "logits/chosen": 1.9984382390975952, "logits/rejected": 2.3716933727264404, "logps/chosen": -649.7659301757812, "logps/rejected": -868.2586059570312, "loss": 0.3338, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.363190174102783, "rewards/margins": 2.3637661933898926, "rewards/rejected": -6.726956367492676, "step": 3080 }, { "epoch": 1.78, "grad_norm": 27.27251614787792, "learning_rate": 2.1235679507889416e-07, "logits/chosen": 1.9786689281463623, "logits/rejected": 2.2483699321746826, "logps/chosen": -652.0988159179688, "logps/rejected": -898.9951171875, "loss": 0.2858, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.531172752380371, "rewards/margins": 2.3729565143585205, "rewards/rejected": -6.904128074645996, "step": 3090 }, { "epoch": 1.79, "grad_norm": 55.96967936823563, "learning_rate": 2.1070106298977514e-07, "logits/chosen": 2.1405718326568604, "logits/rejected": 2.1645560264587402, "logps/chosen": -655.5068359375, "logps/rejected": -877.6861572265625, "loss": 0.3097, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.552793502807617, "rewards/margins": 2.300513744354248, "rewards/rejected": -6.853308200836182, "step": 3100 }, { "epoch": 1.79, "eval_logits/chosen": 2.2979886531829834, "eval_logits/rejected": 2.025968074798584, "eval_logps/chosen": -661.5057373046875, "eval_logps/rejected": -782.4933471679688, "eval_loss": 0.5738718509674072, "eval_rewards/accuracies": 0.7254565954208374, "eval_rewards/chosen": -4.987489700317383, "eval_rewards/margins": 1.134177565574646, "eval_rewards/rejected": -6.121667861938477, "eval_runtime": 524.5145, "eval_samples_per_second": 13.346, "eval_steps_per_second": 0.418, "step": 3100 }, { "epoch": 1.79, "grad_norm": 45.15257744863436, "learning_rate": 2.090470964872223e-07, "logits/chosen": 2.252577304840088, "logits/rejected": 2.2381606101989746, "logps/chosen": -638.2000122070312, "logps/rejected": -885.7161254882812, "loss": 0.3176, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.5647687911987305, "rewards/margins": 2.2481143474578857, "rewards/rejected": -6.812882900238037, "step": 3110 }, { "epoch": 1.8, "grad_norm": 55.551475921721696, "learning_rate": 2.073949698791244e-07, "logits/chosen": 1.7351967096328735, "logits/rejected": 1.8707069158554077, "logps/chosen": -638.6422119140625, "logps/rejected": -843.5880737304688, "loss": 0.3165, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.290929317474365, "rewards/margins": 2.212524652481079, "rewards/rejected": -6.503453731536865, "step": 3120 }, { "epoch": 1.8, "grad_norm": 55.50491542065995, "learning_rate": 2.0574475739070905e-07, "logits/chosen": 1.7055730819702148, "logits/rejected": 1.715707540512085, "logps/chosen": -683.7041015625, "logps/rejected": -945.4763793945312, "loss": 0.321, "rewards/accuracies": 0.84375, "rewards/chosen": -4.806698799133301, "rewards/margins": 2.3809380531311035, "rewards/rejected": -7.187636375427246, "step": 3130 }, { "epoch": 1.81, "grad_norm": 58.923981515080236, "learning_rate": 2.0409653316120806e-07, "logits/chosen": 1.802146315574646, "logits/rejected": 1.8524999618530273, "logps/chosen": -627.0184326171875, "logps/rejected": -822.6629638671875, "loss": 0.3432, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.286912441253662, "rewards/margins": 2.0034584999084473, "rewards/rejected": -6.290371417999268, "step": 3140 }, { "epoch": 1.81, "grad_norm": 43.39780790951795, "learning_rate": 2.0245037124052658e-07, "logits/chosen": 1.9325037002563477, "logits/rejected": 1.9751043319702148, "logps/chosen": -605.7780151367188, "logps/rejected": -801.611083984375, "loss": 0.3403, "rewards/accuracies": 0.84375, "rewards/chosen": -4.047119617462158, "rewards/margins": 2.072427272796631, "rewards/rejected": -6.119546890258789, "step": 3150 }, { "epoch": 1.82, "grad_norm": 32.93401461290134, "learning_rate": 2.0080634558591626e-07, "logits/chosen": 2.1715760231018066, "logits/rejected": 1.8673083782196045, "logps/chosen": -576.1846923828125, "logps/rejected": -741.0400390625, "loss": 0.3422, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.949694871902466, "rewards/margins": 1.7254012823104858, "rewards/rejected": -5.675095558166504, "step": 3160 }, { "epoch": 1.83, "grad_norm": 24.035692473747247, "learning_rate": 1.9916453005865244e-07, "logits/chosen": 1.8671340942382812, "logits/rejected": 2.316265344619751, "logps/chosen": -561.3065795898438, "logps/rejected": -777.4314575195312, "loss": 0.2972, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.794464111328125, "rewards/margins": 2.0970263481140137, "rewards/rejected": -5.8914899826049805, "step": 3170 }, { "epoch": 1.83, "grad_norm": 48.6534465242388, "learning_rate": 1.9752499842071598e-07, "logits/chosen": 1.9304643869400024, "logits/rejected": 2.38059139251709, "logps/chosen": -614.7517700195312, "logps/rejected": -839.4931640625, "loss": 0.3239, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.203696250915527, "rewards/margins": 2.3450140953063965, "rewards/rejected": -6.548708915710449, "step": 3180 }, { "epoch": 1.84, "grad_norm": 42.444877976961656, "learning_rate": 1.9588782433147945e-07, "logits/chosen": 2.3279662132263184, "logits/rejected": 2.2215890884399414, "logps/chosen": -625.7352905273438, "logps/rejected": -879.4039916992188, "loss": 0.3469, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.34265661239624, "rewards/margins": 2.4497103691101074, "rewards/rejected": -6.792367458343506, "step": 3190 }, { "epoch": 1.84, "grad_norm": 36.91621716791759, "learning_rate": 1.9425308134439715e-07, "logits/chosen": 2.1773228645324707, "logits/rejected": 2.167372465133667, "logps/chosen": -652.9777221679688, "logps/rejected": -852.3330078125, "loss": 0.3077, "rewards/accuracies": 0.875, "rewards/chosen": -4.402958393096924, "rewards/margins": 2.042320489883423, "rewards/rejected": -6.445279121398926, "step": 3200 }, { "epoch": 1.84, "eval_logits/chosen": 2.399453639984131, "eval_logits/rejected": 2.0797512531280518, "eval_logps/chosen": -665.7410278320312, "eval_logps/rejected": -783.5215454101562, "eval_loss": 0.5685449242591858, "eval_rewards/accuracies": 0.7226027250289917, "eval_rewards/chosen": -5.029842376708984, "eval_rewards/margins": 1.1021068096160889, "eval_rewards/rejected": -6.131948947906494, "eval_runtime": 535.4029, "eval_samples_per_second": 13.074, "eval_steps_per_second": 0.409, "step": 3200 }, { "epoch": 1.85, "grad_norm": 42.20761821279265, "learning_rate": 1.926208429037014e-07, "logits/chosen": 2.6284165382385254, "logits/rejected": 2.0263724327087402, "logps/chosen": -716.9176025390625, "logps/rejected": -939.2355346679688, "loss": 0.29, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.956343173980713, "rewards/margins": 2.4249958992004395, "rewards/rejected": -7.381338596343994, "step": 3210 }, { "epoch": 1.85, "grad_norm": 38.64619415041815, "learning_rate": 1.909911823411026e-07, "logits/chosen": 2.2851080894470215, "logits/rejected": 2.646312952041626, "logps/chosen": -722.4159545898438, "logps/rejected": -939.0037231445312, "loss": 0.3042, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.932633399963379, "rewards/margins": 2.419227123260498, "rewards/rejected": -7.351861000061035, "step": 3220 }, { "epoch": 1.86, "grad_norm": 49.21475460158534, "learning_rate": 1.8936417287249446e-07, "logits/chosen": 2.9902467727661133, "logits/rejected": 2.5721237659454346, "logps/chosen": -758.75927734375, "logps/rejected": -987.5985107421875, "loss": 0.3078, "rewards/accuracies": 0.875, "rewards/chosen": -5.394339561462402, "rewards/margins": 2.513002872467041, "rewards/rejected": -7.907342433929443, "step": 3230 }, { "epoch": 1.87, "grad_norm": 34.73928244184459, "learning_rate": 1.877398875946648e-07, "logits/chosen": 2.422316074371338, "logits/rejected": 2.713583469390869, "logps/chosen": -715.2218017578125, "logps/rejected": -916.0929565429688, "loss": 0.3235, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -5.052298545837402, "rewards/margins": 2.2123332023620605, "rewards/rejected": -7.2646307945251465, "step": 3240 }, { "epoch": 1.87, "grad_norm": 37.39524787933084, "learning_rate": 1.8611839948201153e-07, "logits/chosen": 2.4158523082733154, "logits/rejected": 2.7355122566223145, "logps/chosen": -709.918701171875, "logps/rejected": -903.64599609375, "loss": 0.2691, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -5.021745204925537, "rewards/margins": 2.1376841068267822, "rewards/rejected": -7.159430027008057, "step": 3250 }, { "epoch": 1.88, "grad_norm": 27.622331127758635, "learning_rate": 1.84499781383264e-07, "logits/chosen": 2.6145851612091064, "logits/rejected": 2.292513847351074, "logps/chosen": -760.1905517578125, "logps/rejected": -975.4796142578125, "loss": 0.3273, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -5.300500392913818, "rewards/margins": 2.4408915042877197, "rewards/rejected": -7.741391658782959, "step": 3260 }, { "epoch": 1.88, "grad_norm": 65.906020273464, "learning_rate": 1.8288410601821041e-07, "logits/chosen": 2.1931099891662598, "logits/rejected": 2.6287407875061035, "logps/chosen": -689.6822509765625, "logps/rejected": -880.4434814453125, "loss": 0.3701, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.976579189300537, "rewards/margins": 1.9259990453720093, "rewards/rejected": -6.902577877044678, "step": 3270 }, { "epoch": 1.89, "grad_norm": 60.81036561968602, "learning_rate": 1.8127144597443034e-07, "logits/chosen": 2.343076229095459, "logits/rejected": 2.117987632751465, "logps/chosen": -673.6673583984375, "logps/rejected": -872.4158935546875, "loss": 0.3505, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.763646125793457, "rewards/margins": 2.1014764308929443, "rewards/rejected": -6.865122318267822, "step": 3280 }, { "epoch": 1.89, "grad_norm": 57.18800247290382, "learning_rate": 1.7966187370403385e-07, "logits/chosen": 2.1372358798980713, "logits/rejected": 2.4426989555358887, "logps/chosen": -626.4075317382812, "logps/rejected": -843.9088745117188, "loss": 0.3001, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.448263168334961, "rewards/margins": 2.286518096923828, "rewards/rejected": -6.734780788421631, "step": 3290 }, { "epoch": 1.9, "grad_norm": 44.18268476052576, "learning_rate": 1.7805546152040634e-07, "logits/chosen": 2.1215412616729736, "logits/rejected": 2.0184779167175293, "logps/chosen": -661.7462158203125, "logps/rejected": -915.4035034179688, "loss": 0.3101, "rewards/accuracies": 0.875, "rewards/chosen": -4.634057521820068, "rewards/margins": 2.3738913536071777, "rewards/rejected": -7.007948875427246, "step": 3300 }, { "epoch": 1.9, "eval_logits/chosen": 2.295004367828369, "eval_logits/rejected": 1.9781683683395386, "eval_logps/chosen": -663.111572265625, "eval_logps/rejected": -784.107421875, "eval_loss": 0.5708804726600647, "eval_rewards/accuracies": 0.7351598143577576, "eval_rewards/chosen": -5.003548622131348, "eval_rewards/margins": 1.1342597007751465, "eval_rewards/rejected": -6.137807846069336, "eval_runtime": 523.5322, "eval_samples_per_second": 13.371, "eval_steps_per_second": 0.418, "step": 3300 }, { "epoch": 1.91, "grad_norm": 54.8505374210298, "learning_rate": 1.7645228159495969e-07, "logits/chosen": 2.014904499053955, "logits/rejected": 1.9095014333724976, "logps/chosen": -723.8671875, "logps/rejected": -925.0339965820312, "loss": 0.3081, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -5.012709617614746, "rewards/margins": 2.038578510284424, "rewards/rejected": -7.051288604736328, "step": 3310 }, { "epoch": 1.91, "grad_norm": 46.270682025474706, "learning_rate": 1.7485240595389e-07, "logits/chosen": 1.8113120794296265, "logits/rejected": 2.22719407081604, "logps/chosen": -668.3312377929688, "logps/rejected": -910.0169677734375, "loss": 0.2811, "rewards/accuracies": 0.875, "rewards/chosen": -4.578722953796387, "rewards/margins": 2.639694929122925, "rewards/rejected": -7.218417167663574, "step": 3320 }, { "epoch": 1.92, "grad_norm": 27.97582595042281, "learning_rate": 1.732559064749413e-07, "logits/chosen": 2.177745819091797, "logits/rejected": 1.8711910247802734, "logps/chosen": -665.2601318359375, "logps/rejected": -944.1094970703125, "loss": 0.289, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.600152015686035, "rewards/margins": 2.791374683380127, "rewards/rejected": -7.391526699066162, "step": 3330 }, { "epoch": 1.92, "grad_norm": 63.01700337238851, "learning_rate": 1.7166285488417676e-07, "logits/chosen": 2.3040993213653564, "logits/rejected": 2.5220980644226074, "logps/chosen": -688.363525390625, "logps/rejected": -902.8987426757812, "loss": 0.3586, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -5.004676818847656, "rewards/margins": 2.2920422554016113, "rewards/rejected": -7.296719551086426, "step": 3340 }, { "epoch": 1.93, "grad_norm": 39.5868349846548, "learning_rate": 1.700733227527557e-07, "logits/chosen": 2.051144599914551, "logits/rejected": 2.4458742141723633, "logps/chosen": -662.1652221679688, "logps/rejected": -953.0900268554688, "loss": 0.2904, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.6415534019470215, "rewards/margins": 2.837695360183716, "rewards/rejected": -7.479249477386475, "step": 3350 }, { "epoch": 1.93, "grad_norm": 67.99336276942854, "learning_rate": 1.6848738149371867e-07, "logits/chosen": 1.9923057556152344, "logits/rejected": 2.1052722930908203, "logps/chosen": -692.83935546875, "logps/rejected": -980.7491455078125, "loss": 0.3143, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.835230350494385, "rewards/margins": 2.7563624382019043, "rewards/rejected": -7.591592311859131, "step": 3360 }, { "epoch": 1.94, "grad_norm": 53.8371328451996, "learning_rate": 1.6690510235877862e-07, "logits/chosen": 2.559948444366455, "logits/rejected": 2.0809054374694824, "logps/chosen": -668.9596557617188, "logps/rejected": -891.7936401367188, "loss": 0.3412, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.724262714385986, "rewards/margins": 2.146951198577881, "rewards/rejected": -6.871213436126709, "step": 3370 }, { "epoch": 1.95, "grad_norm": 54.025303954229656, "learning_rate": 1.6532655643512e-07, "logits/chosen": 2.163259506225586, "logits/rejected": 1.938847541809082, "logps/chosen": -683.0198974609375, "logps/rejected": -887.0689697265625, "loss": 0.3286, "rewards/accuracies": 0.875, "rewards/chosen": -4.647250175476074, "rewards/margins": 2.235180377960205, "rewards/rejected": -6.882430076599121, "step": 3380 }, { "epoch": 1.95, "grad_norm": 30.967255779949408, "learning_rate": 1.6375181464220504e-07, "logits/chosen": 2.149014949798584, "logits/rejected": 2.079514503479004, "logps/chosen": -607.1089477539062, "logps/rejected": -839.2340087890625, "loss": 0.3502, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.187648773193359, "rewards/margins": 2.291783571243286, "rewards/rejected": -6.47943115234375, "step": 3390 }, { "epoch": 1.96, "grad_norm": 31.399203142868604, "learning_rate": 1.6218094772858741e-07, "logits/chosen": 2.110233783721924, "logits/rejected": 2.4688830375671387, "logps/chosen": -645.4471435546875, "logps/rejected": -858.9615478515625, "loss": 0.3235, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.416108131408691, "rewards/margins": 2.2572340965270996, "rewards/rejected": -6.673342704772949, "step": 3400 }, { "epoch": 1.96, "eval_logits/chosen": 2.2625622749328613, "eval_logits/rejected": 1.915532112121582, "eval_logps/chosen": -647.6710205078125, "eval_logps/rejected": -755.5941772460938, "eval_loss": 0.5628695487976074, "eval_rewards/accuracies": 0.7345890402793884, "eval_rewards/chosen": -4.849143028259277, "eval_rewards/margins": 1.0035330057144165, "eval_rewards/rejected": -5.852675437927246, "eval_runtime": 535.184, "eval_samples_per_second": 13.08, "eval_steps_per_second": 0.409, "step": 3400 }, { "epoch": 1.96, "grad_norm": 29.656710300615856, "learning_rate": 1.6061402626873383e-07, "logits/chosen": 2.2905797958374023, "logits/rejected": 2.4246227741241455, "logps/chosen": -620.0130615234375, "logps/rejected": -829.9230346679688, "loss": 0.326, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.401915550231934, "rewards/margins": 2.1180906295776367, "rewards/rejected": -6.5200066566467285, "step": 3410 }, { "epoch": 1.97, "grad_norm": 24.29992631223249, "learning_rate": 1.5905112065985314e-07, "logits/chosen": 2.2553887367248535, "logits/rejected": 1.899258017539978, "logps/chosen": -635.0886840820312, "logps/rejected": -837.3094482421875, "loss": 0.3178, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.509106159210205, "rewards/margins": 2.044373035430908, "rewards/rejected": -6.553479194641113, "step": 3420 }, { "epoch": 1.98, "grad_norm": 49.684277766967256, "learning_rate": 1.5749230111873387e-07, "logits/chosen": 2.453914165496826, "logits/rejected": 2.11564564704895, "logps/chosen": -687.6725463867188, "logps/rejected": -881.5555419921875, "loss": 0.345, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.752093315124512, "rewards/margins": 2.1149234771728516, "rewards/rejected": -6.867016792297363, "step": 3430 }, { "epoch": 1.98, "grad_norm": 31.939681520934418, "learning_rate": 1.5593763767858936e-07, "logits/chosen": 1.8594415187835693, "logits/rejected": 2.3856873512268066, "logps/chosen": -686.5487060546875, "logps/rejected": -928.4705810546875, "loss": 0.3027, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.641936779022217, "rewards/margins": 2.475961446762085, "rewards/rejected": -7.117898464202881, "step": 3440 }, { "epoch": 1.99, "grad_norm": 34.579941644238374, "learning_rate": 1.5438720018591156e-07, "logits/chosen": 2.357339382171631, "logits/rejected": 2.1689164638519287, "logps/chosen": -668.3074340820312, "logps/rejected": -903.3863525390625, "loss": 0.3224, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.689288139343262, "rewards/margins": 2.3330307006835938, "rewards/rejected": -7.022318363189697, "step": 3450 }, { "epoch": 1.99, "grad_norm": 42.48366150188782, "learning_rate": 1.5284105829733282e-07, "logits/chosen": 2.188469886779785, "logits/rejected": 2.6690590381622314, "logps/chosen": -649.2243041992188, "logps/rejected": -836.4148559570312, "loss": 0.3052, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.514993667602539, "rewards/margins": 2.1017682552337646, "rewards/rejected": -6.616762638092041, "step": 3460 }, { "epoch": 2.0, "grad_norm": 78.40948692460326, "learning_rate": 1.5129928147649656e-07, "logits/chosen": 2.1678566932678223, "logits/rejected": 2.139063596725464, "logps/chosen": -683.1871337890625, "logps/rejected": -887.99658203125, "loss": 0.307, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.8374738693237305, "rewards/margins": 2.175691604614258, "rewards/rejected": -7.0131659507751465, "step": 3470 }, { "epoch": 2.0, "grad_norm": 14.058613394445395, "learning_rate": 1.4976193899093657e-07, "logits/chosen": 2.1931307315826416, "logits/rejected": 2.418992042541504, "logps/chosen": -612.0548706054688, "logps/rejected": -905.6593627929688, "loss": 0.1947, "rewards/accuracies": 0.9375, "rewards/chosen": -4.339973449707031, "rewards/margins": 2.768751382827759, "rewards/rejected": -7.108725070953369, "step": 3480 }, { "epoch": 2.01, "grad_norm": 19.73951878022837, "learning_rate": 1.4822909990896462e-07, "logits/chosen": 2.1134066581726074, "logits/rejected": 2.4730172157287598, "logps/chosen": -656.6695556640625, "logps/rejected": -1050.409912109375, "loss": 0.1486, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.613894462585449, "rewards/margins": 3.7059104442596436, "rewards/rejected": -8.319804191589355, "step": 3490 }, { "epoch": 2.02, "grad_norm": 14.988773327893071, "learning_rate": 1.4670083309656794e-07, "logits/chosen": 2.401637077331543, "logits/rejected": 3.152545213699341, "logps/chosen": -722.2550659179688, "logps/rejected": -1124.58984375, "loss": 0.1328, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.342690467834473, "rewards/margins": 3.871044635772705, "rewards/rejected": -9.21373462677002, "step": 3500 }, { "epoch": 2.02, "eval_logits/chosen": 3.063668966293335, "eval_logits/rejected": 2.7097582817077637, "eval_logps/chosen": -824.1729736328125, "eval_logps/rejected": -965.9568481445312, "eval_loss": 0.6063192486763, "eval_rewards/accuracies": 0.728881299495697, "eval_rewards/chosen": -6.614161968231201, "eval_rewards/margins": 1.3421401977539062, "eval_rewards/rejected": -7.956302165985107, "eval_runtime": 523.7313, "eval_samples_per_second": 13.366, "eval_steps_per_second": 0.418, "step": 3500 }, { "epoch": 2.02, "grad_norm": 27.185520893936562, "learning_rate": 1.4517720721431497e-07, "logits/chosen": 2.802900791168213, "logits/rejected": 3.0722851753234863, "logps/chosen": -814.65234375, "logps/rejected": -1195.796142578125, "loss": 0.1297, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -6.070180416107178, "rewards/margins": 3.782501220703125, "rewards/rejected": -9.852681159973145, "step": 3510 }, { "epoch": 2.03, "grad_norm": 36.44922375396643, "learning_rate": 1.436582907142706e-07, "logits/chosen": 3.121885299682617, "logits/rejected": 3.118112325668335, "logps/chosen": -869.04931640625, "logps/rejected": -1310.6949462890625, "loss": 0.1483, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.516282081604004, "rewards/margins": 4.537832260131836, "rewards/rejected": -11.054115295410156, "step": 3520 }, { "epoch": 2.03, "grad_norm": 36.16198874539062, "learning_rate": 1.421441518369212e-07, "logits/chosen": 3.4101879596710205, "logits/rejected": 3.7247796058654785, "logps/chosen": -862.8156127929688, "logps/rejected": -1283.30224609375, "loss": 0.1421, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.872193813323975, "rewards/margins": 3.9892983436584473, "rewards/rejected": -10.861493110656738, "step": 3530 }, { "epoch": 2.04, "grad_norm": 25.588592367403198, "learning_rate": 1.4063485860810804e-07, "logits/chosen": 3.492438554763794, "logits/rejected": 3.6731066703796387, "logps/chosen": -912.6950073242188, "logps/rejected": -1325.8114013671875, "loss": 0.1282, "rewards/accuracies": 0.9375, "rewards/chosen": -7.175641059875488, "rewards/margins": 4.118454456329346, "rewards/rejected": -11.294095039367676, "step": 3540 }, { "epoch": 2.04, "grad_norm": 24.243964456935604, "learning_rate": 1.3913047883597196e-07, "logits/chosen": 2.9837372303009033, "logits/rejected": 4.225147247314453, "logps/chosen": -935.2005615234375, "logps/rejected": -1330.525390625, "loss": 0.1277, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -7.353121280670166, "rewards/margins": 4.023127555847168, "rewards/rejected": -11.376249313354492, "step": 3550 }, { "epoch": 2.05, "grad_norm": 30.197201721256526, "learning_rate": 1.3763108010790636e-07, "logits/chosen": 3.671154022216797, "logits/rejected": 3.877824306488037, "logps/chosen": -863.4288940429688, "logps/rejected": -1228.314697265625, "loss": 0.1775, "rewards/accuracies": 0.9375, "rewards/chosen": -6.657944679260254, "rewards/margins": 3.7850654125213623, "rewards/rejected": -10.443010330200195, "step": 3560 }, { "epoch": 2.06, "grad_norm": 32.306599607328764, "learning_rate": 1.3613672978752083e-07, "logits/chosen": 3.4126439094543457, "logits/rejected": 3.2288880348205566, "logps/chosen": -848.1890869140625, "logps/rejected": -1291.7412109375, "loss": 0.123, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.435237884521484, "rewards/margins": 4.469638824462891, "rewards/rejected": -10.904876708984375, "step": 3570 }, { "epoch": 2.06, "grad_norm": 26.83174863529065, "learning_rate": 1.3464749501161483e-07, "logits/chosen": 3.4787535667419434, "logits/rejected": 4.094793796539307, "logps/chosen": -835.8655395507812, "logps/rejected": -1185.7265625, "loss": 0.138, "rewards/accuracies": 0.9375, "rewards/chosen": -6.386282920837402, "rewards/margins": 3.781203508377075, "rewards/rejected": -10.167486190795898, "step": 3580 }, { "epoch": 2.07, "grad_norm": 18.135837292727203, "learning_rate": 1.3316344268716117e-07, "logits/chosen": 3.9116604328155518, "logits/rejected": 3.5160889625549316, "logps/chosen": -875.2320556640625, "logps/rejected": -1254.057861328125, "loss": 0.132, "rewards/accuracies": 0.9375, "rewards/chosen": -6.825141906738281, "rewards/margins": 3.9471065998077393, "rewards/rejected": -10.772249221801758, "step": 3590 }, { "epoch": 2.07, "grad_norm": 27.805762050069198, "learning_rate": 1.3168463948830038e-07, "logits/chosen": 3.173962116241455, "logits/rejected": 3.6442294120788574, "logps/chosen": -841.2674560546875, "logps/rejected": -1264.9127197265625, "loss": 0.1438, "rewards/accuracies": 0.9375, "rewards/chosen": -6.678984642028809, "rewards/margins": 4.185361862182617, "rewards/rejected": -10.864347457885742, "step": 3600 }, { "epoch": 2.07, "eval_logits/chosen": 3.834343433380127, "eval_logits/rejected": 3.399441719055176, "eval_logps/chosen": -952.779541015625, "eval_logps/rejected": -1107.0623779296875, "eval_loss": 0.6421390175819397, "eval_rewards/accuracies": 0.715753436088562, "eval_rewards/chosen": -7.900227069854736, "eval_rewards/margins": 1.467130184173584, "eval_rewards/rejected": -9.367358207702637, "eval_runtime": 534.1911, "eval_samples_per_second": 13.104, "eval_steps_per_second": 0.41, "step": 3600 }, { "epoch": 2.08, "grad_norm": 38.207945668669026, "learning_rate": 1.3021115185334524e-07, "logits/chosen": 3.0689730644226074, "logits/rejected": 3.550379991531372, "logps/chosen": -863.3743286132812, "logps/rejected": -1267.830322265625, "loss": 0.1312, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.5663862228393555, "rewards/margins": 4.193253993988037, "rewards/rejected": -10.75964069366455, "step": 3610 }, { "epoch": 2.08, "grad_norm": 25.98262826521858, "learning_rate": 1.2874304598179542e-07, "logits/chosen": 3.4819798469543457, "logits/rejected": 3.7446789741516113, "logps/chosen": -878.4275512695312, "logps/rejected": -1280.141845703125, "loss": 0.1373, "rewards/accuracies": 0.9375, "rewards/chosen": -6.893362522125244, "rewards/margins": 4.019153118133545, "rewards/rejected": -10.912515640258789, "step": 3620 }, { "epoch": 2.09, "grad_norm": 26.510791943267133, "learning_rate": 1.2728038783136372e-07, "logits/chosen": 3.1064648628234863, "logits/rejected": 3.402360200881958, "logps/chosen": -947.4931640625, "logps/rejected": -1410.132568359375, "loss": 0.1404, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.086489200592041, "rewards/margins": 4.866405487060547, "rewards/rejected": -11.952895164489746, "step": 3630 }, { "epoch": 2.1, "grad_norm": 30.494403558249505, "learning_rate": 1.2582324311501303e-07, "logits/chosen": 3.043975353240967, "logits/rejected": 3.4579672813415527, "logps/chosen": -894.48046875, "logps/rejected": -1311.660888671875, "loss": 0.1292, "rewards/accuracies": 0.9375, "rewards/chosen": -6.980504035949707, "rewards/margins": 4.065277099609375, "rewards/rejected": -11.045781135559082, "step": 3640 }, { "epoch": 2.1, "grad_norm": 19.789016123381874, "learning_rate": 1.2437167729800339e-07, "logits/chosen": 3.0855515003204346, "logits/rejected": 3.4487509727478027, "logps/chosen": -887.46044921875, "logps/rejected": -1322.7373046875, "loss": 0.126, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -7.0005950927734375, "rewards/margins": 4.313690185546875, "rewards/rejected": -11.314286231994629, "step": 3650 }, { "epoch": 2.11, "grad_norm": 46.24266271033402, "learning_rate": 1.2292575559495143e-07, "logits/chosen": 3.4318652153015137, "logits/rejected": 3.6983237266540527, "logps/chosen": -898.39794921875, "logps/rejected": -1318.48486328125, "loss": 0.1569, "rewards/accuracies": 0.90625, "rewards/chosen": -7.054856300354004, "rewards/margins": 3.960808277130127, "rewards/rejected": -11.015665054321289, "step": 3660 }, { "epoch": 2.11, "grad_norm": 31.806160113694627, "learning_rate": 1.214855429668999e-07, "logits/chosen": 3.1044955253601074, "logits/rejected": 3.9279091358184814, "logps/chosen": -846.2428588867188, "logps/rejected": -1256.8299560546875, "loss": 0.1274, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -6.50442361831665, "rewards/margins": 4.103934288024902, "rewards/rejected": -10.608358383178711, "step": 3670 }, { "epoch": 2.12, "grad_norm": 42.37045277936191, "learning_rate": 1.200511041183998e-07, "logits/chosen": 3.0014257431030273, "logits/rejected": 3.2319750785827637, "logps/chosen": -914.0543212890625, "logps/rejected": -1283.556884765625, "loss": 0.1263, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.777462005615234, "rewards/margins": 4.0505781173706055, "rewards/rejected": -10.82804012298584, "step": 3680 }, { "epoch": 2.12, "grad_norm": 29.0485580283178, "learning_rate": 1.1862250349460301e-07, "logits/chosen": 3.214445114135742, "logits/rejected": 3.2933177947998047, "logps/chosen": -865.8875732421875, "logps/rejected": -1225.4921875, "loss": 0.143, "rewards/accuracies": 0.9375, "rewards/chosen": -6.774938106536865, "rewards/margins": 3.7125535011291504, "rewards/rejected": -10.4874906539917, "step": 3690 }, { "epoch": 2.13, "grad_norm": 34.758434526300505, "learning_rate": 1.1719980527836674e-07, "logits/chosen": 3.291172742843628, "logits/rejected": 4.1055216789245605, "logps/chosen": -861.9231567382812, "logps/rejected": -1266.238525390625, "loss": 0.1474, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -6.7583723068237305, "rewards/margins": 3.9905123710632324, "rewards/rejected": -10.748884201049805, "step": 3700 }, { "epoch": 2.13, "eval_logits/chosen": 3.9151997566223145, "eval_logits/rejected": 3.4597530364990234, "eval_logps/chosen": -960.7724609375, "eval_logps/rejected": -1124.85107421875, "eval_loss": 0.6611037850379944, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": -7.980156421661377, "eval_rewards/margins": 1.565087080001831, "eval_rewards/rejected": -9.545243263244629, "eval_runtime": 524.2153, "eval_samples_per_second": 13.353, "eval_steps_per_second": 0.418, "step": 3700 }, { "epoch": 2.14, "grad_norm": 24.02010108132901, "learning_rate": 1.1578307338737061e-07, "logits/chosen": 3.425081729888916, "logits/rejected": 3.006711959838867, "logps/chosen": -929.4527587890625, "logps/rejected": -1369.5863037109375, "loss": 0.1092, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -7.172210693359375, "rewards/margins": 4.432849407196045, "rewards/rejected": -11.605059623718262, "step": 3710 }, { "epoch": 2.14, "grad_norm": 41.75642657784448, "learning_rate": 1.1437237147124429e-07, "logits/chosen": 3.6742496490478516, "logits/rejected": 4.064270973205566, "logps/chosen": -934.1087646484375, "logps/rejected": -1374.8040771484375, "loss": 0.1367, "rewards/accuracies": 0.96875, "rewards/chosen": -7.4983811378479, "rewards/margins": 4.256420135498047, "rewards/rejected": -11.754800796508789, "step": 3720 }, { "epoch": 2.15, "grad_norm": 53.780363694579194, "learning_rate": 1.1296776290870857e-07, "logits/chosen": 3.671604633331299, "logits/rejected": 3.692060947418213, "logps/chosen": -959.42431640625, "logps/rejected": -1417.148681640625, "loss": 0.1325, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -7.633244514465332, "rewards/margins": 4.507730007171631, "rewards/rejected": -12.140974044799805, "step": 3730 }, { "epoch": 2.15, "grad_norm": 67.6138661654559, "learning_rate": 1.1156931080472765e-07, "logits/chosen": 3.69933819770813, "logits/rejected": 4.018538475036621, "logps/chosen": -1000.7516479492188, "logps/rejected": -1473.5906982421875, "loss": 0.123, "rewards/accuracies": 0.96875, "rewards/chosen": -8.006914138793945, "rewards/margins": 4.656982898712158, "rewards/rejected": -12.663896560668945, "step": 3740 }, { "epoch": 2.16, "grad_norm": 45.545868169167775, "learning_rate": 1.1017707798767367e-07, "logits/chosen": 3.3847403526306152, "logits/rejected": 4.568535327911377, "logps/chosen": -957.02197265625, "logps/rejected": -1380.7176513671875, "loss": 0.1456, "rewards/accuracies": 0.90625, "rewards/chosen": -7.494771480560303, "rewards/margins": 4.399388790130615, "rewards/rejected": -11.894161224365234, "step": 3750 }, { "epoch": 2.17, "grad_norm": 44.949653729115894, "learning_rate": 1.0879112700650484e-07, "logits/chosen": 3.3210902214050293, "logits/rejected": 3.9475436210632324, "logps/chosen": -843.4647216796875, "logps/rejected": -1226.0308837890625, "loss": 0.1567, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -6.650018215179443, "rewards/margins": 3.8027775287628174, "rewards/rejected": -10.45279598236084, "step": 3760 }, { "epoch": 2.17, "grad_norm": 33.28710416182623, "learning_rate": 1.074115201279544e-07, "logits/chosen": 3.145082950592041, "logits/rejected": 3.7794318199157715, "logps/chosen": -906.9852294921875, "logps/rejected": -1351.574951171875, "loss": 0.1273, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -6.750838279724121, "rewards/margins": 4.593637943267822, "rewards/rejected": -11.344475746154785, "step": 3770 }, { "epoch": 2.18, "grad_norm": 35.339512346166224, "learning_rate": 1.0603831933373367e-07, "logits/chosen": 3.861907482147217, "logits/rejected": 3.6606667041778564, "logps/chosen": -854.71630859375, "logps/rejected": -1299.02734375, "loss": 0.1232, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -6.672190189361572, "rewards/margins": 4.436799049377441, "rewards/rejected": -11.108988761901855, "step": 3780 }, { "epoch": 2.18, "grad_norm": 44.47716217713485, "learning_rate": 1.0467158631774753e-07, "logits/chosen": 3.5417447090148926, "logits/rejected": 4.032229423522949, "logps/chosen": -871.6853637695312, "logps/rejected": -1298.8951416015625, "loss": 0.1481, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -6.7777533531188965, "rewards/margins": 4.256501197814941, "rewards/rejected": -11.034255027770996, "step": 3790 }, { "epoch": 2.19, "grad_norm": 45.81386982066394, "learning_rate": 1.0331138248332214e-07, "logits/chosen": 3.774331569671631, "logits/rejected": 3.6081912517547607, "logps/chosen": -892.2232666015625, "logps/rejected": -1349.931640625, "loss": 0.1267, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.830786228179932, "rewards/margins": 4.49179220199585, "rewards/rejected": -11.322577476501465, "step": 3800 }, { "epoch": 2.19, "eval_logits/chosen": 4.250500202178955, "eval_logits/rejected": 3.77284836769104, "eval_logps/chosen": -1002.367431640625, "eval_logps/rejected": -1157.1673583984375, "eval_loss": 0.6577708721160889, "eval_rewards/accuracies": 0.7071917653083801, "eval_rewards/chosen": -8.396106719970703, "eval_rewards/margins": 1.4723012447357178, "eval_rewards/rejected": -9.868408203125, "eval_runtime": 534.3189, "eval_samples_per_second": 13.101, "eval_steps_per_second": 0.41, "step": 3800 }, { "epoch": 2.19, "grad_norm": 28.50204886681048, "learning_rate": 1.0195776894044677e-07, "logits/chosen": 3.4102988243103027, "logits/rejected": 3.7134222984313965, "logps/chosen": -964.5877075195312, "logps/rejected": -1406.3525390625, "loss": 0.1354, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -7.355888366699219, "rewards/margins": 4.4665207862854, "rewards/rejected": -11.822408676147461, "step": 3810 }, { "epoch": 2.2, "grad_norm": 37.723187727370615, "learning_rate": 1.006108065030282e-07, "logits/chosen": 3.3981571197509766, "logits/rejected": 4.016012668609619, "logps/chosen": -925.6012573242188, "logps/rejected": -1318.197265625, "loss": 0.15, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -7.1904754638671875, "rewards/margins": 3.990299701690674, "rewards/rejected": -11.18077564239502, "step": 3820 }, { "epoch": 2.21, "grad_norm": 24.58303494340956, "learning_rate": 9.927055568615814e-08, "logits/chosen": 3.5139594078063965, "logits/rejected": 3.3289895057678223, "logps/chosen": -917.9603271484375, "logps/rejected": -1346.0223388671875, "loss": 0.1326, "rewards/accuracies": 0.9375, "rewards/chosen": -7.094983100891113, "rewards/margins": 4.4679412841796875, "rewards/rejected": -11.562923431396484, "step": 3830 }, { "epoch": 2.21, "grad_norm": 25.87765068100138, "learning_rate": 9.793707670339512e-08, "logits/chosen": 3.466686248779297, "logits/rejected": 3.674762725830078, "logps/chosen": -891.5042724609375, "logps/rejected": -1398.803466796875, "loss": 0.1235, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.796309471130371, "rewards/margins": 4.927484512329102, "rewards/rejected": -11.723793029785156, "step": 3840 }, { "epoch": 2.22, "grad_norm": 35.68696183099004, "learning_rate": 9.661042946405862e-08, "logits/chosen": 3.8154091835021973, "logits/rejected": 3.488732099533081, "logps/chosen": -943.6209716796875, "logps/rejected": -1349.93798828125, "loss": 0.1349, "rewards/accuracies": 0.9375, "rewards/chosen": -7.341130256652832, "rewards/margins": 4.134890556335449, "rewards/rejected": -11.476019859313965, "step": 3850 }, { "epoch": 2.22, "grad_norm": 47.14188257977685, "learning_rate": 9.529067357053805e-08, "logits/chosen": 3.728644847869873, "logits/rejected": 3.4553585052490234, "logps/chosen": -929.4494018554688, "logps/rejected": -1341.4207763671875, "loss": 0.12, "rewards/accuracies": 0.9375, "rewards/chosen": -7.181456089019775, "rewards/margins": 4.260495185852051, "rewards/rejected": -11.441950798034668, "step": 3860 }, { "epoch": 2.23, "grad_norm": 35.793425553231096, "learning_rate": 9.397786831561477e-08, "logits/chosen": 3.466740131378174, "logits/rejected": 3.3636794090270996, "logps/chosen": -904.3995361328125, "logps/rejected": -1293.5791015625, "loss": 0.1363, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -7.029148101806641, "rewards/margins": 4.021247863769531, "rewards/rejected": -11.050396919250488, "step": 3870 }, { "epoch": 2.23, "grad_norm": 27.402177463088442, "learning_rate": 9.267207267979793e-08, "logits/chosen": 2.894775867462158, "logits/rejected": 3.4359707832336426, "logps/chosen": -904.2491455078125, "logps/rejected": -1339.700927734375, "loss": 0.1678, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -6.733850955963135, "rewards/margins": 4.447150707244873, "rewards/rejected": -11.181001663208008, "step": 3880 }, { "epoch": 2.24, "grad_norm": 45.88740709849321, "learning_rate": 9.137334532867539e-08, "logits/chosen": 3.561063051223755, "logits/rejected": 3.498849391937256, "logps/chosen": -928.4862060546875, "logps/rejected": -1328.699951171875, "loss": 0.1474, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -7.216361045837402, "rewards/margins": 3.949263334274292, "rewards/rejected": -11.165624618530273, "step": 3890 }, { "epoch": 2.25, "grad_norm": 30.928380409930774, "learning_rate": 9.008174461027723e-08, "logits/chosen": 3.6378097534179688, "logits/rejected": 3.3116886615753174, "logps/chosen": -877.1246948242188, "logps/rejected": -1321.7664794921875, "loss": 0.117, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.808138370513916, "rewards/margins": 4.310723781585693, "rewards/rejected": -11.11886215209961, "step": 3900 }, { "epoch": 2.25, "eval_logits/chosen": 4.328708648681641, "eval_logits/rejected": 3.8391733169555664, "eval_logps/chosen": -1050.190673828125, "eval_logps/rejected": -1213.0369873046875, "eval_loss": 0.6594930291175842, "eval_rewards/accuracies": 0.7071917653083801, "eval_rewards/chosen": -8.874338150024414, "eval_rewards/margins": 1.5527644157409668, "eval_rewards/rejected": -10.427102088928223, "eval_runtime": 523.5136, "eval_samples_per_second": 13.371, "eval_steps_per_second": 0.418, "step": 3900 }, { "epoch": 2.25, "grad_norm": 38.699212561823494, "learning_rate": 8.87973285524548e-08, "logits/chosen": 4.161770820617676, "logits/rejected": 3.695528507232666, "logps/chosen": -963.068359375, "logps/rejected": -1355.615966796875, "loss": 0.1245, "rewards/accuracies": 0.96875, "rewards/chosen": -7.666827201843262, "rewards/margins": 4.300318241119385, "rewards/rejected": -11.967144966125488, "step": 3910 }, { "epoch": 2.26, "grad_norm": 37.28483038437771, "learning_rate": 8.752015486027384e-08, "logits/chosen": 3.911364793777466, "logits/rejected": 4.49477481842041, "logps/chosen": -943.16357421875, "logps/rejected": -1390.397705078125, "loss": 0.1387, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -7.607475280761719, "rewards/margins": 4.296053886413574, "rewards/rejected": -11.903529167175293, "step": 3920 }, { "epoch": 2.26, "grad_norm": 41.470353554915235, "learning_rate": 8.625028091342141e-08, "logits/chosen": 3.720032215118408, "logits/rejected": 3.8456428050994873, "logps/chosen": -937.7360229492188, "logps/rejected": -1364.440673828125, "loss": 0.1155, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -7.360597133636475, "rewards/margins": 4.529088973999023, "rewards/rejected": -11.889686584472656, "step": 3930 }, { "epoch": 2.27, "grad_norm": 45.20445276429018, "learning_rate": 8.498776376362854e-08, "logits/chosen": 3.377300262451172, "logits/rejected": 4.29878568649292, "logps/chosen": -887.1690673828125, "logps/rejected": -1354.564208984375, "loss": 0.1183, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.171189308166504, "rewards/margins": 4.503496170043945, "rewards/rejected": -11.67468547821045, "step": 3940 }, { "epoch": 2.27, "grad_norm": 34.53975996370149, "learning_rate": 8.373266013210684e-08, "logits/chosen": 3.5421910285949707, "logits/rejected": 3.538344621658325, "logps/chosen": -994.2314453125, "logps/rejected": -1371.2886962890625, "loss": 0.1388, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -7.742239952087402, "rewards/margins": 4.032508850097656, "rewards/rejected": -11.774749755859375, "step": 3950 }, { "epoch": 2.28, "grad_norm": 44.872798527879844, "learning_rate": 8.248502640699994e-08, "logits/chosen": 4.068849086761475, "logits/rejected": 3.5856387615203857, "logps/chosen": -976.8825073242188, "logps/rejected": -1400.572265625, "loss": 0.1477, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -7.779143333435059, "rewards/margins": 4.234991073608398, "rewards/rejected": -12.014135360717773, "step": 3960 }, { "epoch": 2.29, "grad_norm": 31.034603359859002, "learning_rate": 8.12449186408507e-08, "logits/chosen": 3.145857334136963, "logits/rejected": 3.8005003929138184, "logps/chosen": -1019.9567260742188, "logps/rejected": -1460.1539306640625, "loss": 0.1442, "rewards/accuracies": 0.90625, "rewards/chosen": -8.049921035766602, "rewards/margins": 4.520318984985352, "rewards/rejected": -12.570239067077637, "step": 3970 }, { "epoch": 2.29, "grad_norm": 32.118142512295485, "learning_rate": 8.001239254808231e-08, "logits/chosen": 3.26387357711792, "logits/rejected": 3.4488682746887207, "logps/chosen": -939.5700073242188, "logps/rejected": -1372.3636474609375, "loss": 0.1469, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -7.15889835357666, "rewards/margins": 4.461990833282471, "rewards/rejected": -11.62088680267334, "step": 3980 }, { "epoch": 2.3, "grad_norm": 27.48657801490801, "learning_rate": 7.878750350249567e-08, "logits/chosen": 3.8599491119384766, "logits/rejected": 4.156215190887451, "logps/chosen": -948.8709106445312, "logps/rejected": -1331.973388671875, "loss": 0.1325, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -7.445446014404297, "rewards/margins": 3.8431270122528076, "rewards/rejected": -11.288573265075684, "step": 3990 }, { "epoch": 2.3, "grad_norm": 40.273043049907756, "learning_rate": 7.757030653478147e-08, "logits/chosen": 3.53704833984375, "logits/rejected": 4.0952253341674805, "logps/chosen": -901.9874877929688, "logps/rejected": -1301.491943359375, "loss": 0.1347, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.8940582275390625, "rewards/margins": 4.171154022216797, "rewards/rejected": -11.06521224975586, "step": 4000 }, { "epoch": 2.3, "eval_logits/chosen": 4.10559606552124, "eval_logits/rejected": 3.6606037616729736, "eval_logps/chosen": -997.5932006835938, "eval_logps/rejected": -1158.1610107421875, "eval_loss": 0.6542690992355347, "eval_rewards/accuracies": 0.7049086689949036, "eval_rewards/chosen": -8.348363876342773, "eval_rewards/margins": 1.5299806594848633, "eval_rewards/rejected": -9.87834644317627, "eval_runtime": 535.2179, "eval_samples_per_second": 13.079, "eval_steps_per_second": 0.409, "step": 4000 }, { "epoch": 2.31, "grad_norm": 32.16106848236791, "learning_rate": 7.636085633004758e-08, "logits/chosen": 3.445827007293701, "logits/rejected": 3.4879963397979736, "logps/chosen": -932.2862548828125, "logps/rejected": -1376.392822265625, "loss": 0.1265, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -7.2514753341674805, "rewards/margins": 4.412221431732178, "rewards/rejected": -11.663698196411133, "step": 4010 }, { "epoch": 2.32, "grad_norm": 45.06681548775071, "learning_rate": 7.515920722536265e-08, "logits/chosen": 3.576918840408325, "logits/rejected": 3.469095230102539, "logps/chosen": -883.6856689453125, "logps/rejected": -1326.669189453125, "loss": 0.1512, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -6.875430107116699, "rewards/margins": 4.3003644943237305, "rewards/rejected": -11.17579460144043, "step": 4020 }, { "epoch": 2.32, "grad_norm": 46.41424598566772, "learning_rate": 7.396541320731448e-08, "logits/chosen": 3.257497787475586, "logits/rejected": 4.040172100067139, "logps/chosen": -858.6160278320312, "logps/rejected": -1261.221923828125, "loss": 0.1611, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.689664363861084, "rewards/margins": 4.144506454467773, "rewards/rejected": -10.8341703414917, "step": 4030 }, { "epoch": 2.33, "grad_norm": 37.36838057120251, "learning_rate": 7.277952790958491e-08, "logits/chosen": 3.4619956016540527, "logits/rejected": 3.5404574871063232, "logps/chosen": -901.9367065429688, "logps/rejected": -1369.282470703125, "loss": 0.1216, "rewards/accuracies": 0.96875, "rewards/chosen": -7.207834720611572, "rewards/margins": 4.645166873931885, "rewards/rejected": -11.853002548217773, "step": 4040 }, { "epoch": 2.33, "grad_norm": 29.866014708822856, "learning_rate": 7.160160461053993e-08, "logits/chosen": 3.8753581047058105, "logits/rejected": 3.6236767768859863, "logps/chosen": -917.4772338867188, "logps/rejected": -1325.6378173828125, "loss": 0.1396, "rewards/accuracies": 0.9375, "rewards/chosen": -7.180599212646484, "rewards/margins": 4.154732704162598, "rewards/rejected": -11.335331916809082, "step": 4050 }, { "epoch": 2.34, "grad_norm": 36.587430755348564, "learning_rate": 7.043169623083615e-08, "logits/chosen": 3.0402495861053467, "logits/rejected": 4.015208721160889, "logps/chosen": -932.0596923828125, "logps/rejected": -1433.057373046875, "loss": 0.1186, "rewards/accuracies": 0.96875, "rewards/chosen": -7.22055196762085, "rewards/margins": 5.099529266357422, "rewards/rejected": -12.320083618164062, "step": 4060 }, { "epoch": 2.34, "grad_norm": 38.64365447044933, "learning_rate": 6.92698553310434e-08, "logits/chosen": 2.790558099746704, "logits/rejected": 3.6561102867126465, "logps/chosen": -948.6279296875, "logps/rejected": -1388.6761474609375, "loss": 0.139, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.413588047027588, "rewards/margins": 4.362631320953369, "rewards/rejected": -11.776219367980957, "step": 4070 }, { "epoch": 2.35, "grad_norm": 35.57592373447809, "learning_rate": 6.811613410928293e-08, "logits/chosen": 3.6757571697235107, "logits/rejected": 3.7987685203552246, "logps/chosen": -945.6618041992188, "logps/rejected": -1378.7412109375, "loss": 0.1488, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.383998870849609, "rewards/margins": 4.412209510803223, "rewards/rejected": -11.796209335327148, "step": 4080 }, { "epoch": 2.36, "grad_norm": 37.113314727863184, "learning_rate": 6.697058439888283e-08, "logits/chosen": 3.6899166107177734, "logits/rejected": 3.817552089691162, "logps/chosen": -950.6007080078125, "logps/rejected": -1415.916015625, "loss": 0.1396, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -7.53304386138916, "rewards/margins": 4.553194522857666, "rewards/rejected": -12.086237907409668, "step": 4090 }, { "epoch": 2.36, "grad_norm": 41.71807618952847, "learning_rate": 6.583325766604891e-08, "logits/chosen": 3.4956564903259277, "logits/rejected": 4.254444122314453, "logps/chosen": -868.5919799804688, "logps/rejected": -1314.2900390625, "loss": 0.1329, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -6.6864166259765625, "rewards/margins": 4.680367469787598, "rewards/rejected": -11.366785049438477, "step": 4100 }, { "epoch": 2.36, "eval_logits/chosen": 3.9027516841888428, "eval_logits/rejected": 3.4748451709747314, "eval_logps/chosen": -989.0842895507812, "eval_logps/rejected": -1151.953125, "eval_loss": 0.6601446866989136, "eval_rewards/accuracies": 0.715753436088562, "eval_rewards/chosen": -8.263275146484375, "eval_rewards/margins": 1.5529905557632446, "eval_rewards/rejected": -9.816266059875488, "eval_runtime": 523.8046, "eval_samples_per_second": 13.364, "eval_steps_per_second": 0.418, "step": 4100 }, { "epoch": 2.37, "grad_norm": 44.03524273684572, "learning_rate": 6.470420500755245e-08, "logits/chosen": 3.1219146251678467, "logits/rejected": 3.3446502685546875, "logps/chosen": -878.4068603515625, "logps/rejected": -1331.8453369140625, "loss": 0.1486, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -6.718100070953369, "rewards/margins": 4.668488025665283, "rewards/rejected": -11.386587142944336, "step": 4110 }, { "epoch": 2.37, "grad_norm": 34.9419863675018, "learning_rate": 6.358347714843496e-08, "logits/chosen": 3.138986587524414, "logits/rejected": 3.5306930541992188, "logps/chosen": -887.5491943359375, "logps/rejected": -1297.932373046875, "loss": 0.1519, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -6.7458319664001465, "rewards/margins": 4.272372245788574, "rewards/rejected": -11.018204689025879, "step": 4120 }, { "epoch": 2.38, "grad_norm": 24.868378715928674, "learning_rate": 6.247112443972877e-08, "logits/chosen": 3.538499355316162, "logits/rejected": 3.797011137008667, "logps/chosen": -843.6851806640625, "logps/rejected": -1306.202880859375, "loss": 0.1226, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -6.51306676864624, "rewards/margins": 4.658224582672119, "rewards/rejected": -11.171292304992676, "step": 4130 }, { "epoch": 2.38, "grad_norm": 14.67849731927182, "learning_rate": 6.136719685619532e-08, "logits/chosen": 2.784571409225464, "logits/rejected": 3.48388409614563, "logps/chosen": -877.7296752929688, "logps/rejected": -1341.5706787109375, "loss": 0.1129, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -6.584415435791016, "rewards/margins": 4.698119163513184, "rewards/rejected": -11.282533645629883, "step": 4140 }, { "epoch": 2.39, "grad_norm": 26.981519458159955, "learning_rate": 6.027174399407975e-08, "logits/chosen": 3.2360587120056152, "logits/rejected": 3.8966755867004395, "logps/chosen": -907.2763671875, "logps/rejected": -1369.2713623046875, "loss": 0.146, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.074806213378906, "rewards/margins": 4.484217166900635, "rewards/rejected": -11.559022903442383, "step": 4150 }, { "epoch": 2.4, "grad_norm": 35.704630431756776, "learning_rate": 5.918481506888254e-08, "logits/chosen": 3.5119457244873047, "logits/rejected": 3.3940329551696777, "logps/chosen": -929.1085815429688, "logps/rejected": -1363.948974609375, "loss": 0.1589, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.221730709075928, "rewards/margins": 4.331007957458496, "rewards/rejected": -11.552740097045898, "step": 4160 }, { "epoch": 2.4, "grad_norm": 53.701204340415565, "learning_rate": 5.810645891314875e-08, "logits/chosen": 3.2812492847442627, "logits/rejected": 3.86497163772583, "logps/chosen": -942.5027465820312, "logps/rejected": -1376.8060302734375, "loss": 0.1305, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -7.318559169769287, "rewards/margins": 4.350612163543701, "rewards/rejected": -11.669171333312988, "step": 4170 }, { "epoch": 2.41, "grad_norm": 39.292310753724045, "learning_rate": 5.7036723974273784e-08, "logits/chosen": 3.5441174507141113, "logits/rejected": 4.2507476806640625, "logps/chosen": -964.4812622070312, "logps/rejected": -1366.4954833984375, "loss": 0.145, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.610236167907715, "rewards/margins": 4.157071113586426, "rewards/rejected": -11.767308235168457, "step": 4180 }, { "epoch": 2.41, "grad_norm": 30.3993820093043, "learning_rate": 5.5975658312326897e-08, "logits/chosen": 3.7909579277038574, "logits/rejected": 3.3009555339813232, "logps/chosen": -973.6944580078125, "logps/rejected": -1398.533203125, "loss": 0.1349, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.814784049987793, "rewards/margins": 4.2737908363342285, "rewards/rejected": -12.08857536315918, "step": 4190 }, { "epoch": 2.42, "grad_norm": 44.42887108791089, "learning_rate": 5.4923309597892265e-08, "logits/chosen": 3.3749911785125732, "logits/rejected": 4.048464298248291, "logps/chosen": -940.4107666015625, "logps/rejected": -1377.796875, "loss": 0.1272, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.422223091125488, "rewards/margins": 4.310248374938965, "rewards/rejected": -11.73246955871582, "step": 4200 }, { "epoch": 2.42, "eval_logits/chosen": 4.056370735168457, "eval_logits/rejected": 3.5793864727020264, "eval_logps/chosen": -1001.013427734375, "eval_logps/rejected": -1163.147216796875, "eval_loss": 0.652097225189209, "eval_rewards/accuracies": 0.7128995656967163, "eval_rewards/chosen": -8.382566452026367, "eval_rewards/margins": 1.545638918876648, "eval_rewards/rejected": -9.928205490112305, "eval_runtime": 535.0283, "eval_samples_per_second": 13.083, "eval_steps_per_second": 0.409, "step": 4200 }, { "epoch": 2.42, "grad_norm": 39.11312414628646, "learning_rate": 5.3879725109926723e-08, "logits/chosen": 3.5782432556152344, "logits/rejected": 3.2305328845977783, "logps/chosen": -900.3201293945312, "logps/rejected": -1358.17041015625, "loss": 0.1444, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -7.092508792877197, "rewards/margins": 4.507454872131348, "rewards/rejected": -11.599964141845703, "step": 4210 }, { "epoch": 2.43, "grad_norm": 40.99713180492809, "learning_rate": 5.284495173363626e-08, "logits/chosen": 3.173649787902832, "logits/rejected": 3.9827969074249268, "logps/chosen": -916.6148681640625, "logps/rejected": -1312.101806640625, "loss": 0.1283, "rewards/accuracies": 0.9375, "rewards/chosen": -6.947560787200928, "rewards/margins": 4.14273738861084, "rewards/rejected": -11.09029769897461, "step": 4220 }, { "epoch": 2.44, "grad_norm": 51.274063630939764, "learning_rate": 5.1819035958369284e-08, "logits/chosen": 3.055410385131836, "logits/rejected": 3.512051820755005, "logps/chosen": -886.7150268554688, "logps/rejected": -1339.948974609375, "loss": 0.143, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -6.86300802230835, "rewards/margins": 4.426472187042236, "rewards/rejected": -11.289480209350586, "step": 4230 }, { "epoch": 2.44, "grad_norm": 38.116186692602646, "learning_rate": 5.080202387552779e-08, "logits/chosen": 3.409067153930664, "logits/rejected": 3.5700390338897705, "logps/chosen": -897.6995849609375, "logps/rejected": -1350.080078125, "loss": 0.1471, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -7.033686637878418, "rewards/margins": 4.646275997161865, "rewards/rejected": -11.679962158203125, "step": 4240 }, { "epoch": 2.45, "grad_norm": 32.66693864656507, "learning_rate": 4.979396117649723e-08, "logits/chosen": 3.36116361618042, "logits/rejected": 4.010204315185547, "logps/chosen": -881.8785400390625, "logps/rejected": -1306.0771484375, "loss": 0.1406, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -6.626011848449707, "rewards/margins": 4.47509241104126, "rewards/rejected": -11.101104736328125, "step": 4250 }, { "epoch": 2.45, "grad_norm": 35.1662117069838, "learning_rate": 4.8794893150592985e-08, "logits/chosen": 3.52885103225708, "logits/rejected": 3.6744933128356934, "logps/chosen": -876.4284057617188, "logps/rejected": -1341.84423828125, "loss": 0.1215, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.920140743255615, "rewards/margins": 4.564593315124512, "rewards/rejected": -11.484734535217285, "step": 4260 }, { "epoch": 2.46, "grad_norm": 64.57653782543068, "learning_rate": 4.7804864683026305e-08, "logits/chosen": 3.4071502685546875, "logits/rejected": 3.6584205627441406, "logps/chosen": -941.4830932617188, "logps/rejected": -1365.452392578125, "loss": 0.1515, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -7.169929504394531, "rewards/margins": 4.422930717468262, "rewards/rejected": -11.592859268188477, "step": 4270 }, { "epoch": 2.46, "grad_norm": 46.41519019098213, "learning_rate": 4.682392025288737e-08, "logits/chosen": 3.3917083740234375, "logits/rejected": 3.692983627319336, "logps/chosen": -903.2981567382812, "logps/rejected": -1324.2955322265625, "loss": 0.1739, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.126486778259277, "rewards/margins": 4.112493991851807, "rewards/rejected": -11.238981246948242, "step": 4280 }, { "epoch": 2.47, "grad_norm": 29.0056019781549, "learning_rate": 4.5852103931146914e-08, "logits/chosen": 3.222928524017334, "logits/rejected": 3.363715410232544, "logps/chosen": -917.5579833984375, "logps/rejected": -1358.8013916015625, "loss": 0.1377, "rewards/accuracies": 0.96875, "rewards/chosen": -6.974820613861084, "rewards/margins": 4.559780120849609, "rewards/rejected": -11.534601211547852, "step": 4290 }, { "epoch": 2.48, "grad_norm": 34.16319206524675, "learning_rate": 4.4889459378676716e-08, "logits/chosen": 3.677508592605591, "logits/rejected": 3.27167010307312, "logps/chosen": -914.7694091796875, "logps/rejected": -1342.080322265625, "loss": 0.1398, "rewards/accuracies": 0.9375, "rewards/chosen": -7.174644470214844, "rewards/margins": 4.208683490753174, "rewards/rejected": -11.383328437805176, "step": 4300 }, { "epoch": 2.48, "eval_logits/chosen": 4.010648250579834, "eval_logits/rejected": 3.5276999473571777, "eval_logps/chosen": -982.0401000976562, "eval_logps/rejected": -1140.152587890625, "eval_loss": 0.644024133682251, "eval_rewards/accuracies": 0.7146118879318237, "eval_rewards/chosen": -8.19283390045166, "eval_rewards/margins": 1.505425214767456, "eval_rewards/rejected": -9.698260307312012, "eval_runtime": 533.1528, "eval_samples_per_second": 13.129, "eval_steps_per_second": 0.411, "step": 4300 }, { "epoch": 2.48, "grad_norm": 52.35230672260995, "learning_rate": 4.3936029844287346e-08, "logits/chosen": 3.636483669281006, "logits/rejected": 3.7139065265655518, "logps/chosen": -913.3740234375, "logps/rejected": -1328.880615234375, "loss": 0.1658, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -7.011006832122803, "rewards/margins": 4.355842590332031, "rewards/rejected": -11.366849899291992, "step": 4310 }, { "epoch": 2.49, "grad_norm": 35.760824817713285, "learning_rate": 4.299185816278586e-08, "logits/chosen": 3.3019766807556152, "logits/rejected": 3.6124000549316406, "logps/chosen": -887.3786010742188, "logps/rejected": -1341.304443359375, "loss": 0.1346, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.918748378753662, "rewards/margins": 4.502823352813721, "rewards/rejected": -11.4215726852417, "step": 4320 }, { "epoch": 2.49, "grad_norm": 34.471122915239434, "learning_rate": 4.205698675305075e-08, "logits/chosen": 3.2285988330841064, "logits/rejected": 3.359661817550659, "logps/chosen": -871.5960693359375, "logps/rejected": -1235.963134765625, "loss": 0.1664, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -6.90744686126709, "rewards/margins": 3.6671767234802246, "rewards/rejected": -10.574625015258789, "step": 4330 }, { "epoch": 2.5, "grad_norm": 29.52788508184838, "learning_rate": 4.1131457616126435e-08, "logits/chosen": 3.282252550125122, "logits/rejected": 3.6051268577575684, "logps/chosen": -892.0718994140625, "logps/rejected": -1324.36279296875, "loss": 0.1779, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -6.922811031341553, "rewards/margins": 4.2147345542907715, "rewards/rejected": -11.137545585632324, "step": 4340 }, { "epoch": 2.51, "grad_norm": 27.822632694375944, "learning_rate": 4.021531233333647e-08, "logits/chosen": 3.266111373901367, "logits/rejected": 3.3648173809051514, "logps/chosen": -846.4069213867188, "logps/rejected": -1290.188720703125, "loss": 0.1608, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -6.597044467926025, "rewards/margins": 4.172820091247559, "rewards/rejected": -10.769864082336426, "step": 4350 }, { "epoch": 2.51, "grad_norm": 22.686078484510546, "learning_rate": 3.930859206441514e-08, "logits/chosen": 2.971287250518799, "logits/rejected": 3.480159282684326, "logps/chosen": -836.77392578125, "logps/rejected": -1265.6175537109375, "loss": 0.1417, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -6.353575706481934, "rewards/margins": 4.211709499359131, "rewards/rejected": -10.565286636352539, "step": 4360 }, { "epoch": 2.52, "grad_norm": 16.95042361064101, "learning_rate": 3.84113375456582e-08, "logits/chosen": 3.7946529388427734, "logits/rejected": 3.587723970413208, "logps/chosen": -838.5626831054688, "logps/rejected": -1234.576171875, "loss": 0.148, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -6.523820400238037, "rewards/margins": 4.122513771057129, "rewards/rejected": -10.646333694458008, "step": 4370 }, { "epoch": 2.52, "grad_norm": 31.48101304759037, "learning_rate": 3.7523589088093164e-08, "logits/chosen": 3.343883514404297, "logits/rejected": 3.222503185272217, "logps/chosen": -875.6920166015625, "logps/rejected": -1322.442626953125, "loss": 0.136, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.677162170410156, "rewards/margins": 4.560788154602051, "rewards/rejected": -11.237950325012207, "step": 4380 }, { "epoch": 2.53, "grad_norm": 40.61551117887762, "learning_rate": 3.664538657566765e-08, "logits/chosen": 3.3236308097839355, "logits/rejected": 3.6839561462402344, "logps/chosen": -865.4827270507812, "logps/rejected": -1297.187744140625, "loss": 0.1495, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.574391841888428, "rewards/margins": 4.302542686462402, "rewards/rejected": -10.876935958862305, "step": 4390 }, { "epoch": 2.53, "grad_norm": 34.468677981595995, "learning_rate": 3.577676946345801e-08, "logits/chosen": 3.310579776763916, "logits/rejected": 3.828270435333252, "logps/chosen": -881.82568359375, "logps/rejected": -1273.034912109375, "loss": 0.1452, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -6.846297264099121, "rewards/margins": 3.943183183670044, "rewards/rejected": -10.789480209350586, "step": 4400 }, { "epoch": 2.53, "eval_logits/chosen": 3.8628814220428467, "eval_logits/rejected": 3.3969695568084717, "eval_logps/chosen": -939.8471069335938, "eval_logps/rejected": -1096.2967529296875, "eval_loss": 0.6378697156906128, "eval_rewards/accuracies": 0.7140411138534546, "eval_rewards/chosen": -7.770903587341309, "eval_rewards/margins": 1.4887957572937012, "eval_rewards/rejected": -9.259699821472168, "eval_runtime": 523.5247, "eval_samples_per_second": 13.371, "eval_steps_per_second": 0.418, "step": 4400 }, { "epoch": 2.54, "grad_norm": 35.260687131757436, "learning_rate": 3.4917776775896576e-08, "logits/chosen": 3.1702721118927, "logits/rejected": 3.5836589336395264, "logps/chosen": -877.7635498046875, "logps/rejected": -1304.9306640625, "loss": 0.1729, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -6.844918251037598, "rewards/margins": 4.211400032043457, "rewards/rejected": -11.056320190429688, "step": 4410 }, { "epoch": 2.55, "grad_norm": 33.18005173630772, "learning_rate": 3.4068447105018134e-08, "logits/chosen": 3.609553098678589, "logits/rejected": 3.338224411010742, "logps/chosen": -890.1029052734375, "logps/rejected": -1287.7030029296875, "loss": 0.1552, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -6.696571350097656, "rewards/margins": 4.127292633056641, "rewards/rejected": -10.823864936828613, "step": 4420 }, { "epoch": 2.55, "grad_norm": 32.51875701680016, "learning_rate": 3.322881860872659e-08, "logits/chosen": 3.5151398181915283, "logits/rejected": 3.803191661834717, "logps/chosen": -836.3896484375, "logps/rejected": -1268.8572998046875, "loss": 0.1458, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.49831485748291, "rewards/margins": 4.259902000427246, "rewards/rejected": -10.75821590423584, "step": 4430 }, { "epoch": 2.56, "grad_norm": 23.42738936789001, "learning_rate": 3.239892900908012e-08, "logits/chosen": 3.669468641281128, "logits/rejected": 3.9529166221618652, "logps/chosen": -871.4064331054688, "logps/rejected": -1285.660400390625, "loss": 0.1508, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -6.824498653411865, "rewards/margins": 4.092757225036621, "rewards/rejected": -10.917256355285645, "step": 4440 }, { "epoch": 2.56, "grad_norm": 18.250895669897048, "learning_rate": 3.157881559059691e-08, "logits/chosen": 3.541872024536133, "logits/rejected": 3.377525806427002, "logps/chosen": -867.1380004882812, "logps/rejected": -1260.8739013671875, "loss": 0.1398, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -6.6389479637146, "rewards/margins": 3.8816921710968018, "rewards/rejected": -10.52064037322998, "step": 4450 }, { "epoch": 2.57, "grad_norm": 31.68922707896326, "learning_rate": 3.076851519857992e-08, "logits/chosen": 3.586460828781128, "logits/rejected": 3.671218156814575, "logps/chosen": -839.7230224609375, "logps/rejected": -1220.527587890625, "loss": 0.1571, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.481574058532715, "rewards/margins": 3.843846082687378, "rewards/rejected": -10.325420379638672, "step": 4460 }, { "epoch": 2.57, "grad_norm": 28.846365041402194, "learning_rate": 2.9968064237461036e-08, "logits/chosen": 3.8102595806121826, "logits/rejected": 2.9997200965881348, "logps/chosen": -857.5617065429688, "logps/rejected": -1306.5784912109375, "loss": 0.1183, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -6.628857612609863, "rewards/margins": 4.439366817474365, "rewards/rejected": -11.068224906921387, "step": 4470 }, { "epoch": 2.58, "grad_norm": 34.28501205701433, "learning_rate": 2.9177498669166385e-08, "logits/chosen": 3.3406081199645996, "logits/rejected": 3.922384262084961, "logps/chosen": -913.9679565429688, "logps/rejected": -1340.844482421875, "loss": 0.1308, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -7.009526252746582, "rewards/margins": 4.4512834548950195, "rewards/rejected": -11.460810661315918, "step": 4480 }, { "epoch": 2.59, "grad_norm": 17.755984176388782, "learning_rate": 2.839685401150016e-08, "logits/chosen": 3.2124557495117188, "logits/rejected": 4.063717842102051, "logps/chosen": -882.6492309570312, "logps/rejected": -1293.188720703125, "loss": 0.1325, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.850244045257568, "rewards/margins": 4.191952705383301, "rewards/rejected": -11.042196273803711, "step": 4490 }, { "epoch": 2.59, "grad_norm": 41.28756351683251, "learning_rate": 2.7626165336548767e-08, "logits/chosen": 3.6572654247283936, "logits/rejected": 3.4529061317443848, "logps/chosen": -889.6456909179688, "logps/rejected": -1312.345703125, "loss": 0.1686, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -6.958000183105469, "rewards/margins": 4.2848381996154785, "rewards/rejected": -11.242837905883789, "step": 4500 }, { "epoch": 2.59, "eval_logits/chosen": 3.9841318130493164, "eval_logits/rejected": 3.509991407394409, "eval_logps/chosen": -966.255859375, "eval_logps/rejected": -1124.885009765625, "eval_loss": 0.646466076374054, "eval_rewards/accuracies": 0.7151826620101929, "eval_rewards/chosen": -8.034990310668945, "eval_rewards/margins": 1.5105922222137451, "eval_rewards/rejected": -9.54558277130127, "eval_runtime": 534.8325, "eval_samples_per_second": 13.088, "eval_steps_per_second": 0.409, "step": 4500 }, { "epoch": 2.6, "grad_norm": 18.142410096661752, "learning_rate": 2.6865467269105656e-08, "logits/chosen": 3.2625327110290527, "logits/rejected": 4.080760955810547, "logps/chosen": -898.0673828125, "logps/rejected": -1296.6719970703125, "loss": 0.1547, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -6.928767204284668, "rewards/margins": 4.158444881439209, "rewards/rejected": -11.087211608886719, "step": 4510 }, { "epoch": 2.6, "grad_norm": 33.90865602325425, "learning_rate": 2.611479398511518e-08, "logits/chosen": 3.4053382873535156, "logits/rejected": 3.14452862739563, "logps/chosen": -849.9923095703125, "logps/rejected": -1325.1151123046875, "loss": 0.1354, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -6.318703651428223, "rewards/margins": 4.736924171447754, "rewards/rejected": -11.055627822875977, "step": 4520 }, { "epoch": 2.61, "grad_norm": 26.912418273029285, "learning_rate": 2.5374179210137598e-08, "logits/chosen": 3.3441529273986816, "logits/rejected": 3.627659320831299, "logps/chosen": -889.0759887695312, "logps/rejected": -1360.015380859375, "loss": 0.1516, "rewards/accuracies": 0.96875, "rewards/chosen": -6.886460781097412, "rewards/margins": 4.6205315589904785, "rewards/rejected": -11.506993293762207, "step": 4530 }, { "epoch": 2.61, "grad_norm": 40.225294980540966, "learning_rate": 2.464365621783368e-08, "logits/chosen": 3.03694224357605, "logits/rejected": 3.6655147075653076, "logps/chosen": -901.2951049804688, "logps/rejected": -1348.4556884765625, "loss": 0.1438, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -6.612797737121582, "rewards/margins": 4.596195220947266, "rewards/rejected": -11.208993911743164, "step": 4540 }, { "epoch": 2.62, "grad_norm": 27.857983547283695, "learning_rate": 2.3923257828469712e-08, "logits/chosen": 3.6198372840881348, "logits/rejected": 3.565645694732666, "logps/chosen": -851.4136962890625, "logps/rejected": -1310.963623046875, "loss": 0.1333, "rewards/accuracies": 0.9375, "rewards/chosen": -6.542910575866699, "rewards/margins": 4.454277992248535, "rewards/rejected": -10.997190475463867, "step": 4550 }, { "epoch": 2.63, "grad_norm": 23.33354642542785, "learning_rate": 2.3213016407443358e-08, "logits/chosen": 3.038256883621216, "logits/rejected": 3.6552834510803223, "logps/chosen": -864.9734497070312, "logps/rejected": -1263.995361328125, "loss": 0.1512, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -6.4751410484313965, "rewards/margins": 4.280707359313965, "rewards/rejected": -10.755849838256836, "step": 4560 }, { "epoch": 2.63, "grad_norm": 23.946893414070093, "learning_rate": 2.251296386382906e-08, "logits/chosen": 3.6699681282043457, "logits/rejected": 3.5617077350616455, "logps/chosen": -899.76611328125, "logps/rejected": -1295.893310546875, "loss": 0.139, "rewards/accuracies": 0.96875, "rewards/chosen": -6.931687355041504, "rewards/margins": 4.101052761077881, "rewards/rejected": -11.032739639282227, "step": 4570 }, { "epoch": 2.64, "grad_norm": 22.94275037286604, "learning_rate": 2.182313164894489e-08, "logits/chosen": 3.5911636352539062, "logits/rejected": 3.5587100982666016, "logps/chosen": -898.5363159179688, "logps/rejected": -1352.5966796875, "loss": 0.1242, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -6.980717658996582, "rewards/margins": 4.443552017211914, "rewards/rejected": -11.424270629882812, "step": 4580 }, { "epoch": 2.64, "grad_norm": 34.33123595936751, "learning_rate": 2.1143550754939425e-08, "logits/chosen": 3.3467681407928467, "logits/rejected": 3.774111270904541, "logps/chosen": -890.4309692382812, "logps/rejected": -1297.5684814453125, "loss": 0.1581, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.697007656097412, "rewards/margins": 4.356503486633301, "rewards/rejected": -11.053510665893555, "step": 4590 }, { "epoch": 2.65, "grad_norm": 51.671795879801024, "learning_rate": 2.0474251713399142e-08, "logits/chosen": 3.565133571624756, "logits/rejected": 3.6993865966796875, "logps/chosen": -957.3294677734375, "logps/rejected": -1375.5245361328125, "loss": 0.1626, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -7.553174018859863, "rewards/margins": 4.247647285461426, "rewards/rejected": -11.800821304321289, "step": 4600 }, { "epoch": 2.65, "eval_logits/chosen": 4.007676124572754, "eval_logits/rejected": 3.5311696529388428, "eval_logps/chosen": -968.5971069335938, "eval_logps/rejected": -1129.09814453125, "eval_loss": 0.6461330056190491, "eval_rewards/accuracies": 0.7151826620101929, "eval_rewards/chosen": -8.058403015136719, "eval_rewards/margins": 1.5293123722076416, "eval_rewards/rejected": -9.587715148925781, "eval_runtime": 524.1499, "eval_samples_per_second": 13.355, "eval_steps_per_second": 0.418, "step": 4600 }, { "epoch": 2.65, "grad_norm": 35.77288253958212, "learning_rate": 1.9815264593977034e-08, "logits/chosen": 3.814164400100708, "logits/rejected": 3.532639265060425, "logps/chosen": -856.3995361328125, "logps/rejected": -1297.891845703125, "loss": 0.168, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -6.720461845397949, "rewards/margins": 4.241576194763184, "rewards/rejected": -10.962038040161133, "step": 4610 }, { "epoch": 2.66, "grad_norm": 30.668232671849207, "learning_rate": 1.9166619003041424e-08, "logits/chosen": 3.4838790893554688, "logits/rejected": 3.3041927814483643, "logps/chosen": -879.7357177734375, "logps/rejected": -1339.953857421875, "loss": 0.1375, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -6.853123664855957, "rewards/margins": 4.509236812591553, "rewards/rejected": -11.362360000610352, "step": 4620 }, { "epoch": 2.67, "grad_norm": 26.870324853794724, "learning_rate": 1.8528344082345965e-08, "logits/chosen": 4.082023620605469, "logits/rejected": 4.03671932220459, "logps/chosen": -864.4454345703125, "logps/rejected": -1316.72021484375, "loss": 0.1457, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.773983955383301, "rewards/margins": 4.559479713439941, "rewards/rejected": -11.333462715148926, "step": 4630 }, { "epoch": 2.67, "grad_norm": 30.188068349540764, "learning_rate": 1.7900468507720395e-08, "logits/chosen": 3.4232888221740723, "logits/rejected": 3.5408999919891357, "logps/chosen": -916.8629760742188, "logps/rejected": -1353.6793212890625, "loss": 0.1513, "rewards/accuracies": 0.9375, "rewards/chosen": -6.97579288482666, "rewards/margins": 4.428987979888916, "rewards/rejected": -11.40478229522705, "step": 4640 }, { "epoch": 2.68, "grad_norm": 30.28147195987134, "learning_rate": 1.72830204877821e-08, "logits/chosen": 3.3942248821258545, "logits/rejected": 4.188652992248535, "logps/chosen": -860.9830322265625, "logps/rejected": -1322.212646484375, "loss": 0.134, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -6.711422920227051, "rewards/margins": 4.580803871154785, "rewards/rejected": -11.292226791381836, "step": 4650 }, { "epoch": 2.68, "grad_norm": 28.145822188860837, "learning_rate": 1.6676027762668964e-08, "logits/chosen": 3.5769622325897217, "logits/rejected": 3.5155227184295654, "logps/chosen": -892.1435546875, "logps/rejected": -1289.792236328125, "loss": 0.1292, "rewards/accuracies": 0.9375, "rewards/chosen": -6.827092170715332, "rewards/margins": 4.144232749938965, "rewards/rejected": -10.971325874328613, "step": 4660 }, { "epoch": 2.69, "grad_norm": 33.06414686314608, "learning_rate": 1.607951760279297e-08, "logits/chosen": 3.45444917678833, "logits/rejected": 3.792262315750122, "logps/chosen": -868.2800903320312, "logps/rejected": -1299.379638671875, "loss": 0.16, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -6.855556488037109, "rewards/margins": 4.262399673461914, "rewards/rejected": -11.117956161499023, "step": 4670 }, { "epoch": 2.7, "grad_norm": 60.80878690428588, "learning_rate": 1.5493516807614903e-08, "logits/chosen": 3.516972780227661, "logits/rejected": 3.576953411102295, "logps/chosen": -927.10400390625, "logps/rejected": -1362.1495361328125, "loss": 0.1367, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -7.0487799644470215, "rewards/margins": 4.471040725708008, "rewards/rejected": -11.519821166992188, "step": 4680 }, { "epoch": 2.7, "grad_norm": 16.841740865068495, "learning_rate": 1.4918051704440633e-08, "logits/chosen": 3.5870327949523926, "logits/rejected": 3.9361374378204346, "logps/chosen": -892.5515747070312, "logps/rejected": -1326.2392578125, "loss": 0.1349, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.9856438636779785, "rewards/margins": 4.329216480255127, "rewards/rejected": -11.314860343933105, "step": 4690 }, { "epoch": 2.71, "grad_norm": 38.78231445380969, "learning_rate": 1.435314814723798e-08, "logits/chosen": 3.042799472808838, "logits/rejected": 4.420306205749512, "logps/chosen": -863.3905029296875, "logps/rejected": -1269.309814453125, "loss": 0.1496, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -6.8190178871154785, "rewards/margins": 4.038343906402588, "rewards/rejected": -10.85736083984375, "step": 4700 }, { "epoch": 2.71, "eval_logits/chosen": 4.0036139488220215, "eval_logits/rejected": 3.533724784851074, "eval_logps/chosen": -962.5296020507812, "eval_logps/rejected": -1123.53759765625, "eval_loss": 0.6474282741546631, "eval_rewards/accuracies": 0.7163242101669312, "eval_rewards/chosen": -7.997727394104004, "eval_rewards/margins": 1.5343804359436035, "eval_rewards/rejected": -9.532108306884766, "eval_runtime": 535.4123, "eval_samples_per_second": 13.074, "eval_steps_per_second": 0.409, "step": 4700 }, { "epoch": 2.71, "grad_norm": 24.551223246045325, "learning_rate": 1.3798831515475395e-08, "logits/chosen": 3.072558879852295, "logits/rejected": 3.710848331451416, "logps/chosen": -891.6023559570312, "logps/rejected": -1356.918212890625, "loss": 0.1544, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -6.867218971252441, "rewards/margins": 4.621755599975586, "rewards/rejected": -11.488973617553711, "step": 4710 }, { "epoch": 2.72, "grad_norm": 40.47189397993195, "learning_rate": 1.3255126712981734e-08, "logits/chosen": 3.7027173042297363, "logits/rejected": 3.381751537322998, "logps/chosen": -869.5496826171875, "logps/rejected": -1306.2579345703125, "loss": 0.1212, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.6971116065979, "rewards/margins": 4.516592979431152, "rewards/rejected": -11.213704109191895, "step": 4720 }, { "epoch": 2.72, "grad_norm": 34.56575568304188, "learning_rate": 1.2722058166827143e-08, "logits/chosen": 3.8306148052215576, "logits/rejected": 3.7671120166778564, "logps/chosen": -845.8519287109375, "logps/rejected": -1276.844970703125, "loss": 0.1452, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -6.482962131500244, "rewards/margins": 4.464810371398926, "rewards/rejected": -10.947772026062012, "step": 4730 }, { "epoch": 2.73, "grad_norm": 29.479305631975514, "learning_rate": 1.219964982622601e-08, "logits/chosen": 3.4553637504577637, "logits/rejected": 3.317314624786377, "logps/chosen": -883.1945190429688, "logps/rejected": -1327.991455078125, "loss": 0.1759, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -6.809109687805176, "rewards/margins": 4.410789966583252, "rewards/rejected": -11.21989917755127, "step": 4740 }, { "epoch": 2.74, "grad_norm": 26.28208646673632, "learning_rate": 1.1687925161460665e-08, "logits/chosen": 3.2795956134796143, "logits/rejected": 3.284482955932617, "logps/chosen": -892.5914916992188, "logps/rejected": -1286.396728515625, "loss": 0.1773, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -6.642425537109375, "rewards/margins": 4.2639641761779785, "rewards/rejected": -10.906389236450195, "step": 4750 }, { "epoch": 2.74, "grad_norm": 32.00390074683538, "learning_rate": 1.1186907162827026e-08, "logits/chosen": 3.102405071258545, "logits/rejected": 3.890953779220581, "logps/chosen": -843.8536376953125, "logps/rejected": -1307.8677978515625, "loss": 0.1402, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -6.512621879577637, "rewards/margins": 4.557175636291504, "rewards/rejected": -11.06979751586914, "step": 4760 }, { "epoch": 2.75, "grad_norm": 53.47542641508516, "learning_rate": 1.0696618339601921e-08, "logits/chosen": 3.2005233764648438, "logits/rejected": 3.7379202842712402, "logps/chosen": -899.4488525390625, "logps/rejected": -1338.7275390625, "loss": 0.143, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -6.747419834136963, "rewards/margins": 4.382193088531494, "rewards/rejected": -11.129612922668457, "step": 4770 }, { "epoch": 2.75, "grad_norm": 46.897075781966414, "learning_rate": 1.0217080719031435e-08, "logits/chosen": 3.1699719429016113, "logits/rejected": 3.5534234046936035, "logps/chosen": -844.8187255859375, "logps/rejected": -1327.2874755859375, "loss": 0.1666, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.6261491775512695, "rewards/margins": 4.503304958343506, "rewards/rejected": -11.12945556640625, "step": 4780 }, { "epoch": 2.76, "grad_norm": 40.37252120725173, "learning_rate": 9.74831584534161e-09, "logits/chosen": 3.5424411296844482, "logits/rejected": 3.3728795051574707, "logps/chosen": -877.6661987304688, "logps/rejected": -1362.89794921875, "loss": 0.1555, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -6.622837066650391, "rewards/margins": 4.987973690032959, "rewards/rejected": -11.610811233520508, "step": 4790 }, { "epoch": 2.76, "grad_norm": 30.716680539889897, "learning_rate": 9.290344778770414e-09, "logits/chosen": 3.1436657905578613, "logits/rejected": 3.227104902267456, "logps/chosen": -890.2732543945312, "logps/rejected": -1304.5169677734375, "loss": 0.1418, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -6.701428413391113, "rewards/margins": 4.270579814910889, "rewards/rejected": -10.972007751464844, "step": 4800 }, { "epoch": 2.76, "eval_logits/chosen": 4.0293426513671875, "eval_logits/rejected": 3.5537781715393066, "eval_logps/chosen": -960.7057495117188, "eval_logps/rejected": -1119.3050537109375, "eval_loss": 0.6430767774581909, "eval_rewards/accuracies": 0.7146118879318237, "eval_rewards/chosen": -7.979489803314209, "eval_rewards/margins": 1.51029372215271, "eval_rewards/rejected": -9.489784240722656, "eval_runtime": 523.5473, "eval_samples_per_second": 13.37, "eval_steps_per_second": 0.418, "step": 4800 }, { "epoch": 2.77, "grad_norm": 45.32998206948141, "learning_rate": 8.843188094621422e-09, "logits/chosen": 3.806798219680786, "logits/rejected": 3.341357469558716, "logps/chosen": -863.56103515625, "logps/rejected": -1294.8153076171875, "loss": 0.1605, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -6.850883483886719, "rewards/margins": 4.211820125579834, "rewards/rejected": -11.062702178955078, "step": 4810 }, { "epoch": 2.78, "grad_norm": 36.908565504020146, "learning_rate": 8.406865882339769e-09, "logits/chosen": 3.5770492553710938, "logits/rejected": 3.332200527191162, "logps/chosen": -959.1471557617188, "logps/rejected": -1318.4459228515625, "loss": 0.1521, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -7.319136619567871, "rewards/margins": 3.8620917797088623, "rewards/rejected": -11.181228637695312, "step": 4820 }, { "epoch": 2.78, "grad_norm": 29.014744076250533, "learning_rate": 7.98139774460918e-09, "logits/chosen": 3.699235439300537, "logits/rejected": 3.9964439868927, "logps/chosen": -917.5882568359375, "logps/rejected": -1362.103759765625, "loss": 0.1178, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -7.213653564453125, "rewards/margins": 4.421055793762207, "rewards/rejected": -11.634710311889648, "step": 4830 }, { "epoch": 2.79, "grad_norm": 41.9229722666574, "learning_rate": 7.566802796471594e-09, "logits/chosen": 3.7365927696228027, "logits/rejected": 3.699310779571533, "logps/chosen": -879.0908203125, "logps/rejected": -1244.96875, "loss": 0.1806, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -6.7581915855407715, "rewards/margins": 3.804527997970581, "rewards/rejected": -10.562719345092773, "step": 4840 }, { "epoch": 2.79, "grad_norm": 40.29408530997682, "learning_rate": 7.163099664468292e-09, "logits/chosen": 3.2732269763946533, "logits/rejected": 3.862213611602783, "logps/chosen": -868.6654052734375, "logps/rejected": -1314.559326171875, "loss": 0.1593, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.6967620849609375, "rewards/margins": 4.43014669418335, "rewards/rejected": -11.126908302307129, "step": 4850 }, { "epoch": 2.8, "grad_norm": 56.71587257040045, "learning_rate": 6.7703064858029565e-09, "logits/chosen": 3.834251880645752, "logits/rejected": 3.468278408050537, "logps/chosen": -831.2278442382812, "logps/rejected": -1259.8314208984375, "loss": 0.1584, "rewards/accuracies": 0.90625, "rewards/chosen": -6.560222625732422, "rewards/margins": 4.249187469482422, "rewards/rejected": -10.809409141540527, "step": 4860 }, { "epoch": 2.8, "grad_norm": 32.73197077619573, "learning_rate": 6.388440907526965e-09, "logits/chosen": 3.4287161827087402, "logits/rejected": 3.4881961345672607, "logps/chosen": -904.5638427734375, "logps/rejected": -1347.948486328125, "loss": 0.1462, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -6.7811384201049805, "rewards/margins": 4.708926200866699, "rewards/rejected": -11.49006462097168, "step": 4870 }, { "epoch": 2.81, "grad_norm": 30.23152895662283, "learning_rate": 6.017520085746436e-09, "logits/chosen": 3.6235904693603516, "logits/rejected": 3.8630385398864746, "logps/chosen": -866.9783935546875, "logps/rejected": -1323.78515625, "loss": 0.1357, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.679640293121338, "rewards/margins": 4.525690078735352, "rewards/rejected": -11.205328941345215, "step": 4880 }, { "epoch": 2.82, "grad_norm": 29.312021550372048, "learning_rate": 5.657560684851598e-09, "logits/chosen": 3.491269588470459, "logits/rejected": 3.815241575241089, "logps/chosen": -891.5387573242188, "logps/rejected": -1334.976318359375, "loss": 0.1623, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.029611110687256, "rewards/margins": 4.442296028137207, "rewards/rejected": -11.471906661987305, "step": 4890 }, { "epoch": 2.82, "grad_norm": 50.594900786915225, "learning_rate": 5.308578876767944e-09, "logits/chosen": 3.602823257446289, "logits/rejected": 3.1024231910705566, "logps/chosen": -898.5730590820312, "logps/rejected": -1401.591796875, "loss": 0.1505, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.869839668273926, "rewards/margins": 4.945147514343262, "rewards/rejected": -11.814986228942871, "step": 4900 }, { "epoch": 2.82, "eval_logits/chosen": 4.051290512084961, "eval_logits/rejected": 3.5728461742401123, "eval_logps/chosen": -964.4603881835938, "eval_logps/rejected": -1122.0504150390625, "eval_loss": 0.6431969404220581, "eval_rewards/accuracies": 0.715753436088562, "eval_rewards/chosen": -8.017036437988281, "eval_rewards/margins": 1.5002012252807617, "eval_rewards/rejected": -9.517237663269043, "eval_runtime": 534.1179, "eval_samples_per_second": 13.106, "eval_steps_per_second": 0.41, "step": 4900 }, { "epoch": 2.83, "grad_norm": 31.755847820825434, "learning_rate": 4.9705903402297856e-09, "logits/chosen": 3.2635657787323, "logits/rejected": 3.7057242393493652, "logps/chosen": -895.36669921875, "logps/rejected": -1369.053955078125, "loss": 0.1365, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.844570159912109, "rewards/margins": 4.6734843254089355, "rewards/rejected": -11.518054008483887, "step": 4910 }, { "epoch": 2.83, "grad_norm": 34.296333734376184, "learning_rate": 4.643610260075842e-09, "logits/chosen": 3.2745940685272217, "logits/rejected": 4.042975425720215, "logps/chosen": -878.0281982421875, "logps/rejected": -1299.2745361328125, "loss": 0.1455, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -6.805761814117432, "rewards/margins": 4.322918891906738, "rewards/rejected": -11.128680229187012, "step": 4920 }, { "epoch": 2.84, "grad_norm": 26.01322769518254, "learning_rate": 4.32765332656701e-09, "logits/chosen": 3.712836503982544, "logits/rejected": 3.8804562091827393, "logps/chosen": -898.783203125, "logps/rejected": -1293.727294921875, "loss": 0.1409, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -7.055450439453125, "rewards/margins": 3.921583890914917, "rewards/rejected": -10.977034568786621, "step": 4930 }, { "epoch": 2.84, "grad_norm": 28.67393633239166, "learning_rate": 4.022733734726308e-09, "logits/chosen": 3.169675588607788, "logits/rejected": 4.061818599700928, "logps/chosen": -892.6276245117188, "logps/rejected": -1322.0609130859375, "loss": 0.1462, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.837277889251709, "rewards/margins": 4.396964073181152, "rewards/rejected": -11.234241485595703, "step": 4940 }, { "epoch": 2.85, "grad_norm": 37.662916053006875, "learning_rate": 3.7288651837012745e-09, "logits/chosen": 3.986090898513794, "logits/rejected": 3.6710445880889893, "logps/chosen": -874.7596435546875, "logps/rejected": -1287.571533203125, "loss": 0.1647, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.867257118225098, "rewards/margins": 4.1071248054504395, "rewards/rejected": -10.974382400512695, "step": 4950 }, { "epoch": 2.86, "grad_norm": 63.16362694549441, "learning_rate": 3.4460608761483768e-09, "logits/chosen": 3.63019061088562, "logits/rejected": 3.9077401161193848, "logps/chosen": -925.8382568359375, "logps/rejected": -1332.7681884765625, "loss": 0.134, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -7.061820983886719, "rewards/margins": 4.329748153686523, "rewards/rejected": -11.391569137573242, "step": 4960 }, { "epoch": 2.86, "grad_norm": 38.30236134938325, "learning_rate": 3.1743335176399876e-09, "logits/chosen": 3.7773661613464355, "logits/rejected": 3.586193799972534, "logps/chosen": -953.4528198242188, "logps/rejected": -1409.2462158203125, "loss": 0.1276, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -7.398825645446777, "rewards/margins": 4.613528251647949, "rewards/rejected": -12.012353897094727, "step": 4970 }, { "epoch": 2.87, "grad_norm": 39.46985346189525, "learning_rate": 2.9136953160933953e-09, "logits/chosen": 3.5676701068878174, "logits/rejected": 3.565430164337158, "logps/chosen": -922.1800537109375, "logps/rejected": -1357.783935546875, "loss": 0.1216, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -7.1007399559021, "rewards/margins": 4.476837635040283, "rewards/rejected": -11.5775785446167, "step": 4980 }, { "epoch": 2.87, "grad_norm": 50.12908844515136, "learning_rate": 2.664157981222437e-09, "logits/chosen": 3.913939952850342, "logits/rejected": 3.622389316558838, "logps/chosen": -857.6802978515625, "logps/rejected": -1237.967041015625, "loss": 0.1712, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -6.7367095947265625, "rewards/margins": 3.8834033012390137, "rewards/rejected": -10.62011432647705, "step": 4990 }, { "epoch": 2.88, "grad_norm": 21.67878253523554, "learning_rate": 2.4257327240114754e-09, "logits/chosen": 3.4103477001190186, "logits/rejected": 3.3824386596679688, "logps/chosen": -917.9265747070312, "logps/rejected": -1345.055908203125, "loss": 0.1321, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -7.031444549560547, "rewards/margins": 4.156983375549316, "rewards/rejected": -11.188427925109863, "step": 5000 }, { "epoch": 2.88, "eval_logits/chosen": 4.037291526794434, "eval_logits/rejected": 3.561065912246704, "eval_logps/chosen": -965.1029663085938, "eval_logps/rejected": -1123.42626953125, "eval_loss": 0.6443235278129578, "eval_rewards/accuracies": 0.7123287916183472, "eval_rewards/chosen": -8.023462295532227, "eval_rewards/margins": 1.5075329542160034, "eval_rewards/rejected": -9.53099536895752, "eval_runtime": 523.545, "eval_samples_per_second": 13.37, "eval_steps_per_second": 0.418, "step": 5000 }, { "epoch": 2.89, "grad_norm": 39.920973718942655, "learning_rate": 2.198430256211553e-09, "logits/chosen": 3.6530303955078125, "logits/rejected": 3.886491060256958, "logps/chosen": -897.0211791992188, "logps/rejected": -1283.4132080078125, "loss": 0.1645, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -7.023435115814209, "rewards/margins": 3.8803188800811768, "rewards/rejected": -10.903754234313965, "step": 5010 }, { "epoch": 2.89, "grad_norm": 21.943495835783825, "learning_rate": 1.9822607898593025e-09, "logits/chosen": 3.393162250518799, "logits/rejected": 4.065022945404053, "logps/chosen": -844.3812255859375, "logps/rejected": -1254.89111328125, "loss": 0.1304, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -6.477970123291016, "rewards/margins": 4.242525577545166, "rewards/rejected": -10.720495223999023, "step": 5020 }, { "epoch": 2.9, "grad_norm": 34.39663264473806, "learning_rate": 1.777234036818065e-09, "logits/chosen": 3.7023253440856934, "logits/rejected": 3.416015625, "logps/chosen": -894.38232421875, "logps/rejected": -1303.0245361328125, "loss": 0.1519, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.887213230133057, "rewards/margins": 4.113541603088379, "rewards/rejected": -11.000754356384277, "step": 5030 }, { "epoch": 2.9, "grad_norm": 24.353203018618846, "learning_rate": 1.5833592083416003e-09, "logits/chosen": 3.705573320388794, "logits/rejected": 3.664074659347534, "logps/chosen": -878.6209716796875, "logps/rejected": -1326.17626953125, "loss": 0.133, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.973219394683838, "rewards/margins": 4.37343692779541, "rewards/rejected": -11.346656799316406, "step": 5040 }, { "epoch": 2.91, "grad_norm": 33.92545566201757, "learning_rate": 1.4006450146601956e-09, "logits/chosen": 3.398350238800049, "logits/rejected": 3.6035189628601074, "logps/chosen": -894.0634765625, "logps/rejected": -1308.4305419921875, "loss": 0.1637, "rewards/accuracies": 0.9375, "rewards/chosen": -6.870625972747803, "rewards/margins": 4.277186393737793, "rewards/rejected": -11.147812843322754, "step": 5050 }, { "epoch": 2.91, "grad_norm": 41.81074524662964, "learning_rate": 1.2290996645894226e-09, "logits/chosen": 3.6675384044647217, "logits/rejected": 3.647371292114258, "logps/chosen": -937.9327392578125, "logps/rejected": -1326.6580810546875, "loss": 0.1461, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.089934349060059, "rewards/margins": 4.029538631439209, "rewards/rejected": -11.119474411010742, "step": 5060 }, { "epoch": 2.92, "grad_norm": 35.00731785169669, "learning_rate": 1.0687308651613214e-09, "logits/chosen": 3.675919771194458, "logits/rejected": 4.088980674743652, "logps/chosen": -859.2726440429688, "logps/rejected": -1264.521484375, "loss": 0.1328, "rewards/accuracies": 0.96875, "rewards/chosen": -6.6713128089904785, "rewards/margins": 4.143948554992676, "rewards/rejected": -10.815261840820312, "step": 5070 }, { "epoch": 2.93, "grad_norm": 30.620071236837248, "learning_rate": 9.195458212780948e-10, "logits/chosen": 3.6601357460021973, "logits/rejected": 3.642512083053589, "logps/chosen": -891.5439453125, "logps/rejected": -1345.772705078125, "loss": 0.1612, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.0605058670043945, "rewards/margins": 4.543371677398682, "rewards/rejected": -11.603878021240234, "step": 5080 }, { "epoch": 2.93, "grad_norm": 20.484451762821365, "learning_rate": 7.815512353884224e-10, "logits/chosen": 3.803593397140503, "logits/rejected": 2.817673444747925, "logps/chosen": -890.3509521484375, "logps/rejected": -1317.270263671875, "loss": 0.1502, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -6.808762550354004, "rewards/margins": 4.3904643058776855, "rewards/rejected": -11.199227333068848, "step": 5090 }, { "epoch": 2.94, "grad_norm": 27.693377224820647, "learning_rate": 6.547533071863676e-10, "logits/chosen": 3.6689095497131348, "logits/rejected": 4.051274299621582, "logps/chosen": -886.99072265625, "logps/rejected": -1289.313232421875, "loss": 0.1269, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.942623138427734, "rewards/margins": 4.121016979217529, "rewards/rejected": -11.063640594482422, "step": 5100 }, { "epoch": 2.94, "eval_logits/chosen": 4.047187328338623, "eval_logits/rejected": 3.5691068172454834, "eval_logps/chosen": -966.4895629882812, "eval_logps/rejected": -1124.8212890625, "eval_loss": 0.6446624994277954, "eval_rewards/accuracies": 0.7140411138534546, "eval_rewards/chosen": -8.037328720092773, "eval_rewards/margins": 1.5076179504394531, "eval_rewards/rejected": -9.544946670532227, "eval_runtime": 534.4275, "eval_samples_per_second": 13.098, "eval_steps_per_second": 0.41, "step": 5100 }, { "epoch": 2.94, "grad_norm": 31.003392859789937, "learning_rate": 5.391577333329067e-10, "logits/chosen": 3.4945876598358154, "logits/rejected": 3.6774909496307373, "logps/chosen": -841.7384643554688, "logps/rejected": -1333.0516357421875, "loss": 0.1404, "rewards/accuracies": 0.9375, "rewards/chosen": -6.523072242736816, "rewards/margins": 4.783470630645752, "rewards/rejected": -11.30654239654541, "step": 5110 }, { "epoch": 2.95, "grad_norm": 34.12405563447399, "learning_rate": 4.3476970719982797e-10, "logits/chosen": 3.725111484527588, "logits/rejected": 3.6920580863952637, "logps/chosen": -894.1261596679688, "logps/rejected": -1314.723388671875, "loss": 0.1567, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -6.998967170715332, "rewards/margins": 4.328560829162598, "rewards/rejected": -11.327528953552246, "step": 5120 }, { "epoch": 2.95, "grad_norm": 38.2825259548916, "learning_rate": 3.4159391863655685e-10, "logits/chosen": 4.108199119567871, "logits/rejected": 4.227601051330566, "logps/chosen": -848.1701049804688, "logps/rejected": -1304.1600341796875, "loss": 0.1672, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -6.720125675201416, "rewards/margins": 4.469071865081787, "rewards/rejected": -11.189196586608887, "step": 5130 }, { "epoch": 2.96, "grad_norm": 29.70933228747571, "learning_rate": 2.5963455375938023e-10, "logits/chosen": 3.7547576427459717, "logits/rejected": 3.758855104446411, "logps/chosen": -873.04052734375, "logps/rejected": -1299.674072265625, "loss": 0.1208, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.89468240737915, "rewards/margins": 4.2812042236328125, "rewards/rejected": -11.175884246826172, "step": 5140 }, { "epoch": 2.97, "grad_norm": 38.94650501718253, "learning_rate": 1.8889529476337485e-10, "logits/chosen": 3.6312007904052734, "logits/rejected": 3.7288143634796143, "logps/chosen": -862.5938720703125, "logps/rejected": -1277.8011474609375, "loss": 0.1513, "rewards/accuracies": 0.9375, "rewards/chosen": -6.6468963623046875, "rewards/margins": 4.190321445465088, "rewards/rejected": -10.83721923828125, "step": 5150 }, { "epoch": 2.97, "grad_norm": 29.023276381718574, "learning_rate": 1.293793197570947e-10, "logits/chosen": 3.3328635692596436, "logits/rejected": 3.5534427165985107, "logps/chosen": -943.3983154296875, "logps/rejected": -1369.872802734375, "loss": 0.1359, "rewards/accuracies": 0.9375, "rewards/chosen": -7.060005187988281, "rewards/margins": 4.446120262145996, "rewards/rejected": -11.506126403808594, "step": 5160 }, { "epoch": 2.98, "grad_norm": 36.64168368172609, "learning_rate": 8.108930261960245e-11, "logits/chosen": 3.7104103565216064, "logits/rejected": 3.6520133018493652, "logps/chosen": -863.7526245117188, "logps/rejected": -1277.0634765625, "loss": 0.1394, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.7254204750061035, "rewards/margins": 4.092425346374512, "rewards/rejected": -10.817845344543457, "step": 5170 }, { "epoch": 2.98, "grad_norm": 29.578096558304168, "learning_rate": 4.402741288045408e-11, "logits/chosen": 3.2252113819122314, "logits/rejected": 3.7862179279327393, "logps/chosen": -870.3963623046875, "logps/rejected": -1319.7581787109375, "loss": 0.1469, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.497006893157959, "rewards/margins": 4.707976341247559, "rewards/rejected": -11.204983711242676, "step": 5180 }, { "epoch": 2.99, "grad_norm": 32.81547741449026, "learning_rate": 1.8195315622193675e-11, "logits/chosen": 3.36704683303833, "logits/rejected": 4.082869052886963, "logps/chosen": -902.6834716796875, "logps/rejected": -1286.15771484375, "loss": 0.1886, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.9593400955200195, "rewards/margins": 3.9016661643981934, "rewards/rejected": -10.861005783081055, "step": 5190 }, { "epoch": 2.99, "grad_norm": 34.251003038296346, "learning_rate": 3.5941714056075824e-12, "logits/chosen": 3.4102511405944824, "logits/rejected": 4.034352779388428, "logps/chosen": -893.3396606445312, "logps/rejected": -1350.4420166015625, "loss": 0.1417, "rewards/accuracies": 0.9375, "rewards/chosen": -6.748190879821777, "rewards/margins": 4.648064613342285, "rewards/rejected": -11.396255493164062, "step": 5200 }, { "epoch": 2.99, "eval_logits/chosen": 4.039480686187744, "eval_logits/rejected": 3.5627248287200928, "eval_logps/chosen": -965.5220947265625, "eval_logps/rejected": -1123.870361328125, "eval_loss": 0.6445793509483337, "eval_rewards/accuracies": 0.7163242101669312, "eval_rewards/chosen": -8.027654647827148, "eval_rewards/margins": 1.5077828168869019, "eval_rewards/rejected": -9.53543758392334, "eval_runtime": 523.0507, "eval_samples_per_second": 13.383, "eval_steps_per_second": 0.419, "step": 5200 }, { "epoch": 3.0, "step": 5208, "total_flos": 0.0, "train_loss": 0.3444365044389086, "train_runtime": 113135.1128, "train_samples_per_second": 2.947, "train_steps_per_second": 0.046 } ], "logging_steps": 10, "max_steps": 5208, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }