{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990049751243781, "eval_steps": 100, "global_step": 753, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013266998341625207, "grad_norm": 40.25, "learning_rate": 6.578947368421052e-09, "logits/chosen": -1.2802138328552246, "logits/rejected": -1.3739961385726929, "logps/chosen": -584.777587890625, "logps/rejected": -533.882080078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.013266998341625208, "grad_norm": 54.5, "learning_rate": 6.578947368421052e-08, "logits/chosen": -1.1525533199310303, "logits/rejected": -1.1556764841079712, "logps/chosen": -577.8804321289062, "logps/rejected": -498.16986083984375, "loss": 0.6903, "rewards/accuracies": 0.4930555522441864, "rewards/chosen": 0.00347831379622221, "rewards/margins": 0.00828113965690136, "rewards/rejected": -0.004802825395017862, "step": 10 }, { "epoch": 0.026533996683250415, "grad_norm": 44.75, "learning_rate": 1.3157894736842104e-07, "logits/chosen": -1.1788235902786255, "logits/rejected": -1.2242963314056396, "logps/chosen": -611.204833984375, "logps/rejected": -543.866455078125, "loss": 0.6928, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.0023336124140769243, "rewards/margins": 0.003125081304460764, "rewards/rejected": -0.0007914667949080467, "step": 20 }, { "epoch": 0.03980099502487562, "grad_norm": 36.25, "learning_rate": 1.9736842105263157e-07, "logits/chosen": -1.158151388168335, "logits/rejected": -1.1601974964141846, "logps/chosen": -633.5345458984375, "logps/rejected": -536.1189575195312, "loss": 0.6905, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.013027493841946125, "rewards/margins": 0.015905674546957016, "rewards/rejected": -0.0028781811706721783, "step": 30 }, { "epoch": 0.05306799336650083, "grad_norm": 42.5, "learning_rate": 2.631578947368421e-07, "logits/chosen": -1.19637930393219, "logits/rejected": -1.22651207447052, "logps/chosen": -540.5403442382812, "logps/rejected": -552.4425048828125, "loss": 0.687, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.0009165151277557015, "rewards/margins": 0.011455372907221317, "rewards/rejected": -0.01053885743021965, "step": 40 }, { "epoch": 0.06633499170812604, "grad_norm": 48.25, "learning_rate": 3.2894736842105264e-07, "logits/chosen": -1.198271632194519, "logits/rejected": -1.2518165111541748, "logps/chosen": -579.4686279296875, "logps/rejected": -571.4375, "loss": 0.6818, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.005964647978544235, "rewards/margins": 0.019765758886933327, "rewards/rejected": -0.013801109977066517, "step": 50 }, { "epoch": 0.07960199004975124, "grad_norm": 39.25, "learning_rate": 3.9473684210526315e-07, "logits/chosen": -1.1785616874694824, "logits/rejected": -1.226552963256836, "logps/chosen": -595.6961059570312, "logps/rejected": -572.9510498046875, "loss": 0.6691, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.022173848003149033, "rewards/margins": 0.06000928208231926, "rewards/rejected": -0.03783543407917023, "step": 60 }, { "epoch": 0.09286898839137644, "grad_norm": 44.5, "learning_rate": 4.6052631578947365e-07, "logits/chosen": -1.1982749700546265, "logits/rejected": -1.236537218093872, "logps/chosen": -634.6461181640625, "logps/rejected": -673.8761596679688, "loss": 0.6543, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.02825814113020897, "rewards/margins": 0.08534505218267441, "rewards/rejected": -0.05708691477775574, "step": 70 }, { "epoch": 0.10613598673300166, "grad_norm": 36.75, "learning_rate": 4.999569334646955e-07, "logits/chosen": -1.0726072788238525, "logits/rejected": -1.1171576976776123, "logps/chosen": -614.9038696289062, "logps/rejected": -572.2459716796875, "loss": 0.638, "rewards/accuracies": 0.875, "rewards/chosen": 0.04197516664862633, "rewards/margins": 0.1271333396434784, "rewards/rejected": -0.08515818417072296, "step": 80 }, { "epoch": 0.11940298507462686, "grad_norm": 38.25, "learning_rate": 4.994726053293702e-07, "logits/chosen": -1.1955012083053589, "logits/rejected": -1.2350232601165771, "logps/chosen": -590.61376953125, "logps/rejected": -564.5113525390625, "loss": 0.6237, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.031416941434144974, "rewards/margins": 0.15016348659992218, "rewards/rejected": -0.1187465563416481, "step": 90 }, { "epoch": 0.13266998341625208, "grad_norm": 39.0, "learning_rate": 4.984511621268102e-07, "logits/chosen": -1.162690281867981, "logits/rejected": -1.1824209690093994, "logps/chosen": -573.8049926757812, "logps/rejected": -528.6422119140625, "loss": 0.5984, "rewards/accuracies": 0.90625, "rewards/chosen": 0.038635846227407455, "rewards/margins": 0.2009139508008957, "rewards/rejected": -0.16227811574935913, "step": 100 }, { "epoch": 0.13266998341625208, "eval_logits/chosen": -1.139477252960205, "eval_logits/rejected": -1.160577416419983, "eval_logps/chosen": -601.4292602539062, "eval_logps/rejected": -539.8974609375, "eval_loss": 0.5903807878494263, "eval_rewards/accuracies": 0.9029850959777832, "eval_rewards/chosen": 0.05485348403453827, "eval_rewards/margins": 0.22831708192825317, "eval_rewards/rejected": -0.1734635829925537, "eval_runtime": 685.2004, "eval_samples_per_second": 7.821, "eval_steps_per_second": 0.489, "step": 100 }, { "epoch": 0.14593698175787728, "grad_norm": 33.5, "learning_rate": 4.968948030264742e-07, "logits/chosen": -1.1363273859024048, "logits/rejected": -1.1610157489776611, "logps/chosen": -592.44482421875, "logps/rejected": -573.2866821289062, "loss": 0.582, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.07824783772230148, "rewards/margins": 0.2655600905418396, "rewards/rejected": -0.1873122602701187, "step": 110 }, { "epoch": 0.15920398009950248, "grad_norm": 33.25, "learning_rate": 4.948068788729238e-07, "logits/chosen": -1.1630527973175049, "logits/rejected": -1.202096939086914, "logps/chosen": -579.3135375976562, "logps/rejected": -578.4791259765625, "loss": 0.5691, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.07454140484333038, "rewards/margins": 0.28431838750839233, "rewards/rejected": -0.20977696776390076, "step": 120 }, { "epoch": 0.1724709784411277, "grad_norm": 35.25, "learning_rate": 4.921918849714475e-07, "logits/chosen": -1.185011625289917, "logits/rejected": -1.1927728652954102, "logps/chosen": -621.1232299804688, "logps/rejected": -598.525634765625, "loss": 0.5436, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.07992725074291229, "rewards/margins": 0.3480888307094574, "rewards/rejected": -0.2681615948677063, "step": 130 }, { "epoch": 0.1857379767827529, "grad_norm": 31.375, "learning_rate": 4.890554514096591e-07, "logits/chosen": -1.1601734161376953, "logits/rejected": -1.1828594207763672, "logps/chosen": -589.795654296875, "logps/rejected": -544.245849609375, "loss": 0.5303, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.06376471370458603, "rewards/margins": 0.3611634075641632, "rewards/rejected": -0.29739871621131897, "step": 140 }, { "epoch": 0.19900497512437812, "grad_norm": 28.25, "learning_rate": 4.854043309359063e-07, "logits/chosen": -1.2025436162948608, "logits/rejected": -1.2480312585830688, "logps/chosen": -550.0576782226562, "logps/rejected": -483.212646484375, "loss": 0.5173, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.08968071639537811, "rewards/margins": 0.39301761984825134, "rewards/rejected": -0.30333688855171204, "step": 150 }, { "epoch": 0.21227197346600332, "grad_norm": 28.125, "learning_rate": 4.812463844205884e-07, "logits/chosen": -1.1907384395599365, "logits/rejected": -1.218056321144104, "logps/chosen": -612.6536865234375, "logps/rejected": -552.7808837890625, "loss": 0.5032, "rewards/accuracies": 0.9375, "rewards/chosen": 0.12153647094964981, "rewards/margins": 0.4728039801120758, "rewards/rejected": -0.3512675166130066, "step": 160 }, { "epoch": 0.22553897180762852, "grad_norm": 27.125, "learning_rate": 4.7659056393168604e-07, "logits/chosen": -1.2418904304504395, "logits/rejected": -1.3286497592926025, "logps/chosen": -532.3972778320312, "logps/rejected": -481.52459716796875, "loss": 0.5031, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.10282758623361588, "rewards/margins": 0.44313424825668335, "rewards/rejected": -0.3403066396713257, "step": 170 }, { "epoch": 0.23880597014925373, "grad_norm": 27.0, "learning_rate": 4.714468934609381e-07, "logits/chosen": -1.2157742977142334, "logits/rejected": -1.2327635288238525, "logps/chosen": -541.45361328125, "logps/rejected": -482.06103515625, "loss": 0.4824, "rewards/accuracies": 0.96875, "rewards/chosen": 0.1126769408583641, "rewards/margins": 0.5232059359550476, "rewards/rejected": -0.4105289876461029, "step": 180 }, { "epoch": 0.25207296849087896, "grad_norm": 23.875, "learning_rate": 4.658264473421659e-07, "logits/chosen": -1.2209118604660034, "logits/rejected": -1.2742892503738403, "logps/chosen": -600.0318603515625, "logps/rejected": -533.5223388671875, "loss": 0.4874, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.09032230079174042, "rewards/margins": 0.5253477096557617, "rewards/rejected": -0.4350253939628601, "step": 190 }, { "epoch": 0.26533996683250416, "grad_norm": 30.125, "learning_rate": 4.597413264082086e-07, "logits/chosen": -1.2209361791610718, "logits/rejected": -1.295668125152588, "logps/chosen": -573.05859375, "logps/rejected": -572.3590087890625, "loss": 0.4622, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.08147934824228287, "rewards/margins": 0.5580233931541443, "rewards/rejected": -0.4765440821647644, "step": 200 }, { "epoch": 0.26533996683250416, "eval_logits/chosen": -1.2179902791976929, "eval_logits/rejected": -1.271428108215332, "eval_logps/chosen": -600.8441162109375, "eval_logps/rejected": -543.1426391601562, "eval_loss": 0.45805710554122925, "eval_rewards/accuracies": 0.9350746273994446, "eval_rewards/chosen": 0.11337064951658249, "eval_rewards/margins": 0.6113449931144714, "eval_rewards/rejected": -0.49797430634498596, "eval_runtime": 676.1331, "eval_samples_per_second": 7.926, "eval_steps_per_second": 0.495, "step": 200 }, { "epoch": 0.27860696517412936, "grad_norm": 25.25, "learning_rate": 4.5320463193780256e-07, "logits/chosen": -1.1879713535308838, "logits/rejected": -1.234440565109253, "logps/chosen": -570.4061279296875, "logps/rejected": -540.0577392578125, "loss": 0.4485, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.1254458725452423, "rewards/margins": 0.5999458432197571, "rewards/rejected": -0.4744999408721924, "step": 210 }, { "epoch": 0.29187396351575456, "grad_norm": 22.625, "learning_rate": 4.4623043744850044e-07, "logits/chosen": -1.1740987300872803, "logits/rejected": -1.2075783014297485, "logps/chosen": -588.6426391601562, "logps/rejected": -534.1394653320312, "loss": 0.4551, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.11971279233694077, "rewards/margins": 0.642697811126709, "rewards/rejected": -0.5229849815368652, "step": 220 }, { "epoch": 0.30514096185737977, "grad_norm": 24.875, "learning_rate": 4.388337583963563e-07, "logits/chosen": -1.1734439134597778, "logits/rejected": -1.1642463207244873, "logps/chosen": -630.2869873046875, "logps/rejected": -650.8348388671875, "loss": 0.4326, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.11086304485797882, "rewards/margins": 0.7283642888069153, "rewards/rejected": -0.6175012588500977, "step": 230 }, { "epoch": 0.31840796019900497, "grad_norm": 28.375, "learning_rate": 4.31030519847616e-07, "logits/chosen": -1.170459508895874, "logits/rejected": -1.1728956699371338, "logps/chosen": -568.0635986328125, "logps/rejected": -495.91094970703125, "loss": 0.4293, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.09963791817426682, "rewards/margins": 0.654728889465332, "rewards/rejected": -0.5550910234451294, "step": 240 }, { "epoch": 0.33167495854063017, "grad_norm": 22.875, "learning_rate": 4.2283752219201464e-07, "logits/chosen": -1.0885179042816162, "logits/rejected": -1.133748173713684, "logps/chosen": -556.9310913085938, "logps/rejected": -509.1935119628906, "loss": 0.4244, "rewards/accuracies": 0.9375, "rewards/chosen": 0.10411699116230011, "rewards/margins": 0.6922882795333862, "rewards/rejected": -0.5881712436676025, "step": 250 }, { "epoch": 0.3449419568822554, "grad_norm": 24.75, "learning_rate": 4.1427240497150047e-07, "logits/chosen": -1.132869839668274, "logits/rejected": -1.1097866296768188, "logps/chosen": -604.4608154296875, "logps/rejected": -561.1189575195312, "loss": 0.4112, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.10400505363941193, "rewards/margins": 0.708010196685791, "rewards/rejected": -0.6040050983428955, "step": 260 }, { "epoch": 0.3582089552238806, "grad_norm": 27.125, "learning_rate": 4.053536089022623e-07, "logits/chosen": -1.1613821983337402, "logits/rejected": -1.195441722869873, "logps/chosen": -502.41607666015625, "logps/rejected": -479.5301208496094, "loss": 0.4202, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.11941847950220108, "rewards/margins": 0.7337124347686768, "rewards/rejected": -0.6142939329147339, "step": 270 }, { "epoch": 0.3714759535655058, "grad_norm": 27.625, "learning_rate": 3.9610033617182715e-07, "logits/chosen": -1.1407119035720825, "logits/rejected": -1.1306835412979126, "logps/chosen": -632.7389526367188, "logps/rejected": -621.2080078125, "loss": 0.4085, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.15029093623161316, "rewards/margins": 0.8855623006820679, "rewards/rejected": -0.7352713346481323, "step": 280 }, { "epoch": 0.38474295190713104, "grad_norm": 20.75, "learning_rate": 3.865325090967081e-07, "logits/chosen": -1.1717865467071533, "logits/rejected": -1.1872893571853638, "logps/chosen": -561.0062255859375, "logps/rejected": -548.43603515625, "loss": 0.4078, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.11881165206432343, "rewards/margins": 0.7872866988182068, "rewards/rejected": -0.6684750318527222, "step": 290 }, { "epoch": 0.39800995024875624, "grad_norm": 23.25, "learning_rate": 3.7667072722961357e-07, "logits/chosen": -1.1743600368499756, "logits/rejected": -1.1946338415145874, "logps/chosen": -590.4188232421875, "logps/rejected": -515.8514404296875, "loss": 0.3934, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.14561176300048828, "rewards/margins": 0.7691020369529724, "rewards/rejected": -0.6234902739524841, "step": 300 }, { "epoch": 0.39800995024875624, "eval_logits/chosen": -1.0754693746566772, "eval_logits/rejected": -1.0528287887573242, "eval_logps/chosen": -600.71435546875, "eval_logps/rejected": -545.3746948242188, "eval_loss": 0.39592820405960083, "eval_rewards/accuracies": 0.9365671873092651, "eval_rewards/chosen": 0.12633956968784332, "eval_rewards/margins": 0.8475195169448853, "eval_rewards/rejected": -0.7211799621582031, "eval_runtime": 694.2423, "eval_samples_per_second": 7.719, "eval_steps_per_second": 0.483, "step": 300 }, { "epoch": 0.41127694859038144, "grad_norm": 22.375, "learning_rate": 3.6653622300856457e-07, "logits/chosen": -1.154953956604004, "logits/rejected": -1.1766210794448853, "logps/chosen": -573.0366821289062, "logps/rejected": -538.422119140625, "loss": 0.3901, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.10485813766717911, "rewards/margins": 0.8158906698226929, "rewards/rejected": -0.711032509803772, "step": 310 }, { "epoch": 0.42454394693200664, "grad_norm": 22.0, "learning_rate": 3.5615081604340903e-07, "logits/chosen": -1.196800708770752, "logits/rejected": -1.248241662979126, "logps/chosen": -629.1947631835938, "logps/rejected": -599.6900634765625, "loss": 0.3898, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.13255654275417328, "rewards/margins": 0.8788650631904602, "rewards/rejected": -0.7463085055351257, "step": 320 }, { "epoch": 0.43781094527363185, "grad_norm": 22.125, "learning_rate": 3.455368661381543e-07, "logits/chosen": -1.1678781509399414, "logits/rejected": -1.178554892539978, "logps/chosen": -493.3885803222656, "logps/rejected": -474.68402099609375, "loss": 0.3899, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.07870842516422272, "rewards/margins": 0.7927115559577942, "rewards/rejected": -0.7140030860900879, "step": 330 }, { "epoch": 0.45107794361525705, "grad_norm": 22.625, "learning_rate": 3.347172251502598e-07, "logits/chosen": -1.1612517833709717, "logits/rejected": -1.1822433471679688, "logps/chosen": -608.53271484375, "logps/rejected": -532.9712524414062, "loss": 0.3763, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.12583430111408234, "rewards/margins": 0.9212196469306946, "rewards/rejected": -0.7953853607177734, "step": 340 }, { "epoch": 0.46434494195688225, "grad_norm": 21.25, "learning_rate": 3.2371518779053744e-07, "logits/chosen": -1.101665735244751, "logits/rejected": -1.0791598558425903, "logps/chosen": -651.3994750976562, "logps/rejected": -633.3742065429688, "loss": 0.3678, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.12759700417518616, "rewards/margins": 0.9944127798080444, "rewards/rejected": -0.8668158650398254, "step": 350 }, { "epoch": 0.47761194029850745, "grad_norm": 24.75, "learning_rate": 3.1255444146958844e-07, "logits/chosen": -1.1323697566986084, "logits/rejected": -1.118276596069336, "logps/chosen": -565.9261474609375, "logps/rejected": -552.2655639648438, "loss": 0.3684, "rewards/accuracies": 0.90625, "rewards/chosen": 0.06775705516338348, "rewards/margins": 0.8904596567153931, "rewards/rejected": -0.8227025270462036, "step": 360 }, { "epoch": 0.49087893864013266, "grad_norm": 20.875, "learning_rate": 3.012590152987561e-07, "logits/chosen": -1.1285905838012695, "logits/rejected": -1.1064956188201904, "logps/chosen": -601.677734375, "logps/rejected": -547.2380981445312, "loss": 0.3663, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.12407927215099335, "rewards/margins": 0.9496960639953613, "rewards/rejected": -0.825616717338562, "step": 370 }, { "epoch": 0.5041459369817579, "grad_norm": 21.125, "learning_rate": 2.8985322835539626e-07, "logits/chosen": -1.0900777578353882, "logits/rejected": -1.0679134130477905, "logps/chosen": -622.616455078125, "logps/rejected": -570.1490478515625, "loss": 0.3705, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.15115661919116974, "rewards/margins": 0.9384227991104126, "rewards/rejected": -0.7872661352157593, "step": 380 }, { "epoch": 0.5174129353233831, "grad_norm": 21.375, "learning_rate": 2.7836163732385063e-07, "logits/chosen": -1.146226167678833, "logits/rejected": -1.131203293800354, "logps/chosen": -637.7056274414062, "logps/rejected": -580.0550537109375, "loss": 0.3568, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.09901280701160431, "rewards/margins": 0.9571696519851685, "rewards/rejected": -0.8581568598747253, "step": 390 }, { "epoch": 0.5306799336650083, "grad_norm": 21.125, "learning_rate": 2.6680898362485124e-07, "logits/chosen": -1.0712168216705322, "logits/rejected": -1.0648881196975708, "logps/chosen": -538.2913208007812, "logps/rejected": -526.31689453125, "loss": 0.3629, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.10081170499324799, "rewards/margins": 0.889560341835022, "rewards/rejected": -0.788748562335968, "step": 400 }, { "epoch": 0.5306799336650083, "eval_logits/chosen": -1.1154277324676514, "eval_logits/rejected": -1.110862135887146, "eval_logps/chosen": -600.8080444335938, "eval_logps/rejected": -546.7705078125, "eval_loss": 0.3673515021800995, "eval_rewards/accuracies": 0.9380596876144409, "eval_rewards/chosen": 0.11697468906641006, "eval_rewards/margins": 0.9777337312698364, "eval_rewards/rejected": -0.8607590198516846, "eval_runtime": 712.8834, "eval_samples_per_second": 7.517, "eval_steps_per_second": 0.47, "step": 400 }, { "epoch": 0.5439469320066335, "grad_norm": 19.625, "learning_rate": 2.5522014014718697e-07, "logits/chosen": -1.0688056945800781, "logits/rejected": -1.0452687740325928, "logps/chosen": -548.5653076171875, "logps/rejected": -500.2899475097656, "loss": 0.3686, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09097462147474289, "rewards/margins": 0.8869258761405945, "rewards/rejected": -0.7959513664245605, "step": 410 }, { "epoch": 0.5572139303482587, "grad_norm": 20.75, "learning_rate": 2.436200576963198e-07, "logits/chosen": -1.1284773349761963, "logits/rejected": -1.0750479698181152, "logps/chosen": -570.462890625, "logps/rejected": -484.501708984375, "loss": 0.3638, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.1417674571275711, "rewards/margins": 0.9661204218864441, "rewards/rejected": -0.8243529200553894, "step": 420 }, { "epoch": 0.5704809286898839, "grad_norm": 26.625, "learning_rate": 2.3203371127524588e-07, "logits/chosen": -1.142064094543457, "logits/rejected": -1.0941470861434937, "logps/chosen": -540.7872314453125, "logps/rejected": -460.7454528808594, "loss": 0.3751, "rewards/accuracies": 0.90625, "rewards/chosen": 0.09556931257247925, "rewards/margins": 0.9223111271858215, "rewards/rejected": -0.8267418742179871, "step": 430 }, { "epoch": 0.5837479270315091, "grad_norm": 21.75, "learning_rate": 2.2048604631325892e-07, "logits/chosen": -1.0380522012710571, "logits/rejected": -1.036592721939087, "logps/chosen": -568.5135498046875, "logps/rejected": -558.4591064453125, "loss": 0.3722, "rewards/accuracies": 0.9375, "rewards/chosen": 0.05709138512611389, "rewards/margins": 0.89503014087677, "rewards/rejected": -0.837938666343689, "step": 440 }, { "epoch": 0.5970149253731343, "grad_norm": 20.5, "learning_rate": 2.0900192495838615e-07, "logits/chosen": -1.0975573062896729, "logits/rejected": -1.0408273935317993, "logps/chosen": -531.0595703125, "logps/rejected": -478.84222412109375, "loss": 0.3591, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.07921108603477478, "rewards/margins": 0.9134254455566406, "rewards/rejected": -0.8342143893241882, "step": 450 }, { "epoch": 0.6102819237147595, "grad_norm": 18.5, "learning_rate": 1.9760607254912926e-07, "logits/chosen": -1.0876163244247437, "logits/rejected": -1.0495961904525757, "logps/chosen": -616.9581298828125, "logps/rejected": -538.6895751953125, "loss": 0.3562, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.09119832515716553, "rewards/margins": 0.9315482974052429, "rewards/rejected": -0.8403499722480774, "step": 460 }, { "epoch": 0.6235489220563848, "grad_norm": 20.75, "learning_rate": 1.8632302438075613e-07, "logits/chosen": -1.1088566780090332, "logits/rejected": -1.1191766262054443, "logps/chosen": -589.3919067382812, "logps/rejected": -567.9808349609375, "loss": 0.3629, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.1344141662120819, "rewards/margins": 1.0434377193450928, "rewards/rejected": -0.9090234637260437, "step": 470 }, { "epoch": 0.6368159203980099, "grad_norm": 18.625, "learning_rate": 1.7517707288075614e-07, "logits/chosen": -1.106209635734558, "logits/rejected": -1.1109434366226196, "logps/chosen": -565.6036987304688, "logps/rejected": -519.3636474609375, "loss": 0.3464, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09805373102426529, "rewards/margins": 0.9947841763496399, "rewards/rejected": -0.8967304229736328, "step": 480 }, { "epoch": 0.6500829187396352, "grad_norm": 20.5, "learning_rate": 1.641922153071906e-07, "logits/chosen": -1.0548484325408936, "logits/rejected": -1.0250844955444336, "logps/chosen": -573.2247924804688, "logps/rejected": -566.44970703125, "loss": 0.365, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.09521574527025223, "rewards/margins": 1.005110740661621, "rewards/rejected": -0.9098949432373047, "step": 490 }, { "epoch": 0.6633499170812603, "grad_norm": 19.125, "learning_rate": 1.5339210208254344e-07, "logits/chosen": -1.060248613357544, "logits/rejected": -1.0458314418792725, "logps/chosen": -541.8770751953125, "logps/rejected": -513.5958251953125, "loss": 0.3556, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.11755643784999847, "rewards/margins": 0.9871570467948914, "rewards/rejected": -0.8696004748344421, "step": 500 }, { "epoch": 0.6633499170812603, "eval_logits/chosen": -0.9289145469665527, "eval_logits/rejected": -0.8265557885169983, "eval_logps/chosen": -600.8418579101562, "eval_logps/rejected": -547.3089599609375, "eval_loss": 0.3561394512653351, "eval_rewards/accuracies": 0.9388059973716736, "eval_rewards/chosen": 0.11359576135873795, "eval_rewards/margins": 1.0282028913497925, "eval_rewards/rejected": -0.9146071672439575, "eval_runtime": 731.2224, "eval_samples_per_second": 7.329, "eval_steps_per_second": 0.458, "step": 500 }, { "epoch": 0.6766169154228856, "grad_norm": 19.75, "learning_rate": 1.4279998587430943e-07, "logits/chosen": -1.0720138549804688, "logits/rejected": -1.0440585613250732, "logps/chosen": -576.8155517578125, "logps/rejected": -474.7138671875, "loss": 0.3397, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.12137231975793839, "rewards/margins": 1.0447041988372803, "rewards/rejected": -0.9233318567276001, "step": 510 }, { "epoch": 0.6898839137645107, "grad_norm": 19.375, "learning_rate": 1.324386715319503e-07, "logits/chosen": -1.0745595693588257, "logits/rejected": -1.0517549514770508, "logps/chosen": -547.5264282226562, "logps/rejected": -513.98974609375, "loss": 0.3567, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.12890011072158813, "rewards/margins": 0.9725991487503052, "rewards/rejected": -0.8436989784240723, "step": 520 }, { "epoch": 0.703150912106136, "grad_norm": 20.125, "learning_rate": 1.2233046698800343e-07, "logits/chosen": -1.0820659399032593, "logits/rejected": -1.0528825521469116, "logps/chosen": -623.875, "logps/rejected": -623.9129638671875, "loss": 0.3541, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.141106516122818, "rewards/margins": 1.074299931526184, "rewards/rejected": -0.9331933856010437, "step": 530 }, { "epoch": 0.7164179104477612, "grad_norm": 20.625, "learning_rate": 1.124971352290545e-07, "logits/chosen": -1.108722448348999, "logits/rejected": -1.0873199701309204, "logps/chosen": -593.7824096679688, "logps/rejected": -559.232421875, "loss": 0.337, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.1328240931034088, "rewards/margins": 1.019162654876709, "rewards/rejected": -0.8863385915756226, "step": 540 }, { "epoch": 0.7296849087893864, "grad_norm": 21.75, "learning_rate": 1.0295984743997909e-07, "logits/chosen": -1.085311770439148, "logits/rejected": -1.0750799179077148, "logps/chosen": -602.3040161132812, "logps/rejected": -558.0685424804688, "loss": 0.3544, "rewards/accuracies": 0.9375, "rewards/chosen": 0.08248591423034668, "rewards/margins": 1.015809178352356, "rewards/rejected": -0.9333232641220093, "step": 550 }, { "epoch": 0.7429519071310116, "grad_norm": 20.375, "learning_rate": 9.37391374223355e-08, "logits/chosen": -1.1596343517303467, "logits/rejected": -1.1693814992904663, "logps/chosen": -583.8175659179688, "logps/rejected": -574.1356811523438, "loss": 0.3416, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1034403070807457, "rewards/margins": 1.0356991291046143, "rewards/rejected": -0.9322587251663208, "step": 560 }, { "epoch": 0.7562189054726368, "grad_norm": 17.875, "learning_rate": 8.485485738504488e-08, "logits/chosen": -1.1387842893600464, "logits/rejected": -1.108246922492981, "logps/chosen": -627.6776123046875, "logps/rejected": -469.2587890625, "loss": 0.3468, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.1544940024614334, "rewards/margins": 1.0551023483276367, "rewards/rejected": -0.900608241558075, "step": 570 }, { "epoch": 0.7694859038142621, "grad_norm": 19.25, "learning_rate": 7.632613520254158e-08, "logits/chosen": -1.0649652481079102, "logits/rejected": -1.064888834953308, "logps/chosen": -601.84033203125, "logps/rejected": -529.4444580078125, "loss": 0.359, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.10679004341363907, "rewards/margins": 1.0746941566467285, "rewards/rejected": -0.9679039716720581, "step": 580 }, { "epoch": 0.7827529021558872, "grad_norm": 20.75, "learning_rate": 6.817133323241755e-08, "logits/chosen": -1.1302725076675415, "logits/rejected": -1.1106232404708862, "logps/chosen": -675.71533203125, "logps/rejected": -525.1083984375, "loss": 0.3478, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.09434429556131363, "rewards/margins": 1.0678186416625977, "rewards/rejected": -0.9734743237495422, "step": 590 }, { "epoch": 0.7960199004975125, "grad_norm": 22.25, "learning_rate": 6.040800878122654e-08, "logits/chosen": -1.1192970275878906, "logits/rejected": -1.1343142986297607, "logps/chosen": -567.078125, "logps/rejected": -525.439697265625, "loss": 0.3488, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09638460725545883, "rewards/margins": 0.9892334938049316, "rewards/rejected": -0.8928488492965698, "step": 600 }, { "epoch": 0.7960199004975125, "eval_logits/chosen": -1.0876879692077637, "eval_logits/rejected": -1.067589521408081, "eval_logps/chosen": -600.8737182617188, "eval_logps/rejected": -547.4734497070312, "eval_loss": 0.35399559140205383, "eval_rewards/accuracies": 0.9410447478294373, "eval_rewards/chosen": 0.11041063815355301, "eval_rewards/margins": 1.0414601564407349, "eval_rewards/rejected": -0.9310495853424072, "eval_runtime": 680.93, "eval_samples_per_second": 7.87, "eval_steps_per_second": 0.492, "step": 600 }, { "epoch": 0.8092868988391376, "grad_norm": 20.625, "learning_rate": 5.305287630356362e-08, "logits/chosen": -1.1514161825180054, "logits/rejected": -1.1489306688308716, "logps/chosen": -603.4745483398438, "logps/rejected": -562.2152709960938, "loss": 0.3556, "rewards/accuracies": 0.9375, "rewards/chosen": 0.08816297352313995, "rewards/margins": 1.0151373147964478, "rewards/rejected": -0.9269744157791138, "step": 610 }, { "epoch": 0.8225538971807629, "grad_norm": 23.375, "learning_rate": 4.612177141580875e-08, "logits/chosen": -1.0614503622055054, "logits/rejected": -1.0462639331817627, "logps/chosen": -600.1735229492188, "logps/rejected": -563.9249877929688, "loss": 0.3555, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.1291816681623459, "rewards/margins": 1.0133174657821655, "rewards/rejected": -0.8841358423233032, "step": 620 }, { "epoch": 0.835820895522388, "grad_norm": 19.25, "learning_rate": 3.962961680200927e-08, "logits/chosen": -1.154007911682129, "logits/rejected": -1.1668691635131836, "logps/chosen": -587.5554809570312, "logps/rejected": -579.425537109375, "loss": 0.3591, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.1071285754442215, "rewards/margins": 1.01383376121521, "rewards/rejected": -0.9067050814628601, "step": 630 }, { "epoch": 0.8490878938640133, "grad_norm": 19.625, "learning_rate": 3.359039008530845e-08, "logits/chosen": -1.1280542612075806, "logits/rejected": -1.1074917316436768, "logps/chosen": -638.248779296875, "logps/rejected": -570.1997680664062, "loss": 0.3504, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.10942580550909042, "rewards/margins": 1.0872418880462646, "rewards/rejected": -0.9778162240982056, "step": 640 }, { "epoch": 0.8623548922056384, "grad_norm": 21.125, "learning_rate": 2.8017093734092474e-08, "logits/chosen": -1.0559157133102417, "logits/rejected": -0.9880287051200867, "logps/chosen": -617.8060302734375, "logps/rejected": -542.3763427734375, "loss": 0.3563, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.06925593316555023, "rewards/margins": 1.0108643770217896, "rewards/rejected": -0.9416083097457886, "step": 650 }, { "epoch": 0.8756218905472637, "grad_norm": 20.25, "learning_rate": 2.292172706764703e-08, "logits/chosen": -1.0475237369537354, "logits/rejected": -1.0124943256378174, "logps/chosen": -625.7453002929688, "logps/rejected": -627.2447509765625, "loss": 0.3567, "rewards/accuracies": 0.9375, "rewards/chosen": 0.11004464328289032, "rewards/margins": 1.0592918395996094, "rewards/rejected": -0.9492471814155579, "step": 660 }, { "epoch": 0.8888888888888888, "grad_norm": 19.875, "learning_rate": 1.8315260421596924e-08, "logits/chosen": -1.16936457157135, "logits/rejected": -1.1426036357879639, "logps/chosen": -555.6038818359375, "logps/rejected": -494.2486267089844, "loss": 0.3509, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.12268821895122528, "rewards/margins": 0.9934048652648926, "rewards/rejected": -0.8707167506217957, "step": 670 }, { "epoch": 0.9021558872305141, "grad_norm": 20.125, "learning_rate": 1.4207611528748997e-08, "logits/chosen": -1.122236967086792, "logits/rejected": -1.0928575992584229, "logps/chosen": -568.34765625, "logps/rejected": -548.4368896484375, "loss": 0.3548, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.10156118869781494, "rewards/margins": 1.01285719871521, "rewards/rejected": -0.9112960696220398, "step": 680 }, { "epoch": 0.9154228855721394, "grad_norm": 20.875, "learning_rate": 1.0607624166191958e-08, "logits/chosen": -1.102480411529541, "logits/rejected": -1.097570776939392, "logps/chosen": -670.6092529296875, "logps/rejected": -724.0338134765625, "loss": 0.3508, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.122245192527771, "rewards/margins": 1.0727375745773315, "rewards/rejected": -0.9504923820495605, "step": 690 }, { "epoch": 0.9286898839137645, "grad_norm": 20.75, "learning_rate": 7.523049114624647e-09, "logits/chosen": -1.067058801651001, "logits/rejected": -1.0042006969451904, "logps/chosen": -610.7342529296875, "logps/rejected": -569.3170776367188, "loss": 0.3563, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.13328425586223602, "rewards/margins": 1.0865224599838257, "rewards/rejected": -0.9532381296157837, "step": 700 }, { "epoch": 0.9286898839137645, "eval_logits/chosen": -0.9600119590759277, "eval_logits/rejected": -0.8735809922218323, "eval_logps/chosen": -600.8121337890625, "eval_logps/rejected": -547.42236328125, "eval_loss": 0.3540438711643219, "eval_rewards/accuracies": 0.9395522475242615, "eval_rewards/chosen": 0.11656844615936279, "eval_rewards/margins": 1.0425076484680176, "eval_rewards/rejected": -0.92593914270401, "eval_runtime": 733.9957, "eval_samples_per_second": 7.301, "eval_steps_per_second": 0.456, "step": 700 }, { "epoch": 0.9419568822553898, "grad_norm": 20.25, "learning_rate": 4.960527470908277e-09, "logits/chosen": -0.9644180536270142, "logits/rejected": -0.860200047492981, "logps/chosen": -622.1219482421875, "logps/rejected": -567.1380615234375, "loss": 0.3555, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.11110372841358185, "rewards/margins": 1.0618221759796143, "rewards/rejected": -0.9507185220718384, "step": 710 }, { "epoch": 0.9552238805970149, "grad_norm": 19.75, "learning_rate": 2.925576349770337e-09, "logits/chosen": -0.9986553192138672, "logits/rejected": -0.8984715342521667, "logps/chosen": -605.7318725585938, "logps/rejected": -542.7632446289062, "loss": 0.359, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.10133838653564453, "rewards/margins": 0.9883183240890503, "rewards/rejected": -0.886979877948761, "step": 720 }, { "epoch": 0.9684908789386402, "grad_norm": 21.25, "learning_rate": 1.4225770054443197e-09, "logits/chosen": -0.9282068014144897, "logits/rejected": -0.8550642132759094, "logps/chosen": -571.7738037109375, "logps/rejected": -500.0634765625, "loss": 0.3571, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.10189428180456161, "rewards/margins": 1.0126664638519287, "rewards/rejected": -0.9107722043991089, "step": 730 }, { "epoch": 0.9817578772802653, "grad_norm": 19.625, "learning_rate": 4.547653988198619e-10, "logits/chosen": -0.9236332774162292, "logits/rejected": -0.8542205095291138, "logps/chosen": -632.85546875, "logps/rejected": -597.6421508789062, "loss": 0.3493, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.147782564163208, "rewards/margins": 1.067484736442566, "rewards/rejected": -0.9197022318840027, "step": 740 }, { "epoch": 0.9950248756218906, "grad_norm": 20.5, "learning_rate": 2.4225230411789588e-11, "logits/chosen": -0.9963258504867554, "logits/rejected": -0.8823927044868469, "logps/chosen": -617.5396728515625, "logps/rejected": -596.9856567382812, "loss": 0.36, "rewards/accuracies": 0.90625, "rewards/chosen": 0.14549708366394043, "rewards/margins": 1.0541255474090576, "rewards/rejected": -0.9086285829544067, "step": 750 }, { "epoch": 0.9990049751243781, "step": 753, "total_flos": 0.0, "train_loss": 0.4291752041731856, "train_runtime": 22849.6211, "train_samples_per_second": 2.111, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 753, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }