{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999214865218529, "eval_steps": 100, "global_step": 5730, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005234231876472127, "grad_norm": 0.5165453173260315, "kl": 0.0361328125, "learning_rate": 8.726003490401396e-07, "logits/chosen": -1284925056.0, "logits/rejected": -1155530752.0, "logps/chosen": -305.5562310030395, "logps/rejected": -284.19292604501607, "loss": 0.4999, "rewards/chosen": 0.0001331085854388298, "rewards/margins": 4.391090532427996e-05, "rewards/rejected": 8.919768011454984e-05, "step": 10 }, { "epoch": 0.010468463752944255, "grad_norm": 0.507696597667233, "kl": 0.13066406548023224, "learning_rate": 1.7452006980802793e-06, "logits/chosen": -1350985344.0, "logits/rejected": -1120508288.0, "logps/chosen": -327.7463976945245, "logps/rejected": -297.61092150170646, "loss": 0.4999, "rewards/chosen": 0.001419726984645173, "rewards/margins": 0.0006502778092238846, "rewards/rejected": 0.0007694491754212883, "step": 20 }, { "epoch": 0.015702695629416383, "grad_norm": 0.5221877360005158, "kl": 0.18417969346046448, "learning_rate": 2.617801047120419e-06, "logits/chosen": -1257032960.0, "logits/rejected": -1239207168.0, "logps/chosen": -317.86996904024767, "logps/rejected": -271.64668769716087, "loss": 0.4997, "rewards/chosen": 0.00518288627128483, "rewards/margins": 0.0010991144603621173, "rewards/rejected": 0.004083771810922713, "step": 30 }, { "epoch": 0.02093692750588851, "grad_norm": 0.514175907898874, "kl": 0.12470702826976776, "learning_rate": 3.4904013961605585e-06, "logits/chosen": -1334417792.0, "logits/rejected": -1143367296.0, "logps/chosen": -323.3053892215569, "logps/rejected": -279.16339869281046, "loss": 0.4993, "rewards/chosen": 0.19018326262514035, "rewards/margins": 0.1845642384855366, "rewards/rejected": 0.0056190241396037585, "step": 40 }, { "epoch": 0.02617115938236064, "grad_norm": 0.5085275851932604, "kl": 0.13925781846046448, "learning_rate": 4.363001745200698e-06, "logits/chosen": -1259549440.0, "logits/rejected": -1169791360.0, "logps/chosen": -361.9696048632219, "logps/rejected": -280.18006430868166, "loss": 0.4981, "rewards/chosen": 0.017455683653115502, "rewards/margins": 0.013712324906129971, "rewards/rejected": 0.0037433587469855307, "step": 50 }, { "epoch": 0.031405391258832765, "grad_norm": 0.46829128041895135, "kl": 0.0, "learning_rate": 5.235602094240838e-06, "logits/chosen": -1275907328.0, "logits/rejected": -1146722688.0, "logps/chosen": -318.89156626506025, "logps/rejected": -301.97402597402595, "loss": 0.4982, "rewards/chosen": 0.1808394121836467, "rewards/margins": 0.18323246590726683, "rewards/rejected": -0.00239305372362013, "step": 60 }, { "epoch": 0.036639623135304895, "grad_norm": 0.5227551593809364, "kl": 0.0, "learning_rate": 6.108202443280978e-06, "logits/chosen": -1260598016.0, "logits/rejected": -1303589632.0, "logps/chosen": -374.28753993610223, "logps/rejected": -286.38532110091745, "loss": 0.4954, "rewards/chosen": 0.015276729108426517, "rewards/margins": 0.03535449573686689, "rewards/rejected": -0.020077766628440366, "step": 70 }, { "epoch": 0.04187385501177702, "grad_norm": 0.5592960536432072, "kl": 0.0, "learning_rate": 6.980802792321117e-06, "logits/chosen": -1319947520.0, "logits/rejected": -1250951168.0, "logps/chosen": -333.88957055214723, "logps/rejected": -304.71337579617835, "loss": 0.4945, "rewards/chosen": -0.0124519207726227, "rewards/margins": 0.0456597681206894, "rewards/rejected": -0.0581116888933121, "step": 80 }, { "epoch": 0.04710808688824915, "grad_norm": 0.5380069808961143, "kl": 0.0, "learning_rate": 7.853403141361257e-06, "logits/chosen": -1415158144.0, "logits/rejected": -1307154816.0, "logps/chosen": -353.02127659574467, "logps/rejected": -296.38585209003213, "loss": 0.4907, "rewards/chosen": -0.07930934175531915, "rewards/margins": 0.08092226998101526, "rewards/rejected": -0.1602316117363344, "step": 90 }, { "epoch": 0.05234231876472128, "grad_norm": 0.5652054096019196, "kl": 0.0, "learning_rate": 8.726003490401396e-06, "logits/chosen": -1422078720.0, "logits/rejected": -1432354816.0, "logps/chosen": -374.0444444444444, "logps/rejected": -317.04615384615386, "loss": 0.4859, "rewards/chosen": -0.3256448412698413, "rewards/margins": 0.1057493894993895, "rewards/rejected": -0.4313942307692308, "step": 100 }, { "epoch": 0.05234231876472128, "eval_kl": 0.0, "eval_logits/chosen": -3175687424.0, "eval_logits/rejected": -3128284928.0, "eval_logps/chosen": -393.0410687778328, "eval_logps/rejected": -370.77672799602186, "eval_loss": 0.485539048910141, "eval_rewards/chosen": -0.6135576447303315, "eval_rewards/margins": 0.14153932195291075, "eval_rewards/rejected": -0.7550969666832422, "eval_runtime": 93.7067, "eval_samples_per_second": 42.686, "eval_steps_per_second": 0.672, "step": 100 }, { "epoch": 0.05757655064119341, "grad_norm": 0.731851227511473, "kl": 0.0, "learning_rate": 9.598603839441536e-06, "logits/chosen": -1513304832.0, "logits/rejected": -1316382336.0, "logps/chosen": -370.15204678362574, "logps/rejected": -375.89261744966444, "loss": 0.4897, "rewards/chosen": -0.49762426900584794, "rewards/margins": 0.17886482495388362, "rewards/rejected": -0.6764890939597316, "step": 110 }, { "epoch": 0.06281078251766553, "grad_norm": 0.8844414419531235, "kl": 0.0, "learning_rate": 1.0471204188481676e-05, "logits/chosen": -1446615424.0, "logits/rejected": -1554409088.0, "logps/chosen": -438.3061889250814, "logps/rejected": -390.4384384384384, "loss": 0.4743, "rewards/chosen": -0.9323086319218241, "rewards/margins": 0.2139000767868846, "rewards/rejected": -1.1462087087087087, "step": 120 }, { "epoch": 0.06804501439413765, "grad_norm": 0.5762003679726659, "kl": 0.0, "learning_rate": 1.1343804537521815e-05, "logits/chosen": -1495479040.0, "logits/rejected": -1531759872.0, "logps/chosen": -415.79421221864953, "logps/rejected": -433.3130699088146, "loss": 0.4632, "rewards/chosen": -0.9915594855305466, "rewards/margins": 0.35456513453024374, "rewards/rejected": -1.3461246200607904, "step": 130 }, { "epoch": 0.07327924627060979, "grad_norm": 0.7230629257517631, "kl": 0.0, "learning_rate": 1.2216404886561955e-05, "logits/chosen": -1567201664.0, "logits/rejected": -1556715904.0, "logps/chosen": -462.81481481481484, "logps/rejected": -487.49367088607596, "loss": 0.4764, "rewards/chosen": -1.3439429012345678, "rewards/margins": 0.37441152914517906, "rewards/rejected": -1.7183544303797469, "step": 140 }, { "epoch": 0.07851347814708191, "grad_norm": 0.6568103530741317, "kl": 0.0, "learning_rate": 1.3089005235602096e-05, "logits/chosen": -1576638848.0, "logits/rejected": -1556506240.0, "logps/chosen": -454.82866043613706, "logps/rejected": -408.9780564263323, "loss": 0.4709, "rewards/chosen": -0.938376168224299, "rewards/margins": 0.3631520449418453, "rewards/rejected": -1.3015282131661443, "step": 150 }, { "epoch": 0.08374771002355404, "grad_norm": 0.7681435781467648, "kl": 0.0, "learning_rate": 1.3961605584642234e-05, "logits/chosen": -1524839168.0, "logits/rejected": -1607886464.0, "logps/chosen": -391.9225806451613, "logps/rejected": -412.3151515151515, "loss": 0.462, "rewards/chosen": -0.7109879032258064, "rewards/margins": 0.56287573313783, "rewards/rejected": -1.2738636363636364, "step": 160 }, { "epoch": 0.08898194190002617, "grad_norm": 0.6470606006807348, "kl": 0.0, "learning_rate": 1.4834205933682374e-05, "logits/chosen": -1812358784.0, "logits/rejected": -1592996608.0, "logps/chosen": -403.7957957957958, "logps/rejected": -453.628664495114, "loss": 0.4832, "rewards/chosen": -0.9804804804804805, "rewards/margins": 0.4303501384120276, "rewards/rejected": -1.4108306188925082, "step": 170 }, { "epoch": 0.0942161737764983, "grad_norm": 0.6062523081778814, "kl": 0.0, "learning_rate": 1.5706806282722515e-05, "logits/chosen": -1425224448.0, "logits/rejected": -1423337088.0, "logps/chosen": -353.8, "logps/rejected": -389.05, "loss": 0.4639, "rewards/chosen": -0.6763671875, "rewards/margins": 0.5416015625, "rewards/rejected": -1.21796875, "step": 180 }, { "epoch": 0.09945040565297043, "grad_norm": 0.5853123473869152, "kl": 0.0, "learning_rate": 1.6579406631762653e-05, "logits/chosen": -1505126016.0, "logits/rejected": -1503028864.0, "logps/chosen": -414.8, "logps/rejected": -472.75, "loss": 0.4585, "rewards/chosen": -0.96484375, "rewards/margins": 0.764453125, "rewards/rejected": -1.729296875, "step": 190 }, { "epoch": 0.10468463752944256, "grad_norm": 0.5544132434798503, "kl": 0.0, "learning_rate": 1.7452006980802792e-05, "logits/chosen": -1624034560.0, "logits/rejected": -1637456256.0, "logps/chosen": -481.99376947040497, "logps/rejected": -547.7115987460814, "loss": 0.4556, "rewards/chosen": -1.3319704049844237, "rewards/margins": 1.131783513510874, "rewards/rejected": -2.4637539184952977, "step": 200 }, { "epoch": 0.10468463752944256, "eval_kl": 0.0, "eval_logits/chosen": -3518155520.0, "eval_logits/rejected": -3502177280.0, "eval_logps/chosen": -452.2592775853538, "eval_logps/rejected": -491.090999502735, "eval_loss": 0.46578124165534973, "eval_rewards/chosen": -1.2023132112815438, "eval_rewards/margins": 0.757905585337054, "eval_rewards/rejected": -1.9602187966185978, "eval_runtime": 93.4704, "eval_samples_per_second": 42.794, "eval_steps_per_second": 0.674, "step": 200 }, { "epoch": 0.10991886940591468, "grad_norm": 0.5633400221663752, "kl": 0.0, "learning_rate": 1.8324607329842934e-05, "logits/chosen": -1558393600.0, "logits/rejected": -1515401984.0, "logps/chosen": -446.51851851851853, "logps/rejected": -465.82278481012656, "loss": 0.4702, "rewards/chosen": -1.308641975308642, "rewards/margins": 0.7303216322862949, "rewards/rejected": -2.038963607594937, "step": 210 }, { "epoch": 0.11515310128238682, "grad_norm": 0.8284904491112901, "kl": 0.0, "learning_rate": 1.9197207678883072e-05, "logits/chosen": -1535954176.0, "logits/rejected": -1548537088.0, "logps/chosen": -430.65, "logps/rejected": -465.8, "loss": 0.4699, "rewards/chosen": -1.17978515625, "rewards/margins": 0.42138671875, "rewards/rejected": -1.601171875, "step": 220 }, { "epoch": 0.12038733315885894, "grad_norm": 0.5556844000663201, "kl": 0.0, "learning_rate": 2.006980802792321e-05, "logits/chosen": -1586914944.0, "logits/rejected": -1648780928.0, "logps/chosen": -457.20127795527156, "logps/rejected": -569.6391437308869, "loss": 0.4462, "rewards/chosen": -1.5564097444089458, "rewards/margins": 1.1716101332668953, "rewards/rejected": -2.728019877675841, "step": 230 }, { "epoch": 0.12562156503533106, "grad_norm": 0.6855447682569064, "kl": 0.0, "learning_rate": 2.0942408376963353e-05, "logits/chosen": -1610193280.0, "logits/rejected": -1664719232.0, "logps/chosen": -527.1847133757962, "logps/rejected": -572.2699386503067, "loss": 0.4581, "rewards/chosen": -1.6970541401273886, "rewards/margins": 1.0817035285842678, "rewards/rejected": -2.7787576687116564, "step": 240 }, { "epoch": 0.13085579691180318, "grad_norm": 0.5884560345985042, "kl": 0.0, "learning_rate": 2.181500872600349e-05, "logits/chosen": -1686949120.0, "logits/rejected": -1573283456.0, "logps/chosen": -567.063063063063, "logps/rejected": -599.4527687296417, "loss": 0.4696, "rewards/chosen": -2.0405405405405403, "rewards/margins": 1.0877167884496877, "rewards/rejected": -3.128257328990228, "step": 250 }, { "epoch": 0.1360900287882753, "grad_norm": 0.6800356341208197, "kl": 0.0, "learning_rate": 2.268760907504363e-05, "logits/chosen": -1592157824.0, "logits/rejected": -1575800064.0, "logps/chosen": -456.7725856697819, "logps/rejected": -522.3322884012539, "loss": 0.4699, "rewards/chosen": -1.407904984423676, "rewards/margins": 0.6860409403412142, "rewards/rejected": -2.09394592476489, "step": 260 }, { "epoch": 0.14132426066474746, "grad_norm": 0.605890168743028, "kl": 0.0, "learning_rate": 2.3560209424083772e-05, "logits/chosen": -1526307200.0, "logits/rejected": -1540358144.0, "logps/chosen": -449.85, "logps/rejected": -448.7, "loss": 0.4709, "rewards/chosen": -1.132080078125, "rewards/margins": 0.3761230468750001, "rewards/rejected": -1.508203125, "step": 270 }, { "epoch": 0.14655849254121958, "grad_norm": 0.5866481803635993, "kl": 0.0, "learning_rate": 2.443280977312391e-05, "logits/chosen": -1631374592.0, "logits/rejected": -1556925696.0, "logps/chosen": -431.41104294478527, "logps/rejected": -466.0891719745223, "loss": 0.4686, "rewards/chosen": -1.1589340490797546, "rewards/margins": 0.5214003458247041, "rewards/rejected": -1.6803343949044587, "step": 280 }, { "epoch": 0.1517927244176917, "grad_norm": 0.6838595939783, "kl": 0.0, "learning_rate": 2.5305410122164053e-05, "logits/chosen": -1706242816.0, "logits/rejected": -1674785536.0, "logps/chosen": -463.2049689440994, "logps/rejected": -457.40880503144655, "loss": 0.4689, "rewards/chosen": -1.1663431677018634, "rewards/margins": 0.46455305871323094, "rewards/rejected": -1.6308962264150944, "step": 290 }, { "epoch": 0.15702695629416383, "grad_norm": 0.657362779444818, "kl": 0.0, "learning_rate": 2.617801047120419e-05, "logits/chosen": -1563007360.0, "logits/rejected": -1538680448.0, "logps/chosen": -410.1183800623053, "logps/rejected": -425.8808777429467, "loss": 0.4658, "rewards/chosen": -1.044489875389408, "rewards/margins": 0.5211449208488363, "rewards/rejected": -1.5656347962382444, "step": 300 }, { "epoch": 0.15702695629416383, "eval_kl": 0.0, "eval_logits/chosen": -3454242304.0, "eval_logits/rejected": -3428943360.0, "eval_logps/chosen": -416.22167243938645, "eval_logps/rejected": -439.7414221780209, "eval_loss": 0.46299219131469727, "eval_rewards/chosen": -0.8429614052449282, "eval_rewards/margins": 0.6032096539296118, "eval_rewards/rejected": -1.44617105917454, "eval_runtime": 93.4505, "eval_samples_per_second": 42.803, "eval_steps_per_second": 0.674, "step": 300 }, { "epoch": 0.16226118817063595, "grad_norm": 0.5595274955541248, "kl": 0.0, "learning_rate": 2.7050610820244333e-05, "logits/chosen": -1672269056.0, "logits/rejected": -1616065280.0, "logps/chosen": -503.0617283950617, "logps/rejected": -515.746835443038, "loss": 0.4648, "rewards/chosen": -1.6059992283950617, "rewards/margins": 0.6509232399593685, "rewards/rejected": -2.2569224683544302, "step": 310 }, { "epoch": 0.16749542004710807, "grad_norm": 0.5451493353326954, "kl": 0.0, "learning_rate": 2.792321116928447e-05, "logits/chosen": -1698273664.0, "logits/rejected": -1834588544.0, "logps/chosen": -461.6848874598071, "logps/rejected": -551.0030395136778, "loss": 0.4571, "rewards/chosen": -1.5639067524115755, "rewards/margins": 0.9851054056431359, "rewards/rejected": -2.5490121580547114, "step": 320 }, { "epoch": 0.17272965192358022, "grad_norm": 0.5540765788797768, "kl": 0.0, "learning_rate": 2.879581151832461e-05, "logits/chosen": -1663880448.0, "logits/rejected": -1705194240.0, "logps/chosen": -472.02492211838006, "logps/rejected": -502.77115987460814, "loss": 0.4663, "rewards/chosen": -1.4273753894080996, "rewards/margins": 0.7626716325354739, "rewards/rejected": -2.1900470219435735, "step": 330 }, { "epoch": 0.17796388380005235, "grad_norm": 0.6702465977915886, "kl": 0.0, "learning_rate": 2.966841186736475e-05, "logits/chosen": -1732247552.0, "logits/rejected": -1823264000.0, "logps/chosen": -472.2547770700637, "logps/rejected": -495.0184049079755, "loss": 0.4653, "rewards/chosen": -1.3769904458598725, "rewards/margins": 0.5137304130358331, "rewards/rejected": -1.8907208588957056, "step": 340 }, { "epoch": 0.18319811567652447, "grad_norm": 0.5946325216041444, "kl": 0.0, "learning_rate": 3.054101221640489e-05, "logits/chosen": -1580413696.0, "logits/rejected": -1975517184.0, "logps/chosen": -502.18815331010455, "logps/rejected": -546.356940509915, "loss": 0.4393, "rewards/chosen": -1.6064895470383276, "rewards/margins": 0.7727597447463748, "rewards/rejected": -2.3792492917847023, "step": 350 }, { "epoch": 0.1884323475529966, "grad_norm": 0.7184388546396678, "kl": 0.0, "learning_rate": 3.141361256544503e-05, "logits/chosen": -1819279360.0, "logits/rejected": -1610612736.0, "logps/chosen": -578.5060240963855, "logps/rejected": -636.0519480519481, "loss": 0.4787, "rewards/chosen": -2.4390060240963853, "rewards/margins": 1.0800686512282898, "rewards/rejected": -3.519074675324675, "step": 360 }, { "epoch": 0.19366657942946872, "grad_norm": 0.5963756654926242, "kl": 0.0, "learning_rate": 3.228621291448517e-05, "logits/chosen": -1832910848.0, "logits/rejected": -1772932352.0, "logps/chosen": -433.1636363636364, "logps/rejected": -440.46451612903223, "loss": 0.4755, "rewards/chosen": -1.0137310606060606, "rewards/margins": 0.49554313294232655, "rewards/rejected": -1.509274193548387, "step": 370 }, { "epoch": 0.19890081130594087, "grad_norm": 0.7653020375478802, "kl": 0.0, "learning_rate": 3.3158813263525307e-05, "logits/chosen": -1737280768.0, "logits/rejected": -1738958464.0, "logps/chosen": -419.92452830188677, "logps/rejected": -482.18633540372673, "loss": 0.4546, "rewards/chosen": -1.0692315251572326, "rewards/margins": 0.561785555588109, "rewards/rejected": -1.6310170807453417, "step": 380 }, { "epoch": 0.204135043182413, "grad_norm": 0.5470598105886728, "kl": 0.0, "learning_rate": 3.403141361256545e-05, "logits/chosen": -1615645952.0, "logits/rejected": -1695757056.0, "logps/chosen": -523.5527156549521, "logps/rejected": -659.8654434250765, "loss": 0.4491, "rewards/chosen": -1.9094448881789137, "rewards/margins": 1.6620382922492207, "rewards/rejected": -3.5714831804281344, "step": 390 }, { "epoch": 0.2093692750588851, "grad_norm": 0.4397255692980879, "kl": 0.0, "learning_rate": 3.4904013961605584e-05, "logits/chosen": -1483944704.0, "logits/rejected": -1774610048.0, "logps/chosen": -507.66101694915255, "logps/rejected": -540.9391304347826, "loss": 0.4543, "rewards/chosen": -1.9313559322033897, "rewards/margins": 0.41791943011545096, "rewards/rejected": -2.3492753623188407, "step": 400 }, { "epoch": 0.2093692750588851, "eval_kl": 0.0, "eval_logits/chosen": -3808428032.0, "eval_logits/rejected": -3728536576.0, "eval_logps/chosen": -562.5413161801089, "eval_logps/rejected": -589.5574341123819, "eval_loss": 0.47276562452316284, "eval_rewards/chosen": -2.3096239485403265, "eval_rewards/margins": 0.6356768968102453, "eval_rewards/rejected": -2.945300845350572, "eval_runtime": 93.4384, "eval_samples_per_second": 42.809, "eval_steps_per_second": 0.674, "step": 400 }, { "epoch": 0.21460350693535724, "grad_norm": 0.6281588994781631, "kl": 0.0, "learning_rate": 3.5776614310645726e-05, "logits/chosen": -1783837440.0, "logits/rejected": -1711066368.0, "logps/chosen": -513.4723926380368, "logps/rejected": -534.624203821656, "loss": 0.4732, "rewards/chosen": -1.7965874233128833, "rewards/margins": 0.6082692645852059, "rewards/rejected": -2.4048566878980893, "step": 410 }, { "epoch": 0.21983773881182936, "grad_norm": 0.6464496703356261, "kl": 0.0, "learning_rate": 3.664921465968587e-05, "logits/chosen": -1744830464.0, "logits/rejected": -1841718912.0, "logps/chosen": -475.9, "logps/rejected": -487.8, "loss": 0.4621, "rewards/chosen": -1.2525390625, "rewards/margins": 0.6818359375, "rewards/rejected": -1.934375, "step": 420 }, { "epoch": 0.22507197068830148, "grad_norm": 0.5760537357814977, "kl": 0.0, "learning_rate": 3.752181500872601e-05, "logits/chosen": -1720084096.0, "logits/rejected": -1644586624.0, "logps/chosen": -572.1212121212121, "logps/rejected": -579.0967741935484, "loss": 0.4851, "rewards/chosen": -2.3323863636363638, "rewards/margins": 0.5716458944281522, "rewards/rejected": -2.904032258064516, "step": 430 }, { "epoch": 0.23030620256477363, "grad_norm": 0.46548421992086714, "kl": 0.0, "learning_rate": 3.8394415357766145e-05, "logits/chosen": -1524839168.0, "logits/rejected": -1457101184.0, "logps/chosen": -469.3333333333333, "logps/rejected": -510.7848101265823, "loss": 0.4737, "rewards/chosen": -1.548707561728395, "rewards/margins": 0.6397813623222379, "rewards/rejected": -2.188488924050633, "step": 440 }, { "epoch": 0.23554043444124576, "grad_norm": 0.632322529320257, "kl": 0.0, "learning_rate": 3.926701570680629e-05, "logits/chosen": -1409705600.0, "logits/rejected": -1382442624.0, "logps/chosen": -430.2278481012658, "logps/rejected": -464.98765432098764, "loss": 0.4558, "rewards/chosen": -0.9287974683544303, "rewards/margins": 0.8775296921393968, "rewards/rejected": -1.8063271604938271, "step": 450 }, { "epoch": 0.24077466631771788, "grad_norm": 0.863998535094776, "kl": 0.0, "learning_rate": 4.013961605584642e-05, "logits/chosen": -1358744832.0, "logits/rejected": -1437807360.0, "logps/chosen": -477.8877887788779, "logps/rejected": -527.8575667655787, "loss": 0.4473, "rewards/chosen": -1.2968234323432344, "rewards/margins": 1.076692888131543, "rewards/rejected": -2.3735163204747773, "step": 460 }, { "epoch": 0.24600889819419, "grad_norm": 0.5718990529028527, "kl": 0.0, "learning_rate": 4.1012216404886564e-05, "logits/chosen": -1557554816.0, "logits/rejected": -1502399744.0, "logps/chosen": -697.2, "logps/rejected": -779.5, "loss": 0.476, "rewards/chosen": -2.8913015365600585, "rewards/margins": 2.1313547134399413, "rewards/rejected": -5.02265625, "step": 470 }, { "epoch": 0.2512431300706621, "grad_norm": 0.4910531304249267, "kl": 0.0, "learning_rate": 4.1884816753926706e-05, "logits/chosen": -1737700096.0, "logits/rejected": -1578945792.0, "logps/chosen": -573.3809523809524, "logps/rejected": -857.7894736842105, "loss": 0.4705, "rewards/chosen": -2.5364583333333335, "rewards/margins": 3.074561403508772, "rewards/rejected": -5.611019736842105, "step": 480 }, { "epoch": 0.2564773619471343, "grad_norm": 0.623179288487328, "kl": 0.0, "learning_rate": 4.275741710296685e-05, "logits/chosen": -1720922880.0, "logits/rejected": -1587963520.0, "logps/chosen": -648.072072072072, "logps/rejected": -719.4267100977198, "loss": 0.4775, "rewards/chosen": -3.0417605105105103, "rewards/margins": 1.1585652223885123, "rewards/rejected": -4.200325732899023, "step": 490 }, { "epoch": 0.26171159382360637, "grad_norm": 0.5699561570689229, "kl": 0.0, "learning_rate": 4.363001745200698e-05, "logits/chosen": -1380974592.0, "logits/rejected": -1450390272.0, "logps/chosen": -548.4967320261438, "logps/rejected": -635.4011976047905, "loss": 0.4445, "rewards/chosen": -2.0408496732026142, "rewards/margins": 1.4277132010488827, "rewards/rejected": -3.468562874251497, "step": 500 }, { "epoch": 0.26171159382360637, "eval_kl": 0.0, "eval_logits/chosen": -3128418048.0, "eval_logits/rejected": -2977955840.0, "eval_logps/chosen": -534.3255813953489, "eval_logps/rejected": -585.3565390353058, "eval_loss": 0.4657031297683716, "eval_rewards/chosen": -2.027090549233053, "eval_rewards/margins": 0.8744509723979763, "eval_rewards/rejected": -2.9015415216310294, "eval_runtime": 93.4439, "eval_samples_per_second": 42.806, "eval_steps_per_second": 0.674, "step": 500 }, { "epoch": 0.2669458257000785, "grad_norm": 0.5227183067644776, "kl": 0.0, "learning_rate": 4.4502617801047125e-05, "logits/chosen": -1514143744.0, "logits/rejected": -1418094208.0, "logps/chosen": -464.4938271604938, "logps/rejected": -544.1012658227849, "loss": 0.4525, "rewards/chosen": -1.46875, "rewards/margins": 1.2005537974683542, "rewards/rejected": -2.6693037974683542, "step": 510 }, { "epoch": 0.2721800575765506, "grad_norm": 0.7979790881328824, "kl": 0.0, "learning_rate": 4.537521815008726e-05, "logits/chosen": -1408447232.0, "logits/rejected": -1350356224.0, "logps/chosen": -539.2445820433436, "logps/rejected": -590.9400630914827, "loss": 0.4654, "rewards/chosen": -1.9049922600619196, "rewards/margins": 1.1391717777929702, "rewards/rejected": -3.0441640378548898, "step": 520 }, { "epoch": 0.27741428945302277, "grad_norm": 0.5782821069877753, "kl": 0.0, "learning_rate": 4.62478184991274e-05, "logits/chosen": -1633471744.0, "logits/rejected": -1529452928.0, "logps/chosen": -447.219512195122, "logps/rejected": -482.56410256410254, "loss": 0.4671, "rewards/chosen": -1.1834984756097562, "rewards/margins": 0.7592098577235771, "rewards/rejected": -1.9427083333333333, "step": 530 }, { "epoch": 0.2826485213294949, "grad_norm": 0.5557000802623413, "kl": 0.0, "learning_rate": 4.7120418848167544e-05, "logits/chosen": -1497576192.0, "logits/rejected": -1404252928.0, "logps/chosen": -536.9107692307692, "logps/rejected": -561.168253968254, "loss": 0.4802, "rewards/chosen": -2.223846153846154, "rewards/margins": 0.345995115995116, "rewards/rejected": -2.56984126984127, "step": 540 }, { "epoch": 0.287882753205967, "grad_norm": 0.5326154463719267, "kl": 0.0, "learning_rate": 4.7993019197207686e-05, "logits/chosen": -1470732672.0, "logits/rejected": -1550214784.0, "logps/chosen": -600.2077922077922, "logps/rejected": -632.5783132530121, "loss": 0.4592, "rewards/chosen": -2.6318993506493507, "rewards/margins": 0.8703596854952278, "rewards/rejected": -3.5022590361445785, "step": 550 }, { "epoch": 0.29311698508243916, "grad_norm": 1.3655207310078739, "kl": 0.0, "learning_rate": 4.886561954624782e-05, "logits/chosen": -1554828544.0, "logits/rejected": -1595513216.0, "logps/chosen": -491.62700964630227, "logps/rejected": -535.2462006079028, "loss": 0.4513, "rewards/chosen": -1.6141479099678457, "rewards/margins": 0.9189068012783548, "rewards/rejected": -2.5330547112462005, "step": 560 }, { "epoch": 0.29835121695891126, "grad_norm": 0.647903484038066, "kl": 0.0, "learning_rate": 4.973821989528796e-05, "logits/chosen": -1670801024.0, "logits/rejected": -1741894400.0, "logps/chosen": -527.0709677419355, "logps/rejected": -641.8424242424243, "loss": 0.4469, "rewards/chosen": -2.0377016129032257, "rewards/margins": 1.4562377810361684, "rewards/rejected": -3.493939393939394, "step": 570 }, { "epoch": 0.3035854488353834, "grad_norm": 0.46006692434550134, "kl": 0.0, "learning_rate": 4.999977269399062e-05, "logits/chosen": -1725956096.0, "logits/rejected": -1775029504.0, "logps/chosen": -619.1746031746031, "logps/rejected": -694.5476923076923, "loss": 0.4667, "rewards/chosen": -2.857738095238095, "rewards/margins": 1.0472619047619047, "rewards/rejected": -3.905, "step": 580 }, { "epoch": 0.30881968071185556, "grad_norm": 0.505652615532779, "kl": 0.0, "learning_rate": 4.9998659368385024e-05, "logits/chosen": -1675204992.0, "logits/rejected": -1821586176.0, "logps/chosen": -670.7368421052631, "logps/rejected": -852.5714285714286, "loss": 0.4549, "rewards/chosen": -3.2960526315789473, "rewards/margins": 2.3029057017543857, "rewards/rejected": -5.598958333333333, "step": 590 }, { "epoch": 0.31405391258832765, "grad_norm": 0.47080891304692235, "kl": 0.0, "learning_rate": 4.999661831436499e-05, "logits/chosen": -1803970176.0, "logits/rejected": -1722600704.0, "logps/chosen": -609.3700305810397, "logps/rejected": -651.7571884984026, "loss": 0.4654, "rewards/chosen": -2.3581804281345566, "rewards/margins": 1.0863083897568173, "rewards/rejected": -3.444488817891374, "step": 600 }, { "epoch": 0.31405391258832765, "eval_kl": 0.0, "eval_logits/chosen": -3596449280.0, "eval_logits/rejected": -3444389120.0, "eval_logps/chosen": -529.132112815438, "eval_logps/rejected": -582.269517652909, "eval_loss": 0.4657500088214874, "eval_rewards/chosen": -1.9722909450766948, "eval_rewards/margins": 0.896306767504111, "eval_rewards/rejected": -2.8685977125808058, "eval_runtime": 93.4365, "eval_samples_per_second": 42.81, "eval_steps_per_second": 0.674, "step": 600 }, { "epoch": 0.3192881444647998, "grad_norm": 0.5339767024170512, "kl": 0.0, "learning_rate": 4.9993649607676306e-05, "logits/chosen": -1825361152.0, "logits/rejected": -1622776192.0, "logps/chosen": -454.8433734939759, "logps/rejected": -541.922077922078, "loss": 0.4677, "rewards/chosen": -1.3794239457831325, "rewards/margins": 0.8608357944766076, "rewards/rejected": -2.24025974025974, "step": 610 }, { "epoch": 0.3245223763412719, "grad_norm": 0.5213452170759816, "kl": 0.0, "learning_rate": 4.998975335849104e-05, "logits/chosen": -1747347072.0, "logits/rejected": -2048078592.0, "logps/chosen": -562.7034482758621, "logps/rejected": -542.5371428571428, "loss": 0.451, "rewards/chosen": -2.232112068965517, "rewards/margins": 0.32038793103448304, "rewards/rejected": -2.5525, "step": 620 }, { "epoch": 0.32975660821774405, "grad_norm": 0.4969950941570501, "kl": 0.0, "learning_rate": 4.998492971140339e-05, "logits/chosen": -1885759104.0, "logits/rejected": -1890792192.0, "logps/chosen": -744.7923322683706, "logps/rejected": -789.9204892966361, "loss": 0.4539, "rewards/chosen": -4.179313099041534, "rewards/margins": 0.8566196226710048, "rewards/rejected": -5.0359327217125385, "step": 630 }, { "epoch": 0.33499084009421615, "grad_norm": 0.5451050219577263, "kl": 0.0, "learning_rate": 4.997917884542433e-05, "logits/chosen": -1594674432.0, "logits/rejected": -1598449280.0, "logps/chosen": -773.7539432176657, "logps/rejected": -728.1733746130031, "loss": 0.4474, "rewards/chosen": -4.292981072555205, "rewards/margins": 0.29331923704231855, "rewards/rejected": -4.586300309597523, "step": 640 }, { "epoch": 0.3402250719706883, "grad_norm": 0.5566572286394867, "kl": 0.0, "learning_rate": 4.997250097397497e-05, "logits/chosen": -1689885056.0, "logits/rejected": -1711695488.0, "logps/chosen": -810.5, "logps/rejected": -962.2, "loss": 0.458, "rewards/chosen": -4.571484375, "rewards/margins": 1.9644531250000004, "rewards/rejected": -6.5359375, "step": 650 }, { "epoch": 0.34545930384716045, "grad_norm": 0.7822806261860601, "kl": 0.0, "learning_rate": 4.9964896344878655e-05, "logits/chosen": -1595932672.0, "logits/rejected": -1489816832.0, "logps/chosen": -600.5696594427245, "logps/rejected": -857.4384858044164, "loss": 0.4588, "rewards/chosen": -2.655185758513932, "rewards/margins": 3.174072916564932, "rewards/rejected": -5.829258675078864, "step": 660 }, { "epoch": 0.35069353572363254, "grad_norm": 0.5814521512972838, "kl": 0.0, "learning_rate": 4.995636524035173e-05, "logits/chosen": -1610193280.0, "logits/rejected": -1613758464.0, "logps/chosen": -492.0261437908497, "logps/rejected": -678.9940119760479, "loss": 0.4322, "rewards/chosen": -1.9438316993464053, "rewards/margins": 1.8391024323901217, "rewards/rejected": -3.782934131736527, "step": 670 }, { "epoch": 0.3559277676001047, "grad_norm": 0.9280019507247134, "kl": 0.0, "learning_rate": 4.9946907976993104e-05, "logits/chosen": -1555667328.0, "logits/rejected": -1712114944.0, "logps/chosen": -446.7854785478548, "logps/rejected": -587.0148367952522, "loss": 0.4426, "rewards/chosen": -1.2946575907590758, "rewards/margins": 1.6893928543447818, "rewards/rejected": -2.9840504451038576, "step": 680 }, { "epoch": 0.3611619994765768, "grad_norm": 0.5627181242723076, "kl": 0.0, "learning_rate": 4.9936524905772464e-05, "logits/chosen": -1844654848.0, "logits/rejected": -1409076480.0, "logps/chosen": -602.8156424581006, "logps/rejected": -633.0780141843971, "loss": 0.4862, "rewards/chosen": -2.725034916201117, "rewards/margins": 0.7687594100400177, "rewards/rejected": -3.493794326241135, "step": 690 }, { "epoch": 0.36639623135304894, "grad_norm": 0.7905331398546509, "kl": 0.0, "learning_rate": 4.992521641201728e-05, "logits/chosen": -1645215744.0, "logits/rejected": -1646054656.0, "logps/chosen": -442.0253164556962, "logps/rejected": -546.2716049382716, "loss": 0.4517, "rewards/chosen": -1.162381329113924, "rewards/margins": 1.1979581770589154, "rewards/rejected": -2.3603395061728394, "step": 700 }, { "epoch": 0.36639623135304894, "eval_kl": 0.0, "eval_logits/chosen": -3631068928.0, "eval_logits/rejected": -3583666688.0, "eval_logps/chosen": -452.750123701138, "eval_logps/rejected": -492.39582297364495, "eval_loss": 0.4689921736717224, "eval_rewards/chosen": -1.2098589807026225, "eval_rewards/margins": 0.7624184931909628, "eval_rewards/rejected": -1.9722774738935853, "eval_runtime": 93.452, "eval_samples_per_second": 42.803, "eval_steps_per_second": 0.674, "step": 700 }, { "epoch": 0.3716304632295211, "grad_norm": 0.9400152465029169, "kl": 0.0, "learning_rate": 4.991298291539852e-05, "logits/chosen": -1697434880.0, "logits/rejected": -1574122240.0, "logps/chosen": -561.9318885448916, "logps/rejected": -566.813880126183, "loss": 0.4746, "rewards/chosen": -2.135642414860681, "rewards/margins": 0.567038973151937, "rewards/rejected": -2.702681388012618, "step": 710 }, { "epoch": 0.3768646951059932, "grad_norm": 0.7974875506467937, "kl": 0.0, "learning_rate": 4.9899824869915e-05, "logits/chosen": -1780482048.0, "logits/rejected": -1550004992.0, "logps/chosen": -558.2222222222222, "logps/rejected": -649.9865771812081, "loss": 0.4847, "rewards/chosen": -2.4365862573099415, "rewards/margins": 1.2320379037638842, "rewards/rejected": -3.6686241610738257, "step": 720 }, { "epoch": 0.38209892698246534, "grad_norm": 0.6046106097461434, "kl": 0.0, "learning_rate": 4.988574276387662e-05, "logits/chosen": -1531759872.0, "logits/rejected": -1609354496.0, "logps/chosen": -733.272131147541, "logps/rejected": -945.7671641791045, "loss": 0.4492, "rewards/chosen": -3.739344262295082, "rewards/margins": 2.6472229018840223, "rewards/rejected": -6.386567164179104, "step": 730 }, { "epoch": 0.38733315885893743, "grad_norm": 0.7231031658869789, "kl": 0.0, "learning_rate": 4.9870737119886216e-05, "logits/chosen": -1537212416.0, "logits/rejected": -1564475392.0, "logps/chosen": -622.6542056074767, "logps/rejected": -822.4702194357367, "loss": 0.4693, "rewards/chosen": -2.7961448598130842, "rewards/margins": 2.4091842937919314, "rewards/rejected": -5.205329153605016, "step": 740 }, { "epoch": 0.3925673907354096, "grad_norm": 0.6790396014146793, "kl": 0.0, "learning_rate": 4.985480849482012e-05, "logits/chosen": -1664299776.0, "logits/rejected": -1463392640.0, "logps/chosen": -504.02373887240356, "logps/rejected": -571.7755775577558, "loss": 0.469, "rewards/chosen": -1.559532640949555, "rewards/margins": 1.1335366659811383, "rewards/rejected": -2.6930693069306932, "step": 750 }, { "epoch": 0.39780162261188173, "grad_norm": 0.7572345641049093, "kl": 0.0, "learning_rate": 4.983795747980757e-05, "logits/chosen": -1453326336.0, "logits/rejected": -1286812416.0, "logps/chosen": -544.6706586826348, "logps/rejected": -572.7581699346405, "loss": 0.4779, "rewards/chosen": -2.0988023952095807, "rewards/margins": 0.9269328989080665, "rewards/rejected": -3.025735294117647, "step": 760 }, { "epoch": 0.40303585448835383, "grad_norm": 0.4563766273978695, "kl": 0.0, "learning_rate": 4.982018470020871e-05, "logits/chosen": -1355808768.0, "logits/rejected": -1307364608.0, "logps/chosen": -589.6, "logps/rejected": -710.9, "loss": 0.4678, "rewards/chosen": -2.70078125, "rewards/margins": 1.475, "rewards/rejected": -4.17578125, "step": 770 }, { "epoch": 0.408270086364826, "grad_norm": 0.6737898724374982, "kl": 0.0, "learning_rate": 4.980149081559142e-05, "logits/chosen": -1413060992.0, "logits/rejected": -1393767168.0, "logps/chosen": -566.2, "logps/rejected": -598.1, "loss": 0.4712, "rewards/chosen": -2.268359375, "rewards/margins": 0.7742187499999997, "rewards/rejected": -3.042578125, "step": 780 }, { "epoch": 0.4135043182412981, "grad_norm": 0.9789884036557728, "kl": 0.0, "learning_rate": 4.978187651970683e-05, "logits/chosen": -1426692480.0, "logits/rejected": -1546439936.0, "logps/chosen": -710.5667752442997, "logps/rejected": -951.5435435435436, "loss": 0.4526, "rewards/chosen": -3.73371335504886, "rewards/margins": 2.6003707290352245, "rewards/rejected": -6.334084084084084, "step": 790 }, { "epoch": 0.4187385501177702, "grad_norm": 0.5400402364635294, "kl": 0.0, "learning_rate": 4.976134254046353e-05, "logits/chosen": -1541616384.0, "logits/rejected": -1428579968.0, "logps/chosen": -742.3806646525679, "logps/rejected": -925.9288025889967, "loss": 0.4701, "rewards/chosen": -4.000377643504532, "rewards/margins": 2.705933036107119, "rewards/rejected": -6.706310679611651, "step": 800 }, { "epoch": 0.4187385501177702, "eval_kl": 0.0, "eval_logits/chosen": -3269426688.0, "eval_logits/rejected": -3207910144.0, "eval_logps/chosen": -550.3809995051954, "eval_logps/rejected": -656.9627051218299, "eval_loss": 0.45948827266693115, "eval_rewards/chosen": -2.187654626422563, "eval_rewards/margins": 1.4299485560737075, "eval_rewards/rejected": -3.6176031824962704, "eval_runtime": 93.4665, "eval_samples_per_second": 42.796, "eval_steps_per_second": 0.674, "step": 800 }, { "epoch": 0.4239727819942423, "grad_norm": 0.43312826690611683, "kl": 0.0, "learning_rate": 4.973988963990065e-05, "logits/chosen": -1373215104.0, "logits/rejected": -1463392640.0, "logps/chosen": -502.2540716612378, "logps/rejected": -595.0750750750751, "loss": 0.4514, "rewards/chosen": -1.7492874592833876, "rewards/margins": 1.3389257539298258, "rewards/rejected": -3.0882132132132134, "step": 810 }, { "epoch": 0.42920701387071447, "grad_norm": 0.5639702493055443, "kl": 0.0, "learning_rate": 4.9717518614159496e-05, "logits/chosen": -1284086144.0, "logits/rejected": -1526307200.0, "logps/chosen": -545.2179930795847, "logps/rejected": -586.3931623931624, "loss": 0.4329, "rewards/chosen": -2.0311418685121105, "rewards/margins": 0.7427185303482884, "rewards/rejected": -2.773860398860399, "step": 820 }, { "epoch": 0.4344412457471866, "grad_norm": 0.5479773194208483, "kl": 0.0, "learning_rate": 4.9694230293454034e-05, "logits/chosen": -1395025536.0, "logits/rejected": -1466957824.0, "logps/chosen": -584.906148867314, "logps/rejected": -660.2054380664653, "loss": 0.4574, "rewards/chosen": -2.1725323624595467, "rewards/margins": 1.3969540423742903, "rewards/rejected": -3.569486404833837, "step": 830 }, { "epoch": 0.4396754776236587, "grad_norm": 1.2259598727664864, "kl": 0.0, "learning_rate": 4.9670025542040085e-05, "logits/chosen": -1483105920.0, "logits/rejected": -1462973184.0, "logps/chosen": -602.2, "logps/rejected": -715.8, "loss": 0.4609, "rewards/chosen": -2.4546875, "rewards/margins": 1.9167968749999997, "rewards/rejected": -4.371484375, "step": 840 }, { "epoch": 0.44490970950013087, "grad_norm": 0.467639624489036, "kl": 0.0, "learning_rate": 4.964490525818325e-05, "logits/chosen": -1678979840.0, "logits/rejected": -1438226816.0, "logps/chosen": -578.0645161290323, "logps/rejected": -736.9632107023411, "loss": 0.4765, "rewards/chosen": -2.4149560117302054, "rewards/margins": 1.5164821153600956, "rewards/rejected": -3.931438127090301, "step": 850 }, { "epoch": 0.45014394137660296, "grad_norm": 0.5160824174070019, "kl": 0.0, "learning_rate": 4.9618870374125554e-05, "logits/chosen": -1446195968.0, "logits/rejected": -1525258624.0, "logps/chosen": -492.3154574132492, "logps/rejected": -531.3188854489164, "loss": 0.4725, "rewards/chosen": -1.4590863248903292, "rewards/margins": 0.8056195574626119, "rewards/rejected": -2.264705882352941, "step": 860 }, { "epoch": 0.4553781732530751, "grad_norm": 0.5034576976762464, "kl": 0.0, "learning_rate": 4.959192185605088e-05, "logits/chosen": -1166016512.0, "logits/rejected": -1171259392.0, "logps/chosen": -500.0883280757098, "logps/rejected": -518.4396284829721, "loss": 0.4681, "rewards/chosen": -1.6275630914826498, "rewards/margins": 0.7458889209012511, "rewards/rejected": -2.373452012383901, "step": 870 }, { "epoch": 0.46061240512954726, "grad_norm": 0.529996647847534, "kl": 0.0, "learning_rate": 4.956406070404911e-05, "logits/chosen": -1160144512.0, "logits/rejected": -1177131392.0, "logps/chosen": -497.6551724137931, "logps/rejected": -610.8909657320872, "loss": 0.4637, "rewards/chosen": -1.6755485893416928, "rewards/margins": 1.2753859900975595, "rewards/rejected": -2.9509345794392523, "step": 880 }, { "epoch": 0.46584663700601936, "grad_norm": 0.5312067368635373, "kl": 0.0, "learning_rate": 4.953528795207896e-05, "logits/chosen": -1360212736.0, "logits/rejected": -1401526656.0, "logps/chosen": -615.0543130990416, "logps/rejected": -729.7370030581039, "loss": 0.4479, "rewards/chosen": -2.7581869009584663, "rewards/margins": 1.5912014782464263, "rewards/rejected": -4.349388379204893, "step": 890 }, { "epoch": 0.4710808688824915, "grad_norm": 0.5427173257752647, "kl": 0.0, "learning_rate": 4.9505604667929694e-05, "logits/chosen": -1431096576.0, "logits/rejected": -1388104960.0, "logps/chosen": -644.1823708206687, "logps/rejected": -853.8649517684887, "loss": 0.4711, "rewards/chosen": -2.942629179331307, "rewards/margins": 2.707290434816603, "rewards/rejected": -5.64991961414791, "step": 900 }, { "epoch": 0.4710808688824915, "eval_kl": 0.0, "eval_logits/chosen": -3022695168.0, "eval_logits/rejected": -3008713984.0, "eval_logps/chosen": -592.4354280059376, "eval_logps/rejected": -758.0387866732968, "eval_loss": 0.46263280510902405, "eval_rewards/chosen": -2.6066303809995053, "eval_rewards/margins": 2.022409897468918, "eval_rewards/rejected": -4.629040278468423, "eval_runtime": 93.4443, "eval_samples_per_second": 42.806, "eval_steps_per_second": 0.674, "step": 900 }, { "epoch": 0.4763151007589636, "grad_norm": 0.5670490372210191, "kl": 0.0, "learning_rate": 4.947501195318143e-05, "logits/chosen": -1286393088.0, "logits/rejected": -1343016192.0, "logps/chosen": -541.8770226537217, "logps/rejected": -748.5679758308157, "loss": 0.4443, "rewards/chosen": -2.2771035598705502, "rewards/margins": 2.1466124522140415, "rewards/rejected": -4.423716012084592, "step": 910 }, { "epoch": 0.48154933263543576, "grad_norm": 0.9235768559898764, "kl": 0.0, "learning_rate": 4.9443510943164264e-05, "logits/chosen": -1360632192.0, "logits/rejected": -1285134720.0, "logps/chosen": -606.2545454545455, "logps/rejected": -831.3806451612903, "loss": 0.4702, "rewards/chosen": -2.9428030303030304, "rewards/margins": 2.578164711632453, "rewards/rejected": -5.5209677419354835, "step": 920 }, { "epoch": 0.48678356451190785, "grad_norm": 0.9874634734285241, "kl": 0.0, "learning_rate": 4.941110280691619e-05, "logits/chosen": -1304218880.0, "logits/rejected": -1258920320.0, "logps/chosen": -656.0, "logps/rejected": -765.0445859872611, "loss": 0.4713, "rewards/chosen": -3.4745015337423313, "rewards/margins": 1.234893370716267, "rewards/rejected": -4.709394904458598, "step": 930 }, { "epoch": 0.49201779638838, "grad_norm": 1.2200993550123265, "kl": 0.0, "learning_rate": 4.937778874713963e-05, "logits/chosen": -1235851648.0, "logits/rejected": -1326658304.0, "logps/chosen": -526.688524590164, "logps/rejected": -661.3014925373134, "loss": 0.4579, "rewards/chosen": -1.9245901639344263, "rewards/margins": 1.7929471494984095, "rewards/rejected": -3.7175373134328358, "step": 940 }, { "epoch": 0.49725202826485215, "grad_norm": 0.5242476580045962, "kl": 0.0, "learning_rate": 4.93435700001569e-05, "logits/chosen": -1391460352.0, "logits/rejected": -1220961920.0, "logps/chosen": -505.7142857142857, "logps/rejected": -512.5263157894736, "loss": 0.4774, "rewards/chosen": -1.6203497023809523, "rewards/margins": 0.6912251331453636, "rewards/rejected": -2.311574835526316, "step": 950 }, { "epoch": 0.5024862601413242, "grad_norm": 0.4569538224283365, "kl": 0.0, "learning_rate": 4.930844783586425e-05, "logits/chosen": -1389992320.0, "logits/rejected": -1364616832.0, "logps/chosen": -444.84923076923076, "logps/rejected": -559.5428571428571, "loss": 0.4583, "rewards/chosen": -1.3008653846153846, "rewards/margins": 1.5765155677655678, "rewards/rejected": -2.8773809523809524, "step": 960 }, { "epoch": 0.5077204920177963, "grad_norm": 0.43309063699281336, "kl": 0.0, "learning_rate": 4.927242355768477e-05, "logits/chosen": -1397961472.0, "logits/rejected": -1420610816.0, "logps/chosen": -605.3587301587302, "logps/rejected": -647.1876923076923, "loss": 0.4748, "rewards/chosen": -2.7121031746031745, "rewards/margins": 0.9955891330891333, "rewards/rejected": -3.707692307692308, "step": 970 }, { "epoch": 0.5129547238942685, "grad_norm": 0.5990213278748463, "kl": 0.0, "learning_rate": 4.923549850251999e-05, "logits/chosen": -1486461312.0, "logits/rejected": -1447244544.0, "logps/chosen": -569.9378881987577, "logps/rejected": -693.1320754716982, "loss": 0.4635, "rewards/chosen": -2.345302795031056, "rewards/margins": 1.5124016074846676, "rewards/rejected": -3.8577044025157234, "step": 980 }, { "epoch": 0.5181889557707406, "grad_norm": 0.7014728476500259, "kl": 0.0, "learning_rate": 4.9197674040700333e-05, "logits/chosen": -1509320320.0, "logits/rejected": -1295620480.0, "logps/chosen": -584.8674698795181, "logps/rejected": -670.3376623376623, "loss": 0.4781, "rewards/chosen": -2.560617469879518, "rewards/margins": 1.3107299327178845, "rewards/rejected": -3.8713474025974026, "step": 990 }, { "epoch": 0.5234231876472127, "grad_norm": 0.5015363941820027, "kl": 0.0, "learning_rate": 4.915895157593418e-05, "logits/chosen": -993420928.0, "logits/rejected": -932393792.0, "logps/chosen": -586.2360248447205, "logps/rejected": -731.4716981132076, "loss": 0.4534, "rewards/chosen": -2.779114906832298, "rewards/margins": 1.5565769170670727, "rewards/rejected": -4.335691823899371, "step": 1000 }, { "epoch": 0.5234231876472127, "eval_kl": 0.0, "eval_logits/chosen": -2621240320.0, "eval_logits/rejected": -2359728640.0, "eval_logps/chosen": -548.955962394854, "eval_logps/rejected": -656.3580308304327, "eval_loss": 0.4593867063522339, "eval_rewards/chosen": -2.1703983176645223, "eval_rewards/margins": 1.4413620005851047, "eval_rewards/rejected": -3.611760318249627, "eval_runtime": 93.4283, "eval_samples_per_second": 42.814, "eval_steps_per_second": 0.674, "step": 1000 }, { "epoch": 0.528657419523685, "grad_norm": 0.4651845185250115, "kl": 0.0, "learning_rate": 4.911933254525583e-05, "logits/chosen": -1264582656.0, "logits/rejected": -1249692928.0, "logps/chosen": -512.2025316455696, "logps/rejected": -624.9876543209876, "loss": 0.4461, "rewards/chosen": -1.9050632911392404, "rewards/margins": 1.5521126347866856, "rewards/rejected": -3.457175925925926, "step": 1010 }, { "epoch": 0.533891651400157, "grad_norm": 0.8209032470774238, "kl": 0.0, "learning_rate": 4.907881841897216e-05, "logits/chosen": -1299395328.0, "logits/rejected": -1353921280.0, "logps/chosen": -617.7215189873418, "logps/rejected": -818.1728395061729, "loss": 0.4533, "rewards/chosen": -3.068631329113924, "rewards/margins": 2.218405707923113, "rewards/rejected": -5.287037037037037, "step": 1020 }, { "epoch": 0.5391258832766291, "grad_norm": 0.8025767619285588, "kl": 0.0, "learning_rate": 4.903741070060802e-05, "logits/chosen": -1437178240.0, "logits/rejected": -1362939136.0, "logps/chosen": -801.6534954407294, "logps/rejected": -1041.491961414791, "loss": 0.4819, "rewards/chosen": -4.88031914893617, "rewards/margins": 2.763169597044537, "rewards/rejected": -7.643488745980707, "step": 1030 }, { "epoch": 0.5443601151531012, "grad_norm": 0.5755210037603522, "kl": 0.0, "learning_rate": 4.899511092685051e-05, "logits/chosen": -1300024576.0, "logits/rejected": -1426273024.0, "logps/chosen": -617.6300940438872, "logps/rejected": -793.0218068535826, "loss": 0.448, "rewards/chosen": -2.7321708463949843, "rewards/margins": 2.284184293791932, "rewards/rejected": -5.016355140186916, "step": 1040 }, { "epoch": 0.5495943470295734, "grad_norm": 0.6536447981970849, "kl": 0.0, "learning_rate": 4.895192066749189e-05, "logits/chosen": -1290587392.0, "logits/rejected": -1367972224.0, "logps/chosen": -537.9872611464968, "logps/rejected": -568.0981595092024, "loss": 0.4598, "rewards/chosen": -1.950437898089172, "rewards/margins": 0.961659494548865, "rewards/rejected": -2.912097392638037, "step": 1050 }, { "epoch": 0.5548285789060455, "grad_norm": 0.7464676262058021, "kl": 0.0, "learning_rate": 4.890784152537134e-05, "logits/chosen": -1363358464.0, "logits/rejected": -1421030144.0, "logps/chosen": -612.5859872611464, "logps/rejected": -741.8895705521472, "loss": 0.4466, "rewards/chosen": -2.6575437898089174, "rewards/margins": 1.7922261488413893, "rewards/rejected": -4.449769938650307, "step": 1060 }, { "epoch": 0.5600628107825176, "grad_norm": 0.5391370350173297, "kl": 0.0, "learning_rate": 4.886287513631548e-05, "logits/chosen": -1447034880.0, "logits/rejected": -1292684544.0, "logps/chosen": -652.1904761904761, "logps/rejected": -886.7368421052631, "loss": 0.4765, "rewards/chosen": -3.2641369047619047, "rewards/margins": 2.480106516290727, "rewards/rejected": -5.744243421052632, "step": 1070 }, { "epoch": 0.5652970426589898, "grad_norm": 0.6642351807621497, "kl": 0.0, "learning_rate": 4.881702316907768e-05, "logits/chosen": -1291216512.0, "logits/rejected": -1240465408.0, "logps/chosen": -657.3211009174312, "logps/rejected": -793.0479233226837, "loss": 0.4577, "rewards/chosen": -3.2305045871559632, "rewards/margins": 1.8625465310548992, "rewards/rejected": -5.093051118210862, "step": 1080 }, { "epoch": 0.5705312745354619, "grad_norm": 0.7711978299001714, "kl": 0.0, "learning_rate": 4.8770287325276116e-05, "logits/chosen": -1279262720.0, "logits/rejected": -1235641984.0, "logps/chosen": -615.0219435736677, "logps/rejected": -897.196261682243, "loss": 0.4411, "rewards/chosen": -2.970219435736677, "rewards/margins": 2.7291575113038213, "rewards/rejected": -5.6993769470404985, "step": 1090 }, { "epoch": 0.575765506411934, "grad_norm": 0.4862124971831588, "kl": 0.0, "learning_rate": 4.872266933933058e-05, "logits/chosen": -1231867136.0, "logits/rejected": -1382442624.0, "logps/chosen": -613.1178451178451, "logps/rejected": -751.6734693877551, "loss": 0.4428, "rewards/chosen": -2.7518939393939394, "rewards/margins": 1.8669107253290926, "rewards/rejected": -4.618804664723032, "step": 1100 }, { "epoch": 0.575765506411934, "eval_kl": 0.0, "eval_logits/chosen": -2962243840.0, "eval_logits/rejected": -2838944512.0, "eval_logps/chosen": -587.938644235527, "eval_logps/rejected": -759.0253605171556, "eval_loss": 0.4583164155483246, "eval_rewards/chosen": -2.5622216724393865, "eval_rewards/margins": 2.079374548346292, "eval_rewards/rejected": -4.6415962207856785, "eval_runtime": 93.4298, "eval_samples_per_second": 42.813, "eval_steps_per_second": 0.674, "step": 1100 }, { "epoch": 0.5809997382884062, "grad_norm": 0.5321270952633724, "kl": 0.0, "learning_rate": 4.86741709783982e-05, "logits/chosen": -1324980608.0, "logits/rejected": -1339660672.0, "logps/chosen": -612.2322580645161, "logps/rejected": -860.4121212121212, "loss": 0.442, "rewards/chosen": -2.820161290322581, "rewards/margins": 2.7540811339198434, "rewards/rejected": -5.574242424242424, "step": 1110 }, { "epoch": 0.5862339701648783, "grad_norm": 0.5390449896110832, "kl": 0.0, "learning_rate": 4.86247940423078e-05, "logits/chosen": -1407608448.0, "logits/rejected": -1356228224.0, "logps/chosen": -845.8404907975461, "logps/rejected": -1190.216560509554, "loss": 0.4735, "rewards/chosen": -5.088957055214724, "rewards/margins": 3.746234027587823, "rewards/rejected": -8.835191082802547, "step": 1120 }, { "epoch": 0.5914682020413504, "grad_norm": 1.013994676787218, "kl": 0.0, "learning_rate": 4.857454036349308e-05, "logits/chosen": -1230399104.0, "logits/rejected": -1360003072.0, "logps/chosen": -815.6862745098039, "logps/rejected": -1295.5209580838323, "loss": 0.4508, "rewards/chosen": -5.069444444444445, "rewards/margins": 4.926813040585495, "rewards/rejected": -9.99625748502994, "step": 1130 }, { "epoch": 0.5967024339178225, "grad_norm": 0.4959809334818276, "kl": 0.0, "learning_rate": 4.8523411806924704e-05, "logits/chosen": -1222429952.0, "logits/rejected": -1233964288.0, "logps/chosen": -732.4668769716088, "logps/rejected": -1033.1145510835913, "loss": 0.4554, "rewards/chosen": -4.1857255520504735, "rewards/margins": 3.266286831850455, "rewards/rejected": -7.452012383900929, "step": 1140 }, { "epoch": 0.6019366657942947, "grad_norm": 0.5525159790700757, "kl": 0.0, "learning_rate": 4.8471410270041e-05, "logits/chosen": -1273600384.0, "logits/rejected": -1167694208.0, "logps/chosen": -672.0963855421687, "logps/rejected": -895.2727272727273, "loss": 0.4676, "rewards/chosen": -3.2428463855421685, "rewards/margins": 2.5915691988734157, "rewards/rejected": -5.834415584415584, "step": 1150 }, { "epoch": 0.6071708976707668, "grad_norm": 1.020879817955065, "kl": 0.0, "learning_rate": 4.84185376826776e-05, "logits/chosen": -1160144512.0, "logits/rejected": -1336934400.0, "logps/chosen": -576.9664429530201, "logps/rejected": -796.5380116959064, "loss": 0.4354, "rewards/chosen": -2.484060402684564, "rewards/margins": 2.3613343341575415, "rewards/rejected": -4.845394736842105, "step": 1160 }, { "epoch": 0.6124051295472389, "grad_norm": 1.5753296833031845, "kl": 0.0, "learning_rate": 4.8364796006995785e-05, "logits/chosen": -1258291200.0, "logits/rejected": -1169791360.0, "logps/chosen": -574.6424242424242, "logps/rejected": -794.2193548387097, "loss": 0.4648, "rewards/chosen": -2.3223484848484848, "rewards/margins": 2.761925708699902, "rewards/rejected": -5.084274193548387, "step": 1170 }, { "epoch": 0.6176393614237111, "grad_norm": 0.7771283437694375, "kl": 0.0, "learning_rate": 4.831018723740969e-05, "logits/chosen": -1246337408.0, "logits/rejected": -1175453696.0, "logps/chosen": -609.6738461538462, "logps/rejected": -783.847619047619, "loss": 0.456, "rewards/chosen": -2.5955769230769232, "rewards/margins": 2.1702960927960926, "rewards/rejected": -4.765873015873016, "step": 1180 }, { "epoch": 0.6228735933001832, "grad_norm": 0.6937936950072409, "kl": 0.0, "learning_rate": 4.825471340051228e-05, "logits/chosen": -1167065088.0, "logits/rejected": -1241304320.0, "logps/chosen": -640.4155844155844, "logps/rejected": -821.5903614457832, "loss": 0.4504, "rewards/chosen": -2.888392857142857, "rewards/margins": 2.294589070567986, "rewards/rejected": -5.182981927710843, "step": 1190 }, { "epoch": 0.6281078251766553, "grad_norm": 0.550959849863717, "kl": 0.0, "learning_rate": 4.8198376555000134e-05, "logits/chosen": -1269615872.0, "logits/rejected": -1262485504.0, "logps/chosen": -579.5555555555555, "logps/rejected": -637.2658227848101, "loss": 0.4619, "rewards/chosen": -2.6766975308641974, "rewards/margins": 0.9190303172370684, "rewards/rejected": -3.5957278481012658, "step": 1200 }, { "epoch": 0.6281078251766553, "eval_kl": 0.0, "eval_logits/chosen": -2887944704.0, "eval_logits/rejected": -2875961088.0, "eval_logps/chosen": -570.1098466105888, "eval_logps/rejected": -742.3172550969667, "eval_loss": 0.45485547184944153, "eval_rewards/chosen": -2.3843394359228105, "eval_rewards/margins": 2.0890568843158768, "eval_rewards/rejected": -4.473396320238687, "eval_runtime": 93.4311, "eval_samples_per_second": 42.812, "eval_steps_per_second": 0.674, "step": 1200 }, { "epoch": 0.6333420570531274, "grad_norm": 0.4957479086059728, "kl": 0.0, "learning_rate": 4.8141178791597086e-05, "logits/chosen": -1334417792.0, "logits/rejected": -1312607488.0, "logps/chosen": -544.2006269592476, "logps/rejected": -696.1246105919004, "loss": 0.4547, "rewards/chosen": -2.127742946708464, "rewards/margins": 1.9672726296155236, "rewards/rejected": -4.095015576323988, "step": 1210 }, { "epoch": 0.6385762889295996, "grad_norm": 0.5599659559777491, "kl": 0.0, "learning_rate": 4.8083122232976555e-05, "logits/chosen": -1310720000.0, "logits/rejected": -1296878848.0, "logps/chosen": -533.7358490566038, "logps/rejected": -600.1490683229814, "loss": 0.4536, "rewards/chosen": -2.0184748427672954, "rewards/margins": 1.1729071448103445, "rewards/rejected": -3.19138198757764, "step": 1220 }, { "epoch": 0.6438105208060717, "grad_norm": 0.4205980725915211, "kl": 0.0, "learning_rate": 4.802420903368285e-05, "logits/chosen": -1353921280.0, "logits/rejected": -1259339776.0, "logps/chosen": -521.015479876161, "logps/rejected": -639.7981072555204, "loss": 0.4561, "rewards/chosen": -1.7972136222910218, "rewards/margins": 1.373527702630114, "rewards/rejected": -3.170741324921136, "step": 1230 }, { "epoch": 0.6490447526825438, "grad_norm": 0.938997171512235, "kl": 0.0, "learning_rate": 4.7964441380051184e-05, "logits/chosen": -1280311296.0, "logits/rejected": -1343854976.0, "logps/chosen": -537.3935483870968, "logps/rejected": -607.4181818181818, "loss": 0.4526, "rewards/chosen": -1.9568548387096774, "rewards/margins": 1.3946603128054742, "rewards/rejected": -3.3515151515151516, "step": 1240 }, { "epoch": 0.654278984559016, "grad_norm": 0.471438924793617, "kl": 0.0, "learning_rate": 4.790382149012651e-05, "logits/chosen": -1317640576.0, "logits/rejected": -1276850944.0, "logps/chosen": -549.7, "logps/rejected": -682.3, "loss": 0.4476, "rewards/chosen": -2.1296875, "rewards/margins": 1.491796875, "rewards/rejected": -3.621484375, "step": 1250 }, { "epoch": 0.6595132164354881, "grad_norm": 0.5189481411209818, "kl": 0.0, "learning_rate": 4.7842351613581235e-05, "logits/chosen": -1350356224.0, "logits/rejected": -1310510336.0, "logps/chosen": -529.65, "logps/rejected": -629.9, "loss": 0.4479, "rewards/chosen": -1.728515625, "rewards/margins": 1.648046875, "rewards/rejected": -3.3765625, "step": 1260 }, { "epoch": 0.6647474483119602, "grad_norm": 0.3747743160718501, "kl": 0.0, "learning_rate": 4.778003403163175e-05, "logits/chosen": -1436339456.0, "logits/rejected": -1278004480.0, "logps/chosen": -494.65060240963857, "logps/rejected": -561.3506493506494, "loss": 0.4579, "rewards/chosen": -1.4790097891566265, "rewards/margins": 1.2811363147394774, "rewards/rejected": -2.760146103896104, "step": 1270 }, { "epoch": 0.6699816801884323, "grad_norm": 0.6112560725436108, "kl": 0.0, "learning_rate": 4.771687105695373e-05, "logits/chosen": -1431935360.0, "logits/rejected": -1293103872.0, "logps/chosen": -485.3172205438066, "logps/rejected": -602.0970873786408, "loss": 0.4595, "rewards/chosen": -1.2869146525679758, "rewards/margins": 1.8643798458139014, "rewards/rejected": -3.151294498381877, "step": 1280 }, { "epoch": 0.6752159120649045, "grad_norm": 0.8780083757352779, "kl": 0.0, "learning_rate": 4.765286503359632e-05, "logits/chosen": -1350985344.0, "logits/rejected": -1378038528.0, "logps/chosen": -540.5239616613419, "logps/rejected": -729.4434250764526, "loss": 0.4465, "rewards/chosen": -2.110323482428115, "rewards/margins": 2.0257621444832004, "rewards/rejected": -4.136085626911315, "step": 1290 }, { "epoch": 0.6804501439413766, "grad_norm": 0.6875786082085655, "kl": 0.0, "learning_rate": 4.758801833689516e-05, "logits/chosen": -1400478080.0, "logits/rejected": -1204813824.0, "logps/chosen": -765.8795180722891, "logps/rejected": -994.7012987012987, "loss": 0.4627, "rewards/chosen": -4.251129518072289, "rewards/margins": 2.9923769754342047, "rewards/rejected": -7.2435064935064934, "step": 1300 }, { "epoch": 0.6804501439413766, "eval_kl": 0.0, "eval_logits/chosen": -2942404096.0, "eval_logits/rejected": -2700332800.0, "eval_logps/chosen": -705.3933696190005, "eval_logps/rejected": -1037.3664843361512, "eval_loss": 0.4564609229564667, "eval_rewards/chosen": -3.737877288471054, "eval_rewards/margins": 3.6828089372872754, "eval_rewards/rejected": -7.420686225758329, "eval_runtime": 93.4369, "eval_samples_per_second": 42.81, "eval_steps_per_second": 0.674, "step": 1300 }, { "epoch": 0.6856843758178487, "grad_norm": 0.7487850272715161, "kl": 0.0, "learning_rate": 4.752233337338423e-05, "logits/chosen": -1514772864.0, "logits/rejected": -1181116032.0, "logps/chosen": -660.6840579710145, "logps/rejected": -1124.3389830508474, "loss": 0.4639, "rewards/chosen": -3.3021739130434784, "rewards/margins": 4.929182019159912, "rewards/rejected": -8.23135593220339, "step": 1310 }, { "epoch": 0.6909186076943209, "grad_norm": 1.3004359512746912, "kl": 0.0, "learning_rate": 4.745581258070654e-05, "logits/chosen": -1363777920.0, "logits/rejected": -1418094208.0, "logps/chosen": -704.4142394822006, "logps/rejected": -936.02416918429, "loss": 0.4353, "rewards/chosen": -3.8309061488673137, "rewards/margins": 2.5610878088366134, "rewards/rejected": -6.391993957703927, "step": 1320 }, { "epoch": 0.696152839570793, "grad_norm": 0.9380483957803694, "kl": 0.0, "learning_rate": 4.738845842752364e-05, "logits/chosen": -1527146112.0, "logits/rejected": -1395235200.0, "logps/chosen": -655.9024390243902, "logps/rejected": -928.4102564102565, "loss": 0.4589, "rewards/chosen": -3.3266006097560976, "rewards/margins": 2.894553236397748, "rewards/rejected": -6.221153846153846, "step": 1330 }, { "epoch": 0.7013870714472651, "grad_norm": 0.6394650193087535, "kl": 0.0, "learning_rate": 4.732027341342405e-05, "logits/chosen": -1620259584.0, "logits/rejected": -1330852608.0, "logps/chosen": -535.6521739130435, "logps/rejected": -647.0508474576271, "loss": 0.4685, "rewards/chosen": -2.1190217391304347, "rewards/margins": 1.1597918201915993, "rewards/rejected": -3.278813559322034, "step": 1340 }, { "epoch": 0.7066213033237373, "grad_norm": 0.5927846775262273, "kl": 0.0, "learning_rate": 4.725126006883046e-05, "logits/chosen": -1479121280.0, "logits/rejected": -1497366528.0, "logps/chosen": -567.9746031746032, "logps/rejected": -689.8215384615385, "loss": 0.4429, "rewards/chosen": -2.175, "rewards/margins": 1.5423076923076926, "rewards/rejected": -3.7173076923076924, "step": 1350 }, { "epoch": 0.7118555352002094, "grad_norm": 0.4712554320074594, "kl": 0.0, "learning_rate": 4.718142095490584e-05, "logits/chosen": -1507013376.0, "logits/rejected": -1583978880.0, "logps/chosen": -856.7272727272727, "logps/rejected": -1207.0361445783133, "loss": 0.4461, "rewards/chosen": -5.242288961038961, "rewards/margins": 3.8601206775152566, "rewards/rejected": -9.102409638554217, "step": 1360 }, { "epoch": 0.7170897670766815, "grad_norm": 0.8329581163405648, "kl": 0.0, "learning_rate": 4.711075866345841e-05, "logits/chosen": -1503448320.0, "logits/rejected": -1478911616.0, "logps/chosen": -964.5419354838709, "logps/rejected": -1271.2727272727273, "loss": 0.4472, "rewards/chosen": -6.430645161290323, "rewards/margins": 3.249657869012707, "rewards/rejected": -9.68030303030303, "step": 1370 }, { "epoch": 0.7223239989531536, "grad_norm": 1.012785333416532, "kl": 0.0, "learning_rate": 4.70392758168454e-05, "logits/chosen": -1536373504.0, "logits/rejected": -1538051328.0, "logps/chosen": -830.8645161290323, "logps/rejected": -1044.9454545454546, "loss": 0.4487, "rewards/chosen": -4.7915322580645165, "rewards/margins": 2.7804374389051807, "rewards/rejected": -7.571969696969697, "step": 1380 }, { "epoch": 0.7275582308296258, "grad_norm": 0.4696033985185279, "kl": 0.0, "learning_rate": 4.696697506787579e-05, "logits/chosen": -1367343104.0, "logits/rejected": -1455423488.0, "logps/chosen": -776.0, "logps/rejected": -999.6190476190476, "loss": 0.4437, "rewards/chosen": -4.519736842105263, "rewards/margins": 2.3976738721804516, "rewards/rejected": -6.917410714285714, "step": 1390 }, { "epoch": 0.7327924627060979, "grad_norm": 0.94786322002552, "kl": 0.0, "learning_rate": 4.689385909971184e-05, "logits/chosen": -1476185344.0, "logits/rejected": -1244869376.0, "logps/chosen": -808.3809523809524, "logps/rejected": -1248.7368421052631, "loss": 0.4622, "rewards/chosen": -4.801153273809524, "rewards/margins": 4.798764489348372, "rewards/rejected": -9.599917763157896, "step": 1400 }, { "epoch": 0.7327924627060979, "eval_kl": 0.0, "eval_logits/chosen": -3010711296.0, "eval_logits/rejected": -2812979968.0, "eval_logps/chosen": -783.2637308263236, "eval_logps/rejected": -1089.1775236200895, "eval_loss": 0.4619843661785126, "eval_rewards/chosen": -4.518060366155368, "eval_rewards/margins": 3.4185383409555214, "eval_rewards/rejected": -7.93659870711089, "eval_runtime": 93.4326, "eval_samples_per_second": 42.812, "eval_steps_per_second": 0.674, "step": 1400 }, { "epoch": 0.73802669458257, "grad_norm": 0.6652634253453028, "kl": 0.0, "learning_rate": 4.68199306257695e-05, "logits/chosen": -1382023168.0, "logits/rejected": -1249483136.0, "logps/chosen": -1017.8723404255319, "logps/rejected": -1256.7459807073956, "loss": 0.471, "rewards/chosen": -6.800151975683891, "rewards/margins": 2.8622274455379735, "rewards/rejected": -9.662379421221864, "step": 1410 }, { "epoch": 0.7432609264590422, "grad_norm": 0.6086427236404571, "kl": 0.0, "learning_rate": 4.674519238961773e-05, "logits/chosen": -1209217792.0, "logits/rejected": -1091777280.0, "logps/chosen": -781.2515337423313, "logps/rejected": -999.9490445859873, "loss": 0.461, "rewards/chosen": -4.742523006134969, "rewards/margins": 2.4684642550115274, "rewards/rejected": -7.210987261146497, "step": 1420 }, { "epoch": 0.7484951583355143, "grad_norm": 0.587128760826668, "kl": 0.0, "learning_rate": 4.66696471648767e-05, "logits/chosen": -1229140736.0, "logits/rejected": -1149658752.0, "logps/chosen": -623.1898734177215, "logps/rejected": -915.358024691358, "loss": 0.4482, "rewards/chosen": -3.1242088607594938, "rewards/margins": 2.8715473120800126, "rewards/rejected": -5.995756172839506, "step": 1430 }, { "epoch": 0.7537293902119864, "grad_norm": 0.5455085313383593, "kl": 0.0, "learning_rate": 4.659329775511478e-05, "logits/chosen": -1181954816.0, "logits/rejected": -1167484544.0, "logps/chosen": -647.3375796178344, "logps/rejected": -891.680981595092, "loss": 0.4553, "rewards/chosen": -2.915207006369427, "rewards/margins": 2.9260506623422295, "rewards/rejected": -5.841257668711656, "step": 1440 }, { "epoch": 0.7589636220884585, "grad_norm": 0.6399322898967481, "kl": 0.0, "learning_rate": 4.651614699374461e-05, "logits/chosen": -1192860032.0, "logits/rejected": -1133930112.0, "logps/chosen": -622.7924528301887, "logps/rejected": -1006.3105590062112, "loss": 0.4463, "rewards/chosen": -2.8330385220125787, "rewards/margins": 4.218203714012265, "rewards/rejected": -7.051242236024844, "step": 1450 }, { "epoch": 0.7641978539649307, "grad_norm": 0.5781761467701665, "kl": 0.0, "learning_rate": 4.643819774391786e-05, "logits/chosen": -1172727424.0, "logits/rejected": -1157627904.0, "logps/chosen": -750.91961414791, "logps/rejected": -904.7537993920972, "loss": 0.4565, "rewards/chosen": -4.142282958199357, "rewards/margins": 1.9367443974237437, "rewards/rejected": -6.079027355623101, "step": 1460 }, { "epoch": 0.7694320858414028, "grad_norm": 0.8073221865761616, "kl": 0.0, "learning_rate": 4.635945289841902e-05, "logits/chosen": -1248224896.0, "logits/rejected": -1091777280.0, "logps/chosen": -671.1246200607902, "logps/rejected": -1179.1639871382636, "loss": 0.458, "rewards/chosen": -3.5155775075987843, "rewards/margins": 5.603393553494463, "rewards/rejected": -9.118971061093248, "step": 1470 }, { "epoch": 0.7746663177178749, "grad_norm": 0.731784234486593, "kl": 0.0, "learning_rate": 4.6279915379558017e-05, "logits/chosen": -1293523328.0, "logits/rejected": -1241094528.0, "logps/chosen": -677.12, "logps/rejected": -918.7555555555556, "loss": 0.4593, "rewards/chosen": -3.539615384615385, "rewards/margins": 2.8984798534798535, "rewards/rejected": -6.438095238095238, "step": 1480 }, { "epoch": 0.7799005495943471, "grad_norm": 0.8269303401098701, "kl": 0.0, "learning_rate": 4.61995881390618e-05, "logits/chosen": -1258920320.0, "logits/rejected": -1162241664.0, "logps/chosen": -703.2074303405573, "logps/rejected": -881.8675078864353, "loss": 0.4587, "rewards/chosen": -3.490905572755418, "rewards/margins": 2.262249001377074, "rewards/rejected": -5.753154574132492, "step": 1490 }, { "epoch": 0.7851347814708192, "grad_norm": 0.5998559400896546, "kl": 0.0, "learning_rate": 4.6118474157964765e-05, "logits/chosen": -1203136128.0, "logits/rejected": -1172937088.0, "logps/chosen": -643.9245283018868, "logps/rejected": -819.4782608695652, "loss": 0.4571, "rewards/chosen": -3.2344732704402515, "rewards/margins": 2.1319863568889414, "rewards/rejected": -5.366459627329193, "step": 1500 }, { "epoch": 0.7851347814708192, "eval_kl": 0.0, "eval_logits/chosen": -2759985152.0, "eval_logits/rejected": -2638283776.0, "eval_logps/chosen": -567.8931222167244, "eval_logps/rejected": -773.6330183988066, "eval_loss": 0.45720311999320984, "eval_rewards/chosen": -2.362196932211776, "eval_rewards/margins": 2.422735937007518, "eval_rewards/rejected": -4.784932869219294, "eval_runtime": 93.4354, "eval_samples_per_second": 42.81, "eval_steps_per_second": 0.674, "step": 1500 }, { "epoch": 0.7903690133472913, "grad_norm": 0.5520818527747415, "kl": 0.0, "learning_rate": 4.6036576446498124e-05, "logits/chosen": -1313026816.0, "logits/rejected": -1107086592.0, "logps/chosen": -480.0, "logps/rejected": -601.027027027027, "loss": 0.4786, "rewards/chosen": -1.6594295058139534, "rewards/margins": 1.3794218455373979, "rewards/rejected": -3.0388513513513513, "step": 1510 }, { "epoch": 0.7956032452237635, "grad_norm": 0.6769152223374779, "kl": 0.0, "learning_rate": 4.5953898043978244e-05, "logits/chosen": -1228511616.0, "logits/rejected": -1322254336.0, "logps/chosen": -459.7922077922078, "logps/rejected": -568.9638554216867, "loss": 0.4482, "rewards/chosen": -1.322646103896104, "rewards/margins": 1.6125948599593176, "rewards/rejected": -2.9352409638554215, "step": 1520 }, { "epoch": 0.8008374771002356, "grad_norm": 0.7170707161941443, "kl": 0.0, "learning_rate": 4.5870442018693775e-05, "logits/chosen": -1229350528.0, "logits/rejected": -1242143104.0, "logps/chosen": -616.128617363344, "logps/rejected": -772.6686930091186, "loss": 0.4464, "rewards/chosen": -2.6310289389067525, "rewards/margins": 2.3317370185400557, "rewards/rejected": -4.962765957446808, "step": 1530 }, { "epoch": 0.8060717089767077, "grad_norm": 0.4908931386250085, "kl": 0.0, "learning_rate": 4.578621146779186e-05, "logits/chosen": -1167904000.0, "logits/rejected": -1179018880.0, "logps/chosen": -712.6514657980456, "logps/rejected": -903.1111111111111, "loss": 0.442, "rewards/chosen": -3.5916123778501627, "rewards/margins": 2.4309101446723598, "rewards/rejected": -6.0225225225225225, "step": 1540 }, { "epoch": 0.8113059408531798, "grad_norm": 0.7969496873913599, "kl": 0.0, "learning_rate": 4.570120951716312e-05, "logits/chosen": -1113587712.0, "logits/rejected": -1260178688.0, "logps/chosen": -554.1168384879725, "logps/rejected": -655.8624641833811, "loss": 0.427, "rewards/chosen": -2.2758805841924397, "rewards/margins": 1.3129446307072739, "rewards/rejected": -3.5888252148997135, "step": 1550 }, { "epoch": 0.816540172729652, "grad_norm": 0.7026514692155559, "kl": 0.0, "learning_rate": 4.561543932132574e-05, "logits/chosen": -1231028224.0, "logits/rejected": -1186988032.0, "logps/chosen": -514.0125786163522, "logps/rejected": -615.4534161490683, "loss": 0.4641, "rewards/chosen": -1.8867924528301887, "rewards/margins": 1.039449783194656, "rewards/rejected": -2.926242236024845, "step": 1560 }, { "epoch": 0.821774404606124, "grad_norm": 0.6689197008305775, "kl": 0.0, "learning_rate": 4.5528904063308294e-05, "logits/chosen": -1195586304.0, "logits/rejected": -1132042624.0, "logps/chosen": -562.7414330218069, "logps/rejected": -632.0752351097178, "loss": 0.4642, "rewards/chosen": -2.2988707165109035, "rewards/margins": 1.3555179982226386, "rewards/rejected": -3.654388714733542, "step": 1570 }, { "epoch": 0.8270086364825961, "grad_norm": 0.48187712881639605, "kl": 0.0, "learning_rate": 4.544160695453173e-05, "logits/chosen": -1171259392.0, "logits/rejected": -1162031872.0, "logps/chosen": -612.1423948220065, "logps/rejected": -730.2960725075528, "loss": 0.4444, "rewards/chosen": -2.8717637540453076, "rewards/margins": 1.6622241613625475, "rewards/rejected": -4.533987915407855, "step": 1580 }, { "epoch": 0.8322428683590684, "grad_norm": 0.49689165831090876, "kl": 0.0, "learning_rate": 4.535355123469009e-05, "logits/chosen": -1364616832.0, "logits/rejected": -1105094272.0, "logps/chosen": -522.4811594202898, "logps/rejected": -693.477966101695, "loss": 0.4636, "rewards/chosen": -2.0105072463768114, "rewards/margins": 2.0017808892164086, "rewards/rejected": -4.01228813559322, "step": 1590 }, { "epoch": 0.8374771002355405, "grad_norm": 0.46546847894467513, "kl": 0.0, "learning_rate": 4.5264740171630346e-05, "logits/chosen": -1284505600.0, "logits/rejected": -1162661120.0, "logps/chosen": -537.3293051359517, "logps/rejected": -721.1909385113269, "loss": 0.4714, "rewards/chosen": -2.1334969788519635, "rewards/margins": 2.135515966131855, "rewards/rejected": -4.269012944983818, "step": 1600 }, { "epoch": 0.8374771002355405, "eval_kl": 0.0, "eval_logits/chosen": -2749732352.0, "eval_logits/rejected": -2697137152.0, "eval_logps/chosen": -526.0603661553687, "eval_logps/rejected": -662.9457981103928, "eval_loss": 0.4618281126022339, "eval_rewards/chosen": -1.9415821375556654, "eval_rewards/margins": 1.7369360126183775, "eval_rewards/rejected": -3.678518150174043, "eval_runtime": 93.4309, "eval_samples_per_second": 42.812, "eval_steps_per_second": 0.674, "step": 1600 }, { "epoch": 0.8427113321120125, "grad_norm": 0.6877692391512474, "kl": 0.0, "learning_rate": 4.5175177061231135e-05, "logits/chosen": -1227463040.0, "logits/rejected": -1216348160.0, "logps/chosen": -482.126582278481, "logps/rejected": -651.0617283950618, "loss": 0.4595, "rewards/chosen": -1.8253560126582278, "rewards/margins": 1.6723291725269573, "rewards/rejected": -3.497685185185185, "step": 1610 }, { "epoch": 0.8479455639884846, "grad_norm": 1.0779043545356655, "kl": 0.0, "learning_rate": 4.508486522728037e-05, "logits/chosen": -1146513024.0, "logits/rejected": -1204813824.0, "logps/chosen": -617.6410256410256, "logps/rejected": -724.0, "loss": 0.4601, "rewards/chosen": -3.0234375, "rewards/margins": 1.2730564024390247, "rewards/rejected": -4.296493902439025, "step": 1620 }, { "epoch": 0.8531797958649568, "grad_norm": 1.8732198687823562, "kl": 0.0, "learning_rate": 4.499380802135197e-05, "logits/chosen": -1244240256.0, "logits/rejected": -1163919360.0, "logps/chosen": -569.6, "logps/rejected": -862.3, "loss": 0.4386, "rewards/chosen": -2.205078125, "rewards/margins": 3.373046875, "rewards/rejected": -5.578125, "step": 1630 }, { "epoch": 0.8584140277414289, "grad_norm": 0.6010892356137254, "kl": 0.0, "learning_rate": 4.490200882268142e-05, "logits/chosen": -1278423808.0, "logits/rejected": -1088421888.0, "logps/chosen": -517.7831325301205, "logps/rejected": -721.8701298701299, "loss": 0.4642, "rewards/chosen": -2.1364834337349397, "rewards/margins": 2.0254483844468787, "rewards/rejected": -4.161931818181818, "step": 1640 }, { "epoch": 0.863648259617901, "grad_norm": 0.6195805031595781, "kl": 0.0, "learning_rate": 4.480947103804044e-05, "logits/chosen": -1189504640.0, "logits/rejected": -1145464448.0, "logps/chosen": -610.0934579439253, "logps/rejected": -731.8871473354232, "loss": 0.4686, "rewards/chosen": -2.809968847352025, "rewards/margins": 1.5720844441840254, "rewards/rejected": -4.38205329153605, "step": 1650 }, { "epoch": 0.8688824914943732, "grad_norm": 0.6162524547197834, "kl": 0.0, "learning_rate": 4.471619810161046e-05, "logits/chosen": -1099956224.0, "logits/rejected": -1118620928.0, "logps/chosen": -567.7152103559871, "logps/rejected": -803.5770392749245, "loss": 0.4349, "rewards/chosen": -2.560881877022654, "rewards/margins": 2.4428945580226635, "rewards/rejected": -5.003776435045317, "step": 1660 }, { "epoch": 0.8741167233708453, "grad_norm": 0.8094087004559215, "kl": 0.0, "learning_rate": 4.462219347485523e-05, "logits/chosen": -1235851648.0, "logits/rejected": -1014182720.0, "logps/chosen": -623.0588235294117, "logps/rejected": -716.3733333333333, "loss": 0.4825, "rewards/chosen": -2.572794117647059, "rewards/margins": 1.730539215686275, "rewards/rejected": -4.303333333333334, "step": 1670 }, { "epoch": 0.8793509552473174, "grad_norm": 2.202531952092164, "kl": 0.0, "learning_rate": 4.452746064639239e-05, "logits/chosen": -1071434944.0, "logits/rejected": -1215719040.0, "logps/chosen": -562.1917808219179, "logps/rejected": -664.183908045977, "loss": 0.4356, "rewards/chosen": -2.2741866438356166, "rewards/margins": 1.2714311722563374, "rewards/rejected": -3.545617816091954, "step": 1680 }, { "epoch": 0.8845851871237895, "grad_norm": 0.7546139807682359, "kl": 0.0, "learning_rate": 4.4432003131863906e-05, "logits/chosen": -1045220544.0, "logits/rejected": -876609536.0, "logps/chosen": -607.4146341463414, "logps/rejected": -712.1025641025641, "loss": 0.4601, "rewards/chosen": -2.603849085365854, "rewards/margins": 1.4995162992495312, "rewards/rejected": -4.103365384615385, "step": 1690 }, { "epoch": 0.8898194190002617, "grad_norm": 0.5459929181429486, "kl": 0.0, "learning_rate": 4.4335824473805716e-05, "logits/chosen": -781503680.0, "logits/rejected": -809920128.0, "logps/chosen": -588.6344827586207, "logps/rejected": -712.5942857142857, "loss": 0.4266, "rewards/chosen": -2.5760775862068965, "rewards/margins": 1.5317795566502466, "rewards/rejected": -4.107857142857143, "step": 1700 }, { "epoch": 0.8898194190002617, "eval_kl": 0.0, "eval_logits/chosen": -1904080896.0, "eval_logits/rejected": -1675191680.0, "eval_logps/chosen": -554.2761009401287, "eval_logps/rejected": -683.2819492789657, "eval_loss": 0.45478126406669617, "eval_rewards/chosen": -2.223899059871351, "eval_rewards/margins": 1.6572545950267097, "eval_rewards/rejected": -3.8811536548980605, "eval_runtime": 93.4412, "eval_samples_per_second": 42.808, "eval_steps_per_second": 0.674, "step": 1700 }, { "epoch": 0.8950536508767338, "grad_norm": 0.4395457724253016, "kl": 0.0, "learning_rate": 4.423892824151616e-05, "logits/chosen": -925682880.0, "logits/rejected": -894225600.0, "logps/chosen": -554.5, "logps/rejected": -700.1, "loss": 0.4603, "rewards/chosen": -2.29921875, "rewards/margins": 1.616796875, "rewards/rejected": -3.916015625, "step": 1710 }, { "epoch": 0.9002878827532059, "grad_norm": 0.8329962927416344, "kl": 0.0, "learning_rate": 4.414131803092362e-05, "logits/chosen": -1054448000.0, "logits/rejected": -922327424.0, "logps/chosen": -504.95412844036696, "logps/rejected": -647.4632587859425, "loss": 0.4552, "rewards/chosen": -2.029625382262997, "rewards/margins": 1.48335385096384, "rewards/rejected": -3.512979233226837, "step": 1720 }, { "epoch": 0.9055221146296781, "grad_norm": 0.6160573144972731, "kl": 0.0, "learning_rate": 4.404299746445295e-05, "logits/chosen": -1025717056.0, "logits/rejected": -967521088.0, "logps/chosen": -504.22429906542055, "logps/rejected": -682.4326018808778, "loss": 0.4591, "rewards/chosen": -1.7044392523364487, "rewards/margins": 2.07220651568863, "rewards/rejected": -3.7766457680250785, "step": 1730 }, { "epoch": 0.9107563465061502, "grad_norm": 0.6060710943985353, "kl": 0.0, "learning_rate": 4.394397019089116e-05, "logits/chosen": -1029701632.0, "logits/rejected": -1022781056.0, "logps/chosen": -575.1715210355987, "logps/rejected": -696.1691842900302, "loss": 0.4463, "rewards/chosen": -2.292677993527508, "rewards/margins": 1.4610984415178092, "rewards/rejected": -3.7537764350453173, "step": 1740 }, { "epoch": 0.9159905783826223, "grad_norm": 0.8160183432487458, "kl": 0.0, "learning_rate": 4.384423988525196e-05, "logits/chosen": -1208798464.0, "logits/rejected": -997405504.0, "logps/chosen": -692.7283582089552, "logps/rejected": -909.0098360655737, "loss": 0.463, "rewards/chosen": -3.575373134328358, "rewards/margins": 2.482823586983117, "rewards/rejected": -6.058196721311475, "step": 1750 }, { "epoch": 0.9212248102590945, "grad_norm": 0.5247996958339426, "kl": 0.0, "learning_rate": 4.3743810248639325e-05, "logits/chosen": -1090938496.0, "logits/rejected": -1167904000.0, "logps/chosen": -586.989898989899, "logps/rejected": -848.4198250728863, "loss": 0.434, "rewards/chosen": -2.617003367003367, "rewards/margins": 3.081247361859606, "rewards/rejected": -5.698250728862973, "step": 1760 }, { "epoch": 0.9264590421355666, "grad_norm": 0.4876127564448978, "kl": 0.0, "learning_rate": 4.364268500811025e-05, "logits/chosen": -1071225216.0, "logits/rejected": -1151126784.0, "logps/chosen": -561.6623376623377, "logps/rejected": -721.4457831325301, "loss": 0.4567, "rewards/chosen": -2.36911525974026, "rewards/margins": 1.8496347402597402, "rewards/rejected": -4.21875, "step": 1770 }, { "epoch": 0.9316932740120387, "grad_norm": 0.4259995214022993, "kl": 0.0, "learning_rate": 4.354086791653633e-05, "logits/chosen": -1195166976.0, "logits/rejected": -1017118720.0, "logps/chosen": -564.1305637982196, "logps/rejected": -664.7128712871287, "loss": 0.4818, "rewards/chosen": -2.4063427299703264, "rewards/margins": 1.1992678310857792, "rewards/rejected": -3.6056105610561056, "step": 1780 }, { "epoch": 0.9369275058885108, "grad_norm": 0.8417012837062164, "kl": 0.0, "learning_rate": 4.343836275246455e-05, "logits/chosen": -1106876800.0, "logits/rejected": -1180486912.0, "logps/chosen": -571.7979797979798, "logps/rejected": -697.8425655976677, "loss": 0.4433, "rewards/chosen": -2.2925084175084174, "rewards/margins": 1.5478705912379382, "rewards/rejected": -3.8403790087463556, "step": 1790 }, { "epoch": 0.942161737764983, "grad_norm": 0.48599694772283725, "kl": 0.0, "learning_rate": 4.333517331997704e-05, "logits/chosen": -1041865088.0, "logits/rejected": -1005584384.0, "logps/chosen": -572.2838709677419, "logps/rejected": -771.2, "loss": 0.449, "rewards/chosen": -2.52258064516129, "rewards/margins": 1.9986314760508312, "rewards/rejected": -4.5212121212121215, "step": 1800 }, { "epoch": 0.942161737764983, "eval_kl": 0.0, "eval_logits/chosen": -2063198080.0, "eval_logits/rejected": -1781980032.0, "eval_logps/chosen": -630.7530925284512, "eval_logps/rejected": -780.4753853804077, "eval_loss": 0.4611523449420929, "eval_rewards/chosen": -2.9898565066798612, "eval_rewards/margins": 1.863947570893485, "eval_rewards/rejected": -4.853804077573346, "eval_runtime": 93.4415, "eval_samples_per_second": 42.808, "eval_steps_per_second": 0.674, "step": 1800 }, { "epoch": 0.9473959696414551, "grad_norm": 2.6266758242101504, "kl": 0.0, "learning_rate": 4.3231303448549904e-05, "logits/chosen": -917084544.0, "logits/rejected": -829423616.0, "logps/chosen": -644.2264150943396, "logps/rejected": -748.223602484472, "loss": 0.463, "rewards/chosen": -3.0727201257861636, "rewards/margins": 1.3698264580647677, "rewards/rejected": -4.442546583850931, "step": 1810 }, { "epoch": 0.9526302015179272, "grad_norm": 0.4042034424904704, "kl": 0.0, "learning_rate": 4.312675699291109e-05, "logits/chosen": -1117782016.0, "logits/rejected": -859937152.0, "logps/chosen": -590.8502994011976, "logps/rejected": -735.5816993464052, "loss": 0.4671, "rewards/chosen": -2.502994011976048, "rewards/margins": 1.8932478180893115, "rewards/rejected": -4.396241830065359, "step": 1820 }, { "epoch": 0.9578644333943994, "grad_norm": 0.6023946268417015, "kl": 0.0, "learning_rate": 4.3021537832897366e-05, "logits/chosen": -1201458432.0, "logits/rejected": -1019006144.0, "logps/chosen": -544.780487804878, "logps/rejected": -787.1794871794872, "loss": 0.4584, "rewards/chosen": -2.018292682926829, "rewards/margins": 2.7765791119449656, "rewards/rejected": -4.794871794871795, "step": 1830 }, { "epoch": 0.9630986652708715, "grad_norm": 1.0304378455020775, "kl": 0.0, "learning_rate": 4.2915649873310295e-05, "logits/chosen": -1215928704.0, "logits/rejected": -1159305600.0, "logps/chosen": -534.9873417721519, "logps/rejected": -651.4567901234568, "loss": 0.4617, "rewards/chosen": -1.8275316455696202, "rewards/margins": 1.957190576652602, "rewards/rejected": -3.7847222222222223, "step": 1840 }, { "epoch": 0.9683328971473436, "grad_norm": 1.0604209553629633, "kl": 0.0, "learning_rate": 4.2809097043771364e-05, "logits/chosen": -1071644672.0, "logits/rejected": -1154482176.0, "logps/chosen": -656.6864686468647, "logps/rejected": -724.2255192878338, "loss": 0.4595, "rewards/chosen": -3.446575907590759, "rewards/margins": 0.8627712734181427, "rewards/rejected": -4.309347181008902, "step": 1850 }, { "epoch": 0.9735671290238157, "grad_norm": 0.4643605329633105, "kl": 0.0, "learning_rate": 4.270188329857613e-05, "logits/chosen": -1131203840.0, "logits/rejected": -1029491904.0, "logps/chosen": -617.6507936507936, "logps/rejected": -847.1630769230769, "loss": 0.446, "rewards/chosen": -2.8615079365079366, "rewards/margins": 2.5123382173382174, "rewards/rejected": -5.373846153846154, "step": 1860 }, { "epoch": 0.9788013609002879, "grad_norm": 0.5357454435977848, "kl": 0.0, "learning_rate": 4.259401261654746e-05, "logits/chosen": -1183632640.0, "logits/rejected": -924424576.0, "logps/chosen": -689.6457142857142, "logps/rejected": -862.4551724137931, "loss": 0.4981, "rewards/chosen": -3.9153571428571428, "rewards/margins": 1.6824876847290642, "rewards/rejected": -5.597844827586207, "step": 1870 }, { "epoch": 0.98403559277676, "grad_norm": 0.6560194750573574, "kl": 0.0, "learning_rate": 4.248548900088793e-05, "logits/chosen": -1229979648.0, "logits/rejected": -977901952.0, "logps/chosen": -517.6637168141593, "logps/rejected": -625.5415282392026, "loss": 0.4687, "rewards/chosen": -1.6395648967551621, "rewards/margins": 1.7109334421152695, "rewards/rejected": -3.3504983388704317, "step": 1880 }, { "epoch": 0.9892698246532321, "grad_norm": 0.6967632982234868, "kl": 0.0, "learning_rate": 4.2376316479031155e-05, "logits/chosen": -1214041344.0, "logits/rejected": -1190133760.0, "logps/chosen": -488.8488745980707, "logps/rejected": -551.8784194528876, "loss": 0.4502, "rewards/chosen": -1.5696342443729903, "rewards/margins": 1.1302137799431191, "rewards/rejected": -2.6998480243161094, "step": 1890 }, { "epoch": 0.9945040565297043, "grad_norm": 0.7039052687108839, "kl": 0.0, "learning_rate": 4.2266499102492426e-05, "logits/chosen": -1291216512.0, "logits/rejected": -957769344.0, "logps/chosen": -645.5813953488372, "logps/rejected": -762.0540540540541, "loss": 0.4773, "rewards/chosen": -2.9923691860465116, "rewards/margins": 1.6414990571967314, "rewards/rejected": -4.633868243243243, "step": 1900 }, { "epoch": 0.9945040565297043, "eval_kl": 0.0, "eval_logits/chosen": -2405932544.0, "eval_logits/rejected": -2133635840.0, "eval_logps/chosen": -658.1771400296883, "eval_logps/rejected": -792.9825957235206, "eval_loss": 0.46122264862060547, "eval_rewards/chosen": -3.266081147946561, "eval_rewards/margins": 1.7145255154050054, "eval_rewards/rejected": -4.980606663351566, "eval_runtime": 93.4298, "eval_samples_per_second": 42.813, "eval_steps_per_second": 0.674, "step": 1900 }, { "epoch": 0.9997382884061764, "grad_norm": 1.64298159620159, "kl": 0.0, "learning_rate": 4.215604094671835e-05, "logits/chosen": -1055286912.0, "logits/rejected": -1002858112.0, "logps/chosen": -664.6230529595016, "logps/rejected": -746.1316614420062, "loss": 0.4767, "rewards/chosen": -3.2990654205607477, "rewards/margins": 1.0191163976210706, "rewards/rejected": -4.318181818181818, "step": 1910 }, { "epoch": 1.0049725202826485, "grad_norm": 1.15544453899332, "kl": 0.0, "learning_rate": 4.2044946110935485e-05, "logits/chosen": -1248015104.0, "logits/rejected": -1101004800.0, "logps/chosen": -596.4171779141104, "logps/rejected": -646.624203821656, "loss": 0.466, "rewards/chosen": -2.8730828220858897, "rewards/margins": 0.8275541205892694, "rewards/rejected": -3.700636942675159, "step": 1920 }, { "epoch": 1.0102067521591207, "grad_norm": 0.7147045746313497, "kl": 0.0, "learning_rate": 4.193321871799839e-05, "logits/chosen": -1404882176.0, "logits/rejected": -1079613824.0, "logps/chosen": -597.5141242937854, "logps/rejected": -692.3636363636364, "loss": 0.4909, "rewards/chosen": -2.713276836158192, "rewards/margins": 1.0104993876180317, "rewards/rejected": -3.7237762237762237, "step": 1930 }, { "epoch": 1.0154409840355927, "grad_norm": 0.3811105029373331, "kl": 0.0, "learning_rate": 4.1820862914236495e-05, "logits/chosen": -1241723648.0, "logits/rejected": -1168113664.0, "logps/chosen": -548.6355140186915, "logps/rejected": -740.4137931034483, "loss": 0.4364, "rewards/chosen": -2.3601051401869158, "rewards/margins": 2.304863511850702, "rewards/rejected": -4.664968652037618, "step": 1940 }, { "epoch": 1.020675215912065, "grad_norm": 0.6064780465316373, "kl": 0.0, "learning_rate": 4.170788286930024e-05, "logits/chosen": -1268567296.0, "logits/rejected": -1069128064.0, "logps/chosen": -630.4864864864865, "logps/rejected": -733.185667752443, "loss": 0.468, "rewards/chosen": -2.695846053573104, "rewards/margins": 1.8546425457754299, "rewards/rejected": -4.550488599348534, "step": 1950 }, { "epoch": 1.025909447788537, "grad_norm": 0.7574936764747783, "kl": 0.0, "learning_rate": 4.159428277600641e-05, "logits/chosen": -1210895616.0, "logits/rejected": -1113378048.0, "logps/chosen": -643.8637770897833, "logps/rejected": -788.3911671924291, "loss": 0.4486, "rewards/chosen": -2.7565789473684212, "rewards/margins": 2.301386352316121, "rewards/rejected": -5.057965299684542, "step": 1960 }, { "epoch": 1.031143679665009, "grad_norm": 0.4407112262867067, "kl": 0.0, "learning_rate": 4.1480066850182456e-05, "logits/chosen": -1225785344.0, "logits/rejected": -1066401792.0, "logps/chosen": -553.033033033033, "logps/rejected": -648.9641693811075, "loss": 0.4585, "rewards/chosen": -2.1083881733295797, "rewards/margins": 1.4005694813935472, "rewards/rejected": -3.508957654723127, "step": 1970 }, { "epoch": 1.0363779115414813, "grad_norm": 0.6105630732861353, "kl": 0.0, "learning_rate": 4.1365239330510055e-05, "logits/chosen": -1200409856.0, "logits/rejected": -1117152896.0, "logps/chosen": -577.211356466877, "logps/rejected": -647.0340557275542, "loss": 0.4376, "rewards/chosen": -2.117705047318612, "rewards/margins": 1.4565983582541433, "rewards/rejected": -3.5743034055727554, "step": 1980 }, { "epoch": 1.0416121434179535, "grad_norm": 0.36902170461352074, "kl": 0.0, "learning_rate": 4.1249804478367844e-05, "logits/chosen": -1086953856.0, "logits/rejected": -988807168.0, "logps/chosen": -543.3128834355829, "logps/rejected": -669.7579617834394, "loss": 0.4531, "rewards/chosen": -2.051955521472393, "rewards/margins": 1.7020253702473522, "rewards/rejected": -3.753980891719745, "step": 1990 }, { "epoch": 1.0468463752944255, "grad_norm": 0.585975080111189, "kl": 0.0, "learning_rate": 4.113376657767324e-05, "logits/chosen": -1178599424.0, "logits/rejected": -1079823616.0, "logps/chosen": -552.2674772036474, "logps/rejected": -588.4501607717042, "loss": 0.4654, "rewards/chosen": -2.0672492401215807, "rewards/margins": 0.9745514029652358, "rewards/rejected": -3.0418006430868165, "step": 2000 }, { "epoch": 1.0468463752944255, "eval_kl": 0.0, "eval_logits/chosen": -2678495744.0, "eval_logits/rejected": -2597272832.0, "eval_logps/chosen": -528.9421078673923, "eval_logps/rejected": -616.7041272998508, "eval_loss": 0.4567851424217224, "eval_rewards/chosen": -1.968951014349332, "eval_rewards/margins": 1.2454945351285396, "eval_rewards/rejected": -3.2144455494778716, "eval_runtime": 93.4154, "eval_samples_per_second": 42.82, "eval_steps_per_second": 0.674, "step": 2000 }, { "epoch": 1.0520806071708977, "grad_norm": 0.5056456626042156, "kl": 0.0, "learning_rate": 4.101712993472348e-05, "logits/chosen": -1287231872.0, "logits/rejected": -1250531712.0, "logps/chosen": -505.0126582278481, "logps/rejected": -589.0370370370371, "loss": 0.4375, "rewards/chosen": -1.6400316455696202, "rewards/margins": 1.44908872480075, "rewards/rejected": -3.0891203703703702, "step": 2010 }, { "epoch": 1.05731483904737, "grad_norm": 0.5989903113327755, "kl": 0.0, "learning_rate": 4.089989887803579e-05, "logits/chosen": -1364407040.0, "logits/rejected": -1140011776.0, "logps/chosen": -436.22028985507245, "logps/rejected": -529.7898305084746, "loss": 0.4414, "rewards/chosen": -1.1168478260869565, "rewards/margins": 1.1954403095062638, "rewards/rejected": -2.3122881355932203, "step": 2020 }, { "epoch": 1.0625490709238419, "grad_norm": 0.6371162663574992, "kl": 0.0, "learning_rate": 4.078207775818677e-05, "logits/chosen": -1195796096.0, "logits/rejected": -1192860032.0, "logps/chosen": -466.33112582781456, "logps/rejected": -573.6331360946746, "loss": 0.4137, "rewards/chosen": -1.283319536423841, "rewards/margins": 1.705585789019946, "rewards/rejected": -2.988905325443787, "step": 2030 }, { "epoch": 1.067783302800314, "grad_norm": 0.4672351902083702, "kl": 0.0, "learning_rate": 4.066367094765091e-05, "logits/chosen": -1106247680.0, "logits/rejected": -1035993088.0, "logps/chosen": -513.1794871794872, "logps/rejected": -633.560975609756, "loss": 0.423, "rewards/chosen": -1.8940304487179487, "rewards/margins": 1.5206036976235147, "rewards/rejected": -3.4146341463414633, "step": 2040 }, { "epoch": 1.073017534676786, "grad_norm": 0.44456160753496904, "kl": 0.0, "learning_rate": 4.054468284063837e-05, "logits/chosen": -1123758848.0, "logits/rejected": -1046793408.0, "logps/chosen": -508.62111801242236, "logps/rejected": -643.3207547169811, "loss": 0.4316, "rewards/chosen": -1.8557841614906831, "rewards/margins": 1.3828164674401344, "rewards/rejected": -3.2386006289308176, "step": 2050 }, { "epoch": 1.0782517665532583, "grad_norm": 0.5992749413853709, "kl": 0.0, "learning_rate": 4.0425117852931854e-05, "logits/chosen": -1236480768.0, "logits/rejected": -1104150528.0, "logps/chosen": -514.0804953560372, "logps/rejected": -616.378548895899, "loss": 0.4238, "rewards/chosen": -1.4851006191950464, "rewards/margins": 1.8303567940541647, "rewards/rejected": -3.315457413249211, "step": 2060 }, { "epoch": 1.0834859984297305, "grad_norm": 0.7960409622933732, "kl": 0.0, "learning_rate": 4.030498042172277e-05, "logits/chosen": -1191182336.0, "logits/rejected": -1186358912.0, "logps/chosen": -446.5407166123779, "logps/rejected": -573.6936936936937, "loss": 0.4079, "rewards/chosen": -1.2974857491856677, "rewards/margins": 1.6180547913548726, "rewards/rejected": -2.9155405405405403, "step": 2070 }, { "epoch": 1.0887202303062025, "grad_norm": 0.557857832422228, "kl": 0.0, "learning_rate": 4.0184275005446536e-05, "logits/chosen": -1252838656.0, "logits/rejected": -1044381696.0, "logps/chosen": -505.81268882175226, "logps/rejected": -670.6537216828478, "loss": 0.4405, "rewards/chosen": -1.959733761329305, "rewards/margins": 1.5884053972467467, "rewards/rejected": -3.5481391585760518, "step": 2080 }, { "epoch": 1.0939544621826747, "grad_norm": 0.5146387697235528, "kl": 0.0, "learning_rate": 4.0063006083617164e-05, "logits/chosen": -1001390080.0, "logits/rejected": -814428992.0, "logps/chosen": -497.4276923076923, "logps/rejected": -751.2380952380952, "loss": 0.424, "rewards/chosen": -2.1334615384615385, "rewards/margins": 2.7026495726495727, "rewards/rejected": -4.836111111111111, "step": 2090 }, { "epoch": 1.0991886940591469, "grad_norm": 0.7762982381798743, "kl": 0.0, "learning_rate": 3.9941178156660956e-05, "logits/chosen": -943298944.0, "logits/rejected": -795869184.0, "logps/chosen": -603.4858934169279, "logps/rejected": -831.1028037383178, "loss": 0.4228, "rewards/chosen": -2.774980407523511, "rewards/margins": 2.5451130504204142, "rewards/rejected": -5.320093457943925, "step": 2100 }, { "epoch": 1.0991886940591469, "eval_kl": 0.0, "eval_logits/chosen": -1897423232.0, "eval_logits/rejected": -1554289280.0, "eval_logps/chosen": -605.4824344383968, "eval_logps/rejected": -827.4172053704625, "eval_loss": 0.4539531171321869, "eval_rewards/chosen": -2.737877288471054, "eval_rewards/margins": 2.5883285792564448, "eval_rewards/rejected": -5.326205867727499, "eval_runtime": 93.4436, "eval_samples_per_second": 42.807, "eval_steps_per_second": 0.674, "step": 2100 }, { "epoch": 1.1044229259356189, "grad_norm": 0.5835570550501369, "kl": 0.0, "learning_rate": 3.9818795745749544e-05, "logits/chosen": -910163968.0, "logits/rejected": -678952960.0, "logps/chosen": -582.7730061349694, "logps/rejected": -1006.6751592356688, "loss": 0.4134, "rewards/chosen": -2.3824769938650308, "rewards/margins": 4.701917910593568, "rewards/rejected": -7.084394904458598, "step": 2110 }, { "epoch": 1.109657157812091, "grad_norm": 0.7263499612561425, "kl": 0.0, "learning_rate": 3.969586339263209e-05, "logits/chosen": -940153216.0, "logits/rejected": -795344896.0, "logps/chosen": -573.8, "logps/rejected": -756.2, "loss": 0.4227, "rewards/chosen": -2.5900390625, "rewards/margins": 2.3224609375, "rewards/rejected": -4.9125, "step": 2120 }, { "epoch": 1.1148913896885633, "grad_norm": 0.5624781964454179, "kl": 0.0, "learning_rate": 3.9572385659466717e-05, "logits/chosen": -1081920768.0, "logits/rejected": -1029491904.0, "logps/chosen": -538.1132075471698, "logps/rejected": -698.8322981366459, "loss": 0.4119, "rewards/chosen": -2.2749361242138364, "rewards/margins": 1.6513061118110084, "rewards/rejected": -3.926242236024845, "step": 2130 }, { "epoch": 1.1201256215650353, "grad_norm": 0.42810241625177137, "kl": 0.0, "learning_rate": 3.944836712865122e-05, "logits/chosen": -1106247680.0, "logits/rejected": -993001472.0, "logps/chosen": -507.2704402515723, "logps/rejected": -762.8322981366459, "loss": 0.3997, "rewards/chosen": -2.029726808176101, "rewards/margins": 2.6010961731903586, "rewards/rejected": -4.630822981366459, "step": 2140 }, { "epoch": 1.1253598534415075, "grad_norm": 1.792400030040611, "kl": 0.0, "learning_rate": 3.932381240265301e-05, "logits/chosen": -1087792768.0, "logits/rejected": -1007681536.0, "logps/chosen": -620.0774193548388, "logps/rejected": -853.3333333333334, "loss": 0.4082, "rewards/chosen": -2.63125, "rewards/margins": 2.992234848484848, "rewards/rejected": -5.623484848484848, "step": 2150 }, { "epoch": 1.1305940853179797, "grad_norm": 0.5225809025312501, "kl": 0.0, "learning_rate": 3.919872610383831e-05, "logits/chosen": -1179648000.0, "logits/rejected": -880384384.0, "logps/chosen": -584.7447447447447, "logps/rejected": -784.1563517915309, "loss": 0.4165, "rewards/chosen": -2.2351726726726726, "rewards/margins": 2.7599413338419856, "rewards/rejected": -4.995114006514658, "step": 2160 }, { "epoch": 1.1358283171944517, "grad_norm": 0.5287442378757015, "kl": 0.0, "learning_rate": 3.9073112874300574e-05, "logits/chosen": -1114845952.0, "logits/rejected": -979369984.0, "logps/chosen": -434.03095975232196, "logps/rejected": -636.8706624605678, "loss": 0.4086, "rewards/chosen": -1.198843846749226, "rewards/margins": 1.92773344031702, "rewards/rejected": -3.126577287066246, "step": 2170 }, { "epoch": 1.1410625490709239, "grad_norm": 0.4391631988412161, "kl": 0.0, "learning_rate": 3.8946977375688306e-05, "logits/chosen": -1016384704.0, "logits/rejected": -800168320.0, "logps/chosen": -554.1682242990654, "logps/rejected": -720.3510971786834, "loss": 0.4097, "rewards/chosen": -2.169575058411215, "rewards/margins": 2.07023685381449, "rewards/rejected": -4.239811912225705, "step": 2180 }, { "epoch": 1.146296780947396, "grad_norm": 0.771737814326899, "kl": 0.0, "learning_rate": 3.882032428903195e-05, "logits/chosen": -940992128.0, "logits/rejected": -778357952.0, "logps/chosen": -493.84326018808775, "logps/rejected": -693.4330218068536, "loss": 0.388, "rewards/chosen": -1.720807210031348, "rewards/margins": 2.2090993320247265, "rewards/rejected": -3.9299065420560746, "step": 2190 }, { "epoch": 1.151531012823868, "grad_norm": 0.4987120515896395, "kl": 0.0, "learning_rate": 3.869315831457025e-05, "logits/chosen": -939314368.0, "logits/rejected": -714289984.0, "logps/chosen": -538.6504559270517, "logps/rejected": -637.6334405144695, "loss": 0.4094, "rewards/chosen": -1.9659954407294833, "rewards/margins": 1.4363357489811277, "rewards/rejected": -3.402331189710611, "step": 2200 }, { "epoch": 1.151531012823868, "eval_kl": 0.0, "eval_logits/chosen": -1938966784.0, "eval_logits/rejected": -1658281344.0, "eval_logps/chosen": -495.0578921326076, "eval_logps/rejected": -585.3565390353058, "eval_loss": 0.4561484456062317, "eval_rewards/chosen": -1.6318035625927758, "eval_rewards/margins": 1.2711054379044893, "eval_rewards/rejected": -2.902909000497265, "eval_runtime": 93.3795, "eval_samples_per_second": 42.836, "eval_steps_per_second": 0.675, "step": 2200 }, { "epoch": 1.1567652447003403, "grad_norm": 0.4808914069280438, "kl": 0.0, "learning_rate": 3.856548417157581e-05, "logits/chosen": -964060800.0, "logits/rejected": -820196160.0, "logps/chosen": -439.7037037037037, "logps/rejected": -554.3291139240506, "loss": 0.4057, "rewards/chosen": -1.3296199845679013, "rewards/margins": 1.5208546989764027, "rewards/rejected": -2.850474683544304, "step": 2210 }, { "epoch": 1.1619994765768125, "grad_norm": 0.5384626355350068, "kl": 0.0, "learning_rate": 3.843730659817991e-05, "logits/chosen": -1157418240.0, "logits/rejected": -929877184.0, "logps/chosen": -463.1055900621118, "logps/rejected": -571.5723270440252, "loss": 0.3925, "rewards/chosen": -1.1734763198757765, "rewards/margins": 1.6975928625141607, "rewards/rejected": -2.8710691823899372, "step": 2220 }, { "epoch": 1.1672337084532844, "grad_norm": 0.9556698752680222, "kl": 0.0, "learning_rate": 3.830863035119671e-05, "logits/chosen": -999502656.0, "logits/rejected": -930611200.0, "logps/chosen": -460.6233766233766, "logps/rejected": -653.2048192771084, "loss": 0.382, "rewards/chosen": -1.5374391233766234, "rewards/margins": 2.0212958163824126, "rewards/rejected": -3.558734939759036, "step": 2230 }, { "epoch": 1.1724679403297567, "grad_norm": 0.6610739790203071, "kl": 0.0, "learning_rate": 3.8179460205946717e-05, "logits/chosen": -844418240.0, "logits/rejected": -707683968.0, "logps/chosen": -561.4, "logps/rejected": -765.4, "loss": 0.4029, "rewards/chosen": -2.3671875, "rewards/margins": 2.4789062499999996, "rewards/rejected": -4.84609375, "step": 2240 }, { "epoch": 1.1777021722062289, "grad_norm": 0.5001407476527855, "kl": 0.0, "learning_rate": 3.804980095607955e-05, "logits/chosen": -905445376.0, "logits/rejected": -802789760.0, "logps/chosen": -514.6081504702195, "logps/rejected": -698.018691588785, "loss": 0.3878, "rewards/chosen": -1.7934952978056427, "rewards/margins": 2.1161620230666314, "rewards/rejected": -3.9096573208722742, "step": 2250 }, { "epoch": 1.1829364040827008, "grad_norm": 0.47433364516008364, "kl": 0.0, "learning_rate": 3.791965741339607e-05, "logits/chosen": -893072192.0, "logits/rejected": -958503296.0, "logps/chosen": -432.2787456445993, "logps/rejected": -619.4220963172804, "loss": 0.3568, "rewards/chosen": -0.9600664198606271, "rewards/margins": 2.1079222486946136, "rewards/rejected": -3.0679886685552407, "step": 2260 }, { "epoch": 1.188170635959173, "grad_norm": 0.5473442952438966, "kl": 0.0, "learning_rate": 3.7789034407669754e-05, "logits/chosen": -1082549888.0, "logits/rejected": -838441344.0, "logps/chosen": -452.55727554179566, "logps/rejected": -616.378548895899, "loss": 0.3847, "rewards/chosen": -1.1921439628482973, "rewards/margins": 2.1154270150696837, "rewards/rejected": -3.307570977917981, "step": 2270 }, { "epoch": 1.193404867835645, "grad_norm": 0.5456062661969855, "kl": 0.0, "learning_rate": 3.7657936786467526e-05, "logits/chosen": -1233335040.0, "logits/rejected": -1010198144.0, "logps/chosen": -454.24624624624624, "logps/rejected": -569.4332247557003, "loss": 0.3936, "rewards/chosen": -1.1121199324324325, "rewards/margins": 1.6916259959063296, "rewards/rejected": -2.803745928338762, "step": 2280 }, { "epoch": 1.1986390997121172, "grad_norm": 0.48831991548633896, "kl": 0.0, "learning_rate": 3.752636941496981e-05, "logits/chosen": -1165177600.0, "logits/rejected": -1014602112.0, "logps/chosen": -387.0691823899371, "logps/rejected": -613.664596273292, "loss": 0.3634, "rewards/chosen": -0.813298447327044, "rewards/margins": 2.093533850809602, "rewards/rejected": -2.906832298136646, "step": 2290 }, { "epoch": 1.2038733315885894, "grad_norm": 1.0066995567188157, "kl": 0.0, "learning_rate": 3.739433717578999e-05, "logits/chosen": -1147351808.0, "logits/rejected": -1030540480.0, "logps/chosen": -441.68152866242036, "logps/rejected": -643.0429447852761, "loss": 0.3779, "rewards/chosen": -1.0644904458598725, "rewards/margins": 2.300540228986753, "rewards/rejected": -3.3650306748466257, "step": 2300 }, { "epoch": 1.2038733315885894, "eval_kl": 0.0, "eval_logits/chosen": -2399674624.0, "eval_logits/rejected": -2125380352.0, "eval_logps/chosen": -514.6284017812964, "eval_logps/rejected": -627.5882645450025, "eval_loss": 0.45635154843330383, "eval_rewards/chosen": -1.8295398317664522, "eval_rewards/margins": 1.4960444546582123, "eval_rewards/rejected": -3.3255842864246645, "eval_runtime": 93.3818, "eval_samples_per_second": 42.835, "eval_steps_per_second": 0.675, "step": 2300 }, { "epoch": 1.2091075634650614, "grad_norm": 0.5113334468273057, "kl": 0.0, "learning_rate": 3.726184496879323e-05, "logits/chosen": -1007891264.0, "logits/rejected": -1058013184.0, "logps/chosen": -467.43624161073825, "logps/rejected": -666.8538011695906, "loss": 0.3791, "rewards/chosen": -1.4866820469798658, "rewards/margins": 2.1664612278739352, "rewards/rejected": -3.653143274853801, "step": 2310 }, { "epoch": 1.2143417953415336, "grad_norm": 0.6051314218968836, "kl": 0.0, "learning_rate": 3.71288977109146e-05, "logits/chosen": -1114845952.0, "logits/rejected": -938056064.0, "logps/chosen": -475.3865030674847, "logps/rejected": -625.6305732484077, "loss": 0.4018, "rewards/chosen": -1.461320935582822, "rewards/margins": 1.8372459433980697, "rewards/rejected": -3.2985668789808917, "step": 2320 }, { "epoch": 1.2195760272180058, "grad_norm": 0.5399080960512563, "kl": 0.0, "learning_rate": 3.699550033597663e-05, "logits/chosen": -1155950208.0, "logits/rejected": -1095132800.0, "logps/chosen": -430.9808917197452, "logps/rejected": -603.2883435582822, "loss": 0.358, "rewards/chosen": -0.7718016023089171, "rewards/margins": 2.3060358210039666, "rewards/rejected": -3.0778374233128836, "step": 2330 }, { "epoch": 1.2248102590944778, "grad_norm": 0.48019340691437284, "kl": 0.0, "learning_rate": 3.686165779450619e-05, "logits/chosen": -1158047360.0, "logits/rejected": -988387712.0, "logps/chosen": -422.5853658536585, "logps/rejected": -529.7435897435897, "loss": 0.4038, "rewards/chosen": -0.8891482469512195, "rewards/margins": 1.569986368433396, "rewards/rejected": -2.4591346153846154, "step": 2340 }, { "epoch": 1.23004449097095, "grad_norm": 0.5240411276344981, "kl": 0.0, "learning_rate": 3.672737505355081e-05, "logits/chosen": -998873472.0, "logits/rejected": -825334144.0, "logps/chosen": -420.0487804878049, "logps/rejected": -590.0512820512821, "loss": 0.3863, "rewards/chosen": -1.0591773056402438, "rewards/margins": 1.87591884820591, "rewards/rejected": -2.9350961538461537, "step": 2350 }, { "epoch": 1.235278722847422, "grad_norm": 0.5381425730614176, "kl": 0.0, "learning_rate": 3.659265709649428e-05, "logits/chosen": -919391424.0, "logits/rejected": -660707712.0, "logps/chosen": -423.07788161993767, "logps/rejected": -651.6363636363636, "loss": 0.3566, "rewards/chosen": -0.8701263142523364, "rewards/margins": 2.8444113033025222, "rewards/rejected": -3.714537617554859, "step": 2360 }, { "epoch": 1.2405129547238942, "grad_norm": 0.5783093754455919, "kl": 0.0, "learning_rate": 3.645750892287178e-05, "logits/chosen": -942669824.0, "logits/rejected": -837812224.0, "logps/chosen": -405.0331125827815, "logps/rejected": -571.7396449704142, "loss": 0.3374, "rewards/chosen": -0.5006467301324503, "rewards/margins": 2.3114834473823427, "rewards/rejected": -2.812130177514793, "step": 2370 }, { "epoch": 1.2457471866003664, "grad_norm": 0.5404535850119857, "kl": 0.0, "learning_rate": 3.632193554818429e-05, "logits/chosen": -935329792.0, "logits/rejected": -691640704.0, "logps/chosen": -462.44025157232704, "logps/rejected": -624.695652173913, "loss": 0.369, "rewards/chosen": -0.691011056960004, "rewards/margins": 2.7880262101207416, "rewards/rejected": -3.4790372670807455, "step": 2380 }, { "epoch": 1.2509814184768384, "grad_norm": 0.4591882116536945, "kl": 0.0, "learning_rate": 3.6185942003712515e-05, "logits/chosen": -927150912.0, "logits/rejected": -722468864.0, "logps/chosen": -416.6808510638298, "logps/rejected": -663.7684887459807, "loss": 0.3684, "rewards/chosen": -0.9160809270516718, "rewards/margins": 2.778452835006206, "rewards/rejected": -3.694533762057878, "step": 2390 }, { "epoch": 1.2562156503533106, "grad_norm": 0.6102727110509801, "kl": 0.0, "learning_rate": 3.604953333633009e-05, "logits/chosen": -955252736.0, "logits/rejected": -769235328.0, "logps/chosen": -464.283185840708, "logps/rejected": -666.6843853820598, "loss": 0.3731, "rewards/chosen": -1.2830360435103245, "rewards/margins": 2.393458973100971, "rewards/rejected": -3.6764950166112955, "step": 2400 }, { "epoch": 1.2562156503533106, "eval_kl": 0.0, "eval_logits/chosen": -1886105216.0, "eval_logits/rejected": -1661210752.0, "eval_logps/chosen": -496.7679366650173, "eval_logps/rejected": -605.2153157633019, "eval_loss": 0.4578281342983246, "eval_rewards/chosen": -1.6493072736269174, "eval_rewards/margins": 1.4530050088196265, "eval_rewards/rejected": -3.102312282446544, "eval_runtime": 93.4183, "eval_samples_per_second": 42.818, "eval_steps_per_second": 0.674, "step": 2400 }, { "epoch": 1.2614498822297828, "grad_norm": 0.5419994441020899, "kl": 0.0, "learning_rate": 3.5912714608316346e-05, "logits/chosen": -832464512.0, "logits/rejected": -718484288.0, "logps/chosen": -455.2207792207792, "logps/rejected": -698.5060240963855, "loss": 0.3477, "rewards/chosen": -1.0582830255681819, "rewards/margins": 3.069729022624589, "rewards/rejected": -4.128012048192771, "step": 2410 }, { "epoch": 1.2666841141062548, "grad_norm": 1.1041391753234233, "kl": 0.0, "learning_rate": 3.577549089716845e-05, "logits/chosen": -842425984.0, "logits/rejected": -619184128.0, "logps/chosen": -402.8553846153846, "logps/rejected": -700.4444444444445, "loss": 0.3471, "rewards/chosen": -0.9087079326923077, "rewards/margins": 3.2742285752442, "rewards/rejected": -4.182936507936508, "step": 2420 }, { "epoch": 1.271918345982727, "grad_norm": 0.4627953236624642, "kl": 0.0, "learning_rate": 3.56378672954129e-05, "logits/chosen": -896532480.0, "logits/rejected": -730123456.0, "logps/chosen": -487.7770897832817, "logps/rejected": -696.429022082019, "loss": 0.3557, "rewards/chosen": -1.3471120356037152, "rewards/margins": 2.7372728224404486, "rewards/rejected": -4.084384858044164, "step": 2430 }, { "epoch": 1.2771525778591992, "grad_norm": 0.5819819288924131, "kl": 0.0, "learning_rate": 3.5499848910416646e-05, "logits/chosen": -1001390080.0, "logits/rejected": -815267840.0, "logps/chosen": -421.08868501529054, "logps/rejected": -678.5431309904153, "loss": 0.3453, "rewards/chosen": -0.9145940844801224, "rewards/margins": 3.003137544912849, "rewards/rejected": -3.9177316293929714, "step": 2440 }, { "epoch": 1.2823868097356712, "grad_norm": 1.2888270938532067, "kl": 0.0, "learning_rate": 3.536144086419744e-05, "logits/chosen": -1004326080.0, "logits/rejected": -833775232.0, "logps/chosen": -406.111801242236, "logps/rejected": -582.5408805031446, "loss": 0.3863, "rewards/chosen": -1.0681774068322982, "rewards/margins": 1.7191653604632993, "rewards/rejected": -2.7873427672955975, "step": 2450 }, { "epoch": 1.2876210416121434, "grad_norm": 0.8224405943194626, "kl": 0.0, "learning_rate": 3.522264829323381e-05, "logits/chosen": -865494656.0, "logits/rejected": -794296320.0, "logps/chosen": -459.9225806451613, "logps/rejected": -565.7212121212121, "loss": 0.4008, "rewards/chosen": -1.0576612903225806, "rewards/margins": 1.7438538611925711, "rewards/rejected": -2.8015151515151517, "step": 2460 }, { "epoch": 1.2928552734886156, "grad_norm": 0.6298545868105269, "kl": 0.0, "learning_rate": 3.5083476348274454e-05, "logits/chosen": -741657792.0, "logits/rejected": -663958336.0, "logps/chosen": -455.0769230769231, "logps/rejected": -694.6341463414634, "loss": 0.3556, "rewards/chosen": -1.2873347355769231, "rewards/margins": 2.8254701424718576, "rewards/rejected": -4.112804878048781, "step": 2470 }, { "epoch": 1.2980895053650876, "grad_norm": 1.120319476468608, "kl": 0.0, "learning_rate": 3.494393019414704e-05, "logits/chosen": -760217600.0, "logits/rejected": -618187968.0, "logps/chosen": -449.5686274509804, "logps/rejected": -713.4850299401197, "loss": 0.3475, "rewards/chosen": -1.2301368464052287, "rewards/margins": 3.027348183534891, "rewards/rejected": -4.25748502994012, "step": 2480 }, { "epoch": 1.3033237372415598, "grad_norm": 0.8833961844092323, "kl": 0.0, "learning_rate": 3.480401500956657e-05, "logits/chosen": -747529856.0, "logits/rejected": -612892672.0, "logps/chosen": -461.81366459627327, "logps/rejected": -668.5786163522013, "loss": 0.368, "rewards/chosen": -1.3041537267080745, "rewards/margins": 2.2583462732919255, "rewards/rejected": -3.5625, "step": 2490 }, { "epoch": 1.308557969118032, "grad_norm": 0.7959973438288277, "kl": 0.0, "learning_rate": 3.4663735986943194e-05, "logits/chosen": -723622272.0, "logits/rejected": -588146304.0, "logps/chosen": -441.44, "logps/rejected": -676.8941176470588, "loss": 0.3538, "rewards/chosen": -1.0112760416666666, "rewards/margins": 2.8769592524509804, "rewards/rejected": -3.888235294117647, "step": 2500 }, { "epoch": 1.308557969118032, "eval_kl": 0.0, "eval_logits/chosen": -1523131520.0, "eval_logits/rejected": -1173539584.0, "eval_logps/chosen": -520.2968827313211, "eval_logps/rejected": -641.4639482844356, "eval_loss": 0.45775389671325684, "eval_rewards/chosen": -1.8840920336467095, "eval_rewards/margins": 1.5788617207043598, "eval_rewards/rejected": -3.4629537543510693, "eval_runtime": 93.4179, "eval_samples_per_second": 42.818, "eval_steps_per_second": 0.674, "step": 2500 }, { "epoch": 1.313792200994504, "grad_norm": 0.6111285044739223, "kl": 0.0, "learning_rate": 3.452309833218948e-05, "logits/chosen": -867696640.0, "logits/rejected": -645083968.0, "logps/chosen": -516.1823708206687, "logps/rejected": -749.9935691318328, "loss": 0.3563, "rewards/chosen": -1.453647416413374, "rewards/margins": 2.949889561078587, "rewards/rejected": -4.403536977491961, "step": 2510 }, { "epoch": 1.3190264328709762, "grad_norm": 1.1367797934531445, "kl": 0.0, "learning_rate": 3.438210726452724e-05, "logits/chosen": -850499968.0, "logits/rejected": -654416256.0, "logps/chosen": -392.9051987767584, "logps/rejected": -761.0479233226837, "loss": 0.3216, "rewards/chosen": -0.7394997133027523, "rewards/margins": 3.7509156221605062, "rewards/rejected": -4.4904153354632586, "step": 2520 }, { "epoch": 1.3242606647474484, "grad_norm": 0.6046826859018436, "kl": 0.0, "learning_rate": 3.424076801629387e-05, "logits/chosen": -920649728.0, "logits/rejected": -852072832.0, "logps/chosen": -417.80204778156997, "logps/rejected": -634.3746397694524, "loss": 0.3214, "rewards/chosen": -0.8054074232081911, "rewards/margins": 2.649923988895555, "rewards/rejected": -3.4553314121037464, "step": 2530 }, { "epoch": 1.3294948966239204, "grad_norm": 0.6290047351078182, "kl": 0.0, "learning_rate": 3.4099085832748095e-05, "logits/chosen": -1099327104.0, "logits/rejected": -1017118720.0, "logps/chosen": -407.2025723472669, "logps/rejected": -598.0790273556231, "loss": 0.3594, "rewards/chosen": -0.7732993368167203, "rewards/margins": 2.3102872893230977, "rewards/rejected": -3.0835866261398177, "step": 2540 }, { "epoch": 1.3347291285003926, "grad_norm": 0.7278685039353052, "kl": 0.0, "learning_rate": 3.395706597187538e-05, "logits/chosen": -1117572352.0, "logits/rejected": -899887936.0, "logps/chosen": -432.34890965732086, "logps/rejected": -595.8620689655172, "loss": 0.3615, "rewards/chosen": -0.8768983644859814, "rewards/margins": 2.3605624505610407, "rewards/rejected": -3.237460815047022, "step": 2550 }, { "epoch": 1.3399633603768648, "grad_norm": 0.5868735607220685, "kl": 0.0, "learning_rate": 3.381471370419278e-05, "logits/chosen": -1081501312.0, "logits/rejected": -898944192.0, "logps/chosen": -441.9240506329114, "logps/rejected": -697.4814814814815, "loss": 0.3362, "rewards/chosen": -0.9524698378164557, "rewards/margins": 2.9780857177391, "rewards/rejected": -3.9305555555555554, "step": 2560 }, { "epoch": 1.3451975922533368, "grad_norm": 1.3113360831377077, "kl": 0.0, "learning_rate": 3.3672034312553326e-05, "logits/chosen": -1024039296.0, "logits/rejected": -698561344.0, "logps/chosen": -476.35692307692307, "logps/rejected": -833.3206349206349, "loss": 0.3546, "rewards/chosen": -1.4344711538461539, "rewards/margins": 4.108782814407815, "rewards/rejected": -5.5432539682539685, "step": 2570 }, { "epoch": 1.350431824129809, "grad_norm": 0.5501904923995546, "kl": 0.0, "learning_rate": 3.352903309194999e-05, "logits/chosen": -974022272.0, "logits/rejected": -782657152.0, "logps/chosen": -445.72168284789643, "logps/rejected": -795.6495468277946, "loss": 0.3294, "rewards/chosen": -1.3428398058252426, "rewards/margins": 3.5846526413046664, "rewards/rejected": -4.927492447129909, "step": 2580 }, { "epoch": 1.3556660560062812, "grad_norm": 0.5726718370323468, "kl": 0.0, "learning_rate": 3.338571534931919e-05, "logits/chosen": -1016804160.0, "logits/rejected": -867172352.0, "logps/chosen": -430.8724832214765, "logps/rejected": -744.6081871345029, "loss": 0.3272, "rewards/chosen": -1.143089345637584, "rewards/margins": 3.473869718689901, "rewards/rejected": -4.616959064327485, "step": 2590 }, { "epoch": 1.3609002878827532, "grad_norm": 0.5016181108370353, "kl": 0.0, "learning_rate": 3.324208640334383e-05, "logits/chosen": -1298556544.0, "logits/rejected": -899258752.0, "logps/chosen": -419.6507042253521, "logps/rejected": -713.5438596491229, "loss": 0.3699, "rewards/chosen": -0.999768926056338, "rewards/margins": 3.238827565171732, "rewards/rejected": -4.23859649122807, "step": 2600 }, { "epoch": 1.3609002878827532, "eval_kl": 0.0, "eval_logits/chosen": -2620041984.0, "eval_logits/rejected": -2228706560.0, "eval_logps/chosen": -464.24542305789214, "eval_logps/rejected": -571.5126802585778, "eval_loss": 0.4552265703678131, "eval_rewards/chosen": -1.3249628896585848, "eval_rewards/margins": 1.4383389502220716, "eval_rewards/rejected": -2.7633018398806564, "eval_runtime": 93.4503, "eval_samples_per_second": 42.804, "eval_steps_per_second": 0.674, "step": 2600 }, { "epoch": 1.3661345197592254, "grad_norm": 0.5441867816476207, "kl": 0.0, "learning_rate": 3.309815158425591e-05, "logits/chosen": -1281359872.0, "logits/rejected": -1104989440.0, "logps/chosen": -386.2085889570552, "logps/rejected": -610.0382165605096, "loss": 0.3322, "rewards/chosen": -0.46640145705521474, "rewards/margins": 2.5964966321167595, "rewards/rejected": -3.0628980891719744, "step": 2610 }, { "epoch": 1.3713687516356974, "grad_norm": 0.6345683057432078, "kl": 0.0, "learning_rate": 3.295391623363874e-05, "logits/chosen": -1280730752.0, "logits/rejected": -1039243648.0, "logps/chosen": -412.21183800623055, "logps/rejected": -631.9749216300941, "loss": 0.3526, "rewards/chosen": -0.6753796728971962, "rewards/margins": 2.6251689164444967, "rewards/rejected": -3.300548589341693, "step": 2620 }, { "epoch": 1.3766029835121696, "grad_norm": 0.5674215327512067, "kl": 0.0, "learning_rate": 3.280938570422869e-05, "logits/chosen": -1282198784.0, "logits/rejected": -923376000.0, "logps/chosen": -377.22352941176473, "logps/rejected": -641.6, "loss": 0.3667, "rewards/chosen": -0.6543198529411764, "rewards/margins": 2.8973468137254903, "rewards/rejected": -3.5516666666666667, "step": 2630 }, { "epoch": 1.3818372153886418, "grad_norm": 0.5205232170690738, "kl": 0.0, "learning_rate": 3.266456535971654e-05, "logits/chosen": -1084856704.0, "logits/rejected": -985241984.0, "logps/chosen": -458.17704918032786, "logps/rejected": -665.9820895522388, "loss": 0.3498, "rewards/chosen": -0.9981045081967214, "rewards/margins": 2.5884626559823833, "rewards/rejected": -3.5865671641791046, "step": 2640 }, { "epoch": 1.3870714472651138, "grad_norm": 0.6619442824934767, "kl": 0.0, "learning_rate": 3.2519460574548435e-05, "logits/chosen": -1018586752.0, "logits/rejected": -875770688.0, "logps/chosen": -436.0883280757098, "logps/rejected": -744.5201238390093, "loss": 0.3679, "rewards/chosen": -0.9814175867507886, "rewards/margins": 3.5220653853854342, "rewards/rejected": -4.503482972136223, "step": 2650 }, { "epoch": 1.392305679141586, "grad_norm": 0.48346469367063, "kl": 0.0, "learning_rate": 3.237407673372644e-05, "logits/chosen": -1107086592.0, "logits/rejected": -822922432.0, "logps/chosen": -461.2544378698225, "logps/rejected": -731.2317880794702, "loss": 0.3577, "rewards/chosen": -1.0839959319526626, "rewards/margins": 3.2388517501665426, "rewards/rejected": -4.322847682119205, "step": 2660 }, { "epoch": 1.397539911018058, "grad_norm": 0.8678937627006628, "kl": 0.0, "learning_rate": 3.2228419232608695e-05, "logits/chosen": -1117991680.0, "logits/rejected": -744593792.0, "logps/chosen": -416.094674556213, "logps/rejected": -673.1655629139073, "loss": 0.3578, "rewards/chosen": -0.8156954974112426, "rewards/margins": 3.1851323171582937, "rewards/rejected": -4.000827814569536, "step": 2670 }, { "epoch": 1.4027741428945302, "grad_norm": 0.658458342613859, "kl": 0.0, "learning_rate": 3.208249347670917e-05, "logits/chosen": -944347520.0, "logits/rejected": -722993152.0, "logps/chosen": -423.54430379746833, "logps/rejected": -716.4444444444445, "loss": 0.3536, "rewards/chosen": -1.0082946004746836, "rewards/margins": 3.232446140266057, "rewards/rejected": -4.2407407407407405, "step": 2680 }, { "epoch": 1.4080083747710024, "grad_norm": 0.4856064293626971, "kl": 0.0, "learning_rate": 3.1936304881497084e-05, "logits/chosen": -867801472.0, "logits/rejected": -710410240.0, "logps/chosen": -476.8888888888889, "logps/rejected": -697.7215189873418, "loss": 0.3519, "rewards/chosen": -1.3734809027777777, "rewards/margins": 2.6538134010196908, "rewards/rejected": -4.0272943037974684, "step": 2690 }, { "epoch": 1.4132426066474744, "grad_norm": 0.5589508309469322, "kl": 0.0, "learning_rate": 3.178985887219589e-05, "logits/chosen": -790836032.0, "logits/rejected": -627992192.0, "logps/chosen": -495.3268608414239, "logps/rejected": -932.9305135951662, "loss": 0.3293, "rewards/chosen": -1.562626415857605, "rewards/margins": 4.543869052420341, "rewards/rejected": -6.106495468277946, "step": 2700 }, { "epoch": 1.4132426066474744, "eval_kl": 0.0, "eval_logits/chosen": -1547698176.0, "eval_logits/rejected": -1145178112.0, "eval_logps/chosen": -559.4695695200396, "eval_logps/rejected": -805.1397314768772, "eval_loss": 0.45078906416893005, "eval_rewards/chosen": -2.276039089559624, "eval_rewards/margins": 2.8229166538516144, "eval_rewards/rejected": -5.0989557434112385, "eval_runtime": 93.4389, "eval_samples_per_second": 42.809, "eval_steps_per_second": 0.674, "step": 2700 }, { "epoch": 1.4184768385239466, "grad_norm": 0.47458066615925315, "kl": 0.0, "learning_rate": 3.164316088358201e-05, "logits/chosen": -779511424.0, "logits/rejected": -455029568.0, "logps/chosen": -558.780487804878, "logps/rejected": -872.3076923076923, "loss": 0.3588, "rewards/chosen": -2.2006538205030486, "rewards/margins": 4.011685923086695, "rewards/rejected": -6.212339743589744, "step": 2710 }, { "epoch": 1.4237110704004188, "grad_norm": 0.4805620768604118, "kl": 0.0, "learning_rate": 3.149621635978309e-05, "logits/chosen": -705901376.0, "logits/rejected": -523239424.0, "logps/chosen": -486.81290322580645, "logps/rejected": -1012.6545454545454, "loss": 0.3358, "rewards/chosen": -1.6076108870967742, "rewards/margins": 5.625722446236559, "rewards/rejected": -7.233333333333333, "step": 2720 }, { "epoch": 1.4289453022768908, "grad_norm": 0.8604064532109258, "kl": 0.0, "learning_rate": 3.134903075407594e-05, "logits/chosen": -661232000.0, "logits/rejected": -641099392.0, "logps/chosen": -477.7304964539007, "logps/rejected": -869.8994413407821, "loss": 0.3028, "rewards/chosen": -1.3523936170212767, "rewards/margins": 4.299142695827885, "rewards/rejected": -5.651536312849162, "step": 2730 }, { "epoch": 1.434179534153363, "grad_norm": 0.5740359557647091, "kl": 0.0, "learning_rate": 3.120160952868424e-05, "logits/chosen": -802999488.0, "logits/rejected": -629722304.0, "logps/chosen": -500.75949367088606, "logps/rejected": -868.3456790123457, "loss": 0.3432, "rewards/chosen": -1.3324515427215189, "rewards/margins": 4.25859783999453, "rewards/rejected": -5.591049382716049, "step": 2740 }, { "epoch": 1.4394137660298352, "grad_norm": 0.8150577225591913, "kl": 0.0, "learning_rate": 3.1053958154575743e-05, "logits/chosen": -833093632.0, "logits/rejected": -635437056.0, "logps/chosen": -506.3619047619048, "logps/rejected": -867.0523076923076, "loss": 0.3312, "rewards/chosen": -1.4729910714285714, "rewards/margins": 4.43931662087912, "rewards/rejected": -5.912307692307692, "step": 2750 }, { "epoch": 1.4446479979063072, "grad_norm": 0.4811412127606036, "kl": 0.0, "learning_rate": 3.090608211125931e-05, "logits/chosen": -878496960.0, "logits/rejected": -652424000.0, "logps/chosen": -534.9380530973451, "logps/rejected": -947.3488372093024, "loss": 0.3638, "rewards/chosen": -1.9941694321533923, "rewards/margins": 4.019119604391458, "rewards/rejected": -6.01328903654485, "step": 2760 }, { "epoch": 1.4498822297827794, "grad_norm": 0.5558741046389223, "kl": 0.0, "learning_rate": 3.0757986886581506e-05, "logits/chosen": -793247744.0, "logits/rejected": -655150272.0, "logps/chosen": -435.3605015673981, "logps/rejected": -711.3769470404984, "loss": 0.3391, "rewards/chosen": -0.9830899133951313, "rewards/margins": 3.080773014953779, "rewards/rejected": -4.06386292834891, "step": 2770 }, { "epoch": 1.4551164616592516, "grad_norm": 0.6805099501290808, "kl": 0.0, "learning_rate": 3.060967797652299e-05, "logits/chosen": -764516736.0, "logits/rejected": -620652160.0, "logps/chosen": -382.8930817610063, "logps/rejected": -623.0062111801242, "loss": 0.3268, "rewards/chosen": -0.38918779481132076, "rewards/margins": 3.0409364287911633, "rewards/rejected": -3.4301242236024843, "step": 2780 }, { "epoch": 1.4603506935357236, "grad_norm": 0.4985713683174277, "kl": 0.0, "learning_rate": 3.046116088499449e-05, "logits/chosen": -821035008.0, "logits/rejected": -718589120.0, "logps/chosen": -373.3003095975232, "logps/rejected": -633.2365930599369, "loss": 0.3369, "rewards/chosen": -0.4190208978328173, "rewards/margins": 2.7556636447539335, "rewards/rejected": -3.1746845425867507, "step": 2790 }, { "epoch": 1.4655849254121958, "grad_norm": 0.5357398173690253, "kl": 0.0, "learning_rate": 3.0312441123632607e-05, "logits/chosen": -841691968.0, "logits/rejected": -659973760.0, "logps/chosen": -414.20512820512823, "logps/rejected": -630.5365853658536, "loss": 0.3376, "rewards/chosen": -0.7527919671474359, "rewards/margins": 2.662604374315979, "rewards/rejected": -3.4153963414634148, "step": 2800 }, { "epoch": 1.4655849254121958, "eval_kl": 0.0, "eval_logits/chosen": -1543770240.0, "eval_logits/rejected": -1201035648.0, "eval_logps/chosen": -495.72290945076696, "eval_logps/rejected": -643.6280457483839, "eval_loss": 0.45756250619888306, "eval_rewards/chosen": -1.6387926768926273, "eval_rewards/margins": 1.8452948417548116, "eval_rewards/rejected": -3.484087518647439, "eval_runtime": 93.4359, "eval_samples_per_second": 42.81, "eval_steps_per_second": 0.674, "step": 2800 }, { "epoch": 1.470819157288668, "grad_norm": 1.0632107189982902, "kl": 0.0, "learning_rate": 3.0163524211595257e-05, "logits/chosen": -742706368.0, "logits/rejected": -504207776.0, "logps/chosen": -471.7546012269939, "logps/rejected": -858.3949044585987, "loss": 0.3515, "rewards/chosen": -1.2275450536809815, "rewards/margins": 4.451595073707553, "rewards/rejected": -5.679140127388535, "step": 2810 }, { "epoch": 1.47605338916514, "grad_norm": 0.6066090073258897, "kl": 0.0, "learning_rate": 3.001441567535681e-05, "logits/chosen": -518992704.0, "logits/rejected": -481768256.0, "logps/chosen": -463.1225806451613, "logps/rejected": -838.7878787878788, "loss": 0.3447, "rewards/chosen": -1.4911290322580646, "rewards/margins": 3.7975073313782994, "rewards/rejected": -5.288636363636364, "step": 2820 }, { "epoch": 1.4812876210416122, "grad_norm": 0.5724306368429418, "kl": 0.0, "learning_rate": 2.9865121048503052e-05, "logits/chosen": -565706752.0, "logits/rejected": -314074720.0, "logps/chosen": -527.854103343465, "logps/rejected": -860.1929260450161, "loss": 0.3691, "rewards/chosen": -2.0718085106382977, "rewards/margins": 3.8060049941848533, "rewards/rejected": -5.877813504823151, "step": 2830 }, { "epoch": 1.4865218529180844, "grad_norm": 0.8133276454135758, "kl": 0.0, "learning_rate": 2.971564587152579e-05, "logits/chosen": -629669888.0, "logits/rejected": -469054272.0, "logps/chosen": -481.6385542168675, "logps/rejected": -840.9350649350649, "loss": 0.3583, "rewards/chosen": -1.8020519578313252, "rewards/margins": 3.5843116785323117, "rewards/rejected": -5.386363636363637, "step": 2840 }, { "epoch": 1.4917560847945563, "grad_norm": 0.6631153686584916, "kl": 0.0, "learning_rate": 2.9565995691617242e-05, "logits/chosen": -809290944.0, "logits/rejected": -725247616.0, "logps/chosen": -451.44370860927154, "logps/rejected": -709.207100591716, "loss": 0.3432, "rewards/chosen": -1.1947304428807948, "rewards/margins": 3.027163048243465, "rewards/rejected": -4.22189349112426, "step": 2850 }, { "epoch": 1.4969903166710286, "grad_norm": 0.5188461507645532, "kl": 0.0, "learning_rate": 2.9416176062464207e-05, "logits/chosen": -850185408.0, "logits/rejected": -705377088.0, "logps/chosen": -502.1575757575758, "logps/rejected": -739.0451612903225, "loss": 0.3496, "rewards/chosen": -1.5047585227272726, "rewards/margins": 3.0883866385630494, "rewards/rejected": -4.593145161290322, "step": 2860 }, { "epoch": 1.5022245485475008, "grad_norm": 0.5151500356381172, "kl": 0.0, "learning_rate": 2.9266192544041916e-05, "logits/chosen": -836973376.0, "logits/rejected": -678848128.0, "logps/chosen": -388.55757575757576, "logps/rejected": -746.0129032258064, "loss": 0.3352, "rewards/chosen": -0.7599668560606061, "rewards/margins": 4.0009202407135875, "rewards/rejected": -4.760887096774193, "step": 2870 }, { "epoch": 1.5074587804239727, "grad_norm": 0.8504634704689563, "kl": 0.0, "learning_rate": 2.9116050702407703e-05, "logits/chosen": -775736512.0, "logits/rejected": -702231360.0, "logps/chosen": -488.4025157232704, "logps/rejected": -753.6894409937888, "loss": 0.352, "rewards/chosen": -1.600051591981132, "rewards/margins": 3.076190644043712, "rewards/rejected": -4.676242236024844, "step": 2880 }, { "epoch": 1.512693012300445, "grad_norm": 0.7155406298950329, "kl": 0.0, "learning_rate": 2.8965756109494485e-05, "logits/chosen": -772695680.0, "logits/rejected": -694891328.0, "logps/chosen": -536.4528301886793, "logps/rejected": -841.4409937888199, "loss": 0.3432, "rewards/chosen": -1.9663423742138364, "rewards/margins": 3.4738750170905113, "rewards/rejected": -5.440217391304348, "step": 2890 }, { "epoch": 1.5179272441769172, "grad_norm": 0.6035091088536375, "kl": 0.0, "learning_rate": 2.8815314342903948e-05, "logits/chosen": -787270848.0, "logits/rejected": -597321344.0, "logps/chosen": -602.1204819277109, "logps/rejected": -968.5194805194806, "loss": 0.3545, "rewards/chosen": -2.746046686746988, "rewards/margins": 4.1265182483179474, "rewards/rejected": -6.872564935064935, "step": 2900 }, { "epoch": 1.5179272441769172, "eval_kl": 0.0, "eval_logits/chosen": -1516407296.0, "eval_logits/rejected": -1264482816.0, "eval_logps/chosen": -736.205838693716, "eval_logps/rejected": -1167.2123321730483, "eval_loss": 0.45225000381469727, "eval_rewards/chosen": -4.043295398317665, "eval_rewards/margins": 4.673263527589843, "eval_rewards/rejected": -8.716558925907508, "eval_runtime": 93.433, "eval_samples_per_second": 42.811, "eval_steps_per_second": 0.674, "step": 2900 }, { "epoch": 1.5231614760533891, "grad_norm": 0.6807872514271341, "kl": 0.0, "learning_rate": 2.8664730985699534e-05, "logits/chosen": -689438720.0, "logits/rejected": -629669888.0, "logps/chosen": -672.8971962616822, "logps/rejected": -1098.5329153605016, "loss": 0.352, "rewards/chosen": -3.6897877725856696, "rewards/margins": 4.076669907665114, "rewards/rejected": -7.766457680250784, "step": 2910 }, { "epoch": 1.5283957079298613, "grad_norm": 1.004264687962567, "kl": 0.0, "learning_rate": 2.851401162619929e-05, "logits/chosen": -740504384.0, "logits/rejected": -671612928.0, "logps/chosen": -581.166144200627, "logps/rejected": -1205.632398753894, "loss": 0.34, "rewards/chosen": -2.5907866379310347, "rewards/margins": 6.76785822188205, "rewards/rejected": -9.358644859813085, "step": 2920 }, { "epoch": 1.5336299398063336, "grad_norm": 0.5686639443419601, "kl": 0.0, "learning_rate": 2.836316185776846e-05, "logits/chosen": -855742848.0, "logits/rejected": -828794496.0, "logps/chosen": -669.1282051282051, "logps/rejected": -1336.3902439024391, "loss": 0.341, "rewards/chosen": -3.5234375, "rewards/margins": 6.859184451219512, "rewards/rejected": -10.382621951219512, "step": 2930 }, { "epoch": 1.5388641716828055, "grad_norm": 1.0145324073512791, "kl": 0.0, "learning_rate": 2.8212187278611906e-05, "logits/chosen": -950429312.0, "logits/rejected": -835610240.0, "logps/chosen": -832.0984615384615, "logps/rejected": -1331.911111111111, "loss": 0.392, "rewards/chosen": -5.184615384615385, "rewards/margins": 5.361813186813186, "rewards/rejected": -10.54642857142857, "step": 2940 }, { "epoch": 1.5440984035592775, "grad_norm": 0.6405221763005252, "kl": 0.0, "learning_rate": 2.8061093491566364e-05, "logits/chosen": -961124736.0, "logits/rejected": -918972032.0, "logps/chosen": -732.5749235474007, "logps/rejected": -1326.3130990415336, "loss": 0.3448, "rewards/chosen": -3.912461773700306, "rewards/margins": 6.49728263524538, "rewards/rejected": -10.409744408945686, "step": 2950 }, { "epoch": 1.54933263543575, "grad_norm": 0.54622495653271, "kl": 0.0, "learning_rate": 2.7909886103892508e-05, "logits/chosen": -992372352.0, "logits/rejected": -994469504.0, "logps/chosen": -473.14285714285717, "logps/rejected": -818.3138461538462, "loss": 0.3316, "rewards/chosen": -1.2914186507936507, "rewards/margins": 4.099735195360195, "rewards/rejected": -5.391153846153846, "step": 2960 }, { "epoch": 1.554566867312222, "grad_norm": 0.5518051451207503, "kl": 0.0, "learning_rate": 2.775857072706684e-05, "logits/chosen": -1027185024.0, "logits/rejected": -1086324736.0, "logps/chosen": -461.2987012987013, "logps/rejected": -758.3614457831326, "loss": 0.3422, "rewards/chosen": -1.1163250811688312, "rewards/margins": 3.468012268228759, "rewards/rejected": -4.5843373493975905, "step": 2970 }, { "epoch": 1.559801099188694, "grad_norm": 0.9684158437169723, "kl": 0.0, "learning_rate": 2.7607152976573485e-05, "logits/chosen": -1112119680.0, "logits/rejected": -1002858112.0, "logps/chosen": -538.3809523809524, "logps/rejected": -908.421052631579, "loss": 0.3674, "rewards/chosen": -2.1278831845238093, "rewards/margins": 3.815373394423559, "rewards/rejected": -5.943256578947368, "step": 2980 }, { "epoch": 1.5650353310651663, "grad_norm": 0.4735573674201958, "kl": 0.0, "learning_rate": 2.745563847169577e-05, "logits/chosen": -1047946880.0, "logits/rejected": -963431616.0, "logps/chosen": -543.1272727272727, "logps/rejected": -919.9483870967742, "loss": 0.351, "rewards/chosen": -2.0688920454545454, "rewards/margins": 4.311753115835778, "rewards/rejected": -6.380645161290323, "step": 2990 }, { "epoch": 1.5702695629416383, "grad_norm": 0.5342128661104129, "kl": 0.0, "learning_rate": 2.730403283530767e-05, "logits/chosen": -1017957568.0, "logits/rejected": -1011246720.0, "logps/chosen": -481.62025316455697, "logps/rejected": -906.4691358024692, "loss": 0.3399, "rewards/chosen": -1.7228540348101267, "rewards/margins": 4.141343496054072, "rewards/rejected": -5.864197530864198, "step": 3000 }, { "epoch": 1.5702695629416383, "eval_kl": 0.0, "eval_logits/chosen": -2308331776.0, "eval_logits/rejected": -2173847808.0, "eval_logps/chosen": -634.2681840672934, "eval_logps/rejected": -983.0730979612133, "eval_loss": 0.44966405630111694, "eval_rewards/chosen": -3.022389905987135, "eval_rewards/margins": 3.854785628572785, "eval_rewards/rejected": -6.87717553455992, "eval_runtime": 93.4395, "eval_samples_per_second": 42.808, "eval_steps_per_second": 0.674, "step": 3000 }, { "epoch": 1.5755037948181103, "grad_norm": 0.7511793567904532, "kl": 0.0, "learning_rate": 2.7152341693665157e-05, "logits/chosen": -1030540480.0, "logits/rejected": -1036202816.0, "logps/chosen": -629.9733333333334, "logps/rejected": -1026.070588235294, "loss": 0.3273, "rewards/chosen": -2.8218229166666666, "rewards/margins": 4.504647671568627, "rewards/rejected": -7.326470588235294, "step": 3010 }, { "epoch": 1.5807380266945825, "grad_norm": 0.7390161397159264, "kl": 0.0, "learning_rate": 2.700057067619741e-05, "logits/chosen": -1011351552.0, "logits/rejected": -1045220544.0, "logps/chosen": -522.3741935483871, "logps/rejected": -1024.7757575757576, "loss": 0.3266, "rewards/chosen": -2.0369329637096776, "rewards/margins": 5.278218551441838, "rewards/rejected": -7.315151515151515, "step": 3020 }, { "epoch": 1.5859722585710547, "grad_norm": 0.4565756908370157, "kl": 0.0, "learning_rate": 2.6848725415297887e-05, "logits/chosen": -1065982336.0, "logits/rejected": -926941184.0, "logps/chosen": -619.4922600619195, "logps/rejected": -1133.1230283911673, "loss": 0.3562, "rewards/chosen": -2.773171439628483, "rewards/margins": 5.447648749645966, "rewards/rejected": -8.220820189274448, "step": 3030 }, { "epoch": 1.5912064904475267, "grad_norm": 0.4524101511224128, "kl": 0.0, "learning_rate": 2.6696811546115296e-05, "logits/chosen": -977063104.0, "logits/rejected": -975699968.0, "logps/chosen": -612.5566343042071, "logps/rejected": -1335.4924471299094, "loss": 0.343, "rewards/chosen": -3.0539037216828477, "rewards/margins": 7.356217124238602, "rewards/rejected": -10.41012084592145, "step": 3040 }, { "epoch": 1.596440722323999, "grad_norm": 0.46571336835876526, "kl": 0.0, "learning_rate": 2.6544834706344478e-05, "logits/chosen": -1069128064.0, "logits/rejected": -1019425600.0, "logps/chosen": -542.6876971608833, "logps/rejected": -1155.9628482972137, "loss": 0.334, "rewards/chosen": -2.3101340694006307, "rewards/margins": 6.342342710785127, "rewards/rejected": -8.652476780185758, "step": 3050 }, { "epoch": 1.6016749542004711, "grad_norm": 0.49672912228082994, "kl": 0.0, "learning_rate": 2.6392800536017187e-05, "logits/chosen": -1069757248.0, "logits/rejected": -947493248.0, "logps/chosen": -577.8650306748466, "logps/rejected": -1023.1847133757962, "loss": 0.3375, "rewards/chosen": -2.266823236196319, "rewards/margins": 4.895597145969286, "rewards/rejected": -7.162420382165605, "step": 3060 }, { "epoch": 1.606909186076943, "grad_norm": 0.45133476462395533, "kl": 0.0, "learning_rate": 2.6240714677292765e-05, "logits/chosen": -1005164928.0, "logits/rejected": -1023200448.0, "logps/chosen": -487.73856209150324, "logps/rejected": -825.3892215568862, "loss": 0.3276, "rewards/chosen": -1.6173917483660132, "rewards/margins": 3.4671890899573405, "rewards/rejected": -5.084580838323354, "step": 3070 }, { "epoch": 1.6121434179534153, "grad_norm": 0.5101297454841202, "kl": 0.0, "learning_rate": 2.60885827742488e-05, "logits/chosen": -1089470464.0, "logits/rejected": -903033664.0, "logps/chosen": -445.44785276073617, "logps/rejected": -756.28025477707, "loss": 0.3426, "rewards/chosen": -0.963957055214724, "rewards/margins": 3.73269899574069, "rewards/rejected": -4.696656050955414, "step": 3080 }, { "epoch": 1.6173776498298875, "grad_norm": 0.6709928988481924, "kl": 0.0, "learning_rate": 2.5936410472671603e-05, "logits/chosen": -1081081856.0, "logits/rejected": -948122432.0, "logps/chosen": -527.4, "logps/rejected": -908.6, "loss": 0.3432, "rewards/chosen": -1.796337890625, "rewards/margins": 4.245458984375, "rewards/rejected": -6.041796875, "step": 3090 }, { "epoch": 1.6226118817063595, "grad_norm": 0.525355527958997, "kl": 0.0, "learning_rate": 2.5784203419846742e-05, "logits/chosen": -939314368.0, "logits/rejected": -838651072.0, "logps/chosen": -531.1168831168832, "logps/rejected": -942.0722891566265, "loss": 0.3429, "rewards/chosen": -1.7687702922077921, "rewards/margins": 4.598323081286184, "rewards/rejected": -6.367093373493976, "step": 3100 }, { "epoch": 1.6226118817063595, "eval_kl": 0.0, "eval_logits/chosen": -2065728000.0, "eval_logits/rejected": -1775322368.0, "eval_logps/chosen": -641.7100445324097, "eval_logps/rejected": -945.901541521631, "eval_loss": 0.4497109353542328, "eval_rewards/chosen": -3.100816427511133, "eval_rewards/margins": 3.405150752996077, "eval_rewards/rejected": -6.50596718050721, "eval_runtime": 94.217, "eval_samples_per_second": 42.455, "eval_steps_per_second": 0.669, "step": 3100 }, { "epoch": 1.6278461135828317, "grad_norm": 0.35519814669807354, "kl": 0.0, "learning_rate": 2.5631967264349423e-05, "logits/chosen": -952107008.0, "logits/rejected": -813904704.0, "logps/chosen": -604.4012158054711, "logps/rejected": -835.8070739549839, "loss": 0.3653, "rewards/chosen": -2.898936170212766, "rewards/margins": 2.683057398919067, "rewards/rejected": -5.581993569131833, "step": 3110 }, { "epoch": 1.633080345459304, "grad_norm": 0.6880022249793109, "kl": 0.0, "learning_rate": 2.5479707655834912e-05, "logits/chosen": -1054448000.0, "logits/rejected": -871786112.0, "logps/chosen": -546.5919003115265, "logps/rejected": -1034.833855799373, "loss": 0.3434, "rewards/chosen": -2.2242017133956384, "rewards/margins": 5.286770073438217, "rewards/rejected": -7.5109717868338555, "step": 3120 }, { "epoch": 1.638314577335776, "grad_norm": 0.5447475519276055, "kl": 0.0, "learning_rate": 2.5327430244828815e-05, "logits/chosen": -1043333120.0, "logits/rejected": -910583424.0, "logps/chosen": -572.4668769716088, "logps/rejected": -853.8947368421053, "loss": 0.3362, "rewards/chosen": -2.3772180599369084, "rewards/margins": 3.363494014366497, "rewards/rejected": -5.7407120743034055, "step": 3130 }, { "epoch": 1.643548809212248, "grad_norm": 0.6308208204607763, "kl": 0.0, "learning_rate": 2.517514068251743e-05, "logits/chosen": -1020054720.0, "logits/rejected": -832988800.0, "logps/chosen": -511.7037037037037, "logps/rejected": -914.8354430379746, "loss": 0.3374, "rewards/chosen": -1.7274305555555556, "rewards/margins": 4.221936533052039, "rewards/rejected": -5.949367088607595, "step": 3140 }, { "epoch": 1.6487830410887203, "grad_norm": 0.6056564587784243, "kl": 0.0, "learning_rate": 2.5022844620537988e-05, "logits/chosen": -913729152.0, "logits/rejected": -808347264.0, "logps/chosen": -477.1003236245955, "logps/rejected": -800.3867069486405, "loss": 0.3335, "rewards/chosen": -1.2809971682847896, "rewards/margins": 3.9659816836789563, "rewards/rejected": -5.246978851963746, "step": 3150 }, { "epoch": 1.6540172729651923, "grad_norm": 0.4584347654412988, "kl": 0.0, "learning_rate": 2.487054771076893e-05, "logits/chosen": -985661440.0, "logits/rejected": -875980416.0, "logps/chosen": -404.6792452830189, "logps/rejected": -746.0372670807453, "loss": 0.3279, "rewards/chosen": -0.7281102594339622, "rewards/margins": 3.5040325977088944, "rewards/rejected": -4.232142857142857, "step": 3160 }, { "epoch": 1.6592515048416645, "grad_norm": 0.950559110059267, "kl": 0.0, "learning_rate": 2.4718255605120185e-05, "logits/chosen": -984612864.0, "logits/rejected": -867801472.0, "logps/chosen": -429.6149068322981, "logps/rejected": -722.2138364779875, "loss": 0.3212, "rewards/chosen": -0.7463485054347826, "rewards/margins": 3.5897364002255947, "rewards/rejected": -4.336084905660377, "step": 3170 }, { "epoch": 1.6644857367181367, "grad_norm": 0.5945415890746549, "kl": 0.0, "learning_rate": 2.456597395532338e-05, "logits/chosen": -1107505920.0, "logits/rejected": -908276544.0, "logps/chosen": -428.4179104477612, "logps/rejected": -751.1081967213115, "loss": 0.3311, "rewards/chosen": -0.7750932835820895, "rewards/margins": 3.832693601663812, "rewards/rejected": -4.607786885245901, "step": 3180 }, { "epoch": 1.6697199685946087, "grad_norm": 0.6507764293651568, "kl": 0.0, "learning_rate": 2.4413708412722084e-05, "logits/chosen": -1060739456.0, "logits/rejected": -942879552.0, "logps/chosen": -403.55555555555554, "logps/rejected": -760.3037974683544, "loss": 0.3207, "rewards/chosen": -0.513695987654321, "rewards/margins": 4.264785025003906, "rewards/rejected": -4.7784810126582276, "step": 3190 }, { "epoch": 1.674954200471081, "grad_norm": 0.6257739087742488, "kl": 0.0, "learning_rate": 2.4261464628062143e-05, "logits/chosen": -1028862784.0, "logits/rejected": -1017538176.0, "logps/chosen": -414.36421725239614, "logps/rejected": -792.269113149847, "loss": 0.3005, "rewards/chosen": -0.7960637979233227, "rewards/margins": 3.907300116449766, "rewards/rejected": -4.703363914373089, "step": 3200 }, { "epoch": 1.674954200471081, "eval_kl": 0.0, "eval_logits/chosen": -2172250112.0, "eval_logits/rejected": -2024051328.0, "eval_logps/chosen": -515.8317664522514, "eval_logps/rejected": -719.3396320238687, "eval_loss": 0.44920703768730164, "eval_rewards/chosen": -1.8422191984166254, "eval_rewards/margins": 2.399824560907094, "eval_rewards/rejected": -4.242043759323719, "eval_runtime": 93.4548, "eval_samples_per_second": 42.801, "eval_steps_per_second": 0.674, "step": 3200 }, { "epoch": 1.680188432347553, "grad_norm": 1.0328000427510753, "kl": 0.0, "learning_rate": 2.410924825128195e-05, "logits/chosen": -1078565248.0, "logits/rejected": -850290304.0, "logps/chosen": -457.85714285714283, "logps/rejected": -658.5263157894736, "loss": 0.3694, "rewards/chosen": -1.2154947916666667, "rewards/margins": 2.6738966557017543, "rewards/rejected": -3.8893914473684212, "step": 3210 }, { "epoch": 1.685422664224025, "grad_norm": 0.5208279156175792, "kl": 0.0, "learning_rate": 2.395706493130274e-05, "logits/chosen": -1064094912.0, "logits/rejected": -793457472.0, "logps/chosen": -413.5420289855073, "logps/rejected": -859.8779661016949, "loss": 0.3487, "rewards/chosen": -0.7851222826086957, "rewards/margins": 4.839877717391304, "rewards/rejected": -5.625, "step": 3220 }, { "epoch": 1.6906568961004973, "grad_norm": 0.6530601088926155, "kl": 0.0, "learning_rate": 2.380492031581897e-05, "logits/chosen": -927360640.0, "logits/rejected": -874722112.0, "logps/chosen": -456.7741935483871, "logps/rejected": -829.2848484848485, "loss": 0.3305, "rewards/chosen": -1.384375, "rewards/margins": 3.946685606060606, "rewards/rejected": -5.331060606060606, "step": 3230 }, { "epoch": 1.6958911279769695, "grad_norm": 0.7857055522080945, "kl": 0.0, "learning_rate": 2.365282005108875e-05, "logits/chosen": -938056064.0, "logits/rejected": -797232320.0, "logps/chosen": -475.49847094801225, "logps/rejected": -878.3130990415335, "loss": 0.3524, "rewards/chosen": -1.4733252102446484, "rewards/margins": 4.301435173141933, "rewards/rejected": -5.774760383386582, "step": 3240 }, { "epoch": 1.7011253598534415, "grad_norm": 1.3501713758787808, "kl": 0.0, "learning_rate": 2.3500769781724256e-05, "logits/chosen": -1067764928.0, "logits/rejected": -812226944.0, "logps/chosen": -481.3872832369942, "logps/rejected": -860.1904761904761, "loss": 0.3503, "rewards/chosen": -1.6231484826589595, "rewards/margins": 3.704232469721993, "rewards/rejected": -5.3273809523809526, "step": 3250 }, { "epoch": 1.7063595917299135, "grad_norm": 0.7060478361762077, "kl": 0.0, "learning_rate": 2.334877515048231e-05, "logits/chosen": -1088002432.0, "logits/rejected": -968255104.0, "logps/chosen": -501.98083067092654, "logps/rejected": -789.4311926605504, "loss": 0.3216, "rewards/chosen": -1.5131165135782747, "rewards/margins": 3.3890241592046, "rewards/rejected": -4.902140672782875, "step": 3260 }, { "epoch": 1.711593823606386, "grad_norm": 0.7469032114353387, "kl": 0.0, "learning_rate": 2.319684179805491e-05, "logits/chosen": -1021732480.0, "logits/rejected": -1027709312.0, "logps/chosen": -580.3540983606557, "logps/rejected": -993.6238805970149, "loss": 0.3386, "rewards/chosen": -2.4837090163934428, "rewards/margins": 4.50733575972596, "rewards/rejected": -6.991044776119403, "step": 3270 }, { "epoch": 1.7168280554828579, "grad_norm": 0.8433420902642595, "kl": 0.0, "learning_rate": 2.304497536285996e-05, "logits/chosen": -988387712.0, "logits/rejected": -944137856.0, "logps/chosen": -700.6984126984127, "logps/rejected": -1110.843076923077, "loss": 0.3698, "rewards/chosen": -3.7896329365079366, "rewards/margins": 4.231905525030525, "rewards/rejected": -8.021538461538462, "step": 3280 }, { "epoch": 1.7220622873593299, "grad_norm": 0.684092268488129, "kl": 0.0, "learning_rate": 2.289318148083196e-05, "logits/chosen": -1000551232.0, "logits/rejected": -938685248.0, "logps/chosen": -844.1165048543689, "logps/rejected": -1529.0392749244713, "loss": 0.3529, "rewards/chosen": -4.929510517799352, "rewards/margins": 7.559160177064696, "rewards/rejected": -12.488670694864048, "step": 3290 }, { "epoch": 1.7272965192358023, "grad_norm": 0.6694204511833904, "kl": 0.0, "learning_rate": 2.2741465785212905e-05, "logits/chosen": -991323776.0, "logits/rejected": -970352256.0, "logps/chosen": -771.3795379537954, "logps/rejected": -1320.0712166172107, "loss": 0.3468, "rewards/chosen": -4.449824669966997, "rewards/margins": 5.689641205403923, "rewards/rejected": -10.13946587537092, "step": 3300 }, { "epoch": 1.7272965192358023, "eval_kl": 0.0, "eval_logits/chosen": -2299277568.0, "eval_logits/rejected": -2052279552.0, "eval_logps/chosen": -650.1019297377536, "eval_logps/rejected": -1102.9895574341124, "eval_loss": 0.4503124952316284, "eval_rewards/chosen": -3.182088075210292, "eval_rewards/margins": 4.895236638862309, "eval_rewards/rejected": -8.077324714072601, "eval_runtime": 93.468, "eval_samples_per_second": 42.795, "eval_steps_per_second": 0.674, "step": 3300 }, { "epoch": 1.7325307511122743, "grad_norm": 0.5936884040448765, "kl": 0.0, "learning_rate": 2.2589833906343182e-05, "logits/chosen": -1130155264.0, "logits/rejected": -893386752.0, "logps/chosen": -524.2017804154302, "logps/rejected": -1324.039603960396, "loss": 0.3446, "rewards/chosen": -1.9497751298219586, "rewards/margins": 8.34436678436946, "rewards/rejected": -10.29414191419142, "step": 3310 }, { "epoch": 1.7377649829887463, "grad_norm": 0.8087649219172015, "kl": 0.0, "learning_rate": 2.2438291471452667e-05, "logits/chosen": -1134559232.0, "logits/rejected": -969093952.0, "logps/chosen": -652.5679758308157, "logps/rejected": -1052.2718446601941, "loss": 0.3799, "rewards/chosen": -3.147092145015106, "rewards/margins": 4.507438599321464, "rewards/rejected": -7.6545307443365695, "step": 3320 }, { "epoch": 1.7429992148652187, "grad_norm": 0.6066142385094927, "kl": 0.0, "learning_rate": 2.2286844104451846e-05, "logits/chosen": -1056125760.0, "logits/rejected": -890136192.0, "logps/chosen": -528.9268292682926, "logps/rejected": -1025.948717948718, "loss": 0.353, "rewards/chosen": -2.246189024390244, "rewards/margins": 5.228971232020013, "rewards/rejected": -7.475160256410256, "step": 3330 }, { "epoch": 1.7482334467416907, "grad_norm": 0.6316286497085969, "kl": 0.0, "learning_rate": 2.213549742572314e-05, "logits/chosen": -1060110336.0, "logits/rejected": -960915072.0, "logps/chosen": -551.7435897435897, "logps/rejected": -958.829268292683, "loss": 0.3362, "rewards/chosen": -2.3900741185897436, "rewards/margins": 4.085916735068793, "rewards/rejected": -6.475990853658536, "step": 3340 }, { "epoch": 1.7534676786181627, "grad_norm": 0.6928407887366415, "kl": 0.0, "learning_rate": 2.1984257051912326e-05, "logits/chosen": -974756224.0, "logits/rejected": -900936512.0, "logps/chosen": -547.4760383386581, "logps/rejected": -911.65749235474, "loss": 0.3315, "rewards/chosen": -2.032947284345048, "rewards/margins": 3.9081842141258996, "rewards/rejected": -5.941131498470948, "step": 3350 }, { "epoch": 1.7587019104946349, "grad_norm": 0.6287104207098294, "kl": 0.0, "learning_rate": 2.183312859572008e-05, "logits/chosen": -924057600.0, "logits/rejected": -831940224.0, "logps/chosen": -547.1898734177215, "logps/rejected": -1009.3827160493827, "loss": 0.3402, "rewards/chosen": -1.9961926424050633, "rewards/margins": 5.153498715619628, "rewards/rejected": -7.1496913580246915, "step": 3360 }, { "epoch": 1.763936142371107, "grad_norm": 1.309187284373212, "kl": 0.0, "learning_rate": 2.1682117665693663e-05, "logits/chosen": -941306688.0, "logits/rejected": -866333504.0, "logps/chosen": -594.6750788643533, "logps/rejected": -968.8173374613003, "loss": 0.3435, "rewards/chosen": -2.580515575709779, "rewards/margins": 4.067317241627682, "rewards/rejected": -6.647832817337461, "step": 3370 }, { "epoch": 1.769170374247579, "grad_norm": 0.8941842404807043, "kl": 0.0, "learning_rate": 2.1531229866018832e-05, "logits/chosen": -918762304.0, "logits/rejected": -699819648.0, "logps/chosen": -514.4539877300614, "logps/rejected": -1172.7898089171974, "loss": 0.3531, "rewards/chosen": -1.924079754601227, "rewards/margins": 7.1101559141885815, "rewards/rejected": -9.034235668789808, "step": 3380 }, { "epoch": 1.7744046061240513, "grad_norm": 0.7075290388583574, "kl": 0.0, "learning_rate": 2.1380470796311843e-05, "logits/chosen": -899258752.0, "logits/rejected": -794086592.0, "logps/chosen": -649.0864197530864, "logps/rejected": -1344.607594936709, "loss": 0.3448, "rewards/chosen": -3.271291473765432, "rewards/margins": 7.443898399652289, "rewards/rejected": -10.715189873417721, "step": 3390 }, { "epoch": 1.7796388380005235, "grad_norm": 0.784185262384793, "kl": 0.0, "learning_rate": 2.1229846051411624e-05, "logits/chosen": -947702976.0, "logits/rejected": -786851456.0, "logps/chosen": -541.416149068323, "logps/rejected": -1070.691823899371, "loss": 0.3361, "rewards/chosen": -1.9438082298136645, "rewards/margins": 5.612009380249228, "rewards/rejected": -7.555817610062893, "step": 3400 }, { "epoch": 1.7796388380005235, "eval_kl": 0.0, "eval_logits/chosen": -1953080960.0, "eval_logits/rejected": -1726854912.0, "eval_logps/chosen": -586.8936170212766, "eval_logps/rejected": -953.3485827946296, "eval_loss": 0.44880467653274536, "eval_rewards/chosen": -2.550377288471054, "eval_rewards/margins": 4.0314228109819545, "eval_rewards/rejected": -6.581800099453009, "eval_runtime": 93.4545, "eval_samples_per_second": 42.802, "eval_steps_per_second": 0.674, "step": 3400 }, { "epoch": 1.7848730698769955, "grad_norm": 0.6417707088331657, "kl": 0.0, "learning_rate": 2.1079361221172168e-05, "logits/chosen": -876924096.0, "logits/rejected": -800273216.0, "logps/chosen": -555.4069400630915, "logps/rejected": -1011.8142414860681, "loss": 0.3365, "rewards/chosen": -2.269469637223975, "rewards/margins": 5.012264108906056, "rewards/rejected": -7.281733746130031, "step": 3410 }, { "epoch": 1.7901073017534677, "grad_norm": 0.6562900236984517, "kl": 0.0, "learning_rate": 2.092902189025507e-05, "logits/chosen": -968674496.0, "logits/rejected": -715653120.0, "logps/chosen": -531.7701149425287, "logps/rejected": -1166.6301369863013, "loss": 0.3455, "rewards/chosen": -2.1996901939655173, "rewards/margins": 6.43986460055503, "rewards/rejected": -8.639554794520548, "step": 3420 }, { "epoch": 1.7953415336299399, "grad_norm": 0.8218911355557875, "kl": 0.0, "learning_rate": 2.0778833637922277e-05, "logits/chosen": -863607168.0, "logits/rejected": -816840704.0, "logps/chosen": -521.4545454545455, "logps/rejected": -1002.2168674698795, "loss": 0.3291, "rewards/chosen": -1.9117352374188312, "rewards/margins": 5.367632232460687, "rewards/rejected": -7.279367469879518, "step": 3430 }, { "epoch": 1.8005757655064119, "grad_norm": 0.584855714512457, "kl": 0.0, "learning_rate": 2.0628802037829047e-05, "logits/chosen": -910583424.0, "logits/rejected": -787166016.0, "logps/chosen": -515.0868167202573, "logps/rejected": -1014.468085106383, "loss": 0.3294, "rewards/chosen": -1.6524567926045015, "rewards/margins": 5.78219366332255, "rewards/rejected": -7.434650455927052, "step": 3440 }, { "epoch": 1.805809997382884, "grad_norm": 0.5836628556756525, "kl": 0.0, "learning_rate": 2.0478932657817105e-05, "logits/chosen": -852702016.0, "logits/rejected": -765250752.0, "logps/chosen": -664.28664495114, "logps/rejected": -1122.4024024024025, "loss": 0.3339, "rewards/chosen": -3.09441164495114, "rewards/margins": 5.129312078772584, "rewards/rejected": -8.223723723723724, "step": 3450 }, { "epoch": 1.8110442292593563, "grad_norm": 0.5760216909652937, "kl": 0.0, "learning_rate": 2.0329231059707986e-05, "logits/chosen": -732325504.0, "logits/rejected": -773219968.0, "logps/chosen": -588.6896551724138, "logps/rejected": -990.72, "loss": 0.3215, "rewards/chosen": -2.666971982758621, "rewards/margins": 4.308028017241378, "rewards/rejected": -6.975, "step": 3460 }, { "epoch": 1.8162784611358282, "grad_norm": 0.7342083500831641, "kl": 0.0, "learning_rate": 2.017970279909667e-05, "logits/chosen": -782552256.0, "logits/rejected": -720476544.0, "logps/chosen": -540.6792452830189, "logps/rejected": -1125.5652173913043, "loss": 0.3382, "rewards/chosen": -2.103355935534591, "rewards/margins": 5.864035368813235, "rewards/rejected": -7.967391304347826, "step": 3470 }, { "epoch": 1.8215126930123005, "grad_norm": 1.3753140883742832, "kl": 0.0, "learning_rate": 2.0030353425145378e-05, "logits/chosen": -791255424.0, "logits/rejected": -629565056.0, "logps/chosen": -527.0, "logps/rejected": -930.8, "loss": 0.351, "rewards/chosen": -1.91419677734375, "rewards/margins": 4.7514282226562505, "rewards/rejected": -6.665625, "step": 3480 }, { "epoch": 1.8267469248887727, "grad_norm": 0.6736862587298565, "kl": 0.0, "learning_rate": 1.9881188480377632e-05, "logits/chosen": -787795136.0, "logits/rejected": -726243712.0, "logps/chosen": -518.7313915857605, "logps/rejected": -847.2749244712991, "loss": 0.3406, "rewards/chosen": -1.9428094660194175, "rewards/margins": 3.7622509569413074, "rewards/rejected": -5.705060422960725, "step": 3490 }, { "epoch": 1.8319811567652446, "grad_norm": 1.106559380657363, "kl": 0.0, "learning_rate": 1.9732213500472605e-05, "logits/chosen": -802999488.0, "logits/rejected": -650012288.0, "logps/chosen": -464.7463556851312, "logps/rejected": -869.0639730639731, "loss": 0.3405, "rewards/chosen": -1.4682944606413995, "rewards/margins": 4.312850320503381, "rewards/rejected": -5.781144781144781, "step": 3500 }, { "epoch": 1.8319811567652446, "eval_kl": 0.0, "eval_logits/chosen": -1600359936.0, "eval_logits/rejected": -1368940928.0, "eval_logps/chosen": -595.8555170707571, "eval_logps/rejected": -917.2272501243162, "eval_loss": 0.447328120470047, "eval_rewards/chosen": -2.6416996536368136, "eval_rewards/margins": 3.5805778202567713, "eval_rewards/rejected": -6.222277473893585, "eval_runtime": 93.4479, "eval_samples_per_second": 42.805, "eval_steps_per_second": 0.674, "step": 3500 }, { "epoch": 1.8372153886417169, "grad_norm": 0.4772534610253572, "kl": 0.0, "learning_rate": 1.9583434014059638e-05, "logits/chosen": -800797504.0, "logits/rejected": -647705408.0, "logps/chosen": -516.1904761904761, "logps/rejected": -1060.6315789473683, "loss": 0.3433, "rewards/chosen": -1.9356631324404763, "rewards/margins": 5.672067130717418, "rewards/rejected": -7.607730263157895, "step": 3510 }, { "epoch": 1.842449620518189, "grad_norm": 0.4797616930935301, "kl": 0.0, "learning_rate": 1.9434855542513106e-05, "logits/chosen": -823236992.0, "logits/rejected": -705796480.0, "logps/chosen": -444.1150159744409, "logps/rejected": -825.5412844036697, "loss": 0.34, "rewards/chosen": -1.3486796126198084, "rewards/margins": 3.9609534149031274, "rewards/rejected": -5.309633027522936, "step": 3520 }, { "epoch": 1.847683852394661, "grad_norm": 0.5695842896989155, "kl": 0.0, "learning_rate": 1.9286483599747475e-05, "logits/chosen": -722259136.0, "logits/rejected": -758015616.0, "logps/chosen": -514.8354430379746, "logps/rejected": -884.5432098765432, "loss": 0.351, "rewards/chosen": -2.095010878164557, "rewards/margins": 3.8100817144280357, "rewards/rejected": -5.905092592592593, "step": 3530 }, { "epoch": 1.8529180842711332, "grad_norm": 0.6900510116894868, "kl": 0.0, "learning_rate": 1.9138323692012737e-05, "logits/chosen": -784439680.0, "logits/rejected": -683252096.0, "logps/chosen": -511.39622641509436, "logps/rejected": -1062.1614906832299, "loss": 0.3223, "rewards/chosen": -1.662441037735849, "rewards/margins": 5.90471734735732, "rewards/rejected": -7.567158385093168, "step": 3540 }, { "epoch": 1.8581523161476055, "grad_norm": 0.9385097332615309, "kl": 0.0, "learning_rate": 1.8990381317689958e-05, "logits/chosen": -824809856.0, "logits/rejected": -664063168.0, "logps/chosen": -437.96941896024464, "logps/rejected": -783.8466453674122, "loss": 0.3498, "rewards/chosen": -1.254730504587156, "rewards/margins": 3.7029372270422374, "rewards/rejected": -4.957667731629393, "step": 3550 }, { "epoch": 1.8633865480240774, "grad_norm": 0.8998105100017849, "kl": 0.0, "learning_rate": 1.8842661967087353e-05, "logits/chosen": -801007232.0, "logits/rejected": -725719424.0, "logps/chosen": -453.1358024691358, "logps/rejected": -819.746835443038, "loss": 0.3526, "rewards/chosen": -1.3312596450617284, "rewards/margins": 3.7272846587357398, "rewards/rejected": -5.0585443037974684, "step": 3560 }, { "epoch": 1.8686207799005496, "grad_norm": 0.6166221805440664, "kl": 0.0, "learning_rate": 1.8695171122236444e-05, "logits/chosen": -735156608.0, "logits/rejected": -726872896.0, "logps/chosen": -470.7752442996743, "logps/rejected": -846.3183183183183, "loss": 0.3509, "rewards/chosen": -1.5342528501628665, "rewards/margins": 3.9041855882755714, "rewards/rejected": -5.438438438438438, "step": 3570 }, { "epoch": 1.8738550117770219, "grad_norm": 0.9064425131607848, "kl": 0.0, "learning_rate": 1.8547914256688663e-05, "logits/chosen": -914987392.0, "logits/rejected": -619288960.0, "logps/chosen": -581.008695652174, "logps/rejected": -964.2305084745763, "loss": 0.3741, "rewards/chosen": -2.208786231884058, "rewards/margins": 4.6098578359125515, "rewards/rejected": -6.81864406779661, "step": 3580 }, { "epoch": 1.8790892436534938, "grad_norm": 0.4694968385256991, "kl": 0.0, "learning_rate": 1.8400896835312208e-05, "logits/chosen": -803314048.0, "logits/rejected": -782342528.0, "logps/chosen": -627.4711864406779, "logps/rejected": -1034.3884057971015, "loss": 0.3389, "rewards/chosen": -2.8996822033898306, "rewards/margins": 4.351767071972488, "rewards/rejected": -7.251449275362319, "step": 3590 }, { "epoch": 1.8843234755299658, "grad_norm": 0.5385083556993564, "kl": 0.0, "learning_rate": 1.8254124314089223e-05, "logits/chosen": -786012544.0, "logits/rejected": -727921472.0, "logps/chosen": -592.3975155279503, "logps/rejected": -968.3522012578617, "loss": 0.362, "rewards/chosen": -2.4558423913043477, "rewards/margins": 4.180950061525841, "rewards/rejected": -6.636792452830188, "step": 3600 }, { "epoch": 1.8843234755299658, "eval_kl": 0.0, "eval_logits/chosen": -1622263552.0, "eval_logits/rejected": -1444837888.0, "eval_logps/chosen": -594.8738248391885, "eval_logps/rejected": -933.2352063649926, "eval_loss": 0.4472343623638153, "eval_rewards/chosen": -2.6311541316180107, "eval_rewards/margins": 3.747761830589846, "eval_rewards/rejected": -6.378915962207857, "eval_runtime": 93.4509, "eval_samples_per_second": 42.803, "eval_steps_per_second": 0.674, "step": 3600 }, { "epoch": 1.8895577074064382, "grad_norm": 0.7316473583926785, "kl": 0.0, "learning_rate": 1.810760213991332e-05, "logits/chosen": -709885952.0, "logits/rejected": -708417920.0, "logps/chosen": -562.8745762711865, "logps/rejected": -1033.4608695652173, "loss": 0.3495, "rewards/chosen": -2.3506091101694917, "rewards/margins": 4.948303933308769, "rewards/rejected": -7.298913043478261, "step": 3610 }, { "epoch": 1.8947919392829102, "grad_norm": 0.7645475500978628, "kl": 0.0, "learning_rate": 1.796133575038748e-05, "logits/chosen": -663434048.0, "logits/rejected": -674339200.0, "logps/chosen": -638.6709265175718, "logps/rejected": -1222.7522935779816, "loss": 0.3553, "rewards/chosen": -3.1315894568690097, "rewards/margins": 6.036606261785424, "rewards/rejected": -9.168195718654435, "step": 3620 }, { "epoch": 1.9000261711593822, "grad_norm": 0.49257289540325766, "kl": 0.0, "learning_rate": 1.781533057362221e-05, "logits/chosen": -701497344.0, "logits/rejected": -594228032.0, "logps/chosen": -531.6923076923077, "logps/rejected": -1016.3809523809524, "loss": 0.3361, "rewards/chosen": -2.2409375, "rewards/margins": 4.999935515873016, "rewards/rejected": -7.2408730158730155, "step": 3630 }, { "epoch": 1.9052604030358546, "grad_norm": 0.8198249770364769, "kl": 0.0, "learning_rate": 1.7669592028034116e-05, "logits/chosen": -755289280.0, "logits/rejected": -648386944.0, "logps/chosen": -473.2682926829268, "logps/rejected": -1005.5384615384615, "loss": 0.3421, "rewards/chosen": -1.4224942835365855, "rewards/margins": 5.542649947232645, "rewards/rejected": -6.965144230769231, "step": 3640 }, { "epoch": 1.9104946349123266, "grad_norm": 0.9264357764615075, "kl": 0.0, "learning_rate": 1.7524125522144826e-05, "logits/chosen": -701602176.0, "logits/rejected": -645922816.0, "logps/chosen": -545.9804560260586, "logps/rejected": -965.957957957958, "loss": 0.3238, "rewards/chosen": -2.004911441368078, "rewards/margins": 4.538632102175465, "rewards/rejected": -6.543543543543543, "step": 3650 }, { "epoch": 1.9157288667887986, "grad_norm": 0.47892161569097425, "kl": 0.0, "learning_rate": 1.7378936454380276e-05, "logits/chosen": -742601536.0, "logits/rejected": -597478592.0, "logps/chosen": -566.4191616766467, "logps/rejected": -995.7647058823529, "loss": 0.351, "rewards/chosen": -2.3877245508982035, "rewards/margins": 4.476654534069116, "rewards/rejected": -6.86437908496732, "step": 3660 }, { "epoch": 1.9209630986652708, "grad_norm": 0.5930608926989867, "kl": 0.0, "learning_rate": 1.7234030212870334e-05, "logits/chosen": -670564352.0, "logits/rejected": -624427008.0, "logps/chosen": -476.1471571906354, "logps/rejected": -1045.208211143695, "loss": 0.3098, "rewards/chosen": -1.4977529264214047, "rewards/margins": 6.181865841907041, "rewards/rejected": -7.679618768328446, "step": 3670 }, { "epoch": 1.926197330541743, "grad_norm": 0.4877603718353012, "kl": 0.0, "learning_rate": 1.7089412175248896e-05, "logits/chosen": -703070208.0, "logits/rejected": -640575104.0, "logps/chosen": -491.5032679738562, "logps/rejected": -966.8023952095808, "loss": 0.3282, "rewards/chosen": -1.5397263071895424, "rewards/margins": 5.183327585026027, "rewards/rejected": -6.723053892215569, "step": 3680 }, { "epoch": 1.931431562418215, "grad_norm": 0.7344807590979141, "kl": 0.0, "learning_rate": 1.694508770845427e-05, "logits/chosen": -740084928.0, "logits/rejected": -586835584.0, "logps/chosen": -526.8571428571429, "logps/rejected": -891.2631578947369, "loss": 0.3558, "rewards/chosen": -2.110932849702381, "rewards/margins": 3.780925702929198, "rewards/rejected": -5.891858552631579, "step": 3690 }, { "epoch": 1.9366657942946872, "grad_norm": 0.5681820063641687, "kl": 0.0, "learning_rate": 1.680106216853003e-05, "logits/chosen": -702545920.0, "logits/rejected": -728865152.0, "logps/chosen": -476.7676767676768, "logps/rejected": -914.0991253644315, "loss": 0.3153, "rewards/chosen": -1.3383838383838385, "rewards/margins": 4.622257561033071, "rewards/rejected": -5.960641399416909, "step": 3700 }, { "epoch": 1.9366657942946872, "eval_kl": 0.0, "eval_logits/chosen": -1449431680.0, "eval_logits/rejected": -1236653952.0, "eval_logps/chosen": -548.1009401286492, "eval_logps/rejected": -828.2764793635007, "eval_loss": 0.4481757879257202, "eval_rewards/chosen": -2.1611207323107373, "eval_rewards/margins": 3.175030436262112, "eval_rewards/rejected": -5.336151168572849, "eval_runtime": 93.4468, "eval_samples_per_second": 42.805, "eval_steps_per_second": 0.674, "step": 3700 }, { "epoch": 1.9419000261711594, "grad_norm": 0.7862073138396892, "kl": 0.0, "learning_rate": 1.665734090042622e-05, "logits/chosen": -694681600.0, "logits/rejected": -642829504.0, "logps/chosen": -481.1210191082803, "logps/rejected": -1028.3190184049079, "loss": 0.3552, "rewards/chosen": -1.5101574940286624, "rewards/margins": 5.586468272842504, "rewards/rejected": -7.096625766871166, "step": 3710 }, { "epoch": 1.9471342580476314, "grad_norm": 0.8285344647326756, "kl": 0.0, "learning_rate": 1.651392923780105e-05, "logits/chosen": -724251456.0, "logits/rejected": -629145600.0, "logps/chosen": -501.58255451713393, "logps/rejected": -835.3103448275862, "loss": 0.3722, "rewards/chosen": -1.7075879818925233, "rewards/margins": 3.5847317673237775, "rewards/rejected": -5.2923197492163006, "step": 3720 }, { "epoch": 1.9523684899241036, "grad_norm": 0.6838021947504493, "kl": 0.0, "learning_rate": 1.637083250282288e-05, "logits/chosen": -716387136.0, "logits/rejected": -572889472.0, "logps/chosen": -504.85626911314984, "logps/rejected": -945.1757188498402, "loss": 0.3516, "rewards/chosen": -1.6721139143730888, "rewards/margins": 4.77756659680902, "rewards/rejected": -6.449680511182109, "step": 3730 }, { "epoch": 1.9576027218005758, "grad_norm": 0.49973063687202535, "kl": 0.0, "learning_rate": 1.6228056005972762e-05, "logits/chosen": -754450432.0, "logits/rejected": -652738560.0, "logps/chosen": -499.238670694864, "logps/rejected": -1019.4433656957929, "loss": 0.3441, "rewards/chosen": -1.5205343655589123, "rewards/margins": 5.6631225923698905, "rewards/rejected": -7.183656957928803, "step": 3740 }, { "epoch": 1.9628369536770478, "grad_norm": 0.7913259869671393, "kl": 0.0, "learning_rate": 1.6085605045847367e-05, "logits/chosen": -836553920.0, "logits/rejected": -634178752.0, "logps/chosen": -443.83492063492065, "logps/rejected": -1056.0984615384616, "loss": 0.3281, "rewards/chosen": -0.9071180555555556, "rewards/margins": 6.879035790598291, "rewards/rejected": -7.786153846153846, "step": 3750 }, { "epoch": 1.96807118555352, "grad_norm": 0.6164908641832416, "kl": 0.0, "learning_rate": 1.5943484908962325e-05, "logits/chosen": -712192832.0, "logits/rejected": -696044736.0, "logps/chosen": -521.3465346534654, "logps/rejected": -953.9228486646884, "loss": 0.35, "rewards/chosen": -2.1199431466584158, "rewards/margins": 4.52916664562645, "rewards/rejected": -6.649109792284866, "step": 3760 }, { "epoch": 1.9733054174299922, "grad_norm": 0.9149146447875485, "kl": 0.0, "learning_rate": 1.580170086955603e-05, "logits/chosen": -780769664.0, "logits/rejected": -724041728.0, "logps/chosen": -506.03821656050957, "logps/rejected": -1002.7975460122699, "loss": 0.3521, "rewards/chosen": -1.7051652070063694, "rewards/margins": 5.1656170015825875, "rewards/rejected": -6.870782208588957, "step": 3770 }, { "epoch": 1.9785396493064642, "grad_norm": 0.5360239915386263, "kl": 0.0, "learning_rate": 1.5660258189393946e-05, "logits/chosen": -726872896.0, "logits/rejected": -551079104.0, "logps/chosen": -592.5014245014245, "logps/rejected": -1048.3598615916956, "loss": 0.4024, "rewards/chosen": -2.969128383190883, "rewards/margins": 4.530871616809117, "rewards/rejected": -7.5, "step": 3780 }, { "epoch": 1.9837738811829364, "grad_norm": 0.7520642077552421, "kl": 0.0, "learning_rate": 1.551916211757326e-05, "logits/chosen": -776470528.0, "logits/rejected": -559992000.0, "logps/chosen": -461.6804733727811, "logps/rejected": -912.635761589404, "loss": 0.3518, "rewards/chosen": -1.1292991863905326, "rewards/margins": 5.128978959304831, "rewards/rejected": -6.258278145695364, "step": 3790 }, { "epoch": 1.9890081130594086, "grad_norm": 1.159247733239679, "kl": 0.0, "learning_rate": 1.537841789032819e-05, "logits/chosen": -741028672.0, "logits/rejected": -635856512.0, "logps/chosen": -490.70031545741324, "logps/rejected": -917.5975232198142, "loss": 0.332, "rewards/chosen": -1.5408246253943219, "rewards/margins": 4.827596427237257, "rewards/rejected": -6.368421052631579, "step": 3800 }, { "epoch": 1.9890081130594086, "eval_kl": 0.0, "eval_logits/chosen": -1527192704.0, "eval_logits/rejected": -1300766848.0, "eval_logps/chosen": -517.2568035625927, "eval_logps/rejected": -750.3689706613625, "eval_loss": 0.4465000033378601, "eval_rewards/chosen": -1.8533523008411676, "eval_rewards/margins": 2.699855058681458, "eval_rewards/rejected": -4.5532073595226255, "eval_runtime": 93.4594, "eval_samples_per_second": 42.799, "eval_steps_per_second": 0.674, "step": 3800 }, { "epoch": 1.9942423449358806, "grad_norm": 0.9057885208631024, "kl": 0.0, "learning_rate": 1.523803073083558e-05, "logits/chosen": -846620288.0, "logits/rejected": -680001536.0, "logps/chosen": -472.8358208955224, "logps/rejected": -827.5934426229509, "loss": 0.3806, "rewards/chosen": -1.304764750466418, "rewards/margins": 3.919005741336861, "rewards/rejected": -5.223770491803279, "step": 3810 }, { "epoch": 1.9994765768123528, "grad_norm": 0.5119873172602645, "kl": 0.0, "learning_rate": 1.509800584902108e-05, "logits/chosen": -803838336.0, "logits/rejected": -722573696.0, "logps/chosen": -474.0, "logps/rejected": -756.7179487179487, "loss": 0.3808, "rewards/chosen": -1.3775724085365855, "rewards/margins": 3.1476679760787993, "rewards/rejected": -4.525240384615385, "step": 3820 }, { "epoch": 2.004710808688825, "grad_norm": 1.0538275818801006, "kl": 0.0, "learning_rate": 1.4958348441365826e-05, "logits/chosen": -836973376.0, "logits/rejected": -691431040.0, "logps/chosen": -462.88145896656533, "logps/rejected": -709.1961414790997, "loss": 0.3741, "rewards/chosen": -1.5843643142097263, "rewards/margins": 2.6957803803240354, "rewards/rejected": -4.280144694533762, "step": 3830 }, { "epoch": 2.009945040565297, "grad_norm": 0.8750158141328271, "kl": 0.0, "learning_rate": 1.4819063690713565e-05, "logits/chosen": -916035968.0, "logits/rejected": -644769408.0, "logps/chosen": -470.2808022922636, "logps/rejected": -773.3883161512027, "loss": 0.3592, "rewards/chosen": -1.3557093929083095, "rewards/margins": 3.066971019462825, "rewards/rejected": -4.422680412371134, "step": 3840 }, { "epoch": 2.015179272441769, "grad_norm": 0.867120838419022, "kl": 0.0, "learning_rate": 1.4680156766078312e-05, "logits/chosen": -785593152.0, "logits/rejected": -728393344.0, "logps/chosen": -384.1006289308176, "logps/rejected": -811.9254658385094, "loss": 0.3292, "rewards/chosen": -0.747248427672956, "rewards/margins": 4.652596292823938, "rewards/rejected": -5.399844720496894, "step": 3850 }, { "epoch": 2.0204135043182414, "grad_norm": 0.6042760418475094, "kl": 0.0, "learning_rate": 1.4541632822452546e-05, "logits/chosen": -842216256.0, "logits/rejected": -607964352.0, "logps/chosen": -556.9139465875371, "logps/rejected": -836.013201320132, "loss": 0.3501, "rewards/chosen": -1.8699041983494065, "rewards/margins": 3.69610240231066, "rewards/rejected": -5.566006600660066, "step": 3860 }, { "epoch": 2.0256477361947134, "grad_norm": 0.6030811889034846, "kl": 0.0, "learning_rate": 1.4403497000615885e-05, "logits/chosen": -897476224.0, "logits/rejected": -684510400.0, "logps/chosen": -514.7329192546584, "logps/rejected": -893.2830188679245, "loss": 0.342, "rewards/chosen": -1.5113062888198758, "rewards/margins": 4.606618239482011, "rewards/rejected": -6.117924528301887, "step": 3870 }, { "epoch": 2.0308819680711854, "grad_norm": 0.7459007455056504, "kl": 0.0, "learning_rate": 1.4265754426944322e-05, "logits/chosen": -860461440.0, "logits/rejected": -671403200.0, "logps/chosen": -484.3987915407855, "logps/rejected": -980.9190938511326, "loss": 0.3352, "rewards/chosen": -1.3766509917567504, "rewards/margins": 5.371730885265904, "rewards/rejected": -6.748381877022654, "step": 3880 }, { "epoch": 2.036116199947658, "grad_norm": 0.7604713023578963, "kl": 0.0, "learning_rate": 1.4128410213219942e-05, "logits/chosen": -869688960.0, "logits/rejected": -683986112.0, "logps/chosen": -546.6, "logps/rejected": -1040.2, "loss": 0.3211, "rewards/chosen": -1.85484619140625, "rewards/margins": 5.72406005859375, "rewards/rejected": -7.57890625, "step": 3890 }, { "epoch": 2.04135043182413, "grad_norm": 1.0804174673014937, "kl": 0.0, "learning_rate": 1.3991469456441273e-05, "logits/chosen": -826173056.0, "logits/rejected": -657824128.0, "logps/chosen": -501.8452012383901, "logps/rejected": -1085.8801261829653, "loss": 0.3281, "rewards/chosen": -1.6408184984520124, "rewards/margins": 6.257840807541679, "rewards/rejected": -7.898659305993691, "step": 3900 }, { "epoch": 2.04135043182413, "eval_kl": 0.0, "eval_logits/chosen": -1679585792.0, "eval_logits/rejected": -1427328256.0, "eval_logps/chosen": -596.9638792676892, "eval_logps/rejected": -954.6852312282447, "eval_loss": 0.4462812542915344, "eval_rewards/chosen": -2.651657595249876, "eval_rewards/margins": 3.942698446520387, "eval_rewards/rejected": -6.594356041770263, "eval_runtime": 93.447, "eval_samples_per_second": 42.805, "eval_steps_per_second": 0.674, "step": 3900 }, { "epoch": 2.0465846637006018, "grad_norm": 0.5340218287321821, "kl": 0.0, "learning_rate": 1.3854937238634077e-05, "logits/chosen": -826907008.0, "logits/rejected": -684720128.0, "logps/chosen": -579.2492307692307, "logps/rejected": -1005.815873015873, "loss": 0.3338, "rewards/chosen": -2.297403846153846, "rewards/margins": 4.872437423687424, "rewards/rejected": -7.16984126984127, "step": 3910 }, { "epoch": 2.051818895577074, "grad_norm": 1.1950327463899646, "kl": 0.0, "learning_rate": 1.3718818626662776e-05, "logits/chosen": -840538496.0, "logits/rejected": -627782464.0, "logps/chosen": -479.9, "logps/rejected": -971.6, "loss": 0.3238, "rewards/chosen": -1.4367919921875, "rewards/margins": 5.5171142578125, "rewards/rejected": -6.95390625, "step": 3920 }, { "epoch": 2.057053127453546, "grad_norm": 0.42341628215112187, "kl": 0.0, "learning_rate": 1.3583118672042442e-05, "logits/chosen": -825019584.0, "logits/rejected": -621595840.0, "logps/chosen": -464.4651162790698, "logps/rejected": -945.6216216216217, "loss": 0.3096, "rewards/chosen": -1.3582394622093024, "rewards/margins": 5.080105132385293, "rewards/rejected": -6.438344594594595, "step": 3930 }, { "epoch": 2.062287359330018, "grad_norm": 1.9842269461022177, "kl": 0.0, "learning_rate": 1.3447842410751255e-05, "logits/chosen": -761161344.0, "logits/rejected": -688599872.0, "logps/chosen": -393.63157894736844, "logps/rejected": -860.7619047619048, "loss": 0.2949, "rewards/chosen": -0.6329538445723685, "rewards/margins": 5.192939012570489, "rewards/rejected": -5.825892857142857, "step": 3940 }, { "epoch": 2.0675215912064906, "grad_norm": 0.6398178755738954, "kl": 0.0, "learning_rate": 1.331299486304371e-05, "logits/chosen": -761685632.0, "logits/rejected": -684929856.0, "logps/chosen": -501.4185303514377, "logps/rejected": -804.1100917431193, "loss": 0.3164, "rewards/chosen": -1.7560652955271565, "rewards/margins": 3.414424001108929, "rewards/rejected": -5.170489296636085, "step": 3950 }, { "epoch": 2.0727558230829626, "grad_norm": 0.5001389075478716, "kl": 0.0, "learning_rate": 1.3178581033264218e-05, "logits/chosen": -784754304.0, "logits/rejected": -674653824.0, "logps/chosen": -535.4733542319749, "logps/rejected": -945.2461059190031, "loss": 0.2987, "rewards/chosen": -2.061030564263323, "rewards/margins": 4.187411803337923, "rewards/rejected": -6.248442367601246, "step": 3960 }, { "epoch": 2.0779900549594346, "grad_norm": 1.3573404725613187, "kl": 0.0, "learning_rate": 1.3044605909661434e-05, "logits/chosen": -831101312.0, "logits/rejected": -637534208.0, "logps/chosen": -511.0617283950617, "logps/rejected": -930.9367088607595, "loss": 0.2924, "rewards/chosen": -1.5159866898148149, "rewards/margins": 4.905690525375059, "rewards/rejected": -6.421677215189874, "step": 3970 }, { "epoch": 2.083224286835907, "grad_norm": 0.7052536575350146, "kl": 0.0, "learning_rate": 1.2911074464203157e-05, "logits/chosen": -691378560.0, "logits/rejected": -608383808.0, "logps/chosen": -434.2931596091205, "logps/rejected": -906.8588588588589, "loss": 0.2845, "rewards/chosen": -1.1614413680781759, "rewards/margins": 5.0990691424323344, "rewards/rejected": -6.26051051051051, "step": 3980 }, { "epoch": 2.088458518712379, "grad_norm": 0.5194332110158825, "kl": 0.0, "learning_rate": 1.2777991652391758e-05, "logits/chosen": -745747264.0, "logits/rejected": -601043776.0, "logps/chosen": -453.30120481927713, "logps/rejected": -860.8831168831168, "loss": 0.3133, "rewards/chosen": -1.4601609563253013, "rewards/margins": 3.994384498220153, "rewards/rejected": -5.454545454545454, "step": 3990 }, { "epoch": 2.093692750588851, "grad_norm": 1.005682372476424, "kl": 0.0, "learning_rate": 1.2645362413080342e-05, "logits/chosen": -702650752.0, "logits/rejected": -508926368.0, "logps/chosen": -389.91515151515154, "logps/rejected": -866.2709677419355, "loss": 0.3181, "rewards/chosen": -1.0228811553030304, "rewards/margins": 4.944860780180841, "rewards/rejected": -5.967741935483871, "step": 4000 }, { "epoch": 2.093692750588851, "eval_kl": 0.0, "eval_logits/chosen": -1560081408.0, "eval_logits/rejected": -1263351040.0, "eval_logps/chosen": -525.110341415141, "eval_logps/rejected": -727.0094480358031, "eval_loss": 0.4486015737056732, "eval_rewards/chosen": -1.932149925779317, "eval_rewards/margins": 2.384732222405665, "eval_rewards/rejected": -4.316882148184982, "eval_runtime": 93.4499, "eval_samples_per_second": 42.804, "eval_steps_per_second": 0.674, "step": 4000 }, { "epoch": 2.0989269824653234, "grad_norm": 0.8653491127894726, "kl": 0.0, "learning_rate": 1.2513191668289393e-05, "logits/chosen": -744803520.0, "logits/rejected": -595119296.0, "logps/chosen": -400.85804416403784, "logps/rejected": -879.3560371517028, "loss": 0.291, "rewards/chosen": -0.7524645110410094, "rewards/margins": 5.041653136017814, "rewards/rejected": -5.794117647058823, "step": 4010 }, { "epoch": 2.1041612143417954, "grad_norm": 0.5023478537850743, "kl": 0.0, "learning_rate": 1.2381484323024178e-05, "logits/chosen": -772276224.0, "logits/rejected": -564238720.0, "logps/chosen": -368.2507836990596, "logps/rejected": -883.8380062305296, "loss": 0.273, "rewards/chosen": -0.25505118534482757, "rewards/margins": 5.663173113720593, "rewards/rejected": -5.918224299065421, "step": 4020 }, { "epoch": 2.1093954462182674, "grad_norm": 0.47927941821650943, "kl": 0.0, "learning_rate": 1.2250245265092666e-05, "logits/chosen": -683147264.0, "logits/rejected": -500118336.0, "logps/chosen": -428.57142857142856, "logps/rejected": -802.7169811320755, "loss": 0.3035, "rewards/chosen": -1.0651446040372672, "rewards/margins": 4.261899421119965, "rewards/rejected": -5.327044025157233, "step": 4030 }, { "epoch": 2.11462967809474, "grad_norm": 0.43364669877586093, "kl": 0.0, "learning_rate": 1.2119479364924148e-05, "logits/chosen": -692479616.0, "logits/rejected": -560359040.0, "logps/chosen": -441.1640866873065, "logps/rejected": -814.1324921135647, "loss": 0.3, "rewards/chosen": -1.3454914860681115, "rewards/margins": 3.826432804152709, "rewards/rejected": -5.1719242902208205, "step": 4040 }, { "epoch": 2.1198639099712118, "grad_norm": 0.4900123761137033, "kl": 0.0, "learning_rate": 1.1989191475388516e-05, "logits/chosen": -708732544.0, "logits/rejected": -572103040.0, "logps/chosen": -385.9746835443038, "logps/rejected": -871.8024691358024, "loss": 0.2711, "rewards/chosen": -0.8139092167721519, "rewards/margins": 4.855843869647601, "rewards/rejected": -5.669753086419753, "step": 4050 }, { "epoch": 2.1250981418476838, "grad_norm": 0.5886859527298862, "kl": 0.0, "learning_rate": 1.1859386431616157e-05, "logits/chosen": -776785088.0, "logits/rejected": -623797888.0, "logps/chosen": -431.7928802588997, "logps/rejected": -838.9607250755287, "loss": 0.2749, "rewards/chosen": -0.7912874190938511, "rewards/margins": 4.660374212326089, "rewards/rejected": -5.45166163141994, "step": 4060 }, { "epoch": 2.130332373724156, "grad_norm": 1.0109121847189841, "kl": 0.0, "learning_rate": 1.173006905081847e-05, "logits/chosen": -857944896.0, "logits/rejected": -517000384.0, "logps/chosen": -453.10843373493975, "logps/rejected": -853.2987012987013, "loss": 0.2734, "rewards/chosen": -0.9301816641566265, "rewards/margins": 4.767058595583633, "rewards/rejected": -5.697240259740259, "step": 4070 }, { "epoch": 2.135566605600628, "grad_norm": 1.0500135128137795, "kl": 0.0, "learning_rate": 1.160124413210918e-05, "logits/chosen": -787900032.0, "logits/rejected": -592130880.0, "logps/chosen": -349.6369230769231, "logps/rejected": -861.4603174603175, "loss": 0.2678, "rewards/chosen": -0.2839783653846154, "rewards/margins": 5.0350692536630035, "rewards/rejected": -5.319047619047619, "step": 4080 }, { "epoch": 2.1408008374771, "grad_norm": 0.6560418400369694, "kl": 0.0, "learning_rate": 1.1472916456326146e-05, "logits/chosen": -784020288.0, "logits/rejected": -531942592.0, "logps/chosen": -444.8, "logps/rejected": -822.6, "loss": 0.2778, "rewards/chosen": -1.0637939453125, "rewards/margins": 4.2182373046875, "rewards/rejected": -5.28203125, "step": 4090 }, { "epoch": 2.146035069353572, "grad_norm": 0.7551742469602785, "kl": 0.0, "learning_rate": 1.1345090785853999e-05, "logits/chosen": -776260800.0, "logits/rejected": -585734528.0, "logps/chosen": -369.46835443037975, "logps/rejected": -818.8641975308642, "loss": 0.2603, "rewards/chosen": -0.4557456487341772, "rewards/margins": 4.676970400648539, "rewards/rejected": -5.132716049382716, "step": 4100 }, { "epoch": 2.146035069353572, "eval_kl": 0.0, "eval_logits/chosen": -1459418112.0, "eval_logits/rejected": -1138054528.0, "eval_logps/chosen": -524.6036615536863, "eval_logps/rejected": -721.5355544505221, "eval_loss": 0.451171875, "eval_rewards/chosen": -1.927325581395349, "eval_rewards/margins": 2.337592369872677, "eval_rewards/rejected": -4.264917951268026, "eval_runtime": 93.4435, "eval_samples_per_second": 42.807, "eval_steps_per_second": 0.674, "step": 4100 }, { "epoch": 2.1512693012300446, "grad_norm": 0.39235017704762193, "kl": 0.0, "learning_rate": 1.1217771864447396e-05, "logits/chosen": -765565312.0, "logits/rejected": -523894784.0, "logps/chosen": -407.58409785932724, "logps/rejected": -776.7923322683706, "loss": 0.2826, "rewards/chosen": -0.7646215596330275, "rewards/margins": 4.039691539408507, "rewards/rejected": -4.804313099041534, "step": 4110 }, { "epoch": 2.1565035331065165, "grad_norm": 0.9980238027631436, "kl": 0.0, "learning_rate": 1.1090964417054946e-05, "logits/chosen": -705691648.0, "logits/rejected": -469028032.0, "logps/chosen": -406.66261398176295, "logps/rejected": -768.5144694533763, "loss": 0.2883, "rewards/chosen": -0.9441845554711246, "rewards/margins": 4.141828306265209, "rewards/rejected": -5.086012861736334, "step": 4120 }, { "epoch": 2.1617377649829885, "grad_norm": 0.44846759594413, "kl": 0.0, "learning_rate": 1.0964673149643911e-05, "logits/chosen": -777099648.0, "logits/rejected": -531523168.0, "logps/chosen": -414.9433962264151, "logps/rejected": -828.5217391304348, "loss": 0.2407, "rewards/chosen": -0.6657576650943396, "rewards/margins": 4.735639850433611, "rewards/rejected": -5.40139751552795, "step": 4130 }, { "epoch": 2.166971996859461, "grad_norm": 1.0494481926845889, "kl": 0.0, "learning_rate": 1.08389027490255e-05, "logits/chosen": -704118784.0, "logits/rejected": -602092352.0, "logps/chosen": -371.94805194805195, "logps/rejected": -816.9638554216867, "loss": 0.2649, "rewards/chosen": -0.5836038961038961, "rewards/margins": 4.6212153810045375, "rewards/rejected": -5.204819277108434, "step": 4140 }, { "epoch": 2.172206228735933, "grad_norm": 0.48196554804324737, "kl": 0.0, "learning_rate": 1.0713657882680975e-05, "logits/chosen": -682518144.0, "logits/rejected": -520827712.0, "logps/chosen": -418.4691358024691, "logps/rejected": -798.5822784810126, "loss": 0.2906, "rewards/chosen": -1.0041473765432098, "rewards/margins": 4.1097766740897015, "rewards/rejected": -5.113924050632911, "step": 4150 }, { "epoch": 2.177440460612405, "grad_norm": 1.416871559890211, "kl": 0.0, "learning_rate": 1.0588943198588456e-05, "logits/chosen": -716072576.0, "logits/rejected": -583637376.0, "logps/chosen": -383.3, "logps/rejected": -802.5, "loss": 0.2669, "rewards/chosen": -0.545361328125, "rewards/margins": 4.497607421875, "rewards/rejected": -5.04296875, "step": 4160 }, { "epoch": 2.1826746924888774, "grad_norm": 0.6456767473107083, "kl": 0.0, "learning_rate": 1.0464763325050358e-05, "logits/chosen": -603979776.0, "logits/rejected": -628568896.0, "logps/chosen": -347.06849315068496, "logps/rejected": -838.6206896551724, "loss": 0.2265, "rewards/chosen": -0.028842037671232876, "rewards/margins": 5.19529589336325, "rewards/rejected": -5.224137931034483, "step": 4170 }, { "epoch": 2.1879089243653493, "grad_norm": 0.3442641001225459, "kl": 0.0, "learning_rate": 1.0341122870521725e-05, "logits/chosen": -679372416.0, "logits/rejected": -476525376.0, "logps/chosen": -351.39240506329116, "logps/rejected": -795.1604938271605, "loss": 0.2384, "rewards/chosen": -0.20490506329113925, "rewards/margins": 4.890002344116268, "rewards/rejected": -5.094907407407407, "step": 4180 }, { "epoch": 2.1931431562418213, "grad_norm": 0.4512805022045456, "kl": 0.0, "learning_rate": 1.0218026423439101e-05, "logits/chosen": -709047104.0, "logits/rejected": -486696544.0, "logps/chosen": -384.0955223880597, "logps/rejected": -794.6491803278689, "loss": 0.2544, "rewards/chosen": -0.3564365671641791, "rewards/margins": 4.705038842671886, "rewards/rejected": -5.061475409836065, "step": 4190 }, { "epoch": 2.1983773881182938, "grad_norm": 0.407756567161035, "kl": 0.0, "learning_rate": 1.0095478552050347e-05, "logits/chosen": -640994496.0, "logits/rejected": -545416832.0, "logps/chosen": -307.7115987460815, "logps/rejected": -814.8535825545172, "loss": 0.2388, "rewards/chosen": -0.040727860501567396, "rewards/margins": 4.7482129494672805, "rewards/rejected": -4.788940809968848, "step": 4200 }, { "epoch": 2.1983773881182938, "eval_kl": 0.0, "eval_logits/chosen": -1377462656.0, "eval_logits/rejected": -1113288064.0, "eval_logps/chosen": -484.2592775853538, "eval_logps/rejected": -613.2352063649926, "eval_loss": 0.4555937647819519, "eval_rewards/chosen": -1.5242454230578921, "eval_rewards/margins": 1.6601155913627943, "eval_rewards/rejected": -3.1843610144206864, "eval_runtime": 93.4488, "eval_samples_per_second": 42.804, "eval_steps_per_second": 0.674, "step": 4200 }, { "epoch": 2.2036116199947657, "grad_norm": 0.5169381215562495, "kl": 0.0, "learning_rate": 9.973483804245033e-06, "logits/chosen": -717960000.0, "logits/rejected": -570005888.0, "logps/chosen": -330.7524115755627, "logps/rejected": -749.4224924012158, "loss": 0.2471, "rewards/chosen": 0.02110128617363344, "rewards/margins": 4.477028337845366, "rewards/rejected": -4.455927051671733, "step": 4210 }, { "epoch": 2.2088458518712377, "grad_norm": 0.6406300733522271, "kl": 0.0, "learning_rate": 9.85204670738569e-06, "logits/chosen": -620022976.0, "logits/rejected": -565287296.0, "logps/chosen": -323.8933333333333, "logps/rejected": -758.0235294117647, "loss": 0.2546, "rewards/chosen": -0.04555338541666667, "rewards/margins": 4.550034849877451, "rewards/rejected": -4.595588235294118, "step": 4220 }, { "epoch": 2.21408008374771, "grad_norm": 1.0622337513292255, "kl": 0.0, "learning_rate": 9.731171768139807e-06, "logits/chosen": -662700032.0, "logits/rejected": -499227040.0, "logps/chosen": -358.0246913580247, "logps/rejected": -760.6075949367089, "loss": 0.2706, "rewards/chosen": -0.2875855999228395, "rewards/margins": 4.2859903494442495, "rewards/rejected": -4.573575949367089, "step": 4230 }, { "epoch": 2.219314315624182, "grad_norm": 0.6021455816373862, "kl": 0.0, "learning_rate": 9.610863472312582e-06, "logits/chosen": -704223616.0, "logits/rejected": -538129216.0, "logps/chosen": -331.6923076923077, "logps/rejected": -764.0, "loss": 0.2176, "rewards/chosen": 0.22345753205128205, "rewards/margins": 4.911719727173233, "rewards/rejected": -4.688262195121951, "step": 4240 }, { "epoch": 2.224548547500654, "grad_norm": 0.4635125876528428, "kl": 0.0, "learning_rate": 9.491126284680398e-06, "logits/chosen": -614884992.0, "logits/rejected": -477521504.0, "logps/chosen": -333.1480362537764, "logps/rejected": -717.3592233009708, "loss": 0.2614, "rewards/chosen": -0.05881797583081571, "rewards/margins": 4.218690114784071, "rewards/rejected": -4.277508090614886, "step": 4250 }, { "epoch": 2.2297827793771265, "grad_norm": 0.47303646752910194, "kl": 0.0, "learning_rate": 9.371964648825221e-06, "logits/chosen": -663853440.0, "logits/rejected": -537237888.0, "logps/chosen": -320.62111801242236, "logps/rejected": -752.3018867924528, "loss": 0.2374, "rewards/chosen": 0.059819002329192544, "rewards/margins": 4.639221518052463, "rewards/rejected": -4.57940251572327, "step": 4260 }, { "epoch": 2.2350170112535985, "grad_norm": 0.39987970021496416, "kl": 0.0, "learning_rate": 9.253382986969578e-06, "logits/chosen": -710619968.0, "logits/rejected": -506226272.0, "logps/chosen": -294.41975308641975, "logps/rejected": -773.4683544303797, "loss": 0.227, "rewards/chosen": 0.24508101851851852, "rewards/margins": 5.129179119784341, "rewards/rejected": -4.884098101265823, "step": 4270 }, { "epoch": 2.2402512431300705, "grad_norm": 0.7503840516133481, "kl": 0.0, "learning_rate": 9.135385699812558e-06, "logits/chosen": -779930816.0, "logits/rejected": -603665216.0, "logps/chosen": -301.1948051948052, "logps/rejected": -752.5783132530121, "loss": 0.205, "rewards/chosen": 0.578023538961039, "rewards/margins": 5.206788599202003, "rewards/rejected": -4.628765060240964, "step": 4280 }, { "epoch": 2.245485475006543, "grad_norm": 0.4724610462486881, "kl": 0.0, "learning_rate": 9.017977166366445e-06, "logits/chosen": -787061120.0, "logits/rejected": -559992000.0, "logps/chosen": -327.8490566037736, "logps/rejected": -717.7142857142857, "loss": 0.2269, "rewards/chosen": 0.601754866306137, "rewards/margins": 5.016351139598062, "rewards/rejected": -4.4145962732919255, "step": 4290 }, { "epoch": 2.250719706883015, "grad_norm": 0.49216136291730395, "kl": 0.0, "learning_rate": 8.901161743794175e-06, "logits/chosen": -734108032.0, "logits/rejected": -551865536.0, "logps/chosen": -282.2716049382716, "logps/rejected": -743.8987341772151, "loss": 0.224, "rewards/chosen": 0.4676408179012346, "rewards/margins": 4.9668496786607275, "rewards/rejected": -4.499208860759493, "step": 4300 }, { "epoch": 2.250719706883015, "eval_kl": 0.0, "eval_logits/chosen": -1391976320.0, "eval_logits/rejected": -1133061248.0, "eval_logps/chosen": -479.9841662543295, "eval_logps/rejected": -590.8304326205867, "eval_loss": 0.45759373903274536, "eval_rewards/chosen": -1.481321128154379, "eval_rewards/margins": 1.4770329245557154, "eval_rewards/rejected": -2.9583540527100944, "eval_runtime": 93.45, "eval_samples_per_second": 42.804, "eval_steps_per_second": 0.674, "step": 4300 }, { "epoch": 2.255953938759487, "grad_norm": 0.6632663517963511, "kl": 0.0, "learning_rate": 8.784943767247714e-06, "logits/chosen": -755813568.0, "logits/rejected": -542638080.0, "logps/chosen": -350.7294117647059, "logps/rejected": -749.8666666666667, "loss": 0.2505, "rewards/chosen": -0.1849264705882353, "rewards/margins": 4.332573529411765, "rewards/rejected": -4.5175, "step": 4310 }, { "epoch": 2.2611881706359593, "grad_norm": 0.7391169731532689, "kl": 0.0, "learning_rate": 8.669327549707096e-06, "logits/chosen": -655569728.0, "logits/rejected": -531103744.0, "logps/chosen": -336.8126984126984, "logps/rejected": -768.0, "loss": 0.2342, "rewards/chosen": 0.10716765873015872, "rewards/margins": 4.954090735653236, "rewards/rejected": -4.846923076923077, "step": 4320 }, { "epoch": 2.2664224025124313, "grad_norm": 0.6386360163671007, "kl": 0.0, "learning_rate": 8.554317381820411e-06, "logits/chosen": -667104064.0, "logits/rejected": -506986496.0, "logps/chosen": -301.05806451612904, "logps/rejected": -754.4242424242424, "loss": 0.232, "rewards/chosen": 0.19148185483870966, "rewards/margins": 4.9043606427174975, "rewards/rejected": -4.712878787878788, "step": 4330 }, { "epoch": 2.2716566343889033, "grad_norm": 0.600238772093869, "kl": 0.0, "learning_rate": 8.439917531744587e-06, "logits/chosen": -666894336.0, "logits/rejected": -518520832.0, "logps/chosen": -323.95180722891564, "logps/rejected": -758.6493506493506, "loss": 0.2381, "rewards/chosen": 0.22035015060240964, "rewards/margins": 4.907444306446565, "rewards/rejected": -4.6870941558441555, "step": 4340 }, { "epoch": 2.2768908662653757, "grad_norm": 0.8954410797785683, "kl": 0.0, "learning_rate": 8.326132244986932e-06, "logits/chosen": -701707072.0, "logits/rejected": -517367392.0, "logps/chosen": -290.51851851851853, "logps/rejected": -748.7594936708861, "loss": 0.2228, "rewards/chosen": 0.3577594521604938, "rewards/margins": 4.953487300261759, "rewards/rejected": -4.595727848101266, "step": 4350 }, { "epoch": 2.2821250981418477, "grad_norm": 0.6812826415405142, "kl": 0.0, "learning_rate": 8.212965744247652e-06, "logits/chosen": -646709248.0, "logits/rejected": -484101312.0, "logps/chosen": -325.4787878787879, "logps/rejected": -740.2322580645161, "loss": 0.2575, "rewards/chosen": -0.15175189393939395, "rewards/margins": 4.116796493157381, "rewards/rejected": -4.268548387096774, "step": 4360 }, { "epoch": 2.2873593300183197, "grad_norm": 0.5151716397306011, "kl": 0.0, "learning_rate": 8.100422229263077e-06, "logits/chosen": -604399232.0, "logits/rejected": -566650496.0, "logps/chosen": -358.19867549668874, "logps/rejected": -766.1065088757397, "loss": 0.24, "rewards/chosen": -0.015883692052980132, "rewards/margins": 4.736335242858263, "rewards/rejected": -4.752218934911243, "step": 4370 }, { "epoch": 2.292593561894792, "grad_norm": 1.135779708785077, "kl": 0.0, "learning_rate": 7.988505876649863e-06, "logits/chosen": -608174080.0, "logits/rejected": -524956480.0, "logps/chosen": -331.6687898089172, "logps/rejected": -709.398773006135, "loss": 0.2588, "rewards/chosen": -0.05260997213375796, "rewards/margins": 4.220396162835567, "rewards/rejected": -4.273006134969325, "step": 4380 }, { "epoch": 2.297827793771264, "grad_norm": 0.5072498511179133, "kl": 0.0, "learning_rate": 7.877220839749939e-06, "logits/chosen": -597111616.0, "logits/rejected": -512281792.0, "logps/chosen": -310.92459016393445, "logps/rejected": -764.5611940298508, "loss": 0.2456, "rewards/chosen": 0.0782530737704918, "rewards/margins": 4.849894864815268, "rewards/rejected": -4.771641791044776, "step": 4390 }, { "epoch": 2.303062025647736, "grad_norm": 0.3652767871003493, "kl": 0.0, "learning_rate": 7.766571248476399e-06, "logits/chosen": -577136256.0, "logits/rejected": -415983200.0, "logps/chosen": -382.37538461538463, "logps/rejected": -836.4698412698413, "loss": 0.26, "rewards/chosen": -0.4841105769230769, "rewards/margins": 4.81271481990232, "rewards/rejected": -5.296825396825397, "step": 4400 }, { "epoch": 2.303062025647736, "eval_kl": 0.0, "eval_logits/chosen": -1215682432.0, "eval_logits/rejected": -961427712.0, "eval_logps/chosen": -496.70460168233546, "eval_logps/rejected": -633.5713575335654, "eval_loss": 0.4561718702316284, "eval_rewards/chosen": -1.6511009401286492, "eval_rewards/margins": 1.7327876725018827, "eval_rewards/rejected": -3.383888612630532, "eval_runtime": 93.4509, "eval_samples_per_second": 42.803, "eval_steps_per_second": 0.674, "step": 4400 }, { "epoch": 2.3082962575242085, "grad_norm": 0.7225883147344231, "kl": 0.0, "learning_rate": 7.656561209160248e-06, "logits/chosen": -582431552.0, "logits/rejected": -499017312.0, "logps/chosen": -314.61333333333334, "logps/rejected": -786.4470588235295, "loss": 0.2242, "rewards/chosen": 0.25373046875, "rewards/margins": 5.158142233455882, "rewards/rejected": -4.904411764705882, "step": 4410 }, { "epoch": 2.3135304894006805, "grad_norm": 0.5150878842633106, "kl": 0.0, "learning_rate": 7.547194804398e-06, "logits/chosen": -703279936.0, "logits/rejected": -511082496.0, "logps/chosen": -382.17846153846153, "logps/rejected": -845.3079365079365, "loss": 0.2401, "rewards/chosen": -0.09444411057692308, "rewards/margins": 5.292063825931014, "rewards/rejected": -5.386507936507937, "step": 4420 }, { "epoch": 2.3187647212771525, "grad_norm": 0.44831570510738245, "kl": 0.0, "learning_rate": 7.43847609290014e-06, "logits/chosen": -621491008.0, "logits/rejected": -489894720.0, "logps/chosen": -278.4773413897281, "logps/rejected": -815.8446601941747, "loss": 0.2211, "rewards/chosen": 0.3919290502265861, "rewards/margins": 5.424291509773512, "rewards/rejected": -5.032362459546926, "step": 4430 }, { "epoch": 2.323998953153625, "grad_norm": 0.31782004144491344, "kl": 0.0, "learning_rate": 7.330409109340563e-06, "logits/chosen": -515217824.0, "logits/rejected": -545993536.0, "logps/chosen": -315.5342465753425, "logps/rejected": -770.5747126436781, "loss": 0.2086, "rewards/chosen": 0.1988073897688356, "rewards/margins": 5.010588998964238, "rewards/rejected": -4.811781609195402, "step": 4440 }, { "epoch": 2.329233185030097, "grad_norm": 1.2202322471583766, "kl": 0.0, "learning_rate": 7.222997864206757e-06, "logits/chosen": -604346752.0, "logits/rejected": -504784480.0, "logps/chosen": -316.65594855305466, "logps/rejected": -817.2158054711247, "loss": 0.215, "rewards/chosen": 0.21764469453376206, "rewards/margins": 5.486641655020084, "rewards/rejected": -5.268996960486322, "step": 4450 }, { "epoch": 2.334467416906569, "grad_norm": 0.6017860477665279, "kl": 0.0, "learning_rate": 7.1162463436510615e-06, "logits/chosen": -627625152.0, "logits/rejected": -399297728.0, "logps/chosen": -349.1446153846154, "logps/rejected": -789.8412698412699, "loss": 0.2337, "rewards/chosen": -0.11662860576923077, "rewards/margins": 5.069085679945055, "rewards/rejected": -5.185714285714286, "step": 4460 }, { "epoch": 2.3397016487830413, "grad_norm": 0.4804667120654089, "kl": 0.0, "learning_rate": 7.010158509342682e-06, "logits/chosen": -588775424.0, "logits/rejected": -481977952.0, "logps/chosen": -353.53846153846155, "logps/rejected": -810.6341463414634, "loss": 0.2194, "rewards/chosen": -0.06151091746794872, "rewards/margins": 5.045196399605222, "rewards/rejected": -5.1067073170731705, "step": 4470 }, { "epoch": 2.3449358806595133, "grad_norm": 0.9272819424930441, "kl": 0.0, "learning_rate": 6.904738298320665e-06, "logits/chosen": -649173376.0, "logits/rejected": -448423520.0, "logps/chosen": -336.9846153846154, "logps/rejected": -809.7523809523809, "loss": 0.2483, "rewards/chosen": -0.031105769230769232, "rewards/margins": 5.2395291514041515, "rewards/rejected": -5.270634920634921, "step": 4480 }, { "epoch": 2.3501701125359853, "grad_norm": 0.45114391479169963, "kl": 0.0, "learning_rate": 6.799989622847827e-06, "logits/chosen": -559860928.0, "logits/rejected": -491362720.0, "logps/chosen": -313.53846153846155, "logps/rejected": -871.8048780487804, "loss": 0.2363, "rewards/chosen": -0.07071314102564102, "rewards/margins": 5.6548966150719195, "rewards/rejected": -5.725609756097561, "step": 4490 }, { "epoch": 2.3554043444124577, "grad_norm": 0.5010732836904357, "kl": 0.0, "learning_rate": 6.695916370265528e-06, "logits/chosen": -522138432.0, "logits/rejected": -414030240.0, "logps/chosen": -343.3762711864407, "logps/rejected": -907.6869565217391, "loss": 0.2234, "rewards/chosen": -0.18509004237288135, "rewards/margins": 5.982301261974945, "rewards/rejected": -6.167391304347826, "step": 4500 }, { "epoch": 2.3554043444124577, "eval_kl": 0.0, "eval_logits/chosen": -1145577600.0, "eval_logits/rejected": -884432256.0, "eval_logps/chosen": -540.8807521029194, "eval_logps/rejected": -735.6340129288911, "eval_loss": 0.4556874930858612, "eval_rewards/chosen": -2.091848095002474, "eval_rewards/margins": 2.313795863227263, "eval_rewards/rejected": -4.405643958229737, "eval_runtime": 93.4327, "eval_samples_per_second": 42.812, "eval_steps_per_second": 0.674, "step": 4500 }, { "epoch": 2.3606385762889297, "grad_norm": 1.0333751484523532, "kl": 0.0, "learning_rate": 6.592522402849421e-06, "logits/chosen": -636590464.0, "logits/rejected": -421606208.0, "logps/chosen": -373.8426966292135, "logps/rejected": -919.3239436619718, "loss": 0.2598, "rewards/chosen": -0.5181377282303371, "rewards/margins": 5.887672130924593, "rewards/rejected": -6.40580985915493, "step": 4510 }, { "epoch": 2.3658728081654017, "grad_norm": 0.6399952632365049, "kl": 0.0, "learning_rate": 6.489811557666137e-06, "logits/chosen": -648177280.0, "logits/rejected": -545574080.0, "logps/chosen": -350.32615384615383, "logps/rejected": -833.5238095238095, "loss": 0.2274, "rewards/chosen": -0.19805288461538462, "rewards/margins": 5.143216956654456, "rewards/rejected": -5.341269841269841, "step": 4520 }, { "epoch": 2.3711070400418737, "grad_norm": 0.6465757772054535, "kl": 0.0, "learning_rate": 6.387787646430854e-06, "logits/chosen": -707946112.0, "logits/rejected": -566702912.0, "logps/chosen": -314.63291139240505, "logps/rejected": -818.8641975308642, "loss": 0.22, "rewards/chosen": 0.3127472310126582, "rewards/margins": 5.450864514963275, "rewards/rejected": -5.138117283950617, "step": 4530 }, { "epoch": 2.376341271918346, "grad_norm": 0.4214262313722552, "kl": 0.0, "learning_rate": 6.286454455365875e-06, "logits/chosen": -754555264.0, "logits/rejected": -540016640.0, "logps/chosen": -294.6900584795322, "logps/rejected": -800.5369127516778, "loss": 0.2453, "rewards/chosen": 0.23325566520467836, "rewards/margins": 5.291980497419444, "rewards/rejected": -5.058724832214765, "step": 4540 }, { "epoch": 2.381575503794818, "grad_norm": 0.4238912364274616, "kl": 0.0, "learning_rate": 6.1858157450600775e-06, "logits/chosen": -721210560.0, "logits/rejected": -625999872.0, "logps/chosen": -364.0774193548387, "logps/rejected": -827.1515151515151, "loss": 0.2304, "rewards/chosen": -0.1208921370967742, "rewards/margins": 5.11395634775171, "rewards/rejected": -5.234848484848484, "step": 4550 }, { "epoch": 2.38680973567129, "grad_norm": 1.2001050025077662, "kl": 0.0, "learning_rate": 6.085875250329401e-06, "logits/chosen": -656828032.0, "logits/rejected": -566702912.0, "logps/chosen": -330.1635220125786, "logps/rejected": -768.5962732919255, "loss": 0.2435, "rewards/chosen": 0.12917649371069181, "rewards/margins": 4.847344195574046, "rewards/rejected": -4.718167701863354, "step": 4560 }, { "epoch": 2.3920439675477625, "grad_norm": 0.9922339601859416, "kl": 0.0, "learning_rate": 5.9866366800782e-06, "logits/chosen": -736310080.0, "logits/rejected": -497549312.0, "logps/chosen": -365.87951807228916, "logps/rejected": -852.0519480519481, "loss": 0.2309, "rewards/chosen": -0.16735692771084337, "rewards/margins": 5.382967747613832, "rewards/rejected": -5.550324675324675, "step": 4570 }, { "epoch": 2.3972781994242345, "grad_norm": 0.8372891321444087, "kl": 0.0, "learning_rate": 5.888103717161619e-06, "logits/chosen": -734632320.0, "logits/rejected": -425354848.0, "logps/chosen": -326.99708454810497, "logps/rejected": -892.7676767676768, "loss": 0.2377, "rewards/chosen": 0.01731049562682216, "rewards/margins": 6.149465377781705, "rewards/rejected": -6.132154882154882, "step": 4580 }, { "epoch": 2.4025124313007065, "grad_norm": 1.1722616912692914, "kl": 0.0, "learning_rate": 5.790280018248939e-06, "logits/chosen": -658610560.0, "logits/rejected": -490733568.0, "logps/chosen": -332.5678233438486, "logps/rejected": -859.1455108359133, "loss": 0.2372, "rewards/chosen": -0.004115733438485805, "rewards/margins": 5.72034247089588, "rewards/rejected": -5.724458204334366, "step": 4590 }, { "epoch": 2.407746663177179, "grad_norm": 0.7180486773047849, "kl": 0.0, "learning_rate": 5.693169213687824e-06, "logits/chosen": -655569728.0, "logits/rejected": -571211776.0, "logps/chosen": -352.59442724458205, "logps/rejected": -779.205047318612, "loss": 0.235, "rewards/chosen": -0.15576020704334365, "rewards/margins": 4.647867553209022, "rewards/rejected": -4.803627760252366, "step": 4600 }, { "epoch": 2.407746663177179, "eval_kl": 0.0, "eval_logits/chosen": -1404625792.0, "eval_logits/rejected": -1132328960.0, "eval_logps/chosen": -488.93023255813955, "eval_logps/rejected": -651.3615116857285, "eval_loss": 0.455929696559906, "eval_rewards/chosen": -1.5702003958436417, "eval_rewards/margins": 1.9910875206158312, "eval_rewards/rejected": -3.561287916459473, "eval_runtime": 93.4284, "eval_samples_per_second": 42.814, "eval_steps_per_second": 0.674, "step": 4600 }, { "epoch": 2.412980895053651, "grad_norm": 0.42584673850291915, "kl": 0.0, "learning_rate": 5.596774907369659e-06, "logits/chosen": -692269888.0, "logits/rejected": -605133184.0, "logps/chosen": -334.1758957654723, "logps/rejected": -871.7837837837837, "loss": 0.2151, "rewards/chosen": -0.0889149022801303, "rewards/margins": 5.420094106728879, "rewards/rejected": -5.509009009009009, "step": 4610 }, { "epoch": 2.418215126930123, "grad_norm": 0.5818582065995188, "kl": 0.0, "learning_rate": 5.501100676595761e-06, "logits/chosen": -740714112.0, "logits/rejected": -470758208.0, "logps/chosen": -391.3394495412844, "logps/rejected": -817.891373801917, "loss": 0.2548, "rewards/chosen": -0.3944954128440367, "rewards/margins": 5.256463053609638, "rewards/rejected": -5.650958466453674, "step": 4620 }, { "epoch": 2.4234493588065953, "grad_norm": 1.3819770535923588, "kl": 0.0, "learning_rate": 5.406150071944604e-06, "logits/chosen": -700763328.0, "logits/rejected": -526280288.0, "logps/chosen": -326.3647798742138, "logps/rejected": -896.5962732919255, "loss": 0.2412, "rewards/chosen": 0.03156937893081761, "rewards/margins": 6.1480290062600105, "rewards/rejected": -6.116459627329193, "step": 4630 }, { "epoch": 2.4286835906830673, "grad_norm": 0.4913752890099384, "kl": 0.0, "learning_rate": 5.311926617140122e-06, "logits/chosen": -603350656.0, "logits/rejected": -634807936.0, "logps/chosen": -343.64963503649636, "logps/rejected": -894.4262295081967, "loss": 0.2068, "rewards/chosen": -0.08704949817518248, "rewards/margins": 5.797513343354872, "rewards/rejected": -5.884562841530054, "step": 4640 }, { "epoch": 2.4339178225595393, "grad_norm": 0.721220360194313, "kl": 0.0, "learning_rate": 5.218433808920884e-06, "logits/chosen": -737778048.0, "logits/rejected": -527958016.0, "logps/chosen": -374.52037617554856, "logps/rejected": -881.8442367601247, "loss": 0.2279, "rewards/chosen": -0.04217280564263323, "rewards/margins": 5.678232178781043, "rewards/rejected": -5.720404984423676, "step": 4650 }, { "epoch": 2.4391520544360117, "grad_norm": 0.45581445352448063, "kl": 0.0, "learning_rate": 5.125675116910325e-06, "logits/chosen": -658925184.0, "logits/rejected": -453980992.0, "logps/chosen": -415.0886075949367, "logps/rejected": -960.395061728395, "loss": 0.2374, "rewards/chosen": -0.5506081882911392, "rewards/margins": 6.295070824054539, "rewards/rejected": -6.845679012345679, "step": 4660 }, { "epoch": 2.4443862863124837, "grad_norm": 2.000847358239112, "kl": 0.0, "learning_rate": 5.033653983488029e-06, "logits/chosen": -733111936.0, "logits/rejected": -521614144.0, "logps/chosen": -416.0955223880597, "logps/rejected": -1016.4459016393442, "loss": 0.2485, "rewards/chosen": -0.7988339552238806, "rewards/margins": 5.922477520185955, "rewards/rejected": -6.721311475409836, "step": 4670 }, { "epoch": 2.4496205181889557, "grad_norm": 1.0399985853719926, "kl": 0.0, "learning_rate": 4.942373823661927e-06, "logits/chosen": -693528192.0, "logits/rejected": -522977280.0, "logps/chosen": -377.45, "logps/rejected": -891.3, "loss": 0.2406, "rewards/chosen": -0.49594554901123045, "rewards/margins": 5.42124195098877, "rewards/rejected": -5.9171875, "step": 4680 }, { "epoch": 2.454854750065428, "grad_norm": 0.5943719797905609, "kl": 0.0, "learning_rate": 4.85183802494159e-06, "logits/chosen": -666370048.0, "logits/rejected": -534931040.0, "logps/chosen": -309.0658307210031, "logps/rejected": -861.1090342679128, "loss": 0.2172, "rewards/chosen": 0.3622159090909091, "rewards/margins": 6.108321828094025, "rewards/rejected": -5.746105919003115, "step": 4690 }, { "epoch": 2.4600889819419, "grad_norm": 0.9426707645421406, "kl": 0.0, "learning_rate": 4.762049947212521e-06, "logits/chosen": -638687616.0, "logits/rejected": -570530176.0, "logps/chosen": -326.69182389937106, "logps/rejected": -930.1863354037267, "loss": 0.2246, "rewards/chosen": 0.051788522012578615, "rewards/margins": 6.245887900894566, "rewards/rejected": -6.194099378881988, "step": 4700 }, { "epoch": 2.4600889819419, "eval_kl": 0.0, "eval_logits/chosen": -1378527872.0, "eval_logits/rejected": -1094446976.0, "eval_logps/chosen": -506.0781791192479, "eval_logps/rejected": -691.9065141720537, "eval_loss": 0.45228123664855957, "eval_rewards/chosen": -1.7423305294408709, "eval_rewards/margins": 2.2244770289877716, "eval_rewards/rejected": -3.9668075584286426, "eval_runtime": 93.451, "eval_samples_per_second": 42.803, "eval_steps_per_second": 0.674, "step": 4700 }, { "epoch": 2.465323213818372, "grad_norm": 0.48273618157451387, "kl": 0.0, "learning_rate": 4.673012922611436e-06, "logits/chosen": -747215232.0, "logits/rejected": -566309696.0, "logps/chosen": -337.8793650793651, "logps/rejected": -835.3476923076923, "loss": 0.2229, "rewards/chosen": -0.011532738095238096, "rewards/margins": 5.370774954212454, "rewards/rejected": -5.382307692307692, "step": 4710 }, { "epoch": 2.470557445694844, "grad_norm": 1.1484291534793738, "kl": 0.0, "learning_rate": 4.584730255402647e-06, "logits/chosen": -764307072.0, "logits/rejected": -553333568.0, "logps/chosen": -317.72121212121215, "logps/rejected": -895.174193548387, "loss": 0.2187, "rewards/chosen": 0.34360795454545456, "rewards/margins": 6.343607954545455, "rewards/rejected": -6.0, "step": 4720 }, { "epoch": 2.4757916775713165, "grad_norm": 0.5790162958462677, "kl": 0.0, "learning_rate": 4.497205221855386e-06, "logits/chosen": -617611264.0, "logits/rejected": -612473216.0, "logps/chosen": -287.1551155115512, "logps/rejected": -787.1810089020771, "loss": 0.2337, "rewards/chosen": 0.18550175330033003, "rewards/margins": 4.99336525478401, "rewards/rejected": -4.80786350148368, "step": 4730 }, { "epoch": 2.4810259094477884, "grad_norm": 0.5496024948748146, "kl": 0.0, "learning_rate": 4.41044107012227e-06, "logits/chosen": -723465024.0, "logits/rejected": -491048128.0, "logps/chosen": -358.67069486404836, "logps/rejected": -776.4919093851132, "loss": 0.2634, "rewards/chosen": -0.27346110271903323, "rewards/margins": 4.758092295339219, "rewards/rejected": -5.031553398058253, "step": 4740 }, { "epoch": 2.4862601413242604, "grad_norm": 0.8519585979935906, "kl": 0.0, "learning_rate": 4.324441020118722e-06, "logits/chosen": -624951296.0, "logits/rejected": -460744288.0, "logps/chosen": -340.8862275449102, "logps/rejected": -869.0196078431372, "loss": 0.263, "rewards/chosen": -0.4779425523952096, "rewards/margins": 5.2352927417224375, "rewards/rejected": -5.713235294117647, "step": 4750 }, { "epoch": 2.491494373200733, "grad_norm": 0.9981597518915051, "kl": 0.0, "learning_rate": 4.2392082634034825e-06, "logits/chosen": -665531200.0, "logits/rejected": -534039744.0, "logps/chosen": -379.18954248366015, "logps/rejected": -900.9820359281437, "loss": 0.2408, "rewards/chosen": -0.45895884395424835, "rewards/margins": 5.738645946464913, "rewards/rejected": -6.197604790419161, "step": 4760 }, { "epoch": 2.496728605077205, "grad_norm": 0.784519279781345, "kl": 0.0, "learning_rate": 4.154745963060197e-06, "logits/chosen": -680420992.0, "logits/rejected": -558261888.0, "logps/chosen": -400.24767801857587, "logps/rejected": -837.9558359621451, "loss": 0.24, "rewards/chosen": -0.4515770123839009, "rewards/margins": 5.036593334619253, "rewards/rejected": -5.488170347003154, "step": 4770 }, { "epoch": 2.501962836953677, "grad_norm": 0.9234334650905448, "kl": 0.0, "learning_rate": 4.071057253579979e-06, "logits/chosen": -700763328.0, "logits/rejected": -511705088.0, "logps/chosen": -306.5619335347432, "logps/rejected": -808.5954692556634, "loss": 0.236, "rewards/chosen": 0.07057212990936555, "rewards/margins": 5.465394136381858, "rewards/rejected": -5.394822006472492, "step": 4780 }, { "epoch": 2.5071970688301493, "grad_norm": 0.6296245262199346, "kl": 0.0, "learning_rate": 3.988145240745136e-06, "logits/chosen": -675597504.0, "logits/rejected": -546937216.0, "logps/chosen": -355.64307692307693, "logps/rejected": -811.479365079365, "loss": 0.2441, "rewards/chosen": -0.27162259615384615, "rewards/margins": 4.9458377213064715, "rewards/rejected": -5.217460317460318, "step": 4790 }, { "epoch": 2.5124313007066212, "grad_norm": 0.6754890512905317, "kl": 0.0, "learning_rate": 3.906013001513886e-06, "logits/chosen": -699400192.0, "logits/rejected": -578394496.0, "logps/chosen": -377.0769230769231, "logps/rejected": -790.2439024390244, "loss": 0.24, "rewards/chosen": -0.38168569711538464, "rewards/margins": 4.640417961421201, "rewards/rejected": -5.022103658536586, "step": 4800 }, { "epoch": 2.5124313007066212, "eval_kl": 0.0, "eval_logits/chosen": -1400231808.0, "eval_logits/rejected": -1111490560.0, "eval_logps/chosen": -509.52993567540824, "eval_logps/rejected": -701.2312282446544, "eval_loss": 0.45185938477516174, "eval_rewards/chosen": -1.7776472043542801, "eval_rewards/margins": 2.2837650283657593, "eval_rewards/rejected": -4.0614122327200395, "eval_runtime": 93.4505, "eval_samples_per_second": 42.803, "eval_steps_per_second": 0.674, "step": 4800 }, { "epoch": 2.5176655325830932, "grad_norm": 0.800808631316141, "kl": 0.0, "learning_rate": 3.824663583906144e-06, "logits/chosen": -750675584.0, "logits/rejected": -554801536.0, "logps/chosen": -339.219512195122, "logps/rejected": -819.3846153846154, "loss": 0.2421, "rewards/chosen": -0.08836699695121951, "rewards/margins": 5.197690695356473, "rewards/rejected": -5.2860576923076925, "step": 4810 }, { "epoch": 2.5228997644595657, "grad_norm": 0.8762123307256493, "kl": 0.0, "learning_rate": 3.744100006890461e-06, "logits/chosen": -683776384.0, "logits/rejected": -558156992.0, "logps/chosen": -339.73006134969324, "logps/rejected": -753.8343949044586, "loss": 0.2553, "rewards/chosen": -0.3314081671779141, "rewards/margins": 4.196458074860303, "rewards/rejected": -4.5278662420382165, "step": 4820 }, { "epoch": 2.5281339963360376, "grad_norm": 0.8555920028342421, "kl": 0.0, "learning_rate": 3.664325260271953e-06, "logits/chosen": -740294656.0, "logits/rejected": -543162368.0, "logps/chosen": -314.2153846153846, "logps/rejected": -803.1492063492063, "loss": 0.246, "rewards/chosen": 0.09826923076923078, "rewards/margins": 5.433189865689865, "rewards/rejected": -5.334920634920635, "step": 4830 }, { "epoch": 2.5333682282125096, "grad_norm": 0.752604945442186, "kl": 0.0, "learning_rate": 3.5853423045813377e-06, "logits/chosen": -632134016.0, "logits/rejected": -568537920.0, "logps/chosen": -365.74426229508197, "logps/rejected": -893.3253731343284, "loss": 0.2529, "rewards/chosen": -0.5505635245901639, "rewards/margins": 5.412123042574016, "rewards/rejected": -5.962686567164179, "step": 4840 }, { "epoch": 2.538602460088982, "grad_norm": 0.8106638168857512, "kl": 0.0, "learning_rate": 3.507154070965099e-06, "logits/chosen": -654730880.0, "logits/rejected": -509398208.0, "logps/chosen": -379.9388379204893, "logps/rejected": -792.2300319488818, "loss": 0.2938, "rewards/chosen": -0.660431001529052, "rewards/margins": 4.429025867480533, "rewards/rejected": -5.0894568690095845, "step": 4850 }, { "epoch": 2.543836691965454, "grad_norm": 0.5194989581627086, "kl": 0.0, "learning_rate": 3.4297634610766765e-06, "logits/chosen": -698771072.0, "logits/rejected": -520355840.0, "logps/chosen": -403.9384615384615, "logps/rejected": -867.3523809523809, "loss": 0.2571, "rewards/chosen": -0.6559375, "rewards/margins": 5.1821577380952375, "rewards/rejected": -5.838095238095238, "step": 4860 }, { "epoch": 2.549070923841926, "grad_norm": 2.654714248081575, "kl": 0.0, "learning_rate": 3.3531733469687855e-06, "logits/chosen": -668886656.0, "logits/rejected": -503578624.0, "logps/chosen": -348.48902821316614, "logps/rejected": -929.993769470405, "loss": 0.2305, "rewards/chosen": -0.058752938871473356, "rewards/margins": 6.480187871097375, "rewards/rejected": -6.538940809968848, "step": 4870 }, { "epoch": 2.5543051557183984, "grad_norm": 0.4859429710310574, "kl": 0.0, "learning_rate": 3.277386570986868e-06, "logits/chosen": -697407872.0, "logits/rejected": -608803200.0, "logps/chosen": -364.1038961038961, "logps/rejected": -910.843373493976, "loss": 0.2223, "rewards/chosen": -0.05125050730519481, "rewards/margins": 6.076761540887576, "rewards/rejected": -6.128012048192771, "step": 4880 }, { "epoch": 2.5595393875948704, "grad_norm": 0.4463890469673047, "kl": 0.0, "learning_rate": 3.2024059456635558e-06, "logits/chosen": -721420288.0, "logits/rejected": -572207936.0, "logps/chosen": -330.7951807228916, "logps/rejected": -956.7792207792207, "loss": 0.251, "rewards/chosen": -0.08921427899096386, "rewards/margins": 6.315006500229815, "rewards/rejected": -6.404220779220779, "step": 4890 }, { "epoch": 2.5647736194713424, "grad_norm": 0.6907246619558275, "kl": 0.0, "learning_rate": 3.128234253614343e-06, "logits/chosen": -705377088.0, "logits/rejected": -538758336.0, "logps/chosen": -374.12345679012344, "logps/rejected": -850.632911392405, "loss": 0.2557, "rewards/chosen": -0.3543836805555556, "rewards/margins": 5.324413787798875, "rewards/rejected": -5.67879746835443, "step": 4900 }, { "epoch": 2.5647736194713424, "eval_kl": 0.0, "eval_logits/chosen": -1397235840.0, "eval_logits/rejected": -1123008256.0, "eval_logps/chosen": -528.5304304799604, "eval_logps/rejected": -749.2869219293883, "eval_loss": 0.4514218866825104, "eval_rewards/chosen": -1.962333003463632, "eval_rewards/margins": 2.5779454649600373, "eval_rewards/rejected": -4.5402784684236694, "eval_runtime": 93.432, "eval_samples_per_second": 42.812, "eval_steps_per_second": 0.674, "step": 4900 }, { "epoch": 2.570007851347815, "grad_norm": 0.7318646432841975, "kl": 0.0, "learning_rate": 3.054874247434278e-06, "logits/chosen": -659659136.0, "logits/rejected": -554539392.0, "logps/chosen": -352.96894409937886, "logps/rejected": -883.4213836477987, "loss": 0.2456, "rewards/chosen": -0.4120305221273292, "rewards/margins": 5.228692748312922, "rewards/rejected": -5.640723270440252, "step": 4910 }, { "epoch": 2.575242083224287, "grad_norm": 1.2433568467442586, "kl": 0.0, "learning_rate": 2.9823286495958558e-06, "logits/chosen": -699819648.0, "logits/rejected": -545364352.0, "logps/chosen": -384.69565217391306, "logps/rejected": -883.4252199413489, "loss": 0.2342, "rewards/chosen": -0.39778428093645485, "rewards/margins": 5.525236246922782, "rewards/rejected": -5.923020527859237, "step": 4920 }, { "epoch": 2.580476315100759, "grad_norm": 0.4162299263993999, "kl": 0.0, "learning_rate": 2.9106001523479364e-06, "logits/chosen": -648124800.0, "logits/rejected": -592655168.0, "logps/chosen": -363.44303797468353, "logps/rejected": -929.1851851851852, "loss": 0.2375, "rewards/chosen": -0.4677734375, "rewards/margins": 5.842411747685185, "rewards/rejected": -6.310185185185185, "step": 4930 }, { "epoch": 2.5857105469772312, "grad_norm": 1.2374037693342654, "kl": 0.0, "learning_rate": 2.8396914176158694e-06, "logits/chosen": -693790336.0, "logits/rejected": -532047456.0, "logps/chosen": -407.1446540880503, "logps/rejected": -952.4472049689441, "loss": 0.2642, "rewards/chosen": -0.6344462460691824, "rewards/margins": 5.9152431949246065, "rewards/rejected": -6.549689440993789, "step": 4940 }, { "epoch": 2.5909447788537032, "grad_norm": 0.5581455511945357, "kl": 0.0, "learning_rate": 2.7696050769026956e-06, "logits/chosen": -608383808.0, "logits/rejected": -527171584.0, "logps/chosen": -404.16720257234726, "logps/rejected": -1129.240121580547, "loss": 0.2551, "rewards/chosen": -0.9647558279742765, "rewards/margins": 7.3635116492293715, "rewards/rejected": -8.328267477203648, "step": 4950 }, { "epoch": 2.596179010730175, "grad_norm": 0.4543139629422976, "kl": 0.0, "learning_rate": 2.7003437311914766e-06, "logits/chosen": -705377088.0, "logits/rejected": -622854144.0, "logps/chosen": -373.4177215189873, "logps/rejected": -959.4074074074074, "loss": 0.243, "rewards/chosen": -0.6134295886075949, "rewards/margins": 6.047064238552899, "rewards/rejected": -6.660493827160494, "step": 4960 }, { "epoch": 2.6014132426066476, "grad_norm": 0.5615407102756935, "kl": 0.0, "learning_rate": 2.631909950848793e-06, "logits/chosen": -709990784.0, "logits/rejected": -555535552.0, "logps/chosen": -470.8430769230769, "logps/rejected": -1022.984126984127, "loss": 0.2512, "rewards/chosen": -1.1778846153846154, "rewards/margins": 5.991956654456654, "rewards/rejected": -7.16984126984127, "step": 4970 }, { "epoch": 2.6066474744831196, "grad_norm": 0.4780257246681461, "kl": 0.0, "learning_rate": 2.5643062755293407e-06, "logits/chosen": -697512768.0, "logits/rejected": -619813248.0, "logps/chosen": -387.7281553398058, "logps/rejected": -971.7945619335347, "loss": 0.2401, "rewards/chosen": -0.6754778519417476, "rewards/margins": 5.8902321178467725, "rewards/rejected": -6.56570996978852, "step": 4980 }, { "epoch": 2.6118817063595916, "grad_norm": 0.699923487248618, "kl": 0.0, "learning_rate": 2.4975352140816615e-06, "logits/chosen": -778253120.0, "logits/rejected": -530894016.0, "logps/chosen": -379.28834355828224, "logps/rejected": -936.968152866242, "loss": 0.2341, "rewards/chosen": -0.31894651073619634, "rewards/margins": 6.252709540219218, "rewards/rejected": -6.571656050955414, "step": 4990 }, { "epoch": 2.617115938236064, "grad_norm": 0.5015730941076766, "kl": 0.0, "learning_rate": 2.4315992444550824e-06, "logits/chosen": -789577728.0, "logits/rejected": -634598208.0, "logps/chosen": -394.1366459627329, "logps/rejected": -975.2955974842768, "loss": 0.2413, "rewards/chosen": -0.46241750776397517, "rewards/margins": 6.148431548839798, "rewards/rejected": -6.610849056603773, "step": 5000 }, { "epoch": 2.617115938236064, "eval_kl": 0.0, "eval_logits/chosen": -1477659904.0, "eval_logits/rejected": -1194644352.0, "eval_logps/chosen": -527.0420583869371, "eval_logps/rejected": -771.0870213823969, "eval_loss": 0.45064061880111694, "eval_rewards/chosen": -1.9520658090054428, "eval_rewards/margins": 2.808749705663876, "eval_rewards/rejected": -4.760815514669319, "eval_runtime": 93.4433, "eval_samples_per_second": 42.807, "eval_steps_per_second": 0.674, "step": 5000 }, { "epoch": 2.622350170112536, "grad_norm": 0.9405868473590856, "kl": 0.0, "learning_rate": 2.3665008136077334e-06, "logits/chosen": -703489664.0, "logits/rejected": -601148608.0, "logps/chosen": -385.5114006514658, "logps/rejected": -1009.4894894894895, "loss": 0.2443, "rewards/chosen": -0.36499898208469056, "rewards/margins": 6.6905565734708645, "rewards/rejected": -7.055555555555555, "step": 5010 }, { "epoch": 2.627584401989008, "grad_norm": 1.0482270197343795, "kl": 0.0, "learning_rate": 2.3022423374157135e-06, "logits/chosen": -744803520.0, "logits/rejected": -540855488.0, "logps/chosen": -420.9079754601227, "logps/rejected": -787.2611464968153, "loss": 0.2767, "rewards/chosen": -0.9802051380368099, "rewards/margins": 4.1161324415810245, "rewards/rejected": -5.096337579617835, "step": 5020 }, { "epoch": 2.6328186338654804, "grad_norm": 0.4804923751480013, "kl": 0.0, "learning_rate": 2.2388262005834852e-06, "logits/chosen": -752458112.0, "logits/rejected": -549768384.0, "logps/chosen": -353.7177914110429, "logps/rejected": -899.8726114649681, "loss": 0.2598, "rewards/chosen": -0.32867618865030673, "rewards/margins": 5.797916168037592, "rewards/rejected": -6.126592356687898, "step": 5030 }, { "epoch": 2.6380528657419524, "grad_norm": 0.4130460548051139, "kl": 0.0, "learning_rate": 2.1762547565553292e-06, "logits/chosen": -732010880.0, "logits/rejected": -620547264.0, "logps/chosen": -420.453074433657, "logps/rejected": -821.558912386707, "loss": 0.2426, "rewards/chosen": -0.8308555825242718, "rewards/margins": 4.620806048895668, "rewards/rejected": -5.45166163141994, "step": 5040 }, { "epoch": 2.6432870976184244, "grad_norm": 2.0506539104559693, "kl": 0.0, "learning_rate": 2.114530327428027e-06, "logits/chosen": -746795840.0, "logits/rejected": -582588800.0, "logps/chosen": -391.55828220858893, "logps/rejected": -943.4904458598726, "loss": 0.2477, "rewards/chosen": -0.5600076687116564, "rewards/margins": 5.672476407721464, "rewards/rejected": -6.232484076433121, "step": 5050 }, { "epoch": 2.648521329494897, "grad_norm": 0.8035699204629465, "kl": 0.0, "learning_rate": 2.0536552038646963e-06, "logits/chosen": -715443392.0, "logits/rejected": -582798528.0, "logps/chosen": -408.33656957928804, "logps/rejected": -902.3806646525679, "loss": 0.2451, "rewards/chosen": -0.5545610841423948, "rewards/margins": 5.68788604576697, "rewards/rejected": -6.242447129909365, "step": 5060 }, { "epoch": 2.653755561371369, "grad_norm": 0.4585898319323983, "kl": 0.0, "learning_rate": 1.993631645009747e-06, "logits/chosen": -766089600.0, "logits/rejected": -619603584.0, "logps/chosen": -348.60681114551085, "logps/rejected": -976.0504731861199, "loss": 0.2299, "rewards/chosen": -0.15412151702786378, "rewards/margins": 6.421588262151947, "rewards/rejected": -6.575709779179811, "step": 5070 }, { "epoch": 2.658989793247841, "grad_norm": 0.5318634490809688, "kl": 0.0, "learning_rate": 1.9344618784050887e-06, "logits/chosen": -704433344.0, "logits/rejected": -550292672.0, "logps/chosen": -412.3154574132492, "logps/rejected": -909.4736842105264, "loss": 0.2276, "rewards/chosen": -0.5821205885252366, "rewards/margins": 5.635371671536683, "rewards/rejected": -6.21749226006192, "step": 5080 }, { "epoch": 2.6642240251243132, "grad_norm": 0.41118095505831503, "kl": 0.0, "learning_rate": 1.8761480999074126e-06, "logits/chosen": -799119744.0, "logits/rejected": -529740608.0, "logps/chosen": -380.5679758308157, "logps/rejected": -929.7605177993528, "loss": 0.2336, "rewards/chosen": -0.2652709592145015, "rewards/margins": 6.10123389515443, "rewards/rejected": -6.366504854368932, "step": 5090 }, { "epoch": 2.669458257000785, "grad_norm": 0.5504088625457714, "kl": 0.0, "learning_rate": 1.8186924736067479e-06, "logits/chosen": -722259136.0, "logits/rejected": -525179296.0, "logps/chosen": -355.1111111111111, "logps/rejected": -964.9620253164557, "loss": 0.2273, "rewards/chosen": -0.06327160493827161, "rewards/margins": 6.755557508985779, "rewards/rejected": -6.818829113924051, "step": 5100 }, { "epoch": 2.669458257000785, "eval_kl": 0.0, "eval_logits/chosen": -1420737280.0, "eval_logits/rejected": -1144246016.0, "eval_logps/chosen": -524.350321622959, "eval_logps/rejected": -753.7105917454003, "eval_loss": 0.4495312571525574, "eval_rewards/chosen": -1.927078179119248, "eval_rewards/margins": 2.660937733362104, "eval_rewards/rejected": -4.588015912481352, "eval_runtime": 93.4354, "eval_samples_per_second": 42.81, "eval_steps_per_second": 0.674, "step": 5100 }, { "epoch": 2.674692488877257, "grad_norm": 0.4037617876587109, "kl": 0.0, "learning_rate": 1.7620971317461182e-06, "logits/chosen": -730018624.0, "logits/rejected": -624112448.0, "logps/chosen": -335.34591194968556, "logps/rejected": -958.6086956521739, "loss": 0.2173, "rewards/chosen": -0.051088345125786166, "rewards/margins": 6.337886810153718, "rewards/rejected": -6.388975155279503, "step": 5110 }, { "epoch": 2.6799267207537296, "grad_norm": 0.47909127799613394, "kl": 0.0, "learning_rate": 1.7063641746424164e-06, "logits/chosen": -762839040.0, "logits/rejected": -514326528.0, "logps/chosen": -370.7951807228916, "logps/rejected": -817.038961038961, "loss": 0.2518, "rewards/chosen": -0.3368552334337349, "rewards/margins": 5.094962948384447, "rewards/rejected": -5.431818181818182, "step": 5120 }, { "epoch": 2.6851609526302016, "grad_norm": 0.651180944659031, "kl": 0.0, "learning_rate": 1.6514956706084883e-06, "logits/chosen": -767872192.0, "logits/rejected": -499856192.0, "logps/chosen": -337.3953488372093, "logps/rejected": -988.2162162162163, "loss": 0.2433, "rewards/chosen": 0.0436500726744186, "rewards/margins": 6.9997311537555, "rewards/rejected": -6.956081081081081, "step": 5130 }, { "epoch": 2.6903951845066736, "grad_norm": 1.0432660455153437, "kl": 0.0, "learning_rate": 1.5974936558763226e-06, "logits/chosen": -713975424.0, "logits/rejected": -539911808.0, "logps/chosen": -363.3482428115016, "logps/rejected": -944.7339449541284, "loss": 0.2372, "rewards/chosen": -0.45554612619808305, "rewards/margins": 6.005463048113843, "rewards/rejected": -6.4610091743119265, "step": 5140 }, { "epoch": 2.695629416383146, "grad_norm": 0.8243520837524749, "kl": 0.0, "learning_rate": 1.5443601345215358e-06, "logits/chosen": -716282240.0, "logits/rejected": -531261024.0, "logps/chosen": -353.57055214723925, "logps/rejected": -889.5796178343949, "loss": 0.2588, "rewards/chosen": -0.29694689417177916, "rewards/margins": 5.6075117045543355, "rewards/rejected": -5.904458598726115, "step": 5150 }, { "epoch": 2.700863648259618, "grad_norm": 0.6082955016572004, "kl": 0.0, "learning_rate": 1.4920970783889737e-06, "logits/chosen": -789368000.0, "logits/rejected": -548405248.0, "logps/chosen": -392.90909090909093, "logps/rejected": -871.3333333333334, "loss": 0.2667, "rewards/chosen": -0.6789994673295454, "rewards/margins": 4.7966949771149, "rewards/rejected": -5.475694444444445, "step": 5160 }, { "epoch": 2.70609788013609, "grad_norm": 1.3862654743582614, "kl": 0.0, "learning_rate": 1.4407064270195253e-06, "logits/chosen": -744279232.0, "logits/rejected": -584476288.0, "logps/chosen": -407.30546623794214, "logps/rejected": -871.2948328267477, "loss": 0.2287, "rewards/chosen": -0.5827220659163987, "rewards/margins": 5.0753326453298016, "rewards/rejected": -5.6580547112462005, "step": 5170 }, { "epoch": 2.7113321120125624, "grad_norm": 0.47285633180777387, "kl": 0.0, "learning_rate": 1.390190087578161e-06, "logits/chosen": -686817280.0, "logits/rejected": -630194176.0, "logps/chosen": -394.38961038961037, "logps/rejected": -863.6144578313254, "loss": 0.2427, "rewards/chosen": -0.6700994318181818, "rewards/margins": 5.0279427368565175, "rewards/rejected": -5.698042168674699, "step": 5180 }, { "epoch": 2.7165663438890344, "grad_norm": 0.6246600302967272, "kl": 0.0, "learning_rate": 1.3405499347831641e-06, "logits/chosen": -671298368.0, "logits/rejected": -661966016.0, "logps/chosen": -415.588424437299, "logps/rejected": -817.4103343465046, "loss": 0.2685, "rewards/chosen": -0.8964529742765274, "rewards/margins": 4.246404168580616, "rewards/rejected": -5.142857142857143, "step": 5190 }, { "epoch": 2.7218005757655064, "grad_norm": 0.6070359162128525, "kl": 0.0, "learning_rate": 1.2917878108365229e-06, "logits/chosen": -691535872.0, "logits/rejected": -587412288.0, "logps/chosen": -435.1392405063291, "logps/rejected": -877.0370370370371, "loss": 0.2645, "rewards/chosen": -0.9455102848101266, "rewards/margins": 5.006650209017034, "rewards/rejected": -5.952160493827161, "step": 5200 }, { "epoch": 2.7218005757655064, "eval_kl": 0.0, "eval_logits/chosen": -1426662528.0, "eval_logits/rejected": -1147708032.0, "eval_logps/chosen": -529.068777832756, "eval_logps/rejected": -767.9681750372948, "eval_loss": 0.450070321559906, "eval_rewards/chosen": -1.9729713013359722, "eval_rewards/margins": 2.7549004042831227, "eval_rewards/rejected": -4.727871705619095, "eval_runtime": 93.4352, "eval_samples_per_second": 42.81, "eval_steps_per_second": 0.674, "step": 5200 }, { "epoch": 2.727034807641979, "grad_norm": 1.0037145493476631, "kl": 0.0, "learning_rate": 1.2439055253556015e-06, "logits/chosen": -641309056.0, "logits/rejected": -601987456.0, "logps/chosen": -424.1898305084746, "logps/rejected": -1009.7159420289855, "loss": 0.2554, "rewards/chosen": -0.9086864406779661, "rewards/margins": 6.078270081061164, "rewards/rejected": -6.98695652173913, "step": 5210 }, { "epoch": 2.732269039518451, "grad_norm": 0.6312057167352639, "kl": 0.0, "learning_rate": 1.1969048553059609e-06, "logits/chosen": -784544576.0, "logits/rejected": -514588672.0, "logps/chosen": -355.0320699708455, "logps/rejected": -1084.7676767676767, "loss": 0.2584, "rewards/chosen": -0.23856596209912537, "rewards/margins": 7.694935721402558, "rewards/rejected": -7.933501683501683, "step": 5220 }, { "epoch": 2.7375032713949228, "grad_norm": 0.4977395450990626, "kl": 0.0, "learning_rate": 1.1507875449354166e-06, "logits/chosen": -731591488.0, "logits/rejected": -560411456.0, "logps/chosen": -418.01846153846157, "logps/rejected": -895.3904761904762, "loss": 0.2805, "rewards/chosen": -0.9154807692307693, "rewards/margins": 5.0964239926739925, "rewards/rejected": -6.011904761904762, "step": 5230 }, { "epoch": 2.7427375032713948, "grad_norm": 0.5728940069453299, "kl": 0.0, "learning_rate": 1.1055553057093215e-06, "logits/chosen": -733688640.0, "logits/rejected": -545364352.0, "logps/chosen": -341.25373134328356, "logps/rejected": -818.7803278688525, "loss": 0.2671, "rewards/chosen": -0.31609141791044776, "rewards/margins": 5.065875795204306, "rewards/rejected": -5.381967213114754, "step": 5240 }, { "epoch": 2.747971735147867, "grad_norm": 0.4250248012002315, "kl": 0.0, "learning_rate": 1.06120981624703e-06, "logits/chosen": -722993152.0, "logits/rejected": -619341440.0, "logps/chosen": -366.1237785016287, "logps/rejected": -946.9309309309309, "loss": 0.2349, "rewards/chosen": -0.4523082820134365, "rewards/margins": 5.939583609878456, "rewards/rejected": -6.391891891891892, "step": 5250 }, { "epoch": 2.753205967024339, "grad_norm": 0.5454111392127353, "kl": 0.0, "learning_rate": 1.017752722259624e-06, "logits/chosen": -730018624.0, "logits/rejected": -651899712.0, "logps/chosen": -406.984126984127, "logps/rejected": -901.5138461538462, "loss": 0.2371, "rewards/chosen": -0.650719246031746, "rewards/margins": 5.203126907814408, "rewards/rejected": -5.8538461538461535, "step": 5260 }, { "epoch": 2.758440198900811, "grad_norm": 0.9196034243834265, "kl": 0.0, "learning_rate": 9.751856364888178e-07, "logits/chosen": -695363200.0, "logits/rejected": -569376768.0, "logps/chosen": -404.28205128205127, "logps/rejected": -996.8780487804878, "loss": 0.2357, "rewards/chosen": -0.558769030448718, "rewards/margins": 6.532694384185429, "rewards/rejected": -7.091463414634147, "step": 5270 }, { "epoch": 2.7636744307772836, "grad_norm": 0.6213552424976277, "kl": 0.0, "learning_rate": 9.335101386471285e-07, "logits/chosen": -686607552.0, "logits/rejected": -656828032.0, "logps/chosen": -461.7721518987342, "logps/rejected": -922.5679012345679, "loss": 0.2452, "rewards/chosen": -1.2786787974683544, "rewards/margins": 4.898018733395843, "rewards/rejected": -6.176697530864198, "step": 5280 }, { "epoch": 2.7689086626537556, "grad_norm": 0.6679904514162947, "kl": 0.0, "learning_rate": 8.927277753592339e-07, "logits/chosen": -717435712.0, "logits/rejected": -489842272.0, "logps/chosen": -387.56656346749224, "logps/rejected": -1071.6466876971608, "loss": 0.2605, "rewards/chosen": -0.5881990131578947, "rewards/margins": 7.4181101351070895, "rewards/rejected": -8.006309148264984, "step": 5290 }, { "epoch": 2.7741428945302276, "grad_norm": 0.5523711430994795, "kl": 0.0, "learning_rate": 8.528400601045816e-07, "logits/chosen": -682308416.0, "logits/rejected": -570530176.0, "logps/chosen": -432.96676737160124, "logps/rejected": -1099.495145631068, "loss": 0.2637, "rewards/chosen": -1.1677799282477341, "rewards/margins": 7.145326867868771, "rewards/rejected": -8.313106796116505, "step": 5300 }, { "epoch": 2.7741428945302276, "eval_kl": 0.0, "eval_logits/chosen": -1414479104.0, "eval_logits/rejected": -1139319424.0, "eval_logps/chosen": -551.0460168233548, "eval_logps/rejected": -835.7553455992044, "eval_loss": 0.4497031271457672, "eval_rewards/chosen": -2.193159327065809, "eval_rewards/margins": 3.214598007593564, "eval_rewards/rejected": -5.407757334659373, "eval_runtime": 93.4396, "eval_samples_per_second": 42.808, "eval_steps_per_second": 0.674, "step": 5300 }, { "epoch": 2.7793771264067, "grad_norm": 0.47999789112467145, "kl": 0.0, "learning_rate": 8.138484731612272e-07, "logits/chosen": -720686272.0, "logits/rejected": -552914112.0, "logps/chosen": -408.576802507837, "logps/rejected": -1018.018691588785, "loss": 0.2428, "rewards/chosen": -0.6440536833855799, "rewards/margins": 6.398002391380776, "rewards/rejected": -7.042056074766355, "step": 5310 }, { "epoch": 2.784611358283172, "grad_norm": 0.8354416613963258, "kl": 0.0, "learning_rate": 7.757544615508927e-07, "logits/chosen": -685873536.0, "logits/rejected": -544420672.0, "logps/chosen": -448.6771159874608, "logps/rejected": -1034.766355140187, "loss": 0.2554, "rewards/chosen": -1.1204692398119123, "rewards/margins": 6.40445287856815, "rewards/rejected": -7.524922118380062, "step": 5320 }, { "epoch": 2.789845590159644, "grad_norm": 0.5173887570787987, "kl": 0.0, "learning_rate": 7.385594389852674e-07, "logits/chosen": -755499008.0, "logits/rejected": -526490016.0, "logps/chosen": -370.35158501440924, "logps/rejected": -963.6587030716723, "loss": 0.2679, "rewards/chosen": -0.6374279538904899, "rewards/margins": 5.954295595597564, "rewards/rejected": -6.591723549488054, "step": 5330 }, { "epoch": 2.795079822036116, "grad_norm": 0.4805949533097288, "kl": 0.0, "learning_rate": 7.0226478581355e-07, "logits/chosen": -667418624.0, "logits/rejected": -534459200.0, "logps/chosen": -417.87012987012986, "logps/rejected": -965.2048192771084, "loss": 0.2464, "rewards/chosen": -0.8856026785714286, "rewards/margins": 5.993162381669535, "rewards/rejected": -6.878765060240964, "step": 5340 }, { "epoch": 2.8003140539125884, "grad_norm": 0.4980762598857717, "kl": 0.0, "learning_rate": 6.668718489712039e-07, "logits/chosen": -703594496.0, "logits/rejected": -536399040.0, "logps/chosen": -437.4019292604502, "logps/rejected": -1035.5744680851064, "loss": 0.2438, "rewards/chosen": -0.9048118468649518, "rewards/margins": 6.747923715445078, "rewards/rejected": -7.65273556231003, "step": 5350 }, { "epoch": 2.8055482857890603, "grad_norm": 0.6042457279492705, "kl": 0.0, "learning_rate": 6.323819419299992e-07, "logits/chosen": -694471872.0, "logits/rejected": -554591872.0, "logps/chosen": -529.083870967742, "logps/rejected": -1095.7575757575758, "loss": 0.2427, "rewards/chosen": -1.7447580645161291, "rewards/margins": 6.24463587487781, "rewards/rejected": -7.989393939393939, "step": 5360 }, { "epoch": 2.8107825176655323, "grad_norm": 0.5542724861237701, "kl": 0.0, "learning_rate": 5.987963446492384e-07, "logits/chosen": -647128704.0, "logits/rejected": -598422336.0, "logps/chosen": -427.97241379310344, "logps/rejected": -950.8571428571429, "loss": 0.2395, "rewards/chosen": -0.9704067887931035, "rewards/margins": 5.628164639778325, "rewards/rejected": -6.598571428571429, "step": 5370 }, { "epoch": 2.8160167495420048, "grad_norm": 0.5420236989736987, "kl": 0.0, "learning_rate": 5.661163035282802e-07, "logits/chosen": -674129536.0, "logits/rejected": -598212608.0, "logps/chosen": -419.10828025477707, "logps/rejected": -1006.4294478527607, "loss": 0.2559, "rewards/chosen": -0.90734474522293, "rewards/margins": 5.825784089132899, "rewards/rejected": -6.733128834355829, "step": 5380 }, { "epoch": 2.8212509814184767, "grad_norm": 0.562792644335705, "kl": 0.0, "learning_rate": 5.343430313602738e-07, "logits/chosen": -708522816.0, "logits/rejected": -509660352.0, "logps/chosen": -383.055900621118, "logps/rejected": -904.8553459119497, "loss": 0.2659, "rewards/chosen": -0.43432162267080743, "rewards/margins": 5.90608718236064, "rewards/rejected": -6.340408805031447, "step": 5390 }, { "epoch": 2.8264852132949487, "grad_norm": 0.5976247674593289, "kl": 0.0, "learning_rate": 5.034777072871394e-07, "logits/chosen": -699505024.0, "logits/rejected": -571159360.0, "logps/chosen": -456.67973856209153, "logps/rejected": -919.5688622754491, "loss": 0.2683, "rewards/chosen": -1.327777139501634, "rewards/margins": 5.115336632953456, "rewards/rejected": -6.4431137724550895, "step": 5400 }, { "epoch": 2.8264852132949487, "eval_kl": 0.0, "eval_logits/chosen": -1420271232.0, "eval_logits/rejected": -1144445824.0, "eval_logps/chosen": -552.9777337951509, "eval_logps/rejected": -847.9124813525609, "eval_loss": 0.4496484398841858, "eval_rewards/chosen": -2.211652647204354, "eval_rewards/margins": 3.318680520373965, "eval_rewards/rejected": -5.530333167578319, "eval_runtime": 93.4275, "eval_samples_per_second": 42.814, "eval_steps_per_second": 0.674, "step": 5400 }, { "epoch": 2.831719445171421, "grad_norm": 0.8906081192471247, "kl": 0.0, "learning_rate": 4.735214767558338e-07, "logits/chosen": -713451136.0, "logits/rejected": -585105408.0, "logps/chosen": -354.5263157894737, "logps/rejected": -850.6845637583892, "loss": 0.2582, "rewards/chosen": -0.46142292580409355, "rewards/margins": 5.177838819162349, "rewards/rejected": -5.639261744966443, "step": 5410 }, { "epoch": 2.836953677047893, "grad_norm": 0.4724582634761599, "kl": 0.0, "learning_rate": 4.444754514758231e-07, "logits/chosen": -735785792.0, "logits/rejected": -530736736.0, "logps/chosen": -417.6190476190476, "logps/rejected": -1088.421052631579, "loss": 0.2558, "rewards/chosen": -0.9201078869047619, "rewards/margins": 6.978740797305765, "rewards/rejected": -7.8988486842105265, "step": 5420 }, { "epoch": 2.842187908924365, "grad_norm": 0.5395820316831023, "kl": 0.0, "learning_rate": 4.163407093778243e-07, "logits/chosen": -717121152.0, "logits/rejected": -543739072.0, "logps/chosen": -373.1482649842271, "logps/rejected": -935.9256965944272, "loss": 0.2558, "rewards/chosen": -0.5599122634069401, "rewards/margins": 5.833276591082224, "rewards/rejected": -6.393188854489164, "step": 5430 }, { "epoch": 2.8474221408008376, "grad_norm": 0.7642717712653311, "kl": 0.0, "learning_rate": 3.891182945738259e-07, "logits/chosen": -648124800.0, "logits/rejected": -659030016.0, "logps/chosen": -400.1025641025641, "logps/rejected": -936.6829268292682, "loss": 0.254, "rewards/chosen": -0.9886067708333334, "rewards/margins": 5.395539570630081, "rewards/rejected": -6.384146341463414, "step": 5440 }, { "epoch": 2.8526563726773095, "grad_norm": 0.4376405764138052, "kl": 0.0, "learning_rate": 3.628092173183023e-07, "logits/chosen": -712612224.0, "logits/rejected": -543109952.0, "logps/chosen": -414.125, "logps/rejected": -1204.4, "loss": 0.2247, "rewards/chosen": -0.70224609375, "rewards/margins": 8.368847656249999, "rewards/rejected": -9.07109375, "step": 5450 }, { "epoch": 2.8578906045537815, "grad_norm": 0.41737288250225, "kl": 0.0, "learning_rate": 3.37414453970758e-07, "logits/chosen": -698456448.0, "logits/rejected": -527433728.0, "logps/chosen": -375.7546012269939, "logps/rejected": -1011.3630573248407, "loss": 0.2532, "rewards/chosen": -0.6709403757668712, "rewards/margins": 6.528900388564339, "rewards/rejected": -7.19984076433121, "step": 5460 }, { "epoch": 2.863124836430254, "grad_norm": 0.623828872587112, "kl": 0.0, "learning_rate": 3.129349469594728e-07, "logits/chosen": -694052480.0, "logits/rejected": -591711424.0, "logps/chosen": -393.32515337423314, "logps/rejected": -972.8407643312102, "loss": 0.2531, "rewards/chosen": -0.6528230444785276, "rewards/margins": 5.910871223037397, "rewards/rejected": -6.563694267515924, "step": 5470 }, { "epoch": 2.868359068306726, "grad_norm": 0.5135357902714759, "kl": 0.0, "learning_rate": 2.8937160474652725e-07, "logits/chosen": -613049984.0, "logits/rejected": -571473920.0, "logps/chosen": -415.1111111111111, "logps/rejected": -963.1616766467066, "loss": 0.2537, "rewards/chosen": -0.9870046977124183, "rewards/margins": 5.6424863202516535, "rewards/rejected": -6.629491017964072, "step": 5480 }, { "epoch": 2.873593300183198, "grad_norm": 0.7684511654598384, "kl": 0.0, "learning_rate": 2.667253017941018e-07, "logits/chosen": -783810560.0, "logits/rejected": -508873920.0, "logps/chosen": -450.54117647058825, "logps/rejected": -1056.96, "loss": 0.2651, "rewards/chosen": -1.0138327205882354, "rewards/margins": 6.628667279411765, "rewards/rejected": -7.6425, "step": 5490 }, { "epoch": 2.8788275320596703, "grad_norm": 0.6529707604480456, "kl": 0.0, "learning_rate": 2.449968785320139e-07, "logits/chosen": -714289984.0, "logits/rejected": -590715264.0, "logps/chosen": -452.7840531561462, "logps/rejected": -955.3746312684366, "loss": 0.2551, "rewards/chosen": -1.0597487541528239, "rewards/margins": 5.417389888915023, "rewards/rejected": -6.477138643067847, "step": 5500 }, { "epoch": 2.8788275320596703, "eval_kl": 0.0, "eval_logits/chosen": -1425663872.0, "eval_logits/rejected": -1148307200.0, "eval_logps/chosen": -551.4576942107867, "eval_logps/rejected": -846.35305818001, "eval_loss": 0.44935154914855957, "eval_rewards/chosen": -2.1969322117763483, "eval_rewards/margins": 3.312391507766168, "eval_rewards/rejected": -5.509323719542516, "eval_runtime": 93.4457, "eval_samples_per_second": 42.806, "eval_steps_per_second": 0.674, "step": 5500 }, { "epoch": 2.8840617639361423, "grad_norm": 0.7418538204206191, "kl": 0.0, "learning_rate": 2.2418714132653173e-07, "logits/chosen": -696569024.0, "logits/rejected": -590453120.0, "logps/chosen": -439.5974842767296, "logps/rejected": -989.1180124223603, "loss": 0.2709, "rewards/chosen": -0.9837608097484277, "rewards/margins": 5.791860308263995, "rewards/rejected": -6.775621118012422, "step": 5510 }, { "epoch": 2.8892959958126143, "grad_norm": 0.4173372869938946, "kl": 0.0, "learning_rate": 2.0429686245045098e-07, "logits/chosen": -672556672.0, "logits/rejected": -566545600.0, "logps/chosen": -474.6842105263158, "logps/rejected": -996.5714285714286, "loss": 0.2616, "rewards/chosen": -1.384534333881579, "rewards/margins": 5.601328761356516, "rewards/rejected": -6.985863095238095, "step": 5520 }, { "epoch": 2.8945302276890867, "grad_norm": 1.1299589811683035, "kl": 0.0, "learning_rate": 1.853267800544234e-07, "logits/chosen": -631767040.0, "logits/rejected": -611581952.0, "logps/chosen": -452.3076923076923, "logps/rejected": -1065.3658536585365, "loss": 0.2683, "rewards/chosen": -1.2772435897435896, "rewards/margins": 6.299738117573484, "rewards/rejected": -7.576981707317073, "step": 5530 }, { "epoch": 2.8997644595655587, "grad_norm": 0.8935381297411652, "kl": 0.0, "learning_rate": 1.6727759813958965e-07, "logits/chosen": -684615296.0, "logits/rejected": -543948800.0, "logps/chosen": -414.6953846153846, "logps/rejected": -994.1333333333333, "loss": 0.2516, "rewards/chosen": -1.030673076923077, "rewards/margins": 5.983612637362637, "rewards/rejected": -7.014285714285714, "step": 5540 }, { "epoch": 2.9049986914420307, "grad_norm": 0.48082815632583886, "kl": 0.0, "learning_rate": 1.501499865314171e-07, "logits/chosen": -717540544.0, "logits/rejected": -618921984.0, "logps/chosen": -390.4891640866873, "logps/rejected": -1102.1324921135647, "loss": 0.2538, "rewards/chosen": -0.614454334365325, "rewards/margins": 7.245955760271899, "rewards/rejected": -7.860410094637224, "step": 5550 }, { "epoch": 2.910232923318503, "grad_norm": 1.2162251721623474, "kl": 0.0, "learning_rate": 1.3394458085487505e-07, "logits/chosen": -692374720.0, "logits/rejected": -560516288.0, "logps/chosen": -422.4935064935065, "logps/rejected": -964.2409638554217, "loss": 0.2427, "rewards/chosen": -0.7687195616883117, "rewards/margins": 5.697394896143013, "rewards/rejected": -6.466114457831325, "step": 5560 }, { "epoch": 2.915467155194975, "grad_norm": 0.977753969363026, "kl": 0.0, "learning_rate": 1.1866198251082594e-07, "logits/chosen": -727606912.0, "logits/rejected": -533830048.0, "logps/chosen": -468.3987915407855, "logps/rejected": -1069.7734627831715, "loss": 0.2706, "rewards/chosen": -1.368202416918429, "rewards/margins": 6.212703731948885, "rewards/rejected": -7.580906148867314, "step": 5570 }, { "epoch": 2.920701387071447, "grad_norm": 0.48217186745357443, "kl": 0.0, "learning_rate": 1.0430275865371265e-07, "logits/chosen": -637009920.0, "logits/rejected": -540121472.0, "logps/chosen": -410.9306930693069, "logps/rejected": -1008.9970326409496, "loss": 0.2447, "rewards/chosen": -0.8706683168316832, "rewards/margins": 6.483931089696507, "rewards/rejected": -7.35459940652819, "step": 5580 }, { "epoch": 2.9259356189479195, "grad_norm": 0.3671802124859091, "kl": 0.0, "learning_rate": 9.086744217050857e-08, "logits/chosen": -659030016.0, "logits/rejected": -551341248.0, "logps/chosen": -413.5424836601307, "logps/rejected": -1029.748502994012, "loss": 0.2476, "rewards/chosen": -0.7669015522875817, "rewards/margins": 6.527260124359125, "rewards/rejected": -7.294161676646707, "step": 5590 }, { "epoch": 2.9311698508243915, "grad_norm": 0.8070041617835978, "kl": 0.0, "learning_rate": 7.835653166094747e-08, "logits/chosen": -722888320.0, "logits/rejected": -539806912.0, "logps/chosen": -418.1951219512195, "logps/rejected": -940.6153846153846, "loss": 0.2695, "rewards/chosen": -1.0270400628810976, "rewards/margins": 5.376405449939416, "rewards/rejected": -6.403445512820513, "step": 5600 }, { "epoch": 2.9311698508243915, "eval_kl": 0.0, "eval_logits/chosen": -1428859520.0, "eval_logits/rejected": -1154099328.0, "eval_logps/chosen": -548.8292924294904, "eval_logps/rejected": -841.0064644455495, "eval_loss": 0.449031263589859, "eval_rewards/chosen": -2.172068283028204, "eval_rewards/margins": 3.2870316672452917, "eval_rewards/rejected": -5.459099950273496, "eval_runtime": 93.4114, "eval_samples_per_second": 42.821, "eval_steps_per_second": 0.674, "step": 5600 }, { "epoch": 2.9364040827008635, "grad_norm": 0.567629032177363, "kl": 0.0, "learning_rate": 6.677049141901315e-08, "logits/chosen": -703279936.0, "logits/rejected": -649383104.0, "logps/chosen": -395.5880398671096, "logps/rejected": -1018.3362831858407, "loss": 0.2348, "rewards/chosen": -0.4837001661129568, "rewards/margins": 6.577509273415067, "rewards/rejected": -7.061209439528024, "step": 5610 }, { "epoch": 2.941638314577336, "grad_norm": 0.6635829541099674, "kl": 0.0, "learning_rate": 5.610975141571162e-08, "logits/chosen": -723622272.0, "logits/rejected": -624531840.0, "logps/chosen": -416.86624203821657, "logps/rejected": -1150.8220858895706, "loss": 0.2657, "rewards/chosen": -0.8588525079617835, "rewards/margins": 7.375043197559689, "rewards/rejected": -8.233895705521473, "step": 5620 }, { "epoch": 2.946872546453808, "grad_norm": 0.6391268824424916, "kl": 0.0, "learning_rate": 4.6374707283117215e-08, "logits/chosen": -744069504.0, "logits/rejected": -576349824.0, "logps/chosen": -394.24, "logps/rejected": -878.0190476190476, "loss": 0.2767, "rewards/chosen": -0.7142788461538462, "rewards/margins": 5.07143543956044, "rewards/rejected": -5.785714285714286, "step": 5630 }, { "epoch": 2.95210677833028, "grad_norm": 0.4791240509178076, "kl": 0.0, "learning_rate": 3.7565720299687076e-08, "logits/chosen": -732849792.0, "logits/rejected": -558478144.0, "logps/chosen": -396.319018404908, "logps/rejected": -951.4394904458599, "loss": 0.2625, "rewards/chosen": -0.5491875958588958, "rewards/margins": 5.981863359555117, "rewards/rejected": -6.531050955414012, "step": 5640 }, { "epoch": 2.9573410102067523, "grad_norm": 0.41528798410488044, "kl": 0.0, "learning_rate": 2.9683117376852475e-08, "logits/chosen": -745537536.0, "logits/rejected": -595486336.0, "logps/chosen": -381.840490797546, "logps/rejected": -956.7388535031847, "loss": 0.2575, "rewards/chosen": -0.3813266871165644, "rewards/margins": 6.192717898870697, "rewards/rejected": -6.574044585987261, "step": 5650 }, { "epoch": 2.9625752420832243, "grad_norm": 0.5510367626492982, "kl": 0.0, "learning_rate": 2.272719104688403e-08, "logits/chosen": -772171392.0, "logits/rejected": -536870912.0, "logps/chosen": -351.65, "logps/rejected": -1027.6, "loss": 0.2431, "rewards/chosen": 0.03939208984375, "rewards/margins": 7.49798583984375, "rewards/rejected": -7.45859375, "step": 5660 }, { "epoch": 2.9678094739596963, "grad_norm": 0.8534857961161093, "kl": 0.0, "learning_rate": 1.6698199452053198e-08, "logits/chosen": -638346880.0, "logits/rejected": -599523328.0, "logps/chosen": -422.4533333333333, "logps/rejected": -949.2705882352941, "loss": 0.2679, "rewards/chosen": -1.1184375, "rewards/margins": 5.462444852941177, "rewards/rejected": -6.580882352941177, "step": 5670 }, { "epoch": 2.9730437058361687, "grad_norm": 0.679885460906798, "kl": 0.0, "learning_rate": 1.1596366335023257e-08, "logits/chosen": -714499712.0, "logits/rejected": -607125504.0, "logps/chosen": -410.2180685358255, "logps/rejected": -1051.6865203761756, "loss": 0.2684, "rewards/chosen": -0.8111005646417445, "rewards/margins": 6.545482507458569, "rewards/rejected": -7.356583072100314, "step": 5680 }, { "epoch": 2.9782779377126407, "grad_norm": 1.111508955638191, "kl": 0.0, "learning_rate": 7.42188103057262e-09, "logits/chosen": -697198208.0, "logits/rejected": -521194688.0, "logps/chosen": -484.1025641025641, "logps/rejected": -1003.2941176470588, "loss": 0.3154, "rewards/chosen": -1.8350249287749287, "rewards/margins": 5.181843583335798, "rewards/rejected": -7.016868512110727, "step": 5690 }, { "epoch": 2.9835121695891127, "grad_norm": 1.4290512414061876, "kl": 0.0, "learning_rate": 4.174898458556009e-09, "logits/chosen": -756652416.0, "logits/rejected": -551550976.0, "logps/chosen": -395.7317073170732, "logps/rejected": -899.8974358974359, "loss": 0.2664, "rewards/chosen": -0.5129215891768293, "rewards/margins": 5.686597641592401, "rewards/rejected": -6.199519230769231, "step": 5700 }, { "epoch": 2.9835121695891127, "eval_kl": 0.0, "eval_logits/chosen": -1428926080.0, "eval_logits/rejected": -1154165888.0, "eval_logps/chosen": -548.5442850074221, "eval_logps/rejected": -840.7200397812034, "eval_loss": 0.4491328001022339, "eval_rewards/chosen": -2.1686046511627906, "eval_rewards/margins": 3.2858955974697306, "eval_rewards/rejected": -5.454500248632521, "eval_runtime": 93.4352, "eval_samples_per_second": 42.81, "eval_steps_per_second": 0.674, "step": 5700 }, { "epoch": 2.988746401465585, "grad_norm": 0.7011755361178991, "kl": 0.0, "learning_rate": 1.8555391181507288e-09, "logits/chosen": -752353280.0, "logits/rejected": -584895680.0, "logps/chosen": -422.3105590062112, "logps/rejected": -1009.5094339622641, "loss": 0.2625, "rewards/chosen": -0.750873447204969, "rewards/margins": 6.488120263486856, "rewards/rejected": -7.238993710691824, "step": 5710 }, { "epoch": 2.993980633342057, "grad_norm": 1.9855747771297168, "kl": 0.0, "learning_rate": 4.638890833991161e-10, "logits/chosen": -771856768.0, "logits/rejected": -604137088.0, "logps/chosen": -429.2048192771084, "logps/rejected": -1076.5714285714287, "loss": 0.2949, "rewards/chosen": -0.9644672439759037, "rewards/margins": 6.747383405374746, "rewards/rejected": -7.71185064935065, "step": 5720 }, { "epoch": 2.999214865218529, "grad_norm": 0.5949657119573064, "kl": 0.0, "learning_rate": 0.0, "logits/chosen": -709256832.0, "logits/rejected": -596089216.0, "logps/chosen": -497.5903614457831, "logps/rejected": -1091.844155844156, "loss": 0.2986, "rewards/chosen": -1.5481574736445782, "rewards/margins": 6.373108760121656, "rewards/rejected": -7.921266233766234, "step": 5730 }, { "epoch": 2.999214865218529, "step": 5730, "total_flos": 0.0, "train_loss": 0.0, "train_runtime": 0.0431, "train_samples_per_second": 8512981.493, "train_steps_per_second": 132983.787 } ], "logging_steps": 10, "max_steps": 5730, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }