{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.998828811243412, "eval_steps": 75, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024985360140542652, "grad_norm": 14.177000429059124, "learning_rate": 4e-09, "logits/chosen": -0.7216415405273438, "logits/rejected": -0.7776755690574646, "logps/chosen": -180.14370727539062, "logps/rejected": -163.619384765625, "loss": 0.5911, "rewards/accuracies": 0.671875, "rewards/chosen": 2.089264392852783, "rewards/margins": 0.4413459897041321, "rewards/rejected": 1.647918462753296, "step": 2 }, { "epoch": 0.049970720281085304, "grad_norm": 13.032161582993453, "learning_rate": 8e-09, "logits/chosen": -0.6800286769866943, "logits/rejected": -0.7293923497200012, "logps/chosen": -187.60638427734375, "logps/rejected": -170.1859893798828, "loss": 0.5762, "rewards/accuracies": 0.66796875, "rewards/chosen": 2.104745388031006, "rewards/margins": 0.4643358588218689, "rewards/rejected": 1.6404094696044922, "step": 4 }, { "epoch": 0.07495608042162795, "grad_norm": 14.424559495239313, "learning_rate": 1.1999999999999998e-08, "logits/chosen": -0.7123140096664429, "logits/rejected": -0.7762022018432617, "logps/chosen": -186.28652954101562, "logps/rejected": -169.9141387939453, "loss": 0.589, "rewards/accuracies": 0.66015625, "rewards/chosen": 2.052624464035034, "rewards/margins": 0.40381550788879395, "rewards/rejected": 1.6488089561462402, "step": 6 }, { "epoch": 0.09994144056217061, "grad_norm": 15.25048206057953, "learning_rate": 1.6e-08, "logits/chosen": -0.6801178455352783, "logits/rejected": -0.7383629083633423, "logps/chosen": -184.20584106445312, "logps/rejected": -170.67449951171875, "loss": 0.5855, "rewards/accuracies": 0.703125, "rewards/chosen": 2.0641534328460693, "rewards/margins": 0.4149114787578583, "rewards/rejected": 1.6492421627044678, "step": 8 }, { "epoch": 0.12492680070271325, "grad_norm": 15.020860898871753, "learning_rate": 2e-08, "logits/chosen": -0.6762746572494507, "logits/rejected": -0.7306088805198669, "logps/chosen": -181.46649169921875, "logps/rejected": -171.62355041503906, "loss": 0.5886, "rewards/accuracies": 0.671875, "rewards/chosen": 2.0547406673431396, "rewards/margins": 0.4107271730899811, "rewards/rejected": 1.6440132856369019, "step": 10 }, { "epoch": 0.1499121608432559, "grad_norm": 15.546401340611093, "learning_rate": 2.3999999999999997e-08, "logits/chosen": -0.6915724873542786, "logits/rejected": -0.7597174048423767, "logps/chosen": -191.33026123046875, "logps/rejected": -172.05740356445312, "loss": 0.5952, "rewards/accuracies": 0.68359375, "rewards/chosen": 2.090841293334961, "rewards/margins": 0.4058065116405487, "rewards/rejected": 1.685034990310669, "step": 12 }, { "epoch": 0.17489752098379854, "grad_norm": 15.179087180823856, "learning_rate": 2.8000000000000003e-08, "logits/chosen": -0.648224413394928, "logits/rejected": -0.7036635875701904, "logps/chosen": -181.56240844726562, "logps/rejected": -161.77291870117188, "loss": 0.5662, "rewards/accuracies": 0.69140625, "rewards/chosen": 2.07316255569458, "rewards/margins": 0.45191100239753723, "rewards/rejected": 1.6212515830993652, "step": 14 }, { "epoch": 0.19988288112434122, "grad_norm": 14.717039853262694, "learning_rate": 3.2e-08, "logits/chosen": -0.678563117980957, "logits/rejected": -0.7321793437004089, "logps/chosen": -191.10882568359375, "logps/rejected": -172.11471557617188, "loss": 0.5792, "rewards/accuracies": 0.69140625, "rewards/chosen": 2.1359810829162598, "rewards/margins": 0.4702029824256897, "rewards/rejected": 1.6657780408859253, "step": 16 }, { "epoch": 0.22486824126488386, "grad_norm": 15.26388916085504, "learning_rate": 3.6e-08, "logits/chosen": -0.6794100999832153, "logits/rejected": -0.7467265129089355, "logps/chosen": -190.895751953125, "logps/rejected": -171.15126037597656, "loss": 0.5949, "rewards/accuracies": 0.66796875, "rewards/chosen": 2.0863959789276123, "rewards/margins": 0.4278351664543152, "rewards/rejected": 1.658560872077942, "step": 18 }, { "epoch": 0.2498536014054265, "grad_norm": 14.719456857161541, "learning_rate": 4e-08, "logits/chosen": -0.6606219410896301, "logits/rejected": -0.7190724611282349, "logps/chosen": -179.43295288085938, "logps/rejected": -163.46678161621094, "loss": 0.5819, "rewards/accuracies": 0.71484375, "rewards/chosen": 2.1400554180145264, "rewards/margins": 0.5210827589035034, "rewards/rejected": 1.6189727783203125, "step": 20 }, { "epoch": 0.27483896154596915, "grad_norm": 14.799553727376024, "learning_rate": 4.4e-08, "logits/chosen": -0.6596983671188354, "logits/rejected": -0.7132915258407593, "logps/chosen": -186.89849853515625, "logps/rejected": -177.6392364501953, "loss": 0.5881, "rewards/accuracies": 0.73828125, "rewards/chosen": 2.0855584144592285, "rewards/margins": 0.5197086334228516, "rewards/rejected": 1.5658495426177979, "step": 22 }, { "epoch": 0.2998243216865118, "grad_norm": 16.401751337438842, "learning_rate": 4.799999999999999e-08, "logits/chosen": -0.6935199499130249, "logits/rejected": -0.7622916102409363, "logps/chosen": -191.56312561035156, "logps/rejected": -166.0808563232422, "loss": 0.5876, "rewards/accuracies": 0.72265625, "rewards/chosen": 2.1759369373321533, "rewards/margins": 0.584960401058197, "rewards/rejected": 1.590976357460022, "step": 24 }, { "epoch": 0.32480968182705444, "grad_norm": 14.092401453744207, "learning_rate": 5.2e-08, "logits/chosen": -0.6964302062988281, "logits/rejected": -0.7522369623184204, "logps/chosen": -183.28709411621094, "logps/rejected": -176.45947265625, "loss": 0.5887, "rewards/accuracies": 0.71484375, "rewards/chosen": 2.1809558868408203, "rewards/margins": 0.4667380154132843, "rewards/rejected": 1.7142179012298584, "step": 26 }, { "epoch": 0.3497950419675971, "grad_norm": 14.992677802834425, "learning_rate": 5.6000000000000005e-08, "logits/chosen": -0.6727583408355713, "logits/rejected": -0.7299581170082092, "logps/chosen": -176.1138458251953, "logps/rejected": -165.51553344726562, "loss": 0.5947, "rewards/accuracies": 0.6875, "rewards/chosen": 2.0702695846557617, "rewards/margins": 0.42442983388900757, "rewards/rejected": 1.645839810371399, "step": 28 }, { "epoch": 0.3747804021081398, "grad_norm": 14.234258704647342, "learning_rate": 6e-08, "logits/chosen": -0.6875941753387451, "logits/rejected": -0.7378899455070496, "logps/chosen": -179.08218383789062, "logps/rejected": -170.3502197265625, "loss": 0.5854, "rewards/accuracies": 0.67578125, "rewards/chosen": 2.12485408782959, "rewards/margins": 0.4190685749053955, "rewards/rejected": 1.7057857513427734, "step": 30 }, { "epoch": 0.39976576224868243, "grad_norm": 15.320369672206587, "learning_rate": 6.4e-08, "logits/chosen": -0.6457805633544922, "logits/rejected": -0.7087669372558594, "logps/chosen": -174.39279174804688, "logps/rejected": -161.18417358398438, "loss": 0.556, "rewards/accuracies": 0.6796875, "rewards/chosen": 2.1353416442871094, "rewards/margins": 0.5485972166061401, "rewards/rejected": 1.5867444276809692, "step": 32 }, { "epoch": 0.4247511223892251, "grad_norm": 14.996986688498861, "learning_rate": 6.8e-08, "logits/chosen": -0.677814781665802, "logits/rejected": -0.7355855703353882, "logps/chosen": -184.22764587402344, "logps/rejected": -164.98434448242188, "loss": 0.5726, "rewards/accuracies": 0.71484375, "rewards/chosen": 2.0874392986297607, "rewards/margins": 0.5497796535491943, "rewards/rejected": 1.5376596450805664, "step": 34 }, { "epoch": 0.4497364825297677, "grad_norm": 15.836133161520731, "learning_rate": 7.2e-08, "logits/chosen": -0.6465247273445129, "logits/rejected": -0.7009165287017822, "logps/chosen": -183.50096130371094, "logps/rejected": -172.29428100585938, "loss": 0.5909, "rewards/accuracies": 0.6796875, "rewards/chosen": 2.1548638343811035, "rewards/margins": 0.48915886878967285, "rewards/rejected": 1.6657049655914307, "step": 36 }, { "epoch": 0.47472184267031037, "grad_norm": 15.068791059044901, "learning_rate": 7.599999999999999e-08, "logits/chosen": -0.684742271900177, "logits/rejected": -0.7406108379364014, "logps/chosen": -178.16604614257812, "logps/rejected": -172.45472717285156, "loss": 0.5728, "rewards/accuracies": 0.69140625, "rewards/chosen": 2.1049206256866455, "rewards/margins": 0.4164605736732483, "rewards/rejected": 1.6884599924087524, "step": 38 }, { "epoch": 0.499707202810853, "grad_norm": 15.172663789942417, "learning_rate": 8e-08, "logits/chosen": -0.6802005767822266, "logits/rejected": -0.7308796048164368, "logps/chosen": -178.501708984375, "logps/rejected": -162.97750854492188, "loss": 0.5778, "rewards/accuracies": 0.703125, "rewards/chosen": 2.145007848739624, "rewards/margins": 0.5102132558822632, "rewards/rejected": 1.6347947120666504, "step": 40 }, { "epoch": 0.5246925629513957, "grad_norm": 13.193729893516823, "learning_rate": 8.4e-08, "logits/chosen": -0.67890864610672, "logits/rejected": -0.7359157204627991, "logps/chosen": -180.63043212890625, "logps/rejected": -177.0836181640625, "loss": 0.5706, "rewards/accuracies": 0.70703125, "rewards/chosen": 2.1336517333984375, "rewards/margins": 0.4847910404205322, "rewards/rejected": 1.6488608121871948, "step": 42 }, { "epoch": 0.5496779230919383, "grad_norm": 14.543118187410414, "learning_rate": 8.8e-08, "logits/chosen": -0.6593753099441528, "logits/rejected": -0.7200923562049866, "logps/chosen": -179.75027465820312, "logps/rejected": -161.65733337402344, "loss": 0.5614, "rewards/accuracies": 0.72265625, "rewards/chosen": 2.1679983139038086, "rewards/margins": 0.6123022437095642, "rewards/rejected": 1.5556960105895996, "step": 44 }, { "epoch": 0.574663283232481, "grad_norm": 14.478724863209543, "learning_rate": 9.2e-08, "logits/chosen": -0.6733975410461426, "logits/rejected": -0.725917398929596, "logps/chosen": -179.19137573242188, "logps/rejected": -167.48928833007812, "loss": 0.5721, "rewards/accuracies": 0.7265625, "rewards/chosen": 2.126537561416626, "rewards/margins": 0.5884015560150146, "rewards/rejected": 1.5381361246109009, "step": 46 }, { "epoch": 0.5996486433730236, "grad_norm": 13.598065354511457, "learning_rate": 9.599999999999999e-08, "logits/chosen": -0.6861451864242554, "logits/rejected": -0.7490273118019104, "logps/chosen": -192.40524291992188, "logps/rejected": -166.65826416015625, "loss": 0.5666, "rewards/accuracies": 0.69140625, "rewards/chosen": 2.1456832885742188, "rewards/margins": 0.5288498997688293, "rewards/rejected": 1.6168336868286133, "step": 48 }, { "epoch": 0.6246340035135662, "grad_norm": 13.749586623653736, "learning_rate": 1e-07, "logits/chosen": -0.6737085580825806, "logits/rejected": -0.7165706753730774, "logps/chosen": -176.8297119140625, "logps/rejected": -168.13772583007812, "loss": 0.5656, "rewards/accuracies": 0.734375, "rewards/chosen": 2.0855467319488525, "rewards/margins": 0.5343782305717468, "rewards/rejected": 1.551168441772461, "step": 50 }, { "epoch": 0.6496193636541089, "grad_norm": 14.236522635027217, "learning_rate": 1.04e-07, "logits/chosen": -0.6797468662261963, "logits/rejected": -0.7432878613471985, "logps/chosen": -180.42208862304688, "logps/rejected": -165.42669677734375, "loss": 0.5405, "rewards/accuracies": 0.71875, "rewards/chosen": 2.1683268547058105, "rewards/margins": 0.678287148475647, "rewards/rejected": 1.4900394678115845, "step": 52 }, { "epoch": 0.6746047237946515, "grad_norm": 12.868471228668062, "learning_rate": 1.08e-07, "logits/chosen": -0.6745160818099976, "logits/rejected": -0.7254283428192139, "logps/chosen": -183.60704040527344, "logps/rejected": -170.13792419433594, "loss": 0.5348, "rewards/accuracies": 0.73046875, "rewards/chosen": 2.0506410598754883, "rewards/margins": 0.5821288228034973, "rewards/rejected": 1.4685120582580566, "step": 54 }, { "epoch": 0.6995900839351942, "grad_norm": 13.767404224251546, "learning_rate": 1.1200000000000001e-07, "logits/chosen": -0.6854877471923828, "logits/rejected": -0.746857225894928, "logps/chosen": -178.72006225585938, "logps/rejected": -162.08724975585938, "loss": 0.5387, "rewards/accuracies": 0.703125, "rewards/chosen": 2.124311923980713, "rewards/margins": 0.6121358871459961, "rewards/rejected": 1.5121760368347168, "step": 56 }, { "epoch": 0.7245754440757368, "grad_norm": 13.792453070210335, "learning_rate": 1.1599999999999999e-07, "logits/chosen": -0.6948191523551941, "logits/rejected": -0.7636308073997498, "logps/chosen": -195.91062927246094, "logps/rejected": -176.2474365234375, "loss": 0.5338, "rewards/accuracies": 0.73046875, "rewards/chosen": 2.136486530303955, "rewards/margins": 0.6362313032150269, "rewards/rejected": 1.5002549886703491, "step": 58 }, { "epoch": 0.7495608042162796, "grad_norm": 12.863853198025703, "learning_rate": 1.2e-07, "logits/chosen": -0.6513829231262207, "logits/rejected": -0.7188961505889893, "logps/chosen": -190.9204864501953, "logps/rejected": -170.77809143066406, "loss": 0.5334, "rewards/accuracies": 0.7109375, "rewards/chosen": 2.1112589836120605, "rewards/margins": 0.6223936676979065, "rewards/rejected": 1.4888653755187988, "step": 60 }, { "epoch": 0.7745461643568222, "grad_norm": 13.104450063440881, "learning_rate": 1.24e-07, "logits/chosen": -0.6542866230010986, "logits/rejected": -0.7189180254936218, "logps/chosen": -183.43380737304688, "logps/rejected": -169.60279846191406, "loss": 0.538, "rewards/accuracies": 0.7578125, "rewards/chosen": 2.178042411804199, "rewards/margins": 0.7206485867500305, "rewards/rejected": 1.4573938846588135, "step": 62 }, { "epoch": 0.7995315244973649, "grad_norm": 14.223685904396252, "learning_rate": 1.28e-07, "logits/chosen": -0.6746785640716553, "logits/rejected": -0.7393426895141602, "logps/chosen": -187.83718872070312, "logps/rejected": -170.26303100585938, "loss": 0.5409, "rewards/accuracies": 0.71875, "rewards/chosen": 2.0394065380096436, "rewards/margins": 0.6107546091079712, "rewards/rejected": 1.4286518096923828, "step": 64 }, { "epoch": 0.8245168846379075, "grad_norm": 11.892937673420246, "learning_rate": 1.32e-07, "logits/chosen": -0.6900768280029297, "logits/rejected": -0.7491022944450378, "logps/chosen": -190.11602783203125, "logps/rejected": -177.3651123046875, "loss": 0.4913, "rewards/accuracies": 0.7890625, "rewards/chosen": 2.1415176391601562, "rewards/margins": 0.7801377773284912, "rewards/rejected": 1.3613799810409546, "step": 66 }, { "epoch": 0.8495022447784502, "grad_norm": 12.90980511070953, "learning_rate": 1.36e-07, "logits/chosen": -0.671898603439331, "logits/rejected": -0.7221825122833252, "logps/chosen": -182.26194763183594, "logps/rejected": -171.02517700195312, "loss": 0.5013, "rewards/accuracies": 0.78125, "rewards/chosen": 1.9341095685958862, "rewards/margins": 0.7295835018157959, "rewards/rejected": 1.2045260667800903, "step": 68 }, { "epoch": 0.8744876049189928, "grad_norm": 11.941560260701717, "learning_rate": 1.3999999999999998e-07, "logits/chosen": -0.6553590297698975, "logits/rejected": -0.7277964353561401, "logps/chosen": -191.17935180664062, "logps/rejected": -182.75697326660156, "loss": 0.4992, "rewards/accuracies": 0.76171875, "rewards/chosen": 1.9563246965408325, "rewards/margins": 0.735268771648407, "rewards/rejected": 1.2210559844970703, "step": 70 }, { "epoch": 0.8994729650595354, "grad_norm": 11.363918779296476, "learning_rate": 1.44e-07, "logits/chosen": -0.6677660346031189, "logits/rejected": -0.7358181476593018, "logps/chosen": -187.28823852539062, "logps/rejected": -175.23736572265625, "loss": 0.4782, "rewards/accuracies": 0.8203125, "rewards/chosen": 1.8920280933380127, "rewards/margins": 0.9304031729698181, "rewards/rejected": 0.9616249799728394, "step": 72 }, { "epoch": 0.9244583252000781, "grad_norm": 11.665594088546383, "learning_rate": 1.48e-07, "logits/chosen": -0.6850963830947876, "logits/rejected": -0.750001072883606, "logps/chosen": -190.98031616210938, "logps/rejected": -173.23446655273438, "loss": 0.4758, "rewards/accuracies": 0.79296875, "rewards/chosen": 1.8105218410491943, "rewards/margins": 0.8698927760124207, "rewards/rejected": 0.9406291246414185, "step": 74 }, { "epoch": 0.9369510052703494, "eval_logits/chosen": -0.6242849230766296, "eval_logits/rejected": -0.7280451059341431, "eval_logps/chosen": -193.7286376953125, "eval_logps/rejected": -160.47738647460938, "eval_loss": 0.5220226645469666, "eval_rewards/accuracies": 0.8399999737739563, "eval_rewards/chosen": 1.9179359674453735, "eval_rewards/margins": 1.0572994947433472, "eval_rewards/rejected": 0.8606364727020264, "eval_runtime": 29.5424, "eval_samples_per_second": 3.385, "eval_steps_per_second": 0.846, "step": 75 }, { "epoch": 0.9494436853406207, "grad_norm": 11.894064434017581, "learning_rate": 1.5199999999999998e-07, "logits/chosen": -0.6667495965957642, "logits/rejected": -0.7195772528648376, "logps/chosen": -188.1094512939453, "logps/rejected": -173.92083740234375, "loss": 0.4603, "rewards/accuracies": 0.8125, "rewards/chosen": 1.8090986013412476, "rewards/margins": 0.9019326567649841, "rewards/rejected": 0.9071658849716187, "step": 76 }, { "epoch": 0.9744290454811634, "grad_norm": 12.785309864943024, "learning_rate": 1.56e-07, "logits/chosen": -0.6789891123771667, "logits/rejected": -0.7438546419143677, "logps/chosen": -190.32470703125, "logps/rejected": -169.19207763671875, "loss": 0.4541, "rewards/accuracies": 0.80078125, "rewards/chosen": 1.736232042312622, "rewards/margins": 0.9202592372894287, "rewards/rejected": 0.8159728050231934, "step": 78 }, { "epoch": 0.999414405621706, "grad_norm": 12.07847451966405, "learning_rate": 1.6e-07, "logits/chosen": -0.687256395816803, "logits/rejected": -0.7596179246902466, "logps/chosen": -195.14768981933594, "logps/rejected": -174.72589111328125, "loss": 0.4471, "rewards/accuracies": 0.8359375, "rewards/chosen": 1.6982433795928955, "rewards/margins": 0.9969222545623779, "rewards/rejected": 0.7013211250305176, "step": 80 }, { "epoch": 1.0243997657622488, "grad_norm": 12.209730650020617, "learning_rate": 1.6399999999999999e-07, "logits/chosen": -0.6745339632034302, "logits/rejected": -0.7286314368247986, "logps/chosen": -184.5238494873047, "logps/rejected": -180.26815795898438, "loss": 0.4635, "rewards/accuracies": 0.78515625, "rewards/chosen": 1.6792489290237427, "rewards/margins": 0.8308749198913574, "rewards/rejected": 0.8483741879463196, "step": 82 }, { "epoch": 1.0493851259027913, "grad_norm": 11.63839350311622, "learning_rate": 1.68e-07, "logits/chosen": -0.6981229186058044, "logits/rejected": -0.7625120878219604, "logps/chosen": -191.15847778320312, "logps/rejected": -189.08364868164062, "loss": 0.4418, "rewards/accuracies": 0.80078125, "rewards/chosen": 1.7394218444824219, "rewards/margins": 1.0957342386245728, "rewards/rejected": 0.6436874866485596, "step": 84 }, { "epoch": 1.074370486043334, "grad_norm": 12.047547514582906, "learning_rate": 1.7199999999999998e-07, "logits/chosen": -0.6573597192764282, "logits/rejected": -0.7058761715888977, "logps/chosen": -189.32237243652344, "logps/rejected": -182.69403076171875, "loss": 0.4339, "rewards/accuracies": 0.8359375, "rewards/chosen": 1.724177598953247, "rewards/margins": 1.020638108253479, "rewards/rejected": 0.7035394906997681, "step": 86 }, { "epoch": 1.0993558461838766, "grad_norm": 12.65465734374361, "learning_rate": 1.76e-07, "logits/chosen": -0.67812180519104, "logits/rejected": -0.7225789427757263, "logps/chosen": -185.65040588378906, "logps/rejected": -187.9286651611328, "loss": 0.437, "rewards/accuracies": 0.81640625, "rewards/chosen": 1.5955438613891602, "rewards/margins": 0.9583697319030762, "rewards/rejected": 0.6371738910675049, "step": 88 }, { "epoch": 1.1243412063244194, "grad_norm": 10.575082434339704, "learning_rate": 1.8e-07, "logits/chosen": -0.6781046986579895, "logits/rejected": -0.7251250743865967, "logps/chosen": -189.33551025390625, "logps/rejected": -188.9590606689453, "loss": 0.4156, "rewards/accuracies": 0.8359375, "rewards/chosen": 1.5732731819152832, "rewards/margins": 1.1116917133331299, "rewards/rejected": 0.4615815281867981, "step": 90 }, { "epoch": 1.149326566464962, "grad_norm": 9.55392077902883, "learning_rate": 1.84e-07, "logits/chosen": -0.6788798570632935, "logits/rejected": -0.7428586483001709, "logps/chosen": -198.3631591796875, "logps/rejected": -182.88487243652344, "loss": 0.3982, "rewards/accuracies": 0.82421875, "rewards/chosen": 1.3846931457519531, "rewards/margins": 1.0993235111236572, "rewards/rejected": 0.2853696346282959, "step": 92 }, { "epoch": 1.1743119266055047, "grad_norm": 9.670365765437687, "learning_rate": 1.88e-07, "logits/chosen": -0.7081943154335022, "logits/rejected": -0.775234580039978, "logps/chosen": -189.57760620117188, "logps/rejected": -181.6404571533203, "loss": 0.3911, "rewards/accuracies": 0.8828125, "rewards/chosen": 1.2745850086212158, "rewards/margins": 1.3070428371429443, "rewards/rejected": -0.032457947731018066, "step": 94 }, { "epoch": 1.1992972867460472, "grad_norm": 10.78932105106093, "learning_rate": 1.9199999999999997e-07, "logits/chosen": -0.6830898523330688, "logits/rejected": -0.734713613986969, "logps/chosen": -191.22511291503906, "logps/rejected": -186.43077087402344, "loss": 0.3897, "rewards/accuracies": 0.82421875, "rewards/chosen": 1.0674785375595093, "rewards/margins": 1.171497106552124, "rewards/rejected": -0.10401848703622818, "step": 96 }, { "epoch": 1.22428264688659, "grad_norm": 9.097857033695211, "learning_rate": 1.9599999999999998e-07, "logits/chosen": -0.7023120522499084, "logits/rejected": -0.7581274509429932, "logps/chosen": -195.01312255859375, "logps/rejected": -188.2948455810547, "loss": 0.3536, "rewards/accuracies": 0.85546875, "rewards/chosen": 1.0987714529037476, "rewards/margins": 1.5480579137802124, "rewards/rejected": -0.4492865800857544, "step": 98 }, { "epoch": 1.2492680070271325, "grad_norm": 12.363599073474044, "learning_rate": 2e-07, "logits/chosen": -0.6939373016357422, "logits/rejected": -0.7522105574607849, "logps/chosen": -193.21104431152344, "logps/rejected": -190.04568481445312, "loss": 0.3649, "rewards/accuracies": 0.85546875, "rewards/chosen": 0.7619870901107788, "rewards/margins": 1.3099664449691772, "rewards/rejected": -0.547979474067688, "step": 100 }, { "epoch": 1.2742533671676752, "grad_norm": 9.956621854528622, "learning_rate": 1.9945218953682733e-07, "logits/chosen": -0.7141095399856567, "logits/rejected": -0.772229015827179, "logps/chosen": -206.34132385253906, "logps/rejected": -199.00970458984375, "loss": 0.3505, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8092713952064514, "rewards/margins": 1.6670289039611816, "rewards/rejected": -0.8577573299407959, "step": 102 }, { "epoch": 1.2992387273082178, "grad_norm": 9.301525503546692, "learning_rate": 1.9781476007338056e-07, "logits/chosen": -0.7332565188407898, "logits/rejected": -0.7983365058898926, "logps/chosen": -201.5232696533203, "logps/rejected": -191.0933380126953, "loss": 0.3195, "rewards/accuracies": 0.89453125, "rewards/chosen": 0.5621832609176636, "rewards/margins": 1.6282891035079956, "rewards/rejected": -1.0661057233810425, "step": 104 }, { "epoch": 1.3242240874487605, "grad_norm": 9.869807635078832, "learning_rate": 1.9510565162951537e-07, "logits/chosen": -0.7436533570289612, "logits/rejected": -0.8179137706756592, "logps/chosen": -198.8864288330078, "logps/rejected": -192.3636016845703, "loss": 0.3368, "rewards/accuracies": 0.87109375, "rewards/chosen": 0.5397917628288269, "rewards/margins": 1.7668784856796265, "rewards/rejected": -1.2270865440368652, "step": 106 }, { "epoch": 1.349209447589303, "grad_norm": 10.035654401238393, "learning_rate": 1.9135454576426007e-07, "logits/chosen": -0.6918727159500122, "logits/rejected": -0.7655491828918457, "logps/chosen": -203.7888946533203, "logps/rejected": -215.9702606201172, "loss": 0.3397, "rewards/accuracies": 0.90234375, "rewards/chosen": 0.5103797912597656, "rewards/margins": 1.8079906702041626, "rewards/rejected": -1.297610878944397, "step": 108 }, { "epoch": 1.3741948077298458, "grad_norm": 9.358482838142306, "learning_rate": 1.8660254037844388e-07, "logits/chosen": -0.7152352929115295, "logits/rejected": -0.7777791023254395, "logps/chosen": -207.73023986816406, "logps/rejected": -194.01126098632812, "loss": 0.3365, "rewards/accuracies": 0.875, "rewards/chosen": 0.4271202087402344, "rewards/margins": 1.687686562538147, "rewards/rejected": -1.2605663537979126, "step": 110 }, { "epoch": 1.3991801678703886, "grad_norm": 9.789127179150574, "learning_rate": 1.8090169943749475e-07, "logits/chosen": -0.7317672371864319, "logits/rejected": -0.7882843017578125, "logps/chosen": -203.46849060058594, "logps/rejected": -208.32135009765625, "loss": 0.3157, "rewards/accuracies": 0.87890625, "rewards/chosen": 0.33698615431785583, "rewards/margins": 1.7098716497421265, "rewards/rejected": -1.3728857040405273, "step": 112 }, { "epoch": 1.424165528010931, "grad_norm": 9.275207179944992, "learning_rate": 1.7431448254773942e-07, "logits/chosen": -0.7219483852386475, "logits/rejected": -0.7694462537765503, "logps/chosen": -199.74270629882812, "logps/rejected": -204.81101989746094, "loss": 0.3034, "rewards/accuracies": 0.88671875, "rewards/chosen": 0.40492168068885803, "rewards/margins": 1.9214580059051514, "rewards/rejected": -1.5165363550186157, "step": 114 }, { "epoch": 1.4491508881514736, "grad_norm": 9.183521827422608, "learning_rate": 1.669130606358858e-07, "logits/chosen": -0.7337281107902527, "logits/rejected": -0.7940360307693481, "logps/chosen": -198.19046020507812, "logps/rejected": -200.4697265625, "loss": 0.3176, "rewards/accuracies": 0.88671875, "rewards/chosen": 0.3844246566295624, "rewards/margins": 1.8816416263580322, "rewards/rejected": -1.4972169399261475, "step": 116 }, { "epoch": 1.4741362482920164, "grad_norm": 9.337739314896169, "learning_rate": 1.5877852522924732e-07, "logits/chosen": -0.7224444150924683, "logits/rejected": -0.7723821401596069, "logps/chosen": -195.0646209716797, "logps/rejected": -201.89569091796875, "loss": 0.2752, "rewards/accuracies": 0.90625, "rewards/chosen": 0.3896394371986389, "rewards/margins": 1.9735894203186035, "rewards/rejected": -1.5839500427246094, "step": 118 }, { "epoch": 1.4991216084325591, "grad_norm": 8.793369516249312, "learning_rate": 1.5e-07, "logits/chosen": -0.7569531202316284, "logits/rejected": -0.8058477640151978, "logps/chosen": -208.27215576171875, "logps/rejected": -208.09347534179688, "loss": 0.317, "rewards/accuracies": 0.87890625, "rewards/chosen": 0.2652769982814789, "rewards/margins": 1.866006851196289, "rewards/rejected": -1.6007298231124878, "step": 120 }, { "epoch": 1.5241069685731017, "grad_norm": 9.48150415114474, "learning_rate": 1.4067366430758004e-07, "logits/chosen": -0.7591882348060608, "logits/rejected": -0.8140251636505127, "logps/chosen": -205.2285614013672, "logps/rejected": -203.8860321044922, "loss": 0.2965, "rewards/accuracies": 0.86328125, "rewards/chosen": 0.22455668449401855, "rewards/margins": 1.875580072402954, "rewards/rejected": -1.6510233879089355, "step": 122 }, { "epoch": 1.5490923287136442, "grad_norm": 9.98138144476122, "learning_rate": 1.3090169943749475e-07, "logits/chosen": -0.7185292840003967, "logits/rejected": -0.7869015336036682, "logps/chosen": -207.1554718017578, "logps/rejected": -239.03298950195312, "loss": 0.301, "rewards/accuracies": 0.8671875, "rewards/chosen": 0.21323075890541077, "rewards/margins": 1.779089093208313, "rewards/rejected": -1.565858244895935, "step": 124 }, { "epoch": 1.574077688854187, "grad_norm": 7.9906817459937995, "learning_rate": 1.207911690817759e-07, "logits/chosen": -0.7233790159225464, "logits/rejected": -0.7781089544296265, "logps/chosen": -194.47396850585938, "logps/rejected": -199.163330078125, "loss": 0.3084, "rewards/accuracies": 0.8828125, "rewards/chosen": 0.403320848941803, "rewards/margins": 1.8897595405578613, "rewards/rejected": -1.486438512802124, "step": 126 }, { "epoch": 1.5990630489947297, "grad_norm": 9.241594419692872, "learning_rate": 1.1045284632676535e-07, "logits/chosen": -0.7427763342857361, "logits/rejected": -0.811578094959259, "logps/chosen": -211.1174774169922, "logps/rejected": -199.56015014648438, "loss": 0.2962, "rewards/accuracies": 0.8828125, "rewards/chosen": 0.37835511565208435, "rewards/margins": 2.0853826999664307, "rewards/rejected": -1.7070273160934448, "step": 128 }, { "epoch": 1.6240484091352723, "grad_norm": 9.666544829037878, "learning_rate": 1e-07, "logits/chosen": -0.7333863973617554, "logits/rejected": -0.7908891439437866, "logps/chosen": -201.62767028808594, "logps/rejected": -204.39947509765625, "loss": 0.3047, "rewards/accuracies": 0.91015625, "rewards/chosen": 0.5189218521118164, "rewards/margins": 2.134669542312622, "rewards/rejected": -1.6157476902008057, "step": 130 }, { "epoch": 1.6490337692758148, "grad_norm": 8.419662280451101, "learning_rate": 8.954715367323466e-08, "logits/chosen": -0.7702259421348572, "logits/rejected": -0.8284745812416077, "logps/chosen": -199.48992919921875, "logps/rejected": -219.00027465820312, "loss": 0.2776, "rewards/accuracies": 0.86328125, "rewards/chosen": 0.36517998576164246, "rewards/margins": 2.0277538299560547, "rewards/rejected": -1.6625735759735107, "step": 132 }, { "epoch": 1.6740191294163576, "grad_norm": 9.261225693287605, "learning_rate": 7.920883091822408e-08, "logits/chosen": -0.7342085242271423, "logits/rejected": -0.7807326912879944, "logps/chosen": -198.77467346191406, "logps/rejected": -204.98635864257812, "loss": 0.3185, "rewards/accuracies": 0.88671875, "rewards/chosen": 0.39617919921875, "rewards/margins": 1.929458737373352, "rewards/rejected": -1.533279538154602, "step": 134 }, { "epoch": 1.6990044895569003, "grad_norm": 9.299697052167406, "learning_rate": 6.909830056250527e-08, "logits/chosen": -0.7558231949806213, "logits/rejected": -0.8186966776847839, "logps/chosen": -200.72955322265625, "logps/rejected": -197.19003295898438, "loss": 0.2807, "rewards/accuracies": 0.88671875, "rewards/chosen": 0.42143842577934265, "rewards/margins": 2.1845474243164062, "rewards/rejected": -1.7631090879440308, "step": 136 }, { "epoch": 1.723989849697443, "grad_norm": 8.587815614728, "learning_rate": 5.9326335692419996e-08, "logits/chosen": -0.755694568157196, "logits/rejected": -0.8112677335739136, "logps/chosen": -198.68673706054688, "logps/rejected": -197.39120483398438, "loss": 0.2802, "rewards/accuracies": 0.8671875, "rewards/chosen": 0.23039419949054718, "rewards/margins": 1.9206252098083496, "rewards/rejected": -1.6902309656143188, "step": 138 }, { "epoch": 1.7489752098379856, "grad_norm": 8.680976094750916, "learning_rate": 5.000000000000002e-08, "logits/chosen": -0.7553902864456177, "logits/rejected": -0.8158895373344421, "logps/chosen": -199.0127716064453, "logps/rejected": -202.2300262451172, "loss": 0.2768, "rewards/accuracies": 0.890625, "rewards/chosen": 0.5232993960380554, "rewards/margins": 2.2032899856567383, "rewards/rejected": -1.6799907684326172, "step": 140 }, { "epoch": 1.7739605699785281, "grad_norm": 9.234907906222318, "learning_rate": 4.1221474770752695e-08, "logits/chosen": -0.7363643646240234, "logits/rejected": -0.79323410987854, "logps/chosen": -203.90921020507812, "logps/rejected": -207.4276123046875, "loss": 0.2921, "rewards/accuracies": 0.85546875, "rewards/chosen": 0.3827190697193146, "rewards/margins": 2.0326881408691406, "rewards/rejected": -1.6499687433242798, "step": 142 }, { "epoch": 1.798945930119071, "grad_norm": 8.988965068155167, "learning_rate": 3.3086939364114206e-08, "logits/chosen": -0.7579203844070435, "logits/rejected": -0.8293938636779785, "logps/chosen": -201.67063903808594, "logps/rejected": -223.98065185546875, "loss": 0.2825, "rewards/accuracies": 0.90234375, "rewards/chosen": 0.3738960325717926, "rewards/margins": 2.088986873626709, "rewards/rejected": -1.7150908708572388, "step": 144 }, { "epoch": 1.8239312902596136, "grad_norm": 8.4124330379094, "learning_rate": 2.5685517452260564e-08, "logits/chosen": -0.7071250081062317, "logits/rejected": -0.7688826322555542, "logps/chosen": -203.57652282714844, "logps/rejected": -203.83291625976562, "loss": 0.282, "rewards/accuracies": 0.90234375, "rewards/chosen": 0.2634541988372803, "rewards/margins": 2.0646886825561523, "rewards/rejected": -1.801234245300293, "step": 146 }, { "epoch": 1.8489166504001562, "grad_norm": 8.280715517124031, "learning_rate": 1.9098300562505266e-08, "logits/chosen": -0.755478024482727, "logits/rejected": -0.8133871555328369, "logps/chosen": -202.27098083496094, "logps/rejected": -195.1833038330078, "loss": 0.2677, "rewards/accuracies": 0.90625, "rewards/chosen": 0.45085206627845764, "rewards/margins": 2.2143564224243164, "rewards/rejected": -1.7635046243667603, "step": 148 }, { "epoch": 1.8739020105406987, "grad_norm": 9.365073690139782, "learning_rate": 1.3397459621556128e-08, "logits/chosen": -0.7708315849304199, "logits/rejected": -0.8214279413223267, "logps/chosen": -198.73464965820312, "logps/rejected": -201.75244140625, "loss": 0.2866, "rewards/accuracies": 0.8828125, "rewards/chosen": 0.23038014769554138, "rewards/margins": 1.8956291675567627, "rewards/rejected": -1.6652488708496094, "step": 150 }, { "epoch": 1.8739020105406987, "eval_logits/chosen": -0.6863436102867126, "eval_logits/rejected": -0.7882587909698486, "eval_logps/chosen": -206.22607421875, "eval_logps/rejected": -185.4351806640625, "eval_loss": 0.28332585096359253, "eval_rewards/accuracies": 0.9200000166893005, "eval_rewards/chosen": 0.6681913137435913, "eval_rewards/margins": 2.303332567214966, "eval_rewards/rejected": -1.6351412534713745, "eval_runtime": 30.6661, "eval_samples_per_second": 3.261, "eval_steps_per_second": 0.815, "step": 150 }, { "epoch": 1.8988873706812415, "grad_norm": 8.503745608023737, "learning_rate": 8.645454235739902e-09, "logits/chosen": -0.7426515817642212, "logits/rejected": -0.8051266670227051, "logps/chosen": -195.47421264648438, "logps/rejected": -199.013916015625, "loss": 0.2643, "rewards/accuracies": 0.9140625, "rewards/chosen": 0.39815255999565125, "rewards/margins": 2.137446403503418, "rewards/rejected": -1.7392936944961548, "step": 152 }, { "epoch": 1.9238727308217842, "grad_norm": 8.367825814127663, "learning_rate": 4.8943483704846465e-09, "logits/chosen": -0.7322957515716553, "logits/rejected": -0.7974464893341064, "logps/chosen": -193.97613525390625, "logps/rejected": -191.2456817626953, "loss": 0.2622, "rewards/accuracies": 0.921875, "rewards/chosen": 0.45596182346343994, "rewards/margins": 2.185451030731201, "rewards/rejected": -1.7294889688491821, "step": 154 }, { "epoch": 1.9488580909623268, "grad_norm": 7.920183430972945, "learning_rate": 2.1852399266194312e-09, "logits/chosen": -0.7559969425201416, "logits/rejected": -0.8131712079048157, "logps/chosen": -203.8223876953125, "logps/rejected": -202.947509765625, "loss": 0.2773, "rewards/accuracies": 0.91015625, "rewards/chosen": 0.32228347659111023, "rewards/margins": 2.08817458152771, "rewards/rejected": -1.7658910751342773, "step": 156 }, { "epoch": 1.9738434511028693, "grad_norm": 8.06856390519066, "learning_rate": 5.47810463172671e-10, "logits/chosen": -0.7470804452896118, "logits/rejected": -0.8129448294639587, "logps/chosen": -210.6734619140625, "logps/rejected": -196.4785919189453, "loss": 0.2755, "rewards/accuracies": 0.890625, "rewards/chosen": 0.42607438564300537, "rewards/margins": 2.0202014446258545, "rewards/rejected": -1.5941270589828491, "step": 158 }, { "epoch": 1.998828811243412, "grad_norm": 9.773194489937959, "learning_rate": 0.0, "logits/chosen": -0.80845707654953, "logits/rejected": -0.8671077489852905, "logps/chosen": -203.4849853515625, "logps/rejected": -204.71002197265625, "loss": 0.2941, "rewards/accuracies": 0.91796875, "rewards/chosen": 0.35994353890419006, "rewards/margins": 2.0885844230651855, "rewards/rejected": -1.7286407947540283, "step": 160 } ], "logging_steps": 2, "max_steps": 160, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 150, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }