diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -9,13 +9,13 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0, - "grad_norm": 2.184279469343464, - "learning_rate": 4.3066322136089575e-10, - "logits/chosen": -2.9685676097869873, - "logits/rejected": -2.926340103149414, - "logps/chosen": -44.04426574707031, - "logps/rejected": -41.580841064453125, + "epoch": 0.00017229496898690558, + "grad_norm": 2.1823763847351074, + "learning_rate": 8.613264427217915e-11, + "logits/chosen": -2.967046022415161, + "logits/rejected": -2.9243061542510986, + "logps/chosen": -43.99115753173828, + "logps/rejected": -41.627906799316406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -24,19269 +24,19269 @@ "step": 1 }, { - "epoch": 0.0, - "grad_norm": 2.397164451396864, - "learning_rate": 4.306632213608958e-09, - "logits/chosen": -3.057889223098755, - "logits/rejected": -3.028320550918579, - "logps/chosen": -50.45764923095703, - "logps/rejected": -49.59663391113281, - "loss": 0.6931, - "rewards/accuracies": 0.3611111044883728, - "rewards/chosen": 4.533848914434202e-05, - "rewards/margins": 1.5664114471292123e-05, - "rewards/rejected": 2.9674369216081686e-05, + "epoch": 0.0017229496898690559, + "grad_norm": 2.387622594833374, + "learning_rate": 8.613264427217916e-10, + "logits/chosen": -3.055140972137451, + "logits/rejected": -3.0257670879364014, + "logps/chosen": -50.45387649536133, + "logps/rejected": -49.622737884521484, + "loss": 0.6929, + "rewards/accuracies": 0.5763888955116272, + "rewards/chosen": 0.00012421452265698463, + "rewards/margins": 0.0005009726155549288, + "rewards/rejected": -0.00037675804924219847, "step": 10 }, { - "epoch": 0.0, - "grad_norm": 2.2428396092279437, - "learning_rate": 8.613264427217916e-09, - "logits/chosen": -3.1213667392730713, - "logits/rejected": -3.113072633743286, - "logps/chosen": -52.6474494934082, - "logps/rejected": -52.98405075073242, - "loss": 0.6931, - "rewards/accuracies": 0.53125, - "rewards/chosen": -2.8674810891970992e-05, - "rewards/margins": 0.00011320582416374236, - "rewards/rejected": -0.00014188062050379813, + "epoch": 0.0034458993797381117, + "grad_norm": 2.2426376342773438, + "learning_rate": 1.7226528854435832e-09, + "logits/chosen": -3.119055986404419, + "logits/rejected": -3.1108334064483643, + "logps/chosen": -52.67353057861328, + "logps/rejected": -53.0074462890625, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0001044743912643753, + "rewards/margins": -8.299542969325557e-05, + "rewards/rejected": -2.1478936105268076e-05, "step": 20 }, { - "epoch": 0.01, - "grad_norm": 2.5739690837982745, - "learning_rate": 1.2919896640826872e-08, - "logits/chosen": -3.093750476837158, - "logits/rejected": -3.0699524879455566, - "logps/chosen": -56.7930793762207, - "logps/rejected": -58.43015670776367, - "loss": 0.6932, - "rewards/accuracies": 0.46875, - "rewards/chosen": -0.00020466512069106102, - "rewards/margins": -1.0724004823714495e-05, - "rewards/rejected": -0.00019394111586734653, + "epoch": 0.005168849069607168, + "grad_norm": 2.5754239559173584, + "learning_rate": 2.5839793281653743e-09, + "logits/chosen": -3.091740369796753, + "logits/rejected": -3.0679197311401367, + "logps/chosen": -56.782386779785156, + "logps/rejected": -58.43836212158203, + "loss": 0.6931, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": 0.00013735578977502882, + "rewards/margins": 0.00014890992315486073, + "rewards/rejected": -1.1554128832358401e-05, "step": 30 }, { - "epoch": 0.01, - "grad_norm": 2.0121230094500575, - "learning_rate": 1.722652885443583e-08, - "logits/chosen": -3.107394218444824, - "logits/rejected": -3.075824499130249, - "logps/chosen": -55.259185791015625, - "logps/rejected": -50.681114196777344, - "loss": 0.693, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": 2.6132946004508995e-05, - "rewards/margins": 0.0003762342967092991, - "rewards/rejected": -0.00035010138526558876, + "epoch": 0.006891798759476223, + "grad_norm": 2.0134646892547607, + "learning_rate": 3.4453057708871665e-09, + "logits/chosen": -3.1050755977630615, + "logits/rejected": -3.073472499847412, + "logps/chosen": -55.25732421875, + "logps/rejected": -50.6669921875, + "loss": 0.6932, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": 0.00014844746328890324, + "rewards/margins": -3.4907094232039526e-05, + "rewards/rejected": 0.0001833545247791335, "step": 40 }, { - "epoch": 0.01, - "grad_norm": 2.387552940684203, - "learning_rate": 2.153316106804479e-08, - "logits/chosen": -3.1034653186798096, - "logits/rejected": -3.0867769718170166, - "logps/chosen": -53.10588455200195, - "logps/rejected": -51.49999237060547, - "loss": 0.6932, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -0.00013910401321481913, - "rewards/margins": -8.180685108527541e-05, - "rewards/rejected": -5.7297169405501336e-05, + "epoch": 0.00861474844934528, + "grad_norm": 2.385958194732666, + "learning_rate": 4.306632213608958e-09, + "logits/chosen": -3.100965738296509, + "logits/rejected": -3.0844969749450684, + "logps/chosen": -53.12641143798828, + "logps/rejected": -51.509857177734375, + "loss": 0.6931, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -9.805540321394801e-05, + "rewards/margins": 2.18080822378397e-06, + "rewards/rejected": -0.00010023623326560482, "step": 50 }, { - "epoch": 0.01, - "grad_norm": 2.7970003076901, - "learning_rate": 2.5839793281653743e-08, - "logits/chosen": -3.156252384185791, - "logits/rejected": -3.1266000270843506, - "logps/chosen": -57.58796310424805, - "logps/rejected": -54.14855194091797, - "loss": 0.6931, - "rewards/accuracies": 0.53125, - "rewards/chosen": -4.136812640354037e-05, - "rewards/margins": 5.838483411935158e-05, - "rewards/rejected": -9.975295688491315e-05, + "epoch": 0.010337698139214336, + "grad_norm": 2.795139789581299, + "learning_rate": 5.167958656330749e-09, + "logits/chosen": -3.1540331840515137, + "logits/rejected": -3.1243770122528076, + "logps/chosen": -57.58463668823242, + "logps/rejected": -54.14760208129883, + "loss": 0.6933, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 1.8001441276283003e-05, + "rewards/margins": -0.00032479705987498164, + "rewards/rejected": 0.00034279850660823286, "step": 60 }, { - "epoch": 0.01, - "grad_norm": 2.204322757523193, - "learning_rate": 3.01464254952627e-08, - "logits/chosen": -3.0535032749176025, - "logits/rejected": -3.033651828765869, - "logps/chosen": -53.7407112121582, - "logps/rejected": -53.21503448486328, - "loss": 0.693, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.0001532899623271078, - "rewards/margins": 0.0002148848434444517, - "rewards/rejected": -6.159489566925913e-05, + "epoch": 0.012060647829083391, + "grad_norm": 2.2026424407958984, + "learning_rate": 6.02928509905254e-09, + "logits/chosen": -3.0509531497955322, + "logits/rejected": -3.0309481620788574, + "logps/chosen": -53.7503547668457, + "logps/rejected": -53.22412872314453, + "loss": 0.6931, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.00018892706430051476, + "rewards/margins": 0.00018463641754351556, + "rewards/rejected": 4.290644028515089e-06, "step": 70 }, { - "epoch": 0.01, - "grad_norm": 2.4363897145491564, - "learning_rate": 3.445305770887166e-08, - "logits/chosen": -3.1622116565704346, - "logits/rejected": -3.1288113594055176, - "logps/chosen": -59.07722091674805, - "logps/rejected": -54.100318908691406, - "loss": 0.693, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 6.512943946290761e-05, - "rewards/margins": 0.0002318086044397205, - "rewards/rejected": -0.00016667917952872813, + "epoch": 0.013783597518952447, + "grad_norm": 2.4384751319885254, + "learning_rate": 6.890611541774333e-09, + "logits/chosen": -3.15974497795105, + "logits/rejected": -3.1263458728790283, + "logps/chosen": -59.09418869018555, + "logps/rejected": -54.10878372192383, + "loss": 0.6931, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": 2.246947587991599e-05, + "rewards/margins": 0.00015436351532116532, + "rewards/rejected": -0.00013189406308811158, "step": 80 }, { - "epoch": 0.02, - "grad_norm": 2.4857228513755465, - "learning_rate": 3.8759689922480615e-08, - "logits/chosen": -2.996279239654541, - "logits/rejected": -2.9815406799316406, - "logps/chosen": -53.46660232543945, - "logps/rejected": -52.83372116088867, - "loss": 0.693, + "epoch": 0.015506547208821502, + "grad_norm": 2.472266674041748, + "learning_rate": 7.751937984496123e-09, + "logits/chosen": -2.9935109615325928, + "logits/rejected": -2.9786577224731445, + "logps/chosen": -53.474945068359375, + "logps/rejected": -52.831932067871094, + "loss": 0.6932, "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.0002413403708487749, - "rewards/margins": 0.0003210466238670051, - "rewards/rejected": -7.97062530182302e-05, + "rewards/chosen": -3.8896745536476374e-05, + "rewards/margins": -0.00014048551383893937, + "rewards/rejected": 0.00010158878285437822, "step": 90 }, { - "epoch": 0.02, - "grad_norm": 2.4854058944857753, - "learning_rate": 4.306632213608958e-08, - "logits/chosen": -3.1720452308654785, - "logits/rejected": -3.109947681427002, - "logps/chosen": -55.90839385986328, - "logps/rejected": -49.635841369628906, - "loss": 0.6929, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.0004091753507964313, - "rewards/margins": 0.0005016516079194844, - "rewards/rejected": -9.247624257113785e-05, + "epoch": 0.01722949689869056, + "grad_norm": 2.484550714492798, + "learning_rate": 8.613264427217916e-09, + "logits/chosen": -3.1697795391082764, + "logits/rejected": -3.1076667308807373, + "logps/chosen": -55.963539123535156, + "logps/rejected": -49.63465118408203, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00018626314704306424, + "rewards/margins": -0.00013597900397144258, + "rewards/rejected": -5.028415398555808e-05, "step": 100 }, { - "epoch": 0.02, - "eval_logits/chosen": -3.165482521057129, - "eval_logits/rejected": -3.1598188877105713, - "eval_logps/chosen": -58.70554733276367, - "eval_logps/rejected": -63.15681457519531, - "eval_loss": 0.6931592226028442, - "eval_rewards/accuracies": 0.4986059367656708, - "eval_rewards/chosen": -1.7028520232997835e-05, - "eval_rewards/margins": -2.261956069560256e-05, - "eval_rewards/rejected": 5.5910377341206186e-06, - "eval_runtime": 356.9348, - "eval_samples_per_second": 12.058, - "eval_steps_per_second": 1.507, + "epoch": 0.01722949689869056, + "eval_logits/chosen": -3.162914991378784, + "eval_logits/rejected": -3.157243490219116, + "eval_logps/chosen": -58.70985794067383, + "eval_logps/rejected": -63.17200469970703, + "eval_loss": 0.6931781768798828, + "eval_rewards/accuracies": 0.48187732696533203, + "eval_rewards/chosen": 2.0352143110358156e-05, + "eval_rewards/margins": -6.087439396651462e-05, + "eval_rewards/rejected": 8.122652798192576e-05, + "eval_runtime": 358.6096, + "eval_samples_per_second": 12.002, + "eval_steps_per_second": 1.5, "step": 100 }, { - "epoch": 0.02, - "grad_norm": 2.5379181619698423, - "learning_rate": 4.7372954349698534e-08, - "logits/chosen": -3.12424898147583, - "logits/rejected": -3.1003119945526123, - "logps/chosen": -55.57979202270508, - "logps/rejected": -52.30139923095703, + "epoch": 0.018952446588559616, + "grad_norm": 2.542006015777588, + "learning_rate": 9.474590869939706e-09, + "logits/chosen": -3.121802806854248, + "logits/rejected": -3.097970485687256, + "logps/chosen": -55.59550094604492, + "logps/rejected": -52.33124542236328, "loss": 0.6932, - "rewards/accuracies": 0.45625001192092896, - "rewards/chosen": -0.00025836736313067377, - "rewards/margins": -0.0001904887321870774, - "rewards/rejected": -6.787859456380829e-05, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.00018450118659529835, + "rewards/margins": -0.00018190534319728613, + "rewards/rejected": -2.59584044215444e-06, "step": 110 }, { - "epoch": 0.02, - "grad_norm": 2.561368552467561, - "learning_rate": 5.1679586563307486e-08, - "logits/chosen": -3.0679683685302734, - "logits/rejected": -3.0525035858154297, - "logps/chosen": -53.182281494140625, - "logps/rejected": -55.54204177856445, + "epoch": 0.02067539627842867, + "grad_norm": 2.568783760070801, + "learning_rate": 1.0335917312661497e-08, + "logits/chosen": -3.065636157989502, + "logits/rejected": -3.050166606903076, + "logps/chosen": -53.187355041503906, + "logps/rejected": -55.559715270996094, "loss": 0.6932, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -0.00013645360013470054, - "rewards/margins": -7.856530282879248e-05, - "rewards/rejected": -5.78882682020776e-05, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -5.230843089520931e-05, + "rewards/margins": -0.00017658497381489724, + "rewards/rejected": 0.00012427649926394224, "step": 120 }, { - "epoch": 0.02, - "grad_norm": 2.138984879227857, - "learning_rate": 5.598621877691645e-08, - "logits/chosen": -3.10345458984375, - "logits/rejected": -3.089416980743408, - "logps/chosen": -55.18548583984375, - "logps/rejected": -53.74910354614258, - "loss": 0.6931, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -3.565276711015031e-05, - "rewards/margins": 0.00010266680328641087, - "rewards/rejected": -0.0001383195340167731, + "epoch": 0.022398345968297727, + "grad_norm": 2.1416878700256348, + "learning_rate": 1.119724375538329e-08, + "logits/chosen": -3.1008267402648926, + "logits/rejected": -3.086881637573242, + "logps/chosen": -55.180259704589844, + "logps/rejected": -53.775177001953125, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0001337065768893808, + "rewards/margins": 0.00028117914916947484, + "rewards/rejected": -0.00014747263048775494, "step": 130 }, { - "epoch": 0.02, - "grad_norm": 2.4347210527199588, - "learning_rate": 6.02928509905254e-08, - "logits/chosen": -3.1250388622283936, - "logits/rejected": -3.106936454772949, - "logps/chosen": -54.17211151123047, - "logps/rejected": -53.7529296875, + "epoch": 0.024121295658166782, + "grad_norm": 2.428621768951416, + "learning_rate": 1.205857019810508e-08, + "logits/chosen": -3.122899293899536, + "logits/rejected": -3.1046016216278076, + "logps/chosen": -54.185447692871094, + "logps/rejected": -53.77512741088867, "loss": 0.6931, - "rewards/accuracies": 0.48124998807907104, - "rewards/chosen": 1.1661546523100697e-05, - "rewards/margins": -4.082123723492259e-06, - "rewards/rejected": 1.5743673429824412e-05, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 5.432188572740415e-06, + "rewards/margins": 0.00018397874373476952, + "rewards/rejected": -0.00017854655743576586, "step": 140 }, { - "epoch": 0.03, - "grad_norm": 2.213735398044619, - "learning_rate": 6.459948320413436e-08, - "logits/chosen": -3.0300798416137695, - "logits/rejected": -3.0123374462127686, - "logps/chosen": -52.598976135253906, - "logps/rejected": -52.38323211669922, - "loss": 0.6932, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -6.447284249588847e-05, - "rewards/margins": -5.51502344023902e-05, - "rewards/rejected": -9.322635378339328e-06, + "epoch": 0.025844245348035838, + "grad_norm": 2.2146685123443604, + "learning_rate": 1.2919896640826872e-08, + "logits/chosen": -3.0275516510009766, + "logits/rejected": -3.0097765922546387, + "logps/chosen": -52.62202072143555, + "logps/rejected": -52.41334915161133, + "loss": 0.6931, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 4.773867112817243e-05, + "rewards/margins": 0.00015990195970516652, + "rewards/rejected": -0.0001121632958529517, "step": 150 }, { - "epoch": 0.03, - "grad_norm": 2.156655887327036, - "learning_rate": 6.890611541774332e-08, - "logits/chosen": -3.0911943912506104, - "logits/rejected": -3.070504665374756, - "logps/chosen": -53.4869499206543, - "logps/rejected": -54.68552780151367, + "epoch": 0.027567195037904894, + "grad_norm": 2.1602721214294434, + "learning_rate": 1.3781223083548666e-08, + "logits/chosen": -3.088822364807129, + "logits/rejected": -3.0679469108581543, + "logps/chosen": -53.511573791503906, + "logps/rejected": -54.70383834838867, "loss": 0.6933, - "rewards/accuracies": 0.4437499940395355, - "rewards/chosen": -5.7128123444272205e-05, - "rewards/margins": -0.00023163272999227047, - "rewards/rejected": 0.00017450464656576514, + "rewards/accuracies": 0.4312500059604645, + "rewards/chosen": -0.0002432465844321996, + "rewards/margins": -0.0002327763504581526, + "rewards/rejected": -1.0470268534845673e-05, "step": 160 }, { - "epoch": 0.03, - "grad_norm": 2.3532949837745685, - "learning_rate": 7.321274763135228e-08, - "logits/chosen": -3.0787293910980225, - "logits/rejected": -3.0594067573547363, - "logps/chosen": -56.2595100402832, - "logps/rejected": -51.335472106933594, - "loss": 0.6931, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": 0.00012248748680576682, - "rewards/margins": 0.00017826970724854618, - "rewards/rejected": -5.5782216804800555e-05, + "epoch": 0.02929014472777395, + "grad_norm": 2.3529062271118164, + "learning_rate": 1.4642549526270457e-08, + "logits/chosen": -3.0763819217681885, + "logits/rejected": -3.0568976402282715, + "logps/chosen": -56.28557586669922, + "logps/rejected": -51.32001495361328, + "loss": 0.6932, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -9.712641258374788e-06, + "rewards/margins": -2.3522810806753114e-05, + "rewards/rejected": 1.3810163181915414e-05, "step": 170 }, { - "epoch": 0.03, - "grad_norm": 2.6258607223080777, - "learning_rate": 7.751937984496123e-08, - "logits/chosen": -3.0651602745056152, - "logits/rejected": -3.0461204051971436, - "logps/chosen": -56.38677215576172, - "logps/rejected": -53.772865295410156, - "loss": 0.6931, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": 8.890104800229892e-05, - "rewards/margins": 6.191270949784666e-05, - "rewards/rejected": 2.6988331228494644e-05, + "epoch": 0.031013094417643005, + "grad_norm": 2.607346296310425, + "learning_rate": 1.5503875968992246e-08, + "logits/chosen": -3.0625109672546387, + "logits/rejected": -3.0438477993011475, + "logps/chosen": -56.41377639770508, + "logps/rejected": -53.783180236816406, + "loss": 0.6932, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -8.122886356431991e-05, + "rewards/margins": -5.78999643039424e-05, + "rewards/rejected": -2.3328897441388108e-05, "step": 180 }, { - "epoch": 0.03, - "grad_norm": 2.636977530600279, - "learning_rate": 8.18260120585702e-08, - "logits/chosen": -3.126418352127075, - "logits/rejected": -3.0830445289611816, - "logps/chosen": -58.16786575317383, - "logps/rejected": -52.552574157714844, - "loss": 0.6929, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": 8.153868111548945e-05, - "rewards/margins": 0.00046413601376116276, - "rewards/rejected": -0.0003825973253697157, + "epoch": 0.03273604410751206, + "grad_norm": 2.635439157485962, + "learning_rate": 1.636520241171404e-08, + "logits/chosen": -3.1240930557250977, + "logits/rejected": -3.080695390701294, + "logps/chosen": -58.194053649902344, + "logps/rejected": -52.56025314331055, + "loss": 0.6931, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.00010519265197217464, + "rewards/margins": 0.00011926700972253457, + "rewards/rejected": -0.00022445968352258205, "step": 190 }, { - "epoch": 0.03, - "grad_norm": 2.580185714051456, - "learning_rate": 8.613264427217916e-08, - "logits/chosen": -3.0618324279785156, - "logits/rejected": -3.04618239402771, - "logps/chosen": -54.1072998046875, - "logps/rejected": -54.72209548950195, - "loss": 0.693, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -2.7029391276300885e-05, - "rewards/margins": 0.0002754017186816782, - "rewards/rejected": -0.00030243111541494727, + "epoch": 0.03445899379738112, + "grad_norm": 2.576287031173706, + "learning_rate": 1.722652885443583e-08, + "logits/chosen": -3.0595781803131104, + "logits/rejected": -3.04390287399292, + "logps/chosen": -54.107139587402344, + "logps/rejected": -54.70692825317383, + "loss": 0.6931, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.00015565704961773008, + "rewards/margins": 2.6284324121661484e-06, + "rewards/rejected": 0.00015302860992960632, "step": 200 }, { - "epoch": 0.03, - "eval_logits/chosen": -3.165257692337036, - "eval_logits/rejected": -3.159618616104126, - "eval_logps/chosen": -58.68031311035156, - "eval_logps/rejected": -63.137481689453125, - "eval_loss": 0.69312983751297, - "eval_rewards/accuracies": 0.5127788186073303, - "eval_rewards/chosen": 0.0002353396121179685, - "eval_rewards/margins": 3.6308691051090136e-05, - "eval_rewards/rejected": 0.00019903088104911149, - "eval_runtime": 355.2015, - "eval_samples_per_second": 12.117, - "eval_steps_per_second": 1.515, + "epoch": 0.03445899379738112, + "eval_logits/chosen": -3.1632211208343506, + "eval_logits/rejected": -3.1575613021850586, + "eval_logps/chosen": -58.711769104003906, + "eval_logps/rejected": -63.17155456542969, + "eval_loss": 0.6931898593902588, + "eval_rewards/accuracies": 0.4893122613430023, + "eval_rewards/chosen": 1.2645278957279515e-06, + "eval_rewards/margins": -8.44153473735787e-05, + "eval_rewards/rejected": 8.56798724271357e-05, + "eval_runtime": 358.4945, + "eval_samples_per_second": 12.006, + "eval_steps_per_second": 1.501, "step": 200 }, { - "epoch": 0.04, - "grad_norm": 2.28669707979838, - "learning_rate": 9.043927648578811e-08, - "logits/chosen": -3.0168232917785645, - "logits/rejected": -3.008084535598755, - "logps/chosen": -53.26890182495117, - "logps/rejected": -57.292236328125, + "epoch": 0.03618194348725017, + "grad_norm": 2.2875328063964844, + "learning_rate": 1.8087855297157624e-08, + "logits/chosen": -3.0142710208892822, + "logits/rejected": -3.0056774616241455, + "logps/chosen": -53.26692581176758, + "logps/rejected": -57.282997131347656, "loss": 0.6932, - "rewards/accuracies": 0.4312500059604645, - "rewards/chosen": -0.00018195889424532652, - "rewards/margins": -8.943781722337008e-05, - "rewards/rejected": -9.252109157387167e-05, + "rewards/accuracies": 0.4437499940395355, + "rewards/chosen": -2.878843224607408e-05, + "rewards/margins": -0.00010379708692198619, + "rewards/rejected": 7.500863284803927e-05, "step": 210 }, { - "epoch": 0.04, - "grad_norm": 2.336438367711948, - "learning_rate": 9.474590869939707e-08, - "logits/chosen": -3.053389072418213, - "logits/rejected": -3.022315502166748, - "logps/chosen": -52.211769104003906, - "logps/rejected": -51.38096237182617, - "loss": 0.6928, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.0002459119423292577, - "rewards/margins": 0.0007187900482676923, - "rewards/rejected": -0.00096470199059695, + "epoch": 0.03790489317711923, + "grad_norm": 2.327131748199463, + "learning_rate": 1.8949181739879413e-08, + "logits/chosen": -3.0508837699890137, + "logits/rejected": -3.019562244415283, + "logps/chosen": -52.19524002075195, + "logps/rejected": -51.32683181762695, + "loss": 0.6931, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": 4.231429556966759e-05, + "rewards/margins": 0.00014887299039401114, + "rewards/rejected": -0.00010655868391040713, "step": 220 }, { - "epoch": 0.04, - "grad_norm": 2.394447162636319, - "learning_rate": 9.905254091300602e-08, - "logits/chosen": -3.0536513328552246, - "logits/rejected": -3.0352864265441895, - "logps/chosen": -48.92305374145508, - "logps/rejected": -50.00139617919922, - "loss": 0.693, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.000386159576009959, - "rewards/margins": 0.00027917162515223026, - "rewards/rejected": -0.0006653312011621892, + "epoch": 0.03962784286698828, + "grad_norm": 2.3964474201202393, + "learning_rate": 1.9810508182601205e-08, + "logits/chosen": -3.051140546798706, + "logits/rejected": -3.0328004360198975, + "logps/chosen": -48.911827087402344, + "logps/rejected": -49.93369674682617, + "loss": 0.6932, + "rewards/accuracies": 0.40625, + "rewards/chosen": -2.207815668953117e-05, + "rewards/margins": -0.00013280121493153274, + "rewards/rejected": 0.00011072307825088501, "step": 230 }, { - "epoch": 0.04, - "grad_norm": 2.25083824617627, - "learning_rate": 1.0335917312661497e-07, - "logits/chosen": -3.026599168777466, - "logits/rejected": -2.9841794967651367, - "logps/chosen": -55.9691276550293, - "logps/rejected": -52.21491622924805, - "loss": 0.6931, - "rewards/accuracies": 0.53125, - "rewards/chosen": -0.0005459034582599998, - "rewards/margins": 9.160184708889574e-05, - "rewards/rejected": -0.0006375053199008107, + "epoch": 0.04135079255685734, + "grad_norm": 2.246399164199829, + "learning_rate": 2.0671834625322995e-08, + "logits/chosen": -3.0245563983917236, + "logits/rejected": -2.982118844985962, + "logps/chosen": -55.94266891479492, + "logps/rejected": -52.16364669799805, + "loss": 0.693, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": 0.00011497551167849451, + "rewards/margins": 0.00020507563021965325, + "rewards/rejected": -9.010009671328589e-05, "step": 240 }, { - "epoch": 0.04, - "grad_norm": 2.323445591258243, - "learning_rate": 1.0766580534022394e-07, - "logits/chosen": -3.1199052333831787, - "logits/rejected": -3.0994296073913574, - "logps/chosen": -52.30159378051758, - "logps/rejected": -51.17644119262695, - "loss": 0.6928, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -0.00036477210232988, - "rewards/margins": 0.000624177569989115, - "rewards/rejected": -0.000988949672318995, + "epoch": 0.043073742246726394, + "grad_norm": 2.313981294631958, + "learning_rate": 2.153316106804479e-08, + "logits/chosen": -3.1180367469787598, + "logits/rejected": -3.0976953506469727, + "logps/chosen": -52.28910446166992, + "logps/rejected": -51.09497833251953, + "loss": 0.6932, + "rewards/accuracies": 0.46875, + "rewards/chosen": -9.025823965203017e-05, + "rewards/margins": -0.00011660426389425993, + "rewards/rejected": 2.6346038794144988e-05, "step": 250 }, { - "epoch": 0.04, - "grad_norm": 2.316340782882154, - "learning_rate": 1.119724375538329e-07, - "logits/chosen": -3.0962424278259277, - "logits/rejected": -3.0838680267333984, - "logps/chosen": -54.875404357910156, - "logps/rejected": -56.73250198364258, - "loss": 0.6927, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.00046339546679519117, - "rewards/margins": 0.000800677458755672, - "rewards/rejected": -0.0012640730710700154, + "epoch": 0.044796691936595454, + "grad_norm": 2.310410976409912, + "learning_rate": 2.239448751076658e-08, + "logits/chosen": -3.0942649841308594, + "logits/rejected": -3.0820066928863525, + "logps/chosen": -54.8565673828125, + "logps/rejected": -56.64534378051758, + "loss": 0.6929, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0001261205761693418, + "rewards/margins": 0.0004069819115102291, + "rewards/rejected": -0.0002808613935485482, "step": 260 }, { - "epoch": 0.05, - "grad_norm": 2.212868394326073, - "learning_rate": 1.1627906976744186e-07, - "logits/chosen": -3.034665584564209, - "logits/rejected": -3.0166120529174805, - "logps/chosen": -53.17912673950195, - "logps/rejected": -54.439247131347656, - "loss": 0.6927, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.00068662193370983, - "rewards/margins": 0.000886773515958339, - "rewards/rejected": -0.0015733955660834908, + "epoch": 0.046519641626464506, + "grad_norm": 2.2104978561401367, + "learning_rate": 2.3255813953488372e-08, + "logits/chosen": -3.0330111980438232, + "logits/rejected": -3.014916181564331, + "logps/chosen": -53.13231658935547, + "logps/rejected": -54.31147384643555, + "loss": 0.6932, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.0001277371047763154, + "rewards/margins": -6.514495180454105e-05, + "rewards/rejected": -6.259213841985911e-05, "step": 270 }, { - "epoch": 0.05, - "grad_norm": 2.4332688553771162, - "learning_rate": 1.205857019810508e-07, - "logits/chosen": -3.125800609588623, - "logits/rejected": -3.0919315814971924, - "logps/chosen": -57.64659881591797, - "logps/rejected": -53.57320022583008, - "loss": 0.6925, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.00048610559315420687, - "rewards/margins": 0.0012957851868122816, - "rewards/rejected": -0.0017818908672779799, + "epoch": 0.048242591316333565, + "grad_norm": 2.4354984760284424, + "learning_rate": 2.411714039621016e-08, + "logits/chosen": -3.1244430541992188, + "logits/rejected": -3.0903525352478027, + "logps/chosen": -57.60699462890625, + "logps/rejected": -53.4240837097168, + "loss": 0.6931, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 6.179927731864154e-05, + "rewards/margins": 0.00015818976680748165, + "rewards/rejected": -9.639047493692487e-05, "step": 280 }, { - "epoch": 0.05, - "grad_norm": 2.249917169819848, - "learning_rate": 1.2489233419465976e-07, - "logits/chosen": -3.048657178878784, - "logits/rejected": -3.034323215484619, - "logps/chosen": -55.451141357421875, - "logps/rejected": -54.447296142578125, - "loss": 0.6926, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.0007702001603320241, - "rewards/margins": 0.0010641098488122225, - "rewards/rejected": -0.0018343102419748902, + "epoch": 0.04996554100620262, + "grad_norm": 2.2454993724823, + "learning_rate": 2.4978466838931954e-08, + "logits/chosen": -3.0470690727233887, + "logits/rejected": -3.032789707183838, + "logps/chosen": -55.36420440673828, + "logps/rejected": -54.27949142456055, + "loss": 0.6931, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.00012243811215739697, + "rewards/margins": 9.9905242677778e-05, + "rewards/rejected": 2.2532873117597774e-05, "step": 290 }, { - "epoch": 0.05, - "grad_norm": 2.369298746667673, - "learning_rate": 1.2919896640826872e-07, - "logits/chosen": -3.004129409790039, - "logits/rejected": -2.9955711364746094, - "logps/chosen": -52.908668518066406, - "logps/rejected": -54.089874267578125, - "loss": 0.6926, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.0007483543595299125, - "rewards/margins": 0.0010376429418101907, - "rewards/rejected": -0.0017859973013401031, + "epoch": 0.051688490696071676, + "grad_norm": 2.363119125366211, + "learning_rate": 2.5839793281653743e-08, + "logits/chosen": -3.002816677093506, + "logits/rejected": -2.9941678047180176, + "logps/chosen": -52.84331130981445, + "logps/rejected": -53.92693328857422, + "loss": 0.6932, + "rewards/accuracies": 0.4437499940395355, + "rewards/chosen": -9.468554708291776e-06, + "rewards/margins": -0.00018169668328482658, + "rewards/rejected": 0.00017222815949935466, "step": 300 }, { - "epoch": 0.05, - "eval_logits/chosen": -3.163788318634033, - "eval_logits/rejected": -3.158111572265625, - "eval_logps/chosen": -58.64423751831055, - "eval_logps/rejected": -63.127235412597656, - "eval_loss": 0.6930013298988342, - "eval_rewards/accuracies": 0.5394981503486633, - "eval_rewards/chosen": 0.0005960779963061213, - "eval_rewards/margins": 0.00029463876853697, - "eval_rewards/rejected": 0.0003014392568729818, - "eval_runtime": 356.1408, - "eval_samples_per_second": 12.085, - "eval_steps_per_second": 1.511, + "epoch": 0.051688490696071676, + "eval_logits/chosen": -3.1631081104278564, + "eval_logits/rejected": -3.1574697494506836, + "eval_logps/chosen": -58.7095947265625, + "eval_logps/rejected": -63.167720794677734, + "eval_loss": 0.6931981444358826, + "eval_rewards/accuracies": 0.46956318616867065, + "eval_rewards/chosen": 2.3022554159979336e-05, + "eval_rewards/margins": -0.0001009509724099189, + "eval_rewards/rejected": 0.00012397351383697242, + "eval_runtime": 358.8693, + "eval_samples_per_second": 11.993, + "eval_steps_per_second": 1.499, "step": 300 }, { - "epoch": 0.05, - "grad_norm": 2.4743047933409654, - "learning_rate": 1.335055986218777e-07, - "logits/chosen": -3.0664687156677246, - "logits/rejected": -3.060901641845703, - "logps/chosen": -53.61384201049805, - "logps/rejected": -53.52678298950195, - "loss": 0.6926, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.0011877021752297878, - "rewards/margins": 0.0011907459702342749, - "rewards/rejected": -0.0023784481454640627, + "epoch": 0.05341144038594073, + "grad_norm": 2.470623731613159, + "learning_rate": 2.6701119724375536e-08, + "logits/chosen": -3.0653061866760254, + "logits/rejected": -3.0597829818725586, + "logps/chosen": -53.518653869628906, + "logps/rejected": -53.315582275390625, + "loss": 0.693, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 7.493379143852508e-06, + "rewards/margins": 0.0002100001584040001, + "rewards/rejected": -0.00020250678062438965, "step": 310 }, { - "epoch": 0.06, - "grad_norm": 2.4049747365414973, - "learning_rate": 1.3781223083548665e-07, - "logits/chosen": -3.023968458175659, - "logits/rejected": -2.9977526664733887, - "logps/chosen": -54.628395080566406, - "logps/rejected": -49.533180236816406, - "loss": 0.6923, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.0013292916119098663, - "rewards/margins": 0.0016255916561931372, - "rewards/rejected": -0.0029548832681030035, + "epoch": 0.05513439007580979, + "grad_norm": 2.3561344146728516, + "learning_rate": 2.756244616709733e-08, + "logits/chosen": -3.0231707096099854, + "logits/rejected": -2.996835947036743, + "logps/chosen": -54.5215950012207, + "logps/rejected": -49.26898956298828, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0001359484886052087, + "rewards/margins": 0.00015204887313302606, + "rewards/rejected": -0.00028799736173823476, "step": 320 }, { - "epoch": 0.06, - "grad_norm": 2.3870544128030664, - "learning_rate": 1.421188630490956e-07, - "logits/chosen": -3.0835556983947754, - "logits/rejected": -3.0599796772003174, - "logps/chosen": -55.1311149597168, - "logps/rejected": -52.4721794128418, - "loss": 0.6923, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -0.0011483869748190045, - "rewards/margins": 0.0016829262021929026, - "rewards/rejected": -0.0028313130605965853, + "epoch": 0.05685733976567884, + "grad_norm": 2.341036081314087, + "learning_rate": 2.8423772609819118e-08, + "logits/chosen": -3.0833919048309326, + "logits/rejected": -3.059788465499878, + "logps/chosen": -55.05128860473633, + "logps/rejected": -52.21721267700195, + "loss": 0.6932, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0001815518771763891, + "rewards/margins": -0.00011233085388084874, + "rewards/rejected": -6.922103784745559e-05, "step": 330 }, { - "epoch": 0.06, - "grad_norm": 2.177069307915396, - "learning_rate": 1.4642549526270455e-07, - "logits/chosen": -3.0061721801757812, - "logits/rejected": -2.9844064712524414, - "logps/chosen": -52.63057327270508, - "logps/rejected": -52.16088104248047, - "loss": 0.6926, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -0.001195924007333815, - "rewards/margins": 0.0012026333715766668, - "rewards/rejected": -0.0023985574953258038, + "epoch": 0.0585802894555479, + "grad_norm": 2.1545321941375732, + "learning_rate": 2.9285099052540913e-08, + "logits/chosen": -3.0052454471588135, + "logits/rejected": -2.983654022216797, + "logps/chosen": -52.548309326171875, + "logps/rejected": -51.952247619628906, + "loss": 0.6932, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.00021751364693045616, + "rewards/margins": -5.0237147661391646e-05, + "rewards/rejected": -0.0001672764919931069, "step": 340 }, { - "epoch": 0.06, - "grad_norm": 2.3234378454912106, - "learning_rate": 1.507321274763135e-07, - "logits/chosen": -2.978062391281128, - "logits/rejected": -2.9385359287261963, - "logps/chosen": -56.37324142456055, - "logps/rejected": -53.88068389892578, - "loss": 0.6923, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.0015299760270863771, - "rewards/margins": 0.0017499777022749186, - "rewards/rejected": -0.003279953496530652, + "epoch": 0.06030323914541695, + "grad_norm": 2.320605993270874, + "learning_rate": 3.01464254952627e-08, + "logits/chosen": -2.977752923965454, + "logits/rejected": -2.9382853507995605, + "logps/chosen": -56.247840881347656, + "logps/rejected": -53.581092834472656, + "loss": 0.6932, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.0003836905234493315, + "rewards/margins": -0.00013840605970472097, + "rewards/rejected": -0.0002452843473292887, "step": 350 }, { - "epoch": 0.06, - "grad_norm": 2.4262238382471386, - "learning_rate": 1.5503875968992246e-07, - "logits/chosen": -3.1277754306793213, - "logits/rejected": -3.1047608852386475, - "logps/chosen": -54.798912048339844, - "logps/rejected": -50.93855667114258, - "loss": 0.6921, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.0022310030180960894, - "rewards/margins": 0.00208022678270936, - "rewards/rejected": -0.004311230033636093, + "epoch": 0.06202618883528601, + "grad_norm": 2.4087536334991455, + "learning_rate": 3.100775193798449e-08, + "logits/chosen": -3.1283411979675293, + "logits/rejected": -3.105438709259033, + "logps/chosen": -54.5811767578125, + "logps/rejected": -50.53804016113281, + "loss": 0.693, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 6.32234150543809e-05, + "rewards/margins": 0.000302921689581126, + "rewards/rejected": -0.0002396982890786603, "step": 360 }, { - "epoch": 0.06, - "grad_norm": 2.349381595760878, - "learning_rate": 1.5934539190353144e-07, - "logits/chosen": -3.1027891635894775, - "logits/rejected": -3.0734617710113525, - "logps/chosen": -52.51411819458008, - "logps/rejected": -51.80864715576172, - "loss": 0.6915, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.0014876796631142497, - "rewards/margins": 0.0032487933058291674, - "rewards/rejected": -0.0047364733181893826, + "epoch": 0.06374913852515507, + "grad_norm": 2.3251330852508545, + "learning_rate": 3.186907838070629e-08, + "logits/chosen": -3.1035568714141846, + "logits/rejected": -3.0743801593780518, + "logps/chosen": -52.390281677246094, + "logps/rejected": -51.37581253051758, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": -5.121355570736341e-05, + "rewards/margins": 0.0002968797634821385, + "rewards/rejected": -0.00034809333737939596, "step": 370 }, { - "epoch": 0.07, - "grad_norm": 2.097053054398486, - "learning_rate": 1.636520241171404e-07, - "logits/chosen": -3.201812744140625, - "logits/rejected": -3.177008867263794, - "logps/chosen": -53.772377014160156, - "logps/rejected": -52.72692108154297, - "loss": 0.6919, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.0027772397734224796, - "rewards/margins": 0.002589695155620575, - "rewards/rejected": -0.005366935394704342, + "epoch": 0.06547208821502412, + "grad_norm": 2.0806522369384766, + "learning_rate": 3.273040482342808e-08, + "logits/chosen": -3.2038414478302, + "logits/rejected": -3.179076671600342, + "logps/chosen": -53.51982879638672, + "logps/rejected": -52.2159309387207, + "loss": 0.6931, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.0001610913168406114, + "rewards/margins": 9.2298214440234e-05, + "rewards/rejected": -0.0002533895312808454, "step": 380 }, { - "epoch": 0.07, - "grad_norm": 2.410804418588189, - "learning_rate": 1.6795865633074934e-07, - "logits/chosen": -3.0987088680267334, - "logits/rejected": -3.0732438564300537, - "logps/chosen": -56.3135986328125, - "logps/rejected": -55.72515106201172, - "loss": 0.6917, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.0026503982953727245, - "rewards/margins": 0.0030277767218649387, - "rewards/rejected": -0.005678174551576376, + "epoch": 0.06719503790489317, + "grad_norm": 2.4117937088012695, + "learning_rate": 3.359173126614987e-08, + "logits/chosen": -3.1004347801208496, + "logits/rejected": -3.074990749359131, + "logps/chosen": -56.068931579589844, + "logps/rejected": -55.217803955078125, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -3.399456909392029e-05, + "rewards/margins": 6.996531010372564e-05, + "rewards/rejected": -0.00010395983554190025, "step": 390 }, { - "epoch": 0.07, - "grad_norm": 2.178080396872868, - "learning_rate": 1.7226528854435832e-07, - "logits/chosen": -3.0680224895477295, - "logits/rejected": -3.0521273612976074, - "logps/chosen": -52.94443893432617, - "logps/rejected": -53.484153747558594, - "loss": 0.691, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.0031809869688004255, - "rewards/margins": 0.0043879905715584755, - "rewards/rejected": -0.00756897684186697, + "epoch": 0.06891798759476224, + "grad_norm": 2.096646308898926, + "learning_rate": 3.445305770887166e-08, + "logits/chosen": -3.0706591606140137, + "logits/rejected": -3.054755687713623, + "logps/chosen": -52.662017822265625, + "logps/rejected": -52.76776123046875, + "loss": 0.6933, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.00022807842469774187, + "rewards/margins": -0.00022528569388668984, + "rewards/rejected": -2.7927221708523575e-06, "step": 400 }, { - "epoch": 0.07, - "eval_logits/chosen": -3.1602914333343506, - "eval_logits/rejected": -3.1546647548675537, - "eval_logps/chosen": -58.560550689697266, - "eval_logps/rejected": -63.11561965942383, - "eval_loss": 0.6926479339599609, - "eval_rewards/accuracies": 0.5611059665679932, - "eval_rewards/chosen": 0.00143293512519449, - "eval_rewards/margins": 0.0010153905022889376, - "eval_rewards/rejected": 0.00041754471021704376, - "eval_runtime": 357.1134, - "eval_samples_per_second": 12.052, - "eval_steps_per_second": 1.507, + "epoch": 0.06891798759476224, + "eval_logits/chosen": -3.1630728244781494, + "eval_logits/rejected": -3.157430648803711, + "eval_logps/chosen": -58.69290542602539, + "eval_logps/rejected": -63.15719985961914, + "eval_loss": 0.6931674480438232, + "eval_rewards/accuracies": 0.4844330847263336, + "eval_rewards/chosen": 0.00018984945199918002, + "eval_rewards/margins": -3.937046858482063e-05, + "eval_rewards/rejected": 0.00022921990603208542, + "eval_runtime": 358.8505, + "eval_samples_per_second": 11.994, + "eval_steps_per_second": 1.499, "step": 400 }, { - "epoch": 0.07, - "grad_norm": 2.1621440173846986, - "learning_rate": 1.7657192075796725e-07, - "logits/chosen": -3.071235179901123, - "logits/rejected": -3.0671350955963135, - "logps/chosen": -51.14980697631836, - "logps/rejected": -56.166534423828125, - "loss": 0.6921, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.004188390448689461, - "rewards/margins": 0.002166205085813999, - "rewards/rejected": -0.00635459553450346, + "epoch": 0.07064093728463129, + "grad_norm": 2.153409242630005, + "learning_rate": 3.531438415159345e-08, + "logits/chosen": -3.0743587017059326, + "logits/rejected": -3.070240020751953, + "logps/chosen": -50.77604293823242, + "logps/rejected": -55.5684814453125, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.0003055145207326859, + "rewards/margins": 9.899081487674266e-05, + "rewards/rejected": -0.0004045053501613438, "step": 410 }, { - "epoch": 0.07, - "grad_norm": 2.497135126567034, - "learning_rate": 1.8087855297157623e-07, - "logits/chosen": -3.0559000968933105, - "logits/rejected": -3.0479495525360107, - "logps/chosen": -54.61994552612305, - "logps/rejected": -54.487693786621094, - "loss": 0.6925, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -0.004710297100245953, - "rewards/margins": 0.0013622719561681151, - "rewards/rejected": -0.006072568707168102, + "epoch": 0.07236388697450034, + "grad_norm": 2.5351555347442627, + "learning_rate": 3.617571059431525e-08, + "logits/chosen": -3.0597853660583496, + "logits/rejected": -3.05192232131958, + "logps/chosen": -54.19941329956055, + "logps/rejected": -53.91211700439453, + "loss": 0.6933, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0003343108983244747, + "rewards/margins": -0.00035097176441922784, + "rewards/rejected": 1.6660869732731953e-05, "step": 420 }, { - "epoch": 0.07, - "grad_norm": 2.277715100196342, - "learning_rate": 1.8518518518518516e-07, - "logits/chosen": -3.083691358566284, - "logits/rejected": -3.069835662841797, - "logps/chosen": -53.39365768432617, - "logps/rejected": -54.8032341003418, - "loss": 0.6915, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.0031691633630543947, - "rewards/margins": 0.0033808585721999407, - "rewards/rejected": -0.006550021469593048, + "epoch": 0.0740868366643694, + "grad_norm": 2.2438621520996094, + "learning_rate": 3.7037037037037036e-08, + "logits/chosen": -3.0879173278808594, + "logits/rejected": -3.074145555496216, + "logps/chosen": -53.09697723388672, + "logps/rejected": -54.19342803955078, + "loss": 0.693, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -5.8635945606511086e-05, + "rewards/margins": 0.0002445178688503802, + "rewards/rejected": -0.0003031538217328489, "step": 430 }, { - "epoch": 0.08, - "grad_norm": 2.5466826911783222, - "learning_rate": 1.8949181739879413e-07, - "logits/chosen": -3.128629684448242, - "logits/rejected": -3.093276262283325, - "logps/chosen": -54.6483268737793, - "logps/rejected": -54.21075439453125, - "loss": 0.6901, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.0036215814761817455, - "rewards/margins": 0.006118671037256718, - "rewards/rejected": -0.009740252047777176, + "epoch": 0.07580978635423846, + "grad_norm": 2.5127062797546387, + "learning_rate": 3.7898363479758826e-08, + "logits/chosen": -3.133507490158081, + "logits/rejected": -3.098098039627075, + "logps/chosen": -54.32233428955078, + "logps/rejected": -53.277740478515625, + "loss": 0.6931, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.00017584441229701042, + "rewards/margins": 0.0001403773931087926, + "rewards/rejected": -0.0003162218490615487, "step": 440 }, { - "epoch": 0.08, - "grad_norm": 2.2918968289848176, - "learning_rate": 1.9379844961240311e-07, - "logits/chosen": -3.050204038619995, - "logits/rejected": -3.0178327560424805, - "logps/chosen": -56.4046516418457, - "logps/rejected": -55.3991813659668, - "loss": 0.6907, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.0031705971341580153, - "rewards/margins": 0.004987316206097603, - "rewards/rejected": -0.008157914504408836, + "epoch": 0.07753273604410751, + "grad_norm": 2.283708333969116, + "learning_rate": 3.875968992248062e-08, + "logits/chosen": -3.0555710792541504, + "logits/rejected": -3.023096799850464, + "logps/chosen": -56.102012634277344, + "logps/rejected": -54.62397384643555, + "loss": 0.6931, + "rewards/accuracies": 0.53125, + "rewards/chosen": -6.127264350652695e-05, + "rewards/margins": 0.00016291522479150444, + "rewards/rejected": -0.00022418785374611616, "step": 450 }, { - "epoch": 0.08, - "grad_norm": 2.446433174118867, - "learning_rate": 1.9810508182601204e-07, - "logits/chosen": -3.0185251235961914, - "logits/rejected": -2.998788833618164, - "logps/chosen": -56.675636291503906, - "logps/rejected": -53.9906005859375, - "loss": 0.6905, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.004722902551293373, - "rewards/margins": 0.005447733215987682, - "rewards/rejected": -0.010170635767281055, + "epoch": 0.07925568573397657, + "grad_norm": 2.387023448944092, + "learning_rate": 3.962101636520241e-08, + "logits/chosen": -3.024385929107666, + "logits/rejected": -3.004836082458496, + "logps/chosen": -56.2154655456543, + "logps/rejected": -53.040618896484375, + "loss": 0.693, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.00015148159582167864, + "rewards/margins": 0.0003812285140156746, + "rewards/rejected": -0.0005327101098373532, "step": 460 }, { - "epoch": 0.08, - "grad_norm": 2.260629002268937, - "learning_rate": 2.0241171403962102e-07, - "logits/chosen": -3.0403809547424316, - "logits/rejected": -3.0088233947753906, - "logps/chosen": -53.7476692199707, - "logps/rejected": -52.34978103637695, - "loss": 0.6906, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.004685810301452875, - "rewards/margins": 0.005127486772835255, - "rewards/rejected": -0.009813296608626842, + "epoch": 0.08097863542384562, + "grad_norm": 2.2020015716552734, + "learning_rate": 4.04823428079242e-08, + "logits/chosen": -3.0474162101745605, + "logits/rejected": -3.015669584274292, + "logps/chosen": -53.30449676513672, + "logps/rejected": -51.4334716796875, + "loss": 0.6932, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.0003891284577548504, + "rewards/margins": -4.8433535994263366e-05, + "rewards/rejected": -0.0003406949108466506, "step": 470 }, { - "epoch": 0.08, - "grad_norm": 2.4519536559144295, - "learning_rate": 2.0671834625322995e-07, - "logits/chosen": -3.0377235412597656, - "logits/rejected": -3.032456398010254, - "logps/chosen": -54.986839294433594, - "logps/rejected": -60.05756378173828, - "loss": 0.6914, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.007612565997987986, - "rewards/margins": 0.003608607454225421, - "rewards/rejected": -0.011221175082027912, + "epoch": 0.08270158511371468, + "grad_norm": 2.44252610206604, + "learning_rate": 4.134366925064599e-08, + "logits/chosen": -3.0451035499572754, + "logits/rejected": -3.0398306846618652, + "logps/chosen": -54.25200271606445, + "logps/rejected": -59.00077438354492, + "loss": 0.693, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.00012817162496503443, + "rewards/margins": 0.00028305535670369864, + "rewards/rejected": -0.00041122696711681783, "step": 480 }, { - "epoch": 0.08, - "grad_norm": 2.4663820792794677, - "learning_rate": 2.1102497846683892e-07, - "logits/chosen": -2.9463233947753906, - "logits/rejected": -2.8967654705047607, - "logps/chosen": -61.15422439575195, - "logps/rejected": -52.75993728637695, - "loss": 0.6889, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.005517776124179363, - "rewards/margins": 0.008782900869846344, - "rewards/rejected": -0.014300678856670856, + "epoch": 0.08442453480358374, + "grad_norm": 2.477860927581787, + "learning_rate": 4.2204995693367785e-08, + "logits/chosen": -2.9549922943115234, + "logits/rejected": -2.9052486419677734, + "logps/chosen": -60.62226486206055, + "logps/rejected": -51.43208694458008, + "loss": 0.6927, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -6.762434350093827e-05, + "rewards/margins": 0.0008653089171275496, + "rewards/rejected": -0.000932933297008276, "step": 490 }, { - "epoch": 0.09, - "grad_norm": 2.330775006199537, - "learning_rate": 2.1533161068044788e-07, - "logits/chosen": -3.0073325634002686, - "logits/rejected": -2.978564500808716, - "logps/chosen": -55.71733474731445, - "logps/rejected": -52.86774826049805, - "loss": 0.6907, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.007723568938672543, - "rewards/margins": 0.005096676293760538, - "rewards/rejected": -0.012820245698094368, + "epoch": 0.08614748449345279, + "grad_norm": 2.260258913040161, + "learning_rate": 4.306632213608958e-08, + "logits/chosen": -3.0170648097991943, + "logits/rejected": -2.9883503913879395, + "logps/chosen": -55.014686584472656, + "logps/rejected": -51.66577911376953, + "loss": 0.6931, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.0006481458549387753, + "rewards/margins": 0.00012295909982640296, + "rewards/rejected": -0.0007711048820056021, "step": 500 }, { - "epoch": 0.09, - "eval_logits/chosen": -3.1515376567840576, - "eval_logits/rejected": -3.1458804607391357, - "eval_logps/chosen": -58.49737548828125, - "eval_logps/rejected": -63.162086486816406, - "eval_loss": 0.6921212077140808, - "eval_rewards/accuracies": 0.5755111575126648, - "eval_rewards/chosen": 0.0020647228229790926, - "eval_rewards/margins": 0.0021117778960615396, - "eval_rewards/rejected": -4.7055131290107965e-05, - "eval_runtime": 357.3148, - "eval_samples_per_second": 12.045, - "eval_steps_per_second": 1.506, + "epoch": 0.08614748449345279, + "eval_logits/chosen": -3.1627964973449707, + "eval_logits/rejected": -3.157134771347046, + "eval_logps/chosen": -58.68923568725586, + "eval_logps/rejected": -63.15824508666992, + "eval_loss": 0.6931439638137817, + "eval_rewards/accuracies": 0.5016263723373413, + "eval_rewards/chosen": 0.00022665159485768527, + "eval_rewards/margins": 7.881514648033772e-06, + "eval_rewards/rejected": 0.00021877007384318858, + "eval_runtime": 358.4675, + "eval_samples_per_second": 12.007, + "eval_steps_per_second": 1.501, "step": 500 }, { - "epoch": 0.09, - "grad_norm": 2.29181189171612, - "learning_rate": 2.1963824289405683e-07, - "logits/chosen": -2.9982523918151855, - "logits/rejected": -2.977153778076172, - "logps/chosen": -59.12430953979492, - "logps/rejected": -53.17649459838867, - "loss": 0.6916, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.008605951443314552, - "rewards/margins": 0.0032728458754718304, - "rewards/rejected": -0.01187879778444767, + "epoch": 0.08787043418332184, + "grad_norm": 2.2401208877563477, + "learning_rate": 4.3927648578811363e-08, + "logits/chosen": -3.009411573410034, + "logits/rejected": -2.988041400909424, + "logps/chosen": -58.32307052612305, + "logps/rejected": -52.057106018066406, + "loss": 0.6931, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.0004813670529983938, + "rewards/margins": 0.00014227270730771124, + "rewards/rejected": -0.0006236397894099355, "step": 510 }, { - "epoch": 0.09, - "grad_norm": 2.083836403157807, - "learning_rate": 2.239448751076658e-07, - "logits/chosen": -3.045750141143799, - "logits/rejected": -3.0202910900115967, - "logps/chosen": -57.38309860229492, - "logps/rejected": -53.21696090698242, - "loss": 0.6902, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -0.009720906615257263, - "rewards/margins": 0.006151780020445585, - "rewards/rejected": -0.015872687101364136, + "epoch": 0.08959338387319091, + "grad_norm": 2.0701003074645996, + "learning_rate": 4.478897502153316e-08, + "logits/chosen": -3.0574822425842285, + "logits/rejected": -3.031964063644409, + "logps/chosen": -56.46551513671875, + "logps/rejected": -51.7152214050293, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0005857711075805128, + "rewards/margins": 0.0002671369875315577, + "rewards/rejected": -0.0008529081824235618, "step": 520 }, { - "epoch": 0.09, - "grad_norm": 2.0972964611747837, - "learning_rate": 2.2825150732127476e-07, - "logits/chosen": -3.0414490699768066, - "logits/rejected": -3.0001041889190674, - "logps/chosen": -56.354469299316406, - "logps/rejected": -53.02845001220703, - "loss": 0.6874, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.006593731231987476, - "rewards/margins": 0.01189809013158083, - "rewards/rejected": -0.018491821363568306, + "epoch": 0.09131633356305996, + "grad_norm": 2.063835382461548, + "learning_rate": 4.5650301464254955e-08, + "logits/chosen": -3.0539472103118896, + "logits/rejected": -3.012070894241333, + "logps/chosen": -55.717613220214844, + "logps/rejected": -51.2911491394043, + "loss": 0.6927, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.00017993396613746881, + "rewards/margins": 0.0008722777711227536, + "rewards/rejected": -0.0010522117372602224, "step": 530 }, { - "epoch": 0.09, - "grad_norm": 2.264333945297249, - "learning_rate": 2.3255813953488372e-07, - "logits/chosen": -3.0273146629333496, - "logits/rejected": -3.0108981132507324, - "logps/chosen": -54.00878143310547, - "logps/rejected": -54.52630615234375, - "loss": 0.6909, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.011568492278456688, - "rewards/margins": 0.004814336076378822, - "rewards/rejected": -0.01638282835483551, + "epoch": 0.09303928325292901, + "grad_norm": 2.2334859371185303, + "learning_rate": 4.6511627906976744e-08, + "logits/chosen": -3.039977550506592, + "logits/rejected": -3.0235543251037598, + "logps/chosen": -52.89643096923828, + "logps/rejected": -52.98133087158203, + "loss": 0.6931, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.0004417643358465284, + "rewards/margins": 0.00019420584430918097, + "rewards/rejected": -0.0006359702092595398, "step": 540 }, { - "epoch": 0.09, - "grad_norm": 2.2643011063724696, - "learning_rate": 2.3686477174849267e-07, - "logits/chosen": -3.086310863494873, - "logits/rejected": -3.0684800148010254, - "logps/chosen": -54.64630126953125, - "logps/rejected": -53.90349197387695, - "loss": 0.689, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.010920585133135319, - "rewards/margins": 0.008641783148050308, - "rewards/rejected": -0.0195623692125082, + "epoch": 0.09476223294279806, + "grad_norm": 2.2609357833862305, + "learning_rate": 4.7372954349698534e-08, + "logits/chosen": -3.1005699634552, + "logits/rejected": -3.082533836364746, + "logps/chosen": -53.61052322387695, + "logps/rejected": -52.064308166503906, + "loss": 0.6928, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.00036659216857515275, + "rewards/margins": 0.0006485350313596427, + "rewards/rejected": -0.0010151272872462869, "step": 550 }, { - "epoch": 0.1, - "grad_norm": 2.6473142726505916, - "learning_rate": 2.411714039621016e-07, - "logits/chosen": -3.0569510459899902, - "logits/rejected": -3.0491485595703125, - "logps/chosen": -53.267906188964844, - "logps/rejected": -57.03251266479492, - "loss": 0.6887, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.011830061674118042, - "rewards/margins": 0.009134244173765182, - "rewards/rejected": -0.020964305847883224, + "epoch": 0.09648518263266713, + "grad_norm": 2.5743932723999023, + "learning_rate": 4.823428079242032e-08, + "logits/chosen": -3.07200026512146, + "logits/rejected": -3.0641121864318848, + "logps/chosen": -52.15167236328125, + "logps/rejected": -55.08266067504883, + "loss": 0.6928, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0004310603253543377, + "rewards/margins": 0.0006935327546671033, + "rewards/rejected": -0.001124593080021441, "step": 560 }, { - "epoch": 0.1, - "grad_norm": 2.2526602237955093, - "learning_rate": 2.454780361757106e-07, - "logits/chosen": -3.0276739597320557, - "logits/rejected": -3.019141435623169, - "logps/chosen": -52.507972717285156, - "logps/rejected": -55.79932403564453, - "loss": 0.6892, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.011621413752436638, - "rewards/margins": 0.008228513412177563, - "rewards/rejected": -0.019849926233291626, + "epoch": 0.09820813232253618, + "grad_norm": 2.238826036453247, + "learning_rate": 4.909560723514212e-08, + "logits/chosen": -3.044468641281128, + "logits/rejected": -3.0358006954193115, + "logps/chosen": -51.41157913208008, + "logps/rejected": -53.927734375, + "loss": 0.6929, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0004941682564094663, + "rewards/margins": 0.0005627792561426759, + "rewards/rejected": -0.0010569475125521421, "step": 570 }, { - "epoch": 0.1, - "grad_norm": 1.8450642814148608, - "learning_rate": 2.4978466838931953e-07, - "logits/chosen": -3.0327651500701904, - "logits/rejected": -3.0267834663391113, - "logps/chosen": -52.5694580078125, - "logps/rejected": -55.16640090942383, - "loss": 0.691, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -0.014748876914381981, - "rewards/margins": 0.004590337164700031, - "rewards/rejected": -0.019339213147759438, + "epoch": 0.09993108201240523, + "grad_norm": 1.7754663228988647, + "learning_rate": 4.995693367786391e-08, + "logits/chosen": -3.0513761043548584, + "logits/rejected": -3.0455574989318848, + "logps/chosen": -51.184574127197266, + "logps/rejected": -53.333045959472656, + "loss": 0.6931, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0007650370826013386, + "rewards/margins": 0.00014660393935628235, + "rewards/rejected": -0.0009116409346461296, "step": 580 }, { - "epoch": 0.1, - "grad_norm": 2.0917320499094676, - "learning_rate": 2.540913006029285e-07, - "logits/chosen": -3.029935359954834, - "logits/rejected": -3.0082030296325684, - "logps/chosen": -56.06707763671875, - "logps/rejected": -56.61497116088867, - "loss": 0.6903, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.012053562328219414, - "rewards/margins": 0.006142253987491131, - "rewards/rejected": -0.01819581724703312, + "epoch": 0.1016540317022743, + "grad_norm": 2.0567564964294434, + "learning_rate": 5.08182601205857e-08, + "logits/chosen": -3.0504350662231445, + "logits/rejected": -3.028501033782959, + "logps/chosen": -54.93719482421875, + "logps/rejected": -54.896331787109375, + "loss": 0.6929, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0005827373242937028, + "rewards/margins": 0.0005187354981899261, + "rewards/rejected": -0.00110147288069129, "step": 590 }, { - "epoch": 0.1, - "grad_norm": 2.3704490382647596, - "learning_rate": 2.5839793281653743e-07, - "logits/chosen": -3.001335382461548, - "logits/rejected": -2.9773948192596436, - "logps/chosen": -54.8140869140625, - "logps/rejected": -59.22527313232422, - "loss": 0.6852, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.009530474431812763, - "rewards/margins": 0.01643664576113224, - "rewards/rejected": -0.025967121124267578, + "epoch": 0.10337698139214335, + "grad_norm": 2.3285975456237793, + "learning_rate": 5.1679586563307486e-08, + "logits/chosen": -3.0234103202819824, + "logits/rejected": -2.9992403984069824, + "logps/chosen": -53.90376663208008, + "logps/rejected": -56.81464385986328, + "loss": 0.6925, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.00020016773487441242, + "rewards/margins": 0.0012626869138330221, + "rewards/rejected": -0.001462854677811265, "step": 600 }, { - "epoch": 0.1, - "eval_logits/chosen": -3.1388049125671387, - "eval_logits/rejected": -3.1331238746643066, - "eval_logps/chosen": -58.60028839111328, - "eval_logps/rejected": -63.40556716918945, - "eval_loss": 0.6914582848548889, - "eval_rewards/accuracies": 0.5822490453720093, - "eval_rewards/chosen": 0.0010355679551139474, - "eval_rewards/margins": 0.0035174190998077393, - "eval_rewards/rejected": -0.0024818514939397573, - "eval_runtime": 357.2236, - "eval_samples_per_second": 12.048, - "eval_steps_per_second": 1.506, + "epoch": 0.10337698139214335, + "eval_logits/chosen": -3.1622629165649414, + "eval_logits/rejected": -3.1565933227539062, + "eval_logps/chosen": -58.667110443115234, + "eval_logps/rejected": -63.150657653808594, + "eval_loss": 0.6930716037750244, + "eval_rewards/accuracies": 0.5157992839813232, + "eval_rewards/chosen": 0.0004477898473851383, + "eval_rewards/margins": 0.0001531161105958745, + "eval_rewards/rejected": 0.0002946736931335181, + "eval_runtime": 358.8537, + "eval_samples_per_second": 11.994, + "eval_steps_per_second": 1.499, "step": 600 }, { - "epoch": 0.11, - "grad_norm": 2.356076010409613, - "learning_rate": 2.627045650301464e-07, - "logits/chosen": -2.964993715286255, - "logits/rejected": -2.9622960090637207, - "logps/chosen": -54.120277404785156, - "logps/rejected": -55.163963317871094, - "loss": 0.6917, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.015915414318442345, - "rewards/margins": 0.0033319643698632717, - "rewards/rejected": -0.019247379153966904, + "epoch": 0.1050999310820124, + "grad_norm": 2.227032423019409, + "learning_rate": 5.254091300602929e-08, + "logits/chosen": -2.9893407821655273, + "logits/rejected": -2.9866795539855957, + "logps/chosen": -52.614341735839844, + "logps/rejected": -53.344757080078125, + "loss": 0.693, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.0004973303293809295, + "rewards/margins": 0.0003403354494366795, + "rewards/rejected": -0.0008376656915061176, "step": 610 }, { - "epoch": 0.11, - "grad_norm": 2.474319380074331, - "learning_rate": 2.670111972437554e-07, - "logits/chosen": -3.120192289352417, - "logits/rejected": -3.0936880111694336, - "logps/chosen": -56.2474479675293, - "logps/rejected": -56.004188537597656, - "loss": 0.6857, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.012355372309684753, - "rewards/margins": 0.015266923233866692, - "rewards/rejected": -0.027622297406196594, + "epoch": 0.10682288077188146, + "grad_norm": 2.3722407817840576, + "learning_rate": 5.340223944875107e-08, + "logits/chosen": -3.148256540298462, + "logits/rejected": -3.1216824054718018, + "logps/chosen": -55.066368103027344, + "logps/rejected": -53.42070388793945, + "loss": 0.6926, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0004736125993076712, + "rewards/margins": 0.001064404845237732, + "rewards/rejected": -0.0015380174154415727, "step": 620 }, { - "epoch": 0.11, - "grad_norm": 2.5148230940892784, - "learning_rate": 2.713178294573643e-07, - "logits/chosen": -3.101238250732422, - "logits/rejected": -3.0744516849517822, - "logps/chosen": -55.61579513549805, - "logps/rejected": -53.38257598876953, - "loss": 0.6895, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.01770562306046486, - "rewards/margins": 0.007805233355611563, - "rewards/rejected": -0.02551085688173771, + "epoch": 0.10854583046175052, + "grad_norm": 2.4768080711364746, + "learning_rate": 5.426356589147286e-08, + "logits/chosen": -3.1315252780914307, + "logits/rejected": -3.1046836376190186, + "logps/chosen": -53.95940017700195, + "logps/rejected": -51.006202697753906, + "loss": 0.6929, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0009459175053052604, + "rewards/margins": 0.0005772692384198308, + "rewards/rejected": -0.0015231866855174303, "step": 630 }, { - "epoch": 0.11, - "grad_norm": 2.4893970821776743, - "learning_rate": 2.756244616709733e-07, - "logits/chosen": -3.067873239517212, - "logits/rejected": -3.056884527206421, - "logps/chosen": -54.79213333129883, - "logps/rejected": -56.961280822753906, - "loss": 0.6896, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.018890097737312317, - "rewards/margins": 0.007660907693207264, - "rewards/rejected": -0.026551008224487305, + "epoch": 0.11026878015161957, + "grad_norm": 2.4684054851531982, + "learning_rate": 5.512489233419466e-08, + "logits/chosen": -3.099504232406616, + "logits/rejected": -3.0885913372039795, + "logps/chosen": -52.9940071105957, + "logps/rejected": -54.46311569213867, + "loss": 0.6929, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0008191806264221668, + "rewards/margins": 0.0005473231431096792, + "rewards/rejected": -0.001366503769531846, "step": 640 }, { - "epoch": 0.11, - "grad_norm": 2.8078860081688024, - "learning_rate": 2.799310938845822e-07, - "logits/chosen": -3.078247308731079, - "logits/rejected": -3.0796327590942383, - "logps/chosen": -53.70562744140625, - "logps/rejected": -57.23451614379883, - "loss": 0.6925, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -0.02245044708251953, - "rewards/margins": 0.0018996421713382006, - "rewards/rejected": -0.024350086227059364, + "epoch": 0.11199172984148863, + "grad_norm": 2.682082176208496, + "learning_rate": 5.598621877691645e-08, + "logits/chosen": -3.111945629119873, + "logits/rejected": -3.1133782863616943, + "logps/chosen": -51.602684020996094, + "logps/rejected": -54.963134765625, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.0012779730604961514, + "rewards/margins": 0.00013899999612476677, + "rewards/rejected": -0.0014169730711728334, "step": 650 }, { - "epoch": 0.11, - "grad_norm": 2.4337428834992307, - "learning_rate": 2.842377260981912e-07, - "logits/chosen": -2.967741012573242, - "logits/rejected": -2.9617857933044434, - "logps/chosen": -56.363494873046875, - "logps/rejected": -54.866737365722656, - "loss": 0.689, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.017431819811463356, - "rewards/margins": 0.008792152628302574, - "rewards/rejected": -0.02622397243976593, + "epoch": 0.11371467953135768, + "grad_norm": 2.239454984664917, + "learning_rate": 5.6847545219638235e-08, + "logits/chosen": -3.002143621444702, + "logits/rejected": -2.995913028717041, + "logps/chosen": -54.70659255981445, + "logps/rejected": -52.40941619873047, + "loss": 0.6928, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.000785406562499702, + "rewards/margins": 0.0006732040201313794, + "rewards/rejected": -0.0014586106408387423, "step": 660 }, { - "epoch": 0.12, - "grad_norm": 2.339727211875298, - "learning_rate": 2.885443583118002e-07, - "logits/chosen": -2.9894890785217285, - "logits/rejected": -2.984511137008667, - "logps/chosen": -54.77692794799805, - "logps/rejected": -60.03133010864258, - "loss": 0.6897, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -0.01770101860165596, - "rewards/margins": 0.007546558044850826, - "rewards/rejected": -0.02524757757782936, + "epoch": 0.11543762922122675, + "grad_norm": 2.2007970809936523, + "learning_rate": 5.770887166236004e-08, + "logits/chosen": -3.0248777866363525, + "logits/rejected": -3.020057439804077, + "logps/chosen": -53.08942794799805, + "logps/rejected": -57.63835906982422, + "loss": 0.693, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.0008383308304473758, + "rewards/margins": 0.000339227553922683, + "rewards/rejected": -0.0011775584425777197, "step": 670 }, { - "epoch": 0.12, - "grad_norm": 2.5872483817238128, - "learning_rate": 2.928509905254091e-07, - "logits/chosen": -2.9396934509277344, - "logits/rejected": -2.914903163909912, - "logps/chosen": -55.78126907348633, - "logps/rejected": -54.39397430419922, - "loss": 0.685, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.018597902730107307, - "rewards/margins": 0.016813453286886215, - "rewards/rejected": -0.035411350429058075, + "epoch": 0.1171605789110958, + "grad_norm": 2.3188748359680176, + "learning_rate": 5.857019810508183e-08, + "logits/chosen": -2.976760149002075, + "logits/rejected": -2.9516377449035645, + "logps/chosen": -54.03368377685547, + "logps/rejected": -51.06056213378906, + "loss": 0.6927, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0010046338429674506, + "rewards/margins": 0.0009472542442381382, + "rewards/rejected": -0.0019518882036209106, "step": 680 }, { - "epoch": 0.12, - "grad_norm": 2.7292903036637064, - "learning_rate": 2.971576227390181e-07, - "logits/chosen": -3.085705518722534, - "logits/rejected": -3.056213617324829, - "logps/chosen": -60.99599075317383, - "logps/rejected": -53.571380615234375, - "loss": 0.6877, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.019351843744516373, - "rewards/margins": 0.011478090658783913, - "rewards/rejected": -0.030829936265945435, + "epoch": 0.11888352860096485, + "grad_norm": 2.590332269668579, + "learning_rate": 5.9431524547803616e-08, + "logits/chosen": -3.1253817081451416, + "logits/rejected": -3.0954742431640625, + "logps/chosen": -59.173851013183594, + "logps/rejected": -50.692222595214844, + "loss": 0.6929, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.0009134371066465974, + "rewards/margins": 0.0005907953018322587, + "rewards/rejected": -0.0015042326413094997, "step": 690 }, { - "epoch": 0.12, - "grad_norm": 2.389276468868878, - "learning_rate": 3.01464254952627e-07, - "logits/chosen": -3.0419516563415527, - "logits/rejected": -3.0135397911071777, - "logps/chosen": -57.74323272705078, - "logps/rejected": -56.58147430419922, - "loss": 0.6854, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.02020222134888172, - "rewards/margins": 0.016081832349300385, - "rewards/rejected": -0.03628405183553696, + "epoch": 0.1206064782908339, + "grad_norm": 2.1985859870910645, + "learning_rate": 6.02928509905254e-08, + "logits/chosen": -3.083073139190674, + "logits/rejected": -3.0547680854797363, + "logps/chosen": -55.823707580566406, + "logps/rejected": -53.165374755859375, + "loss": 0.6927, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.0009546114015392959, + "rewards/margins": 0.0008917865343391895, + "rewards/rejected": -0.0018463979940861464, "step": 700 }, { - "epoch": 0.12, - "eval_logits/chosen": -3.1206588745117188, - "eval_logits/rejected": -3.114993095397949, - "eval_logps/chosen": -58.9453010559082, - "eval_logps/rejected": -63.95465087890625, - "eval_loss": 0.6904971599578857, - "eval_rewards/accuracies": 0.5894516706466675, - "eval_rewards/chosen": -0.0024145517963916063, - "eval_rewards/margins": 0.005558097269386053, - "eval_rewards/rejected": -0.007972650229930878, - "eval_runtime": 357.4068, - "eval_samples_per_second": 12.042, - "eval_steps_per_second": 1.505, + "epoch": 0.1206064782908339, + "eval_logits/chosen": -3.161243200302124, + "eval_logits/rejected": -3.1556365489959717, + "eval_logps/chosen": -58.65497970581055, + "eval_logps/rejected": -63.14204025268555, + "eval_loss": 0.6930544376373291, + "eval_rewards/accuracies": 0.5276486873626709, + "eval_rewards/chosen": 0.0005691515398211777, + "eval_rewards/margins": 0.0001882914948510006, + "eval_rewards/rejected": 0.00038086005952209234, + "eval_runtime": 358.9432, + "eval_samples_per_second": 11.991, + "eval_steps_per_second": 1.499, "step": 700 }, { - "epoch": 0.12, - "grad_norm": 2.586197787105945, - "learning_rate": 3.05770887166236e-07, - "logits/chosen": -3.0198841094970703, - "logits/rejected": -2.9914581775665283, - "logps/chosen": -56.7116584777832, - "logps/rejected": -58.61717987060547, - "loss": 0.6843, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.020994270220398903, - "rewards/margins": 0.018473895266652107, - "rewards/rejected": -0.03946816921234131, + "epoch": 0.12232942798070297, + "grad_norm": 2.2699103355407715, + "learning_rate": 6.11541774332472e-08, + "logits/chosen": -3.0615015029907227, + "logits/rejected": -3.032845973968506, + "logps/chosen": -54.72351837158203, + "logps/rejected": -54.8898811340332, + "loss": 0.6926, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0007481739739887416, + "rewards/margins": 0.0011700403410941362, + "rewards/rejected": -0.0019182141404598951, "step": 710 }, { - "epoch": 0.12, - "grad_norm": 2.702057793681809, - "learning_rate": 3.100775193798449e-07, - "logits/chosen": -2.9804978370666504, - "logits/rejected": -2.9764490127563477, - "logps/chosen": -55.76905059814453, - "logps/rejected": -58.02165603637695, - "loss": 0.6876, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -0.024686764925718307, - "rewards/margins": 0.011716444976627827, - "rewards/rejected": -0.03640320897102356, + "epoch": 0.12405237767057202, + "grad_norm": 2.5009286403656006, + "learning_rate": 6.201550387596898e-08, + "logits/chosen": -3.0233469009399414, + "logits/rejected": -3.019094944000244, + "logps/chosen": -53.42400360107422, + "logps/rejected": -54.57286834716797, + "loss": 0.6928, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0010689168702811003, + "rewards/margins": 0.0006829799385741353, + "rewards/rejected": -0.0017518966924399137, "step": 720 }, { - "epoch": 0.13, - "grad_norm": 2.6466880554114547, - "learning_rate": 3.143841515934539e-07, - "logits/chosen": -3.0941200256347656, - "logits/rejected": -3.069159746170044, - "logps/chosen": -59.358177185058594, - "logps/rejected": -57.388641357421875, - "loss": 0.6844, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.03032388910651207, - "rewards/margins": 0.01852940209209919, - "rewards/rejected": -0.04885329678654671, + "epoch": 0.12577532736044109, + "grad_norm": 2.380788803100586, + "learning_rate": 6.287683031869078e-08, + "logits/chosen": -3.1400551795959473, + "logits/rejected": -3.115017890930176, + "logps/chosen": -56.4382209777832, + "logps/rejected": -52.75396728515625, + "loss": 0.6925, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.001059521222487092, + "rewards/margins": 0.001269210479222238, + "rewards/rejected": -0.0023287315852940083, "step": 730 }, { - "epoch": 0.13, - "grad_norm": 2.3238465611991073, - "learning_rate": 3.186907838070629e-07, - "logits/chosen": -2.9740123748779297, - "logits/rejected": -2.9482741355895996, - "logps/chosen": -57.704978942871094, - "logps/rejected": -58.32112503051758, - "loss": 0.6846, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.0279629435390234, - "rewards/margins": 0.01816297508776188, - "rewards/rejected": -0.04612591490149498, + "epoch": 0.12749827705031014, + "grad_norm": 2.2064075469970703, + "learning_rate": 6.373815676141258e-08, + "logits/chosen": -3.0220272541046143, + "logits/rejected": -2.996253252029419, + "logps/chosen": -54.99372100830078, + "logps/rejected": -53.92853927612305, + "loss": 0.6924, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0007686042226850986, + "rewards/margins": 0.0015255699399858713, + "rewards/rejected": -0.0022941739298403263, "step": 740 }, { - "epoch": 0.13, - "grad_norm": 2.60752160244168, - "learning_rate": 3.229974160206718e-07, - "logits/chosen": -3.142927646636963, - "logits/rejected": -3.115029811859131, - "logps/chosen": -58.75443649291992, - "logps/rejected": -59.41529083251953, - "loss": 0.6801, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.027507567778229713, - "rewards/margins": 0.026881281286478043, - "rewards/rejected": -0.05438884347677231, + "epoch": 0.1292212267401792, + "grad_norm": 2.4289848804473877, + "learning_rate": 6.459948320413436e-08, + "logits/chosen": -3.195591449737549, + "logits/rejected": -3.1674859523773193, + "logps/chosen": -56.058433532714844, + "logps/rejected": -54.27191162109375, + "loss": 0.692, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.00045926342136226594, + "rewards/margins": 0.0022056284360587597, + "rewards/rejected": -0.0026648917701095343, "step": 750 }, { - "epoch": 0.13, - "grad_norm": 3.113387241673442, - "learning_rate": 3.273040482342808e-07, - "logits/chosen": -2.9908509254455566, - "logits/rejected": -2.9526984691619873, - "logps/chosen": -57.714439392089844, - "logps/rejected": -54.94502639770508, - "loss": 0.683, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.03286290913820267, - "rewards/margins": 0.02157803811132908, - "rewards/rejected": -0.0544409453868866, + "epoch": 0.13094417643004824, + "grad_norm": 2.580063819885254, + "learning_rate": 6.546080964685615e-08, + "logits/chosen": -3.0457024574279785, + "logits/rejected": -3.007045269012451, + "logps/chosen": -54.52983856201172, + "logps/rejected": -49.803890228271484, + "loss": 0.6922, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.00093385751824826, + "rewards/margins": 0.0019278116524219513, + "rewards/rejected": -0.002861668821424246, "step": 760 }, { - "epoch": 0.13, - "grad_norm": 2.4453618798570953, - "learning_rate": 3.3161068044788976e-07, - "logits/chosen": -3.038949489593506, - "logits/rejected": -3.015986680984497, - "logps/chosen": -56.661033630371094, - "logps/rejected": -57.70270538330078, - "loss": 0.685, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.037630461156368256, - "rewards/margins": 0.017523907124996185, - "rewards/rejected": -0.05515437200665474, + "epoch": 0.1326671261199173, + "grad_norm": 2.090689182281494, + "learning_rate": 6.632213608957795e-08, + "logits/chosen": -3.096262216567993, + "logits/rejected": -3.073035478591919, + "logps/chosen": -53.0078239440918, + "logps/rejected": -52.45222091674805, + "loss": 0.6924, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0008852818864397705, + "rewards/margins": 0.0015501610469073057, + "rewards/rejected": -0.00243544252589345, "step": 770 }, { - "epoch": 0.13, - "grad_norm": 2.7995118829195893, - "learning_rate": 3.359173126614987e-07, - "logits/chosen": -3.0295486450195312, - "logits/rejected": -2.9988982677459717, - "logps/chosen": -57.22774124145508, - "logps/rejected": -57.87713623046875, - "loss": 0.6804, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.04078695923089981, - "rewards/margins": 0.026560068130493164, - "rewards/rejected": -0.06734703481197357, + "epoch": 0.13439007580978635, + "grad_norm": 2.663207530975342, + "learning_rate": 6.718346253229975e-08, + "logits/chosen": -3.091189384460449, + "logits/rejected": -3.06011700630188, + "logps/chosen": -53.27953338623047, + "logps/rejected": -51.486976623535156, + "loss": 0.6923, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.0013955377507954836, + "rewards/margins": 0.00176303053740412, + "rewards/rejected": -0.003158567938953638, "step": 780 }, { - "epoch": 0.14, - "grad_norm": 2.59814741855216, - "learning_rate": 3.402239448751076e-07, - "logits/chosen": -3.0351967811584473, - "logits/rejected": -3.001878499984741, - "logps/chosen": -57.94755172729492, - "logps/rejected": -59.64392852783203, - "loss": 0.6835, + "epoch": 0.1361130254996554, + "grad_norm": 2.2545323371887207, + "learning_rate": 6.804478897502153e-08, + "logits/chosen": -3.0985918045043945, + "logits/rejected": -3.065253496170044, + "logps/chosen": -54.0023078918457, + "logps/rejected": -53.85721969604492, + "loss": 0.6922, "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.04035332426428795, - "rewards/margins": 0.02070373296737671, - "rewards/rejected": -0.06105704978108406, + "rewards/chosen": -0.0009566719527356327, + "rewards/margins": 0.001929369755089283, + "rewards/rejected": -0.0028860417660325766, "step": 790 }, { - "epoch": 0.14, - "grad_norm": 2.979271826495132, - "learning_rate": 3.4453057708871665e-07, - "logits/chosen": -2.9210307598114014, - "logits/rejected": -2.8951001167297363, - "logps/chosen": -59.6242790222168, - "logps/rejected": -61.24828338623047, - "loss": 0.6829, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.04646407067775726, - "rewards/margins": 0.021958164870738983, - "rewards/rejected": -0.06842224299907684, + "epoch": 0.13783597518952448, + "grad_norm": 2.6749494075775146, + "learning_rate": 6.890611541774332e-08, + "logits/chosen": -2.984854221343994, + "logits/rejected": -2.958704710006714, + "logps/chosen": -55.17264938354492, + "logps/rejected": -54.7309455871582, + "loss": 0.6924, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.0017509472090750933, + "rewards/margins": 0.0016059564659371972, + "rewards/rejected": -0.003356903325766325, "step": 800 }, { - "epoch": 0.14, - "eval_logits/chosen": -3.094379186630249, - "eval_logits/rejected": -3.0886640548706055, - "eval_logps/chosen": -60.67955017089844, - "eval_logps/rejected": -66.09896087646484, - "eval_loss": 0.6886637806892395, - "eval_rewards/accuracies": 0.5734200477600098, - "eval_rewards/chosen": -0.019756997004151344, - "eval_rewards/margins": 0.009658826515078545, - "eval_rewards/rejected": -0.029415827244520187, - "eval_runtime": 356.2401, - "eval_samples_per_second": 12.082, - "eval_steps_per_second": 1.51, + "epoch": 0.13783597518952448, + "eval_logits/chosen": -3.160144567489624, + "eval_logits/rejected": -3.154557228088379, + "eval_logps/chosen": -58.60886764526367, + "eval_logps/rejected": -63.124420166015625, + "eval_loss": 0.692913293838501, + "eval_rewards/accuracies": 0.550882875919342, + "eval_rewards/chosen": 0.0010302558075636625, + "eval_rewards/margins": 0.0004732160014100373, + "eval_rewards/rejected": 0.0005570398643612862, + "eval_runtime": 358.4769, + "eval_samples_per_second": 12.006, + "eval_steps_per_second": 1.501, "step": 800 }, { - "epoch": 0.14, - "grad_norm": 2.4246944915810724, - "learning_rate": 3.4883720930232557e-07, - "logits/chosen": -2.9902844429016113, - "logits/rejected": -2.9628844261169434, - "logps/chosen": -61.16777801513672, - "logps/rejected": -62.579185485839844, - "loss": 0.6814, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.045093193650245667, - "rewards/margins": 0.025155682116746902, - "rewards/rejected": -0.07024887204170227, + "epoch": 0.13955892487939353, + "grad_norm": 2.225269317626953, + "learning_rate": 6.976744186046512e-08, + "logits/chosen": -3.059091567993164, + "logits/rejected": -3.0310420989990234, + "logps/chosen": -56.76906204223633, + "logps/rejected": -55.854148864746094, + "loss": 0.6924, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.0013166740536689758, + "rewards/margins": 0.001593802124261856, + "rewards/rejected": -0.002910476177930832, "step": 810 }, { - "epoch": 0.14, - "grad_norm": 2.572085405803537, - "learning_rate": 3.531438415159345e-07, - "logits/chosen": -3.0369210243225098, - "logits/rejected": -3.0116257667541504, - "logps/chosen": -56.83648681640625, - "logps/rejected": -58.9111328125, - "loss": 0.6793, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.05299956351518631, - "rewards/margins": 0.028944198042154312, - "rewards/rejected": -0.08194376528263092, + "epoch": 0.14128187456926258, + "grad_norm": 2.152216911315918, + "learning_rate": 7.06287683031869e-08, + "logits/chosen": -3.108264923095703, + "logits/rejected": -3.0827205181121826, + "logps/chosen": -51.76808547973633, + "logps/rejected": -51.052452087402344, + "loss": 0.6926, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0022040274925529957, + "rewards/margins": 0.0010511528234928846, + "rewards/rejected": -0.0032551803160458803, "step": 820 }, { - "epoch": 0.14, - "grad_norm": 2.8751549171801125, - "learning_rate": 3.574504737295435e-07, - "logits/chosen": -2.96337628364563, - "logits/rejected": -2.948317050933838, - "logps/chosen": -61.24248504638672, - "logps/rejected": -63.26741409301758, - "loss": 0.6811, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.06662433594465256, - "rewards/margins": 0.02630782127380371, - "rewards/rejected": -0.09293216466903687, + "epoch": 0.14300482425913164, + "grad_norm": 2.438965082168579, + "learning_rate": 7.149009474590869e-08, + "logits/chosen": -3.036221742630005, + "logits/rejected": -3.0208523273468018, + "logps/chosen": -54.74811935424805, + "logps/rejected": -54.435279846191406, + "loss": 0.6917, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0016831646207720041, + "rewards/margins": 0.002907418180257082, + "rewards/rejected": -0.00459058303385973, "step": 830 }, { - "epoch": 0.14, - "grad_norm": 2.78034944593138, - "learning_rate": 3.6175710594315246e-07, - "logits/chosen": -3.060987949371338, - "logits/rejected": -3.034540891647339, - "logps/chosen": -60.72172164916992, - "logps/rejected": -59.53022003173828, - "loss": 0.6774, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.06628672778606415, - "rewards/margins": 0.033673472702503204, - "rewards/rejected": -0.09996020048856735, + "epoch": 0.1447277739490007, + "grad_norm": 2.556420087814331, + "learning_rate": 7.23514211886305e-08, + "logits/chosen": -3.139453172683716, + "logits/rejected": -3.112957715988159, + "logps/chosen": -54.24663162231445, + "logps/rejected": -49.9597053527832, + "loss": 0.6918, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0015173661522567272, + "rewards/margins": 0.002657081466168165, + "rewards/rejected": -0.004174447618424892, "step": 840 }, { - "epoch": 0.15, - "grad_norm": 2.6269588611544923, - "learning_rate": 3.660637381567614e-07, - "logits/chosen": -2.93640398979187, - "logits/rejected": -2.9257328510284424, - "logps/chosen": -58.38752365112305, - "logps/rejected": -64.44731140136719, - "loss": 0.6844, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -0.07559017837047577, - "rewards/margins": 0.01969515159726143, - "rewards/rejected": -0.0952853411436081, + "epoch": 0.14645072363886974, + "grad_norm": 2.29156494140625, + "learning_rate": 7.321274763135228e-08, + "logits/chosen": -3.015171766281128, + "logits/rejected": -3.0045599937438965, + "logps/chosen": -51.05498504638672, + "logps/rejected": -55.26131057739258, + "loss": 0.6927, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.002338874852284789, + "rewards/margins": 0.0009175013983622193, + "rewards/rejected": -0.0032563761342316866, "step": 850 }, { - "epoch": 0.15, - "grad_norm": 2.5848797454702646, - "learning_rate": 3.703703703703703e-07, - "logits/chosen": -2.9593284130096436, - "logits/rejected": -2.9393258094787598, - "logps/chosen": -60.31854248046875, - "logps/rejected": -63.26483154296875, - "loss": 0.6769, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.07502218335866928, - "rewards/margins": 0.03479871153831482, - "rewards/rejected": -0.1098209023475647, + "epoch": 0.1481736733287388, + "grad_norm": 2.323979377746582, + "learning_rate": 7.407407407407407e-08, + "logits/chosen": -3.0407052040100098, + "logits/rejected": -3.0206594467163086, + "logps/chosen": -53.085052490234375, + "logps/rejected": -52.8076057434082, + "loss": 0.6918, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0022581566590815783, + "rewards/margins": 0.0027536931447684765, + "rewards/rejected": -0.005011850036680698, "step": 860 }, { - "epoch": 0.15, - "grad_norm": 2.5705323083064373, - "learning_rate": 3.7467700258397934e-07, - "logits/chosen": -3.0270254611968994, - "logits/rejected": -3.0232787132263184, - "logps/chosen": -58.79963302612305, - "logps/rejected": -64.54035949707031, - "loss": 0.6772, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.07506883889436722, - "rewards/margins": 0.033983923494815826, - "rewards/rejected": -0.10905275493860245, + "epoch": 0.14989662301860784, + "grad_norm": 1.9577609300613403, + "learning_rate": 7.493540051679587e-08, + "logits/chosen": -3.1143956184387207, + "logits/rejected": -3.1105878353118896, + "logps/chosen": -51.548980712890625, + "logps/rejected": -54.14459228515625, + "loss": 0.692, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.0024740765802562237, + "rewards/margins": 0.0024134463164955378, + "rewards/rejected": -0.004887523595243692, "step": 870 }, { - "epoch": 0.15, - "grad_norm": 2.689358158624382, - "learning_rate": 3.7898363479758827e-07, - "logits/chosen": -2.9357428550720215, - "logits/rejected": -2.9134020805358887, - "logps/chosen": -59.15460205078125, - "logps/rejected": -62.313621520996094, - "loss": 0.6765, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.07416196167469025, - "rewards/margins": 0.035411037504673004, - "rewards/rejected": -0.10957300662994385, + "epoch": 0.15161957270847692, + "grad_norm": 1.9016143083572388, + "learning_rate": 7.579672695951765e-08, + "logits/chosen": -3.027916669845581, + "logits/rejected": -3.005197763442993, + "logps/chosen": -51.94502639770508, + "logps/rejected": -51.932273864746094, + "loss": 0.6915, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0021511423401534557, + "rewards/margins": 0.003370120655745268, + "rewards/rejected": -0.005521262995898724, "step": 880 }, { - "epoch": 0.15, - "grad_norm": 2.6438277738000293, - "learning_rate": 3.832902670111972e-07, - "logits/chosen": -2.9593563079833984, - "logits/rejected": -2.9229862689971924, - "logps/chosen": -64.84220123291016, - "logps/rejected": -63.8851203918457, - "loss": 0.6778, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.06395251303911209, - "rewards/margins": 0.03320237249135971, - "rewards/rejected": -0.0971548929810524, + "epoch": 0.15334252239834598, + "grad_norm": 2.2559702396392822, + "learning_rate": 7.665805340223945e-08, + "logits/chosen": -3.055936098098755, + "logits/rejected": -3.018589496612549, + "logps/chosen": -58.56825637817383, + "logps/rejected": -54.62053298950195, + "loss": 0.6913, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.0008119450649246573, + "rewards/margins": 0.0037904041819274426, + "rewards/rejected": -0.004602349363267422, "step": 890 }, { - "epoch": 0.16, - "grad_norm": 2.832318924397106, - "learning_rate": 3.8759689922480623e-07, - "logits/chosen": -2.977108955383301, - "logits/rejected": -2.9669320583343506, - "logps/chosen": -62.267547607421875, - "logps/rejected": -63.87932586669922, - "loss": 0.6773, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.08133289217948914, - "rewards/margins": 0.033999234437942505, - "rewards/rejected": -0.11533211171627045, + "epoch": 0.15506547208821503, + "grad_norm": 2.1713647842407227, + "learning_rate": 7.751937984496124e-08, + "logits/chosen": -3.076097011566162, + "logits/rejected": -3.0657076835632324, + "logps/chosen": -54.41865158081055, + "logps/rejected": -52.89152908325195, + "loss": 0.692, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.002819065237417817, + "rewards/margins": 0.002241636859253049, + "rewards/rejected": -0.005060701631009579, "step": 900 }, { - "epoch": 0.16, - "eval_logits/chosen": -3.057013750076294, - "eval_logits/rejected": -3.051332473754883, - "eval_logps/chosen": -63.69248962402344, - "eval_logps/rejected": -69.66419982910156, - "eval_loss": 0.6862910985946655, - "eval_rewards/accuracies": 0.5929368138313293, - "eval_rewards/chosen": -0.049886368215084076, - "eval_rewards/margins": 0.01518191583454609, - "eval_rewards/rejected": -0.06506828218698502, - "eval_runtime": 356.6845, - "eval_samples_per_second": 12.067, - "eval_steps_per_second": 1.508, + "epoch": 0.15506547208821503, + "eval_logits/chosen": -3.158005714416504, + "eval_logits/rejected": -3.152367115020752, + "eval_logps/chosen": -58.56904983520508, + "eval_logps/rejected": -63.108516693115234, + "eval_loss": 0.6927970051765442, + "eval_rewards/accuracies": 0.553438663482666, + "eval_rewards/chosen": 0.001428465824574232, + "eval_rewards/margins": 0.000712412700522691, + "eval_rewards/rejected": 0.0007160529494285583, + "eval_runtime": 358.7453, + "eval_samples_per_second": 11.997, + "eval_steps_per_second": 1.5, "step": 900 }, { - "epoch": 0.16, - "grad_norm": 2.791646492165207, - "learning_rate": 3.9190353143841515e-07, - "logits/chosen": -2.9454283714294434, - "logits/rejected": -2.9370040893554688, - "logps/chosen": -60.4368896484375, - "logps/rejected": -65.48957061767578, - "loss": 0.676, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.09426609426736832, - "rewards/margins": 0.04028692469000816, - "rewards/rejected": -0.13455303013324738, + "epoch": 0.15678842177808408, + "grad_norm": 2.131281852722168, + "learning_rate": 7.838070628768303e-08, + "logits/chosen": -3.046898603439331, + "logits/rejected": -3.038809299468994, + "logps/chosen": -51.3541374206543, + "logps/rejected": -52.53623580932617, + "loss": 0.6924, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0032709583174437284, + "rewards/margins": 0.001514771138317883, + "rewards/rejected": -0.004785729572176933, "step": 910 }, { - "epoch": 0.16, - "grad_norm": 3.011151931674002, - "learning_rate": 3.962101636520241e-07, - "logits/chosen": -2.9765427112579346, - "logits/rejected": -2.934263229370117, - "logps/chosen": -64.45548248291016, - "logps/rejected": -62.556602478027344, - "loss": 0.6796, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.10165347158908844, - "rewards/margins": 0.029758507385849953, - "rewards/rejected": -0.13141196966171265, + "epoch": 0.15851137146795313, + "grad_norm": 2.3213186264038086, + "learning_rate": 7.924203273040482e-08, + "logits/chosen": -3.0878713130950928, + "logits/rejected": -3.0445356369018555, + "logps/chosen": -54.627174377441406, + "logps/rejected": -50.106483459472656, + "loss": 0.6914, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.003049984108656645, + "rewards/margins": 0.003485618159174919, + "rewards/rejected": -0.006535602267831564, "step": 920 }, { - "epoch": 0.16, - "grad_norm": 3.2118502689123245, - "learning_rate": 4.0051679586563306e-07, - "logits/chosen": -3.031832456588745, - "logits/rejected": -3.0159494876861572, - "logps/chosen": -62.47021484375, - "logps/rejected": -68.52490234375, - "loss": 0.6746, + "epoch": 0.16023432115782218, + "grad_norm": 2.5753068923950195, + "learning_rate": 8.010335917312662e-08, + "logits/chosen": -3.148953437805176, + "logits/rejected": -3.1329092979431152, + "logps/chosen": -52.779518127441406, + "logps/rejected": -55.11871337890625, + "loss": 0.6915, "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.10014158487319946, - "rewards/margins": 0.04036015272140503, - "rewards/rejected": -0.14050175249576569, + "rewards/chosen": -0.0029565533623099327, + "rewards/margins": 0.0033222727943211794, + "rewards/rejected": -0.006278825458139181, "step": 930 }, { - "epoch": 0.16, - "grad_norm": 2.9839009058497106, - "learning_rate": 4.0482342807924204e-07, - "logits/chosen": -3.00956392288208, - "logits/rejected": -2.973090410232544, - "logps/chosen": -71.03943634033203, - "logps/rejected": -69.27840423583984, - "loss": 0.6769, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.10842480510473251, - "rewards/margins": 0.03579176589846611, - "rewards/rejected": -0.14421656727790833, + "epoch": 0.16195727084769124, + "grad_norm": 2.208367347717285, + "learning_rate": 8.09646856158484e-08, + "logits/chosen": -3.128950834274292, + "logits/rejected": -3.091381072998047, + "logps/chosen": -60.43787384033203, + "logps/rejected": -55.45613479614258, + "loss": 0.6915, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.0023253881372511387, + "rewards/margins": 0.0033840001560747623, + "rewards/rejected": -0.005709387827664614, "step": 940 }, { - "epoch": 0.16, - "grad_norm": 2.966178612417894, - "learning_rate": 4.0913006029285096e-07, - "logits/chosen": -2.802929639816284, - "logits/rejected": -2.785623073577881, - "logps/chosen": -67.41767883300781, - "logps/rejected": -71.18104553222656, - "loss": 0.6779, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.1217707023024559, - "rewards/margins": 0.033875979483127594, - "rewards/rejected": -0.1556466817855835, + "epoch": 0.16368022053756032, + "grad_norm": 2.2127363681793213, + "learning_rate": 8.18260120585702e-08, + "logits/chosen": -2.9215450286865234, + "logits/rejected": -2.904310941696167, + "logps/chosen": -55.69914627075195, + "logps/rejected": -56.334136962890625, + "loss": 0.6918, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.004277623258531094, + "rewards/margins": 0.002753492910414934, + "rewards/rejected": -0.007031116634607315, "step": 950 }, { - "epoch": 0.17, - "grad_norm": 3.310978864654213, - "learning_rate": 4.134366925064599e-07, - "logits/chosen": -2.7671194076538086, - "logits/rejected": -2.770169734954834, - "logps/chosen": -63.908164978027344, - "logps/rejected": -69.78746032714844, - "loss": 0.6894, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -0.1316661685705185, - "rewards/margins": 0.010264839045703411, - "rewards/rejected": -0.14193101227283478, + "epoch": 0.16540317022742937, + "grad_norm": 2.286198139190674, + "learning_rate": 8.268733850129198e-08, + "logits/chosen": -2.8904712200164795, + "logits/rejected": -2.893658399581909, + "logps/chosen": -51.27644729614258, + "logps/rejected": -56.144874572753906, + "loss": 0.6931, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.005351345054805279, + "rewards/margins": 9.641332144383341e-05, + "rewards/rejected": -0.00544775789603591, "step": 960 }, { - "epoch": 0.17, - "grad_norm": 3.407472849470465, - "learning_rate": 4.177433247200689e-07, - "logits/chosen": -2.93009877204895, - "logits/rejected": -2.8950753211975098, - "logps/chosen": -72.93366241455078, - "logps/rejected": -67.29154968261719, - "loss": 0.6821, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.1230693906545639, - "rewards/margins": 0.025729473680257797, - "rewards/rejected": -0.148798868060112, + "epoch": 0.16712611991729842, + "grad_norm": 2.3423335552215576, + "learning_rate": 8.354866494401377e-08, + "logits/chosen": -3.0606789588928223, + "logits/rejected": -3.025033473968506, + "logps/chosen": -61.13622283935547, + "logps/rejected": -53.06995391845703, + "loss": 0.6924, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.004727092571556568, + "rewards/margins": 0.0015245076501742005, + "rewards/rejected": -0.0062515996396541595, "step": 970 }, { - "epoch": 0.17, - "grad_norm": 3.6748252700178887, - "learning_rate": 4.2204995693367785e-07, - "logits/chosen": -3.024207353591919, - "logits/rejected": -3.0045340061187744, - "logps/chosen": -67.85172271728516, - "logps/rejected": -72.01612854003906, - "loss": 0.6733, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.11790040880441666, - "rewards/margins": 0.05413081496953964, - "rewards/rejected": -0.1720312237739563, + "epoch": 0.16884906960716747, + "grad_norm": 3.1241321563720703, + "learning_rate": 8.440999138673557e-08, + "logits/chosen": -3.1625547409057617, + "logits/rejected": -3.142909288406372, + "logps/chosen": -56.2438850402832, + "logps/rejected": -55.52935028076172, + "loss": 0.6907, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.0019513750448822975, + "rewards/margins": 0.004942773841321468, + "rewards/rejected": -0.006894148886203766, "step": 980 }, { - "epoch": 0.17, - "grad_norm": 3.726003522783883, - "learning_rate": 4.263565891472868e-07, - "logits/chosen": -2.889413833618164, - "logits/rejected": -2.8664448261260986, - "logps/chosen": -66.66841125488281, - "logps/rejected": -68.96918487548828, - "loss": 0.6738, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.12137794494628906, - "rewards/margins": 0.04220535233616829, - "rewards/rejected": -0.16358330845832825, + "epoch": 0.17057201929703653, + "grad_norm": 2.4787614345550537, + "learning_rate": 8.527131782945735e-08, + "logits/chosen": -3.0291647911071777, + "logits/rejected": -3.0057761669158936, + "logps/chosen": -54.96424102783203, + "logps/rejected": -53.39398956298828, + "loss": 0.6913, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.003949849866330624, + "rewards/margins": 0.003804834559559822, + "rewards/rejected": -0.007754684425890446, "step": 990 }, { - "epoch": 0.17, - "grad_norm": 3.397153414453954, - "learning_rate": 4.3066322136089576e-07, - "logits/chosen": -2.836636543273926, - "logits/rejected": -2.8090505599975586, - "logps/chosen": -71.08350372314453, - "logps/rejected": -68.55287170410156, - "loss": 0.6818, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.14227214455604553, - "rewards/margins": 0.02618454024195671, - "rewards/rejected": -0.16845668852329254, + "epoch": 0.17229496898690558, + "grad_norm": 2.216383218765259, + "learning_rate": 8.613264427217916e-08, + "logits/chosen": -2.9808809757232666, + "logits/rejected": -2.9531712532043457, + "logps/chosen": -57.33502197265625, + "logps/rejected": -52.36609649658203, + "loss": 0.6924, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.004668045789003372, + "rewards/margins": 0.0015643674414604902, + "rewards/rejected": -0.006232412997633219, "step": 1000 }, { - "epoch": 0.17, - "eval_logits/chosen": -3.0087461471557617, - "eval_logits/rejected": -3.003145694732666, - "eval_logps/chosen": -67.30126190185547, - "eval_logps/rejected": -73.87139892578125, - "eval_loss": 0.6837059855461121, - "eval_rewards/accuracies": 0.5971189737319946, - "eval_rewards/chosen": -0.08597413450479507, - "eval_rewards/margins": 0.02116604894399643, - "eval_rewards/rejected": -0.1071401834487915, - "eval_runtime": 356.1874, - "eval_samples_per_second": 12.084, - "eval_steps_per_second": 1.51, + "epoch": 0.17229496898690558, + "eval_logits/chosen": -3.1550450325012207, + "eval_logits/rejected": -3.1494359970092773, + "eval_logps/chosen": -58.53335952758789, + "eval_logps/rejected": -63.10966491699219, + "eval_loss": 0.6926171779632568, + "eval_rewards/accuracies": 0.5659851431846619, + "eval_rewards/chosen": 0.0017853755271062255, + "eval_rewards/margins": 0.0010808416409417987, + "eval_rewards/rejected": 0.0007045338279567659, + "eval_runtime": 358.4783, + "eval_samples_per_second": 12.006, + "eval_steps_per_second": 1.501, "step": 1000 }, { - "epoch": 0.17, - "grad_norm": 3.4839564072636575, - "learning_rate": 4.3496985357450473e-07, - "logits/chosen": -2.771763324737549, - "logits/rejected": -2.781416416168213, - "logps/chosen": -67.20372009277344, - "logps/rejected": -78.4428939819336, - "loss": 0.6666, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.13454464077949524, - "rewards/margins": 0.07122045755386353, - "rewards/rejected": -0.20576509833335876, + "epoch": 0.17401791867677463, + "grad_norm": 2.3578145503997803, + "learning_rate": 8.699397071490094e-08, + "logits/chosen": -2.9164555072784424, + "logits/rejected": -2.927083730697632, + "logps/chosen": -54.320526123046875, + "logps/rejected": -58.7243537902832, + "loss": 0.6918, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.005627059377729893, + "rewards/margins": 0.002754826098680496, + "rewards/rejected": -0.008381885476410389, "step": 1010 }, { - "epoch": 0.18, - "grad_norm": 3.456440043201763, - "learning_rate": 4.3927648578811366e-07, - "logits/chosen": -2.9379830360412598, - "logits/rejected": -2.9037129878997803, - "logps/chosen": -70.02944946289062, - "logps/rejected": -73.0363540649414, - "loss": 0.6657, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.1292036920785904, - "rewards/margins": 0.05881650000810623, - "rewards/rejected": -0.18802018463611603, + "epoch": 0.17574086836664368, + "grad_norm": 2.234851837158203, + "learning_rate": 8.785529715762273e-08, + "logits/chosen": -3.0988657474517822, + "logits/rejected": -3.063410997390747, + "logps/chosen": -57.33527374267578, + "logps/rejected": -55.05609893798828, + "loss": 0.6902, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.002326482906937599, + "rewards/margins": 0.005923739168792963, + "rewards/rejected": -0.008250223472714424, "step": 1020 }, { - "epoch": 0.18, - "grad_norm": 3.573048388794818, - "learning_rate": 4.4358311800172264e-07, - "logits/chosen": -2.9771835803985596, - "logits/rejected": -2.9505438804626465, - "logps/chosen": -67.1936264038086, - "logps/rejected": -70.21983337402344, - "loss": 0.6709, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.1349172741174698, - "rewards/margins": 0.048518020659685135, - "rewards/rejected": -0.18343529105186462, + "epoch": 0.17746381805651276, + "grad_norm": 2.3829662799835205, + "learning_rate": 8.871662360034454e-08, + "logits/chosen": -3.147603988647461, + "logits/rejected": -3.120401382446289, + "logps/chosen": -54.09687423706055, + "logps/rejected": -52.60686492919922, + "loss": 0.6916, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.0037808609195053577, + "rewards/margins": 0.003162707667797804, + "rewards/rejected": -0.006943568587303162, "step": 1030 }, { - "epoch": 0.18, - "grad_norm": 4.0876459743090034, - "learning_rate": 4.478897502153316e-07, - "logits/chosen": -2.893394947052002, - "logits/rejected": -2.8834469318389893, - "logps/chosen": -68.99354553222656, - "logps/rejected": -74.55607604980469, - "loss": 0.6785, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.1559665948152542, - "rewards/margins": 0.03430451080203056, - "rewards/rejected": -0.19027109444141388, + "epoch": 0.17918676774638181, + "grad_norm": 2.459951639175415, + "learning_rate": 8.957795004306632e-08, + "logits/chosen": -3.0629427433013916, + "logits/rejected": -3.052708148956299, + "logps/chosen": -53.935150146484375, + "logps/rejected": -56.3193244934082, + "loss": 0.692, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.005367881152778864, + "rewards/margins": 0.0024500801227986813, + "rewards/rejected": -0.007817961275577545, "step": 1040 }, { - "epoch": 0.18, - "grad_norm": 3.9493370777335084, - "learning_rate": 4.5219638242894055e-07, - "logits/chosen": -2.8083250522613525, - "logits/rejected": -2.773916721343994, - "logps/chosen": -73.37397766113281, - "logps/rejected": -71.31639862060547, - "loss": 0.6732, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.16260358691215515, - "rewards/margins": 0.04476263374090195, - "rewards/rejected": -0.2073661983013153, + "epoch": 0.18090971743625087, + "grad_norm": 2.4851348400115967, + "learning_rate": 9.04392764857881e-08, + "logits/chosen": -2.9838688373565674, + "logits/rejected": -2.9484922885894775, + "logps/chosen": -57.58784103393555, + "logps/rejected": -51.438087463378906, + "loss": 0.6911, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.004447963088750839, + "rewards/margins": 0.0042016999796032906, + "rewards/rejected": -0.00864966306835413, "step": 1050 }, { - "epoch": 0.18, - "grad_norm": 3.7318845219809025, - "learning_rate": 4.565030146425495e-07, - "logits/chosen": -2.8137454986572266, - "logits/rejected": -2.7951653003692627, - "logps/chosen": -73.75227355957031, - "logps/rejected": -76.26736450195312, - "loss": 0.6754, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.1725139617919922, - "rewards/margins": 0.04043154790997505, - "rewards/rejected": -0.21294550597667694, + "epoch": 0.18263266712611992, + "grad_norm": 2.4529802799224854, + "learning_rate": 9.130060292850991e-08, + "logits/chosen": -2.9925010204315186, + "logits/rejected": -2.973201036453247, + "logps/chosen": -57.02306365966797, + "logps/rejected": -55.665924072265625, + "loss": 0.6923, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.005218047183007002, + "rewards/margins": 0.0016771454829722643, + "rewards/rejected": -0.00689519289880991, "step": 1060 }, { - "epoch": 0.18, - "grad_norm": 3.921970666271597, - "learning_rate": 4.6080964685615845e-07, - "logits/chosen": -2.9450836181640625, - "logits/rejected": -2.912147045135498, - "logps/chosen": -74.61820220947266, - "logps/rejected": -76.5606689453125, - "loss": 0.6729, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.18662679195404053, - "rewards/margins": 0.04664309695363045, - "rewards/rejected": -0.23326988518238068, + "epoch": 0.18435561681598897, + "grad_norm": 2.302189588546753, + "learning_rate": 9.216192937123169e-08, + "logits/chosen": -3.135798215866089, + "logits/rejected": -3.101851463317871, + "logps/chosen": -56.41740798950195, + "logps/rejected": -54.17853546142578, + "loss": 0.691, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.004861590452492237, + "rewards/margins": 0.004440676420927048, + "rewards/rejected": -0.009302266873419285, "step": 1070 }, { - "epoch": 0.19, - "grad_norm": 3.8287702054233956, - "learning_rate": 4.6511627906976743e-07, - "logits/chosen": -2.9213509559631348, - "logits/rejected": -2.8993239402770996, - "logps/chosen": -76.99789428710938, - "logps/rejected": -75.40824127197266, - "loss": 0.6781, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.21221812069416046, - "rewards/margins": 0.03467785567045212, - "rewards/rejected": -0.2468959540128708, + "epoch": 0.18607856650585802, + "grad_norm": 2.2802231311798096, + "learning_rate": 9.302325581395349e-08, + "logits/chosen": -3.1133651733398438, + "logits/rejected": -3.090050458908081, + "logps/chosen": -56.36030960083008, + "logps/rejected": -51.7191047668457, + "loss": 0.6911, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0057452223263680935, + "rewards/margins": 0.004140386823564768, + "rewards/rejected": -0.009885609149932861, "step": 1080 }, { - "epoch": 0.19, - "grad_norm": 3.9047242283272396, - "learning_rate": 4.6942291128337636e-07, - "logits/chosen": -2.797940731048584, - "logits/rejected": -2.7896194458007812, - "logps/chosen": -71.64787292480469, - "logps/rejected": -77.1730728149414, - "loss": 0.6704, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.19434688985347748, - "rewards/margins": 0.051201529800891876, - "rewards/rejected": -0.24554841220378876, + "epoch": 0.18780151619572708, + "grad_norm": 2.3795809745788574, + "learning_rate": 9.388458225667527e-08, + "logits/chosen": -2.990147590637207, + "logits/rejected": -2.9818215370178223, + "logps/chosen": -52.83929443359375, + "logps/rejected": -53.5362434387207, + "loss": 0.6917, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.006076144985854626, + "rewards/margins": 0.0029487418942153454, + "rewards/rejected": -0.009024888277053833, "step": 1090 }, { - "epoch": 0.19, - "grad_norm": 3.742135088088785, - "learning_rate": 4.7372954349698534e-07, - "logits/chosen": -2.8556008338928223, - "logits/rejected": -2.8531699180603027, - "logps/chosen": -69.83229064941406, - "logps/rejected": -78.55072784423828, - "loss": 0.6715, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.18923941254615784, - "rewards/margins": 0.050502412021160126, - "rewards/rejected": -0.23974183201789856, + "epoch": 0.18952446588559613, + "grad_norm": 2.1332859992980957, + "learning_rate": 9.474590869939707e-08, + "logits/chosen": -3.0555577278137207, + "logits/rejected": -3.0526106357574463, + "logps/chosen": -51.41303634643555, + "logps/rejected": -55.484619140625, + "loss": 0.6913, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.005056067835539579, + "rewards/margins": 0.0037126517854630947, + "rewards/rejected": -0.008768720552325249, "step": 1100 }, { - "epoch": 0.19, - "eval_logits/chosen": -2.946117401123047, - "eval_logits/rejected": -2.94049072265625, - "eval_logps/chosen": -71.7703857421875, - "eval_logps/rejected": -79.22164916992188, - "eval_loss": 0.679982602596283, - "eval_rewards/accuracies": 0.6057156324386597, - "eval_rewards/chosen": -0.13066548109054565, - "eval_rewards/margins": 0.029977135360240936, - "eval_rewards/rejected": -0.1606426239013672, - "eval_runtime": 356.7556, - "eval_samples_per_second": 12.064, - "eval_steps_per_second": 1.508, + "epoch": 0.18952446588559613, + "eval_logits/chosen": -3.1514084339141846, + "eval_logits/rejected": -3.145766258239746, + "eval_logps/chosen": -58.5028076171875, + "eval_logps/rejected": -63.130279541015625, + "eval_loss": 0.6923677325248718, + "eval_rewards/accuracies": 0.5736523866653442, + "eval_rewards/chosen": 0.0020908990409225225, + "eval_rewards/margins": 0.0015924354083836079, + "eval_rewards/rejected": 0.0004984635161235929, + "eval_runtime": 358.7667, + "eval_samples_per_second": 11.997, + "eval_steps_per_second": 1.5, "step": 1100 }, { - "epoch": 0.19, - "grad_norm": 4.753611130936478, - "learning_rate": 4.780361757105943e-07, - "logits/chosen": -2.840238571166992, - "logits/rejected": -2.8512330055236816, - "logps/chosen": -71.57658386230469, - "logps/rejected": -81.72315979003906, - "loss": 0.6636, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.18691562116146088, - "rewards/margins": 0.0658998116850853, - "rewards/rejected": -0.2528154253959656, + "epoch": 0.1912474155754652, + "grad_norm": 2.6487016677856445, + "learning_rate": 9.560723514211886e-08, + "logits/chosen": -3.052523136138916, + "logits/rejected": -3.0647952556610107, + "logps/chosen": -53.663841247558594, + "logps/rejected": -57.5274543762207, + "loss": 0.6917, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.007680105976760387, + "rewards/margins": 0.0030740201473236084, + "rewards/rejected": -0.010754126124083996, "step": 1110 }, { - "epoch": 0.19, - "grad_norm": 4.200014824991902, - "learning_rate": 4.823428079242032e-07, - "logits/chosen": -2.8601462841033936, - "logits/rejected": -2.83921480178833, - "logps/chosen": -75.09476470947266, - "logps/rejected": -77.8160400390625, - "loss": 0.6691, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.1874438375234604, - "rewards/margins": 0.05362669751048088, - "rewards/rejected": -0.24107055366039276, + "epoch": 0.19297036526533426, + "grad_norm": 2.3997607231140137, + "learning_rate": 9.646856158484065e-08, + "logits/chosen": -3.0824146270751953, + "logits/rejected": -3.0605499744415283, + "logps/chosen": -56.986732482910156, + "logps/rejected": -54.80094528198242, + "loss": 0.691, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.006157047115266323, + "rewards/margins": 0.004412065260112286, + "rewards/rejected": -0.010569113306701183, "step": 1120 }, { - "epoch": 0.19, - "grad_norm": 4.069383490654212, - "learning_rate": 4.866494401378123e-07, - "logits/chosen": -2.903735399246216, - "logits/rejected": -2.8804819583892822, - "logps/chosen": -72.0593490600586, - "logps/rejected": -81.51217651367188, - "loss": 0.6601, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.20027342438697815, - "rewards/margins": 0.07303432375192642, - "rewards/rejected": -0.273307740688324, + "epoch": 0.1946933149552033, + "grad_norm": 2.228640556335449, + "learning_rate": 9.732988802756244e-08, + "logits/chosen": -3.1479599475860596, + "logits/rejected": -3.1234474182128906, + "logps/chosen": -52.66511917114258, + "logps/rejected": -55.532569885253906, + "loss": 0.6897, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.006327435374259949, + "rewards/margins": 0.007075057830661535, + "rewards/rejected": -0.013402493670582771, "step": 1130 }, { - "epoch": 0.2, - "grad_norm": 4.350410897251021, - "learning_rate": 4.909560723514212e-07, - "logits/chosen": -2.808011293411255, - "logits/rejected": -2.7773683071136475, - "logps/chosen": -77.37016296386719, - "logps/rejected": -78.98043060302734, - "loss": 0.6698, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.2049834281206131, - "rewards/margins": 0.055016178637742996, - "rewards/rejected": -0.2599996030330658, + "epoch": 0.19641626464507236, + "grad_norm": 2.443138837814331, + "learning_rate": 9.819121447028424e-08, + "logits/chosen": -3.0550379753112793, + "logits/rejected": -3.0215907096862793, + "logps/chosen": -57.4569091796875, + "logps/rejected": -54.08552932739258, + "loss": 0.6906, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.005755481775850058, + "rewards/margins": 0.005251362454146147, + "rewards/rejected": -0.011006844229996204, "step": 1140 }, { - "epoch": 0.2, - "grad_norm": 4.680052319466735, - "learning_rate": 4.952627045650301e-07, - "logits/chosen": -2.7504663467407227, - "logits/rejected": -2.7321338653564453, - "logps/chosen": -74.69068908691406, - "logps/rejected": -83.3788833618164, - "loss": 0.6601, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.21349653601646423, - "rewards/margins": 0.0752733126282692, - "rewards/rejected": -0.28876984119415283, + "epoch": 0.19813921433494142, + "grad_norm": 2.4156675338745117, + "learning_rate": 9.905254091300602e-08, + "logits/chosen": -2.994870662689209, + "logits/rejected": -2.9757802486419678, + "logps/chosen": -53.95721435546875, + "logps/rejected": -55.600433349609375, + "loss": 0.6906, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.006008354481309652, + "rewards/margins": 0.005140149500221014, + "rewards/rejected": -0.011148503981530666, "step": 1150 }, { - "epoch": 0.2, - "grad_norm": 4.383585223712436, - "learning_rate": 4.995693367786391e-07, - "logits/chosen": -2.8323190212249756, - "logits/rejected": -2.823431968688965, - "logps/chosen": -74.20658874511719, - "logps/rejected": -82.88806915283203, - "loss": 0.6568, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.20177213847637177, - "rewards/margins": 0.08152016252279282, - "rewards/rejected": -0.283292293548584, + "epoch": 0.19986216402481047, + "grad_norm": 2.241849899291992, + "learning_rate": 9.991386735572782e-08, + "logits/chosen": -3.090665817260742, + "logits/rejected": -3.081921100616455, + "logps/chosen": -54.39727020263672, + "logps/rejected": -55.57891845703125, + "loss": 0.6899, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.0037164140958338976, + "rewards/margins": 0.00659269979223609, + "rewards/rejected": -0.010309114120900631, "step": 1160 }, { - "epoch": 0.2, - "grad_norm": 4.7924512014269425, - "learning_rate": 4.999990843883228e-07, - "logits/chosen": -2.706714630126953, - "logits/rejected": -2.694248914718628, - "logps/chosen": -75.1012191772461, - "logps/rejected": -86.26618194580078, - "loss": 0.6645, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.2305610179901123, - "rewards/margins": 0.08545393496751785, - "rewards/rejected": -0.31601497530937195, + "epoch": 0.20158511371467952, + "grad_norm": 2.1488406658172607, + "learning_rate": 9.999981687766457e-08, + "logits/chosen": -2.9630703926086426, + "logits/rejected": -2.9503164291381836, + "logps/chosen": -52.81767654418945, + "logps/rejected": -55.88347244262695, + "loss": 0.6909, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.007383792661130428, + "rewards/margins": 0.004716981668025255, + "rewards/rejected": -0.012100773863494396, "step": 1170 }, { - "epoch": 0.2, - "grad_norm": 4.531230658306969, - "learning_rate": 4.999959193195308e-07, - "logits/chosen": -2.665681838989258, - "logits/rejected": -2.6355409622192383, - "logps/chosen": -78.78900146484375, - "logps/rejected": -83.79048156738281, - "loss": 0.6643, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.2599770426750183, - "rewards/margins": 0.06831606477499008, - "rewards/rejected": -0.3282931447029114, + "epoch": 0.2033080634045486, + "grad_norm": 2.5440945625305176, + "learning_rate": 9.999918386390616e-08, + "logits/chosen": -2.934640645980835, + "logits/rejected": -2.903498411178589, + "logps/chosen": -53.553077697753906, + "logps/rejected": -52.60760498046875, + "loss": 0.6889, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.007512775249779224, + "rewards/margins": 0.008542842231690884, + "rewards/rejected": -0.016055617481470108, "step": 1180 }, { - "epoch": 0.21, - "grad_norm": 4.6260170747775105, - "learning_rate": 4.999904935183911e-07, - "logits/chosen": -2.841900587081909, - "logits/rejected": -2.8088276386260986, - "logps/chosen": -83.71769714355469, - "logps/rejected": -83.9743423461914, - "loss": 0.6567, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.25023940205574036, - "rewards/margins": 0.08566378057003021, - "rewards/rejected": -0.33590319752693176, + "epoch": 0.20503101309441765, + "grad_norm": 2.4966301918029785, + "learning_rate": 9.999809870367821e-08, + "logits/chosen": -3.1306917667388916, + "logits/rejected": -3.0955986976623535, + "logps/chosen": -59.563941955566406, + "logps/rejected": -51.870330810546875, + "loss": 0.6902, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.008644647896289825, + "rewards/margins": 0.006059793755412102, + "rewards/rejected": -0.014704440720379353, "step": 1190 }, { - "epoch": 0.21, - "grad_norm": 4.214091282801035, - "learning_rate": 4.999828070339698e-07, - "logits/chosen": -2.669875144958496, - "logits/rejected": -2.655302047729492, - "logps/chosen": -79.1661605834961, - "logps/rejected": -83.97240447998047, - "loss": 0.6651, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.2603180706501007, - "rewards/margins": 0.0669156163930893, - "rewards/rejected": -0.3272337019443512, + "epoch": 0.2067539627842867, + "grad_norm": 2.1367452144622803, + "learning_rate": 9.999656140679395e-08, + "logits/chosen": -2.962451934814453, + "logits/rejected": -2.9473876953125, + "logps/chosen": -54.0195426940918, + "logps/rejected": -52.5385856628418, + "loss": 0.6912, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.008550790138542652, + "rewards/margins": 0.004117645788937807, + "rewards/rejected": -0.012668436393141747, "step": 1200 }, { - "epoch": 0.21, - "eval_logits/chosen": -2.85164475440979, - "eval_logits/rejected": -2.84596586227417, - "eval_logps/chosen": -78.02967834472656, - "eval_logps/rejected": -86.59571075439453, - "eval_loss": 0.6755677461624146, - "eval_rewards/accuracies": 0.5996747016906738, - "eval_rewards/chosen": -0.1932583451271057, - "eval_rewards/margins": 0.04112492874264717, - "eval_rewards/rejected": -0.23438328504562378, - "eval_runtime": 357.4222, - "eval_samples_per_second": 12.042, - "eval_steps_per_second": 1.505, + "epoch": 0.2067539627842867, + "eval_logits/chosen": -3.1463611125946045, + "eval_logits/rejected": -3.140730619430542, + "eval_logps/chosen": -58.488128662109375, + "eval_logps/rejected": -63.16765594482422, + "eval_loss": 0.692119836807251, + "eval_rewards/accuracies": 0.5794609785079956, + "eval_rewards/chosen": 0.0022376305423676968, + "eval_rewards/margins": 0.002112939953804016, + "eval_rewards/rejected": 0.00012469064677134156, + "eval_runtime": 358.7316, + "eval_samples_per_second": 11.998, + "eval_steps_per_second": 1.5, "step": 1200 }, { - "epoch": 0.21, - "grad_norm": 5.729626186243229, - "learning_rate": 4.999728599357762e-07, - "logits/chosen": -2.7580645084381104, - "logits/rejected": -2.725999355316162, - "logps/chosen": -82.05293273925781, - "logps/rejected": -90.14490509033203, - "loss": 0.6559, + "epoch": 0.20847691247415576, + "grad_norm": 2.3286216259002686, + "learning_rate": 9.999457198715525e-08, + "logits/chosen": -3.0613112449645996, + "logits/rejected": -3.0273356437683105, + "logps/chosen": -54.412635803222656, + "logps/rejected": -54.4266471862793, + "loss": 0.6893, "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.2827844023704529, - "rewards/margins": 0.08879880607128143, - "rewards/rejected": -0.37158316373825073, + "rewards/chosen": -0.006243783514946699, + "rewards/margins": 0.00786572601646185, + "rewards/rejected": -0.014109509997069836, "step": 1210 }, { - "epoch": 0.21, - "grad_norm": 4.843172675927166, - "learning_rate": 4.999606523137628e-07, - "logits/chosen": -2.7558417320251465, - "logits/rejected": -2.730149745941162, - "logps/chosen": -82.96326446533203, - "logps/rejected": -92.13468933105469, - "loss": 0.6487, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.2960751950740814, - "rewards/margins": 0.10249904543161392, - "rewards/rejected": -0.39857420325279236, + "epoch": 0.2101998621640248, + "grad_norm": 2.1323070526123047, + "learning_rate": 9.999213046275256e-08, + "logits/chosen": -3.0718696117401123, + "logits/rejected": -3.0449776649475098, + "logps/chosen": -54.26097869873047, + "logps/rejected": -53.925315856933594, + "loss": 0.6894, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.008796758949756622, + "rewards/margins": 0.007621658034622669, + "rewards/rejected": -0.016418416053056717, "step": 1220 }, { - "epoch": 0.21, - "grad_norm": 5.963904626730247, - "learning_rate": 4.99946184278324e-07, - "logits/chosen": -2.815377950668335, - "logits/rejected": -2.7770168781280518, - "logps/chosen": -87.67040252685547, - "logps/rejected": -92.19978332519531, - "loss": 0.6591, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.31936585903167725, - "rewards/margins": 0.08062832802534103, - "rewards/rejected": -0.3999941945075989, + "epoch": 0.21192281185389386, + "grad_norm": 2.6729142665863037, + "learning_rate": 9.99892368556648e-08, + "logits/chosen": -3.1416068077087402, + "logits/rejected": -3.0996181964874268, + "logps/chosen": -56.39683151245117, + "logps/rejected": -53.861724853515625, + "loss": 0.6883, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.006459666881710291, + "rewards/margins": 0.009909730404615402, + "rewards/rejected": -0.016369396820664406, "step": 1230 }, { - "epoch": 0.21, - "grad_norm": 5.432890340977972, - "learning_rate": 4.999294559602954e-07, - "logits/chosen": -2.681164264678955, - "logits/rejected": -2.666093349456787, - "logps/chosen": -85.52889251708984, - "logps/rejected": -93.53931427001953, - "loss": 0.6638, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.3310098946094513, - "rewards/margins": 0.07634725421667099, - "rewards/rejected": -0.4073571562767029, + "epoch": 0.2136457615437629, + "grad_norm": 2.1508169174194336, + "learning_rate": 9.998589119205909e-08, + "logits/chosen": -3.002638816833496, + "logits/rejected": -2.98604679107666, + "logps/chosen": -53.4490852355957, + "logps/rejected": -54.5033073425293, + "loss": 0.6899, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.01011999323964119, + "rewards/margins": 0.006711783818900585, + "rewards/rejected": -0.0168317761272192, "step": 1240 }, { - "epoch": 0.22, - "grad_norm": 5.250007678464064, - "learning_rate": 4.999104675109525e-07, - "logits/chosen": -2.787619113922119, - "logits/rejected": -2.7531590461730957, - "logps/chosen": -84.2263412475586, - "logps/rejected": -89.0356216430664, - "loss": 0.6633, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.3051467835903168, - "rewards/margins": 0.07372823357582092, - "rewards/rejected": -0.3788750171661377, + "epoch": 0.21536871123363197, + "grad_norm": 2.67529034614563, + "learning_rate": 9.99820935021905e-08, + "logits/chosen": -3.1221249103546143, + "logits/rejected": -3.085853099822998, + "logps/chosen": -54.63789749145508, + "logps/rejected": -52.832984924316406, + "loss": 0.6897, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.009338639676570892, + "rewards/margins": 0.007117290049791336, + "rewards/rejected": -0.01645592972636223, "step": 1250 }, - { - "epoch": 0.22, - "grad_norm": 5.437906354386665, - "learning_rate": 4.998892191020092e-07, - "logits/chosen": -2.6413798332214355, - "logits/rejected": -2.6131081581115723, - "logps/chosen": -83.180419921875, - "logps/rejected": -89.31295776367188, - "loss": 0.6567, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.30223608016967773, - "rewards/margins": 0.08826258033514023, - "rewards/rejected": -0.39049869775772095, + { + "epoch": 0.21709166092350105, + "grad_norm": 2.1187283992767334, + "learning_rate": 9.997784382040184e-08, + "logits/chosen": -2.9702210426330566, + "logits/rejected": -2.9389166831970215, + "logps/chosen": -53.91698455810547, + "logps/rejected": -52.1651611328125, + "loss": 0.6886, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.009416448883712292, + "rewards/margins": 0.0093917828053236, + "rewards/rejected": -0.018808234483003616, "step": 1260 }, { - "epoch": 0.22, - "grad_norm": 7.461728534142161, - "learning_rate": 4.998657109256166e-07, - "logits/chosen": -2.6996243000030518, - "logits/rejected": -2.6940901279449463, - "logps/chosen": -85.5318374633789, - "logps/rejected": -94.53944396972656, - "loss": 0.6692, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.33671867847442627, - "rewards/margins": 0.06170588731765747, - "rewards/rejected": -0.39842456579208374, + "epoch": 0.2188146106133701, + "grad_norm": 2.4756853580474854, + "learning_rate": 9.997314218512333e-08, + "logits/chosen": -3.030531644821167, + "logits/rejected": -3.024228572845459, + "logps/chosen": -52.87784957885742, + "logps/rejected": -56.451568603515625, + "loss": 0.6895, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.010020340792834759, + "rewards/margins": 0.007446722127497196, + "rewards/rejected": -0.017467062920331955, "step": 1270 }, { - "epoch": 0.22, - "grad_norm": 5.558513126380609, - "learning_rate": 4.998399431943609e-07, - "logits/chosen": -2.768416166305542, - "logits/rejected": -2.7740864753723145, - "logps/chosen": -79.00863647460938, - "logps/rejected": -98.24131774902344, - "loss": 0.6481, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.2783285081386566, - "rewards/margins": 0.10511146485805511, - "rewards/rejected": -0.38343995809555054, + "epoch": 0.22053756030323915, + "grad_norm": 2.364240884780884, + "learning_rate": 9.996798863887219e-08, + "logits/chosen": -3.1014938354492188, + "logits/rejected": -3.1085102558135986, + "logps/chosen": -52.22028732299805, + "logps/rejected": -61.497581481933594, + "loss": 0.6906, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.010487397201359272, + "rewards/margins": 0.005354976747184992, + "rewards/rejected": -0.01584237441420555, "step": 1280 }, { - "epoch": 0.22, - "grad_norm": 5.865934178946898, - "learning_rate": 4.998119161412618e-07, - "logits/chosen": -2.6547913551330566, - "logits/rejected": -2.62509822845459, - "logps/chosen": -86.21808624267578, - "logps/rejected": -90.81287384033203, - "loss": 0.6581, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.3163338005542755, - "rewards/margins": 0.08770157396793365, - "rewards/rejected": -0.40403538942337036, + "epoch": 0.2222605099931082, + "grad_norm": 2.251617193222046, + "learning_rate": 9.996238322825236e-08, + "logits/chosen": -3.001249074935913, + "logits/rejected": -2.9696197509765625, + "logps/chosen": -55.589752197265625, + "logps/rejected": -52.25542449951172, + "loss": 0.6891, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.009888270869851112, + "rewards/margins": 0.008468730375170708, + "rewards/rejected": -0.01835699938237667, "step": 1290 }, { - "epoch": 0.22, - "grad_norm": 6.062029915276689, - "learning_rate": 4.997816300197699e-07, - "logits/chosen": -2.7270829677581787, - "logits/rejected": -2.714017391204834, - "logps/chosen": -87.53861999511719, - "logps/rejected": -99.14437866210938, - "loss": 0.663, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.37334591150283813, - "rewards/margins": 0.0809113010764122, - "rewards/rejected": -0.4542572498321533, + "epoch": 0.22398345968297725, + "grad_norm": 2.166390895843506, + "learning_rate": 9.995632600395398e-08, + "logits/chosen": -3.0807461738586426, + "logits/rejected": -3.0679678916931152, + "logps/chosen": -51.707557678222656, + "logps/rejected": -55.653839111328125, + "loss": 0.6911, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.014926217496395111, + "rewards/margins": 0.004388140980154276, + "rewards/rejected": -0.0193143580108881, "step": 1300 }, { - "epoch": 0.22, - "eval_logits/chosen": -2.7855560779571533, - "eval_logits/rejected": -2.7796361446380615, - "eval_logps/chosen": -86.98535919189453, - "eval_logps/rejected": -97.24429321289062, - "eval_loss": 0.6691488027572632, - "eval_rewards/accuracies": 0.6171003580093384, - "eval_rewards/chosen": -0.2828150987625122, - "eval_rewards/margins": 0.05805408954620361, - "eval_rewards/rejected": -0.3408692181110382, - "eval_runtime": 357.0192, - "eval_samples_per_second": 12.055, - "eval_steps_per_second": 1.507, + "epoch": 0.22398345968297725, + "eval_logits/chosen": -3.141386032104492, + "eval_logits/rejected": -3.1357979774475098, + "eval_logps/chosen": -58.53720474243164, + "eval_logps/rejected": -63.28915023803711, + "eval_loss": 0.6917756199836731, + "eval_rewards/accuracies": 0.5901486873626709, + "eval_rewards/chosen": 0.0017469110898673534, + "eval_rewards/margins": 0.0028372127562761307, + "eval_rewards/rejected": -0.0010903014335781336, + "eval_runtime": 358.9888, + "eval_samples_per_second": 11.989, + "eval_steps_per_second": 1.499, "step": 1300 }, { - "epoch": 0.23, - "grad_norm": 7.2374921490781166, - "learning_rate": 4.997490851037651e-07, - "logits/chosen": -2.7199060916900635, - "logits/rejected": -2.685650110244751, - "logps/chosen": -90.32498931884766, - "logps/rejected": -97.50151824951172, - "loss": 0.6474, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.3698830008506775, - "rewards/margins": 0.11114762723445892, - "rewards/rejected": -0.4810306429862976, + "epoch": 0.2257064093728463, + "grad_norm": 1.9226795434951782, + "learning_rate": 9.9949817020753e-08, + "logits/chosen": -3.080859661102295, + "logits/rejected": -3.045379877090454, + "logps/chosen": -54.435699462890625, + "logps/rejected": -51.69322967529297, + "loss": 0.6874, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.010951696895062923, + "rewards/margins": 0.011780351400375366, + "rewards/rejected": -0.022732049226760864, "step": 1310 }, { - "epoch": 0.23, - "grad_norm": 7.329198026466427, - "learning_rate": 4.997142816875534e-07, - "logits/chosen": -2.6866555213928223, - "logits/rejected": -2.6637563705444336, - "logps/chosen": -93.89530944824219, - "logps/rejected": -97.65375518798828, - "loss": 0.6605, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.38488394021987915, - "rewards/margins": 0.0846201553940773, - "rewards/rejected": -0.469504177570343, + "epoch": 0.22742935906271536, + "grad_norm": 2.345458745956421, + "learning_rate": 9.994285633751067e-08, + "logits/chosen": -3.053536891937256, + "logits/rejected": -3.0280728340148926, + "logps/chosen": -56.30860137939453, + "logps/rejected": -52.7589225769043, + "loss": 0.6876, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.008888078853487968, + "rewards/margins": 0.011339199729263783, + "rewards/rejected": -0.020227279514074326, "step": 1320 }, { - "epoch": 0.23, - "grad_norm": 6.989316967263134, - "learning_rate": 4.996772200858648e-07, - "logits/chosen": -2.759702682495117, - "logits/rejected": -2.731628894805908, - "logps/chosen": -94.14637756347656, - "logps/rejected": -99.45263671875, - "loss": 0.6529, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.3895763158798218, - "rewards/margins": 0.10352253913879395, - "rewards/rejected": -0.4930989146232605, + "epoch": 0.22915230875258444, + "grad_norm": 2.4452571868896484, + "learning_rate": 9.993544401717297e-08, + "logits/chosen": -3.1321685314178467, + "logits/rejected": -3.1015255451202393, + "logps/chosen": -56.034820556640625, + "logps/rejected": -52.01787185668945, + "loss": 0.688, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.008172191679477692, + "rewards/margins": 0.010616080835461617, + "rewards/rejected": -0.018788272514939308, "step": 1330 }, { - "epoch": 0.23, - "grad_norm": 5.339352998534476, - "learning_rate": 4.996379006338504e-07, - "logits/chosen": -2.6027302742004395, - "logits/rejected": -2.582573890686035, - "logps/chosen": -86.88957977294922, - "logps/rejected": -96.32715606689453, - "loss": 0.6431, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.3176007866859436, - "rewards/margins": 0.11718092858791351, - "rewards/rejected": -0.4347817003726959, + "epoch": 0.2308752584424535, + "grad_norm": 2.2883365154266357, + "learning_rate": 9.992758012677008e-08, + "logits/chosen": -2.973372459411621, + "logits/rejected": -2.951862335205078, + "logps/chosen": -56.06450271606445, + "logps/rejected": -54.80854415893555, + "loss": 0.6882, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.009250979870557785, + "rewards/margins": 0.010158119723200798, + "rewards/rejected": -0.019409101456403732, "step": 1340 }, { - "epoch": 0.23, - "grad_norm": 6.251815390342403, - "learning_rate": 4.99596323687079e-07, - "logits/chosen": -2.6558520793914795, - "logits/rejected": -2.632688522338867, - "logps/chosen": -94.7468490600586, - "logps/rejected": -102.03514099121094, - "loss": 0.666, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.40661248564720154, - "rewards/margins": 0.08162738382816315, - "rewards/rejected": -0.4882398247718811, + "epoch": 0.23259820813232254, + "grad_norm": 2.2209107875823975, + "learning_rate": 9.991926473741578e-08, + "logits/chosen": -3.038304090499878, + "logits/rejected": -3.013697862625122, + "logps/chosen": -55.443382263183594, + "logps/rejected": -55.3911018371582, + "loss": 0.6893, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.013421006500720978, + "rewards/margins": 0.008150272071361542, + "rewards/rejected": -0.02157127857208252, "step": 1350 }, { - "epoch": 0.23, - "grad_norm": 6.6807835207225725, - "learning_rate": 4.995524896215339e-07, - "logits/chosen": -2.606091022491455, - "logits/rejected": -2.593371868133545, - "logps/chosen": -95.50038146972656, - "logps/rejected": -104.9891586303711, - "loss": 0.6603, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.40270328521728516, - "rewards/margins": 0.08724579960107803, - "rewards/rejected": -0.48994913697242737, + "epoch": 0.2343211578221916, + "grad_norm": 2.3511853218078613, + "learning_rate": 9.991049792430679e-08, + "logits/chosen": -2.9954142570495605, + "logits/rejected": -2.9832215309143066, + "logps/chosen": -56.58509063720703, + "logps/rejected": -58.09856033325195, + "loss": 0.6899, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.013825833797454834, + "rewards/margins": 0.006910757161676884, + "rewards/rejected": -0.020736588165163994, "step": 1360 }, { - "epoch": 0.24, - "grad_norm": 7.719117110945399, - "learning_rate": 4.995063988336101e-07, - "logits/chosen": -2.6957902908325195, - "logits/rejected": -2.67728328704834, - "logps/chosen": -93.07683563232422, - "logps/rejected": -106.47142028808594, - "loss": 0.6415, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.4041716456413269, - "rewards/margins": 0.13314835727214813, - "rewards/rejected": -0.5373200178146362, + "epoch": 0.23604410751206065, + "grad_norm": 2.4769160747528076, + "learning_rate": 9.990127976672203e-08, + "logits/chosen": -3.1084232330322266, + "logits/rejected": -3.0888638496398926, + "logps/chosen": -53.93939208984375, + "logps/rejected": -55.24439239501953, + "loss": 0.6873, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.012645436450839043, + "rewards/margins": 0.011925434693694115, + "rewards/rejected": -0.024570871144533157, "step": 1370 }, { - "epoch": 0.24, - "grad_norm": 7.523503188478382, - "learning_rate": 4.994580517401102e-07, - "logits/chosen": -2.5843305587768555, - "logits/rejected": -2.56766414642334, - "logps/chosen": -97.11516571044922, - "logps/rejected": -107.70014953613281, - "loss": 0.6448, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -0.42785486578941345, - "rewards/margins": 0.12621551752090454, - "rewards/rejected": -0.5540703535079956, + "epoch": 0.2377670572019297, + "grad_norm": 2.2031283378601074, + "learning_rate": 9.989161034802205e-08, + "logits/chosen": -3.0005006790161133, + "logits/rejected": -2.9822466373443604, + "logps/chosen": -55.36827850341797, + "logps/rejected": -54.40727996826172, + "loss": 0.6881, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.010355006903409958, + "rewards/margins": 0.010543933138251305, + "rewards/rejected": -0.020898941904306412, "step": 1380 }, { - "epoch": 0.24, - "grad_norm": 7.994616924999172, - "learning_rate": 4.994074487782406e-07, - "logits/chosen": -2.7036585807800293, - "logits/rejected": -2.6787309646606445, - "logps/chosen": -103.4170913696289, - "logps/rejected": -113.5262222290039, - "loss": 0.6457, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.46558231115341187, - "rewards/margins": 0.1351221799850464, - "rewards/rejected": -0.6007044315338135, + "epoch": 0.23949000689179875, + "grad_norm": 2.3673853874206543, + "learning_rate": 9.988148975564812e-08, + "logits/chosen": -3.1491761207580566, + "logits/rejected": -3.1218996047973633, + "logps/chosen": -58.048797607421875, + "logps/rejected": -55.777122497558594, + "loss": 0.6875, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.011549661867320538, + "rewards/margins": 0.011656830087304115, + "rewards/rejected": -0.023206491023302078, "step": 1390 }, { - "epoch": 0.24, - "grad_norm": 7.076195209744298, - "learning_rate": 4.993545904056078e-07, - "logits/chosen": -2.5222525596618652, - "logits/rejected": -2.4996466636657715, - "logps/chosen": -100.6828384399414, - "logps/rejected": -113.02195739746094, - "loss": 0.6329, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.448734849691391, - "rewards/margins": 0.15365351736545563, - "rewards/rejected": -0.6023883819580078, + "epoch": 0.2412129565816678, + "grad_norm": 2.19177508354187, + "learning_rate": 9.987091808112155e-08, + "logits/chosen": -2.969104528427124, + "logits/rejected": -2.9434850215911865, + "logps/chosen": -57.14324951171875, + "logps/rejected": -55.39662551879883, + "loss": 0.6871, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.013344550505280495, + "rewards/margins": 0.012570117600262165, + "rewards/rejected": -0.025914669036865234, "step": 1400 }, { - "epoch": 0.24, - "eval_logits/chosen": -2.6804823875427246, - "eval_logits/rejected": -2.674381732940674, - "eval_logps/chosen": -96.39348602294922, - "eval_logps/rejected": -108.98139190673828, - "eval_loss": 0.6609914302825928, - "eval_rewards/accuracies": 0.61849445104599, - "eval_rewards/chosen": -0.37689635157585144, - "eval_rewards/margins": 0.08134372532367706, - "eval_rewards/rejected": -0.4582400619983673, - "eval_runtime": 356.0921, - "eval_samples_per_second": 12.087, - "eval_steps_per_second": 1.511, + "epoch": 0.2412129565816678, + "eval_logits/chosen": -3.135641574859619, + "eval_logits/rejected": -3.1300060749053955, + "eval_logps/chosen": -58.64906692504883, + "eval_logps/rejected": -63.489479064941406, + "eval_loss": 0.6913572549819946, + "eval_rewards/accuracies": 0.5785316228866577, + "eval_rewards/chosen": 0.000628301000688225, + "eval_rewards/margins": 0.0037218371871858835, + "eval_rewards/rejected": -0.0030935362447053194, + "eval_runtime": 358.6517, + "eval_samples_per_second": 12.001, + "eval_steps_per_second": 1.5, "step": 1400 }, { - "epoch": 0.24, - "grad_norm": 8.108597626289049, - "learning_rate": 4.992994771002141e-07, - "logits/chosen": -2.5680909156799316, - "logits/rejected": -2.5558865070343018, - "logps/chosen": -102.64105224609375, - "logps/rejected": -119.11705017089844, - "loss": 0.6367, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.48273658752441406, - "rewards/margins": 0.17660747468471527, - "rewards/rejected": -0.6593440175056458, + "epoch": 0.24293590627153688, + "grad_norm": 2.0600688457489014, + "learning_rate": 9.985989542004283e-08, + "logits/chosen": -3.032985210418701, + "logits/rejected": -3.02095365524292, + "logps/chosen": -55.83747100830078, + "logps/rejected": -55.733192443847656, + "loss": 0.688, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.014638135209679604, + "rewards/margins": 0.010690612718462944, + "rewards/rejected": -0.0253287460654974, "step": 1410 }, { - "epoch": 0.24, - "grad_norm": 7.858931662859935, - "learning_rate": 4.992421093604534e-07, - "logits/chosen": -2.4751877784729004, - "logits/rejected": -2.4777843952178955, - "logps/chosen": -101.93944549560547, - "logps/rejected": -124.42276763916016, - "loss": 0.6348, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.512130856513977, - "rewards/margins": 0.1852533519268036, - "rewards/rejected": -0.697384238243103, + "epoch": 0.24465885596140594, + "grad_norm": 2.159818649291992, + "learning_rate": 9.984842187209068e-08, + "logits/chosen": -2.950575590133667, + "logits/rejected": -2.956359624862671, + "logps/chosen": -52.466026306152344, + "logps/rejected": -56.8805046081543, + "loss": 0.6909, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.017118731513619423, + "rewards/margins": 0.004853106569498777, + "rewards/rejected": -0.021971840411424637, "step": 1420 }, { - "epoch": 0.25, - "grad_norm": 9.45227562873637, - "learning_rate": 4.991824877051067e-07, - "logits/chosen": -2.561638832092285, - "logits/rejected": -2.54856276512146, - "logps/chosen": -108.27215576171875, - "logps/rejected": -134.86288452148438, - "loss": 0.601, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.5470194816589355, - "rewards/margins": 0.2372448891401291, - "rewards/rejected": -0.784264326095581, + "epoch": 0.246381805651275, + "grad_norm": 2.4923534393310547, + "learning_rate": 9.983649754102133e-08, + "logits/chosen": -3.06890869140625, + "logits/rejected": -3.0572915077209473, + "logps/chosen": -54.95482635498047, + "logps/rejected": -59.163429260253906, + "loss": 0.6869, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.01370183564722538, + "rewards/margins": 0.013033677823841572, + "rewards/rejected": -0.026735514402389526, "step": 1430 }, { - "epoch": 0.25, - "grad_norm": 10.22819119060325, - "learning_rate": 4.991206126733369e-07, - "logits/chosen": -2.448366403579712, - "logits/rejected": -2.420719623565674, - "logps/chosen": -108.7235107421875, - "logps/rejected": -120.07295989990234, - "loss": 0.6473, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.5824471712112427, - "rewards/margins": 0.13498175144195557, - "rewards/rejected": -0.7174289226531982, + "epoch": 0.24810475534114404, + "grad_norm": 2.1912496089935303, + "learning_rate": 9.982412253466739e-08, + "logits/chosen": -2.995776653289795, + "logits/rejected": -2.964540958404541, + "logps/chosen": -52.209800720214844, + "logps/rejected": -51.3564338684082, + "loss": 0.6872, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.017473148182034492, + "rewards/margins": 0.012446993961930275, + "rewards/rejected": -0.029920142143964767, "step": 1440 }, { - "epoch": 0.25, - "grad_norm": 12.725503598146807, - "learning_rate": 4.990564848246851e-07, - "logits/chosen": -2.4409327507019043, - "logits/rejected": -2.408658504486084, - "logps/chosen": -114.9705810546875, - "logps/rejected": -128.18980407714844, - "loss": 0.6348, + "epoch": 0.2498277050310131, + "grad_norm": 2.883305788040161, + "learning_rate": 9.9811296964937e-08, + "logits/chosen": -3.008527994155884, + "logits/rejected": -2.971277952194214, + "logps/chosen": -56.74669647216797, + "logps/rejected": -54.7548713684082, + "loss": 0.6856, "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.5951242446899414, - "rewards/margins": 0.16781339049339294, - "rewards/rejected": -0.7629376649856567, + "rewards/chosen": -0.012715873308479786, + "rewards/margins": 0.015632428228855133, + "rewards/rejected": -0.028348300606012344, "step": 1450 }, { - "epoch": 0.25, - "grad_norm": 9.436937177439919, - "learning_rate": 4.98990104739064e-07, - "logits/chosen": -2.4494917392730713, - "logits/rejected": -2.4222424030303955, - "logps/chosen": -109.37040710449219, - "logps/rejected": -126.34139251708984, - "loss": 0.6382, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.5768964290618896, - "rewards/margins": 0.18195541203022003, - "rewards/rejected": -0.7588518261909485, + "epoch": 0.25155065472088217, + "grad_norm": 2.354778289794922, + "learning_rate": 9.97980209478128e-08, + "logits/chosen": -3.0111136436462402, + "logits/rejected": -2.9785988330841064, + "logps/chosen": -53.67363357543945, + "logps/rejected": -53.75532913208008, + "loss": 0.6869, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.019740106537938118, + "rewards/margins": 0.01316138543188572, + "rewards/rejected": -0.03290148824453354, "step": 1460 }, { - "epoch": 0.25, - "grad_norm": 8.955583438989118, - "learning_rate": 4.989214730167541e-07, - "logits/chosen": -2.622709035873413, - "logits/rejected": -2.5908420085906982, - "logps/chosen": -110.5583267211914, - "logps/rejected": -124.64057922363281, - "loss": 0.6329, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.535399854183197, - "rewards/margins": 0.16410811245441437, - "rewards/rejected": -0.6995079517364502, + "epoch": 0.2532736044107512, + "grad_norm": 2.4939029216766357, + "learning_rate": 9.97842946033508e-08, + "logits/chosen": -3.1756339073181152, + "logits/rejected": -3.1396703720092773, + "logps/chosen": -58.40922164916992, + "logps/rejected": -57.95037841796875, + "loss": 0.684, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.01385563611984253, + "rewards/margins": 0.01888427510857582, + "rewards/rejected": -0.03273991495370865, "step": 1470 }, { - "epoch": 0.25, - "grad_norm": 9.49330055426053, - "learning_rate": 4.988505902783971e-07, - "logits/chosen": -2.590567111968994, - "logits/rejected": -2.556976795196533, - "logps/chosen": -105.81478118896484, - "logps/rejected": -119.9277114868164, - "loss": 0.6432, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.5450218915939331, - "rewards/margins": 0.13719643652439117, - "rewards/rejected": -0.6822183728218079, + "epoch": 0.2549965541006203, + "grad_norm": 2.315673351287842, + "learning_rate": 9.977011805567941e-08, + "logits/chosen": -3.1426024436950684, + "logits/rejected": -3.1065621376037598, + "logps/chosen": -53.217979431152344, + "logps/rejected": -54.60027313232422, + "loss": 0.6886, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.01907247118651867, + "rewards/margins": 0.00952645018696785, + "rewards/rejected": -0.028598923236131668, "step": 1480 }, { - "epoch": 0.26, - "grad_norm": 9.98525040448124, - "learning_rate": 4.987774571649912e-07, - "logits/chosen": -2.4983878135681152, - "logits/rejected": -2.4753427505493164, - "logps/chosen": -115.1557846069336, - "logps/rejected": -128.9449005126953, - "loss": 0.6331, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.5778855085372925, - "rewards/margins": 0.17497901618480682, - "rewards/rejected": -0.7528645992279053, + "epoch": 0.2567195037904893, + "grad_norm": 2.4700214862823486, + "learning_rate": 9.975549143299824e-08, + "logits/chosen": -3.0736770629882812, + "logits/rejected": -3.0491397380828857, + "logps/chosen": -59.33967971801758, + "logps/rejected": -56.70935821533203, + "loss": 0.688, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.01955995336174965, + "rewards/margins": 0.010835406370460987, + "rewards/rejected": -0.030395355075597763, "step": 1490 }, { - "epoch": 0.26, - "grad_norm": 10.26303189770449, - "learning_rate": 4.987020743378848e-07, - "logits/chosen": -2.385967969894409, - "logits/rejected": -2.383463144302368, - "logps/chosen": -110.9022445678711, - "logps/rejected": -130.71217346191406, - "loss": 0.6356, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.609332263469696, - "rewards/margins": 0.1715681552886963, - "rewards/rejected": -0.7809004187583923, + "epoch": 0.2584424534803584, + "grad_norm": 2.4240033626556396, + "learning_rate": 9.974041486757696e-08, + "logits/chosen": -2.9827020168304443, + "logits/rejected": -2.9801254272460938, + "logps/chosen": -52.06401824951172, + "logps/rejected": -56.11759567260742, + "loss": 0.6866, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.020753109827637672, + "rewards/margins": 0.013757812790572643, + "rewards/rejected": -0.03451092168688774, "step": 1500 }, { - "epoch": 0.26, - "eval_logits/chosen": -2.517664909362793, - "eval_logits/rejected": -2.5109217166900635, - "eval_logps/chosen": -107.28179168701172, - "eval_logps/rejected": -122.3668212890625, - "eval_loss": 0.6536844968795776, - "eval_rewards/accuracies": 0.6380111575126648, - "eval_rewards/chosen": -0.48577937483787537, - "eval_rewards/margins": 0.10631493479013443, - "eval_rewards/rejected": -0.5920943021774292, - "eval_runtime": 356.9471, - "eval_samples_per_second": 12.058, - "eval_steps_per_second": 1.507, + "epoch": 0.2584424534803584, + "eval_logits/chosen": -3.130272150039673, + "eval_logits/rejected": -3.124645471572876, + "eval_logps/chosen": -58.866065979003906, + "eval_logps/rejected": -63.78525924682617, + "eval_loss": 0.6909964680671692, + "eval_rewards/accuracies": 0.5750464797019958, + "eval_rewards/chosen": -0.001541697303764522, + "eval_rewards/margins": 0.0045096841640770435, + "eval_rewards/rejected": -0.006051382049918175, + "eval_runtime": 358.407, + "eval_samples_per_second": 12.009, + "eval_steps_per_second": 1.501, "step": 1500 }, { - "epoch": 0.26, - "grad_norm": 9.106501586861713, - "learning_rate": 4.986244424787706e-07, - "logits/chosen": -2.322202205657959, - "logits/rejected": -2.2912230491638184, - "logps/chosen": -118.87747955322266, - "logps/rejected": -133.88900756835938, - "loss": 0.6146, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.6029187440872192, - "rewards/margins": 0.2079295665025711, - "rewards/rejected": -0.8108483552932739, + "epoch": 0.2601654031702274, + "grad_norm": 2.4855523109436035, + "learning_rate": 9.972488849575411e-08, + "logits/chosen": -2.933039903640747, + "logits/rejected": -2.8975093364715576, + "logps/chosen": -60.27305221557617, + "logps/rejected": -56.31962203979492, + "loss": 0.6844, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.01667121797800064, + "rewards/margins": 0.018259279429912567, + "rewards/rejected": -0.034930501133203506, "step": 1510 }, { - "epoch": 0.26, - "grad_norm": 10.760728694678805, - "learning_rate": 4.985445622896794e-07, - "logits/chosen": -2.387296676635742, - "logits/rejected": -2.379225015640259, - "logps/chosen": -118.9466781616211, - "logps/rejected": -134.18948364257812, - "loss": 0.6424, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.6671319007873535, - "rewards/margins": 0.16024193167686462, - "rewards/rejected": -0.8273738026618958, + "epoch": 0.2618883528600965, + "grad_norm": 2.682542085647583, + "learning_rate": 9.970891245793588e-08, + "logits/chosen": -3.045544385910034, + "logits/rejected": -3.0375313758850098, + "logps/chosen": -54.4551887512207, + "logps/rejected": -54.225868225097656, + "loss": 0.6908, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.022045383229851723, + "rewards/margins": 0.0052342889830470085, + "rewards/rejected": -0.027279671281576157, "step": 1520 }, { - "epoch": 0.26, - "grad_norm": 15.020346958333217, - "learning_rate": 4.98462434492974e-07, - "logits/chosen": -2.2380728721618652, - "logits/rejected": -2.2234339714050293, - "logps/chosen": -127.2374038696289, - "logps/rejected": -143.69729614257812, - "loss": 0.6451, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -0.7556307315826416, - "rewards/margins": 0.1741071343421936, - "rewards/rejected": -0.9297378659248352, + "epoch": 0.26361130254996556, + "grad_norm": 2.655073881149292, + "learning_rate": 9.96924868985948e-08, + "logits/chosen": -2.9312405586242676, + "logits/rejected": -2.9148247241973877, + "logps/chosen": -53.90313720703125, + "logps/rejected": -54.13452911376953, + "loss": 0.6877, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.02214507758617401, + "rewards/margins": 0.011658025905489922, + "rewards/rejected": -0.03380310535430908, "step": 1530 }, { - "epoch": 0.27, - "grad_norm": 10.57370425758323, - "learning_rate": 4.983780598313423e-07, - "logits/chosen": -2.3825461864471436, - "logits/rejected": -2.3513596057891846, - "logps/chosen": -120.11723327636719, - "logps/rejected": -140.9315185546875, - "loss": 0.6034, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.6420670747756958, - "rewards/margins": 0.2574729919433594, - "rewards/rejected": -0.8995401263237, + "epoch": 0.2653342522398346, + "grad_norm": 2.3471813201904297, + "learning_rate": 9.967561196626846e-08, + "logits/chosen": -3.059717893600464, + "logits/rejected": -3.0278360843658447, + "logps/chosen": -58.14380645751953, + "logps/rejected": -54.910728454589844, + "loss": 0.6852, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.022383395582437515, + "rewards/margins": 0.016689913347363472, + "rewards/rejected": -0.03907330706715584, "step": 1540 }, { - "epoch": 0.27, - "grad_norm": 11.337001973825672, - "learning_rate": 4.982914390677909e-07, - "logits/chosen": -2.2892661094665527, - "logits/rejected": -2.2704126834869385, - "logps/chosen": -114.27877044677734, - "logps/rejected": -134.2589111328125, - "loss": 0.6133, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.6090589165687561, - "rewards/margins": 0.2282298356294632, - "rewards/rejected": -0.8372887372970581, + "epoch": 0.26705720192970367, + "grad_norm": 2.238987684249878, + "learning_rate": 9.965828781355818e-08, + "logits/chosen": -2.978020668029785, + "logits/rejected": -2.9583582878112793, + "logps/chosen": -55.6759033203125, + "logps/rejected": -54.38631057739258, + "loss": 0.6858, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.02306569740176201, + "rewards/margins": 0.015391230583190918, + "rewards/rejected": -0.03845692425966263, "step": 1550 }, { - "epoch": 0.27, - "grad_norm": 13.598814466918178, - "learning_rate": 4.982025729856381e-07, - "logits/chosen": -2.273789882659912, - "logits/rejected": -2.252927780151367, - "logps/chosen": -123.75514221191406, - "logps/rejected": -144.1699676513672, - "loss": 0.6334, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.7292818427085876, - "rewards/margins": 0.20613010227680206, - "rewards/rejected": -0.935411810874939, + "epoch": 0.2687801516195727, + "grad_norm": 2.3482720851898193, + "learning_rate": 9.964051459712762e-08, + "logits/chosen": -2.991518974304199, + "logits/rejected": -2.974125385284424, + "logps/chosen": -53.505340576171875, + "logps/rejected": -54.268577575683594, + "loss": 0.6887, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.026575718075037003, + "rewards/margins": 0.00950780138373375, + "rewards/rejected": -0.03608352318406105, "step": 1560 }, { - "epoch": 0.27, - "grad_norm": 13.054000500346573, - "learning_rate": 4.981114623885066e-07, - "logits/chosen": -2.305429697036743, - "logits/rejected": -2.304576873779297, - "logps/chosen": -123.65141296386719, - "logps/rejected": -149.49166870117188, - "loss": 0.6306, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.7176939845085144, - "rewards/margins": 0.21510076522827148, - "rewards/rejected": -0.9327947497367859, + "epoch": 0.2705031013094418, + "grad_norm": 2.3481009006500244, + "learning_rate": 9.962229247770133e-08, + "logits/chosen": -3.042026996612549, + "logits/rejected": -3.04404878616333, + "logps/chosen": -54.13409423828125, + "logps/rejected": -59.780250549316406, + "loss": 0.6868, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.02218012697994709, + "rewards/margins": 0.0133208017796278, + "rewards/rejected": -0.03550093248486519, "step": 1570 }, { - "epoch": 0.27, - "grad_norm": 11.391373748499909, - "learning_rate": 4.980181081003167e-07, - "logits/chosen": -2.2610230445861816, - "logits/rejected": -2.248826265335083, - "logps/chosen": -120.3755874633789, - "logps/rejected": -140.72938537597656, - "loss": 0.6399, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.6746999621391296, - "rewards/margins": 0.19383227825164795, - "rewards/rejected": -0.8685322999954224, + "epoch": 0.2722260509993108, + "grad_norm": 2.7631266117095947, + "learning_rate": 9.960362162006333e-08, + "logits/chosen": -2.977109909057617, + "logits/rejected": -2.96714448928833, + "logps/chosen": -55.50933837890625, + "logps/rejected": -57.580101013183594, + "loss": 0.688, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.02594080939888954, + "rewards/margins": 0.010953729972243309, + "rewards/rejected": -0.0368945375084877, "step": 1580 }, { - "epoch": 0.27, - "grad_norm": 11.558255813156592, - "learning_rate": 4.979225109652783e-07, - "logits/chosen": -2.317185878753662, - "logits/rejected": -2.3010520935058594, - "logps/chosen": -120.50982666015625, - "logps/rejected": -136.13148498535156, - "loss": 0.6499, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.6638567447662354, - "rewards/margins": 0.15114296972751617, - "rewards/rejected": -0.8149996995925903, + "epoch": 0.2739490006891799, + "grad_norm": 2.5365638732910156, + "learning_rate": 9.958450219305565e-08, + "logits/chosen": -3.022242546081543, + "logits/rejected": -3.007664918899536, + "logps/chosen": -56.24147415161133, + "logps/rejected": -57.927711486816406, + "loss": 0.6876, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.02111046575009823, + "rewards/margins": 0.011745044030249119, + "rewards/rejected": -0.03285551071166992, "step": 1590 }, { - "epoch": 0.28, - "grad_norm": 10.010530526041645, - "learning_rate": 4.978246718478835e-07, - "logits/chosen": -2.298884630203247, - "logits/rejected": -2.2639718055725098, - "logps/chosen": -114.62701416015625, - "logps/rejected": -132.1314239501953, - "loss": 0.6275, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.6248496770858765, - "rewards/margins": 0.18698535859584808, - "rewards/rejected": -0.8118351101875305, + "epoch": 0.27567195037904896, + "grad_norm": 2.4298782348632812, + "learning_rate": 9.956493436957672e-08, + "logits/chosen": -3.0135552883148193, + "logits/rejected": -2.9755609035491943, + "logps/chosen": -54.7454833984375, + "logps/rejected": -54.69611358642578, + "loss": 0.6876, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.025662336498498917, + "rewards/margins": 0.011798778548836708, + "rewards/rejected": -0.037461116909980774, "step": 1600 }, { - "epoch": 0.28, - "eval_logits/chosen": -2.417057991027832, - "eval_logits/rejected": -2.4086451530456543, - "eval_logps/chosen": -116.99667358398438, - "eval_logps/rejected": -135.21180725097656, - "eval_loss": 0.6452447175979614, - "eval_rewards/accuracies": 0.6363847851753235, - "eval_rewards/chosen": -0.5829283595085144, - "eval_rewards/margins": 0.13761593401432037, - "eval_rewards/rejected": -0.7205442786216736, - "eval_runtime": 357.5084, - "eval_samples_per_second": 12.039, - "eval_steps_per_second": 1.505, + "epoch": 0.27567195037904896, + "eval_logits/chosen": -3.124072790145874, + "eval_logits/rejected": -3.118478775024414, + "eval_logps/chosen": -59.09284591674805, + "eval_logps/rejected": -64.08631134033203, + "eval_loss": 0.6906586289405823, + "eval_rewards/accuracies": 0.5873606204986572, + "eval_rewards/chosen": -0.0038095172494649887, + "eval_rewards/margins": 0.005252342205494642, + "eval_rewards/rejected": -0.009061858989298344, + "eval_runtime": 358.6329, + "eval_samples_per_second": 12.001, + "eval_steps_per_second": 1.5, "step": 1600 }, { - "epoch": 0.28, - "grad_norm": 10.584365804460724, - "learning_rate": 4.977245916328994e-07, - "logits/chosen": -2.3447985649108887, - "logits/rejected": -2.3194570541381836, - "logps/chosen": -130.62841796875, - "logps/rejected": -153.30650329589844, - "loss": 0.6354, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.7475208044052124, - "rewards/margins": 0.20843036472797394, - "rewards/rejected": -0.9559510946273804, + "epoch": 0.277394900068918, + "grad_norm": 2.2438275814056396, + "learning_rate": 9.954491832657987e-08, + "logits/chosen": -3.0523838996887207, + "logits/rejected": -3.0273146629333496, + "logps/chosen": -58.795616149902344, + "logps/rejected": -61.6534309387207, + "loss": 0.6883, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.029015138745307922, + "rewards/margins": 0.010533750057220459, + "rewards/rejected": -0.03954889252781868, "step": 1610 }, { - "epoch": 0.28, - "grad_norm": 12.935555369642161, - "learning_rate": 4.976222712253587e-07, - "logits/chosen": -2.2747273445129395, - "logits/rejected": -2.251038074493408, - "logps/chosen": -124.53253173828125, - "logps/rejected": -160.12515258789062, - "loss": 0.6036, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.7311369180679321, - "rewards/margins": 0.3294447958469391, - "rewards/rejected": -1.0605818033218384, + "epoch": 0.27911784975878706, + "grad_norm": 2.277097225189209, + "learning_rate": 9.952445424507174e-08, + "logits/chosen": -2.974520206451416, + "logits/rejected": -2.9524998664855957, + "logps/chosen": -54.341697692871094, + "logps/rejected": -58.2484016418457, + "loss": 0.6872, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.029016416519880295, + "rewards/margins": 0.012814966030418873, + "rewards/rejected": -0.04183139279484749, "step": 1620 }, { - "epoch": 0.28, - "grad_norm": 12.670402097997451, - "learning_rate": 4.97517711550553e-07, - "logits/chosen": -2.334963083267212, - "logits/rejected": -2.3116507530212402, - "logps/chosen": -132.87025451660156, - "logps/rejected": -149.6600799560547, - "loss": 0.6289, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.7737977504730225, - "rewards/margins": 0.2057873010635376, - "rewards/rejected": -0.9795848727226257, + "epoch": 0.2808407994486561, + "grad_norm": 2.4578676223754883, + "learning_rate": 9.950354231011059e-08, + "logits/chosen": -3.0443010330200195, + "logits/rejected": -3.0180282592773438, + "logps/chosen": -57.86555099487305, + "logps/rejected": -55.727874755859375, + "loss": 0.6854, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.023557905107736588, + "rewards/margins": 0.016457753255963326, + "rewards/rejected": -0.04001566022634506, "step": 1630 }, { - "epoch": 0.28, - "grad_norm": 12.932831879229832, - "learning_rate": 4.974109135540232e-07, - "logits/chosen": -2.379924774169922, - "logits/rejected": -2.3459315299987793, - "logps/chosen": -137.20947265625, - "logps/rejected": -144.1864013671875, - "loss": 0.6681, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.803007960319519, - "rewards/margins": 0.1256239265203476, - "rewards/rejected": -0.9286319017410278, + "epoch": 0.28256374913852517, + "grad_norm": 2.5449278354644775, + "learning_rate": 9.948218271080464e-08, + "logits/chosen": -3.085779905319214, + "logits/rejected": -3.0488622188568115, + "logps/chosen": -59.664390563964844, + "logps/rejected": -55.85193634033203, + "loss": 0.6847, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.027100270614027977, + "rewards/margins": 0.017988886684179306, + "rewards/rejected": -0.045089155435562134, "step": 1640 }, { - "epoch": 0.28, - "grad_norm": 10.93104733261659, - "learning_rate": 4.97301878201552e-07, - "logits/chosen": -2.3800089359283447, - "logits/rejected": -2.353868246078491, - "logps/chosen": -118.6199951171875, - "logps/rejected": -142.3135223388672, - "loss": 0.6032, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.613827109336853, - "rewards/margins": 0.2595062851905823, - "rewards/rejected": -0.8733335733413696, + "epoch": 0.2842866988283942, + "grad_norm": 2.6558942794799805, + "learning_rate": 9.94603756403104e-08, + "logits/chosen": -3.0556998252868652, + "logits/rejected": -3.0283634662628174, + "logps/chosen": -59.260841369628906, + "logps/rejected": -58.779266357421875, + "loss": 0.6847, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.020244024693965912, + "rewards/margins": 0.017693202942609787, + "rewards/rejected": -0.0379372276365757, "step": 1650 }, { - "epoch": 0.29, - "grad_norm": 8.435075129874804, - "learning_rate": 4.971906064791545e-07, - "logits/chosen": -2.4072935581207275, - "logits/rejected": -2.3678243160247803, - "logps/chosen": -116.49778747558594, - "logps/rejected": -127.7277603149414, - "loss": 0.6444, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.6152034401893616, - "rewards/margins": 0.16506488621234894, - "rewards/rejected": -0.7802683115005493, + "epoch": 0.28600964851826327, + "grad_norm": 2.612154722213745, + "learning_rate": 9.943812129583088e-08, + "logits/chosen": -3.0901899337768555, + "logits/rejected": -3.0435256958007812, + "logps/chosen": -57.216285705566406, + "logps/rejected": -54.11063766479492, + "loss": 0.6829, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.022447334602475166, + "rewards/margins": 0.02150181494653225, + "rewards/rejected": -0.043949149549007416, "step": 1660 }, { - "epoch": 0.29, - "grad_norm": 10.473049120892489, - "learning_rate": 4.970770993930693e-07, - "logits/chosen": -2.3916454315185547, - "logits/rejected": -2.366729259490967, - "logps/chosen": -112.58506774902344, - "logps/rejected": -138.53781127929688, - "loss": 0.6101, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.5803548693656921, - "rewards/margins": 0.25304341316223145, - "rewards/rejected": -0.8333982229232788, + "epoch": 0.2877325982081323, + "grad_norm": 2.6352086067199707, + "learning_rate": 9.941541987861386e-08, + "logits/chosen": -3.0773003101348877, + "logits/rejected": -3.055241584777832, + "logps/chosen": -56.383026123046875, + "logps/rejected": -59.22760772705078, + "loss": 0.6827, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.01868412271142006, + "rewards/margins": 0.021537818014621735, + "rewards/rejected": -0.040221940726041794, "step": 1670 }, { - "epoch": 0.29, - "grad_norm": 11.92770345279179, - "learning_rate": 4.969613579697499e-07, - "logits/chosen": -2.329380989074707, - "logits/rejected": -2.303520679473877, - "logps/chosen": -119.73587799072266, - "logps/rejected": -142.68923950195312, - "loss": 0.6175, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.6415343284606934, - "rewards/margins": 0.24146585166454315, - "rewards/rejected": -0.8830000758171082, + "epoch": 0.2894555478980014, + "grad_norm": 2.385436534881592, + "learning_rate": 9.939227159394998e-08, + "logits/chosen": -3.0334177017211914, + "logits/rejected": -3.007333278656006, + "logps/chosen": -58.554542541503906, + "logps/rejected": -59.123863220214844, + "loss": 0.6848, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0294620580971241, + "rewards/margins": 0.01769358292222023, + "rewards/rejected": -0.04715564101934433, "step": 1680 }, { - "epoch": 0.29, - "grad_norm": 10.415977139141571, - "learning_rate": 4.968433832558549e-07, - "logits/chosen": -2.2939274311065674, - "logits/rejected": -2.2756576538085938, - "logps/chosen": -115.64154052734375, - "logps/rejected": -131.75743103027344, - "loss": 0.637, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.601462185382843, - "rewards/margins": 0.19359458982944489, - "rewards/rejected": -0.7950568199157715, + "epoch": 0.29117849758787046, + "grad_norm": 2.487203359603882, + "learning_rate": 9.936867665117098e-08, + "logits/chosen": -3.0129518508911133, + "logits/rejected": -2.9973418712615967, + "logps/chosen": -58.51457595825195, + "logps/rejected": -55.72222137451172, + "loss": 0.6911, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.029692823067307472, + "rewards/margins": 0.004795686341822147, + "rewards/rejected": -0.03448851406574249, "step": 1690 }, { - "epoch": 0.29, - "grad_norm": 12.213920640091061, - "learning_rate": 4.967231763182385e-07, - "logits/chosen": -2.169027805328369, - "logits/rejected": -2.16825795173645, - "logps/chosen": -112.47358703613281, - "logps/rejected": -138.60568237304688, - "loss": 0.6315, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.6292012333869934, - "rewards/margins": 0.2038397490978241, - "rewards/rejected": -0.8330410122871399, + "epoch": 0.2929014472777395, + "grad_norm": 2.6827774047851562, + "learning_rate": 9.93446352636477e-08, + "logits/chosen": -2.9218242168426514, + "logits/rejected": -2.926464557647705, + "logps/chosen": -52.7581901550293, + "logps/rejected": -59.58576202392578, + "loss": 0.6882, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.031951457262039185, + "rewards/margins": 0.010778420604765415, + "rewards/rejected": -0.04272987321019173, "step": 1700 }, { - "epoch": 0.29, - "eval_logits/chosen": -2.3369693756103516, - "eval_logits/rejected": -2.3275210857391357, - "eval_logps/chosen": -117.66336822509766, - "eval_logps/rejected": -136.6091766357422, - "eval_loss": 0.6433987021446228, - "eval_rewards/accuracies": 0.633596658706665, - "eval_rewards/chosen": -0.5895951986312866, - "eval_rewards/margins": 0.14492255449295044, - "eval_rewards/rejected": -0.7345177531242371, - "eval_runtime": 357.4789, - "eval_samples_per_second": 12.04, - "eval_steps_per_second": 1.505, + "epoch": 0.2929014472777395, + "eval_logits/chosen": -3.1172916889190674, + "eval_logits/rejected": -3.1116535663604736, + "eval_logps/chosen": -59.38001251220703, + "eval_logps/rejected": -64.44491577148438, + "eval_loss": 0.6903406977653503, + "eval_rewards/accuracies": 0.5850371718406677, + "eval_rewards/chosen": -0.006681189872324467, + "eval_rewards/margins": 0.005966781172901392, + "eval_rewards/rejected": -0.012647970579564571, + "eval_runtime": 358.5851, + "eval_samples_per_second": 12.003, + "eval_steps_per_second": 1.5, "step": 1700 }, { - "epoch": 0.29, - "grad_norm": 17.554636232799652, - "learning_rate": 4.966007382439414e-07, - "logits/chosen": -2.2377054691314697, - "logits/rejected": -2.196046829223633, - "logps/chosen": -134.55264282226562, - "logps/rejected": -155.48165893554688, - "loss": 0.6201, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.761473536491394, - "rewards/margins": 0.24851946532726288, - "rewards/rejected": -1.0099929571151733, + "epoch": 0.29462439696760856, + "grad_norm": 2.7968385219573975, + "learning_rate": 9.932014764878828e-08, + "logits/chosen": -3.0532126426696777, + "logits/rejected": -3.0127451419830322, + "logps/chosen": -61.0193977355957, + "logps/rejected": -58.547027587890625, + "loss": 0.6863, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.026096154004335403, + "rewards/margins": 0.014539210125803947, + "rewards/rejected": -0.0406353622674942, "step": 1710 }, { - "epoch": 0.3, - "grad_norm": 12.864744504235462, - "learning_rate": 4.964760701401807e-07, - "logits/chosen": -2.2469406127929688, - "logits/rejected": -2.2177302837371826, - "logps/chosen": -136.5878143310547, - "logps/rejected": -150.06259155273438, - "loss": 0.6428, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.801825225353241, - "rewards/margins": 0.18852399289608002, - "rewards/rejected": -0.9903491735458374, + "epoch": 0.2963473466574776, + "grad_norm": 2.369809865951538, + "learning_rate": 9.929521402803614e-08, + "logits/chosen": -3.06132173538208, + "logits/rejected": -3.0323057174682617, + "logps/chosen": -59.400787353515625, + "logps/rejected": -55.1656608581543, + "loss": 0.6879, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.029749011620879173, + "rewards/margins": 0.011349111795425415, + "rewards/rejected": -0.04109812527894974, "step": 1720 }, { - "epoch": 0.3, - "grad_norm": 15.115706747879441, - "learning_rate": 4.963491731343395e-07, - "logits/chosen": -2.2426817417144775, - "logits/rejected": -2.225494146347046, - "logps/chosen": -133.5530548095703, - "logps/rejected": -154.15370178222656, - "loss": 0.629, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.7963297367095947, - "rewards/margins": 0.2201063185930252, - "rewards/rejected": -1.016435980796814, + "epoch": 0.29807029634734666, + "grad_norm": 2.2750089168548584, + "learning_rate": 9.92698346268679e-08, + "logits/chosen": -3.0554797649383545, + "logits/rejected": -3.0384256839752197, + "logps/chosen": -57.126609802246094, + "logps/rejected": -56.65547561645508, + "loss": 0.6888, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.03192286565899849, + "rewards/margins": 0.009574519470334053, + "rewards/rejected": -0.04149738699197769, "step": 1730 }, { - "epoch": 0.3, - "grad_norm": 10.577013192942609, - "learning_rate": 4.962200483739572e-07, - "logits/chosen": -2.205991268157959, - "logits/rejected": -2.1916627883911133, - "logps/chosen": -137.78909301757812, - "logps/rejected": -165.40756225585938, - "loss": 0.6415, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.856925368309021, - "rewards/margins": 0.25689417123794556, - "rewards/rejected": -1.1138197183609009, + "epoch": 0.2997932460372157, + "grad_norm": 2.516836404800415, + "learning_rate": 9.924400967479145e-08, + "logits/chosen": -3.0047802925109863, + "logits/rejected": -2.994741678237915, + "logps/chosen": -55.8693733215332, + "logps/rejected": -58.82105255126953, + "loss": 0.6887, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.03768100589513779, + "rewards/margins": 0.009916347451508045, + "rewards/rejected": -0.04759735241532326, "step": 1740 }, { - "epoch": 0.3, - "grad_norm": 14.966499781106553, - "learning_rate": 4.96088697026719e-07, - "logits/chosen": -2.2428221702575684, - "logits/rejected": -2.2297050952911377, - "logps/chosen": -130.33145141601562, - "logps/rejected": -156.0831756591797, - "loss": 0.6123, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.7514396905899048, - "rewards/margins": 0.2490360289812088, - "rewards/rejected": -1.0004757642745972, + "epoch": 0.30151619572708477, + "grad_norm": 2.693798303604126, + "learning_rate": 9.921773940534381e-08, + "logits/chosen": -3.0474226474761963, + "logits/rejected": -3.0362508296966553, + "logps/chosen": -57.77741622924805, + "logps/rejected": -59.905982971191406, + "loss": 0.6871, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.025913890451192856, + "rewards/margins": 0.012879503890872002, + "rewards/rejected": -0.03879339620471001, "step": 1750 }, { - "epoch": 0.3, - "grad_norm": 14.761249007069992, - "learning_rate": 4.959551202804452e-07, - "logits/chosen": -2.2175586223602295, - "logits/rejected": -2.1803672313690186, - "logps/chosen": -129.0546417236328, - "logps/rejected": -156.67083740234375, - "loss": 0.5915, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.7302212119102478, - "rewards/margins": 0.3069346249103546, - "rewards/rejected": -1.0371558666229248, + "epoch": 0.30323914541695385, + "grad_norm": 2.5608084201812744, + "learning_rate": 9.919102405608905e-08, + "logits/chosen": -3.0343425273895264, + "logits/rejected": -2.998051404953003, + "logps/chosen": -58.855934143066406, + "logps/rejected": -58.174530029296875, + "loss": 0.6817, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.028327789157629013, + "rewards/margins": 0.023931993171572685, + "rewards/rejected": -0.05225978419184685, "step": 1760 }, { - "epoch": 0.3, - "grad_norm": 13.94600835463878, - "learning_rate": 4.958193193430807e-07, - "logits/chosen": -2.2072737216949463, - "logits/rejected": -2.1699469089508057, - "logps/chosen": -136.83575439453125, - "logps/rejected": -160.46617126464844, - "loss": 0.5962, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.8186962008476257, - "rewards/margins": 0.2965734004974365, - "rewards/rejected": -1.115269660949707, + "epoch": 0.3049620951068229, + "grad_norm": 2.1804721355438232, + "learning_rate": 9.916386386861613e-08, + "logits/chosen": -3.060925245285034, + "logits/rejected": -3.0244739055633545, + "logps/chosen": -58.5760498046875, + "logps/rejected": -54.62489700317383, + "loss": 0.6833, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.035943519324064255, + "rewards/margins": 0.020883310586214066, + "rewards/rejected": -0.05682682991027832, "step": 1770 }, { - "epoch": 0.31, - "grad_norm": 14.814181161859743, - "learning_rate": 4.956812954426837e-07, - "logits/chosen": -2.0803823471069336, - "logits/rejected": -2.0697758197784424, - "logps/chosen": -145.62057495117188, - "logps/rejected": -193.1177520751953, - "loss": 0.5567, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.9111822843551636, - "rewards/margins": 0.43550905585289, - "rewards/rejected": -1.3466914892196655, + "epoch": 0.30668504479669195, + "grad_norm": 2.838467836380005, + "learning_rate": 9.913625908853674e-08, + "logits/chosen": -2.976637363433838, + "logits/rejected": -2.978466033935547, + "logps/chosen": -57.95159149169922, + "logps/rejected": -63.324363708496094, + "loss": 0.6865, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.034330397844314575, + "rewards/margins": 0.01428927667438984, + "rewards/rejected": -0.048619672656059265, "step": 1780 }, { - "epoch": 0.31, - "grad_norm": 12.92886463604504, - "learning_rate": 4.95541049827415e-07, - "logits/chosen": -2.077265739440918, - "logits/rejected": -2.0515055656433105, - "logps/chosen": -154.9197998046875, - "logps/rejected": -191.34097290039062, - "loss": 0.5844, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.029468297958374, - "rewards/margins": 0.35887202620506287, - "rewards/rejected": -1.3883404731750488, + "epoch": 0.308407994486561, + "grad_norm": 2.678936719894409, + "learning_rate": 9.910820996548301e-08, + "logits/chosen": -3.1204793453216553, + "logits/rejected": -3.0981993675231934, + "logps/chosen": -55.150596618652344, + "logps/rejected": -57.8071174621582, + "loss": 0.6831, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.03155216947197914, + "rewards/margins": 0.02121361531317234, + "rewards/rejected": -0.05276578664779663, "step": 1790 }, { - "epoch": 0.31, - "grad_norm": 14.455065715370587, - "learning_rate": 4.953985837655266e-07, - "logits/chosen": -2.03164005279541, - "logits/rejected": -2.004000186920166, - "logps/chosen": -154.93240356445312, - "logps/rejected": -186.32785034179688, - "loss": 0.6166, + "epoch": 0.31013094417643006, + "grad_norm": 2.4490652084350586, + "learning_rate": 9.907971675310532e-08, + "logits/chosen": -3.0842478275299072, + "logits/rejected": -3.060288190841675, + "logps/chosen": -55.61701202392578, + "logps/rejected": -57.78422927856445, + "loss": 0.6838, "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.0314334630966187, - "rewards/margins": 0.31214088201522827, - "rewards/rejected": -1.3435744047164917, + "rewards/chosen": -0.0381484255194664, + "rewards/margins": 0.019785432144999504, + "rewards/rejected": -0.05793385952711105, "step": 1800 }, { - "epoch": 0.31, - "eval_logits/chosen": -2.099355936050415, - "eval_logits/rejected": -2.087456226348877, - "eval_logps/chosen": -137.8539276123047, - "eval_logps/rejected": -159.61842346191406, - "eval_loss": 0.6393665075302124, - "eval_rewards/accuracies": 0.6289498209953308, - "eval_rewards/chosen": -0.7915008664131165, - "eval_rewards/margins": 0.1731095165014267, - "eval_rewards/rejected": -0.9646103978157043, - "eval_runtime": 357.2087, - "eval_samples_per_second": 12.049, - "eval_steps_per_second": 1.506, + "epoch": 0.31013094417643006, + "eval_logits/chosen": -3.109452486038208, + "eval_logits/rejected": -3.103776216506958, + "eval_logps/chosen": -59.92013931274414, + "eval_logps/rejected": -65.07719421386719, + "eval_loss": 0.6899515390396118, + "eval_rewards/accuracies": 0.5824813842773438, + "eval_rewards/chosen": -0.012082410044968128, + "eval_rewards/margins": 0.006888336502015591, + "eval_rewards/rejected": -0.01897074468433857, + "eval_runtime": 358.2772, + "eval_samples_per_second": 12.013, + "eval_steps_per_second": 1.502, "step": 1800 }, { - "epoch": 0.31, - "grad_norm": 13.045686157199322, - "learning_rate": 4.952538985453499e-07, - "logits/chosen": -2.0923218727111816, - "logits/rejected": -2.058093547821045, - "logps/chosen": -148.35519409179688, - "logps/rejected": -166.8310089111328, - "loss": 0.6642, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.9175241589546204, - "rewards/margins": 0.21410436928272247, - "rewards/rejected": -1.131628394126892, + "epoch": 0.3118538938662991, + "grad_norm": 2.658210039138794, + "learning_rate": 9.905077970906998e-08, + "logits/chosen": -3.0863704681396484, + "logits/rejected": -3.0510458946228027, + "logps/chosen": -59.87849807739258, + "logps/rejected": -58.87512969970703, + "loss": 0.6841, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.032613541930913925, + "rewards/margins": 0.01923174224793911, + "rewards/rejected": -0.05184528976678848, "step": 1810 }, { - "epoch": 0.31, - "grad_norm": 14.746527696680772, - "learning_rate": 4.951069954752846e-07, - "logits/chosen": -2.104447841644287, - "logits/rejected": -2.0722100734710693, - "logps/chosen": -138.68417358398438, - "logps/rejected": -155.63540649414062, - "loss": 0.6363, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.8351799249649048, - "rewards/margins": 0.2144734412431717, - "rewards/rejected": -1.0496532917022705, + "epoch": 0.31357684355616816, + "grad_norm": 2.5303404331207275, + "learning_rate": 9.902139909505691e-08, + "logits/chosen": -3.056530475616455, + "logits/rejected": -3.0221753120422363, + "logps/chosen": -58.96949005126953, + "logps/rejected": -56.16400909423828, + "loss": 0.6854, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.03807400166988373, + "rewards/margins": 0.01674531027674675, + "rewards/rejected": -0.05481930822134018, "step": 1820 }, { - "epoch": 0.32, - "grad_norm": 14.068599458347373, - "learning_rate": 4.949578758837864e-07, - "logits/chosen": -2.0577917098999023, - "logits/rejected": -2.040351390838623, - "logps/chosen": -126.67193603515625, - "logps/rejected": -151.28140258789062, - "loss": 0.6184, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.711285412311554, - "rewards/margins": 0.254142701625824, - "rewards/rejected": -0.9654279947280884, + "epoch": 0.31529979324603724, + "grad_norm": 2.4892539978027344, + "learning_rate": 9.899157517675728e-08, + "logits/chosen": -2.952310085296631, + "logits/rejected": -2.938739061355591, + "logps/chosen": -59.15592575073242, + "logps/rejected": -59.31223678588867, + "loss": 0.6889, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.036126311868429184, + "rewards/margins": 0.009667845442891121, + "rewards/rejected": -0.045794155448675156, "step": 1830 }, { - "epoch": 0.32, - "grad_norm": 16.186612704580565, - "learning_rate": 4.948065411193554e-07, - "logits/chosen": -2.2264585494995117, - "logits/rejected": -2.2193264961242676, - "logps/chosen": -132.20895385742188, - "logps/rejected": -154.45802307128906, - "loss": 0.6388, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.7740861177444458, - "rewards/margins": 0.22031357884407043, - "rewards/rejected": -0.9943998456001282, + "epoch": 0.31702274293590627, + "grad_norm": 2.669368028640747, + "learning_rate": 9.896130822387107e-08, + "logits/chosen": -3.159759044647217, + "logits/rejected": -3.155785322189331, + "logps/chosen": -59.499298095703125, + "logps/rejected": -60.6772575378418, + "loss": 0.6889, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.04679185897111893, + "rewards/margins": 0.009518241509795189, + "rewards/rejected": -0.05631009861826897, "step": 1840 }, { - "epoch": 0.32, - "grad_norm": 13.631222489310312, - "learning_rate": 4.946529925505233e-07, - "logits/chosen": -2.104651927947998, - "logits/rejected": -2.099863052368164, - "logps/chosen": -124.88291931152344, - "logps/rejected": -150.3798828125, - "loss": 0.6244, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.7169033288955688, - "rewards/margins": 0.24011361598968506, - "rewards/rejected": -0.9570168256759644, + "epoch": 0.31874569262577535, + "grad_norm": 2.5876333713531494, + "learning_rate": 9.893059851010465e-08, + "logits/chosen": -3.025189161300659, + "logits/rejected": -3.0251142978668213, + "logps/chosen": -57.66413497924805, + "logps/rejected": -60.19395065307617, + "loss": 0.6886, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.04454426094889641, + "rewards/margins": 0.010319806635379791, + "rewards/rejected": -0.0548640713095665, "step": 1850 }, { - "epoch": 0.32, - "grad_norm": 11.597593527632558, - "learning_rate": 4.944972315658417e-07, - "logits/chosen": -2.038820505142212, - "logits/rejected": -2.0067200660705566, - "logps/chosen": -129.09518432617188, - "logps/rejected": -153.1046142578125, - "loss": 0.601, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.7181129455566406, - "rewards/margins": 0.27581366896629333, - "rewards/rejected": -0.9939267039299011, + "epoch": 0.32046864231564437, + "grad_norm": 2.732103109359741, + "learning_rate": 9.889944631316835e-08, + "logits/chosen": -2.9574267864227295, + "logits/rejected": -2.921621322631836, + "logps/chosen": -60.49852752685547, + "logps/rejected": -59.736732482910156, + "loss": 0.6798, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.032065801322460175, + "rewards/margins": 0.027947356924414635, + "rewards/rejected": -0.06001315638422966, "step": 1860 }, { - "epoch": 0.32, - "grad_norm": 14.814299163074143, - "learning_rate": 4.943392595738695e-07, - "logits/chosen": -2.0475425720214844, - "logits/rejected": -2.018345832824707, - "logps/chosen": -130.18441772460938, - "logps/rejected": -163.41127014160156, - "loss": 0.5883, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.7427932024002075, - "rewards/margins": 0.35372671484947205, - "rewards/rejected": -1.096519947052002, + "epoch": 0.32219159200551345, + "grad_norm": 3.014956474304199, + "learning_rate": 9.886785191477388e-08, + "logits/chosen": -3.003999710083008, + "logits/rejected": -2.9785971641540527, + "logps/chosen": -59.9724006652832, + "logps/rejected": -60.295265197753906, + "loss": 0.6814, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04022688791155815, + "rewards/margins": 0.024841442704200745, + "rewards/rejected": -0.06506834179162979, "step": 1870 }, { - "epoch": 0.32, - "grad_norm": 13.085964342637993, - "learning_rate": 4.941790780031591e-07, - "logits/chosen": -2.040121555328369, - "logits/rejected": -2.0052008628845215, - "logps/chosen": -139.4851531982422, - "logps/rejected": -169.9237518310547, - "loss": 0.5986, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.8679348230361938, - "rewards/margins": 0.3169488310813904, - "rewards/rejected": -1.184883713722229, + "epoch": 0.3239145416953825, + "grad_norm": 2.690962314605713, + "learning_rate": 9.883581560063181e-08, + "logits/chosen": -3.0487072467803955, + "logits/rejected": -3.011261224746704, + "logps/chosen": -56.3128662109375, + "logps/rejected": -57.95994186401367, + "loss": 0.6795, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.03597037121653557, + "rewards/margins": 0.028922447934746742, + "rewards/rejected": -0.06489281356334686, "step": 1880 }, { - "epoch": 0.33, - "grad_norm": 17.97087896244849, - "learning_rate": 4.94016688302245e-07, - "logits/chosen": -2.036181688308716, - "logits/rejected": -2.021777629852295, - "logps/chosen": -137.5182647705078, - "logps/rejected": -178.22129821777344, - "loss": 0.561, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.8445706367492676, - "rewards/margins": 0.39340075850486755, - "rewards/rejected": -1.237971544265747, + "epoch": 0.32563749138525155, + "grad_norm": 2.828683376312256, + "learning_rate": 9.8803337660449e-08, + "logits/chosen": -3.071624517440796, + "logits/rejected": -3.065598964691162, + "logps/chosen": -56.401588439941406, + "logps/rejected": -60.657203674316406, + "loss": 0.6796, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.0334382988512516, + "rewards/margins": 0.02865518070757389, + "rewards/rejected": -0.062093477696180344, "step": 1890 }, { - "epoch": 0.33, - "grad_norm": 15.473904552778107, - "learning_rate": 4.938520919396297e-07, - "logits/chosen": -1.9097896814346313, - "logits/rejected": -1.878089189529419, - "logps/chosen": -160.5067901611328, - "logps/rejected": -182.10073852539062, - "loss": 0.6238, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.030932068824768, - "rewards/margins": 0.26739898324012756, - "rewards/rejected": -1.2983310222625732, + "epoch": 0.32736044107512063, + "grad_norm": 2.8319833278656006, + "learning_rate": 9.877041838792595e-08, + "logits/chosen": -3.0038390159606934, + "logits/rejected": -2.9711012840270996, + "logps/chosen": -60.720428466796875, + "logps/rejected": -57.573692321777344, + "loss": 0.6836, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.032806314527988434, + "rewards/margins": 0.020312577486038208, + "rewards/rejected": -0.053118884563446045, "step": 1900 }, { - "epoch": 0.33, - "eval_logits/chosen": -1.9768445491790771, - "eval_logits/rejected": -1.964641809463501, - "eval_logps/chosen": -151.84054565429688, - "eval_logps/rejected": -174.0358428955078, - "eval_loss": 0.6393516659736633, - "eval_rewards/accuracies": 0.6280204653739929, - "eval_rewards/chosen": -0.9313669800758362, - "eval_rewards/margins": 0.1774175763130188, - "eval_rewards/rejected": -1.108784556388855, - "eval_runtime": 356.9465, - "eval_samples_per_second": 12.058, - "eval_steps_per_second": 1.507, + "epoch": 0.32736044107512063, + "eval_logits/chosen": -3.103686571121216, + "eval_logits/rejected": -3.0980401039123535, + "eval_logps/chosen": -60.28012466430664, + "eval_logps/rejected": -65.52766418457031, + "eval_loss": 0.6895469427108765, + "eval_rewards/accuracies": 0.5882899761199951, + "eval_rewards/chosen": -0.01568230614066124, + "eval_rewards/margins": 0.007793075405061245, + "eval_rewards/rejected": -0.02347538247704506, + "eval_runtime": 359.1258, + "eval_samples_per_second": 11.985, + "eval_steps_per_second": 1.498, "step": 1900 }, { - "epoch": 0.33, - "grad_norm": 18.295705836142915, - "learning_rate": 4.936852904037709e-07, - "logits/chosen": -1.8353763818740845, - "logits/rejected": -1.7998859882354736, - "logps/chosen": -162.01815795898438, - "logps/rejected": -199.1243438720703, - "loss": 0.5856, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.0756003856658936, - "rewards/margins": 0.3827807605266571, - "rewards/rejected": -1.458381175994873, + "epoch": 0.32908339076498966, + "grad_norm": 2.763634204864502, + "learning_rate": 9.87370580807542e-08, + "logits/chosen": -2.9044349193573, + "logits/rejected": -2.8699584007263184, + "logps/chosen": -58.88459014892578, + "logps/rejected": -59.89939498901367, + "loss": 0.6829, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.044168949127197266, + "rewards/margins": 0.02184966765344143, + "rewards/rejected": -0.06601861119270325, "step": 1910 }, { - "epoch": 0.33, - "grad_norm": 15.587066902188072, - "learning_rate": 4.935162852030678e-07, - "logits/chosen": -1.9671123027801514, - "logits/rejected": -1.9385311603546143, - "logps/chosen": -158.43060302734375, - "logps/rejected": -187.6251678466797, - "loss": 0.611, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.030988097190857, - "rewards/margins": 0.3086529076099396, - "rewards/rejected": -1.3396410942077637, + "epoch": 0.33080634045485874, + "grad_norm": 2.6262426376342773, + "learning_rate": 9.870325704061355e-08, + "logits/chosen": -3.0172269344329834, + "logits/rejected": -2.98966121673584, + "logps/chosen": -59.44633102416992, + "logps/rejected": -60.100379943847656, + "loss": 0.6823, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0407765693962574, + "rewards/margins": 0.02320735715329647, + "rewards/rejected": -0.06398393213748932, "step": 1920 }, { - "epoch": 0.33, - "grad_norm": 16.64428631174434, - "learning_rate": 4.933450778658472e-07, - "logits/chosen": -1.9721879959106445, - "logits/rejected": -1.9367185831069946, - "logps/chosen": -145.00579833984375, - "logps/rejected": -175.52078247070312, - "loss": 0.6052, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.9111844897270203, - "rewards/margins": 0.30568939447402954, - "rewards/rejected": -1.2168738842010498, + "epoch": 0.33252929014472776, + "grad_norm": 2.3864188194274902, + "learning_rate": 9.866901557316944e-08, + "logits/chosen": -2.964810609817505, + "logits/rejected": -2.9282431602478027, + "logps/chosen": -58.27106857299805, + "logps/rejected": -60.40107345581055, + "loss": 0.6829, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.043806519359350204, + "rewards/margins": 0.021700460463762283, + "rewards/rejected": -0.06550697982311249, "step": 1930 }, { - "epoch": 0.33, - "grad_norm": 13.008355304833884, - "learning_rate": 4.931716699403504e-07, - "logits/chosen": -2.0365664958953857, - "logits/rejected": -2.016010046005249, - "logps/chosen": -130.88787841796875, - "logps/rejected": -154.35255432128906, - "loss": 0.6209, + "epoch": 0.33425223983459684, + "grad_norm": 2.4197654724121094, + "learning_rate": 9.863433398807007e-08, + "logits/chosen": -2.9881813526153564, + "logits/rejected": -2.969853162765503, + "logps/chosen": -55.707054138183594, + "logps/rejected": -56.50288009643555, + "loss": 0.6852, "rewards/accuracies": 0.625, - "rewards/chosen": -0.7993988990783691, - "rewards/margins": 0.24416379630565643, - "rewards/rejected": -1.043562650680542, + "rewards/chosen": -0.047670621424913406, + "rewards/margins": 0.01711922325193882, + "rewards/rejected": -0.06478984653949738, "step": 1940 }, { - "epoch": 0.34, - "grad_norm": 11.221840915928341, - "learning_rate": 4.929960629947185e-07, - "logits/chosen": -2.021613597869873, - "logits/rejected": -2.012424945831299, - "logps/chosen": -137.0205535888672, - "logps/rejected": -171.9865264892578, - "loss": 0.6066, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.8452763557434082, - "rewards/margins": 0.3268183469772339, - "rewards/rejected": -1.172094702720642, + "epoch": 0.33597518952446587, + "grad_norm": 2.494558095932007, + "learning_rate": 9.85992125989437e-08, + "logits/chosen": -2.9932541847229004, + "logits/rejected": -2.991720676422119, + "logps/chosen": -57.3494987487793, + "logps/rejected": -61.66455078125, + "loss": 0.6837, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.048418622463941574, + "rewards/margins": 0.020452255383133888, + "rewards/rejected": -0.06887087225914001, "step": 1950 }, { - "epoch": 0.34, - "grad_norm": 12.459203609632565, - "learning_rate": 4.928182586169787e-07, - "logits/chosen": -2.0483787059783936, - "logits/rejected": -2.024353504180908, - "logps/chosen": -136.3292999267578, - "logps/rejected": -166.4497833251953, - "loss": 0.6003, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.8198174238204956, - "rewards/margins": 0.2998635768890381, - "rewards/rejected": -1.1196808815002441, + "epoch": 0.33769813921433495, + "grad_norm": 2.642951250076294, + "learning_rate": 9.856365172339574e-08, + "logits/chosen": -2.999870777130127, + "logits/rejected": -2.980663776397705, + "logps/chosen": -58.778953552246094, + "logps/rejected": -61.15327835083008, + "loss": 0.6827, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.04419439285993576, + "rewards/margins": 0.022369753569364548, + "rewards/rejected": -0.0665641501545906, "step": 1960 }, { - "epoch": 0.34, - "grad_norm": 16.3262276995175, - "learning_rate": 4.926382584150298e-07, - "logits/chosen": -2.052652359008789, - "logits/rejected": -2.0237042903900146, - "logps/chosen": -133.0852813720703, - "logps/rejected": -154.27850341796875, - "loss": 0.6179, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.7694526314735413, - "rewards/margins": 0.25520798563957214, - "rewards/rejected": -1.0246607065200806, + "epoch": 0.33942108890420397, + "grad_norm": 2.4793989658355713, + "learning_rate": 9.852765168300596e-08, + "logits/chosen": -3.043196201324463, + "logits/rejected": -3.0140252113342285, + "logps/chosen": -60.28436279296875, + "logps/rejected": -58.49019241333008, + "loss": 0.6814, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.04137754067778587, + "rewards/margins": 0.02506864070892334, + "rewards/rejected": -0.06644618511199951, "step": 1970 }, { - "epoch": 0.34, - "grad_norm": 13.309990683269428, - "learning_rate": 4.924560640166273e-07, - "logits/chosen": -1.9702781438827515, - "logits/rejected": -1.955529808998108, - "logps/chosen": -143.92767333984375, - "logps/rejected": -171.1623077392578, - "loss": 0.6143, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.8834748268127441, - "rewards/margins": 0.28803762793540955, - "rewards/rejected": -1.1715123653411865, + "epoch": 0.34114403859407305, + "grad_norm": 2.7259953022003174, + "learning_rate": 9.849121280332546e-08, + "logits/chosen": -2.9497714042663574, + "logits/rejected": -2.944230079650879, + "logps/chosen": -60.91106033325195, + "logps/rejected": -60.264892578125, + "loss": 0.6894, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.053232572972774506, + "rewards/margins": 0.009022532030940056, + "rewards/rejected": -0.06225510314106941, "step": 1980 }, { - "epoch": 0.34, - "grad_norm": 17.690808430077606, - "learning_rate": 4.922716770693691e-07, - "logits/chosen": -2.02256441116333, - "logits/rejected": -1.9881162643432617, - "logps/chosen": -148.55735778808594, - "logps/rejected": -185.8997344970703, - "loss": 0.5663, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.9775354266166687, - "rewards/margins": 0.3999343514442444, - "rewards/rejected": -1.377469778060913, + "epoch": 0.34286698828394213, + "grad_norm": 2.7751598358154297, + "learning_rate": 9.845433541387384e-08, + "logits/chosen": -3.076275587081909, + "logits/rejected": -3.041668653488159, + "logps/chosen": -55.9117431640625, + "logps/rejected": -56.07196044921875, + "loss": 0.6799, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.05077340081334114, + "rewards/margins": 0.02825034037232399, + "rewards/rejected": -0.07902374118566513, "step": 1990 }, { - "epoch": 0.34, - "grad_norm": 20.779887890208492, - "learning_rate": 4.920850992406809e-07, - "logits/chosen": -1.9081655740737915, - "logits/rejected": -1.9007370471954346, - "logps/chosen": -167.69796752929688, - "logps/rejected": -216.0054168701172, - "loss": 0.5824, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.1772109270095825, - "rewards/margins": 0.4069501757621765, - "rewards/rejected": -1.5841610431671143, + "epoch": 0.34458993797381116, + "grad_norm": 2.9599082469940186, + "learning_rate": 9.841701984813618e-08, + "logits/chosen": -2.9976062774658203, + "logits/rejected": -3.0039710998535156, + "logps/chosen": -55.9847412109375, + "logps/rejected": -65.41357421875, + "loss": 0.685, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.06023833900690079, + "rewards/margins": 0.01777346059679985, + "rewards/rejected": -0.07801179587841034, "step": 2000 }, { - "epoch": 0.34, - "eval_logits/chosen": -1.988376498222351, - "eval_logits/rejected": -1.9742034673690796, - "eval_logps/chosen": -156.2569122314453, - "eval_logps/rejected": -181.40647888183594, - "eval_loss": 0.6345042586326599, - "eval_rewards/accuracies": 0.6338289976119995, - "eval_rewards/chosen": -0.9755305647850037, - "eval_rewards/margins": 0.2069605439901352, - "eval_rewards/rejected": -1.1824910640716553, - "eval_runtime": 356.8317, - "eval_samples_per_second": 12.062, - "eval_steps_per_second": 1.508, + "epoch": 0.34458993797381116, + "eval_logits/chosen": -3.0961995124816895, + "eval_logits/rejected": -3.090507745742798, + "eval_logps/chosen": -60.984703063964844, + "eval_logps/rejected": -66.37020874023438, + "eval_loss": 0.6889471411705017, + "eval_rewards/accuracies": 0.589684009552002, + "eval_rewards/chosen": -0.022728124633431435, + "eval_rewards/margins": 0.009172691032290459, + "eval_rewards/rejected": -0.03190081566572189, + "eval_runtime": 358.5713, + "eval_samples_per_second": 12.003, + "eval_steps_per_second": 1.5, "step": 2000 }, { - "epoch": 0.35, - "grad_norm": 13.969050635341127, - "learning_rate": 4.918963322178002e-07, - "logits/chosen": -1.8815292119979858, - "logits/rejected": -1.8513492345809937, - "logps/chosen": -167.8777313232422, - "logps/rejected": -195.77195739746094, - "loss": 0.6169, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1228792667388916, - "rewards/margins": 0.3168966770172119, - "rewards/rejected": -1.439776062965393, + "epoch": 0.34631288766368024, + "grad_norm": 2.715341567993164, + "learning_rate": 9.837926644356002e-08, + "logits/chosen": -3.0026180744171143, + "logits/rejected": -2.9743618965148926, + "logps/chosen": -60.34886932373047, + "logps/rejected": -58.93947219848633, + "loss": 0.682, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.04747002199292183, + "rewards/margins": 0.02390061318874359, + "rewards/rejected": -0.07137063890695572, "step": 2010 }, { - "epoch": 0.35, - "grad_norm": 15.797574816697441, - "learning_rate": 4.917053777077616e-07, - "logits/chosen": -1.8998935222625732, - "logits/rejected": -1.8719685077667236, - "logps/chosen": -154.22711181640625, - "logps/rejected": -197.74649047851562, - "loss": 0.5839, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.0441197156906128, - "rewards/margins": 0.38671204447746277, - "rewards/rejected": -1.4308319091796875, + "epoch": 0.34803583735354926, + "grad_norm": 2.4323620796203613, + "learning_rate": 9.834107554155232e-08, + "logits/chosen": -2.990692615509033, + "logits/rejected": -2.968893051147461, + "logps/chosen": -55.50908279418945, + "logps/rejected": -63.061180114746094, + "loss": 0.6805, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.05679772049188614, + "rewards/margins": 0.02702813409268856, + "rewards/rejected": -0.08382586389780045, "step": 2020 }, { - "epoch": 0.35, - "grad_norm": 14.833429475519068, - "learning_rate": 4.915122374373815e-07, - "logits/chosen": -1.9642966985702515, - "logits/rejected": -1.9340064525604248, - "logps/chosen": -159.75148010253906, - "logps/rejected": -195.52828979492188, - "loss": 0.5832, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.0272386074066162, - "rewards/margins": 0.3762350082397461, - "rewards/rejected": -1.4034736156463623, + "epoch": 0.34975878704341834, + "grad_norm": 2.9917709827423096, + "learning_rate": 9.83024474874763e-08, + "logits/chosen": -3.07673716545105, + "logits/rejected": -3.0554261207580566, + "logps/chosen": -62.36286163330078, + "logps/rejected": -62.7202033996582, + "loss": 0.6828, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.053105879575014114, + "rewards/margins": 0.022281022742390633, + "rewards/rejected": -0.0753868967294693, "step": 2030 }, { - "epoch": 0.35, - "grad_norm": 14.865053760007001, - "learning_rate": 4.913169131532422e-07, - "logits/chosen": -1.820640206336975, - "logits/rejected": -1.7987966537475586, - "logps/chosen": -145.8691864013672, - "logps/rejected": -193.00489807128906, - "loss": 0.5621, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.9316226243972778, - "rewards/margins": 0.45906609296798706, - "rewards/rejected": -1.3906886577606201, + "epoch": 0.35148173673328736, + "grad_norm": 2.6420340538024902, + "learning_rate": 9.826338263064845e-08, + "logits/chosen": -2.9410736560821533, + "logits/rejected": -2.928380012512207, + "logps/chosen": -58.5155143737793, + "logps/rejected": -61.75719451904297, + "loss": 0.684, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.057995665818452835, + "rewards/margins": 0.020041372627019882, + "rewards/rejected": -0.07803703844547272, "step": 2040 }, { - "epoch": 0.35, - "grad_norm": 20.45821384576311, - "learning_rate": 4.911194066216765e-07, - "logits/chosen": -1.864013671875, - "logits/rejected": -1.8330237865447998, - "logps/chosen": -153.375732421875, - "logps/rejected": -193.52369689941406, - "loss": 0.5926, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.0217927694320679, - "rewards/margins": 0.36956191062927246, - "rewards/rejected": -1.3913547992706299, + "epoch": 0.35320468642315644, + "grad_norm": 2.810659170150757, + "learning_rate": 9.82238813243353e-08, + "logits/chosen": -3.056933641433716, + "logits/rejected": -3.0320918560028076, + "logps/chosen": -56.75238800048828, + "logps/rejected": -61.59584426879883, + "loss": 0.6856, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.05530368164181709, + "rewards/margins": 0.0165498498827219, + "rewards/rejected": -0.07185353338718414, "step": 2050 }, { - "epoch": 0.35, - "grad_norm": 15.765551507413845, - "learning_rate": 4.909197196287509e-07, - "logits/chosen": -1.8556013107299805, - "logits/rejected": -1.8133299350738525, - "logps/chosen": -158.96043395996094, - "logps/rejected": -183.34689331054688, - "loss": 0.6223, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.036510705947876, - "rewards/margins": 0.2811738848686218, - "rewards/rejected": -1.3176846504211426, + "epoch": 0.3549276361130255, + "grad_norm": 2.5844037532806396, + "learning_rate": 9.818394392575017e-08, + "logits/chosen": -3.044243574142456, + "logits/rejected": -3.0059947967529297, + "logps/chosen": -60.13054275512695, + "logps/rejected": -59.328285217285156, + "loss": 0.6793, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.04801074415445328, + "rewards/margins": 0.02944994531571865, + "rewards/rejected": -0.07746069133281708, "step": 2060 }, { - "epoch": 0.36, - "grad_norm": 16.728188759969367, - "learning_rate": 4.907178539802502e-07, - "logits/chosen": -1.8902781009674072, - "logits/rejected": -1.8563499450683594, - "logps/chosen": -154.345947265625, - "logps/rejected": -198.3380889892578, - "loss": 0.5807, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.9906846284866333, - "rewards/margins": 0.44513338804244995, - "rewards/rejected": -1.4358179569244385, + "epoch": 0.35665058580289455, + "grad_norm": 2.8128087520599365, + "learning_rate": 9.814357079605006e-08, + "logits/chosen": -3.0609805583953857, + "logits/rejected": -3.0380260944366455, + "logps/chosen": -60.5723876953125, + "logps/rejected": -62.3077392578125, + "loss": 0.6829, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.05279190093278885, + "rewards/margins": 0.022460918873548508, + "rewards/rejected": -0.07525281608104706, "step": 2070 }, { - "epoch": 0.36, - "grad_norm": 17.234432946942448, - "learning_rate": 4.905138115016614e-07, - "logits/chosen": -1.8345119953155518, - "logits/rejected": -1.7954838275909424, - "logps/chosen": -153.2652587890625, - "logps/rejected": -191.7286376953125, - "loss": 0.5862, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.0140693187713623, - "rewards/margins": 0.39727577567100525, - "rewards/rejected": -1.41134512424469, + "epoch": 0.35837353549276363, + "grad_norm": 2.812436580657959, + "learning_rate": 9.810276230033227e-08, + "logits/chosen": -3.0014538764953613, + "logits/rejected": -2.9721763134002686, + "logps/chosen": -58.136962890625, + "logps/rejected": -59.7253303527832, + "loss": 0.6799, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.06269152462482452, + "rewards/margins": 0.028253117576241493, + "rewards/rejected": -0.09094464033842087, "step": 2080 }, { - "epoch": 0.36, - "grad_norm": 17.27758613167349, - "learning_rate": 4.903075940381559e-07, - "logits/chosen": -1.8448431491851807, - "logits/rejected": -1.8325908184051514, - "logps/chosen": -147.30392456054688, - "logps/rejected": -176.21583557128906, - "loss": 0.6304, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.9291483163833618, - "rewards/margins": 0.2863280475139618, - "rewards/rejected": -1.2154762744903564, + "epoch": 0.36009648518263265, + "grad_norm": 2.542465925216675, + "learning_rate": 9.806151880763118e-08, + "logits/chosen": -3.0250954627990723, + "logits/rejected": -3.021667957305908, + "logps/chosen": -60.14142990112305, + "logps/rejected": -61.97650909423828, + "loss": 0.6861, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.05746666342020035, + "rewards/margins": 0.015712924301624298, + "rewards/rejected": -0.07317958772182465, "step": 2090 }, { - "epoch": 0.36, - "grad_norm": 13.545506652655037, - "learning_rate": 4.900992034545744e-07, - "logits/chosen": -1.8317134380340576, - "logits/rejected": -1.801325798034668, - "logps/chosen": -134.27406311035156, - "logps/rejected": -163.0066680908203, - "loss": 0.5895, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.7893965840339661, - "rewards/margins": 0.32822003960609436, - "rewards/rejected": -1.1176166534423828, + "epoch": 0.36181943487250173, + "grad_norm": 2.988396167755127, + "learning_rate": 9.801984069091486e-08, + "logits/chosen": -2.9737119674682617, + "logits/rejected": -2.9518253803253174, + "logps/chosen": -61.59429931640625, + "logps/rejected": -59.7376708984375, + "loss": 0.6828, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.06247835233807564, + "rewards/margins": 0.02240810915827751, + "rewards/rejected": -0.08488644659519196, "step": 2100 }, { - "epoch": 0.36, - "eval_logits/chosen": -1.95328950881958, - "eval_logits/rejected": -1.9400743246078491, - "eval_logps/chosen": -124.55523681640625, - "eval_logps/rejected": -143.9415740966797, - "eval_loss": 0.6449150443077087, - "eval_rewards/accuracies": 0.6338289976119995, - "eval_rewards/chosen": -0.6585139632225037, - "eval_rewards/margins": 0.14932793378829956, - "eval_rewards/rejected": -0.8078420162200928, - "eval_runtime": 356.7749, - "eval_samples_per_second": 12.064, - "eval_steps_per_second": 1.508, + "epoch": 0.36181943487250173, + "eval_logits/chosen": -3.089683771133423, + "eval_logits/rejected": -3.0839576721191406, + "eval_logps/chosen": -61.820899963378906, + "eval_logps/rejected": -67.35945892333984, + "eval_loss": 0.6882798671722412, + "eval_rewards/accuracies": 0.580622673034668, + "eval_rewards/chosen": -0.03109004907310009, + "eval_rewards/margins": 0.010703377425670624, + "eval_rewards/rejected": -0.041793424636125565, + "eval_runtime": 358.4509, + "eval_samples_per_second": 12.007, + "eval_steps_per_second": 1.501, "step": 2100 }, { - "epoch": 0.36, - "grad_norm": 20.3896857533217, - "learning_rate": 4.898886416354088e-07, - "logits/chosen": -1.8545172214508057, - "logits/rejected": -1.8379218578338623, - "logps/chosen": -136.37522888183594, - "logps/rejected": -184.2274932861328, - "loss": 0.5751, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.8746525049209595, - "rewards/margins": 0.41598910093307495, - "rewards/rejected": -1.2906416654586792, + "epoch": 0.36354238456237076, + "grad_norm": 2.661639451980591, + "learning_rate": 9.797772832708176e-08, + "logits/chosen": -3.0293941497802734, + "logits/rejected": -3.032090663909912, + "logps/chosen": -56.2503547668457, + "logps/rejected": -63.839683532714844, + "loss": 0.6877, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.0735074132680893, + "rewards/margins": 0.012995610944926739, + "rewards/rejected": -0.08650301396846771, "step": 2110 }, { - "epoch": 0.37, - "grad_norm": 19.966726504918753, - "learning_rate": 4.896759104847859e-07, - "logits/chosen": -1.6481273174285889, - "logits/rejected": -1.6026216745376587, - "logps/chosen": -143.7649688720703, - "logps/rejected": -195.6068115234375, - "loss": 0.549, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.90205317735672, - "rewards/margins": 0.5448054075241089, - "rewards/rejected": -1.4468586444854736, + "epoch": 0.36526533425223984, + "grad_norm": 2.7685952186584473, + "learning_rate": 9.793518209695718e-08, + "logits/chosen": -2.8939576148986816, + "logits/rejected": -2.8669164180755615, + "logps/chosen": -59.037567138671875, + "logps/rejected": -58.662879943847656, + "loss": 0.6827, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05461913347244263, + "rewards/margins": 0.022714272141456604, + "rewards/rejected": -0.07733340561389923, "step": 2120 }, { - "epoch": 0.37, - "grad_norm": 22.5331311441494, - "learning_rate": 4.8946101192645e-07, - "logits/chosen": -1.5899341106414795, - "logits/rejected": -1.5583152770996094, - "logps/chosen": -178.53843688964844, - "logps/rejected": -229.2800750732422, - "loss": 0.5623, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2409361600875854, - "rewards/margins": 0.5267373323440552, - "rewards/rejected": -1.7676734924316406, + "epoch": 0.3669882839421089, + "grad_norm": 2.3837106227874756, + "learning_rate": 9.789220238528999e-08, + "logits/chosen": -2.930190324783325, + "logits/rejected": -2.9117355346679688, + "logps/chosen": -60.96294403076172, + "logps/rejected": -61.99995803833008, + "loss": 0.6793, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06492619961500168, + "rewards/margins": 0.029704291373491287, + "rewards/rejected": -0.09463049471378326, "step": 2130 }, { - "epoch": 0.37, - "grad_norm": 19.39894132054709, - "learning_rate": 4.892439479037451e-07, - "logits/chosen": -1.6246334314346313, - "logits/rejected": -1.6016099452972412, - "logps/chosen": -173.16412353515625, - "logps/rejected": -215.16110229492188, - "loss": 0.6048, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.245426058769226, - "rewards/margins": 0.37089425325393677, - "rewards/rejected": -1.6163203716278076, + "epoch": 0.36871123363197794, + "grad_norm": 2.6930501461029053, + "learning_rate": 9.784878958074901e-08, + "logits/chosen": -2.90777325630188, + "logits/rejected": -2.8999273777008057, + "logps/chosen": -57.04816818237305, + "logps/rejected": -63.14056396484375, + "loss": 0.688, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.08392993360757828, + "rewards/margins": 0.011952447704970837, + "rewards/rejected": -0.09588237851858139, "step": 2140 }, { - "epoch": 0.37, - "grad_norm": 18.938179724571942, - "learning_rate": 4.89024720379598e-07, - "logits/chosen": -1.6600227355957031, - "logits/rejected": -1.6034603118896484, - "logps/chosen": -167.83346557617188, - "logps/rejected": -214.2350616455078, - "loss": 0.5413, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.1314681768417358, - "rewards/margins": 0.531185507774353, - "rewards/rejected": -1.6626536846160889, + "epoch": 0.370434183321847, + "grad_norm": 2.75247859954834, + "learning_rate": 9.780494407591959e-08, + "logits/chosen": -2.957963228225708, + "logits/rejected": -2.911945343017578, + "logps/chosen": -60.3175048828125, + "logps/rejected": -57.6784553527832, + "loss": 0.6741, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05628920719027519, + "rewards/margins": 0.040407292544841766, + "rewards/rejected": -0.09669648855924606, "step": 2150 }, { - "epoch": 0.37, - "grad_norm": 17.819138078463755, - "learning_rate": 4.888033313365001e-07, - "logits/chosen": -1.5937135219573975, - "logits/rejected": -1.5616223812103271, - "logps/chosen": -189.21583557128906, - "logps/rejected": -235.78518676757812, - "loss": 0.5612, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.344967007637024, - "rewards/margins": 0.5175682902336121, - "rewards/rejected": -1.8625354766845703, + "epoch": 0.37215713301171605, + "grad_norm": 2.7918150424957275, + "learning_rate": 9.776066626730002e-08, + "logits/chosen": -2.9251790046691895, + "logits/rejected": -2.9098739624023438, + "logps/chosen": -61.5231819152832, + "logps/rejected": -58.77111053466797, + "loss": 0.6817, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.06766150891780853, + "rewards/margins": 0.024511078372597694, + "rewards/rejected": -0.09217258542776108, "step": 2160 }, { - "epoch": 0.37, - "grad_norm": 28.899143582146355, - "learning_rate": 4.885797827764895e-07, - "logits/chosen": -1.6774377822875977, - "logits/rejected": -1.6321741342544556, - "logps/chosen": -193.11886596679688, - "logps/rejected": -255.240234375, - "loss": 0.543, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.3880548477172852, - "rewards/margins": 0.6497060656547546, - "rewards/rejected": -2.0377612113952637, + "epoch": 0.3738800827015851, + "grad_norm": 2.841006278991699, + "learning_rate": 9.77159565552979e-08, + "logits/chosen": -3.045158863067627, + "logits/rejected": -3.0296671390533447, + "logps/chosen": -60.80147171020508, + "logps/rejected": -60.424476623535156, + "loss": 0.6815, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.06470237672328949, + "rewards/margins": 0.02472129836678505, + "rewards/rejected": -0.08942367881536484, "step": 2170 }, { - "epoch": 0.38, - "grad_norm": 20.347133369564297, - "learning_rate": 4.88354076721133e-07, - "logits/chosen": -1.7743288278579712, - "logits/rejected": -1.7314989566802979, - "logps/chosen": -204.22975158691406, - "logps/rejected": -241.63369750976562, - "loss": 0.6385, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.4623057842254639, - "rewards/margins": 0.42681413888931274, - "rewards/rejected": -1.889120101928711, + "epoch": 0.37560303239145415, + "grad_norm": 2.9336609840393066, + "learning_rate": 9.76708153442266e-08, + "logits/chosen": -3.0933234691619873, + "logits/rejected": -3.05588436126709, + "logps/chosen": -64.88115692138672, + "logps/rejected": -63.36223220825195, + "loss": 0.6755, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.06857044249773026, + "rewards/margins": 0.03770887851715088, + "rewards/rejected": -0.10627932846546173, "step": 2180 }, { - "epoch": 0.38, - "grad_norm": 11.864063871740855, - "learning_rate": 4.88126215211508e-07, - "logits/chosen": -2.0105056762695312, - "logits/rejected": -1.9917558431625366, - "logps/chosen": -134.98178100585938, - "logps/rejected": -173.46408081054688, - "loss": 0.5908, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.8213205337524414, - "rewards/margins": 0.37510326504707336, - "rewards/rejected": -1.1964237689971924, + "epoch": 0.37732598208132323, + "grad_norm": 2.9289681911468506, + "learning_rate": 9.76252430423016e-08, + "logits/chosen": -3.1562750339508057, + "logits/rejected": -3.1501102447509766, + "logps/chosen": -60.25629806518555, + "logps/rejected": -62.5523567199707, + "loss": 0.6877, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0741446241736412, + "rewards/margins": 0.012826305814087391, + "rewards/rejected": -0.08697094023227692, "step": 2190 }, { - "epoch": 0.38, - "grad_norm": 12.665212840871442, - "learning_rate": 4.878962003081834e-07, - "logits/chosen": -1.8419253826141357, - "logits/rejected": -1.8090530633926392, - "logps/chosen": -125.55128479003906, - "logps/rejected": -168.30105590820312, - "loss": 0.5633, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.7324917912483215, - "rewards/margins": 0.4029674530029297, - "rewards/rejected": -1.1354591846466064, + "epoch": 0.37904893177119225, + "grad_norm": 2.7656748294830322, + "learning_rate": 9.75792400616367e-08, + "logits/chosen": -2.985726833343506, + "logits/rejected": -2.9629483222961426, + "logps/chosen": -58.70253372192383, + "logps/rejected": -65.15387725830078, + "loss": 0.6745, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.06393808126449585, + "rewards/margins": 0.03966350480914116, + "rewards/rejected": -0.1036015972495079, "step": 2200 }, { - "epoch": 0.38, - "eval_logits/chosen": -1.9549309015274048, - "eval_logits/rejected": -1.9415898323059082, - "eval_logps/chosen": -122.187744140625, - "eval_logps/rejected": -142.1007080078125, - "eval_loss": 0.6433526277542114, - "eval_rewards/accuracies": 0.6247676610946655, - "eval_rewards/chosen": -0.6348390579223633, - "eval_rewards/margins": 0.1545940786600113, - "eval_rewards/rejected": -0.7894331216812134, - "eval_runtime": 356.7846, - "eval_samples_per_second": 12.063, - "eval_steps_per_second": 1.508, + "epoch": 0.37904893177119225, + "eval_logits/chosen": -3.0810647010803223, + "eval_logits/rejected": -3.0753400325775146, + "eval_logps/chosen": -62.52730178833008, + "eval_logps/rejected": -68.22266387939453, + "eval_loss": 0.6875881552696228, + "eval_rewards/accuracies": 0.5882899761199951, + "eval_rewards/chosen": -0.03815402835607529, + "eval_rewards/margins": 0.01227136142551899, + "eval_rewards/rejected": -0.05042538791894913, + "eval_runtime": 358.5312, + "eval_samples_per_second": 12.005, + "eval_steps_per_second": 1.501, "step": 2200 }, { - "epoch": 0.38, - "grad_norm": 15.566693864294429, - "learning_rate": 4.87664034091202e-07, - "logits/chosen": -1.864985466003418, - "logits/rejected": -1.842546820640564, - "logps/chosen": -135.0320587158203, - "logps/rejected": -167.46334838867188, - "loss": 0.6055, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.8033797144889832, - "rewards/margins": 0.33440515398979187, - "rewards/rejected": -1.1377849578857422, + "epoch": 0.38077188146106133, + "grad_norm": 2.809079885482788, + "learning_rate": 9.75328068182404e-08, + "logits/chosen": -3.0429909229278564, + "logits/rejected": -3.0269718170166016, + "logps/chosen": -60.863067626953125, + "logps/rejected": -62.8907585144043, + "loss": 0.679, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.06136716529726982, + "rewards/margins": 0.03044435940682888, + "rewards/rejected": -0.09181152284145355, "step": 2210 }, { - "epoch": 0.38, - "grad_norm": 14.08070748751659, - "learning_rate": 4.874297186600607e-07, - "logits/chosen": -1.6942613124847412, - "logits/rejected": -1.6759631633758545, - "logps/chosen": -136.83392333984375, - "logps/rejected": -170.51962280273438, - "loss": 0.5989, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.8297648429870605, - "rewards/margins": 0.33708181977272034, - "rewards/rejected": -1.1668468713760376, + "epoch": 0.3824948311509304, + "grad_norm": 2.8651580810546875, + "learning_rate": 9.748594373201213e-08, + "logits/chosen": -2.8712568283081055, + "logits/rejected": -2.8639302253723145, + "logps/chosen": -60.365760803222656, + "logps/rejected": -62.60291290283203, + "loss": 0.6829, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.06487758457660675, + "rewards/margins": 0.022680306807160378, + "rewards/rejected": -0.08755789697170258, "step": 2220 }, { - "epoch": 0.38, - "grad_norm": 13.765120270646621, - "learning_rate": 4.871932561336917e-07, - "logits/chosen": -1.7974563837051392, - "logits/rejected": -1.7594830989837646, - "logps/chosen": -157.6973114013672, - "logps/rejected": -191.2778778076172, - "loss": 0.6047, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.0398889780044556, - "rewards/margins": 0.3657877445220947, - "rewards/rejected": -1.4056766033172607, + "epoch": 0.38421778084079944, + "grad_norm": 2.711296319961548, + "learning_rate": 9.743865122673835e-08, + "logits/chosen": -3.0264925956726074, + "logits/rejected": -2.9949655532836914, + "logps/chosen": -61.64265823364258, + "logps/rejected": -60.85839080810547, + "loss": 0.6834, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.07965074479579926, + "rewards/margins": 0.021513305604457855, + "rewards/rejected": -0.10116405785083771, "step": 2230 }, { - "epoch": 0.39, - "grad_norm": 14.959197166232116, - "learning_rate": 4.869546486504443e-07, - "logits/chosen": -1.7539308071136475, - "logits/rejected": -1.715118408203125, - "logps/chosen": -154.1725311279297, - "logps/rejected": -178.8619384765625, - "loss": 0.6287, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.9734878540039062, - "rewards/margins": 0.27934250235557556, - "rewards/rejected": -1.2528302669525146, + "epoch": 0.3859407305306685, + "grad_norm": 2.980560779571533, + "learning_rate": 9.739092973008886e-08, + "logits/chosen": -2.979952096939087, + "logits/rejected": -2.944349527359009, + "logps/chosen": -63.78528594970703, + "logps/rejected": -62.89183807373047, + "loss": 0.6826, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0697564110159874, + "rewards/margins": 0.02322663739323616, + "rewards/rejected": -0.09298305213451385, "step": 2240 }, { - "epoch": 0.39, - "grad_norm": 30.679025462387873, - "learning_rate": 4.867138983680639e-07, - "logits/chosen": -1.7157443761825562, - "logits/rejected": -1.6704628467559814, - "logps/chosen": -154.58035278320312, - "logps/rejected": -192.14291381835938, - "loss": 0.5929, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.9923057556152344, - "rewards/margins": 0.3957834541797638, - "rewards/rejected": -1.3880890607833862, + "epoch": 0.38766368022053754, + "grad_norm": 3.1012465953826904, + "learning_rate": 9.734277967361279e-08, + "logits/chosen": -2.9386608600616455, + "logits/rejected": -2.904419422149658, + "logps/chosen": -62.32648849487305, + "logps/rejected": -62.88580322265625, + "loss": 0.6814, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.06976626813411713, + "rewards/margins": 0.025638435035943985, + "rewards/rejected": -0.09540469944477081, "step": 2250 }, { - "epoch": 0.39, - "grad_norm": 13.975707619834106, - "learning_rate": 4.864710074636742e-07, - "logits/chosen": -1.6998507976531982, - "logits/rejected": -1.6631402969360352, - "logps/chosen": -162.00726318359375, - "logps/rejected": -190.08140563964844, - "loss": 0.6257, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.0117926597595215, - "rewards/margins": 0.3156191408634186, - "rewards/rejected": -1.3274118900299072, + "epoch": 0.3893866299104066, + "grad_norm": 2.673414945602417, + "learning_rate": 9.729420149273484e-08, + "logits/chosen": -2.896034002304077, + "logits/rejected": -2.870567798614502, + "logps/chosen": -68.32345581054688, + "logps/rejected": -67.14576721191406, + "loss": 0.6832, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.07515819370746613, + "rewards/margins": 0.022647675126791, + "rewards/rejected": -0.09780587255954742, "step": 2260 }, { - "epoch": 0.39, - "grad_norm": 16.6560589968763, - "learning_rate": 4.862259781337561e-07, - "logits/chosen": -1.7075884342193604, - "logits/rejected": -1.665636420249939, - "logps/chosen": -147.17477416992188, - "logps/rejected": -179.50704956054688, - "loss": 0.6146, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.9310780763626099, - "rewards/margins": 0.34703630208969116, - "rewards/rejected": -1.2781143188476562, + "epoch": 0.39110957960027565, + "grad_norm": 2.9458136558532715, + "learning_rate": 9.724519562675122e-08, + "logits/chosen": -2.880819797515869, + "logits/rejected": -2.8479409217834473, + "logps/chosen": -62.41205978393555, + "logps/rejected": -62.49220657348633, + "loss": 0.6823, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08350671082735062, + "rewards/margins": 0.024250809103250504, + "rewards/rejected": -0.10775750875473022, "step": 2270 }, { - "epoch": 0.39, - "grad_norm": 14.830061652144124, - "learning_rate": 4.859788125941288e-07, - "logits/chosen": -1.791953444480896, - "logits/rejected": -1.7653782367706299, - "logps/chosen": -126.0338134765625, - "logps/rejected": -163.85520935058594, - "loss": 0.5711, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.7260233163833618, - "rewards/margins": 0.3571609556674957, - "rewards/rejected": -1.0831841230392456, + "epoch": 0.3928325292901447, + "grad_norm": 2.828383445739746, + "learning_rate": 9.719576251882575e-08, + "logits/chosen": -2.9576196670532227, + "logits/rejected": -2.947373867034912, + "logps/chosen": -60.50732421875, + "logps/rejected": -64.59284210205078, + "loss": 0.684, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07063782960176468, + "rewards/margins": 0.019989144057035446, + "rewards/rejected": -0.09062696993350983, "step": 2280 }, { - "epoch": 0.39, - "grad_norm": 14.350187386244336, - "learning_rate": 4.857295130799293e-07, - "logits/chosen": -1.6346839666366577, - "logits/rejected": -1.5958842039108276, - "logps/chosen": -142.57481384277344, - "logps/rejected": -190.54708862304688, - "loss": 0.5495, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.8971936106681824, - "rewards/margins": 0.4708176255226135, - "rewards/rejected": -1.368011236190796, + "epoch": 0.3945554789800138, + "grad_norm": 2.8307809829711914, + "learning_rate": 9.714590261598585e-08, + "logits/chosen": -2.852996349334717, + "logits/rejected": -2.824765682220459, + "logps/chosen": -60.27251434326172, + "logps/rejected": -64.0623779296875, + "loss": 0.6795, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.07389631122350693, + "rewards/margins": 0.029124462977051735, + "rewards/rejected": -0.10302077233791351, "step": 2290 }, { - "epoch": 0.4, - "grad_norm": 21.919618404427325, - "learning_rate": 4.854780818455922e-07, - "logits/chosen": -1.7218765020370483, - "logits/rejected": -1.671383261680603, - "logps/chosen": -162.91741943359375, - "logps/rejected": -212.5228729248047, - "loss": 0.5459, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.0696805715560913, - "rewards/margins": 0.5191500782966614, - "rewards/rejected": -1.5888304710388184, + "epoch": 0.39627842866988283, + "grad_norm": 2.787592887878418, + "learning_rate": 9.709561636911845e-08, + "logits/chosen": -3.035175323486328, + "logits/rejected": -3.001577854156494, + "logps/chosen": -62.35862350463867, + "logps/rejected": -63.316009521484375, + "loss": 0.6781, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.06413549929857254, + "rewards/margins": 0.03246583789587021, + "rewards/rejected": -0.09660132229328156, "step": 2300 }, { - "epoch": 0.4, - "eval_logits/chosen": -1.7182925939559937, - "eval_logits/rejected": -1.699223279953003, - "eval_logps/chosen": -166.81613159179688, - "eval_logps/rejected": -196.93434143066406, - "eval_loss": 0.6319848895072937, - "eval_rewards/accuracies": 0.6301115155220032, - "eval_rewards/chosen": -1.0811227560043335, - "eval_rewards/margins": 0.25664687156677246, - "eval_rewards/rejected": -1.3377697467803955, - "eval_runtime": 355.9925, - "eval_samples_per_second": 12.09, - "eval_steps_per_second": 1.511, + "epoch": 0.39627842866988283, + "eval_logits/chosen": -3.074549913406372, + "eval_logits/rejected": -3.0688531398773193, + "eval_logps/chosen": -62.76375198364258, + "eval_logps/rejected": -68.54676818847656, + "eval_loss": 0.6871928572654724, + "eval_rewards/accuracies": 0.5908457040786743, + "eval_rewards/chosen": -0.040518537163734436, + "eval_rewards/margins": 0.01314793061465025, + "eval_rewards/rejected": -0.05366646498441696, + "eval_runtime": 358.9768, + "eval_samples_per_second": 11.99, + "eval_steps_per_second": 1.499, "step": 2300 }, { - "epoch": 0.4, - "grad_norm": 25.808234378973566, - "learning_rate": 4.852245211648297e-07, - "logits/chosen": -1.5000966787338257, - "logits/rejected": -1.466485619544983, - "logps/chosen": -192.41690063476562, - "logps/rejected": -250.5450439453125, - "loss": 0.5307, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.413515329360962, - "rewards/margins": 0.577852725982666, - "rewards/rejected": -1.991368055343628, + "epoch": 0.3980013783597519, + "grad_norm": 2.8675854206085205, + "learning_rate": 9.704490423296595e-08, + "logits/chosen": -2.937295436859131, + "logits/rejected": -2.921793222427368, + "logps/chosen": -58.962135314941406, + "logps/rejected": -62.21418380737305, + "loss": 0.6798, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.07884009182453156, + "rewards/margins": 0.028957273811101913, + "rewards/rejected": -0.10779736191034317, "step": 2310 }, { - "epoch": 0.4, - "grad_norm": 29.80630825333485, - "learning_rate": 4.849688333306104e-07, - "logits/chosen": -1.5186668634414673, - "logits/rejected": -1.4686113595962524, - "logps/chosen": -204.86245727539062, - "logps/rejected": -261.40045166015625, - "loss": 0.5375, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.474421501159668, - "rewards/margins": 0.6283503770828247, - "rewards/rejected": -2.1027719974517822, + "epoch": 0.39972432804962094, + "grad_norm": 2.8028993606567383, + "learning_rate": 9.699376666612209e-08, + "logits/chosen": -3.032089948654175, + "logits/rejected": -2.9912171363830566, + "logps/chosen": -64.53179931640625, + "logps/rejected": -62.15001678466797, + "loss": 0.6747, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.07090407609939575, + "rewards/margins": 0.039122868329286575, + "rewards/rejected": -0.11002693325281143, "step": 2320 }, { - "epoch": 0.4, - "grad_norm": 22.33412954401948, - "learning_rate": 4.847110206551393e-07, - "logits/chosen": -1.4632006883621216, - "logits/rejected": -1.4255832433700562, - "logps/chosen": -201.6676483154297, - "logps/rejected": -270.60858154296875, - "loss": 0.5217, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.4586617946624756, - "rewards/margins": 0.7248638868331909, - "rewards/rejected": -2.183525562286377, + "epoch": 0.40144727773949, + "grad_norm": 2.875821590423584, + "learning_rate": 9.694220413102785e-08, + "logits/chosen": -2.9437172412872314, + "logits/rejected": -2.9218506813049316, + "logps/chosen": -63.208892822265625, + "logps/rejected": -63.77135467529297, + "loss": 0.6739, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.07399503886699677, + "rewards/margins": 0.041165683418512344, + "rewards/rejected": -0.1151607409119606, "step": 2330 }, { - "epoch": 0.4, - "grad_norm": 20.61698070907491, - "learning_rate": 4.844510854698359e-07, - "logits/chosen": -1.5553325414657593, - "logits/rejected": -1.5306508541107178, - "logps/chosen": -203.3428497314453, - "logps/rejected": -246.02001953125, - "loss": 0.6196, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.5106031894683838, - "rewards/margins": 0.3960232734680176, - "rewards/rejected": -1.9066263437271118, + "epoch": 0.40317022742935904, + "grad_norm": 2.7427754402160645, + "learning_rate": 9.689021709396718e-08, + "logits/chosen": -2.9846208095550537, + "logits/rejected": -2.972612142562866, + "logps/chosen": -60.94713592529297, + "logps/rejected": -65.76235961914062, + "loss": 0.6857, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.0864558219909668, + "rewards/margins": 0.01739116758108139, + "rewards/rejected": -0.10384698957204819, "step": 2340 }, { - "epoch": 0.4, - "grad_norm": 24.7881803852688, - "learning_rate": 4.841890301253145e-07, - "logits/chosen": -1.548393726348877, - "logits/rejected": -1.509019136428833, - "logps/chosen": -186.4705047607422, - "logps/rejected": -242.9622039794922, - "loss": 0.544, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.2822117805480957, - "rewards/margins": 0.5945440530776978, - "rewards/rejected": -1.876755952835083, + "epoch": 0.4048931771192281, + "grad_norm": 2.897033214569092, + "learning_rate": 9.683780602506288e-08, + "logits/chosen": -2.9678235054016113, + "logits/rejected": -2.9454286098480225, + "logps/chosen": -65.06855773925781, + "logps/rejected": -65.86259460449219, + "loss": 0.6756, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.06807255744934082, + "rewards/margins": 0.03739145025610924, + "rewards/rejected": -0.10546400398015976, "step": 2350 }, { - "epoch": 0.41, - "grad_norm": 21.155630137442166, - "learning_rate": 4.839248569913614e-07, - "logits/chosen": -1.4889419078826904, - "logits/rejected": -1.452192783355713, - "logps/chosen": -189.53225708007812, - "logps/rejected": -255.5863800048828, - "loss": 0.5421, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3918462991714478, - "rewards/margins": 0.6527556777000427, - "rewards/rejected": -2.044602155685425, + "epoch": 0.4066161268090972, + "grad_norm": 2.6393425464630127, + "learning_rate": 9.678497139827229e-08, + "logits/chosen": -2.9860851764678955, + "logits/rejected": -2.973085641860962, + "logps/chosen": -58.43578338623047, + "logps/rejected": -61.41686248779297, + "loss": 0.683, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.08073704689741135, + "rewards/margins": 0.022134827449917793, + "rewards/rejected": -0.10287187993526459, "step": 2360 }, { - "epoch": 0.41, - "grad_norm": 34.58209504713677, - "learning_rate": 4.836585684569147e-07, - "logits/chosen": -1.4630403518676758, - "logits/rejected": -1.430633544921875, - "logps/chosen": -206.46591186523438, - "logps/rejected": -273.29779052734375, - "loss": 0.5551, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.527071237564087, - "rewards/margins": 0.6676559448242188, - "rewards/rejected": -2.1947274208068848, + "epoch": 0.4083390764989662, + "grad_norm": 3.105860948562622, + "learning_rate": 9.673171369138295e-08, + "logits/chosen": -2.9996142387390137, + "logits/rejected": -2.993256092071533, + "logps/chosen": -62.017311096191406, + "logps/rejected": -65.43057250976562, + "loss": 0.6775, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.08255334943532944, + "rewards/margins": 0.03310525417327881, + "rewards/rejected": -0.11565861850976944, "step": 2370 }, { - "epoch": 0.41, - "grad_norm": 26.061151506389557, - "learning_rate": 4.833901669300424e-07, - "logits/chosen": -1.4684240818023682, - "logits/rejected": -1.4264377355575562, - "logps/chosen": -186.2281494140625, - "logps/rejected": -233.110107421875, - "loss": 0.6138, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3187849521636963, - "rewards/margins": 0.4836592674255371, - "rewards/rejected": -1.8024442195892334, + "epoch": 0.4100620261888353, + "grad_norm": 2.6338090896606445, + "learning_rate": 9.667803338600848e-08, + "logits/chosen": -2.9264793395996094, + "logits/rejected": -2.9018797874450684, + "logps/chosen": -61.777565002441406, + "logps/rejected": -63.37926483154297, + "loss": 0.6788, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.07435405254364014, + "rewards/margins": 0.030792638659477234, + "rewards/rejected": -0.10514669120311737, "step": 2380 }, { - "epoch": 0.41, - "grad_norm": 18.221595816735224, - "learning_rate": 4.831196548379198e-07, - "logits/chosen": -1.5969889163970947, - "logits/rejected": -1.5512523651123047, - "logps/chosen": -178.01239013671875, - "logps/rejected": -238.6937255859375, - "loss": 0.5315, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.2178453207015991, - "rewards/margins": 0.6382966041564941, - "rewards/rejected": -1.8561418056488037, + "epoch": 0.41178497587870433, + "grad_norm": 2.8210086822509766, + "learning_rate": 9.662393096758396e-08, + "logits/chosen": -3.010481357574463, + "logits/rejected": -2.983585834503174, + "logps/chosen": -64.10334014892578, + "logps/rejected": -64.34742736816406, + "loss": 0.6774, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.0786934643983841, + "rewards/margins": 0.03383960574865341, + "rewards/rejected": -0.11253305524587631, "step": 2390 }, { - "epoch": 0.41, - "grad_norm": 15.589350815937946, - "learning_rate": 4.828470346268088e-07, - "logits/chosen": -1.6465771198272705, - "logits/rejected": -1.6099990606307983, - "logps/chosen": -179.09376525878906, - "logps/rejected": -228.93905639648438, - "loss": 0.5786, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2678877115249634, - "rewards/margins": 0.4891243577003479, - "rewards/rejected": -1.7570120096206665, + "epoch": 0.4135079255685734, + "grad_norm": 2.57030987739563, + "learning_rate": 9.656940692536178e-08, + "logits/chosen": -3.057115077972412, + "logits/rejected": -3.0364222526550293, + "logps/chosen": -60.174354553222656, + "logps/rejected": -63.84586715698242, + "loss": 0.6809, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0787380188703537, + "rewards/margins": 0.02708571031689644, + "rewards/rejected": -0.10582373291254044, "step": 2400 }, { - "epoch": 0.41, - "eval_logits/chosen": -1.6363126039505005, - "eval_logits/rejected": -1.6167610883712769, - "eval_logps/chosen": -178.53884887695312, - "eval_logps/rejected": -209.47793579101562, - "eval_loss": 0.6305522918701172, - "eval_rewards/accuracies": 0.6291821599006653, - "eval_rewards/chosen": -1.1983500719070435, - "eval_rewards/margins": 0.2648555040359497, - "eval_rewards/rejected": -1.4632055759429932, - "eval_runtime": 356.7532, - "eval_samples_per_second": 12.064, - "eval_steps_per_second": 1.508, + "epoch": 0.4135079255685734, + "eval_logits/chosen": -3.064911365509033, + "eval_logits/rejected": -3.059225082397461, + "eval_logps/chosen": -63.42079162597656, + "eval_logps/rejected": -69.33052062988281, + "eval_loss": 0.6866453886032104, + "eval_rewards/accuracies": 0.5906133651733398, + "eval_rewards/chosen": -0.04708903282880783, + "eval_rewards/margins": 0.014414963312447071, + "eval_rewards/rejected": -0.06150398775935173, + "eval_runtime": 358.4887, + "eval_samples_per_second": 12.006, + "eval_steps_per_second": 1.501, "step": 2400 }, { - "epoch": 0.42, - "grad_norm": 21.985975720515064, - "learning_rate": 4.82572308762035e-07, - "logits/chosen": -1.5702491998672485, - "logits/rejected": -1.5282782316207886, - "logps/chosen": -188.3826904296875, - "logps/rejected": -229.83132934570312, - "loss": 0.5676, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.3452413082122803, - "rewards/margins": 0.4605236053466797, - "rewards/rejected": -1.8057647943496704, + "epoch": 0.41523087525844243, + "grad_norm": 2.556201934814453, + "learning_rate": 9.651446175240698e-08, + "logits/chosen": -3.02958345413208, + "logits/rejected": -2.990233898162842, + "logps/chosen": -61.12690353393555, + "logps/rejected": -60.686256408691406, + "loss": 0.6737, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.07276560366153717, + "rewards/margins": 0.0414142906665802, + "rewards/rejected": -0.11417989432811737, "step": 2410 }, { - "epoch": 0.42, - "grad_norm": 26.14144209889029, - "learning_rate": 4.822954797279652e-07, - "logits/chosen": -1.5276035070419312, - "logits/rejected": -1.4836372137069702, - "logps/chosen": -208.26181030273438, - "logps/rejected": -264.6378479003906, - "loss": 0.5711, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.5207151174545288, - "rewards/margins": 0.5766458511352539, - "rewards/rejected": -2.0973610877990723, + "epoch": 0.4169538249483115, + "grad_norm": 2.8453683853149414, + "learning_rate": 9.645909594559304e-08, + "logits/chosen": -3.0249388217926025, + "logits/rejected": -3.002185344696045, + "logps/chosen": -64.44881439208984, + "logps/rejected": -66.13243865966797, + "loss": 0.6796, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0826670378446579, + "rewards/margins": 0.02963237091898918, + "rewards/rejected": -0.11229941993951797, "step": 2420 }, { - "epoch": 0.42, - "grad_norm": 17.28711843102643, - "learning_rate": 4.82016550027986e-07, - "logits/chosen": -1.5296419858932495, - "logits/rejected": -1.4949567317962646, - "logps/chosen": -192.01145935058594, - "logps/rejected": -231.2733154296875, - "loss": 0.5997, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.371919870376587, - "rewards/margins": 0.4235268533229828, - "rewards/rejected": -1.7954469919204712, + "epoch": 0.41867677463818054, + "grad_norm": 2.8599154949188232, + "learning_rate": 9.64033100055972e-08, + "logits/chosen": -2.986757755279541, + "logits/rejected": -2.96449613571167, + "logps/chosen": -62.99231719970703, + "logps/rejected": -62.6896858215332, + "loss": 0.6805, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.0814933180809021, + "rewards/margins": 0.028070122003555298, + "rewards/rejected": -0.1095634326338768, "step": 2430 }, { - "epoch": 0.42, - "grad_norm": 15.776095113993128, - "learning_rate": 4.817355221844802e-07, - "logits/chosen": -1.5643110275268555, - "logits/rejected": -1.5382698774337769, - "logps/chosen": -172.5228729248047, - "logps/rejected": -228.47189331054688, - "loss": 0.5417, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.1747311353683472, - "rewards/margins": 0.5653839707374573, - "rewards/rejected": -1.7401151657104492, + "epoch": 0.4203997243280496, + "grad_norm": 3.06473445892334, + "learning_rate": 9.634710443689602e-08, + "logits/chosen": -2.9826653003692627, + "logits/rejected": -2.9693188667297363, + "logps/chosen": -62.84228515625, + "logps/rejected": -66.02586364746094, + "loss": 0.6759, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.07785408198833466, + "rewards/margins": 0.037403546273708344, + "rewards/rejected": -0.1152576357126236, "step": 2440 }, { - "epoch": 0.42, - "grad_norm": 19.47187684087894, - "learning_rate": 4.814523987388038e-07, - "logits/chosen": -1.5278120040893555, - "logits/rejected": -1.490755319595337, - "logps/chosen": -177.8535919189453, - "logps/rejected": -222.85791015625, - "loss": 0.5882, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.2479978799819946, - "rewards/margins": 0.45816653966903687, - "rewards/rejected": -1.7061645984649658, + "epoch": 0.4221226740179187, + "grad_norm": 2.7919325828552246, + "learning_rate": 9.629047974776077e-08, + "logits/chosen": -2.9399731159210205, + "logits/rejected": -2.9167487621307373, + "logps/chosen": -62.27665328979492, + "logps/rejected": -64.26480865478516, + "loss": 0.6802, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.09218315780162811, + "rewards/margins": 0.027885476127266884, + "rewards/rejected": -0.12006862461566925, "step": 2450 }, { - "epoch": 0.42, - "grad_norm": 14.220492265970561, - "learning_rate": 4.811671822512644e-07, - "logits/chosen": -1.5169602632522583, - "logits/rejected": -1.4735338687896729, - "logps/chosen": -167.81532287597656, - "logps/rejected": -209.4271240234375, - "loss": 0.586, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.1169971227645874, - "rewards/margins": 0.45197534561157227, - "rewards/rejected": -1.5689725875854492, + "epoch": 0.4238456237077877, + "grad_norm": 2.8518171310424805, + "learning_rate": 9.623343645025288e-08, + "logits/chosen": -2.921736717224121, + "logits/rejected": -2.8892157077789307, + "logps/chosen": -64.50049591064453, + "logps/rejected": -64.17418670654297, + "loss": 0.6779, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0835905522108078, + "rewards/margins": 0.032821472734212875, + "rewards/rejected": -0.11641202867031097, "step": 2460 }, { - "epoch": 0.43, - "grad_norm": 24.10851119267855, - "learning_rate": 4.808798753010965e-07, - "logits/chosen": -1.648048758506775, - "logits/rejected": -1.6161121129989624, - "logps/chosen": -171.47268676757812, - "logps/rejected": -209.07357788085938, - "loss": 0.5908, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.1645221710205078, - "rewards/margins": 0.39932164549827576, - "rewards/rejected": -1.5638437271118164, + "epoch": 0.4255685733976568, + "grad_norm": 3.0394136905670166, + "learning_rate": 9.61759750602193e-08, + "logits/chosen": -3.079592227935791, + "logits/rejected": -3.058415412902832, + "logps/chosen": -62.834922790527344, + "logps/rejected": -64.33911895751953, + "loss": 0.6753, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.07790704071521759, + "rewards/margins": 0.0384015329182148, + "rewards/rejected": -0.11630856990814209, "step": 2470 }, { - "epoch": 0.43, - "grad_norm": 15.012483229366685, - "learning_rate": 4.805904804864388e-07, - "logits/chosen": -1.6050293445587158, - "logits/rejected": -1.5672911405563354, - "logps/chosen": -169.68240356445312, - "logps/rejected": -201.05831909179688, - "loss": 0.6088, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.153080701828003, - "rewards/margins": 0.35358747839927673, - "rewards/rejected": -1.506668210029602, + "epoch": 0.4272915230875258, + "grad_norm": 2.9794809818267822, + "learning_rate": 9.611809609728777e-08, + "logits/chosen": -2.9777140617370605, + "logits/rejected": -2.946880340576172, + "logps/chosen": -64.09749603271484, + "logps/rejected": -62.37017822265625, + "loss": 0.683, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.09719328582286835, + "rewards/margins": 0.022251714020967484, + "rewards/rejected": -0.11944498866796494, "step": 2480 }, { - "epoch": 0.43, - "grad_norm": 17.517728604597213, - "learning_rate": 4.802990004243112e-07, - "logits/chosen": -1.6790720224380493, - "logits/rejected": -1.6492221355438232, - "logps/chosen": -141.60067749023438, - "logps/rejected": -183.9330291748047, - "loss": 0.581, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.9159472584724426, - "rewards/margins": 0.4076360762119293, - "rewards/rejected": -1.3235833644866943, + "epoch": 0.4290144727773949, + "grad_norm": 2.740145683288574, + "learning_rate": 9.605980008486224e-08, + "logits/chosen": -2.961674213409424, + "logits/rejected": -2.9528236389160156, + "logps/chosen": -59.51900100708008, + "logps/rejected": -63.45074462890625, + "loss": 0.6826, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.09480162709951401, + "rewards/margins": 0.0237564779818058, + "rewards/rejected": -0.11855810880661011, "step": 2490 }, { - "epoch": 0.43, - "grad_norm": 19.29815730049706, - "learning_rate": 4.800054377505901e-07, - "logits/chosen": -1.7170441150665283, - "logits/rejected": -1.6744773387908936, - "logps/chosen": -157.9257354736328, - "logps/rejected": -203.2161102294922, - "loss": 0.5679, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.0103013515472412, - "rewards/margins": 0.4565104842185974, - "rewards/rejected": -1.4668117761611938, + "epoch": 0.43073742246726393, + "grad_norm": 3.2707362174987793, + "learning_rate": 9.600108755011803e-08, + "logits/chosen": -3.0370676517486572, + "logits/rejected": -3.013251304626465, + "logps/chosen": -66.4466552734375, + "logps/rejected": -68.46018981933594, + "loss": 0.6828, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0955594927072525, + "rewards/margins": 0.023346083238720894, + "rewards/rejected": -0.11890558153390884, "step": 2500 }, { - "epoch": 0.43, - "eval_logits/chosen": -1.7219594717025757, - "eval_logits/rejected": -1.7044074535369873, - "eval_logps/chosen": -148.90235900878906, - "eval_logps/rejected": -175.4527587890625, - "eval_loss": 0.6329796314239502, - "eval_rewards/accuracies": 0.6345260143280029, - "eval_rewards/chosen": -0.9019851088523865, - "eval_rewards/margins": 0.22096872329711914, - "eval_rewards/rejected": -1.1229537725448608, - "eval_runtime": 356.8285, - "eval_samples_per_second": 12.062, - "eval_steps_per_second": 1.508, + "epoch": 0.43073742246726393, + "eval_logits/chosen": -3.0558035373687744, + "eval_logits/rejected": -3.05007004737854, + "eval_logps/chosen": -64.28133392333984, + "eval_logps/rejected": -70.30865478515625, + "eval_loss": 0.6861559748649597, + "eval_rewards/accuracies": 0.5913103818893433, + "eval_rewards/chosen": -0.05569446086883545, + "eval_rewards/margins": 0.015590852126479149, + "eval_rewards/rejected": -0.07128531485795975, + "eval_runtime": 358.8021, + "eval_samples_per_second": 11.995, + "eval_steps_per_second": 1.499, "step": 2500 }, { - "epoch": 0.43, - "grad_norm": 20.098541263273624, - "learning_rate": 4.797097951199854e-07, - "logits/chosen": -1.5535961389541626, - "logits/rejected": -1.5248453617095947, - "logps/chosen": -159.08462524414062, - "logps/rejected": -213.17837524414062, - "loss": 0.546, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.067018985748291, - "rewards/margins": 0.5451322793960571, - "rewards/rejected": -1.6121511459350586, + "epoch": 0.432460372157133, + "grad_norm": 2.9338889122009277, + "learning_rate": 9.594195902399708e-08, + "logits/chosen": -2.8873167037963867, + "logits/rejected": -2.877664566040039, + "logps/chosen": -61.8262939453125, + "logps/rejected": -65.24763488769531, + "loss": 0.6752, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.09440617263317108, + "rewards/margins": 0.03825463727116585, + "rewards/rejected": -0.13266082108020782, "step": 2510 }, { - "epoch": 0.43, - "grad_norm": 15.302909435904388, - "learning_rate": 4.794120752060162e-07, - "logits/chosen": -1.5149682760238647, - "logits/rejected": -1.4745677709579468, - "logps/chosen": -166.39657592773438, - "logps/rejected": -208.57785034179688, - "loss": 0.5939, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.1393781900405884, - "rewards/margins": 0.4565187990665436, - "rewards/rejected": -1.5958969593048096, + "epoch": 0.4341833218470021, + "grad_norm": 2.6996138095855713, + "learning_rate": 9.588241504120325e-08, + "logits/chosen": -2.875840663909912, + "logits/rejected": -2.8484764099121094, + "logps/chosen": -61.40210723876953, + "logps/rejected": -61.77978515625, + "loss": 0.6753, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.08933891355991364, + "rewards/margins": 0.0384402871131897, + "rewards/rejected": -0.12777918577194214, "step": 2520 }, { - "epoch": 0.44, - "grad_norm": 25.687681170750803, - "learning_rate": 4.791122807009866e-07, - "logits/chosen": -1.568881869316101, - "logits/rejected": -1.552473783493042, - "logps/chosen": -177.05374145507812, - "logps/rejected": -220.03857421875, - "loss": 0.6046, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.2589091062545776, - "rewards/margins": 0.40004104375839233, - "rewards/rejected": -1.6589502096176147, + "epoch": 0.4359062715368711, + "grad_norm": 2.8578429222106934, + "learning_rate": 9.582245614019734e-08, + "logits/chosen": -2.962174892425537, + "logits/rejected": -2.9574642181396484, + "logps/chosen": -61.437477111816406, + "logps/rejected": -66.27393341064453, + "loss": 0.6853, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.10267911851406097, + "rewards/margins": 0.018535245209932327, + "rewards/rejected": -0.1212143674492836, "step": 2530 }, { - "epoch": 0.44, - "grad_norm": 17.330151466831865, - "learning_rate": 4.788104143159616e-07, - "logits/chosen": -1.6212892532348633, - "logits/rejected": -1.5904419422149658, - "logps/chosen": -177.31298828125, - "logps/rejected": -227.2390899658203, - "loss": 0.6146, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.2378965616226196, - "rewards/margins": 0.4961935579776764, - "rewards/rejected": -1.7340900897979736, + "epoch": 0.4376292212267402, + "grad_norm": 2.7969298362731934, + "learning_rate": 9.576208286319231e-08, + "logits/chosen": -3.001952886581421, + "logits/rejected": -2.991283416748047, + "logps/chosen": -64.27826690673828, + "logps/rejected": -67.79606628417969, + "loss": 0.6785, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10737824440002441, + "rewards/margins": 0.032104894518852234, + "rewards/rejected": -0.13948313891887665, "step": 2540 }, { - "epoch": 0.44, - "grad_norm": 17.990325513608543, - "learning_rate": 4.785064787807418e-07, - "logits/chosen": -1.6909650564193726, - "logits/rejected": -1.6431467533111572, - "logps/chosen": -151.2425537109375, - "logps/rejected": -197.7589569091797, - "loss": 0.5459, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.9256092309951782, - "rewards/margins": 0.5123879313468933, - "rewards/rejected": -1.4379972219467163, + "epoch": 0.4393521709166092, + "grad_norm": 3.0053772926330566, + "learning_rate": 9.570129575614835e-08, + "logits/chosen": -2.9459023475646973, + "logits/rejected": -2.913362503051758, + "logps/chosen": -67.61305236816406, + "logps/rejected": -66.90510559082031, + "loss": 0.6754, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.08929960429668427, + "rewards/margins": 0.04003571346402168, + "rewards/rejected": -0.12933531403541565, "step": 2550 }, { - "epoch": 0.44, - "grad_norm": 14.629576613571512, - "learning_rate": 4.782004768438399e-07, - "logits/chosen": -1.7803840637207031, - "logits/rejected": -1.744502305984497, - "logps/chosen": -138.01266479492188, - "logps/rejected": -177.8213348388672, - "loss": 0.5766, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.8278089761734009, - "rewards/margins": 0.41043511033058167, - "rewards/rejected": -1.2382439374923706, + "epoch": 0.4410751206064783, + "grad_norm": 2.983738899230957, + "learning_rate": 9.564009536876798e-08, + "logits/chosen": -3.0335261821746826, + "logits/rejected": -3.009237766265869, + "logps/chosen": -64.40341186523438, + "logps/rejected": -66.43016052246094, + "loss": 0.6782, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09143063426017761, + "rewards/margins": 0.03266965225338936, + "rewards/rejected": -0.12410029023885727, "step": 2560 }, { - "epoch": 0.44, - "grad_norm": 14.783534214563243, - "learning_rate": 4.778924112724548e-07, - "logits/chosen": -1.6910631656646729, - "logits/rejected": -1.6597950458526611, - "logps/chosen": -161.07774353027344, - "logps/rejected": -207.1680145263672, - "loss": 0.5656, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.02474045753479, - "rewards/margins": 0.4819648861885071, - "rewards/rejected": -1.506705403327942, + "epoch": 0.4427980702963473, + "grad_norm": 2.8760268688201904, + "learning_rate": 9.557848225449097e-08, + "logits/chosen": -2.9677720069885254, + "logits/rejected": -2.949582576751709, + "logps/chosen": -67.64404296875, + "logps/rejected": -69.02073669433594, + "loss": 0.6772, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09042315185070038, + "rewards/margins": 0.034747906029224396, + "rewards/rejected": -0.12517106533050537, "step": 2570 }, { - "epoch": 0.44, - "grad_norm": 24.195144500390953, - "learning_rate": 4.775822848524474e-07, - "logits/chosen": -1.65180242061615, - "logits/rejected": -1.6205854415893555, - "logps/chosen": -174.69419860839844, - "logps/rejected": -216.5833740234375, - "loss": 0.5993, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.2136080265045166, - "rewards/margins": 0.4207339286804199, - "rewards/rejected": -1.634341835975647, + "epoch": 0.4445210199862164, + "grad_norm": 3.3024773597717285, + "learning_rate": 9.551645697048946e-08, + "logits/chosen": -2.9901578426361084, + "logits/rejected": -2.971513032913208, + "logps/chosen": -63.95698165893555, + "logps/rejected": -65.8638916015625, + "loss": 0.6838, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.10571177303791046, + "rewards/margins": 0.021099606528878212, + "rewards/rejected": -0.12681138515472412, "step": 2580 }, { - "epoch": 0.45, - "grad_norm": 24.07431376599948, - "learning_rate": 4.772701003883146e-07, - "logits/chosen": -1.6589524745941162, - "logits/rejected": -1.6199334859848022, - "logps/chosen": -160.41822814941406, - "logps/rejected": -193.14602661132812, - "loss": 0.6035, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.0372803211212158, - "rewards/margins": 0.38218361139297485, - "rewards/rejected": -1.419463872909546, + "epoch": 0.4462439696760855, + "grad_norm": 3.4273412227630615, + "learning_rate": 9.545402007766291e-08, + "logits/chosen": -2.968909740447998, + "logits/rejected": -2.9322829246520996, + "logps/chosen": -65.9303207397461, + "logps/rejected": -64.69056701660156, + "loss": 0.6738, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.09235879778862, + "rewards/margins": 0.04207988083362579, + "rewards/rejected": -0.1344386637210846, "step": 2590 }, { - "epoch": 0.45, - "grad_norm": 17.260199729003336, - "learning_rate": 4.769558607031646e-07, - "logits/chosen": -1.6966606378555298, - "logits/rejected": -1.6404602527618408, - "logps/chosen": -153.80836486816406, - "logps/rejected": -197.2882080078125, - "loss": 0.5426, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -0.9672321081161499, - "rewards/margins": 0.5213042497634888, - "rewards/rejected": -1.4885362386703491, + "epoch": 0.4479669193659545, + "grad_norm": 3.036872148513794, + "learning_rate": 9.539117214063292e-08, + "logits/chosen": -2.9415981769561768, + "logits/rejected": -2.8942933082580566, + "logps/chosen": -67.31944274902344, + "logps/rejected": -62.59626007080078, + "loss": 0.6754, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.10223277658224106, + "rewards/margins": 0.039090316742658615, + "rewards/rejected": -0.14132311940193176, "step": 2600 }, { - "epoch": 0.45, - "eval_logits/chosen": -1.7993457317352295, - "eval_logits/rejected": -1.7825459241867065, - "eval_logps/chosen": -147.43885803222656, - "eval_logps/rejected": -172.26231384277344, - "eval_loss": 0.6352224946022034, - "eval_rewards/accuracies": 0.6354553699493408, - "eval_rewards/chosen": -0.8873502016067505, - "eval_rewards/margins": 0.20369918644428253, - "eval_rewards/rejected": -1.0910491943359375, - "eval_runtime": 356.7035, - "eval_samples_per_second": 12.066, - "eval_steps_per_second": 1.508, + "epoch": 0.4479669193659545, + "eval_logits/chosen": -3.049037218093872, + "eval_logits/rejected": -3.0432827472686768, + "eval_logps/chosen": -64.85841369628906, + "eval_logps/rejected": -71.00834655761719, + "eval_loss": 0.685627281665802, + "eval_rewards/accuracies": 0.591775119304657, + "eval_rewards/chosen": -0.0614653080701828, + "eval_rewards/margins": 0.01681698113679886, + "eval_rewards/rejected": -0.07828228920698166, + "eval_runtime": 358.2326, + "eval_samples_per_second": 12.015, + "eval_steps_per_second": 1.502, "step": 2600 }, { - "epoch": 0.45, - "grad_norm": 16.168191035992383, - "learning_rate": 4.7663956863869114e-07, - "logits/chosen": -1.646691918373108, - "logits/rejected": -1.5923856496810913, - "logps/chosen": -167.03103637695312, - "logps/rejected": -211.1219482421875, - "loss": 0.5665, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.0866007804870605, - "rewards/margins": 0.5080444812774658, - "rewards/rejected": -1.5946451425552368, + "epoch": 0.4496898690558236, + "grad_norm": 2.9528679847717285, + "learning_rate": 9.532791372773822e-08, + "logits/chosen": -2.9208762645721436, + "logits/rejected": -2.88173246383667, + "logps/chosen": -68.32647705078125, + "logps/rejected": -65.62649536132812, + "loss": 0.6751, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.099474236369133, + "rewards/margins": 0.040066082030534744, + "rewards/rejected": -0.13954029977321625, "step": 2610 }, { - "epoch": 0.45, - "grad_norm": 21.121725151266354, - "learning_rate": 4.7632122705514764e-07, - "logits/chosen": -1.6836843490600586, - "logits/rejected": -1.6414811611175537, - "logps/chosen": -174.40000915527344, - "logps/rejected": -224.135009765625, - "loss": 0.5734, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.2318413257598877, - "rewards/margins": 0.5052552223205566, - "rewards/rejected": -1.7370964288711548, + "epoch": 0.4514128187456926, + "grad_norm": 2.840683698654175, + "learning_rate": 9.526424541102953e-08, + "logits/chosen": -3.0072741508483887, + "logits/rejected": -2.977484703063965, + "logps/chosen": -61.29169845581055, + "logps/rejected": -64.94569396972656, + "loss": 0.6723, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.10025743395090103, + "rewards/margins": 0.044723428785800934, + "rewards/rejected": -0.14498087763786316, "step": 2620 }, { - "epoch": 0.45, - "grad_norm": 20.850911780664795, - "learning_rate": 4.760008388313216e-07, - "logits/chosen": -1.5688848495483398, - "logits/rejected": -1.5264674425125122, - "logps/chosen": -175.73428344726562, - "logps/rejected": -224.6223907470703, - "loss": 0.5769, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.209410548210144, - "rewards/margins": 0.5050948262214661, - "rewards/rejected": -1.7145051956176758, + "epoch": 0.4531357684355617, + "grad_norm": 3.2683465480804443, + "learning_rate": 9.520016776626432e-08, + "logits/chosen": -2.8932104110717773, + "logits/rejected": -2.8613693714141846, + "logps/chosen": -64.25384521484375, + "logps/rejected": -67.47929382324219, + "loss": 0.6704, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.09440313279628754, + "rewards/margins": 0.048835329711437225, + "rewards/rejected": -0.14323846995830536, "step": 2630 }, { - "epoch": 0.45, - "grad_norm": 19.418048097017046, - "learning_rate": 4.756784068645083e-07, - "logits/chosen": -1.5928579568862915, - "logits/rejected": -1.553302526473999, - "logps/chosen": -167.90257263183594, - "logps/rejected": -221.66702270507812, - "loss": 0.5415, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.178271770477295, - "rewards/margins": 0.5487642288208008, - "rewards/rejected": -1.7270358800888062, + "epoch": 0.4548587181254307, + "grad_norm": 2.818164348602295, + "learning_rate": 9.513568137290167e-08, + "logits/chosen": -2.91321063041687, + "logits/rejected": -2.8920276165008545, + "logps/chosen": -60.74837112426758, + "logps/rejected": -62.896446228027344, + "loss": 0.6784, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.10652945935726166, + "rewards/margins": 0.03255968540906906, + "rewards/rejected": -0.13908913731575012, "step": 2640 }, { - "epoch": 0.46, - "grad_norm": 21.88823441031475, - "learning_rate": 4.75353934070485e-07, - "logits/chosen": -1.5368947982788086, - "logits/rejected": -1.5017019510269165, - "logps/chosen": -185.3848419189453, - "logps/rejected": -244.77587890625, - "loss": 0.5833, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3293806314468384, - "rewards/margins": 0.5755869746208191, - "rewards/rejected": -1.9049675464630127, + "epoch": 0.4565816678152998, + "grad_norm": 2.770519495010376, + "learning_rate": 9.507078681409701e-08, + "logits/chosen": -2.8849105834960938, + "logits/rejected": -2.8674635887145996, + "logps/chosen": -61.676353454589844, + "logps/rejected": -67.19678497314453, + "loss": 0.6765, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.09245527535676956, + "rewards/margins": 0.03666946291923523, + "rewards/rejected": -0.1291247308254242, "step": 2650 }, { - "epoch": 0.46, - "grad_norm": 22.610484118287804, - "learning_rate": 4.7502742338348406e-07, - "logits/chosen": -1.5877610445022583, - "logits/rejected": -1.5406101942062378, - "logps/chosen": -193.39242553710938, - "logps/rejected": -222.32186889648438, - "loss": 0.6741, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.351151943206787, - "rewards/margins": 0.34743010997772217, - "rewards/rejected": -1.6985820531845093, + "epoch": 0.4583046175051689, + "grad_norm": 3.1621921062469482, + "learning_rate": 9.500548467669681e-08, + "logits/chosen": -2.9127323627471924, + "logits/rejected": -2.8717422485351562, + "logps/chosen": -68.6246566772461, + "logps/rejected": -67.5105972290039, + "loss": 0.6716, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.10334835201501846, + "rewards/margins": 0.04701242595911026, + "rewards/rejected": -0.15036077797412872, "step": 2660 }, { - "epoch": 0.46, - "grad_norm": 13.819491655406011, - "learning_rate": 4.746988777561668e-07, - "logits/chosen": -1.6597168445587158, - "logits/rejected": -1.6143728494644165, - "logps/chosen": -164.24771118164062, - "logps/rejected": -210.0986328125, - "loss": 0.584, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1030311584472656, - "rewards/margins": 0.47755417227745056, - "rewards/rejected": -1.5805851221084595, + "epoch": 0.4600275671950379, + "grad_norm": 2.8156025409698486, + "learning_rate": 9.493977555123336e-08, + "logits/chosen": -2.8853299617767334, + "logits/rejected": -2.8571767807006836, + "logps/chosen": -64.55490112304688, + "logps/rejected": -66.96713256835938, + "loss": 0.6733, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.10608921200037003, + "rewards/margins": 0.04296635091304779, + "rewards/rejected": -0.14905555546283722, "step": 2670 }, { - "epoch": 0.46, - "grad_norm": 19.318025617063718, - "learning_rate": 4.743683001595965e-07, - "logits/chosen": -1.7418750524520874, - "logits/rejected": -1.7099230289459229, - "logps/chosen": -164.66427612304688, - "logps/rejected": -190.61524963378906, - "loss": 0.6165, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.0642914772033691, - "rewards/margins": 0.30566078424453735, - "rewards/rejected": -1.3699522018432617, + "epoch": 0.461750516884907, + "grad_norm": 3.240689277648926, + "learning_rate": 9.48736600319193e-08, + "logits/chosen": -2.9307405948638916, + "logits/rejected": -2.903907299041748, + "logps/chosen": -69.11388397216797, + "logps/rejected": -67.31000518798828, + "loss": 0.6805, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.10866351425647736, + "rewards/margins": 0.027808865532279015, + "rewards/rejected": -0.13647237420082092, "step": 2680 }, { - "epoch": 0.46, - "grad_norm": 12.859233397896235, - "learning_rate": 4.7403569358321206e-07, - "logits/chosen": -1.7552549839019775, - "logits/rejected": -1.7229642868041992, - "logps/chosen": -147.30789184570312, - "logps/rejected": -191.17984008789062, - "loss": 0.5505, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.9669073820114136, - "rewards/margins": 0.44619670510292053, - "rewards/rejected": -1.4131041765213013, + "epoch": 0.463473466574776, + "grad_norm": 3.0627663135528564, + "learning_rate": 9.480713871664241e-08, + "logits/chosen": -2.9041428565979004, + "logits/rejected": -2.8821544647216797, + "logps/chosen": -62.15593338012695, + "logps/rejected": -65.29117584228516, + "loss": 0.6753, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.11514836549758911, + "rewards/margins": 0.038897983729839325, + "rewards/rejected": -0.15404634177684784, "step": 2690 }, { - "epoch": 0.47, - "grad_norm": 15.446403175724251, - "learning_rate": 4.7370106103480013e-07, - "logits/chosen": -1.7358205318450928, - "logits/rejected": -1.6975898742675781, - "logps/chosen": -156.6081085205078, - "logps/rejected": -196.40362548828125, - "loss": 0.5888, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.033973217010498, - "rewards/margins": 0.3954750895500183, - "rewards/rejected": -1.4294483661651611, + "epoch": 0.4651964162646451, + "grad_norm": 2.908506393432617, + "learning_rate": 9.474021220696002e-08, + "logits/chosen": -2.9324769973754883, + "logits/rejected": -2.9060561656951904, + "logps/chosen": -63.67100143432617, + "logps/rejected": -67.40882873535156, + "loss": 0.6768, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.1044023260474205, + "rewards/margins": 0.03492189943790436, + "rewards/rejected": -0.13932421803474426, "step": 2700 }, { - "epoch": 0.47, - "eval_logits/chosen": -1.8467351198196411, - "eval_logits/rejected": -1.8293933868408203, - "eval_logps/chosen": -149.63987731933594, - "eval_logps/rejected": -176.10572814941406, - "eval_loss": 0.6302607655525208, - "eval_rewards/accuracies": 0.645213782787323, - "eval_rewards/chosen": -0.9093602895736694, - "eval_rewards/margins": 0.2201230674982071, - "eval_rewards/rejected": -1.1294833421707153, - "eval_runtime": 356.6899, - "eval_samples_per_second": 12.067, - "eval_steps_per_second": 1.508, + "epoch": 0.4651964162646451, + "eval_logits/chosen": -3.0427258014678955, + "eval_logits/rejected": -3.037022352218628, + "eval_logps/chosen": -65.44750213623047, + "eval_logps/rejected": -71.71359252929688, + "eval_loss": 0.6851440072059631, + "eval_rewards/accuracies": 0.595724880695343, + "eval_rewards/chosen": -0.06735601276159286, + "eval_rewards/margins": 0.017978651449084282, + "eval_rewards/rejected": -0.0853346586227417, + "eval_runtime": 359.3357, + "eval_samples_per_second": 11.978, + "eval_steps_per_second": 1.497, "step": 2700 }, { - "epoch": 0.47, - "grad_norm": 16.878338793520253, - "learning_rate": 4.733644055404687e-07, - "logits/chosen": -1.7432657480239868, - "logits/rejected": -1.7117999792099, - "logps/chosen": -164.71484375, - "logps/rejected": -207.14260864257812, - "loss": 0.5639, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.052567481994629, - "rewards/margins": 0.4653971791267395, - "rewards/rejected": -1.5179646015167236, + "epoch": 0.4669193659545141, + "grad_norm": 3.442694664001465, + "learning_rate": 9.467288110809373e-08, + "logits/chosen": -2.9548001289367676, + "logits/rejected": -2.9361233711242676, + "logps/chosen": -69.58091735839844, + "logps/rejected": -67.78138732910156, + "loss": 0.6832, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10134278237819672, + "rewards/margins": 0.022963717579841614, + "rewards/rejected": -0.12430648505687714, "step": 2710 }, { - "epoch": 0.47, - "grad_norm": 20.497517981633035, - "learning_rate": 4.7302573014461935e-07, - "logits/chosen": -1.7307226657867432, - "logits/rejected": -1.7192401885986328, - "logps/chosen": -163.8205108642578, - "logps/rejected": -208.58010864257812, - "loss": 0.5986, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.1179062128067017, - "rewards/margins": 0.430508553981781, - "rewards/rejected": -1.548414707183838, + "epoch": 0.4686423156443832, + "grad_norm": 3.124640464782715, + "learning_rate": 9.460514602892386e-08, + "logits/chosen": -2.9450113773345947, + "logits/rejected": -2.948533773422241, + "logps/chosen": -63.363075256347656, + "logps/rejected": -67.94403839111328, + "loss": 0.6807, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11315824836492538, + "rewards/margins": 0.02865438722074032, + "rewards/rejected": -0.14181265234947205, "step": 2720 }, { - "epoch": 0.47, - "grad_norm": 19.634240873533788, - "learning_rate": 4.7268503790991977e-07, - "logits/chosen": -1.760005235671997, - "logits/rejected": -1.728356957435608, - "logps/chosen": -156.45030212402344, - "logps/rejected": -194.1329803466797, - "loss": 0.5988, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.9790525436401367, - "rewards/margins": 0.4097796380519867, - "rewards/rejected": -1.3888323307037354, + "epoch": 0.4703652653342522, + "grad_norm": 3.505093812942505, + "learning_rate": 9.453700758198396e-08, + "logits/chosen": -2.944913387298584, + "logits/rejected": -2.925502300262451, + "logps/chosen": -69.31742858886719, + "logps/rejected": -69.01850128173828, + "loss": 0.6794, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.10750722885131836, + "rewards/margins": 0.030425354838371277, + "rewards/rejected": -0.13793259859085083, "step": 2730 }, { - "epoch": 0.47, - "grad_norm": 15.780929737120895, - "learning_rate": 4.72342331917276e-07, - "logits/chosen": -1.7603209018707275, - "logits/rejected": -1.730158805847168, - "logps/chosen": -138.56451416015625, - "logps/rejected": -176.93080139160156, - "loss": 0.5707, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.8573589324951172, - "rewards/margins": 0.41820549964904785, - "rewards/rejected": -1.275564432144165, + "epoch": 0.4720882150241213, + "grad_norm": 3.1059038639068604, + "learning_rate": 9.446846638345521e-08, + "logits/chosen": -2.9309005737304688, + "logits/rejected": -2.90991473197937, + "logps/chosen": -64.31190490722656, + "logps/rejected": -63.59071731567383, + "loss": 0.6811, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.1145264282822609, + "rewards/margins": 0.027553927153348923, + "rewards/rejected": -0.14208035171031952, "step": 2740 }, { - "epoch": 0.47, - "grad_norm": 20.00693735071868, - "learning_rate": 4.7199761526580484e-07, - "logits/chosen": -1.6731714010238647, - "logits/rejected": -1.6483711004257202, - "logps/chosen": -145.03150939941406, - "logps/rejected": -201.181640625, - "loss": 0.5471, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.9579359292984009, - "rewards/margins": 0.5196166038513184, - "rewards/rejected": -1.4775525331497192, + "epoch": 0.4738111647139904, + "grad_norm": 3.1940810680389404, + "learning_rate": 9.439952305316097e-08, + "logits/chosen": -2.8625340461730957, + "logits/rejected": -2.8576200008392334, + "logps/chosen": -61.2120361328125, + "logps/rejected": -68.39119720458984, + "loss": 0.6797, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.11948289722204208, + "rewards/margins": 0.030159134417772293, + "rewards/rejected": -0.14964203536510468, "step": 2750 }, { - "epoch": 0.48, - "grad_norm": 17.11025850674512, - "learning_rate": 4.7165089107280536e-07, - "logits/chosen": -1.6770479679107666, - "logits/rejected": -1.643370270729065, - "logps/chosen": -151.8813018798828, - "logps/rejected": -205.90744018554688, - "loss": 0.5535, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.9763303995132446, - "rewards/margins": 0.5348490476608276, - "rewards/rejected": -1.5111793279647827, + "epoch": 0.4755341144038594, + "grad_norm": 3.294135332107544, + "learning_rate": 9.433017821456108e-08, + "logits/chosen": -2.9133965969085693, + "logits/rejected": -2.8962509632110596, + "logps/chosen": -65.8562240600586, + "logps/rejected": -70.20357513427734, + "loss": 0.6757, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.11581240594387054, + "rewards/margins": 0.0381658673286438, + "rewards/rejected": -0.15397830307483673, "step": 2760 }, { - "epoch": 0.48, - "grad_norm": 19.624859017470257, - "learning_rate": 4.7130216247373123e-07, - "logits/chosen": -1.7125215530395508, - "logits/rejected": -1.6670726537704468, - "logps/chosen": -179.84864807128906, - "logps/rejected": -230.32839965820312, - "loss": 0.5671, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2465837001800537, - "rewards/margins": 0.5265911817550659, - "rewards/rejected": -1.7731748819351196, + "epoch": 0.4772570640937285, + "grad_norm": 3.4492290019989014, + "learning_rate": 9.426043249474624e-08, + "logits/chosen": -3.0412955284118652, + "logits/rejected": -3.011286973953247, + "logps/chosen": -66.13185119628906, + "logps/rejected": -67.03375244140625, + "loss": 0.6794, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.10917095839977264, + "rewards/margins": 0.030780524015426636, + "rewards/rejected": -0.13995149731636047, "step": 2770 }, { - "epoch": 0.48, - "grad_norm": 18.39672002807499, - "learning_rate": 4.7095143262216203e-07, - "logits/chosen": -1.5359172821044922, - "logits/rejected": -1.4871833324432373, - "logps/chosen": -188.5769805908203, - "logps/rejected": -238.5513153076172, - "loss": 0.5609, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3573224544525146, - "rewards/margins": 0.5221356749534607, - "rewards/rejected": -1.8794580698013306, + "epoch": 0.4789800137835975, + "grad_norm": 3.045886754989624, + "learning_rate": 9.41902865244324e-08, + "logits/chosen": -2.8621273040771484, + "logits/rejected": -2.8325142860412598, + "logps/chosen": -64.79789733886719, + "logps/rejected": -65.37285614013672, + "loss": 0.6807, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.11969073861837387, + "rewards/margins": 0.027886349707841873, + "rewards/rejected": -0.14757707715034485, "step": 2780 }, { - "epoch": 0.48, - "grad_norm": 25.940828538741872, - "learning_rate": 4.705987046897748e-07, - "logits/chosen": -1.6233654022216797, - "logits/rejected": -1.58616042137146, - "logps/chosen": -184.6060028076172, - "logps/rejected": -230.30361938476562, - "loss": 0.578, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.3283016681671143, - "rewards/margins": 0.4737378656864166, - "rewards/rejected": -1.8020395040512085, + "epoch": 0.4807029634734666, + "grad_norm": 2.9073054790496826, + "learning_rate": 9.411974093795497e-08, + "logits/chosen": -2.977447748184204, + "logits/rejected": -2.9443159103393555, + "logps/chosen": -63.461631774902344, + "logps/rejected": -65.23011779785156, + "loss": 0.6774, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.11701373755931854, + "rewards/margins": 0.034158896654844284, + "rewards/rejected": -0.15117263793945312, "step": 2790 }, { - "epoch": 0.48, - "grad_norm": 17.60526676910467, - "learning_rate": 4.7024398186631533e-07, - "logits/chosen": -1.6539256572723389, - "logits/rejected": -1.6196858882904053, - "logps/chosen": -191.15408325195312, - "logps/rejected": -228.4556121826172, - "loss": 0.6328, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.3112226724624634, - "rewards/margins": 0.4050907492637634, - "rewards/rejected": -1.7163136005401611, + "epoch": 0.4824259131633356, + "grad_norm": 3.480588912963867, + "learning_rate": 9.404879637326307e-08, + "logits/chosen": -2.9597458839416504, + "logits/rejected": -2.934946060180664, + "logps/chosen": -71.83543395996094, + "logps/rejected": -72.26683807373047, + "loss": 0.6766, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.11799754202365875, + "rewards/margins": 0.03629498928785324, + "rewards/rejected": -0.15429255366325378, "step": 2800 }, { - "epoch": 0.48, - "eval_logits/chosen": -1.8423043489456177, - "eval_logits/rejected": -1.8252124786376953, - "eval_logps/chosen": -142.36798095703125, - "eval_logps/rejected": -167.40048217773438, - "eval_loss": 0.6315863728523254, - "eval_rewards/accuracies": 0.6419609785079956, - "eval_rewards/chosen": -0.8366413712501526, - "eval_rewards/margins": 0.20578964054584503, - "eval_rewards/rejected": -1.042431116104126, - "eval_runtime": 356.6408, - "eval_samples_per_second": 12.068, - "eval_steps_per_second": 1.509, + "epoch": 0.4824259131633356, + "eval_logits/chosen": -3.036466121673584, + "eval_logits/rejected": -3.0307648181915283, + "eval_logps/chosen": -65.97711181640625, + "eval_logps/rejected": -72.36688995361328, + "eval_loss": 0.6845990419387817, + "eval_rewards/accuracies": 0.5966542959213257, + "eval_rewards/chosen": -0.07265209406614304, + "eval_rewards/margins": 0.019215548411011696, + "eval_rewards/rejected": -0.09186764806509018, + "eval_runtime": 360.3298, + "eval_samples_per_second": 11.945, + "eval_steps_per_second": 1.493, "step": 2800 }, { - "epoch": 0.48, - "grad_norm": 17.01308659310898, - "learning_rate": 4.6988726735956953e-07, - "logits/chosen": -1.6734424829483032, - "logits/rejected": -1.633302092552185, - "logps/chosen": -146.92413330078125, - "logps/rejected": -195.38314819335938, - "loss": 0.5504, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.9160858392715454, - "rewards/margins": 0.48056167364120483, - "rewards/rejected": -1.3966474533081055, + "epoch": 0.4841488628532047, + "grad_norm": 3.248321056365967, + "learning_rate": 9.397745347191391e-08, + "logits/chosen": -2.8508946895599365, + "logits/rejected": -2.826793909072876, + "logps/chosen": -66.67686462402344, + "logps/rejected": -71.41950988769531, + "loss": 0.6734, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.11341060698032379, + "rewards/margins": 0.04314876347780228, + "rewards/rejected": -0.15655937790870667, "step": 2810 }, { - "epoch": 0.49, - "grad_norm": 15.867862572496643, - "learning_rate": 4.69528564395334e-07, - "logits/chosen": -1.8120372295379639, - "logits/rejected": -1.792083740234375, - "logps/chosen": -149.9076385498047, - "logps/rejected": -176.41195678710938, - "loss": 0.6392, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.9560129046440125, - "rewards/margins": 0.27826496958732605, - "rewards/rejected": -1.2342779636383057, + "epoch": 0.48587181254307377, + "grad_norm": 3.388293504714966, + "learning_rate": 9.39057128790668e-08, + "logits/chosen": -2.998136520385742, + "logits/rejected": -2.9863076210021973, + "logps/chosen": -65.70792388916016, + "logps/rejected": -67.45301818847656, + "loss": 0.6795, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.11380715668201447, + "rewards/margins": 0.03052491322159767, + "rewards/rejected": -0.14433208107948303, "step": 2820 }, { - "epoch": 0.49, - "grad_norm": 13.828136544925801, - "learning_rate": 4.691678762173874e-07, - "logits/chosen": -1.6588958501815796, - "logits/rejected": -1.6307001113891602, - "logps/chosen": -137.5013427734375, - "logps/rejected": -186.1872100830078, - "loss": 0.5436, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.8202708959579468, - "rewards/margins": 0.49207648634910583, - "rewards/rejected": -1.3123472929000854, + "epoch": 0.4875947622329428, + "grad_norm": 3.0719292163848877, + "learning_rate": 9.383357524347748e-08, + "logits/chosen": -2.8031680583953857, + "logits/rejected": -2.788007974624634, + "logps/chosen": -66.85478210449219, + "logps/rejected": -70.41021728515625, + "loss": 0.6748, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.11354263871908188, + "rewards/margins": 0.040914587676525116, + "rewards/rejected": -0.154457226395607, "step": 2830 }, { - "epoch": 0.49, - "grad_norm": 12.998742695216233, - "learning_rate": 4.6880520608746065e-07, - "logits/chosen": -1.766371726989746, - "logits/rejected": -1.7423028945922852, - "logps/chosen": -146.57913208007812, - "logps/rejected": -185.55625915527344, - "loss": 0.5919, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.9395810961723328, - "rewards/margins": 0.4023555815219879, - "rewards/rejected": -1.341936707496643, + "epoch": 0.48931771192281187, + "grad_norm": 3.0653076171875, + "learning_rate": 9.376104121749213e-08, + "logits/chosen": -2.9727180004119873, + "logits/rejected": -2.961493968963623, + "logps/chosen": -64.31333923339844, + "logps/rejected": -66.9847412109375, + "loss": 0.6751, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.1167706847190857, + "rewards/margins": 0.03927644342184067, + "rewards/rejected": -0.15604713559150696, "step": 2840 }, { - "epoch": 0.49, - "grad_norm": 16.674011214819668, - "learning_rate": 4.684405572852077e-07, - "logits/chosen": -1.6769888401031494, - "logits/rejected": -1.6428205966949463, - "logps/chosen": -159.1728515625, - "logps/rejected": -215.3779754638672, - "loss": 0.5433, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.0694429874420166, - "rewards/margins": 0.5360538363456726, - "rewards/rejected": -1.6054970026016235, + "epoch": 0.4910406616126809, + "grad_norm": 3.3679287433624268, + "learning_rate": 9.368811145704154e-08, + "logits/chosen": -2.9329311847686768, + "logits/rejected": -2.9137070178985596, + "logps/chosen": -64.20672607421875, + "logps/rejected": -71.63908386230469, + "loss": 0.6709, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.11960289627313614, + "rewards/margins": 0.04830095171928406, + "rewards/rejected": -0.1679038554430008, "step": 2850 }, { - "epoch": 0.49, - "grad_norm": 16.33436683362672, - "learning_rate": 4.680739331081757e-07, - "logits/chosen": -1.662724494934082, - "logits/rejected": -1.6217238903045654, - "logps/chosen": -157.78176879882812, - "logps/rejected": -207.1402130126953, - "loss": 0.5372, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.0284638404846191, - "rewards/margins": 0.5234465003013611, - "rewards/rejected": -1.5519102811813354, + "epoch": 0.49276361130255, + "grad_norm": 2.8686556816101074, + "learning_rate": 9.361478662163515e-08, + "logits/chosen": -2.9130587577819824, + "logits/rejected": -2.8891494274139404, + "logps/chosen": -66.36784362792969, + "logps/rejected": -67.8857421875, + "loss": 0.6724, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.11403318494558334, + "rewards/margins": 0.04504844546318054, + "rewards/rejected": -0.1590816229581833, "step": 2860 }, { - "epoch": 0.49, - "grad_norm": 20.916547924866293, - "learning_rate": 4.677053368717754e-07, - "logits/chosen": -1.682941198348999, - "logits/rejected": -1.6458778381347656, - "logps/chosen": -167.31607055664062, - "logps/rejected": -218.1786346435547, - "loss": 0.5674, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.1120381355285645, - "rewards/margins": 0.539524257183075, - "rewards/rejected": -1.6515624523162842, + "epoch": 0.494486560992419, + "grad_norm": 3.327705144882202, + "learning_rate": 9.354106737435507e-08, + "logits/chosen": -2.953275203704834, + "logits/rejected": -2.9302539825439453, + "logps/chosen": -67.18695831298828, + "logps/rejected": -67.57176208496094, + "loss": 0.6779, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.11071185022592545, + "rewards/margins": 0.03463593125343323, + "rewards/rejected": -0.14534778892993927, "step": 2870 }, { - "epoch": 0.5, - "grad_norm": 18.373461916648946, - "learning_rate": 4.6733477190925073e-07, - "logits/chosen": -1.7388041019439697, - "logits/rejected": -1.6905145645141602, - "logps/chosen": -175.05714416503906, - "logps/rejected": -221.7596435546875, - "loss": 0.5676, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.1971012353897095, - "rewards/margins": 0.5112650394439697, - "rewards/rejected": -1.7083663940429688, + "epoch": 0.4962095106822881, + "grad_norm": 3.8654720783233643, + "learning_rate": 9.346695438185015e-08, + "logits/chosen": -3.0281896591186523, + "logits/rejected": -3.002101421356201, + "logps/chosen": -67.46855926513672, + "logps/rejected": -66.77601623535156, + "loss": 0.6764, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.12120544910430908, + "rewards/margins": 0.03729164972901344, + "rewards/rejected": -0.15849709510803223, "step": 2880 }, { - "epoch": 0.5, - "grad_norm": 22.994357222490983, - "learning_rate": 4.6696224157164943e-07, - "logits/chosen": -1.7159115076065063, - "logits/rejected": -1.690751075744629, - "logps/chosen": -173.77462768554688, - "logps/rejected": -222.635986328125, - "loss": 0.5746, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.2083570957183838, - "rewards/margins": 0.49303531646728516, - "rewards/rejected": -1.701392412185669, + "epoch": 0.49793246037215716, + "grad_norm": 2.923661470413208, + "learning_rate": 9.339244831432988e-08, + "logits/chosen": -3.037105083465576, + "logits/rejected": -3.0316238403320312, + "logps/chosen": -64.63816833496094, + "logps/rejected": -68.85662078857422, + "loss": 0.6715, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.11680474132299423, + "rewards/margins": 0.04656790941953659, + "rewards/rejected": -0.16337266564369202, "step": 2890 }, { - "epoch": 0.5, - "grad_norm": 23.78726232737787, - "learning_rate": 4.6658774922779187e-07, - "logits/chosen": -1.6340763568878174, - "logits/rejected": -1.6105928421020508, - "logps/chosen": -169.45535278320312, - "logps/rejected": -216.67333984375, - "loss": 0.5746, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.16982901096344, - "rewards/margins": 0.48190441727638245, - "rewards/rejected": -1.6517333984375, + "epoch": 0.4996554100620262, + "grad_norm": 3.261712074279785, + "learning_rate": 9.331754984555838e-08, + "logits/chosen": -2.9544241428375244, + "logits/rejected": -2.944662570953369, + "logps/chosen": -63.31207275390625, + "logps/rejected": -65.96617889404297, + "loss": 0.6769, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.10824018716812134, + "rewards/margins": 0.036075785756111145, + "rewards/rejected": -0.1443159580230713, "step": 2900 }, { - "epoch": 0.5, - "eval_logits/chosen": -1.7276008129119873, - "eval_logits/rejected": -1.707594871520996, - "eval_logps/chosen": -164.17117309570312, - "eval_logps/rejected": -193.61111450195312, - "eval_loss": 0.6267496943473816, - "eval_rewards/accuracies": 0.6442843675613403, - "eval_rewards/chosen": -1.054673433303833, - "eval_rewards/margins": 0.24986399710178375, - "eval_rewards/rejected": -1.304537296295166, - "eval_runtime": 356.9094, - "eval_samples_per_second": 12.059, - "eval_steps_per_second": 1.507, + "epoch": 0.4996554100620262, + "eval_logits/chosen": -3.028932571411133, + "eval_logits/rejected": -3.0232160091400146, + "eval_logps/chosen": -66.26190948486328, + "eval_logps/rejected": -72.7197265625, + "eval_loss": 0.6843085289001465, + "eval_rewards/accuracies": 0.6003717184066772, + "eval_rewards/chosen": -0.07550010085105896, + "eval_rewards/margins": 0.0198960117995739, + "eval_rewards/rejected": -0.09539611637592316, + "eval_runtime": 361.4703, + "eval_samples_per_second": 11.907, + "eval_steps_per_second": 1.488, "step": 2900 }, { - "epoch": 0.5, - "grad_norm": 22.570926073633988, - "learning_rate": 4.662112982642412e-07, - "logits/chosen": -1.6592012643814087, - "logits/rejected": -1.6184587478637695, - "logps/chosen": -178.86044311523438, - "logps/rejected": -250.51681518554688, - "loss": 0.5009, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.2277532815933228, - "rewards/margins": 0.7185968160629272, - "rewards/rejected": -1.94635009765625, + "epoch": 0.5013783597518953, + "grad_norm": 3.804288148880005, + "learning_rate": 9.324225965284823e-08, + "logits/chosen": -2.994413137435913, + "logits/rejected": -2.982327938079834, + "logps/chosen": -67.48818969726562, + "logps/rejected": -72.01011657714844, + "loss": 0.6711, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.11381006240844727, + "rewards/margins": 0.0472860261797905, + "rewards/rejected": -0.16109611093997955, "step": 2910 }, { - "epoch": 0.5, - "grad_norm": 25.29506569233524, - "learning_rate": 4.6583289208527244e-07, - "logits/chosen": -1.5599522590637207, - "logits/rejected": -1.5313317775726318, - "logps/chosen": -197.64329528808594, - "logps/rejected": -261.9999084472656, - "loss": 0.5768, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.4407055377960205, - "rewards/margins": 0.6243780851364136, - "rewards/rejected": -2.0650835037231445, + "epoch": 0.5031013094417643, + "grad_norm": 3.164123773574829, + "learning_rate": 9.316657841705449e-08, + "logits/chosen": -2.9393081665039062, + "logits/rejected": -2.935638904571533, + "logps/chosen": -65.94876098632812, + "logps/rejected": -71.37811279296875, + "loss": 0.6775, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.1237863302230835, + "rewards/margins": 0.034934334456920624, + "rewards/rejected": -0.15872065722942352, "step": 2920 }, { - "epoch": 0.5, - "grad_norm": 17.04379311188254, - "learning_rate": 4.654525341128418e-07, - "logits/chosen": -1.5148179531097412, - "logits/rejected": -1.468490481376648, - "logps/chosen": -188.95974731445312, - "logps/rejected": -253.90164184570312, - "loss": 0.5079, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.355076789855957, - "rewards/margins": 0.6675424575805664, - "rewards/rejected": -2.0226194858551025, + "epoch": 0.5048242591316333, + "grad_norm": 3.0420029163360596, + "learning_rate": 9.309050682256836e-08, + "logits/chosen": -2.9117321968078613, + "logits/rejected": -2.884338140487671, + "logps/chosen": -65.09063720703125, + "logps/rejected": -68.64939880371094, + "loss": 0.6684, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.11639396101236343, + "rewards/margins": 0.05342023819684982, + "rewards/rejected": -0.16981419920921326, "step": 2930 }, { - "epoch": 0.51, - "grad_norm": 14.086402307191449, - "learning_rate": 4.650702277865558e-07, - "logits/chosen": -1.5800873041152954, - "logits/rejected": -1.5371129512786865, - "logps/chosen": -182.9086456298828, - "logps/rejected": -230.81295776367188, - "loss": 0.5955, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3146532773971558, - "rewards/margins": 0.48588043451309204, - "rewards/rejected": -1.8005338907241821, + "epoch": 0.5065472088215024, + "grad_norm": 3.0785973072052, + "learning_rate": 9.301404555731116e-08, + "logits/chosen": -2.9413981437683105, + "logits/rejected": -2.9049649238586426, + "logps/chosen": -63.005767822265625, + "logps/rejected": -66.04293823242188, + "loss": 0.6764, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.1154542937874794, + "rewards/margins": 0.03712349012494087, + "rewards/rejected": -0.15257780253887177, "step": 2940 }, { - "epoch": 0.51, - "grad_norm": 21.69271778749331, - "learning_rate": 4.6468597656363994e-07, - "logits/chosen": -1.6005538702011108, - "logits/rejected": -1.566699743270874, - "logps/chosen": -182.33128356933594, - "logps/rejected": -239.34439086914062, - "loss": 0.5601, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.2800660133361816, - "rewards/margins": 0.56809401512146, - "rewards/rejected": -1.8481600284576416, + "epoch": 0.5082701585113715, + "grad_norm": 2.9522688388824463, + "learning_rate": 9.293719531272799e-08, + "logits/chosen": -2.9734482765197754, + "logits/rejected": -2.9552502632141113, + "logps/chosen": -65.55525207519531, + "logps/rejected": -70.6024398803711, + "loss": 0.6712, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.11234132945537567, + "rewards/margins": 0.048349231481552124, + "rewards/rejected": -0.1606905460357666, "step": 2950 }, { - "epoch": 0.51, - "grad_norm": 19.85110567457654, - "learning_rate": 4.6429978391890756e-07, - "logits/chosen": -1.5460537672042847, - "logits/rejected": -1.498718500137329, - "logps/chosen": -180.05801391601562, - "logps/rejected": -235.87344360351562, - "loss": 0.5485, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.2613499164581299, - "rewards/margins": 0.5788403749465942, - "rewards/rejected": -1.8401902914047241, + "epoch": 0.5099931082012406, + "grad_norm": 3.1853420734405518, + "learning_rate": 9.285995678378151e-08, + "logits/chosen": -2.9326086044311523, + "logits/rejected": -2.905928134918213, + "logps/chosen": -65.61602783203125, + "logps/rejected": -68.58934020996094, + "loss": 0.6703, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.11668062210083008, + "rewards/margins": 0.05025554820895195, + "rewards/rejected": -0.16693618893623352, "step": 2960 }, { - "epoch": 0.51, - "grad_norm": 23.495999485202848, - "learning_rate": 4.639116533447286e-07, - "logits/chosen": -1.4766029119491577, - "logits/rejected": -1.4375579357147217, - "logps/chosen": -196.3876190185547, - "logps/rejected": -245.5087432861328, - "loss": 0.582, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.4116392135620117, - "rewards/margins": 0.519368588924408, - "rewards/rejected": -1.931007742881775, + "epoch": 0.5117160578911096, + "grad_norm": 3.8613901138305664, + "learning_rate": 9.278233066894572e-08, + "logits/chosen": -2.8656792640686035, + "logits/rejected": -2.838275909423828, + "logps/chosen": -67.8607406616211, + "logps/rejected": -68.91703796386719, + "loss": 0.6755, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1262323409318924, + "rewards/margins": 0.038799136877059937, + "rewards/rejected": -0.16503146290779114, "step": 2970 }, { - "epoch": 0.51, - "grad_norm": 22.165196277613042, - "learning_rate": 4.635215883509976e-07, - "logits/chosen": -1.5197012424468994, - "logits/rejected": -1.47576105594635, - "logps/chosen": -182.31118774414062, - "logps/rejected": -240.2295379638672, - "loss": 0.5411, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.2836403846740723, - "rewards/margins": 0.5962404012680054, - "rewards/rejected": -1.8798809051513672, + "epoch": 0.5134390075809786, + "grad_norm": 3.3221471309661865, + "learning_rate": 9.270431767019951e-08, + "logits/chosen": -2.8628203868865967, + "logits/rejected": -2.8370893001556396, + "logps/chosen": -64.91873931884766, + "logps/rejected": -68.04789733886719, + "loss": 0.6711, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.10979530960321426, + "rewards/margins": 0.04787523299455643, + "rewards/rejected": -0.1576705276966095, "step": 2980 }, { - "epoch": 0.52, - "grad_norm": 18.23553989348222, - "learning_rate": 4.6312959246510234e-07, - "logits/chosen": -1.6128461360931396, - "logits/rejected": -1.5682920217514038, - "logps/chosen": -174.17776489257812, - "logps/rejected": -225.26742553710938, - "loss": 0.5519, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.1757910251617432, - "rewards/margins": 0.5307731032371521, - "rewards/rejected": -1.70656418800354, + "epoch": 0.5151619572708477, + "grad_norm": 3.131523370742798, + "learning_rate": 9.262591849302047e-08, + "logits/chosen": -2.9078562259674072, + "logits/rejected": -2.8803505897521973, + "logps/chosen": -69.05020141601562, + "logps/rejected": -71.10308837890625, + "loss": 0.6748, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.12440363317728043, + "rewards/margins": 0.04048031568527222, + "rewards/rejected": -0.16488394141197205, "step": 2990 }, { - "epoch": 0.52, - "grad_norm": 13.778675539788516, - "learning_rate": 4.627356692318919e-07, - "logits/chosen": -1.6555538177490234, - "logits/rejected": -1.6289546489715576, - "logps/chosen": -151.74960327148438, - "logps/rejected": -207.7194061279297, - "loss": 0.5452, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.9974144697189331, - "rewards/margins": 0.5410576462745667, - "rewards/rejected": -1.5384724140167236, + "epoch": 0.5168849069607168, + "grad_norm": 2.872863292694092, + "learning_rate": 9.254713384637838e-08, + "logits/chosen": -2.918975830078125, + "logits/rejected": -2.913259744644165, + "logps/chosen": -64.02835845947266, + "logps/rejected": -69.40301513671875, + "loss": 0.6781, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.12016010284423828, + "rewards/margins": 0.03492623567581177, + "rewards/rejected": -0.15508633852005005, "step": 3000 }, { - "epoch": 0.52, - "eval_logits/chosen": -1.7558156251907349, - "eval_logits/rejected": -1.7363479137420654, - "eval_logps/chosen": -150.7609405517578, - "eval_logps/rejected": -178.5692138671875, - "eval_loss": 0.6288471221923828, - "eval_rewards/accuracies": 0.6463754773139954, - "eval_rewards/chosen": -0.9205708503723145, - "eval_rewards/margins": 0.23354758322238922, - "eval_rewards/rejected": -1.1541184186935425, - "eval_runtime": 356.7483, - "eval_samples_per_second": 12.065, - "eval_steps_per_second": 1.508, + "epoch": 0.5168849069607168, + "eval_logits/chosen": -3.020082950592041, + "eval_logits/rejected": -3.014353036880493, + "eval_logps/chosen": -66.83290100097656, + "eval_logps/rejected": -73.39954376220703, + "eval_loss": 0.6838503479957581, + "eval_rewards/accuracies": 0.6026951670646667, + "eval_rewards/chosen": -0.08121006190776825, + "eval_rewards/margins": 0.020984075963497162, + "eval_rewards/rejected": -0.10219414532184601, + "eval_runtime": 360.8285, + "eval_samples_per_second": 11.928, + "eval_steps_per_second": 1.491, "step": 3000 }, { - "epoch": 0.52, - "grad_norm": 16.90078566654223, - "learning_rate": 4.623398222136443e-07, - "logits/chosen": -1.6691395044326782, - "logits/rejected": -1.6284288167953491, - "logps/chosen": -161.9709930419922, - "logps/rejected": -210.35195922851562, - "loss": 0.5663, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.0677728652954102, - "rewards/margins": 0.5049013495445251, - "rewards/rejected": -1.57267427444458, + "epoch": 0.5186078566505858, + "grad_norm": 3.5088436603546143, + "learning_rate": 9.246796444272887e-08, + "logits/chosen": -2.9433512687683105, + "logits/rejected": -2.920409679412842, + "logps/chosen": -68.26364135742188, + "logps/rejected": -68.73377227783203, + "loss": 0.6819, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.13075979053974152, + "rewards/margins": 0.025632452219724655, + "rewards/rejected": -0.15639221668243408, "step": 3010 }, { - "epoch": 0.52, - "grad_norm": 21.480798300954746, - "learning_rate": 4.6194205499003467e-07, - "logits/chosen": -1.7338823080062866, - "logits/rejected": -1.6823310852050781, - "logps/chosen": -166.0416259765625, - "logps/rejected": -225.45706176757812, - "loss": 0.5425, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.1181974411010742, - "rewards/margins": 0.6333866119384766, - "rewards/rejected": -1.7515838146209717, + "epoch": 0.5203308063404548, + "grad_norm": 3.4233145713806152, + "learning_rate": 9.238841099800693e-08, + "logits/chosen": -3.025982141494751, + "logits/rejected": -2.9952666759490967, + "logps/chosen": -68.6304931640625, + "logps/rejected": -68.3849868774414, + "loss": 0.6776, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.14391957223415375, + "rewards/margins": 0.036774180829524994, + "rewards/rejected": -0.18069376051425934, "step": 3020 }, { - "epoch": 0.52, - "grad_norm": 17.722022537225527, - "learning_rate": 4.615423711581027e-07, - "logits/chosen": -1.6567986011505127, - "logits/rejected": -1.6222938299179077, - "logps/chosen": -170.3144989013672, - "logps/rejected": -208.64501953125, - "loss": 0.6058, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1464793682098389, - "rewards/margins": 0.3912624716758728, - "rewards/rejected": -1.5377418994903564, + "epoch": 0.5220537560303239, + "grad_norm": 3.3919730186462402, + "learning_rate": 9.230847423162053e-08, + "logits/chosen": -2.9319372177124023, + "logits/rejected": -2.910442352294922, + "logps/chosen": -69.41728973388672, + "logps/rejected": -71.12834167480469, + "loss": 0.6823, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1374237835407257, + "rewards/margins": 0.024965396150946617, + "rewards/rejected": -0.16238918900489807, "step": 3030 }, { - "epoch": 0.52, - "grad_norm": 15.214602160313902, - "learning_rate": 4.6114077433221994e-07, - "logits/chosen": -1.7444251775741577, - "logits/rejected": -1.715855598449707, - "logps/chosen": -158.28477478027344, - "logps/rejected": -213.8212432861328, - "loss": 0.5529, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.0712274312973022, - "rewards/margins": 0.5239061713218689, - "rewards/rejected": -1.5951335430145264, + "epoch": 0.523776705720193, + "grad_norm": 3.2684762477874756, + "learning_rate": 9.222815486644399e-08, + "logits/chosen": -3.0326926708221436, + "logits/rejected": -3.025906801223755, + "logps/chosen": -65.07283020019531, + "logps/rejected": -71.50475311279297, + "loss": 0.6783, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.13885042071342468, + "rewards/margins": 0.0330406129360199, + "rewards/rejected": -0.17189103364944458, "step": 3040 }, { - "epoch": 0.53, - "grad_norm": 22.969445403757415, - "learning_rate": 4.6073726814405746e-07, - "logits/chosen": -1.6354888677597046, - "logits/rejected": -1.6061290502548218, - "logps/chosen": -153.43170166015625, - "logps/rejected": -197.95614624023438, - "loss": 0.5946, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.9988822937011719, - "rewards/margins": 0.4272375702857971, - "rewards/rejected": -1.4261198043823242, + "epoch": 0.525499655410062, + "grad_norm": 2.9352407455444336, + "learning_rate": 9.214745362881149e-08, + "logits/chosen": -2.8935718536376953, + "logits/rejected": -2.8804633617401123, + "logps/chosen": -66.72821044921875, + "logps/rejected": -71.2332534790039, + "loss": 0.6813, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.13170121610164642, + "rewards/margins": 0.02705610729753971, + "rewards/rejected": -0.1587573140859604, "step": 3050 }, { - "epoch": 0.53, - "grad_norm": 28.86788571866789, - "learning_rate": 4.6033185624255276e-07, - "logits/chosen": -1.6350476741790771, - "logits/rejected": -1.6002562046051025, - "logps/chosen": -153.01419067382812, - "logps/rejected": -200.2187042236328, - "loss": 0.5541, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.9734545946121216, - "rewards/margins": 0.48509782552719116, - "rewards/rejected": -1.458552360534668, + "epoch": 0.5272226050999311, + "grad_norm": 3.934908628463745, + "learning_rate": 9.206637124851055e-08, + "logits/chosen": -2.92893648147583, + "logits/rejected": -2.9105372428894043, + "logps/chosen": -68.1148910522461, + "logps/rejected": -70.24058532714844, + "loss": 0.678, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.1243911162018776, + "rewards/margins": 0.03433989733457565, + "rewards/rejected": -0.15873101353645325, "step": 3060 }, { - "epoch": 0.53, - "grad_norm": 21.065042836310123, - "learning_rate": 4.5992454229387693e-07, - "logits/chosen": -1.5526440143585205, - "logits/rejected": -1.5073213577270508, - "logps/chosen": -173.8745574951172, - "logps/rejected": -220.6032257080078, - "loss": 0.5811, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.1593778133392334, - "rewards/margins": 0.5076309442520142, - "rewards/rejected": -1.6670089960098267, + "epoch": 0.5289455547898001, + "grad_norm": 3.168501615524292, + "learning_rate": 9.19849084587754e-08, + "logits/chosen": -2.889446973800659, + "logits/rejected": -2.863997220993042, + "logps/chosen": -71.57978820800781, + "logps/rejected": -71.05496978759766, + "loss": 0.6775, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.1363719403743744, + "rewards/margins": 0.03479612618684769, + "rewards/rejected": -0.17116807401180267, "step": 3070 }, { - "epoch": 0.53, - "grad_norm": 19.310546719568105, - "learning_rate": 4.5951532998140136e-07, - "logits/chosen": -1.4362452030181885, - "logits/rejected": -1.399596929550171, - "logps/chosen": -182.4234161376953, - "logps/rejected": -246.11245727539062, - "loss": 0.5594, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.3157217502593994, - "rewards/margins": 0.5977397561073303, - "rewards/rejected": -1.913461685180664, + "epoch": 0.5306685044796692, + "grad_norm": 3.003641128540039, + "learning_rate": 9.190306599628027e-08, + "logits/chosen": -2.8035268783569336, + "logits/rejected": -2.786806583404541, + "logps/chosen": -65.42008972167969, + "logps/rejected": -72.64752960205078, + "loss": 0.6797, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.14561638236045837, + "rewards/margins": 0.03284695744514465, + "rewards/rejected": -0.17846335470676422, "step": 3080 }, { - "epoch": 0.53, - "grad_norm": 20.275727141362044, - "learning_rate": 4.591042230056644e-07, - "logits/chosen": -1.5431610345840454, - "logits/rejected": -1.5028966665267944, - "logps/chosen": -162.43585205078125, - "logps/rejected": -222.4371337890625, - "loss": 0.5203, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.0885474681854248, - "rewards/margins": 0.5989712476730347, - "rewards/rejected": -1.6875184774398804, + "epoch": 0.5323914541695383, + "grad_norm": 3.1618595123291016, + "learning_rate": 9.182084460113288e-08, + "logits/chosen": -2.887605667114258, + "logits/rejected": -2.868880033493042, + "logps/chosen": -66.78776550292969, + "logps/rejected": -71.1986312866211, + "loss": 0.6735, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.13175687193870544, + "rewards/margins": 0.043186891824007034, + "rewards/rejected": -0.17494376003742218, "step": 3090 }, { - "epoch": 0.53, - "grad_norm": 22.85769406936058, - "learning_rate": 4.586912250843383e-07, - "logits/chosen": -1.49831223487854, - "logits/rejected": -1.4446604251861572, - "logps/chosen": -172.95877075195312, - "logps/rejected": -234.62548828125, - "loss": 0.5525, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.1904902458190918, - "rewards/margins": 0.6400495767593384, - "rewards/rejected": -1.8305397033691406, + "epoch": 0.5341144038594073, + "grad_norm": 4.041504383087158, + "learning_rate": 9.173824501686767e-08, + "logits/chosen": -2.9190025329589844, + "logits/rejected": -2.889906167984009, + "logps/chosen": -66.44412231445312, + "logps/rejected": -69.22412109375, + "loss": 0.67, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.1252371370792389, + "rewards/margins": 0.05129619315266609, + "rewards/rejected": -0.17653335630893707, "step": 3100 }, { - "epoch": 0.53, - "eval_logits/chosen": -1.6323180198669434, - "eval_logits/rejected": -1.6100775003433228, - "eval_logps/chosen": -161.87399291992188, - "eval_logps/rejected": -193.96153259277344, - "eval_loss": 0.623075008392334, - "eval_rewards/accuracies": 0.6563661694526672, - "eval_rewards/chosen": -1.0317014455795288, - "eval_rewards/margins": 0.2763398587703705, - "eval_rewards/rejected": -1.3080412149429321, - "eval_runtime": 356.7275, - "eval_samples_per_second": 12.065, - "eval_steps_per_second": 1.508, + "epoch": 0.5341144038594073, + "eval_logits/chosen": -3.0152857303619385, + "eval_logits/rejected": -3.0095441341400146, + "eval_logps/chosen": -66.92872619628906, + "eval_logps/rejected": -73.57532501220703, + "eval_loss": 0.6834792494773865, + "eval_rewards/accuracies": 0.6003717184066772, + "eval_rewards/chosen": -0.0821683257818222, + "eval_rewards/margins": 0.021783730015158653, + "eval_rewards/rejected": -0.10395205020904541, + "eval_runtime": 359.9412, + "eval_samples_per_second": 11.958, + "eval_steps_per_second": 1.495, "step": 3100 }, { - "epoch": 0.54, - "grad_norm": 25.719416790446477, - "learning_rate": 4.5827633995219485e-07, - "logits/chosen": -1.4610720872879028, - "logits/rejected": -1.4539538621902466, - "logps/chosen": -183.6023406982422, - "logps/rejected": -237.76895141601562, - "loss": 0.5884, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3302621841430664, - "rewards/margins": 0.498201847076416, - "rewards/rejected": -1.8284639120101929, + "epoch": 0.5358373535492763, + "grad_norm": 3.552891254425049, + "learning_rate": 9.165526799043897e-08, + "logits/chosen": -2.8621737957000732, + "logits/rejected": -2.8714873790740967, + "logps/chosen": -64.84125518798828, + "logps/rejected": -72.49705505371094, + "loss": 0.6787, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.14228765666484833, + "rewards/margins": 0.03320407122373581, + "rewards/rejected": -0.17549173533916473, "step": 3110 }, { - "epoch": 0.54, - "grad_norm": 14.309987948640194, - "learning_rate": 4.5785957136107234e-07, - "logits/chosen": -1.544480323791504, - "logits/rejected": -1.5000821352005005, - "logps/chosen": -165.1011505126953, - "logps/rejected": -232.72195434570312, - "loss": 0.5266, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.0910911560058594, - "rewards/margins": 0.678336501121521, - "rewards/rejected": -1.7694276571273804, + "epoch": 0.5375603032391454, + "grad_norm": 3.3261067867279053, + "learning_rate": 9.157191427221447e-08, + "logits/chosen": -2.908033609390259, + "logits/rejected": -2.8847293853759766, + "logps/chosen": -68.35820007324219, + "logps/rejected": -73.15019226074219, + "loss": 0.6705, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.12347419559955597, + "rewards/margins": 0.04997570067644119, + "rewards/rejected": -0.17344990372657776, "step": 3120 }, { - "epoch": 0.54, - "grad_norm": 18.893819003746106, - "learning_rate": 4.574409230798413e-07, - "logits/chosen": -1.4636805057525635, - "logits/rejected": -1.4383834600448608, - "logps/chosen": -155.36705017089844, - "logps/rejected": -204.67672729492188, - "loss": 0.5856, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.032982587814331, - "rewards/margins": 0.48075515031814575, - "rewards/rejected": -1.513737678527832, + "epoch": 0.5392832529290145, + "grad_norm": 3.127723217010498, + "learning_rate": 9.148818461596826e-08, + "logits/chosen": -2.8254592418670654, + "logits/rejected": -2.8169493675231934, + "logps/chosen": -64.7706527709961, + "logps/rejected": -69.00775146484375, + "loss": 0.6801, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1268969476222992, + "rewards/margins": 0.029782230034470558, + "rewards/rejected": -0.1566791832447052, "step": 3130 }, { - "epoch": 0.54, - "grad_norm": 19.750368207437877, - "learning_rate": 4.5702039889437014e-07, - "logits/chosen": -1.5176935195922852, - "logits/rejected": -1.4778989553451538, - "logps/chosen": -171.33694458007812, - "logps/rejected": -242.3556365966797, - "loss": 0.5396, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.1966036558151245, - "rewards/margins": 0.689578115940094, - "rewards/rejected": -1.8861818313598633, + "epoch": 0.5410062026188835, + "grad_norm": 3.4870545864105225, + "learning_rate": 9.140407977887403e-08, + "logits/chosen": -2.938673496246338, + "logits/rejected": -2.9236607551574707, + "logps/chosen": -65.6788101196289, + "logps/rejected": -72.48387145996094, + "loss": 0.6715, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13981683552265167, + "rewards/margins": 0.047606609761714935, + "rewards/rejected": -0.187423437833786, "step": 3140 }, { - "epoch": 0.54, - "grad_norm": 15.672784519353224, - "learning_rate": 4.565980026074917e-07, - "logits/chosen": -1.4829928874969482, - "logits/rejected": -1.4322001934051514, - "logps/chosen": -172.95135498046875, - "logps/rejected": -237.19912719726562, - "loss": 0.5323, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.1858676671981812, - "rewards/margins": 0.6504721641540527, - "rewards/rejected": -1.8363399505615234, + "epoch": 0.5427291523087526, + "grad_norm": 3.362210273742676, + "learning_rate": 9.131960052149834e-08, + "logits/chosen": -2.9392380714416504, + "logits/rejected": -2.910787582397461, + "logps/chosen": -66.6327133178711, + "logps/rejected": -71.1677474975586, + "loss": 0.6691, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.12263756990432739, + "rewards/margins": 0.053017932921648026, + "rewards/rejected": -0.17565548419952393, "step": 3150 }, { - "epoch": 0.54, - "grad_norm": 18.037452777993565, - "learning_rate": 4.5617373803896796e-07, - "logits/chosen": -1.3555725812911987, - "logits/rejected": -1.3147733211517334, - "logps/chosen": -188.40936279296875, - "logps/rejected": -247.400634765625, - "loss": 0.559, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.3425967693328857, - "rewards/margins": 0.605254054069519, - "rewards/rejected": -1.9478508234024048, + "epoch": 0.5444521019986216, + "grad_norm": 3.542495012283325, + "learning_rate": 9.123474760779359e-08, + "logits/chosen": -2.8761775493621826, + "logits/rejected": -2.8476626873016357, + "logps/chosen": -66.16803741455078, + "logps/rejected": -70.11380767822266, + "loss": 0.6685, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.12021736055612564, + "rewards/margins": 0.05458689481019974, + "rewards/rejected": -0.17480425536632538, "step": 3160 }, { - "epoch": 0.55, - "grad_norm": 15.619953452388868, - "learning_rate": 4.5574760902545625e-07, - "logits/chosen": -1.4381481409072876, - "logits/rejected": -1.391213059425354, - "logps/chosen": -183.0797882080078, - "logps/rejected": -240.10757446289062, - "loss": 0.5234, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.272312045097351, - "rewards/margins": 0.6338873505592346, - "rewards/rejected": -1.9061992168426514, + "epoch": 0.5461750516884907, + "grad_norm": 3.6144564151763916, + "learning_rate": 9.114952180509124e-08, + "logits/chosen": -2.9056293964385986, + "logits/rejected": -2.873357057571411, + "logps/chosen": -69.16236877441406, + "logps/rejected": -69.51313781738281, + "loss": 0.6625, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1327294409275055, + "rewards/margins": 0.06754171848297119, + "rewards/rejected": -0.20027117431163788, "step": 3170 }, { - "epoch": 0.55, - "grad_norm": 24.556199495358904, - "learning_rate": 4.5531961942047385e-07, - "logits/chosen": -1.521206021308899, - "logits/rejected": -1.4590338468551636, - "logps/chosen": -183.3689727783203, - "logps/rejected": -249.7210693359375, - "loss": 0.5345, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3013378381729126, - "rewards/margins": 0.681814968585968, - "rewards/rejected": -1.9831526279449463, + "epoch": 0.5478980013783598, + "grad_norm": 3.13999605178833, + "learning_rate": 9.106392388409477e-08, + "logits/chosen": -2.957420587539673, + "logits/rejected": -2.920548677444458, + "logps/chosen": -66.63191223144531, + "logps/rejected": -69.84432983398438, + "loss": 0.6698, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.1337551772594452, + "rewards/margins": 0.0504077672958374, + "rewards/rejected": -0.1841629445552826, "step": 3180 }, { - "epoch": 0.55, - "grad_norm": 18.24766746095848, - "learning_rate": 4.548897730943638e-07, - "logits/chosen": -1.5017660856246948, - "logits/rejected": -1.4614744186401367, - "logps/chosen": -174.06478881835938, - "logps/rejected": -262.4342041015625, - "loss": 0.481, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.2178510427474976, - "rewards/margins": 0.839970588684082, - "rewards/rejected": -2.057821750640869, + "epoch": 0.5496209510682288, + "grad_norm": 3.5873255729675293, + "learning_rate": 9.097795461887277e-08, + "logits/chosen": -2.9530131816864014, + "logits/rejected": -2.9471516609191895, + "logps/chosen": -66.12956237792969, + "logps/rejected": -77.18992614746094, + "loss": 0.6636, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.13831885159015656, + "rewards/margins": 0.0669272169470787, + "rewards/rejected": -0.20524606108665466, "step": 3190 }, { - "epoch": 0.55, - "grad_norm": 19.455426316676963, - "learning_rate": 4.544580739342596e-07, - "logits/chosen": -1.406374216079712, - "logits/rejected": -1.3839681148529053, - "logps/chosen": -190.57859802246094, - "logps/rejected": -231.16159057617188, - "loss": 0.6097, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.3527233600616455, - "rewards/margins": 0.4378505349159241, - "rewards/rejected": -1.7905738353729248, + "epoch": 0.5513439007580979, + "grad_norm": 3.6692728996276855, + "learning_rate": 9.089161478685192e-08, + "logits/chosen": -2.8761074542999268, + "logits/rejected": -2.861693859100342, + "logps/chosen": -67.71090698242188, + "logps/rejected": -69.20539855957031, + "loss": 0.6718, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.12382230907678604, + "rewards/margins": 0.04690946266055107, + "rewards/rejected": -0.17073175311088562, "step": 3200 }, { - "epoch": 0.55, - "eval_logits/chosen": -1.6120717525482178, - "eval_logits/rejected": -1.5902533531188965, - "eval_logps/chosen": -167.82127380371094, - "eval_logps/rejected": -200.23843383789062, - "eval_loss": 0.6200531721115112, - "eval_rewards/accuracies": 0.6554368138313293, - "eval_rewards/chosen": -1.0911740064620972, - "eval_rewards/margins": 0.2796363830566406, - "eval_rewards/rejected": -1.3708105087280273, - "eval_runtime": 356.6653, - "eval_samples_per_second": 12.067, - "eval_steps_per_second": 1.508, + "epoch": 0.5513439007580979, + "eval_logits/chosen": -3.003952741622925, + "eval_logits/rejected": -2.998213052749634, + "eval_logps/chosen": -68.10050201416016, + "eval_logps/rejected": -74.91475677490234, + "eval_loss": 0.6827961802482605, + "eval_rewards/accuracies": 0.6015334725379944, + "eval_rewards/chosen": -0.09388609230518341, + "eval_rewards/margins": 0.023460378870368004, + "eval_rewards/rejected": -0.11734647303819656, + "eval_runtime": 360.6231, + "eval_samples_per_second": 11.935, + "eval_steps_per_second": 1.492, "step": 3200 }, { - "epoch": 0.55, - "grad_norm": 19.796269236135114, - "learning_rate": 4.5402452584404995e-07, - "logits/chosen": -1.411024808883667, - "logits/rejected": -1.3637266159057617, - "logps/chosen": -168.30789184570312, - "logps/rejected": -231.79910278320312, - "loss": 0.5188, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.1436474323272705, - "rewards/margins": 0.6496217250823975, - "rewards/rejected": -1.793269157409668, + "epoch": 0.5530668504479669, + "grad_norm": 3.8423609733581543, + "learning_rate": 9.080490516880998e-08, + "logits/chosen": -2.79463791847229, + "logits/rejected": -2.7705302238464355, + "logps/chosen": -67.96089172363281, + "logps/rejected": -71.14826202392578, + "loss": 0.6721, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.14009490609169006, + "rewards/margins": 0.04641476646065712, + "rewards/rejected": -0.18650969862937927, "step": 3210 }, { - "epoch": 0.55, - "grad_norm": 15.230980934048251, - "learning_rate": 4.535891327443435e-07, - "logits/chosen": -1.4088395833969116, - "logits/rejected": -1.3787903785705566, - "logps/chosen": -172.39651489257812, - "logps/rejected": -239.85952758789062, - "loss": 0.5494, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.2206478118896484, - "rewards/margins": 0.6492413282394409, - "rewards/rejected": -1.8698889017105103, + "epoch": 0.554789800137836, + "grad_norm": 3.280461311340332, + "learning_rate": 9.07178265488687e-08, + "logits/chosen": -2.828249931335449, + "logits/rejected": -2.823260545730591, + "logps/chosen": -65.09806823730469, + "logps/rejected": -71.36117553710938, + "loss": 0.6766, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1475423276424408, + "rewards/margins": 0.03724092245101929, + "rewards/rejected": -0.18478327989578247, "step": 3220 }, { - "epoch": 0.56, - "grad_norm": 30.32211749720397, - "learning_rate": 4.5315189857243377e-07, - "logits/chosen": -1.4493725299835205, - "logits/rejected": -1.413207769393921, - "logps/chosen": -175.58511352539062, - "logps/rejected": -230.5104522705078, - "loss": 0.5586, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.214536428451538, - "rewards/margins": 0.5267941355705261, - "rewards/rejected": -1.7413305044174194, + "epoch": 0.556512749827705, + "grad_norm": 3.3534491062164307, + "learning_rate": 9.063037971448675e-08, + "logits/chosen": -2.882533550262451, + "logits/rejected": -2.8667454719543457, + "logps/chosen": -68.42729949951172, + "logps/rejected": -73.65990447998047, + "loss": 0.6802, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.14269927144050598, + "rewards/margins": 0.030112752690911293, + "rewards/rejected": -0.17281204462051392, "step": 3230 }, { - "epoch": 0.56, - "grad_norm": 16.643885483015545, - "learning_rate": 4.527128272822629e-07, - "logits/chosen": -1.621273398399353, - "logits/rejected": -1.58579421043396, - "logps/chosen": -170.71180725097656, - "logps/rejected": -214.2522735595703, - "loss": 0.6022, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.1446621417999268, - "rewards/margins": 0.47269731760025024, - "rewards/rejected": -1.6173597574234009, + "epoch": 0.5582356995175741, + "grad_norm": 3.23046612739563, + "learning_rate": 9.054256545645258e-08, + "logits/chosen": -3.001438617706299, + "logits/rejected": -2.9791247844696045, + "logps/chosen": -72.44905090332031, + "logps/rejected": -72.3000717163086, + "loss": 0.6777, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.16212376952171326, + "rewards/margins": 0.035440459847450256, + "rewards/rejected": -0.19756419956684113, "step": 3240 }, { - "epoch": 0.56, - "grad_norm": 21.109399435647408, - "learning_rate": 4.522719228443864e-07, - "logits/chosen": -1.5881023406982422, - "logits/rejected": -1.5520793199539185, - "logps/chosen": -142.5547332763672, - "logps/rejected": -191.86538696289062, - "loss": 0.5607, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.8997921943664551, - "rewards/margins": 0.4858148992061615, - "rewards/rejected": -1.3856070041656494, + "epoch": 0.5599586492074431, + "grad_norm": 3.267273426055908, + "learning_rate": 9.045438456887727e-08, + "logits/chosen": -2.8932247161865234, + "logits/rejected": -2.8726253509521484, + "logps/chosen": -66.225830078125, + "logps/rejected": -72.84576416015625, + "loss": 0.666, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.13653263449668884, + "rewards/margins": 0.05883949249982834, + "rewards/rejected": -0.19537213444709778, "step": 3250 }, { - "epoch": 0.56, - "grad_norm": 22.057929518873816, - "learning_rate": 4.5182918924593703e-07, - "logits/chosen": -1.607410192489624, - "logits/rejected": -1.5681886672973633, - "logps/chosen": -147.30438232421875, - "logps/rejected": -203.17489624023438, - "loss": 0.5476, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.9579440355300903, - "rewards/margins": 0.5465749502182007, - "rewards/rejected": -1.504518747329712, + "epoch": 0.5616815988973122, + "grad_norm": 3.519906759262085, + "learning_rate": 9.036583784918741e-08, + "logits/chosen": -2.9068236351013184, + "logits/rejected": -2.8895974159240723, + "logps/chosen": -66.02603149414062, + "logps/rejected": -72.74566650390625, + "loss": 0.6693, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.14490602910518646, + "rewards/margins": 0.05516090244054794, + "rewards/rejected": -0.2000669240951538, "step": 3260 }, { - "epoch": 0.56, - "grad_norm": 18.19721714439992, - "learning_rate": 4.5138463049058885e-07, - "logits/chosen": -1.6494948863983154, - "logits/rejected": -1.625372290611267, - "logps/chosen": -166.00369262695312, - "logps/rejected": -213.57943725585938, - "loss": 0.5827, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1090967655181885, - "rewards/margins": 0.4481208920478821, - "rewards/rejected": -1.5572177171707153, + "epoch": 0.5634045485871813, + "grad_norm": 3.5675628185272217, + "learning_rate": 9.027692609811777e-08, + "logits/chosen": -2.9633305072784424, + "logits/rejected": -2.9593842029571533, + "logps/chosen": -70.6051025390625, + "logps/rejected": -78.98965454101562, + "loss": 0.6675, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.15475505590438843, + "rewards/margins": 0.05626174807548523, + "rewards/rejected": -0.21101677417755127, "step": 3270 }, { - "epoch": 0.57, - "grad_norm": 20.95007962777713, - "learning_rate": 4.50938250598521e-07, - "logits/chosen": -1.6200672388076782, - "logits/rejected": -1.5890979766845703, - "logps/chosen": -150.30422973632812, - "logps/rejected": -204.21400451660156, - "loss": 0.5402, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.0092462301254272, - "rewards/margins": 0.509067714214325, - "rewards/rejected": -1.5183137655258179, + "epoch": 0.5651274982770503, + "grad_norm": 3.5583724975585938, + "learning_rate": 9.018765011970419e-08, + "logits/chosen": -2.9657270908355713, + "logits/rejected": -2.9546895027160645, + "logps/chosen": -64.42697143554688, + "logps/rejected": -71.25099182128906, + "loss": 0.6763, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1500626802444458, + "rewards/margins": 0.038469213992357254, + "rewards/rejected": -0.18853187561035156, "step": 3280 }, { - "epoch": 0.57, - "grad_norm": 18.41298820642052, - "learning_rate": 4.5049005360638103e-07, - "logits/chosen": -1.5941425561904907, - "logits/rejected": -1.538638710975647, - "logps/chosen": -169.52369689941406, - "logps/rejected": -231.6940460205078, - "loss": 0.5708, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.1636857986450195, - "rewards/margins": 0.6202095150947571, - "rewards/rejected": -1.7838952541351318, + "epoch": 0.5668504479669194, + "grad_norm": 3.300567388534546, + "learning_rate": 9.00980107212762e-08, + "logits/chosen": -3.010849952697754, + "logits/rejected": -2.980490207672119, + "logps/chosen": -67.35631561279297, + "logps/rejected": -73.60289001464844, + "loss": 0.6652, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.141900897026062, + "rewards/margins": 0.060899633914232254, + "rewards/rejected": -0.20280051231384277, "step": 3290 }, { - "epoch": 0.57, - "grad_norm": 22.05304883452244, - "learning_rate": 4.5004004356724893e-07, - "logits/chosen": -1.455288290977478, - "logits/rejected": -1.414819598197937, - "logps/chosen": -181.70596313476562, - "logps/rejected": -236.2588653564453, - "loss": 0.5807, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.2678734064102173, - "rewards/margins": 0.5589209794998169, - "rewards/rejected": -1.8267943859100342, + "epoch": 0.5685733976567884, + "grad_norm": 3.9140853881835938, + "learning_rate": 9.000800871344979e-08, + "logits/chosen": -2.8878273963928223, + "logits/rejected": -2.8666417598724365, + "logps/chosen": -70.4729995727539, + "logps/rejected": -73.83319854736328, + "loss": 0.6724, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.15556514263153076, + "rewards/margins": 0.04699654132127762, + "rewards/rejected": -0.20256169140338898, "step": 3300 }, { - "epoch": 0.57, - "eval_logits/chosen": -1.5502673387527466, - "eval_logits/rejected": -1.5291829109191895, - "eval_logps/chosen": -168.8760528564453, - "eval_logps/rejected": -199.7250213623047, - "eval_loss": 0.6238878965377808, - "eval_rewards/accuracies": 0.6505576372146606, - "eval_rewards/chosen": -1.1017221212387085, - "eval_rewards/margins": 0.26395440101623535, - "eval_rewards/rejected": -1.3656764030456543, - "eval_runtime": 357.2068, - "eval_samples_per_second": 12.049, - "eval_steps_per_second": 1.506, + "epoch": 0.5685733976567884, + "eval_logits/chosen": -2.990844488143921, + "eval_logits/rejected": -2.985057830810547, + "eval_logps/chosen": -68.70267486572266, + "eval_logps/rejected": -75.66940307617188, + "eval_loss": 0.6821591854095459, + "eval_rewards/accuracies": 0.6050186157226562, + "eval_rewards/chosen": -0.0999077558517456, + "eval_rewards/margins": 0.0249850582331419, + "eval_rewards/rejected": -0.12489282339811325, + "eval_runtime": 360.6919, + "eval_samples_per_second": 11.933, + "eval_steps_per_second": 1.492, "step": 3300 }, { - "epoch": 0.57, - "grad_norm": 18.786920967242654, - "learning_rate": 4.4958822455060017e-07, - "logits/chosen": -1.3820545673370361, - "logits/rejected": -1.3281322717666626, - "logps/chosen": -169.11795043945312, - "logps/rejected": -231.81167602539062, - "loss": 0.5407, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.1603150367736816, - "rewards/margins": 0.6508747339248657, - "rewards/rejected": -1.8111896514892578, + "epoch": 0.5702963473466575, + "grad_norm": 3.486947536468506, + "learning_rate": 8.991764491012004e-08, + "logits/chosen": -2.8264219760894775, + "logits/rejected": -2.7986056804656982, + "logps/chosen": -67.07157135009766, + "logps/rejected": -70.15736389160156, + "loss": 0.6684, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.13972768187522888, + "rewards/margins": 0.0548524372279644, + "rewards/rejected": -0.1945801079273224, "step": 3310 }, { - "epoch": 0.57, - "grad_norm": 21.45405748820459, - "learning_rate": 4.4913460064226894e-07, - "logits/chosen": -1.44109308719635, - "logits/rejected": -1.3888362646102905, - "logps/chosen": -179.4849395751953, - "logps/rejected": -232.6226043701172, - "loss": 0.562, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.257912278175354, - "rewards/margins": 0.5788989067077637, - "rewards/rejected": -1.8368113040924072, + "epoch": 0.5720192970365265, + "grad_norm": 4.112346649169922, + "learning_rate": 8.982692012845379e-08, + "logits/chosen": -2.8834726810455322, + "logits/rejected": -2.8484654426574707, + "logps/chosen": -69.16401672363281, + "logps/rejected": -70.00126647949219, + "loss": 0.6676, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.15456373989582062, + "rewards/margins": 0.055997978895902634, + "rewards/rejected": -0.21056172251701355, "step": 3320 }, { - "epoch": 0.57, - "grad_norm": 16.508118858701444, - "learning_rate": 4.486791759444111e-07, - "logits/chosen": -1.5882141590118408, - "logits/rejected": -1.5403480529785156, - "logps/chosen": -164.25289916992188, - "logps/rejected": -231.80007934570312, - "loss": 0.5175, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.1004276275634766, - "rewards/margins": 0.680879533290863, - "rewards/rejected": -1.7813072204589844, + "epoch": 0.5737422467263956, + "grad_norm": 3.7715468406677246, + "learning_rate": 8.973583518888222e-08, + "logits/chosen": -2.983250141143799, + "logits/rejected": -2.952721118927002, + "logps/chosen": -67.77142333984375, + "logps/rejected": -73.4288101196289, + "loss": 0.6646, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.13558022677898407, + "rewards/margins": 0.0619480200111866, + "rewards/rejected": -0.19752824306488037, "step": 3330 }, { - "epoch": 0.58, - "grad_norm": 33.55292099789505, - "learning_rate": 4.4822195457546716e-07, - "logits/chosen": -1.5143282413482666, - "logits/rejected": -1.4674466848373413, - "logps/chosen": -189.68515014648438, - "logps/rejected": -263.664794921875, - "loss": 0.5363, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3378245830535889, - "rewards/margins": 0.75914067029953, - "rewards/rejected": -2.0969653129577637, + "epoch": 0.5754651964162646, + "grad_norm": 3.686140537261963, + "learning_rate": 8.964439091509344e-08, + "logits/chosen": -2.921363115310669, + "logits/rejected": -2.9037246704101562, + "logps/chosen": -71.49298095703125, + "logps/rejected": -75.01543426513672, + "loss": 0.6687, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.15579842031002045, + "rewards/margins": 0.05456198379397392, + "rewards/rejected": -0.21036040782928467, "step": 3340 }, { - "epoch": 0.58, - "grad_norm": 16.44122753789477, - "learning_rate": 4.477629406701254e-07, - "logits/chosen": -1.427293300628662, - "logits/rejected": -1.3901170492172241, - "logps/chosen": -177.4530792236328, - "logps/rejected": -247.17105102539062, - "loss": 0.53, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.216886281967163, - "rewards/margins": 0.6818975806236267, - "rewards/rejected": -1.8987839221954346, + "epoch": 0.5771881461061337, + "grad_norm": 3.439488410949707, + "learning_rate": 8.955258813402509e-08, + "logits/chosen": -2.8145248889923096, + "logits/rejected": -2.8000881671905518, + "logps/chosen": -71.99406433105469, + "logps/rejected": -76.73139953613281, + "loss": 0.6793, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.16215400397777557, + "rewards/margins": 0.03191717341542244, + "rewards/rejected": -0.1940712034702301, "step": 3350 }, { - "epoch": 0.58, - "grad_norm": 14.401579786445982, - "learning_rate": 4.473021383792838e-07, - "logits/chosen": -1.5537811517715454, - "logits/rejected": -1.5063152313232422, - "logps/chosen": -168.55690002441406, - "logps/rejected": -219.3855743408203, - "loss": 0.5712, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.1374642848968506, - "rewards/margins": 0.5342391133308411, - "rewards/rejected": -1.6717033386230469, + "epoch": 0.5789110957960028, + "grad_norm": 3.584379196166992, + "learning_rate": 8.946042767585676e-08, + "logits/chosen": -2.9319262504577637, + "logits/rejected": -2.89823842048645, + "logps/chosen": -70.10853576660156, + "logps/rejected": -72.36190795898438, + "loss": 0.6714, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.15293297171592712, + "rewards/margins": 0.048371512442827225, + "rewards/rejected": -0.20130451023578644, "step": 3360 }, { - "epoch": 0.58, - "grad_norm": 19.904447545777934, - "learning_rate": 4.4683955187001285e-07, - "logits/chosen": -1.5263116359710693, - "logits/rejected": -1.4976154565811157, - "logps/chosen": -163.7618865966797, - "logps/rejected": -228.2670440673828, - "loss": 0.5506, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.1279897689819336, - "rewards/margins": 0.6148195862770081, - "rewards/rejected": -1.7428092956542969, + "epoch": 0.5806340454858718, + "grad_norm": 3.598647117614746, + "learning_rate": 8.936791037400258e-08, + "logits/chosen": -2.870156764984131, + "logits/rejected": -2.860452175140381, + "logps/chosen": -66.26335144042969, + "logps/rejected": -74.37271881103516, + "loss": 0.6707, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.15308070182800293, + "rewards/margins": 0.05050992965698242, + "rewards/rejected": -0.20359063148498535, "step": 3370 }, { - "epoch": 0.58, - "grad_norm": 23.89725054685839, - "learning_rate": 4.463751853255182e-07, - "logits/chosen": -1.6531779766082764, - "logits/rejected": -1.6117451190948486, - "logps/chosen": -161.826171875, - "logps/rejected": -209.54940795898438, - "loss": 0.5573, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.0634934902191162, - "rewards/margins": 0.5306968688964844, - "rewards/rejected": -1.5941904783248901, + "epoch": 0.5823569951757409, + "grad_norm": 3.889470100402832, + "learning_rate": 8.927503706510364e-08, + "logits/chosen": -2.962991237640381, + "logits/rejected": -2.932013988494873, + "logps/chosen": -69.73117065429688, + "logps/rejected": -70.13800048828125, + "loss": 0.6674, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.14270296692848206, + "rewards/margins": 0.05723966285586357, + "rewards/rejected": -0.19994261860847473, "step": 3380 }, { - "epoch": 0.58, - "grad_norm": 16.812554502274565, - "learning_rate": 4.45909042945102e-07, - "logits/chosen": -1.5942082405090332, - "logits/rejected": -1.553095817565918, - "logps/chosen": -158.5015869140625, - "logps/rejected": -206.106201171875, - "loss": 0.5908, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.0478193759918213, - "rewards/margins": 0.4990636706352234, - "rewards/rejected": -1.5468828678131104, + "epoch": 0.5840799448656099, + "grad_norm": 3.23209547996521, + "learning_rate": 8.91818085890204e-08, + "logits/chosen": -2.8833847045898438, + "logits/rejected": -2.8576457500457764, + "logps/chosen": -69.89290618896484, + "logps/rejected": -71.88001251220703, + "loss": 0.6744, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1616523414850235, + "rewards/margins": 0.0428120493888855, + "rewards/rejected": -0.204464390873909, "step": 3390 }, { - "epoch": 0.59, - "grad_norm": 16.25733459636065, - "learning_rate": 4.454411289441259e-07, - "logits/chosen": -1.650813102722168, - "logits/rejected": -1.589519739151001, - "logps/chosen": -148.92491149902344, - "logps/rejected": -204.60989379882812, - "loss": 0.536, + "epoch": 0.585802894555479, + "grad_norm": 3.751899480819702, + "learning_rate": 8.908822578882518e-08, + "logits/chosen": -2.931617021560669, + "logits/rejected": -2.8930277824401855, + "logps/chosen": -67.65100860595703, + "logps/rejected": -72.86280822753906, + "loss": 0.6625, "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.9608653783798218, - "rewards/margins": 0.5713566541671753, - "rewards/rejected": -1.532222032546997, + "rewards/chosen": -0.1479429006576538, + "rewards/margins": 0.06683917343616486, + "rewards/rejected": -0.21478208899497986, "step": 3400 }, { - "epoch": 0.59, - "eval_logits/chosen": -1.723968505859375, - "eval_logits/rejected": -1.705629825592041, - "eval_logps/chosen": -141.45721435546875, - "eval_logps/rejected": -167.2509307861328, - "eval_loss": 0.63118976354599, - "eval_rewards/accuracies": 0.6466078162193298, - "eval_rewards/chosen": -0.8275338411331177, - "eval_rewards/margins": 0.21340180933475494, - "eval_rewards/rejected": -1.0409355163574219, - "eval_runtime": 357.3396, - "eval_samples_per_second": 12.045, - "eval_steps_per_second": 1.506, + "epoch": 0.585802894555479, + "eval_logits/chosen": -2.981968879699707, + "eval_logits/rejected": -2.9761545658111572, + "eval_logps/chosen": -68.8060302734375, + "eval_logps/rejected": -75.84404754638672, + "eval_loss": 0.6818387508392334, + "eval_rewards/accuracies": 0.6089683771133423, + "eval_rewards/chosen": -0.10094138979911804, + "eval_rewards/margins": 0.025697872042655945, + "eval_rewards/rejected": -0.12663927674293518, + "eval_runtime": 359.5659, + "eval_samples_per_second": 11.97, + "eval_steps_per_second": 1.496, "step": 3400 }, { - "epoch": 0.59, - "grad_norm": 17.44200872783744, - "learning_rate": 4.4497144755397215e-07, - "logits/chosen": -1.5299510955810547, - "logits/rejected": -1.4821765422821045, - "logps/chosen": -140.80323791503906, - "logps/rejected": -188.71884155273438, - "loss": 0.5417, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.9123737215995789, - "rewards/margins": 0.5039972066879272, - "rewards/rejected": -1.4163707494735718, + "epoch": 0.587525844245348, + "grad_norm": 3.432067632675171, + "learning_rate": 8.899428951079443e-08, + "logits/chosen": -2.8233437538146973, + "logits/rejected": -2.7948660850524902, + "logps/chosen": -64.45096588134766, + "logps/rejected": -67.34065246582031, + "loss": 0.6687, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.14877715706825256, + "rewards/margins": 0.05394697189331055, + "rewards/rejected": -0.2027241289615631, "step": 3410 }, { - "epoch": 0.59, - "grad_norm": 20.94711202688562, - "learning_rate": 4.4450000302200576e-07, - "logits/chosen": -1.5101244449615479, - "logits/rejected": -1.4615298509597778, - "logps/chosen": -156.17779541015625, - "logps/rejected": -221.5508270263672, - "loss": 0.52, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.0341514348983765, - "rewards/margins": 0.6485717296600342, - "rewards/rejected": -1.6827232837677002, + "epoch": 0.5892487939352171, + "grad_norm": 4.382596015930176, + "learning_rate": 8.890000060440115e-08, + "logits/chosen": -2.845134735107422, + "logits/rejected": -2.821000576019287, + "logps/chosen": -66.77651977539062, + "logps/rejected": -72.5694808959961, + "loss": 0.6694, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.14009198546409607, + "rewards/margins": 0.05261436849832535, + "rewards/rejected": -0.19270634651184082, "step": 3420 }, { - "epoch": 0.59, - "grad_norm": 17.697196074003802, - "learning_rate": 4.440267996115359e-07, - "logits/chosen": -1.5161569118499756, - "logits/rejected": -1.4690425395965576, - "logps/chosen": -188.15164184570312, - "logps/rejected": -251.8218231201172, - "loss": 0.5671, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.3682286739349365, - "rewards/margins": 0.6038091778755188, - "rewards/rejected": -1.9720379114151, + "epoch": 0.5909717436250862, + "grad_norm": 3.418834924697876, + "learning_rate": 8.880535992230718e-08, + "logits/chosen": -2.9278876781463623, + "logits/rejected": -2.9076414108276367, + "logps/chosen": -66.03239440917969, + "logps/rejected": -73.9791030883789, + "loss": 0.6721, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.14691507816314697, + "rewards/margins": 0.046559590846300125, + "rewards/rejected": -0.1934746652841568, "step": 3430 }, { - "epoch": 0.59, - "grad_norm": 23.878652372078566, - "learning_rate": 4.435518416017774e-07, - "logits/chosen": -1.4505062103271484, - "logits/rejected": -1.4057317972183228, - "logps/chosen": -191.19534301757812, - "logps/rejected": -253.8204345703125, - "loss": 0.5548, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.386473298072815, - "rewards/margins": 0.6239285469055176, - "rewards/rejected": -2.010401964187622, + "epoch": 0.5926946933149552, + "grad_norm": 3.650676727294922, + "learning_rate": 8.871036832035547e-08, + "logits/chosen": -2.8495564460754395, + "logits/rejected": -2.8339741230010986, + "logps/chosen": -69.0568618774414, + "logps/rejected": -73.52063751220703, + "loss": 0.6743, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.16483992338180542, + "rewards/margins": 0.04235614091157913, + "rewards/rejected": -0.20719607174396515, "step": 3440 }, { - "epoch": 0.59, - "grad_norm": 24.14251784705596, - "learning_rate": 4.430751332878122e-07, - "logits/chosen": -1.6515562534332275, - "logits/rejected": -1.5952726602554321, - "logps/chosen": -197.3381805419922, - "logps/rejected": -256.8216857910156, - "loss": 0.5492, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3796002864837646, - "rewards/margins": 0.6393512487411499, - "rewards/rejected": -2.018951654434204, + "epoch": 0.5944176430048242, + "grad_norm": 3.656323194503784, + "learning_rate": 8.861502665756244e-08, + "logits/chosen": -3.0892930030822754, + "logits/rejected": -3.049489974975586, + "logps/chosen": -74.71489715576172, + "logps/rejected": -75.1200180053711, + "loss": 0.6713, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.15335428714752197, + "rewards/margins": 0.048375897109508514, + "rewards/rejected": -0.20173020660877228, "step": 3450 }, { - "epoch": 0.6, - "grad_norm": 24.13758333534331, - "learning_rate": 4.425966789805503e-07, - "logits/chosen": -1.499289631843567, - "logits/rejected": -1.4667627811431885, - "logps/chosen": -164.79124450683594, - "logps/rejected": -216.9873504638672, - "loss": 0.5621, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.1262753009796143, - "rewards/margins": 0.5078203678131104, - "rewards/rejected": -1.6340957880020142, + "epoch": 0.5961405926946933, + "grad_norm": 3.811377763748169, + "learning_rate": 8.851933579611007e-08, + "logits/chosen": -2.8736202716827393, + "logits/rejected": -2.861013889312744, + "logps/chosen": -65.99501037597656, + "logps/rejected": -71.20674133300781, + "loss": 0.6767, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.13815616071224213, + "rewards/margins": 0.03790999576449394, + "rewards/rejected": -0.17606614530086517, "step": 3460 }, { - "epoch": 0.6, - "grad_norm": 18.681968022257387, - "learning_rate": 4.4211648300669076e-07, - "logits/chosen": -1.597586989402771, - "logits/rejected": -1.5641849040985107, - "logps/chosen": -169.10386657714844, - "logps/rejected": -226.5546112060547, - "loss": 0.547, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.133301854133606, - "rewards/margins": 0.5889450311660767, - "rewards/rejected": -1.7222468852996826, + "epoch": 0.5978635423845624, + "grad_norm": 3.395078182220459, + "learning_rate": 8.842329660133815e-08, + "logits/chosen": -2.9512839317321777, + "logits/rejected": -2.9359641075134277, + "logps/chosen": -70.45713806152344, + "logps/rejected": -74.82063293457031, + "loss": 0.6686, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.14646930992603302, + "rewards/margins": 0.05837889388203621, + "rewards/rejected": -0.20484820008277893, "step": 3470 }, { - "epoch": 0.6, - "grad_norm": 29.803622122615664, - "learning_rate": 4.4163454970868277e-07, - "logits/chosen": -1.5102007389068604, - "logits/rejected": -1.4543273448944092, - "logps/chosen": -181.89114379882812, - "logps/rejected": -245.5972900390625, - "loss": 0.5329, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2680021524429321, - "rewards/margins": 0.6691805124282837, - "rewards/rejected": -1.9371826648712158, + "epoch": 0.5995864920744314, + "grad_norm": 3.6889381408691406, + "learning_rate": 8.832690994173655e-08, + "logits/chosen": -2.8805530071258545, + "logits/rejected": -2.8497469425201416, + "logps/chosen": -69.79116821289062, + "logps/rejected": -73.75949096679688, + "loss": 0.663, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1467658281326294, + "rewards/margins": 0.0719979852437973, + "rewards/rejected": -0.2187638282775879, "step": 3480 }, { - "epoch": 0.6, - "grad_norm": 28.83453327688963, - "learning_rate": 4.411508834446863e-07, - "logits/chosen": -1.5323913097381592, - "logits/rejected": -1.4874933958053589, - "logps/chosen": -182.50387573242188, - "logps/rejected": -241.5607452392578, - "loss": 0.5529, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.282502293586731, - "rewards/margins": 0.587981104850769, - "rewards/rejected": -1.8704833984375, + "epoch": 0.6013094417643005, + "grad_norm": 4.063599109649658, + "learning_rate": 8.823017668893726e-08, + "logits/chosen": -2.88657283782959, + "logits/rejected": -2.858245372772217, + "logps/chosen": -69.35597229003906, + "logps/rejected": -75.87593078613281, + "loss": 0.6645, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.150865837931633, + "rewards/margins": 0.06266475468873978, + "rewards/rejected": -0.21353061497211456, "step": 3490 }, { - "epoch": 0.6, - "grad_norm": 16.032941250777014, - "learning_rate": 4.406654885885326e-07, - "logits/chosen": -1.4855334758758545, - "logits/rejected": -1.4571826457977295, - "logps/chosen": -178.00135803222656, - "logps/rejected": -237.3003387451172, - "loss": 0.5392, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.259861707687378, - "rewards/margins": 0.5666104555130005, - "rewards/rejected": -1.8264720439910889, + "epoch": 0.6030323914541695, + "grad_norm": 3.57769513130188, + "learning_rate": 8.813309771770652e-08, + "logits/chosen": -2.834378480911255, + "logits/rejected": -2.826758861541748, + "logps/chosen": -68.08917999267578, + "logps/rejected": -74.96446228027344, + "loss": 0.6742, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.1608649641275406, + "rewards/margins": 0.04216255620121956, + "rewards/rejected": -0.20302753150463104, "step": 3500 }, { - "epoch": 0.6, - "eval_logits/chosen": -1.6595714092254639, - "eval_logits/rejected": -1.6385571956634521, - "eval_logps/chosen": -161.32484436035156, - "eval_logps/rejected": -191.19439697265625, - "eval_loss": 0.6286602020263672, - "eval_rewards/accuracies": 0.6466078162193298, - "eval_rewards/chosen": -1.0262099504470825, - "eval_rewards/margins": 0.2541602849960327, - "eval_rewards/rejected": -1.2803701162338257, - "eval_runtime": 357.0552, - "eval_samples_per_second": 12.054, - "eval_steps_per_second": 1.507, + "epoch": 0.6030323914541695, + "eval_logits/chosen": -2.9745237827301025, + "eval_logits/rejected": -2.9687376022338867, + "eval_logps/chosen": -69.42023468017578, + "eval_logps/rejected": -76.5616683959961, + "eval_loss": 0.6814299821853638, + "eval_rewards/accuracies": 0.6082713603973389, + "eval_rewards/chosen": -0.10708339512348175, + "eval_rewards/margins": 0.026732003316283226, + "eval_rewards/rejected": -0.13381539285182953, + "eval_runtime": 360.2439, + "eval_samples_per_second": 11.947, + "eval_steps_per_second": 1.493, "step": 3500 }, { - "epoch": 0.6, - "grad_norm": 16.07998691134206, - "learning_rate": 4.4017836952968467e-07, - "logits/chosen": -1.4526565074920654, - "logits/rejected": -1.4062235355377197, - "logps/chosen": -173.96885681152344, - "logps/rejected": -226.445068359375, - "loss": 0.5686, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.2041919231414795, - "rewards/margins": 0.5490579009056091, - "rewards/rejected": -1.7532498836517334, + "epoch": 0.6047553411440386, + "grad_norm": 3.326735019683838, + "learning_rate": 8.803567390593694e-08, + "logits/chosen": -2.775590181350708, + "logits/rejected": -2.751251697540283, + "logps/chosen": -70.67406463623047, + "logps/rejected": -72.92584991455078, + "loss": 0.6722, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.17106808722019196, + "rewards/margins": 0.04674633592367172, + "rewards/rejected": -0.21781444549560547, "step": 3510 }, { - "epoch": 0.61, - "grad_norm": 19.834365533950795, - "learning_rate": 4.396895306731977e-07, - "logits/chosen": -1.5146148204803467, - "logits/rejected": -1.4718494415283203, - "logps/chosen": -160.6667938232422, - "logps/rejected": -208.8634490966797, - "loss": 0.5754, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.053438425064087, - "rewards/margins": 0.5146963000297546, - "rewards/rejected": -1.5681347846984863, + "epoch": 0.6064782908339077, + "grad_norm": 3.649550437927246, + "learning_rate": 8.793790613463953e-08, + "logits/chosen": -2.8101563453674316, + "logits/rejected": -2.7903876304626465, + "logps/chosen": -70.46080017089844, + "logps/rejected": -72.71427154541016, + "loss": 0.6681, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.15126517415046692, + "rewards/margins": 0.05522242188453674, + "rewards/rejected": -0.20648758113384247, "step": 3520 }, { - "epoch": 0.61, - "grad_norm": 23.90518563871216, - "learning_rate": 4.391989764396792e-07, - "logits/chosen": -1.6393533945083618, - "logits/rejected": -1.577980637550354, - "logps/chosen": -166.45315551757812, - "logps/rejected": -219.417724609375, - "loss": 0.5593, + "epoch": 0.6082012405237767, + "grad_norm": 3.7469167709350586, + "learning_rate": 8.783979528793584e-08, + "logits/chosen": -2.970606803894043, + "logits/rejected": -2.928586006164551, + "logps/chosen": -70.70912170410156, + "logps/rejected": -72.60647583007812, + "loss": 0.6616, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.1050881147384644, - "rewards/margins": 0.5800349116325378, - "rewards/rejected": -1.6851232051849365, + "rewards/chosen": -0.14751648902893066, + "rewards/margins": 0.0694083645939827, + "rewards/rejected": -0.21692487597465515, "step": 3530 }, { - "epoch": 0.61, - "grad_norm": 20.26503213849771, - "learning_rate": 4.387067112652487e-07, - "logits/chosen": -1.5266510248184204, - "logits/rejected": -1.4859250783920288, - "logps/chosen": -157.66932678222656, - "logps/rejected": -214.6486053466797, - "loss": 0.5651, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.0376538038253784, - "rewards/margins": 0.5733412504196167, - "rewards/rejected": -1.6109952926635742, + "epoch": 0.6099241902136457, + "grad_norm": 3.8952109813690186, + "learning_rate": 8.774134225304974e-08, + "logits/chosen": -2.797332286834717, + "logits/rejected": -2.7737374305725098, + "logps/chosen": -70.36532592773438, + "logps/rejected": -74.63444519042969, + "loss": 0.6728, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1645258665084839, + "rewards/margins": 0.04614800959825516, + "rewards/rejected": -0.21067388355731964, "step": 3540 }, { - "epoch": 0.61, - "grad_norm": 18.788930669523673, - "learning_rate": 4.382127396014982e-07, - "logits/chosen": -1.6274988651275635, - "logits/rejected": -1.6048628091812134, - "logps/chosen": -166.0504608154297, - "logps/rejected": -205.7928466796875, - "loss": 0.609, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.1151145696640015, - "rewards/margins": 0.41729575395584106, - "rewards/rejected": -1.5324103832244873, + "epoch": 0.6116471399035148, + "grad_norm": 3.9337563514709473, + "learning_rate": 8.764254792029964e-08, + "logits/chosen": -2.8850772380828857, + "logits/rejected": -2.8755321502685547, + "logps/chosen": -72.20113372802734, + "logps/rejected": -73.35503387451172, + "loss": 0.6799, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.17637988924980164, + "rewards/margins": 0.03165814280509949, + "rewards/rejected": -0.20803804695606232, "step": 3550 }, { - "epoch": 0.61, - "grad_norm": 18.6411865873627, - "learning_rate": 4.377170659154514e-07, - "logits/chosen": -1.5456907749176025, - "logits/rejected": -1.506981611251831, - "logps/chosen": -159.85513305664062, - "logps/rejected": -213.0148468017578, - "loss": 0.5651, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.069687843322754, - "rewards/margins": 0.535394549369812, - "rewards/rejected": -1.6050825119018555, + "epoch": 0.6133700895933839, + "grad_norm": 3.5271224975585938, + "learning_rate": 8.754341318309028e-08, + "logits/chosen": -2.816992998123169, + "logits/rejected": -2.7901835441589355, + "logps/chosen": -70.08598327636719, + "logps/rejected": -74.62560272216797, + "loss": 0.6711, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.17186033725738525, + "rewards/margins": 0.04891379550099373, + "rewards/rejected": -0.22077412903308868, "step": 3560 }, { - "epoch": 0.62, - "grad_norm": 26.755951307532122, - "learning_rate": 4.372196946895238e-07, - "logits/chosen": -1.6680046319961548, - "logits/rejected": -1.6189712285995483, - "logps/chosen": -177.2747344970703, - "logps/rejected": -216.9444580078125, - "loss": 0.609, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.2041199207305908, - "rewards/margins": 0.43309324979782104, - "rewards/rejected": -1.637213110923767, + "epoch": 0.6150930392832529, + "grad_norm": 3.995802164077759, + "learning_rate": 8.744393893790476e-08, + "logits/chosen": -2.9562668800354004, + "logits/rejected": -2.9225308895111084, + "logps/chosen": -75.40304565429688, + "logps/rejected": -74.3688735961914, + "loss": 0.6856, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.18530330061912537, + "rewards/margins": 0.026163259521126747, + "rewards/rejected": -0.21146658062934875, "step": 3570 }, { - "epoch": 0.62, - "grad_norm": 16.327880126029793, - "learning_rate": 4.367206304214815e-07, - "logits/chosen": -1.6215425729751587, - "logits/rejected": -1.5847231149673462, - "logps/chosen": -168.4742431640625, - "logps/rejected": -224.7816162109375, - "loss": 0.5332, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.1029232740402222, - "rewards/margins": 0.570185661315918, - "rewards/rejected": -1.6731090545654297, + "epoch": 0.616815988973122, + "grad_norm": 3.804481267929077, + "learning_rate": 8.73441260842963e-08, + "logits/chosen": -2.8924849033355713, + "logits/rejected": -2.870704174041748, + "logps/chosen": -74.10020446777344, + "logps/rejected": -79.51061248779297, + "loss": 0.6656, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.15905411541461945, + "rewards/margins": 0.061337023973464966, + "rewards/rejected": -0.22039110958576202, "step": 3580 }, { - "epoch": 0.62, - "grad_norm": 17.82230607590664, - "learning_rate": 4.3621987762440114e-07, - "logits/chosen": -1.582554578781128, - "logits/rejected": -1.5418357849121094, - "logps/chosen": -181.93031311035156, - "logps/rejected": -246.87838745117188, - "loss": 0.5296, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.27500319480896, - "rewards/margins": 0.654120922088623, - "rewards/rejected": -1.929124116897583, + "epoch": 0.618538938662991, + "grad_norm": 3.658123016357422, + "learning_rate": 8.724397552488023e-08, + "logits/chosen": -2.880936861038208, + "logits/rejected": -2.865353584289551, + "logps/chosen": -72.49365234375, + "logps/rejected": -77.39817810058594, + "loss": 0.6698, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.18068650364875793, + "rewards/margins": 0.05343535542488098, + "rewards/rejected": -0.23412184417247772, "step": 3590 }, { - "epoch": 0.62, - "grad_norm": 27.151158453154128, - "learning_rate": 4.357174408266289e-07, - "logits/chosen": -1.5266609191894531, - "logits/rejected": -1.4800150394439697, - "logps/chosen": -184.74020385742188, - "logps/rejected": -237.02490234375, - "loss": 0.5689, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.3012017011642456, - "rewards/margins": 0.5533289909362793, - "rewards/rejected": -1.854530692100525, + "epoch": 0.6202618883528601, + "grad_norm": 4.0332818031311035, + "learning_rate": 8.714348816532577e-08, + "logits/chosen": -2.861480236053467, + "logits/rejected": -2.8369815349578857, + "logps/chosen": -71.11116027832031, + "logps/rejected": -72.80549621582031, + "loss": 0.6722, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.16464100778102875, + "rewards/margins": 0.04780580848455429, + "rewards/rejected": -0.21244683861732483, "step": 3600 }, { - "epoch": 0.62, - "eval_logits/chosen": -1.6493839025497437, - "eval_logits/rejected": -1.6286251544952393, - "eval_logps/chosen": -170.8087158203125, - "eval_logps/rejected": -201.0063018798828, - "eval_loss": 0.627535879611969, - "eval_rewards/accuracies": 0.6486988663673401, - "eval_rewards/chosen": -1.1210483312606812, - "eval_rewards/margins": 0.2574405074119568, - "eval_rewards/rejected": -1.3784890174865723, - "eval_runtime": 356.9383, - "eval_samples_per_second": 12.058, - "eval_steps_per_second": 1.507, + "epoch": 0.6202618883528601, + "eval_logits/chosen": -2.9655280113220215, + "eval_logits/rejected": -2.9596989154815674, + "eval_logps/chosen": -69.97339630126953, + "eval_logps/rejected": -77.21550750732422, + "eval_loss": 0.6810084581375122, + "eval_rewards/accuracies": 0.609897792339325, + "eval_rewards/chosen": -0.11261503398418427, + "eval_rewards/margins": 0.027738766744732857, + "eval_rewards/rejected": -0.14035379886627197, + "eval_runtime": 360.9369, + "eval_samples_per_second": 11.925, + "eval_steps_per_second": 1.491, "step": 3600 }, { - "epoch": 0.62, - "grad_norm": 16.53409464026598, - "learning_rate": 4.3521332457173933e-07, - "logits/chosen": -1.4792962074279785, - "logits/rejected": -1.4342132806777954, - "logps/chosen": -188.6980438232422, - "logps/rejected": -251.24951171875, - "loss": 0.5333, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3721535205841064, - "rewards/margins": 0.6460615396499634, - "rewards/rejected": -2.018214702606201, + "epoch": 0.6219848380427292, + "grad_norm": 3.6863250732421875, + "learning_rate": 8.704266491434787e-08, + "logits/chosen": -2.827634572982788, + "logits/rejected": -2.8120434284210205, + "logps/chosen": -70.19799041748047, + "logps/rejected": -72.06962585449219, + "loss": 0.6765, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.18704785406589508, + "rewards/margins": 0.039228882640600204, + "rewards/rejected": -0.2262767106294632, "step": 3610 }, { - "epoch": 0.62, - "grad_norm": 24.356843194385412, - "learning_rate": 4.347075334184946e-07, - "logits/chosen": -1.389676809310913, - "logits/rejected": -1.3410922288894653, - "logps/chosen": -182.9585418701172, - "logps/rejected": -253.10958862304688, - "loss": 0.5018, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.2746689319610596, - "rewards/margins": 0.7207802534103394, - "rewards/rejected": -1.9954490661621094, + "epoch": 0.6237077877325982, + "grad_norm": 3.8912832736968994, + "learning_rate": 8.694150668369892e-08, + "logits/chosen": -2.786785125732422, + "logits/rejected": -2.762024402618408, + "logps/chosen": -70.12651062011719, + "logps/rejected": -75.05070495605469, + "loss": 0.6616, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.14618650078773499, + "rewards/margins": 0.06834978610277176, + "rewards/rejected": -0.21453626453876495, "step": 3620 }, { - "epoch": 0.63, - "grad_norm": 28.04057531766567, - "learning_rate": 4.34200071940803e-07, - "logits/chosen": -1.4672437906265259, - "logits/rejected": -1.4280986785888672, - "logps/chosen": -215.80764770507812, - "logps/rejected": -300.14385986328125, - "loss": 0.5108, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.6468420028686523, - "rewards/margins": 0.7980831861495972, - "rewards/rejected": -2.44492506980896, + "epoch": 0.6254307374224672, + "grad_norm": 3.563636064529419, + "learning_rate": 8.68400143881606e-08, + "logits/chosen": -2.940396785736084, + "logits/rejected": -2.9337635040283203, + "logps/chosen": -67.58637237548828, + "logps/rejected": -79.30843353271484, + "loss": 0.6603, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.16445663571357727, + "rewards/margins": 0.0717763751745224, + "rewards/rejected": -0.23623299598693848, "step": 3630 }, { - "epoch": 0.63, - "grad_norm": 31.68658499963128, - "learning_rate": 4.3369094472767785e-07, - "logits/chosen": -1.3977959156036377, - "logits/rejected": -1.3606897592544556, - "logps/chosen": -217.1880645751953, - "logps/rejected": -290.8738708496094, - "loss": 0.5379, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.6216189861297607, - "rewards/margins": 0.7258588671684265, - "rewards/rejected": -2.347477674484253, + "epoch": 0.6271536871123363, + "grad_norm": 4.187912940979004, + "learning_rate": 8.673818894553557e-08, + "logits/chosen": -2.8915393352508545, + "logits/rejected": -2.8772926330566406, + "logps/chosen": -70.48597717285156, + "logps/rejected": -77.50422668457031, + "loss": 0.6668, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1544286608695984, + "rewards/margins": 0.058818262070417404, + "rewards/rejected": -0.2132469117641449, "step": 3640 }, { - "epoch": 0.63, - "grad_norm": 22.54081035634695, - "learning_rate": 4.331801563831956e-07, - "logits/chosen": -1.3711670637130737, - "logits/rejected": -1.350187063217163, - "logps/chosen": -206.61978149414062, - "logps/rejected": -269.8750915527344, - "loss": 0.544, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.555546522140503, - "rewards/margins": 0.5982221961021423, - "rewards/rejected": -2.15376877784729, + "epoch": 0.6288766368022054, + "grad_norm": 3.832031488418579, + "learning_rate": 8.663603127663912e-08, + "logits/chosen": -2.842319965362549, + "logits/rejected": -2.846832513809204, + "logps/chosen": -66.86103820800781, + "logps/rejected": -75.29658508300781, + "loss": 0.6709, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.15779516100883484, + "rewards/margins": 0.049841057509183884, + "rewards/rejected": -0.20763623714447021, "step": 3650 }, { - "epoch": 0.63, - "grad_norm": 22.175111801909775, - "learning_rate": 4.326677115264547e-07, - "logits/chosen": -1.3863328695297241, - "logits/rejected": -1.3272384405136108, - "logps/chosen": -210.5821075439453, - "logps/rejected": -292.0908203125, - "loss": 0.5043, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.5674175024032593, - "rewards/margins": 0.8180697560310364, - "rewards/rejected": -2.3854870796203613, + "epoch": 0.6305995864920745, + "grad_norm": 3.538102149963379, + "learning_rate": 8.653354230529094e-08, + "logits/chosen": -2.894085645675659, + "logits/rejected": -2.864441394805908, + "logps/chosen": -70.91181945800781, + "logps/rejected": -77.59912109375, + "loss": 0.6612, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.17053073644638062, + "rewards/margins": 0.07006730139255524, + "rewards/rejected": -0.24059805274009705, "step": 3660 }, { - "epoch": 0.63, - "grad_norm": 18.961212344873417, - "learning_rate": 4.321536147915334e-07, - "logits/chosen": -1.3621985912322998, - "logits/rejected": -1.3097569942474365, - "logps/chosen": -205.28738403320312, - "logps/rejected": -273.30584716796875, - "loss": 0.5708, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.5246150493621826, - "rewards/margins": 0.6830393671989441, - "rewards/rejected": -2.2076547145843506, + "epoch": 0.6323225361819435, + "grad_norm": 3.7949655055999756, + "learning_rate": 8.643072295830669e-08, + "logits/chosen": -2.8500585556030273, + "logits/rejected": -2.8231747150421143, + "logps/chosen": -70.67984771728516, + "logps/rejected": -75.11799621582031, + "loss": 0.6726, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.17828655242919922, + "rewards/margins": 0.04741708189249039, + "rewards/rejected": -0.225703626871109, "step": 3670 }, { - "epoch": 0.63, - "grad_norm": 17.75818169123661, - "learning_rate": 4.316378708274481e-07, - "logits/chosen": -1.4744240045547485, - "logits/rejected": -1.422086477279663, - "logps/chosen": -186.6185302734375, - "logps/rejected": -245.1935577392578, - "loss": 0.5536, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.3194071054458618, - "rewards/margins": 0.6318386197090149, - "rewards/rejected": -1.951245665550232, + "epoch": 0.6340454858718125, + "grad_norm": 3.9998621940612793, + "learning_rate": 8.632757416548961e-08, + "logits/chosen": -2.8689355850219727, + "logits/rejected": -2.830148696899414, + "logps/chosen": -72.94480895996094, + "logps/rejected": -73.00611877441406, + "loss": 0.6736, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.18268983066082, + "rewards/margins": 0.04636243358254433, + "rewards/rejected": -0.22905227541923523, "step": 3680 }, { - "epoch": 0.64, - "grad_norm": 25.3589027579314, - "learning_rate": 4.31120484298111e-07, - "logits/chosen": -1.4429172277450562, - "logits/rejected": -1.4147446155548096, - "logps/chosen": -174.2320098876953, - "logps/rejected": -255.7716064453125, - "loss": 0.5238, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.2215301990509033, - "rewards/margins": 0.7453585863113403, - "rewards/rejected": -1.9668890237808228, + "epoch": 0.6357684355616816, + "grad_norm": 4.77199649810791, + "learning_rate": 8.62240968596222e-08, + "logits/chosen": -2.806609630584717, + "logits/rejected": -2.814436912536621, + "logps/chosen": -69.28324890136719, + "logps/rejected": -82.42266845703125, + "loss": 0.6658, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.17189538478851318, + "rewards/margins": 0.061388492584228516, + "rewards/rejected": -0.2332838773727417, "step": 3690 }, { - "epoch": 0.64, - "grad_norm": 17.358115744138235, - "learning_rate": 4.306014598822886e-07, - "logits/chosen": -1.4474033117294312, - "logits/rejected": -1.394345998764038, - "logps/chosen": -179.29293823242188, - "logps/rejected": -256.70098876953125, - "loss": 0.517, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2170681953430176, - "rewards/margins": 0.7563605904579163, - "rewards/rejected": -1.9734289646148682, + "epoch": 0.6374913852515507, + "grad_norm": 3.955845355987549, + "learning_rate": 8.612029197645772e-08, + "logits/chosen": -2.8485870361328125, + "logits/rejected": -2.822967052459717, + "logps/chosen": -74.45802307128906, + "logps/rejected": -82.77164459228516, + "loss": 0.664, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.16860488057136536, + "rewards/margins": 0.06524287164211273, + "rewards/rejected": -0.23384776711463928, "step": 3700 }, { - "epoch": 0.64, - "eval_logits/chosen": -1.523759365081787, - "eval_logits/rejected": -1.4999202489852905, - "eval_logps/chosen": -181.3194580078125, - "eval_logps/rejected": -215.5612030029297, - "eval_loss": 0.6243796944618225, - "eval_rewards/accuracies": 0.6565985083580017, - "eval_rewards/chosen": -1.2261559963226318, - "eval_rewards/margins": 0.2978822588920593, - "eval_rewards/rejected": -1.5240384340286255, - "eval_runtime": 357.0346, - "eval_samples_per_second": 12.055, - "eval_steps_per_second": 1.507, + "epoch": 0.6374913852515507, + "eval_logits/chosen": -2.954319953918457, + "eval_logits/rejected": -2.9484646320343018, + "eval_logps/chosen": -70.8017807006836, + "eval_logps/rejected": -78.20402526855469, + "eval_loss": 0.6803364157676697, + "eval_rewards/accuracies": 0.6089683771133423, + "eval_rewards/chosen": -0.12089894711971283, + "eval_rewards/margins": 0.029340064153075218, + "eval_rewards/rejected": -0.1502390056848526, + "eval_runtime": 359.9254, + "eval_samples_per_second": 11.958, + "eval_steps_per_second": 1.495, "step": 3700 }, { - "epoch": 0.64, - "grad_norm": 21.218376082289595, - "learning_rate": 4.3008080227355844e-07, - "logits/chosen": -1.4100111722946167, - "logits/rejected": -1.3635252714157104, - "logps/chosen": -195.98471069335938, - "logps/rejected": -256.88739013671875, - "loss": 0.5513, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.4212100505828857, - "rewards/margins": 0.6375278234481812, - "rewards/rejected": -2.0587379932403564, + "epoch": 0.6392143349414197, + "grad_norm": 4.2370991706848145, + "learning_rate": 8.601616045471168e-08, + "logits/chosen": -2.8805429935455322, + "logits/rejected": -2.849026918411255, + "logps/chosen": -71.5788345336914, + "logps/rejected": -73.38832092285156, + "loss": 0.6726, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.1771015226840973, + "rewards/margins": 0.04644893854856491, + "rewards/rejected": -0.2235504388809204, "step": 3710 }, { - "epoch": 0.64, - "grad_norm": 21.355833105859, - "learning_rate": 4.295585161802674e-07, - "logits/chosen": -1.4289751052856445, - "logits/rejected": -1.3828635215759277, - "logps/chosen": -182.1897735595703, - "logps/rejected": -262.51226806640625, - "loss": 0.4968, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.303205132484436, - "rewards/margins": 0.7883247137069702, - "rewards/rejected": -2.0915298461914062, + "epoch": 0.6409372846312887, + "grad_norm": 3.7011959552764893, + "learning_rate": 8.591170323605347e-08, + "logits/chosen": -2.893097400665283, + "logits/rejected": -2.875398635864258, + "logps/chosen": -68.0528564453125, + "logps/rejected": -76.62176513671875, + "loss": 0.661, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.16156095266342163, + "rewards/margins": 0.07080355286598206, + "rewards/rejected": -0.2323645055294037, "step": 3720 }, { - "epoch": 0.64, - "grad_norm": 22.252174921593365, - "learning_rate": 4.2903460632548893e-07, - "logits/chosen": -1.3439371585845947, - "logits/rejected": -1.2857837677001953, - "logps/chosen": -212.6474151611328, - "logps/rejected": -296.428466796875, - "loss": 0.4997, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.581465721130371, - "rewards/margins": 0.8757398724555969, - "rewards/rejected": -2.4572055339813232, + "epoch": 0.6426602343211578, + "grad_norm": 4.4856791496276855, + "learning_rate": 8.580692126509778e-08, + "logits/chosen": -2.853728771209717, + "logits/rejected": -2.830976963043213, + "logps/chosen": -71.31932830810547, + "logps/rejected": -74.6063461303711, + "loss": 0.6615, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.16801698505878448, + "rewards/margins": 0.07099028676748276, + "rewards/rejected": -0.23900727927684784, "step": 3730 }, { - "epoch": 0.64, - "grad_norm": 22.471686436011503, - "learning_rate": 4.285090774469802e-07, - "logits/chosen": -1.3240846395492554, - "logits/rejected": -1.2739444971084595, - "logps/chosen": -212.66140747070312, - "logps/rejected": -290.9830017089844, - "loss": 0.5364, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.592313528060913, - "rewards/margins": 0.7535529136657715, - "rewards/rejected": -2.3458666801452637, + "epoch": 0.6443831840110269, + "grad_norm": 4.300098896026611, + "learning_rate": 8.570181548939604e-08, + "logits/chosen": -2.85255765914917, + "logits/rejected": -2.8277950286865234, + "logps/chosen": -70.86285400390625, + "logps/rejected": -79.48960876464844, + "loss": 0.6683, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17427803575992584, + "rewards/margins": 0.05652335286140442, + "rewards/rejected": -0.23080138862133026, "step": 3740 }, { - "epoch": 0.65, - "grad_norm": 17.899544534984106, - "learning_rate": 4.2798193429713913e-07, - "logits/chosen": -1.440411925315857, - "logits/rejected": -1.3944687843322754, - "logps/chosen": -198.22142028808594, - "logps/rejected": -263.0658874511719, - "loss": 0.5618, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.44929838180542, - "rewards/margins": 0.650551974773407, - "rewards/rejected": -2.0998501777648926, + "epoch": 0.646106133700896, + "grad_norm": 4.036457538604736, + "learning_rate": 8.559638685942782e-08, + "logits/chosen": -2.9201862812042236, + "logits/rejected": -2.892306089401245, + "logps/chosen": -69.86690521240234, + "logps/rejected": -76.1387710571289, + "loss": 0.6639, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.16548243165016174, + "rewards/margins": 0.06511317193508148, + "rewards/rejected": -0.23059561848640442, "step": 3750 }, { - "epoch": 0.65, - "grad_norm": 25.746429517526668, - "learning_rate": 4.27453181642962e-07, - "logits/chosen": -1.4367876052856445, - "logits/rejected": -1.4018447399139404, - "logps/chosen": -195.23745727539062, - "logps/rejected": -260.03192138671875, - "loss": 0.5471, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3940891027450562, - "rewards/margins": 0.6460259556770325, - "rewards/rejected": -2.0401148796081543, + "epoch": 0.647829083390765, + "grad_norm": 4.326759338378906, + "learning_rate": 8.54906363285924e-08, + "logits/chosen": -2.8713104724884033, + "logits/rejected": -2.8680050373077393, + "logps/chosen": -74.909423828125, + "logps/rejected": -80.24607849121094, + "loss": 0.671, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.1903577744960785, + "rewards/margins": 0.05150656774640083, + "rewards/rejected": -0.24186435341835022, "step": 3760 }, { - "epoch": 0.65, - "grad_norm": 22.141823531733728, - "learning_rate": 4.2692282426599967e-07, - "logits/chosen": -1.4208014011383057, - "logits/rejected": -1.3772103786468506, - "logps/chosen": -181.8473663330078, - "logps/rejected": -244.9152374267578, - "loss": 0.5249, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.2831165790557861, - "rewards/margins": 0.6210489273071289, - "rewards/rejected": -1.904165506362915, + "epoch": 0.649552033080634, + "grad_norm": 4.169382095336914, + "learning_rate": 8.538456485319994e-08, + "logits/chosen": -2.843334197998047, + "logits/rejected": -2.8231163024902344, + "logps/chosen": -71.5437240600586, + "logps/rejected": -76.70804595947266, + "loss": 0.6752, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.18002033233642578, + "rewards/margins": 0.04156037047505379, + "rewards/rejected": -0.22158071398735046, "step": 3770 }, { - "epoch": 0.65, - "grad_norm": 25.749235391096974, - "learning_rate": 4.2639086696231483e-07, - "logits/chosen": -1.3430489301681519, - "logits/rejected": -1.2899630069732666, - "logps/chosen": -210.1292266845703, - "logps/rejected": -266.73431396484375, - "loss": 0.552, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.5607130527496338, - "rewards/margins": 0.6126449704170227, - "rewards/rejected": -2.1733579635620117, + "epoch": 0.6512749827705031, + "grad_norm": 3.6488869190216064, + "learning_rate": 8.527817339246297e-08, + "logits/chosen": -2.8893494606018066, + "logits/rejected": -2.8518636226654053, + "logps/chosen": -72.23634338378906, + "logps/rejected": -73.34587097167969, + "loss": 0.6675, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.18157723546028137, + "rewards/margins": 0.05787067487835884, + "rewards/rejected": -0.23944790661334991, "step": 3780 }, { - "epoch": 0.65, - "grad_norm": 17.91537947782448, - "learning_rate": 4.2585731454243834e-07, - "logits/chosen": -1.347544550895691, - "logits/rejected": -1.2992546558380127, - "logps/chosen": -203.87083435058594, - "logps/rejected": -270.9369812011719, - "loss": 0.5513, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.498838186264038, - "rewards/margins": 0.7007363438606262, - "rewards/rejected": -2.1995744705200195, + "epoch": 0.6529979324603722, + "grad_norm": 3.9068140983581543, + "learning_rate": 8.517146290848767e-08, + "logits/chosen": -2.842933177947998, + "logits/rejected": -2.814704656600952, + "logps/chosen": -71.16899108886719, + "logps/rejected": -74.94173431396484, + "loss": 0.6627, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.17180873453617096, + "rewards/margins": 0.0677022859454155, + "rewards/rejected": -0.23951101303100586, "step": 3790 }, { - "epoch": 0.65, - "grad_norm": 20.642915135629426, - "learning_rate": 4.2532217183132566e-07, - "logits/chosen": -1.4202806949615479, - "logits/rejected": -1.3704365491867065, - "logps/chosen": -190.59255981445312, - "logps/rejected": -250.588134765625, - "loss": 0.5368, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3290106058120728, - "rewards/margins": 0.6480494737625122, - "rewards/rejected": -1.977060317993164, + "epoch": 0.6547208821502413, + "grad_norm": 5.100893974304199, + "learning_rate": 8.506443436626513e-08, + "logits/chosen": -2.8600306510925293, + "logits/rejected": -2.828758716583252, + "logps/chosen": -75.09590148925781, + "logps/rejected": -76.66107177734375, + "loss": 0.6644, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.1740262657403946, + "rewards/margins": 0.06390801072120667, + "rewards/rejected": -0.23793426156044006, "step": 3800 }, { - "epoch": 0.65, - "eval_logits/chosen": -1.523742437362671, - "eval_logits/rejected": -1.5010066032409668, - "eval_logps/chosen": -182.3809356689453, - "eval_logps/rejected": -216.2484893798828, - "eval_loss": 0.6206509470939636, - "eval_rewards/accuracies": 0.6579925417900085, - "eval_rewards/chosen": -1.2367708683013916, - "eval_rewards/margins": 0.29413995146751404, - "eval_rewards/rejected": -1.530910849571228, - "eval_runtime": 357.0405, - "eval_samples_per_second": 12.055, - "eval_steps_per_second": 1.507, + "epoch": 0.6547208821502413, + "eval_logits/chosen": -2.944425344467163, + "eval_logits/rejected": -2.9385876655578613, + "eval_logps/chosen": -71.985107421875, + "eval_logps/rejected": -79.5918197631836, + "eval_loss": 0.6795005202293396, + "eval_rewards/accuracies": 0.6110594868659973, + "eval_rewards/chosen": -0.132732093334198, + "eval_rewards/margins": 0.031384874135255814, + "eval_rewards/rejected": -0.16411694884300232, + "eval_runtime": 360.5198, + "eval_samples_per_second": 11.938, + "eval_steps_per_second": 1.492, "step": 3800 }, { - "epoch": 0.66, - "grad_norm": 31.94932671255532, - "learning_rate": 4.2478544366831373e-07, - "logits/chosen": -1.4317169189453125, - "logits/rejected": -1.3770487308502197, - "logps/chosen": -202.6186981201172, - "logps/rejected": -254.42404174804688, - "loss": 0.5594, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.4515069723129272, - "rewards/margins": 0.57206791639328, - "rewards/rejected": -2.0235750675201416, + "epoch": 0.6564438318401102, + "grad_norm": 4.450614929199219, + "learning_rate": 8.495708873366273e-08, + "logits/chosen": -2.882823944091797, + "logits/rejected": -2.8404746055603027, + "logps/chosen": -76.32991027832031, + "logps/rejected": -77.35725402832031, + "loss": 0.6645, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.18861815333366394, + "rewards/margins": 0.0643153265118599, + "rewards/rejected": -0.25293344259262085, "step": 3810 }, { - "epoch": 0.66, - "grad_norm": 26.840179888734244, - "learning_rate": 4.242471349070765e-07, - "logits/chosen": -1.430687665939331, - "logits/rejected": -1.3825973272323608, - "logps/chosen": -182.3372039794922, - "logps/rejected": -254.831787109375, - "loss": 0.5042, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2825406789779663, - "rewards/margins": 0.7350937128067017, - "rewards/rejected": -2.017634630203247, + "epoch": 0.6581667815299793, + "grad_norm": 4.633328914642334, + "learning_rate": 8.48494269814153e-08, + "logits/chosen": -2.851984739303589, + "logits/rejected": -2.835164785385132, + "logps/chosen": -72.11973571777344, + "logps/rejected": -76.26155090332031, + "loss": 0.6702, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.18021515011787415, + "rewards/margins": 0.051523733884096146, + "rewards/rejected": -0.23173888027668, "step": 3820 }, { - "epoch": 0.66, - "grad_norm": 20.713279754546374, - "learning_rate": 4.2370725041558163e-07, - "logits/chosen": -1.4622533321380615, - "logits/rejected": -1.3965160846710205, - "logps/chosen": -194.24440002441406, - "logps/rejected": -252.4665985107422, - "loss": 0.5156, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3557844161987305, - "rewards/margins": 0.6555383205413818, - "rewards/rejected": -2.0113227367401123, + "epoch": 0.6598897312198484, + "grad_norm": 4.264226913452148, + "learning_rate": 8.474145008311633e-08, + "logits/chosen": -2.895481586456299, + "logits/rejected": -2.8515303134918213, + "logps/chosen": -76.1996078491211, + "logps/rejected": -75.78540802001953, + "loss": 0.6619, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.17522504925727844, + "rewards/margins": 0.06902584433555603, + "rewards/rejected": -0.24425086379051208, "step": 3830 }, { - "epoch": 0.66, - "grad_norm": 22.738897007983, - "learning_rate": 4.2316579507604613e-07, - "logits/chosen": -1.3598577976226807, - "logits/rejected": -1.3157683610916138, - "logps/chosen": -200.95361328125, - "logps/rejected": -288.90191650390625, - "loss": 0.5188, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.468830943107605, - "rewards/margins": 0.8298671841621399, - "rewards/rejected": -2.2986984252929688, + "epoch": 0.6616126809097175, + "grad_norm": 4.619011878967285, + "learning_rate": 8.463315901520923e-08, + "logits/chosen": -2.823070526123047, + "logits/rejected": -2.8097739219665527, + "logps/chosen": -70.88066101074219, + "logps/rejected": -83.83921813964844, + "loss": 0.6574, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.16794264316558838, + "rewards/margins": 0.08013279736042023, + "rewards/rejected": -0.2480754405260086, "step": 3840 }, { - "epoch": 0.66, - "grad_norm": 26.405390100521963, - "learning_rate": 4.2262277378489224e-07, - "logits/chosen": -1.427339792251587, - "logits/rejected": -1.385075330734253, - "logps/chosen": -227.91049194335938, - "logps/rejected": -302.1413879394531, - "loss": 0.5196, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.706134557723999, - "rewards/margins": 0.7777893543243408, - "rewards/rejected": -2.483924150466919, + "epoch": 0.6633356305995864, + "grad_norm": 4.062577247619629, + "learning_rate": 8.452455475697845e-08, + "logits/chosen": -2.9455015659332275, + "logits/rejected": -2.930697441101074, + "logps/chosen": -75.87129211425781, + "logps/rejected": -77.36614227294922, + "loss": 0.6708, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.1857018619775772, + "rewards/margins": 0.05030056834220886, + "rewards/rejected": -0.23600240051746368, "step": 3850 }, { - "epoch": 0.67, - "grad_norm": 28.541254516094085, - "learning_rate": 4.2207819145270346e-07, - "logits/chosen": -1.4458119869232178, - "logits/rejected": -1.3982911109924316, - "logps/chosen": -232.8706512451172, - "logps/rejected": -297.7009582519531, - "loss": 0.568, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.7871061563491821, - "rewards/margins": 0.6608562469482422, - "rewards/rejected": -2.4479622840881348, + "epoch": 0.6650585802894555, + "grad_norm": 4.330991268157959, + "learning_rate": 8.44156382905407e-08, + "logits/chosen": -2.9505677223205566, + "logits/rejected": -2.925079107284546, + "logps/chosen": -75.33770751953125, + "logps/rejected": -77.89942932128906, + "loss": 0.6776, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.21157272160053253, + "rewards/margins": 0.03799385949969292, + "rewards/rejected": -0.24956655502319336, "step": 3860 }, { - "epoch": 0.67, - "grad_norm": 20.798655132332478, - "learning_rate": 4.2153205300417966e-07, - "logits/chosen": -1.4056997299194336, - "logits/rejected": -1.3534657955169678, - "logps/chosen": -214.6895751953125, - "logps/rejected": -290.5361633300781, - "loss": 0.5187, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.5584853887557983, - "rewards/margins": 0.7904712557792664, - "rewards/rejected": -2.34895658493042, + "epoch": 0.6667815299793246, + "grad_norm": 4.8397111892700195, + "learning_rate": 8.430641060083593e-08, + "logits/chosen": -2.8415279388427734, + "logits/rejected": -2.8184056282043457, + "logps/chosen": -77.4899673461914, + "logps/rejected": -80.97842407226562, + "loss": 0.6633, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.1863713264465332, + "rewards/margins": 0.06677299737930298, + "rewards/rejected": -0.2531442940235138, "step": 3870 }, { - "epoch": 0.67, - "grad_norm": 25.018107512387246, - "learning_rate": 4.209843633780929e-07, - "logits/chosen": -1.5281155109405518, - "logits/rejected": -1.5098029375076294, - "logps/chosen": -187.2782440185547, - "logps/rejected": -250.95950317382812, - "loss": 0.5438, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3641104698181152, - "rewards/margins": 0.6108844876289368, - "rewards/rejected": -1.9749950170516968, + "epoch": 0.6685044796691937, + "grad_norm": 4.591561317443848, + "learning_rate": 8.419687267561858e-08, + "logits/chosen": -2.875171184539795, + "logits/rejected": -2.879774332046509, + "logps/chosen": -72.00981140136719, + "logps/rejected": -78.77186584472656, + "loss": 0.6753, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.2113080769777298, + "rewards/margins": 0.041494350880384445, + "rewards/rejected": -0.25280243158340454, "step": 3880 }, { - "epoch": 0.67, - "grad_norm": 17.201393329622153, - "learning_rate": 4.204351275272426e-07, - "logits/chosen": -1.5760449171066284, - "logits/rejected": -1.5332744121551514, - "logps/chosen": -177.4027862548828, - "logps/rejected": -236.5082244873047, - "loss": 0.5651, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.2320191860198975, - "rewards/margins": 0.6117894053459167, - "rewards/rejected": -1.8438085317611694, + "epoch": 0.6702274293590628, + "grad_norm": 4.114559173583984, + "learning_rate": 8.408702550544853e-08, + "logits/chosen": -2.9188356399536133, + "logits/rejected": -2.892014503479004, + "logps/chosen": -73.74089050292969, + "logps/rejected": -78.37785339355469, + "loss": 0.6636, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.19554482400417328, + "rewards/margins": 0.06676265597343445, + "rewards/rejected": -0.26230746507644653, "step": 3890 }, { - "epoch": 0.67, - "grad_norm": 18.018343000765952, - "learning_rate": 4.1988435041841096e-07, - "logits/chosen": -1.5944218635559082, - "logits/rejected": -1.5262387990951538, - "logps/chosen": -170.49252319335938, - "logps/rejected": -219.80899047851562, - "loss": 0.5382, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.1367781162261963, - "rewards/margins": 0.5586223006248474, - "rewards/rejected": -1.6954004764556885, + "epoch": 0.6719503790489317, + "grad_norm": 4.3273820877075195, + "learning_rate": 8.39768700836822e-08, + "logits/chosen": -2.896733283996582, + "logits/rejected": -2.842710018157959, + "logps/chosen": -76.47528839111328, + "logps/rejected": -75.9754409790039, + "loss": 0.6664, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.19646716117858887, + "rewards/margins": 0.060664378106594086, + "rewards/rejected": -0.25713151693344116, "step": 3900 }, { - "epoch": 0.67, - "eval_logits/chosen": -1.6579508781433105, - "eval_logits/rejected": -1.6361998319625854, - "eval_logps/chosen": -160.2046661376953, - "eval_logps/rejected": -190.85934448242188, - "eval_loss": 0.6221497654914856, - "eval_rewards/accuracies": 0.6596189737319946, - "eval_rewards/chosen": -1.0150080919265747, - "eval_rewards/margins": 0.26201140880584717, - "eval_rewards/rejected": -1.2770196199417114, - "eval_runtime": 357.0359, - "eval_samples_per_second": 12.055, - "eval_steps_per_second": 1.507, + "epoch": 0.6719503790489317, + "eval_logits/chosen": -2.9358367919921875, + "eval_logits/rejected": -2.9299936294555664, + "eval_logps/chosen": -73.20443725585938, + "eval_logps/rejected": -81.02217102050781, + "eval_loss": 0.6786462664604187, + "eval_rewards/accuracies": 0.6080390214920044, + "eval_rewards/chosen": -0.14492543041706085, + "eval_rewards/margins": 0.03349505737423897, + "eval_rewards/rejected": -0.17842045426368713, + "eval_runtime": 359.9545, + "eval_samples_per_second": 11.957, + "eval_steps_per_second": 1.495, "step": 3900 }, { - "epoch": 0.67, - "grad_norm": 15.180474012277164, - "learning_rate": 4.1933203703231766e-07, - "logits/chosen": -1.584212303161621, - "logits/rejected": -1.5458735227584839, - "logps/chosen": -177.2735595703125, - "logps/rejected": -240.51168823242188, - "loss": 0.5167, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.2165385484695435, - "rewards/margins": 0.6424941420555115, - "rewards/rejected": -1.8590329885482788, + "epoch": 0.6736733287388008, + "grad_norm": 3.9224681854248047, + "learning_rate": 8.386640740646353e-08, + "logits/chosen": -2.869636297225952, + "logits/rejected": -2.854032516479492, + "logps/chosen": -76.14567565917969, + "logps/rejected": -82.44049835205078, + "loss": 0.6604, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.20504307746887207, + "rewards/margins": 0.07330290973186493, + "rewards/rejected": -0.2783460021018982, "step": 3910 }, { - "epoch": 0.68, - "grad_norm": 20.98038793058905, - "learning_rate": 4.1877819236357524e-07, - "logits/chosen": -1.5897353887557983, - "logits/rejected": -1.5237689018249512, - "logps/chosen": -172.70350646972656, - "logps/rejected": -239.7980499267578, - "loss": 0.4887, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.1406900882720947, - "rewards/margins": 0.7530602216720581, - "rewards/rejected": -1.8937504291534424, + "epoch": 0.6753962784286699, + "grad_norm": 4.396719932556152, + "learning_rate": 8.375563847271506e-08, + "logits/chosen": -2.9339499473571777, + "logits/rejected": -2.8893306255340576, + "logps/chosen": -77.58342742919922, + "logps/rejected": -77.1610336303711, + "loss": 0.6589, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.189140185713768, + "rewards/margins": 0.07799036800861359, + "rewards/rejected": -0.2671305537223816, "step": 3920 }, { - "epoch": 0.68, - "grad_norm": 29.051405565574303, - "learning_rate": 4.182228214206437e-07, - "logits/chosen": -1.5160815715789795, - "logits/rejected": -1.48716139793396, - "logps/chosen": -189.08810424804688, - "logps/rejected": -257.77593994140625, - "loss": 0.5336, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3786773681640625, - "rewards/margins": 0.6711302399635315, - "rewards/rejected": -2.0498077869415283, + "epoch": 0.677119228118539, + "grad_norm": 4.47017765045166, + "learning_rate": 8.364456428412874e-08, + "logits/chosen": -2.937316417694092, + "logits/rejected": -2.9325146675109863, + "logps/chosen": -71.93880462646484, + "logps/rejected": -79.04782104492188, + "loss": 0.6691, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.20680996775627136, + "rewards/margins": 0.055847086012363434, + "rewards/rejected": -0.2626570761203766, "step": 3930 }, { - "epoch": 0.68, - "grad_norm": 32.250601718030964, - "learning_rate": 4.1766592922578527e-07, - "logits/chosen": -1.3783949613571167, - "logits/rejected": -1.3409314155578613, - "logps/chosen": -183.18594360351562, - "logps/rejected": -250.7453155517578, - "loss": 0.5505, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.297050952911377, - "rewards/margins": 0.6737505793571472, - "rewards/rejected": -1.9708013534545898, + "epoch": 0.6788421778084079, + "grad_norm": 4.382420539855957, + "learning_rate": 8.353318584515705e-08, + "logits/chosen": -2.7662198543548584, + "logits/rejected": -2.7476911544799805, + "logps/chosen": -72.57732391357422, + "logps/rejected": -78.52955627441406, + "loss": 0.6678, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.19077327847480774, + "rewards/margins": 0.05754157155752182, + "rewards/rejected": -0.24831485748291016, "step": 3940 }, { - "epoch": 0.68, - "grad_norm": 19.935483029280153, - "learning_rate": 4.1710752081501877e-07, - "logits/chosen": -1.3798249959945679, - "logits/rejected": -1.311702847480774, - "logps/chosen": -178.89566040039062, - "logps/rejected": -250.24734497070312, - "loss": 0.4886, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.2341785430908203, - "rewards/margins": 0.7588900327682495, - "rewards/rejected": -1.9930686950683594, + "epoch": 0.680565127498277, + "grad_norm": 4.424745082855225, + "learning_rate": 8.342150416300375e-08, + "logits/chosen": -2.7651352882385254, + "logits/rejected": -2.7198243141174316, + "logps/chosen": -75.12512969970703, + "logps/rejected": -77.98560333251953, + "loss": 0.6601, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.19651785492897034, + "rewards/margins": 0.07387096434831619, + "rewards/rejected": -0.27038878202438354, "step": 3950 }, { - "epoch": 0.68, - "grad_norm": 33.05051638682265, - "learning_rate": 4.1654760123807464e-07, - "logits/chosen": -1.4223079681396484, - "logits/rejected": -1.3881456851959229, - "logps/chosen": -198.07431030273438, - "logps/rejected": -285.7303771972656, - "loss": 0.4943, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.4536970853805542, - "rewards/margins": 0.8098018765449524, - "rewards/rejected": -2.2634987831115723, + "epoch": 0.6822880771881461, + "grad_norm": 4.411109924316406, + "learning_rate": 8.330952024761493e-08, + "logits/chosen": -2.8689446449279785, + "logits/rejected": -2.869525194168091, + "logps/chosen": -72.38044738769531, + "logps/rejected": -87.84962463378906, + "loss": 0.656, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.19677282869815826, + "rewards/margins": 0.0877741351723671, + "rewards/rejected": -0.28454697132110596, "step": 3960 }, { - "epoch": 0.68, - "grad_norm": 27.584592344930318, - "learning_rate": 4.159861755583487e-07, - "logits/chosen": -1.3134465217590332, - "logits/rejected": -1.2678642272949219, - "logps/chosen": -234.09814453125, - "logps/rejected": -308.8127746582031, - "loss": 0.549, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.800374984741211, - "rewards/margins": 0.7578364610671997, - "rewards/rejected": -2.558211326599121, + "epoch": 0.6840110268780152, + "grad_norm": 4.500481605529785, + "learning_rate": 8.319723511166973e-08, + "logits/chosen": -2.8778367042541504, + "logits/rejected": -2.8542728424072266, + "logps/chosen": -74.01570129394531, + "logps/rejected": -80.1430435180664, + "loss": 0.6614, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.19942805171012878, + "rewards/margins": 0.07196511328220367, + "rewards/rejected": -0.2713931202888489, "step": 3970 }, { - "epoch": 0.69, - "grad_norm": 29.469171946700417, - "learning_rate": 4.154232488528566e-07, - "logits/chosen": -1.1992053985595703, - "logits/rejected": -1.1396461725234985, - "logps/chosen": -219.92236328125, - "logps/rejected": -315.6834411621094, - "loss": 0.4724, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.6659343242645264, - "rewards/margins": 0.9590142965316772, - "rewards/rejected": -2.624948501586914, + "epoch": 0.6857339765678843, + "grad_norm": 4.250563621520996, + "learning_rate": 8.308464977057131e-08, + "logits/chosen": -2.7574124336242676, + "logits/rejected": -2.7379393577575684, + "logps/chosen": -74.24089813232422, + "logps/rejected": -81.76484680175781, + "loss": 0.659, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.20901378989219666, + "rewards/margins": 0.07665853202342987, + "rewards/rejected": -0.2856723368167877, "step": 3980 }, { - "epoch": 0.69, - "grad_norm": 17.97839097611244, - "learning_rate": 4.148588262121877e-07, - "logits/chosen": -1.3333715200424194, - "logits/rejected": -1.299889326095581, - "logps/chosen": -211.5640106201172, - "logps/rejected": -273.6368408203125, - "loss": 0.5775, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.5835427045822144, - "rewards/margins": 0.6002888083457947, - "rewards/rejected": -2.183831214904785, + "epoch": 0.6874569262577532, + "grad_norm": 4.266327381134033, + "learning_rate": 8.297176524243754e-08, + "logits/chosen": -2.862718105316162, + "logits/rejected": -2.8492565155029297, + "logps/chosen": -74.35490417480469, + "logps/rejected": -82.00604248046875, + "loss": 0.6684, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21138229966163635, + "rewards/margins": 0.05601108819246292, + "rewards/rejected": -0.26739341020584106, "step": 3990 }, { - "epoch": 0.69, - "grad_norm": 29.741620640645017, - "learning_rate": 4.1429291274045965e-07, - "logits/chosen": -1.5011112689971924, - "logits/rejected": -1.4447122812271118, - "logps/chosen": -197.10702514648438, - "logps/rejected": -261.1085205078125, - "loss": 0.5399, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.399017572402954, - "rewards/margins": 0.6844178438186646, - "rewards/rejected": -2.083435535430908, + "epoch": 0.6891798759476223, + "grad_norm": 4.685428619384766, + "learning_rate": 8.285858254809193e-08, + "logits/chosen": -2.930467367172241, + "logits/rejected": -2.8961825370788574, + "logps/chosen": -78.38148498535156, + "logps/rejected": -80.21168518066406, + "loss": 0.6653, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.21177248656749725, + "rewards/margins": 0.06271915137767792, + "rewards/rejected": -0.27449163794517517, "step": 4000 }, { - "epoch": 0.69, - "eval_logits/chosen": -1.5105490684509277, - "eval_logits/rejected": -1.486973524093628, - "eval_logps/chosen": -175.73806762695312, - "eval_logps/rejected": -209.60133361816406, - "eval_loss": 0.6212473511695862, - "eval_rewards/accuracies": 0.6598513126373291, - "eval_rewards/chosen": -1.170341968536377, - "eval_rewards/margins": 0.29409757256507874, - "eval_rewards/rejected": -1.4644395112991333, - "eval_runtime": 356.9871, - "eval_samples_per_second": 12.056, - "eval_steps_per_second": 1.507, + "epoch": 0.6891798759476223, + "eval_logits/chosen": -2.923618793487549, + "eval_logits/rejected": -2.9177629947662354, + "eval_logps/chosen": -74.3039779663086, + "eval_logps/rejected": -82.26918029785156, + "eval_loss": 0.6780802607536316, + "eval_rewards/accuracies": 0.6057156324386597, + "eval_rewards/chosen": -0.15592080354690552, + "eval_rewards/margins": 0.034969717264175415, + "eval_rewards/rejected": -0.19089052081108093, + "eval_runtime": 360.2025, + "eval_samples_per_second": 11.949, + "eval_steps_per_second": 1.494, "step": 4000 }, { - "epoch": 0.69, - "grad_norm": 21.179313854062823, - "learning_rate": 4.137255135552714e-07, - "logits/chosen": -1.3642061948776245, - "logits/rejected": -1.3185532093048096, - "logps/chosen": -176.0836944580078, - "logps/rejected": -256.7080078125, - "loss": 0.5005, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.2807254791259766, - "rewards/margins": 0.7561134099960327, - "rewards/rejected": -2.036839008331299, + "epoch": 0.6909028256374914, + "grad_norm": 4.136673927307129, + "learning_rate": 8.274510271105428e-08, + "logits/chosen": -2.8100674152374268, + "logits/rejected": -2.8002233505249023, + "logps/chosen": -69.55921936035156, + "logps/rejected": -82.14431762695312, + "loss": 0.6583, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.21539802849292755, + "rewards/margins": 0.07587047666311264, + "rewards/rejected": -0.291268527507782, "step": 4010 }, { - "epoch": 0.69, - "grad_norm": 28.693872888159124, - "learning_rate": 4.131566337876575e-07, - "logits/chosen": -1.3393471240997314, - "logits/rejected": -1.3077666759490967, - "logps/chosen": -198.69711303710938, - "logps/rejected": -269.0760192871094, - "loss": 0.5463, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.4653034210205078, - "rewards/margins": 0.694530189037323, - "rewards/rejected": -2.1598334312438965, + "epoch": 0.6926257753273605, + "grad_norm": 4.985778331756592, + "learning_rate": 8.26313267575315e-08, + "logits/chosen": -2.8186466693878174, + "logits/rejected": -2.8054795265197754, + "logps/chosen": -73.369873046875, + "logps/rejected": -80.44734954833984, + "loss": 0.6661, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.2118435651063919, + "rewards/margins": 0.061372630298137665, + "rewards/rejected": -0.273216187953949, "step": 4020 }, { - "epoch": 0.69, - "grad_norm": 20.870224461785025, - "learning_rate": 4.125862785820416e-07, - "logits/chosen": -1.3702881336212158, - "logits/rejected": -1.319515585899353, - "logps/chosen": -193.2422637939453, - "logps/rejected": -268.23193359375, - "loss": 0.5069, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3722060918807983, - "rewards/margins": 0.752329409122467, - "rewards/rejected": -2.12453556060791, + "epoch": 0.6943487250172296, + "grad_norm": 4.250854969024658, + "learning_rate": 8.251725571640831e-08, + "logits/chosen": -2.8195624351501465, + "logits/rejected": -2.7978997230529785, + "logps/chosen": -75.46442413330078, + "logps/rejected": -83.16146087646484, + "loss": 0.6568, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1941043883562088, + "rewards/margins": 0.07958976924419403, + "rewards/rejected": -0.27369412779808044, "step": 4030 }, { - "epoch": 0.7, - "grad_norm": 24.444442071597063, - "learning_rate": 4.1201445309618954e-07, - "logits/chosen": -1.4431445598602295, - "logits/rejected": -1.3992760181427002, - "logps/chosen": -196.61453247070312, - "logps/rejected": -275.2479553222656, - "loss": 0.5035, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.4041612148284912, - "rewards/margins": 0.8080703020095825, - "rewards/rejected": -2.212231397628784, + "epoch": 0.6960716747070985, + "grad_norm": 4.681697368621826, + "learning_rate": 8.240289061923791e-08, + "logits/chosen": -2.904080867767334, + "logits/rejected": -2.8924145698547363, + "logps/chosen": -77.6021728515625, + "logps/rejected": -81.96739196777344, + "loss": 0.6646, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.21409961581230164, + "rewards/margins": 0.06508499383926392, + "rewards/rejected": -0.27918460965156555, "step": 4040 }, { - "epoch": 0.7, - "grad_norm": 20.703451812620813, - "learning_rate": 4.114411625011634e-07, - "logits/chosen": -1.3789803981781006, - "logits/rejected": -1.3350975513458252, - "logps/chosen": -179.94940185546875, - "logps/rejected": -255.5587615966797, - "loss": 0.525, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2938952445983887, - "rewards/margins": 0.7335586547851562, - "rewards/rejected": -2.027453899383545, + "epoch": 0.6977946243969676, + "grad_norm": 4.321097373962402, + "learning_rate": 8.228823250023268e-08, + "logits/chosen": -2.812398672103882, + "logits/rejected": -2.7948527336120605, + "logps/chosen": -70.7357406616211, + "logps/rejected": -81.06127166748047, + "loss": 0.6577, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.20170316100120544, + "rewards/margins": 0.08067599684000015, + "rewards/rejected": -0.2823791801929474, "step": 4050 }, { - "epoch": 0.7, - "grad_norm": 18.961262754259714, - "learning_rate": 4.1086641198127404e-07, - "logits/chosen": -1.392407774925232, - "logits/rejected": -1.3395566940307617, - "logps/chosen": -198.05589294433594, - "logps/rejected": -261.8578796386719, - "loss": 0.5609, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.417249321937561, - "rewards/margins": 0.6800889372825623, - "rewards/rejected": -2.0973381996154785, + "epoch": 0.6995175740868367, + "grad_norm": 4.3770551681518555, + "learning_rate": 8.21732823962548e-08, + "logits/chosen": -2.8143866062164307, + "logits/rejected": -2.782468557357788, + "logps/chosen": -79.16787719726562, + "logps/rejected": -81.5352783203125, + "loss": 0.6644, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.22808189690113068, + "rewards/margins": 0.06586854159832001, + "rewards/rejected": -0.2939504384994507, "step": 4060 }, { - "epoch": 0.7, - "grad_norm": 18.757428674130104, - "learning_rate": 4.102902067340348e-07, - "logits/chosen": -1.3935401439666748, - "logits/rejected": -1.3446584939956665, - "logps/chosen": -188.22537231445312, - "logps/rejected": -258.7428283691406, - "loss": 0.5224, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3439836502075195, - "rewards/margins": 0.7147015333175659, - "rewards/rejected": -2.058685302734375, + "epoch": 0.7012405237767058, + "grad_norm": 4.432036876678467, + "learning_rate": 8.205804134680696e-08, + "logits/chosen": -2.816871166229248, + "logits/rejected": -2.7946834564208984, + "logps/chosen": -76.47846984863281, + "logps/rejected": -82.78707122802734, + "loss": 0.6607, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.22646565735340118, + "rewards/margins": 0.07262485474348068, + "rewards/rejected": -0.2990904748439789, "step": 4070 }, { - "epoch": 0.7, - "grad_norm": 19.87740623644791, - "learning_rate": 4.0971255197011395e-07, - "logits/chosen": -1.3319361209869385, - "logits/rejected": -1.294301986694336, - "logps/chosen": -182.7200469970703, - "logps/rejected": -261.05523681640625, - "loss": 0.5126, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3081274032592773, - "rewards/margins": 0.7623058557510376, - "rewards/rejected": -2.0704331398010254, + "epoch": 0.7029634734665747, + "grad_norm": 4.366625785827637, + "learning_rate": 8.194251039402279e-08, + "logits/chosen": -2.782827377319336, + "logits/rejected": -2.775156259536743, + "logps/chosen": -73.0945053100586, + "logps/rejected": -82.41346740722656, + "loss": 0.6611, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.21194684505462646, + "rewards/margins": 0.0719664990901947, + "rewards/rejected": -0.28391334414482117, "step": 4080 }, { - "epoch": 0.7, - "grad_norm": 22.371735739500583, - "learning_rate": 4.091334529132881e-07, - "logits/chosen": -1.4664791822433472, - "logits/rejected": -1.404679536819458, - "logps/chosen": -177.88174438476562, - "logps/rejected": -248.9783935546875, - "loss": 0.5139, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.2242028713226318, - "rewards/margins": 0.7204464673995972, - "rewards/rejected": -1.944649338722229, + "epoch": 0.7046864231564438, + "grad_norm": 4.243492603302002, + "learning_rate": 8.182669058265762e-08, + "logits/chosen": -2.8675835132598877, + "logits/rejected": -2.837397813796997, + "logps/chosen": -75.27025604248047, + "logps/rejected": -82.02632904052734, + "loss": 0.6589, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.1979300081729889, + "rewards/margins": 0.07707500457763672, + "rewards/rejected": -0.2750049829483032, "step": 4090 }, { - "epoch": 0.71, - "grad_norm": 24.372455930646307, - "learning_rate": 4.0855291480039454e-07, - "logits/chosen": -1.3770744800567627, - "logits/rejected": -1.329611897468567, - "logps/chosen": -182.24974060058594, - "logps/rejected": -250.8807830810547, - "loss": 0.5175, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.2940999269485474, - "rewards/margins": 0.722284197807312, - "rewards/rejected": -2.0163843631744385, + "epoch": 0.7064093728463129, + "grad_norm": 4.171043872833252, + "learning_rate": 8.17105829600789e-08, + "logits/chosen": -2.8199868202209473, + "logits/rejected": -2.791888952255249, + "logps/chosen": -71.82962799072266, + "logps/rejected": -77.13992309570312, + "loss": 0.6532, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.18986184895038605, + "rewards/margins": 0.08909063041210175, + "rewards/rejected": -0.2789524495601654, "step": 4100 }, { - "epoch": 0.71, - "eval_logits/chosen": -1.472186803817749, - "eval_logits/rejected": -1.4476103782653809, - "eval_logps/chosen": -186.34982299804688, - "eval_logps/rejected": -222.20494079589844, - "eval_loss": 0.6203304529190063, - "eval_rewards/accuracies": 0.6554368138313293, - "eval_rewards/chosen": -1.276459813117981, - "eval_rewards/margins": 0.31401583552360535, - "eval_rewards/rejected": -1.5904756784439087, - "eval_runtime": 356.6055, - "eval_samples_per_second": 12.069, - "eval_steps_per_second": 1.509, + "epoch": 0.7064093728463129, + "eval_logits/chosen": -2.906397819519043, + "eval_logits/rejected": -2.900517702102661, + "eval_logps/chosen": -74.83631134033203, + "eval_logps/rejected": -82.92964172363281, + "eval_loss": 0.677558422088623, + "eval_rewards/accuracies": 0.6124535202980042, + "eval_rewards/chosen": -0.16124409437179565, + "eval_rewards/margins": 0.03625102713704109, + "eval_rewards/rejected": -0.19749511778354645, + "eval_runtime": 359.7757, + "eval_samples_per_second": 11.963, + "eval_steps_per_second": 1.495, "step": 4100 }, { - "epoch": 0.71, - "grad_norm": 17.294444254537726, - "learning_rate": 4.079709428812842e-07, - "logits/chosen": -1.3422235250473022, - "logits/rejected": -1.3077037334442139, - "logps/chosen": -201.4602813720703, - "logps/rejected": -255.19070434570312, - "loss": 0.5744, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.4541057348251343, - "rewards/margins": 0.5830581188201904, - "rewards/rejected": -2.0371639728546143, + "epoch": 0.708132322536182, + "grad_norm": 4.226618766784668, + "learning_rate": 8.159418857625685e-08, + "logits/chosen": -2.787879705429077, + "logits/rejected": -2.776662826538086, + "logps/chosen": -79.5539779663086, + "logps/rejected": -79.31879425048828, + "loss": 0.6755, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.23491552472114563, + "rewards/margins": 0.04324030503630638, + "rewards/rejected": -0.2781558632850647, "step": 4110 }, { - "epoch": 0.71, - "grad_norm": 19.72533891211532, - "learning_rate": 4.073875424187739e-07, - "logits/chosen": -1.3486844301223755, - "logits/rejected": -1.3319542407989502, - "logps/chosen": -187.834228515625, - "logps/rejected": -242.30770874023438, - "loss": 0.583, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3786789178848267, - "rewards/margins": 0.49663081765174866, - "rewards/rejected": -1.875309944152832, + "epoch": 0.709855272226051, + "grad_norm": 4.805150032043457, + "learning_rate": 8.147750848375478e-08, + "logits/chosen": -2.7447409629821777, + "logits/rejected": -2.7471377849578857, + "logps/chosen": -73.45333862304688, + "logps/rejected": -84.03813171386719, + "loss": 0.6688, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.23488454520702362, + "rewards/margins": 0.05762225389480591, + "rewards/rejected": -0.29250678420066833, "step": 4120 }, { - "epoch": 0.71, - "grad_norm": 17.2804912644492, - "learning_rate": 4.0680271868859906e-07, - "logits/chosen": -1.4753568172454834, - "logits/rejected": -1.4285588264465332, - "logps/chosen": -163.55975341796875, - "logps/rejected": -233.02902221679688, - "loss": 0.4851, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.0960824489593506, - "rewards/margins": 0.7092905640602112, - "rewards/rejected": -1.805373191833496, + "epoch": 0.71157822191592, + "grad_norm": 4.33480978012085, + "learning_rate": 8.13605437377198e-08, + "logits/chosen": -2.8447792530059814, + "logits/rejected": -2.822270393371582, + "logps/chosen": -73.93051147460938, + "logps/rejected": -81.64012145996094, + "loss": 0.6513, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.19962568581104279, + "rewards/margins": 0.09144510328769684, + "rewards/rejected": -0.2910707890987396, "step": 4130 }, { - "epoch": 0.71, - "grad_norm": 21.81704208943039, - "learning_rate": 4.0621647697936556e-07, - "logits/chosen": -1.4139468669891357, - "logits/rejected": -1.3735511302947998, - "logps/chosen": -193.77230834960938, - "logps/rejected": -239.8427734375, - "loss": 0.5885, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.3823630809783936, - "rewards/margins": 0.5091473460197449, - "rewards/rejected": -1.8915106058120728, + "epoch": 0.7133011716057891, + "grad_norm": 4.122574806213379, + "learning_rate": 8.124329539587311e-08, + "logits/chosen": -2.8442020416259766, + "logits/rejected": -2.8172695636749268, + "logps/chosen": -78.91192626953125, + "logps/rejected": -77.90412902832031, + "loss": 0.678, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.23386287689208984, + "rewards/margins": 0.03814762085676193, + "rewards/rejected": -0.2720105051994324, "step": 4140 }, { - "epoch": 0.72, - "grad_norm": 18.457503619554558, - "learning_rate": 4.0562882259250233e-07, - "logits/chosen": -1.4741637706756592, - "logits/rejected": -1.4252352714538574, - "logps/chosen": -182.50404357910156, - "logps/rejected": -247.427734375, - "loss": 0.5226, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.2463964223861694, - "rewards/margins": 0.709705114364624, - "rewards/rejected": -1.956101417541504, + "epoch": 0.7150241212956582, + "grad_norm": 4.575338840484619, + "learning_rate": 8.112576451850046e-08, + "logits/chosen": -2.865945816040039, + "logits/rejected": -2.84014630317688, + "logps/chosen": -79.77760314941406, + "logps/rejected": -81.44580841064453, + "loss": 0.6583, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.21935932338237762, + "rewards/margins": 0.07686015963554382, + "rewards/rejected": -0.29621952772140503, "step": 4150 }, { - "epoch": 0.72, - "grad_norm": 19.03179504667782, - "learning_rate": 4.0503976084221323e-07, - "logits/chosen": -1.3726146221160889, - "logits/rejected": -1.3159904479980469, - "logps/chosen": -179.83924865722656, - "logps/rejected": -258.4535217285156, - "loss": 0.4878, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.2819244861602783, - "rewards/margins": 0.8116732835769653, - "rewards/rejected": -2.093597888946533, + "epoch": 0.7167470709855273, + "grad_norm": 4.371513366699219, + "learning_rate": 8.100795216844264e-08, + "logits/chosen": -2.760072708129883, + "logits/rejected": -2.7301230430603027, + "logps/chosen": -74.31935119628906, + "logps/rejected": -79.0527114868164, + "loss": 0.6607, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.2264113873243332, + "rewards/margins": 0.07299733906984329, + "rewards/rejected": -0.2994087338447571, "step": 4160 }, { - "epoch": 0.72, - "grad_norm": 23.70588198146383, - "learning_rate": 4.044492970554292e-07, - "logits/chosen": -1.374589443206787, - "logits/rejected": -1.3363924026489258, - "logps/chosen": -193.84341430664062, - "logps/rejected": -267.19268798828125, - "loss": 0.5472, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.4245638847351074, - "rewards/margins": 0.711793065071106, - "rewards/rejected": -2.136356830596924, + "epoch": 0.7184700206753962, + "grad_norm": 4.953804016113281, + "learning_rate": 8.088985941108584e-08, + "logits/chosen": -2.812103509902954, + "logits/rejected": -2.804077625274658, + "logps/chosen": -73.244140625, + "logps/rejected": -82.40103912353516, + "loss": 0.6621, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.21848125755786896, + "rewards/margins": 0.06967557966709137, + "rewards/rejected": -0.28815680742263794, "step": 4170 }, { - "epoch": 0.72, - "grad_norm": 20.355963174810125, - "learning_rate": 4.038574365717594e-07, - "logits/chosen": -1.3285168409347534, - "logits/rejected": -1.2805362939834595, - "logps/chosen": -200.12326049804688, - "logps/rejected": -274.0704650878906, - "loss": 0.5344, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.447908878326416, - "rewards/margins": 0.7298901677131653, - "rewards/rejected": -2.1777987480163574, + "epoch": 0.7201929703652653, + "grad_norm": 4.888758182525635, + "learning_rate": 8.077148731435188e-08, + "logits/chosen": -2.8084912300109863, + "logits/rejected": -2.7935614585876465, + "logps/chosen": -76.97647094726562, + "logps/rejected": -85.00114440917969, + "loss": 0.6621, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2163088619709015, + "rewards/margins": 0.0708131492137909, + "rewards/rejected": -0.2871219515800476, "step": 4180 }, { - "epoch": 0.72, - "grad_norm": 30.67163449647573, - "learning_rate": 4.0326418474344416e-07, - "logits/chosen": -1.3149698972702026, - "logits/rejected": -1.2749181985855103, - "logps/chosen": -206.71963500976562, - "logps/rejected": -285.58819580078125, - "loss": 0.5258, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.5566787719726562, - "rewards/margins": 0.7740581631660461, - "rewards/rejected": -2.3307368755340576, + "epoch": 0.7219159200551344, + "grad_norm": 5.011810302734375, + "learning_rate": 8.065283694868883e-08, + "logits/chosen": -2.7904458045959473, + "logits/rejected": -2.7769782543182373, + "logps/chosen": -74.02743530273438, + "logps/rejected": -83.00337219238281, + "loss": 0.6595, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.22959303855895996, + "rewards/margins": 0.07523669302463531, + "rewards/rejected": -0.3048296868801117, "step": 4190 }, { - "epoch": 0.72, - "grad_norm": 23.189476713803757, - "learning_rate": 4.0266954693530515e-07, - "logits/chosen": -1.3780596256256104, - "logits/rejected": -1.3424698114395142, - "logps/chosen": -209.67904663085938, - "logps/rejected": -262.9185791015625, - "loss": 0.5803, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.5398088693618774, - "rewards/margins": 0.5638743042945862, - "rewards/rejected": -2.1036829948425293, + "epoch": 0.7236388697450035, + "grad_norm": 4.535158157348633, + "learning_rate": 8.053390938706102e-08, + "logits/chosen": -2.8413708209991455, + "logits/rejected": -2.8256707191467285, + "logps/chosen": -77.91133117675781, + "logps/rejected": -79.3990478515625, + "loss": 0.6733, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.22204849123954773, + "rewards/margins": 0.046053197234869, + "rewards/rejected": -0.26810169219970703, "step": 4200 }, { - "epoch": 0.72, - "eval_logits/chosen": -1.4580851793289185, - "eval_logits/rejected": -1.4321988821029663, - "eval_logps/chosen": -193.99774169921875, - "eval_logps/rejected": -231.7759552001953, - "eval_loss": 0.6207540035247803, - "eval_rewards/accuracies": 0.6624070405960083, - "eval_rewards/chosen": -1.3529391288757324, - "eval_rewards/margins": 0.33324676752090454, - "eval_rewards/rejected": -1.6861858367919922, - "eval_runtime": 356.9885, - "eval_samples_per_second": 12.056, - "eval_steps_per_second": 1.507, + "epoch": 0.7236388697450035, + "eval_logits/chosen": -2.8949339389801025, + "eval_logits/rejected": -2.889023542404175, + "eval_logps/chosen": -75.9118881225586, + "eval_logps/rejected": -84.16388702392578, + "eval_loss": 0.6769329309463501, + "eval_rewards/accuracies": 0.6087360382080078, + "eval_rewards/chosen": -0.17199990153312683, + "eval_rewards/margins": 0.03783779591321945, + "eval_rewards/rejected": -0.20983768999576569, + "eval_runtime": 359.9545, + "eval_samples_per_second": 11.957, + "eval_steps_per_second": 1.495, "step": 4200 }, { - "epoch": 0.73, - "grad_norm": 32.38460221933683, - "learning_rate": 4.020735285246979e-07, - "logits/chosen": -1.3975965976715088, - "logits/rejected": -1.355668306350708, - "logps/chosen": -212.5668182373047, - "logps/rejected": -264.63543701171875, - "loss": 0.6133, + "epoch": 0.7253618194348725, + "grad_norm": 4.928792953491211, + "learning_rate": 8.041470570493958e-08, + "logits/chosen": -2.823927640914917, + "logits/rejected": -2.802546977996826, + "logps/chosen": -79.63623046875, + "logps/rejected": -82.85804748535156, + "loss": 0.6692, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.5729354619979858, - "rewards/margins": 0.5456880331039429, - "rewards/rejected": -2.1186232566833496, + "rewards/chosen": -0.24339476227760315, + "rewards/margins": 0.0574357807636261, + "rewards/rejected": -0.30083051323890686, "step": 4210 }, { - "epoch": 0.73, - "grad_norm": 16.885675503765714, - "learning_rate": 4.014761349014629e-07, - "logits/chosen": -1.3606762886047363, - "logits/rejected": -1.3178844451904297, - "logps/chosen": -178.82691955566406, - "logps/rejected": -241.67929077148438, - "loss": 0.5612, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.2662688493728638, - "rewards/margins": 0.6238974332809448, - "rewards/rejected": -1.8901660442352295, + "epoch": 0.7270847691247415, + "grad_norm": 4.286323547363281, + "learning_rate": 8.029522698029257e-08, + "logits/chosen": -2.681990623474121, + "logits/rejected": -2.663794994354248, + "logps/chosen": -74.3034439086914, + "logps/rejected": -81.97090148925781, + "loss": 0.6616, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.22084763646125793, + "rewards/margins": 0.07181050628423691, + "rewards/rejected": -0.29265812039375305, "step": 4220 }, { - "epoch": 0.73, - "grad_norm": 29.792336168407434, - "learning_rate": 4.0087737146787656e-07, - "logits/chosen": -1.587550401687622, - "logits/rejected": -1.5437839031219482, - "logps/chosen": -163.56658935546875, - "logps/rejected": -229.27224731445312, - "loss": 0.5343, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.082904577255249, - "rewards/margins": 0.6688292622566223, - "rewards/rejected": -1.7517340183258057, + "epoch": 0.7288077188146106, + "grad_norm": 4.317086219787598, + "learning_rate": 8.017547429357531e-08, + "logits/chosen": -2.8607168197631836, + "logits/rejected": -2.8414225578308105, + "logps/chosen": -78.4534683227539, + "logps/rejected": -85.2705078125, + "loss": 0.658, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.23147408664226532, + "rewards/margins": 0.08023453503847122, + "rewards/rejected": -0.31170862913131714, "step": 4230 }, { - "epoch": 0.73, - "grad_norm": 20.765107873436467, - "learning_rate": 4.002772436386027e-07, - "logits/chosen": -1.5118169784545898, - "logits/rejected": -1.4638663530349731, - "logps/chosen": -155.6704559326172, - "logps/rejected": -229.45620727539062, - "loss": 0.518, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.009352207183838, - "rewards/margins": 0.7259235382080078, - "rewards/rejected": -1.7352758646011353, + "epoch": 0.7305306685044797, + "grad_norm": 4.675597190856934, + "learning_rate": 8.005544872772054e-08, + "logits/chosen": -2.7655177116394043, + "logits/rejected": -2.7412803173065186, + "logps/chosen": -77.64450073242188, + "logps/rejected": -87.1747055053711, + "loss": 0.6567, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.2290038764476776, + "rewards/margins": 0.08320339769124985, + "rewards/rejected": -0.31220728158950806, "step": 4240 }, { - "epoch": 0.73, - "grad_norm": 23.96242082772077, - "learning_rate": 3.9967575684064367e-07, - "logits/chosen": -1.4785500764846802, - "logits/rejected": -1.4373469352722168, - "logps/chosen": -159.1673126220703, - "logps/rejected": -217.58468627929688, - "loss": 0.5303, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.0413461923599243, - "rewards/margins": 0.6081751585006714, - "rewards/rejected": -1.6495214700698853, + "epoch": 0.7322536181943488, + "grad_norm": 4.733819484710693, + "learning_rate": 7.993515136812874e-08, + "logits/chosen": -2.779085636138916, + "logits/rejected": -2.7630178928375244, + "logps/chosen": -78.02629089355469, + "logps/rejected": -81.59638977050781, + "loss": 0.668, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.2297748625278473, + "rewards/margins": 0.059504471719264984, + "rewards/rejected": -0.28927934169769287, "step": 4250 }, { - "epoch": 0.73, - "grad_norm": 24.144969976617194, - "learning_rate": 3.990729165132907e-07, - "logits/chosen": -1.4406192302703857, - "logits/rejected": -1.4052913188934326, - "logps/chosen": -160.1890106201172, - "logps/rejected": -228.3748779296875, - "loss": 0.544, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.0565812587738037, - "rewards/margins": 0.679029643535614, - "rewards/rejected": -1.7356109619140625, + "epoch": 0.7339765678842178, + "grad_norm": 4.640880107879639, + "learning_rate": 7.981458330265815e-08, + "logits/chosen": -2.7458930015563965, + "logits/rejected": -2.731210231781006, + "logps/chosen": -76.95973205566406, + "logps/rejected": -84.32952880859375, + "loss": 0.6629, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.22431564331054688, + "rewards/margins": 0.07044248282909393, + "rewards/rejected": -0.294758141040802, "step": 4260 }, { - "epoch": 0.74, - "grad_norm": 24.358604007282633, - "learning_rate": 3.984687281080754e-07, - "logits/chosen": -1.3951603174209595, - "logits/rejected": -1.3441218137741089, - "logps/chosen": -164.6576690673828, - "logps/rejected": -230.2593536376953, - "loss": 0.5264, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.1306527853012085, - "rewards/margins": 0.6664345860481262, - "rewards/rejected": -1.79708731174469, + "epoch": 0.7356995175740868, + "grad_norm": 4.803411960601807, + "learning_rate": 7.969374562161509e-08, + "logits/chosen": -2.7621819972991943, + "logits/rejected": -2.737330675125122, + "logps/chosen": -71.40404510498047, + "logps/rejected": -78.76124572753906, + "loss": 0.6554, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.19795607030391693, + "rewards/margins": 0.08418522775173187, + "rewards/rejected": -0.2821413278579712, "step": 4270 }, { - "epoch": 0.74, - "grad_norm": 24.378376134460726, - "learning_rate": 3.978631970887201e-07, - "logits/chosen": -1.4013197422027588, - "logits/rejected": -1.3541449308395386, - "logps/chosen": -162.7399139404297, - "logps/rejected": -237.9474639892578, - "loss": 0.5066, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.1103148460388184, - "rewards/margins": 0.7485235333442688, - "rewards/rejected": -1.858838677406311, + "epoch": 0.7374224672639559, + "grad_norm": 4.555239200592041, + "learning_rate": 7.957263941774402e-08, + "logits/chosen": -2.7401576042175293, + "logits/rejected": -2.714449882507324, + "logps/chosen": -74.34765625, + "logps/rejected": -82.82798767089844, + "loss": 0.6584, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.22630128264427185, + "rewards/margins": 0.08142432570457458, + "rewards/rejected": -0.30772560834884644, "step": 4280 }, { - "epoch": 0.74, - "grad_norm": 26.74832111588032, - "learning_rate": 3.972563289310882e-07, - "logits/chosen": -1.3995485305786133, - "logits/rejected": -1.348487138748169, - "logps/chosen": -180.16856384277344, - "logps/rejected": -255.4312286376953, - "loss": 0.5467, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2493109703063965, - "rewards/margins": 0.7890298962593079, - "rewards/rejected": -2.0383410453796387, + "epoch": 0.739145416953825, + "grad_norm": 4.382991313934326, + "learning_rate": 7.945126578621763e-08, + "logits/chosen": -2.778887987136841, + "logits/rejected": -2.755769729614258, + "logps/chosen": -79.17945861816406, + "logps/rejected": -82.90766906738281, + "loss": 0.6621, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2393958568572998, + "rewards/margins": 0.07353595644235611, + "rewards/rejected": -0.3129318356513977, "step": 4290 }, { - "epoch": 0.74, - "grad_norm": 15.729787495443901, - "learning_rate": 3.9664812912313533e-07, - "logits/chosen": -1.4865190982818604, - "logits/rejected": -1.4452247619628906, - "logps/chosen": -155.62908935546875, - "logps/rejected": -227.11245727539062, - "loss": 0.507, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.0309059619903564, - "rewards/margins": 0.7097223997116089, - "rewards/rejected": -1.7406282424926758, + "epoch": 0.740868366643694, + "grad_norm": 4.42786979675293, + "learning_rate": 7.932962582462707e-08, + "logits/chosen": -2.796304225921631, + "logits/rejected": -2.7852556705474854, + "logps/chosen": -76.05867767333984, + "logps/rejected": -83.70006561279297, + "loss": 0.6618, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.2350235879421234, + "rewards/margins": 0.07132681459188461, + "rewards/rejected": -0.3063504099845886, "step": 4300 }, { - "epoch": 0.74, - "eval_logits/chosen": -1.5954647064208984, - "eval_logits/rejected": -1.5738048553466797, - "eval_logps/chosen": -152.3179931640625, - "eval_logps/rejected": -181.7826385498047, - "eval_loss": 0.6264519095420837, - "eval_rewards/accuracies": 0.6624070405960083, - "eval_rewards/chosen": -0.9361413717269897, - "eval_rewards/margins": 0.25011131167411804, - "eval_rewards/rejected": -1.1862527132034302, - "eval_runtime": 356.8364, - "eval_samples_per_second": 12.062, - "eval_steps_per_second": 1.508, + "epoch": 0.740868366643694, + "eval_logits/chosen": -2.885286569595337, + "eval_logits/rejected": -2.8793962001800537, + "eval_logps/chosen": -76.69395446777344, + "eval_logps/rejected": -85.07231140136719, + "eval_loss": 0.6764479875564575, + "eval_rewards/accuracies": 0.6057156324386597, + "eval_rewards/chosen": -0.17982058227062225, + "eval_rewards/margins": 0.039101339876651764, + "eval_rewards/rejected": -0.21892189979553223, + "eval_runtime": 359.6614, + "eval_samples_per_second": 11.967, + "eval_steps_per_second": 1.496, "step": 4300 }, { - "epoch": 0.74, - "grad_norm": 13.768891932897395, - "learning_rate": 3.9603860316485925e-07, - "logits/chosen": -1.418806791305542, - "logits/rejected": -1.3766006231307983, - "logps/chosen": -162.57728576660156, - "logps/rejected": -218.0735321044922, - "loss": 0.5448, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.0851390361785889, - "rewards/margins": 0.5797218084335327, - "rewards/rejected": -1.664860725402832, + "epoch": 0.742591316333563, + "grad_norm": 4.963508605957031, + "learning_rate": 7.920772063297185e-08, + "logits/chosen": -2.7422854900360107, + "logits/rejected": -2.7238478660583496, + "logps/chosen": -77.0846176147461, + "logps/rejected": -82.02983856201172, + "loss": 0.6602, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.2303268015384674, + "rewards/margins": 0.07409859448671341, + "rewards/rejected": -0.30442532896995544, "step": 4310 }, { - "epoch": 0.74, - "grad_norm": 14.715075236823548, - "learning_rate": 3.9542775656825e-07, - "logits/chosen": -1.4987797737121582, - "logits/rejected": -1.4415086507797241, - "logps/chosen": -172.37828063964844, - "logps/rejected": -245.30239868164062, - "loss": 0.4709, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.1376218795776367, - "rewards/margins": 0.7687402963638306, - "rewards/rejected": -1.9063619375228882, + "epoch": 0.7443142660234321, + "grad_norm": 4.418318271636963, + "learning_rate": 7.908555131365e-08, + "logits/chosen": -2.8225841522216797, + "logits/rejected": -2.7945821285247803, + "logps/chosen": -82.75288391113281, + "logps/rejected": -89.62847137451172, + "loss": 0.6444, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.24110881984233856, + "rewards/margins": 0.10840406268835068, + "rewards/rejected": -0.34951287508010864, "step": 4320 }, { - "epoch": 0.75, - "grad_norm": 28.706374504499472, - "learning_rate": 3.948155948572405e-07, - "logits/chosen": -1.3579802513122559, - "logits/rejected": -1.3000389337539673, - "logps/chosen": -183.6754150390625, - "logps/rejected": -248.2251739501953, - "loss": 0.5221, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.2819044589996338, - "rewards/margins": 0.702656626701355, - "rewards/rejected": -1.9845609664916992, + "epoch": 0.7460372157133012, + "grad_norm": 4.678439140319824, + "learning_rate": 7.896311897144809e-08, + "logits/chosen": -2.729358434677124, + "logits/rejected": -2.6988437175750732, + "logps/chosen": -81.18726348876953, + "logps/rejected": -80.73506164550781, + "loss": 0.6713, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.25696468353271484, + "rewards/margins": 0.05245956778526306, + "rewards/rejected": -0.3094242513179779, "step": 4330 }, { - "epoch": 0.75, - "grad_norm": 21.368421423428487, - "learning_rate": 3.9420212356765606e-07, - "logits/chosen": -1.3122832775115967, - "logits/rejected": -1.2653281688690186, - "logps/chosen": -180.68899536132812, - "logps/rejected": -259.26025390625, - "loss": 0.5476, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.3169108629226685, - "rewards/margins": 0.7710382342338562, - "rewards/rejected": -2.08794903755188, + "epoch": 0.7477601654031703, + "grad_norm": 4.287242412567139, + "learning_rate": 7.884042471353122e-08, + "logits/chosen": -2.7333014011383057, + "logits/rejected": -2.715153694152832, + "logps/chosen": -73.27593994140625, + "logps/rejected": -83.72303771972656, + "loss": 0.6543, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.24279603362083435, + "rewards/margins": 0.08945179730653763, + "rewards/rejected": -0.3322478234767914, "step": 4340 }, { - "epoch": 0.75, - "grad_norm": 21.538559584335715, - "learning_rate": 3.93587348247165e-07, - "logits/chosen": -1.3758046627044678, - "logits/rejected": -1.337914228439331, - "logps/chosen": -167.84469604492188, - "logps/rejected": -237.70242309570312, - "loss": 0.5158, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.1684411764144897, - "rewards/margins": 0.6767427921295166, - "rewards/rejected": -1.8451837301254272, + "epoch": 0.7494831150930393, + "grad_norm": 5.045931816101074, + "learning_rate": 7.8717469649433e-08, + "logits/chosen": -2.7423102855682373, + "logits/rejected": -2.727919578552246, + "logps/chosen": -73.34207916259766, + "logps/rejected": -83.40766906738281, + "loss": 0.6583, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.22332239151000977, + "rewards/margins": 0.07879458367824554, + "rewards/rejected": -0.3021170198917389, "step": 4350 }, { - "epoch": 0.75, - "grad_norm": 16.11283805818009, - "learning_rate": 3.929712744552278e-07, - "logits/chosen": -1.412389874458313, - "logits/rejected": -1.356400728225708, - "logps/chosen": -176.62753295898438, - "logps/rejected": -245.2499542236328, - "loss": 0.5299, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2295823097229004, - "rewards/margins": 0.7066112160682678, - "rewards/rejected": -1.9361934661865234, + "epoch": 0.7512060647829083, + "grad_norm": 4.683145999908447, + "learning_rate": 7.859425489104556e-08, + "logits/chosen": -2.785099506378174, + "logits/rejected": -2.7638652324676514, + "logps/chosen": -80.40673065185547, + "logps/rejected": -84.97071838378906, + "loss": 0.6658, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.267350971698761, + "rewards/margins": 0.06585561484098434, + "rewards/rejected": -0.3332065939903259, "step": 4360 }, { - "epoch": 0.75, - "grad_norm": 30.246128714363632, - "learning_rate": 3.923539077630471e-07, - "logits/chosen": -1.3993642330169678, - "logits/rejected": -1.3563659191131592, - "logps/chosen": -184.28158569335938, - "logps/rejected": -247.9432373046875, - "loss": 0.5544, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.293311357498169, - "rewards/margins": 0.6382697820663452, - "rewards/rejected": -1.9315814971923828, + "epoch": 0.7529290144727774, + "grad_norm": 4.6525797843933105, + "learning_rate": 7.847078155260942e-08, + "logits/chosen": -2.8002583980560303, + "logits/rejected": -2.7860169410705566, + "logps/chosen": -79.53623962402344, + "logps/rejected": -86.96631622314453, + "loss": 0.6603, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.24577634036540985, + "rewards/margins": 0.07589911669492722, + "rewards/rejected": -0.32167547941207886, "step": 4370 }, { - "epoch": 0.75, - "grad_norm": 25.81059610250568, - "learning_rate": 3.917352537535176e-07, - "logits/chosen": -1.4071307182312012, - "logits/rejected": -1.356684923171997, - "logps/chosen": -182.4822998046875, - "logps/rejected": -258.318359375, - "loss": 0.5202, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.281593918800354, - "rewards/margins": 0.7798604369163513, - "rewards/rejected": -2.0614542961120605, + "epoch": 0.7546519641626465, + "grad_norm": 4.781350135803223, + "learning_rate": 7.834705075070352e-08, + "logits/chosen": -2.8228507041931152, + "logits/rejected": -2.8000426292419434, + "logps/chosen": -79.38736724853516, + "logps/rejected": -82.51011657714844, + "loss": 0.6719, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2508380115032196, + "rewards/margins": 0.0524689257144928, + "rewards/rejected": -0.30330690741539, "step": 4380 }, { - "epoch": 0.76, - "grad_norm": 21.811436240135972, - "learning_rate": 3.91115318021175e-07, - "logits/chosen": -1.336089849472046, - "logits/rejected": -1.291550874710083, - "logps/chosen": -188.70346069335938, - "logps/rejected": -266.1550598144531, - "loss": 0.5067, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3547266721725464, - "rewards/margins": 0.7720333337783813, - "rewards/rejected": -2.126760244369507, + "epoch": 0.7563749138525155, + "grad_norm": 4.321199893951416, + "learning_rate": 7.8223063604235e-08, + "logits/chosen": -2.75453782081604, + "logits/rejected": -2.7411556243896484, + "logps/chosen": -78.19532775878906, + "logps/rejected": -86.12273406982422, + "loss": 0.6592, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.24958685040473938, + "rewards/margins": 0.07666632533073425, + "rewards/rejected": -0.32625311613082886, "step": 4390 }, { - "epoch": 0.76, - "grad_norm": 25.759431483166495, - "learning_rate": 3.9049410617214607e-07, - "logits/chosen": -1.3443093299865723, - "logits/rejected": -1.2999963760375977, - "logps/chosen": -194.36892700195312, - "logps/rejected": -274.6505432128906, - "loss": 0.5273, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3747164011001587, - "rewards/margins": 0.8176537752151489, - "rewards/rejected": -2.1923701763153076, + "epoch": 0.7580978635423845, + "grad_norm": 4.8006591796875, + "learning_rate": 7.809882123442921e-08, + "logits/chosen": -2.7704663276672363, + "logits/rejected": -2.7536025047302246, + "logps/chosen": -81.3585433959961, + "logps/rejected": -86.84334564208984, + "loss": 0.6625, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.2445327341556549, + "rewards/margins": 0.06969234347343445, + "rewards/rejected": -0.3142250180244446, "step": 4400 }, { - "epoch": 0.76, - "eval_logits/chosen": -1.4307539463043213, - "eval_logits/rejected": -1.404834270477295, - "eval_logps/chosen": -185.88987731933594, - "eval_logps/rejected": -224.0266876220703, - "eval_loss": 0.6210964918136597, - "eval_rewards/accuracies": 0.6686803102493286, - "eval_rewards/chosen": -1.2718603610992432, - "eval_rewards/margins": 0.3368328809738159, - "eval_rewards/rejected": -1.608693242073059, - "eval_runtime": 356.7436, - "eval_samples_per_second": 12.065, - "eval_steps_per_second": 1.508, + "epoch": 0.7580978635423845, + "eval_logits/chosen": -2.8745179176330566, + "eval_logits/rejected": -2.8686115741729736, + "eval_logps/chosen": -78.0712890625, + "eval_logps/rejected": -86.64641571044922, + "eval_loss": 0.6756924986839294, + "eval_rewards/accuracies": 0.6052509546279907, + "eval_rewards/chosen": -0.19359391927719116, + "eval_rewards/margins": 0.04106910154223442, + "eval_rewards/rejected": -0.2346630096435547, + "eval_runtime": 359.8765, + "eval_samples_per_second": 11.96, + "eval_steps_per_second": 1.495, "step": 4400 }, { - "epoch": 0.76, - "grad_norm": 19.840813891153072, - "learning_rate": 3.898716238240971e-07, - "logits/chosen": -1.3299553394317627, - "logits/rejected": -1.289876103401184, - "logps/chosen": -192.49099731445312, - "logps/rejected": -250.6361541748047, - "loss": 0.5987, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.3760064840316772, - "rewards/margins": 0.5802738666534424, - "rewards/rejected": -1.9562803506851196, + "epoch": 0.7598208132322536, + "grad_norm": 4.7382097244262695, + "learning_rate": 7.797432476481942e-08, + "logits/chosen": -2.7340545654296875, + "logits/rejected": -2.7181458473205566, + "logps/chosen": -81.9361572265625, + "logps/rejected": -89.33692932128906, + "loss": 0.6622, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.2704450488090515, + "rewards/margins": 0.07261786609888077, + "rewards/rejected": -0.3430629074573517, "step": 4410 }, { - "epoch": 0.76, - "grad_norm": 26.035856678795522, - "learning_rate": 3.892478766061841e-07, - "logits/chosen": -1.489180564880371, - "logits/rejected": -1.4286963939666748, - "logps/chosen": -172.96762084960938, - "logps/rejected": -225.233154296875, - "loss": 0.5652, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.1818801164627075, - "rewards/margins": 0.5658958554267883, - "rewards/rejected": -1.7477757930755615, + "epoch": 0.7615437629221227, + "grad_norm": 5.843469142913818, + "learning_rate": 7.784957532123681e-08, + "logits/chosen": -2.8479888439178467, + "logits/rejected": -2.805508852005005, + "logps/chosen": -80.79518127441406, + "logps/rejected": -83.56358337402344, + "loss": 0.6634, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2597918212413788, + "rewards/margins": 0.07114500552415848, + "rewards/rejected": -0.33093681931495667, "step": 4420 }, { - "epoch": 0.76, - "grad_norm": 25.167741330404056, - "learning_rate": 3.886228701590011e-07, - "logits/chosen": -1.4246338605880737, - "logits/rejected": -1.3719749450683594, - "logps/chosen": -154.06051635742188, - "logps/rejected": -209.6873321533203, - "loss": 0.5631, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.0029816627502441, - "rewards/margins": 0.5808738470077515, - "rewards/rejected": -1.583855390548706, + "epoch": 0.7632667126119917, + "grad_norm": 3.8952701091766357, + "learning_rate": 7.772457403180022e-08, + "logits/chosen": -2.740231513977051, + "logits/rejected": -2.7093002796173096, + "logps/chosen": -79.71388244628906, + "logps/rejected": -83.66574096679688, + "loss": 0.6669, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.25931215286254883, + "rewards/margins": 0.06413620710372925, + "rewards/rejected": -0.3234483599662781, "step": 4430 }, { - "epoch": 0.76, - "grad_norm": 17.10714671684354, - "learning_rate": 3.8799661013452955e-07, - "logits/chosen": -1.485050916671753, - "logits/rejected": -1.4327274560928345, - "logps/chosen": -173.7892608642578, - "logps/rejected": -240.88101196289062, - "loss": 0.5115, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.1438651084899902, - "rewards/margins": 0.7148648500442505, - "rewards/rejected": -1.8587299585342407, + "epoch": 0.7649896623018608, + "grad_norm": 5.432117938995361, + "learning_rate": 7.759932202690592e-08, + "logits/chosen": -2.8032429218292236, + "logits/rejected": -2.773001194000244, + "logps/chosen": -86.66961669921875, + "logps/rejected": -91.77088928222656, + "loss": 0.6515, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2726757824420929, + "rewards/margins": 0.09480321407318115, + "rewards/rejected": -0.36747899651527405, "step": 4440 }, { - "epoch": 0.77, - "grad_norm": 19.220029089043173, - "learning_rate": 3.8736910219608705e-07, - "logits/chosen": -1.3361194133758545, - "logits/rejected": -1.2997193336486816, - "logps/chosen": -164.47987365722656, - "logps/rejected": -231.0243682861328, - "loss": 0.5257, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.0879555940628052, - "rewards/margins": 0.6804584264755249, - "rewards/rejected": -1.7684139013290405, + "epoch": 0.7667126119917298, + "grad_norm": 4.670837879180908, + "learning_rate": 7.747382043921741e-08, + "logits/chosen": -2.676553964614868, + "logits/rejected": -2.6650469303131104, + "logps/chosen": -81.74214172363281, + "logps/rejected": -87.35073852539062, + "loss": 0.663, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.2602686882019043, + "rewards/margins": 0.0713680237531662, + "rewards/rejected": -0.3316367268562317, "step": 4450 }, { - "epoch": 0.77, - "grad_norm": 21.93245110911694, - "learning_rate": 3.8674035201827626e-07, - "logits/chosen": -1.4222412109375, - "logits/rejected": -1.387459635734558, - "logps/chosen": -174.14901733398438, - "logps/rejected": -240.2296600341797, - "loss": 0.5475, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.2142760753631592, - "rewards/margins": 0.6618901491165161, - "rewards/rejected": -1.8761663436889648, + "epoch": 0.7684355616815989, + "grad_norm": 5.040444850921631, + "learning_rate": 7.734807040365525e-08, + "logits/chosen": -2.7904114723205566, + "logits/rejected": -2.783573627471924, + "logps/chosen": -78.90962219238281, + "logps/rejected": -85.83683776855469, + "loss": 0.6638, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2617763876914978, + "rewards/margins": 0.06999503076076508, + "rewards/rejected": -0.3317714333534241, "step": 4460 }, { - "epoch": 0.77, - "grad_norm": 26.334535864969112, - "learning_rate": 3.861103652869334e-07, - "logits/chosen": -1.4492603540420532, - "logits/rejected": -1.3905606269836426, - "logps/chosen": -184.28909301757812, - "logps/rejected": -254.66085815429688, - "loss": 0.5201, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.2838647365570068, - "rewards/margins": 0.7604522109031677, - "rewards/rejected": -2.0443172454833984, + "epoch": 0.770158511371468, + "grad_norm": 5.055511951446533, + "learning_rate": 7.722207305738668e-08, + "logits/chosen": -2.8303465843200684, + "logits/rejected": -2.8013291358947754, + "logps/chosen": -82.27478790283203, + "logps/rejected": -85.03372192382812, + "loss": 0.656, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.26352792978286743, + "rewards/margins": 0.08442261070013046, + "rewards/rejected": -0.3479505479335785, "step": 4470 }, { - "epoch": 0.77, - "grad_norm": 41.57134948033417, - "learning_rate": 3.8547914769907705e-07, - "logits/chosen": -1.4375700950622559, - "logits/rejected": -1.3990795612335205, - "logps/chosen": -193.02252197265625, - "logps/rejected": -266.1149597167969, - "loss": 0.5628, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.4204905033111572, - "rewards/margins": 0.7187623977661133, - "rewards/rejected": -2.1392529010772705, + "epoch": 0.771881461061337, + "grad_norm": 5.498683929443359, + "learning_rate": 7.709582953981541e-08, + "logits/chosen": -2.853652238845825, + "logits/rejected": -2.8468830585479736, + "logps/chosen": -78.6220703125, + "logps/rejected": -86.2085952758789, + "loss": 0.6668, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.276112824678421, + "rewards/margins": 0.06350714713335037, + "rewards/rejected": -0.3396199345588684, "step": 4480 }, { - "epoch": 0.77, - "grad_norm": 28.64256495751411, - "learning_rate": 3.848467049628564e-07, - "logits/chosen": -1.317628264427185, - "logits/rejected": -1.2681446075439453, - "logps/chosen": -187.17874145507812, - "logps/rejected": -251.03970336914062, - "loss": 0.531, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3606078624725342, - "rewards/margins": 0.6513209939002991, - "rewards/rejected": -2.0119290351867676, + "epoch": 0.7736044107512061, + "grad_norm": 5.022222518920898, + "learning_rate": 7.696934099257128e-08, + "logits/chosen": -2.744422197341919, + "logits/rejected": -2.718071937561035, + "logps/chosen": -79.3916015625, + "logps/rejected": -85.3674087524414, + "loss": 0.6663, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.2825077176094055, + "rewards/margins": 0.07279295474290848, + "rewards/rejected": -0.3553006649017334, "step": 4490 }, { - "epoch": 0.78, - "grad_norm": 12.948645388897265, - "learning_rate": 3.8421304279749983e-07, - "logits/chosen": -1.3421502113342285, - "logits/rejected": -1.2936899662017822, - "logps/chosen": -180.89065551757812, - "logps/rejected": -251.9108428955078, - "loss": 0.5574, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.2455495595932007, - "rewards/margins": 0.7440658211708069, - "rewards/rejected": -1.9896152019500732, + "epoch": 0.7753273604410751, + "grad_norm": 5.13820743560791, + "learning_rate": 7.684260855949997e-08, + "logits/chosen": -2.7579360008239746, + "logits/rejected": -2.7426867485046387, + "logps/chosen": -83.91657257080078, + "logps/rejected": -88.30168151855469, + "loss": 0.6605, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.275812566280365, + "rewards/margins": 0.07757066935300827, + "rewards/rejected": -0.35338321328163147, "step": 4500 }, { - "epoch": 0.78, - "eval_logits/chosen": -1.4964402914047241, - "eval_logits/rejected": -1.472887396812439, - "eval_logps/chosen": -169.3536376953125, - "eval_logps/rejected": -203.17874145507812, - "eval_loss": 0.6233484745025635, - "eval_rewards/accuracies": 0.6670538783073425, - "eval_rewards/chosen": -1.1064980030059814, - "eval_rewards/margins": 0.29371556639671326, - "eval_rewards/rejected": -1.400213599205017, - "eval_runtime": 356.7428, - "eval_samples_per_second": 12.065, - "eval_steps_per_second": 1.508, + "epoch": 0.7753273604410751, + "eval_logits/chosen": -2.864949941635132, + "eval_logits/rejected": -2.8590142726898193, + "eval_logps/chosen": -79.67756652832031, + "eval_logps/rejected": -88.53424072265625, + "eval_loss": 0.6745909452438354, + "eval_rewards/accuracies": 0.6066449880599976, + "eval_rewards/chosen": -0.20965667068958282, + "eval_rewards/margins": 0.043884556740522385, + "eval_rewards/rejected": -0.2535412013530731, + "eval_runtime": 359.4763, + "eval_samples_per_second": 11.973, + "eval_steps_per_second": 1.497, "step": 4500 }, { - "epoch": 0.78, - "grad_norm": 17.509248366329857, - "learning_rate": 3.8357816693326314e-07, - "logits/chosen": -1.487713098526001, - "logits/rejected": -1.4337613582611084, - "logps/chosen": -171.7799835205078, - "logps/rejected": -247.6623992919922, - "loss": 0.5203, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.1836849451065063, - "rewards/margins": 0.7367376089096069, - "rewards/rejected": -1.9204223155975342, + "epoch": 0.7770503101309442, + "grad_norm": 5.928679466247559, + "learning_rate": 7.671563338665262e-08, + "logits/chosen": -2.8647067546844482, + "logits/rejected": -2.8383216857910156, + "logps/chosen": -80.19285583496094, + "logps/rejected": -91.64847564697266, + "loss": 0.6525, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.26760441064834595, + "rewards/margins": 0.09270481765270233, + "rewards/rejected": -0.3603092133998871, "step": 4510 }, { - "epoch": 0.78, - "grad_norm": 21.475335389561995, - "learning_rate": 3.829420831113775e-07, - "logits/chosen": -1.4706519842147827, - "logits/rejected": -1.422728180885315, - "logps/chosen": -176.04486083984375, - "logps/rejected": -242.8605499267578, - "loss": 0.5317, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.234581708908081, - "rewards/margins": 0.6933325529098511, - "rewards/rejected": -1.9279142618179321, + "epoch": 0.7787732598208132, + "grad_norm": 5.4021406173706055, + "learning_rate": 7.65884166222755e-08, + "logits/chosen": -2.8022427558898926, + "logits/rejected": -2.781902313232422, + "logps/chosen": -81.70062255859375, + "logps/rejected": -84.92115783691406, + "loss": 0.6701, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.29102572798728943, + "rewards/margins": 0.05737787485122681, + "rewards/rejected": -0.34840360283851624, "step": 4520 }, { - "epoch": 0.78, - "grad_norm": 30.044462081498253, - "learning_rate": 3.823047970839981e-07, - "logits/chosen": -1.4337480068206787, - "logits/rejected": -1.4001357555389404, - "logps/chosen": -168.0045623779297, - "logps/rejected": -224.1116943359375, - "loss": 0.5591, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.1422946453094482, - "rewards/margins": 0.564059317111969, - "rewards/rejected": -1.7063539028167725, + "epoch": 0.7804962095106823, + "grad_norm": 5.309082508087158, + "learning_rate": 7.646095941679962e-08, + "logits/chosen": -2.7916605472564697, + "logits/rejected": -2.782623767852783, + "logps/chosen": -81.60151672363281, + "logps/rejected": -86.02545928955078, + "loss": 0.6752, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.2782711982727051, + "rewards/margins": 0.0471639521420002, + "rewards/rejected": -0.32543516159057617, "step": 4530 }, { - "epoch": 0.78, - "grad_norm": 32.08416286753465, - "learning_rate": 3.816663146141514e-07, - "logits/chosen": -1.321825623512268, - "logits/rejected": -1.2757227420806885, - "logps/chosen": -175.95849609375, - "logps/rejected": -248.03097534179688, - "loss": 0.5095, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.232386827468872, - "rewards/margins": 0.7383901476860046, - "rewards/rejected": -1.970776915550232, + "epoch": 0.7822191592005513, + "grad_norm": 6.1212687492370605, + "learning_rate": 7.633326292283028e-08, + "logits/chosen": -2.704936981201172, + "logits/rejected": -2.6931707859039307, + "logps/chosen": -79.34037780761719, + "logps/rejected": -85.31973266601562, + "loss": 0.6595, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.2660858631134033, + "rewards/margins": 0.0777621865272522, + "rewards/rejected": -0.3438480496406555, "step": 4540 }, { - "epoch": 0.78, - "grad_norm": 21.22323203264765, - "learning_rate": 3.810266414756836e-07, - "logits/chosen": -1.3958414793014526, - "logits/rejected": -1.344481110572815, - "logps/chosen": -177.6934051513672, - "logps/rejected": -246.7960205078125, - "loss": 0.5158, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.2130842208862305, - "rewards/margins": 0.7209616899490356, - "rewards/rejected": -1.9340457916259766, + "epoch": 0.7839421088904204, + "grad_norm": 5.877651691436768, + "learning_rate": 7.620532829513672e-08, + "logits/chosen": -2.802441120147705, + "logits/rejected": -2.7796969413757324, + "logps/chosen": -82.77192687988281, + "logps/rejected": -89.22770690917969, + "loss": 0.6521, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.26413458585739136, + "rewards/margins": 0.09422699362039566, + "rewards/rejected": -0.35836154222488403, "step": 4550 }, { - "epoch": 0.79, - "grad_norm": 21.245086858763795, - "learning_rate": 3.803857834532081e-07, - "logits/chosen": -1.2998394966125488, - "logits/rejected": -1.2284823656082153, - "logps/chosen": -185.28271484375, - "logps/rejected": -263.3417663574219, - "loss": 0.5029, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.2907075881958008, - "rewards/margins": 0.8120514154434204, - "rewards/rejected": -2.1027588844299316, + "epoch": 0.7856650585802895, + "grad_norm": 4.796073913574219, + "learning_rate": 7.607715669064162e-08, + "logits/chosen": -2.711344003677368, + "logits/rejected": -2.674924612045288, + "logps/chosen": -81.03084564208984, + "logps/rejected": -88.92893981933594, + "loss": 0.6442, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.2481379508972168, + "rewards/margins": 0.11026115715503693, + "rewards/rejected": -0.35839909315109253, "step": 4560 }, { - "epoch": 0.79, - "grad_norm": 30.48524469504423, - "learning_rate": 3.797437463420534e-07, - "logits/chosen": -1.3093476295471191, - "logits/rejected": -1.259817361831665, - "logps/chosen": -194.9139862060547, - "logps/rejected": -268.6426696777344, - "loss": 0.5509, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.4381399154663086, - "rewards/margins": 0.7308866381645203, - "rewards/rejected": -2.1690266132354736, + "epoch": 0.7873880082701585, + "grad_norm": 5.433099746704102, + "learning_rate": 7.594874926841069e-08, + "logits/chosen": -2.7318739891052246, + "logits/rejected": -2.707764148712158, + "logps/chosen": -79.96495056152344, + "logps/rejected": -89.24745178222656, + "loss": 0.6575, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.2887458801269531, + "rewards/margins": 0.08619584143161774, + "rewards/rejected": -0.37494176626205444, "step": 4570 }, { - "epoch": 0.79, - "grad_norm": 20.59369730212467, - "learning_rate": 3.791005359482106e-07, - "logits/chosen": -1.3296152353286743, - "logits/rejected": -1.28184175491333, - "logps/chosen": -157.950439453125, - "logps/rejected": -213.4839630126953, - "loss": 0.5544, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.037023901939392, - "rewards/margins": 0.5802245736122131, - "rewards/rejected": -1.617248296737671, + "epoch": 0.7891109579600276, + "grad_norm": 5.255424976348877, + "learning_rate": 7.582010718964212e-08, + "logits/chosen": -2.701725721359253, + "logits/rejected": -2.675565242767334, + "logps/chosen": -78.10517883300781, + "logps/rejected": -84.2778549194336, + "loss": 0.655, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2384556531906128, + "rewards/margins": 0.08670667558908463, + "rewards/rejected": -0.3251623511314392, "step": 4580 }, { - "epoch": 0.79, - "grad_norm": 21.05628740715068, - "learning_rate": 3.784561580882806e-07, - "logits/chosen": -1.4657633304595947, - "logits/rejected": -1.4152452945709229, - "logps/chosen": -169.1389617919922, - "logps/rejected": -219.05465698242188, - "loss": 0.6103, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.1427037715911865, - "rewards/margins": 0.5260920524597168, - "rewards/rejected": -1.6687958240509033, + "epoch": 0.7908339076498966, + "grad_norm": 5.248478412628174, + "learning_rate": 7.569123161765611e-08, + "logits/chosen": -2.7991232872009277, + "logits/rejected": -2.7724077701568604, + "logps/chosen": -83.50713348388672, + "logps/rejected": -86.54850006103516, + "loss": 0.6708, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.2862285375595093, + "rewards/margins": 0.05719224363565445, + "rewards/rejected": -0.3434208035469055, "step": 4590 }, { - "epoch": 0.79, - "grad_norm": 22.60134182003666, - "learning_rate": 3.778106185894221e-07, - "logits/chosen": -1.3957931995391846, - "logits/rejected": -1.3415305614471436, - "logps/chosen": -159.36795043945312, - "logps/rejected": -236.89773559570312, - "loss": 0.4819, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.0599912405014038, - "rewards/margins": 0.7656328082084656, - "rewards/rejected": -1.8256241083145142, + "epoch": 0.7925568573397657, + "grad_norm": 5.3829522132873535, + "learning_rate": 7.556212371788441e-08, + "logits/chosen": -2.682486057281494, + "logits/rejected": -2.6631035804748535, + "logps/chosen": -82.19569396972656, + "logps/rejected": -94.45573425292969, + "loss": 0.6437, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2883058190345764, + "rewards/margins": 0.11274129152297974, + "rewards/rejected": -0.40104714035987854, "step": 4600 }, { - "epoch": 0.79, - "eval_logits/chosen": -1.525081992149353, - "eval_logits/rejected": -1.501688003540039, - "eval_logps/chosen": -169.0588836669922, - "eval_logps/rejected": -203.32528686523438, - "eval_loss": 0.6219184994697571, - "eval_rewards/accuracies": 0.6642658114433289, - "eval_rewards/chosen": -1.1035504341125488, - "eval_rewards/margins": 0.298128604888916, - "eval_rewards/rejected": -1.4016790390014648, - "eval_runtime": 356.8595, - "eval_samples_per_second": 12.061, - "eval_steps_per_second": 1.508, + "epoch": 0.7925568573397657, + "eval_logits/chosen": -2.8572607040405273, + "eval_logits/rejected": -2.851304531097412, + "eval_logps/chosen": -81.13443756103516, + "eval_logps/rejected": -90.21497344970703, + "eval_loss": 0.6737370491027832, + "eval_rewards/accuracies": 0.6071096658706665, + "eval_rewards/chosen": -0.22422541677951813, + "eval_rewards/margins": 0.04612297564744949, + "eval_rewards/rejected": -0.2703484296798706, + "eval_runtime": 360.2206, + "eval_samples_per_second": 11.948, + "eval_steps_per_second": 1.494, "step": 4600 }, { - "epoch": 0.79, - "grad_norm": 25.140522968467, - "learning_rate": 3.771639232892986e-07, - "logits/chosen": -1.3437252044677734, - "logits/rejected": -1.3191344738006592, - "logps/chosen": -189.2852325439453, - "logps/rejected": -238.8382110595703, - "loss": 0.6115, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.3813140392303467, - "rewards/margins": 0.4978283941745758, - "rewards/rejected": -1.8791425228118896, + "epoch": 0.7942798070296347, + "grad_norm": 4.989021301269531, + "learning_rate": 7.543278465785973e-08, + "logits/chosen": -2.7107458114624023, + "logits/rejected": -2.700726270675659, + "logps/chosen": -82.72367858886719, + "logps/rejected": -87.36424255371094, + "loss": 0.6745, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.31585732102394104, + "rewards/margins": 0.04831356555223465, + "rewards/rejected": -0.3641708493232727, "step": 4610 }, { - "epoch": 0.8, - "grad_norm": 21.229639253980007, - "learning_rate": 3.765160780360254e-07, - "logits/chosen": -1.3881046772003174, - "logits/rejected": -1.3339178562164307, - "logps/chosen": -180.4025421142578, - "logps/rejected": -259.890380859375, - "loss": 0.5085, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.2468607425689697, - "rewards/margins": 0.7971670627593994, - "rewards/rejected": -2.0440280437469482, + "epoch": 0.7960027567195038, + "grad_norm": 6.328110694885254, + "learning_rate": 7.530321560720508e-08, + "logits/chosen": -2.74727201461792, + "logits/rejected": -2.7238094806671143, + "logps/chosen": -83.11624908447266, + "logps/rejected": -92.23828887939453, + "loss": 0.6525, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.273975133895874, + "rewards/margins": 0.09328394383192062, + "rewards/rejected": -0.36725908517837524, "step": 4620 }, { - "epoch": 0.8, - "grad_norm": 30.53629505024118, - "learning_rate": 3.75867088688117e-07, - "logits/chosen": -1.3791451454162598, - "logits/rejected": -1.3137580156326294, - "logps/chosen": -209.33139038085938, - "logps/rejected": -286.5825500488281, - "loss": 0.5159, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.5205762386322021, - "rewards/margins": 0.7794795036315918, - "rewards/rejected": -2.300055742263794, + "epoch": 0.7977257064093728, + "grad_norm": 5.385335922241211, + "learning_rate": 7.517341773762341e-08, + "logits/chosen": -2.780961513519287, + "logits/rejected": -2.747697114944458, + "logps/chosen": -86.0129165649414, + "logps/rejected": -93.71475219726562, + "loss": 0.6565, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2872922420501709, + "rewards/margins": 0.08390410244464874, + "rewards/rejected": -0.37119635939598083, "step": 4630 }, { - "epoch": 0.8, - "grad_norm": 26.67530435659893, - "learning_rate": 3.7521696111443413e-07, - "logits/chosen": -1.3778386116027832, - "logits/rejected": -1.341675043106079, - "logps/chosen": -217.1299285888672, - "logps/rejected": -284.71978759765625, - "loss": 0.5664, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.6237319707870483, - "rewards/margins": 0.6682808995246887, - "rewards/rejected": -2.2920126914978027, + "epoch": 0.7994486560992419, + "grad_norm": 5.627150535583496, + "learning_rate": 7.504339222288683e-08, + "logits/chosen": -2.7807700634002686, + "logits/rejected": -2.7660746574401855, + "logps/chosen": -87.76912689208984, + "logps/rejected": -95.35968017578125, + "loss": 0.6655, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.3301452398300171, + "rewards/margins": 0.06794091314077377, + "rewards/rejected": -0.39808616042137146, "step": 4640 }, { - "epoch": 0.8, - "grad_norm": 36.07331886816096, - "learning_rate": 3.7456570119413034e-07, - "logits/chosen": -1.413480520248413, - "logits/rejected": -1.3600969314575195, - "logps/chosen": -189.5355987548828, - "logps/rejected": -251.88204956054688, - "loss": 0.5552, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.3538799285888672, - "rewards/margins": 0.6571952700614929, - "rewards/rejected": -2.011075496673584, + "epoch": 0.801171605789111, + "grad_norm": 5.463457107543945, + "learning_rate": 7.491314023882607e-08, + "logits/chosen": -2.781244993209839, + "logits/rejected": -2.7495713233947754, + "logps/chosen": -84.5881576538086, + "logps/rejected": -89.88493347167969, + "loss": 0.6561, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.304373562335968, + "rewards/margins": 0.08668698370456696, + "rewards/rejected": -0.39106056094169617, "step": 4650 }, { - "epoch": 0.8, - "grad_norm": 19.411023629464214, - "learning_rate": 3.739133148165994e-07, - "logits/chosen": -1.4477910995483398, - "logits/rejected": -1.4069766998291016, - "logps/chosen": -176.53195190429688, - "logps/rejected": -239.30844116210938, - "loss": 0.5337, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.2224671840667725, - "rewards/margins": 0.62762051820755, - "rewards/rejected": -1.8500875234603882, + "epoch": 0.80289455547898, + "grad_norm": 6.089754104614258, + "learning_rate": 7.478266296331988e-08, + "logits/chosen": -2.7720773220062256, + "logits/rejected": -2.754800319671631, + "logps/chosen": -82.82508087158203, + "logps/rejected": -91.43245697021484, + "loss": 0.6551, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.28517812490463257, + "rewards/margins": 0.0863594189286232, + "rewards/rejected": -0.37153756618499756, "step": 4660 }, { - "epoch": 0.8, - "grad_norm": 22.362534471168633, - "learning_rate": 3.7325980788142146e-07, - "logits/chosen": -1.4072405099868774, - "logits/rejected": -1.357230305671692, - "logps/chosen": -165.07376098632812, - "logps/rejected": -237.02816772460938, - "loss": 0.4906, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.1267932653427124, - "rewards/margins": 0.7528173327445984, - "rewards/rejected": -1.8796107769012451, + "epoch": 0.8046175051688491, + "grad_norm": 4.796602725982666, + "learning_rate": 7.46519615762843e-08, + "logits/chosen": -2.763443946838379, + "logits/rejected": -2.7395286560058594, + "logps/chosen": -80.69648742675781, + "logps/rejected": -85.73301696777344, + "loss": 0.6568, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.2826913595199585, + "rewards/margins": 0.08366372436285019, + "rewards/rejected": -0.3663550913333893, "step": 4670 }, { - "epoch": 0.81, - "grad_norm": 25.270149260749562, - "learning_rate": 3.726051862983101e-07, - "logits/chosen": -1.3556668758392334, - "logits/rejected": -1.3043700456619263, - "logps/chosen": -190.7840576171875, - "logps/rejected": -255.9459686279297, - "loss": 0.5537, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3830840587615967, - "rewards/margins": 0.6596297025680542, - "rewards/rejected": -2.0427136421203613, + "epoch": 0.8063404548587181, + "grad_norm": 5.143124580383301, + "learning_rate": 7.452103725966201e-08, + "logits/chosen": -2.750540256500244, + "logits/rejected": -2.7209665775299072, + "logps/chosen": -82.34010314941406, + "logps/rejected": -89.6563949584961, + "loss": 0.6594, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.2984951436519623, + "rewards/margins": 0.08116915076971054, + "rewards/rejected": -0.3796643614768982, "step": 4680 }, { - "epoch": 0.81, - "grad_norm": 24.777840014526955, - "learning_rate": 3.7194945598705864e-07, - "logits/chosen": -1.3643500804901123, - "logits/rejected": -1.3103562593460083, - "logps/chosen": -213.9306182861328, - "logps/rejected": -301.82757568359375, - "loss": 0.5075, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.5816562175750732, - "rewards/margins": 0.8936996459960938, - "rewards/rejected": -2.475355625152588, + "epoch": 0.8080634045485872, + "grad_norm": 5.349360942840576, + "learning_rate": 7.438989119741173e-08, + "logits/chosen": -2.7934322357177734, + "logits/rejected": -2.7713704109191895, + "logps/chosen": -86.65959930419922, + "logps/rejected": -95.86109924316406, + "loss": 0.6485, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3089648187160492, + "rewards/margins": 0.10664431750774384, + "rewards/rejected": -0.41560912132263184, "step": 4690 }, { - "epoch": 0.81, - "grad_norm": 35.9367234479366, - "learning_rate": 3.712926228774868e-07, - "logits/chosen": -1.298680067062378, - "logits/rejected": -1.252151608467102, - "logps/chosen": -218.3412322998047, - "logps/rejected": -308.64752197265625, - "loss": 0.5187, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.6659730672836304, - "rewards/margins": 0.8636065721511841, - "rewards/rejected": -2.5295798778533936, + "epoch": 0.8097863542384562, + "grad_norm": 5.843964576721191, + "learning_rate": 7.425852457549736e-08, + "logits/chosen": -2.763597249984741, + "logits/rejected": -2.755293130874634, + "logps/chosen": -83.36461639404297, + "logps/rejected": -97.21564483642578, + "loss": 0.6526, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.31630265712738037, + "rewards/margins": 0.09895575791597366, + "rewards/rejected": -0.4152584671974182, "step": 4700 }, { - "epoch": 0.81, - "eval_logits/chosen": -1.3934706449508667, - "eval_logits/rejected": -1.3670498132705688, - "eval_logps/chosen": -205.2917938232422, - "eval_logps/rejected": -246.5410614013672, - "eval_loss": 0.6171659231185913, - "eval_rewards/accuracies": 0.6654275059700012, - "eval_rewards/chosen": -1.4658793210983276, - "eval_rewards/margins": 0.367957204580307, - "eval_rewards/rejected": -1.833836555480957, - "eval_runtime": 356.7771, - "eval_samples_per_second": 12.064, - "eval_steps_per_second": 1.508, + "epoch": 0.8097863542384562, + "eval_logits/chosen": -2.8489065170288086, + "eval_logits/rejected": -2.8429276943206787, + "eval_logps/chosen": -82.56464385986328, + "eval_logps/rejected": -91.90455627441406, + "eval_loss": 0.6727486848831177, + "eval_rewards/accuracies": 0.606877326965332, + "eval_rewards/chosen": -0.23852740228176117, + "eval_rewards/margins": 0.0487169586122036, + "eval_rewards/rejected": -0.2872443199157715, + "eval_runtime": 359.9733, + "eval_samples_per_second": 11.956, + "eval_steps_per_second": 1.495, "step": 4700 }, { - "epoch": 0.81, - "grad_norm": 32.393157890217395, - "learning_rate": 3.7063469290938696e-07, - "logits/chosen": -1.3531776666641235, - "logits/rejected": -1.3033009767532349, - "logps/chosen": -212.64096069335938, - "logps/rejected": -282.6040954589844, - "loss": 0.5413, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.546820044517517, - "rewards/margins": 0.7553335428237915, - "rewards/rejected": -2.3021538257598877, + "epoch": 0.8115093039283253, + "grad_norm": 5.262428283691406, + "learning_rate": 7.41269385818774e-08, + "logits/chosen": -2.810303211212158, + "logits/rejected": -2.7742409706115723, + "logps/chosen": -88.20140075683594, + "logps/rejected": -92.91087341308594, + "loss": 0.653, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.3025413155555725, + "rewards/margins": 0.1027384027838707, + "rewards/rejected": -0.4052796959877014, "step": 4710 }, { - "epoch": 0.81, - "grad_norm": 21.02352487673719, - "learning_rate": 3.699756720324706e-07, - "logits/chosen": -1.2925106287002563, - "logits/rejected": -1.2351093292236328, - "logps/chosen": -197.33441162109375, - "logps/rejected": -281.5450439453125, - "loss": 0.4884, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.405700922012329, - "rewards/margins": 0.8855878710746765, - "rewards/rejected": -2.2912888526916504, + "epoch": 0.8132322536181944, + "grad_norm": 5.286714553833008, + "learning_rate": 7.399513440649412e-08, + "logits/chosen": -2.6862411499023438, + "logits/rejected": -2.657238483428955, + "logps/chosen": -86.468017578125, + "logps/rejected": -93.57646179199219, + "loss": 0.6442, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.29665708541870117, + "rewards/margins": 0.11476937681436539, + "rewards/rejected": -0.41142645478248596, "step": 4720 }, { - "epoch": 0.81, - "grad_norm": 26.71400422066647, - "learning_rate": 3.693155662063141e-07, - "logits/chosen": -1.2681843042373657, - "logits/rejected": -1.219074010848999, - "logps/chosen": -199.86378479003906, - "logps/rejected": -269.1332092285156, - "loss": 0.5673, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.4612171649932861, - "rewards/margins": 0.7179456353187561, - "rewards/rejected": -2.1791629791259766, + "epoch": 0.8149552033080634, + "grad_norm": 5.766880989074707, + "learning_rate": 7.386311324126282e-08, + "logits/chosen": -2.680243968963623, + "logits/rejected": -2.656158924102783, + "logps/chosen": -86.38379669189453, + "logps/rejected": -90.43648529052734, + "loss": 0.667, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3263756334781647, + "rewards/margins": 0.06575731933116913, + "rewards/rejected": -0.3921329379081726, "step": 4730 }, { - "epoch": 0.82, - "grad_norm": 16.242282256503056, - "learning_rate": 3.686543814003053e-07, - "logits/chosen": -1.3467975854873657, - "logits/rejected": -1.3040544986724854, - "logps/chosen": -180.89808654785156, - "logps/rejected": -271.9189453125, - "loss": 0.4748, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.2646945714950562, - "rewards/margins": 0.9159406423568726, - "rewards/rejected": -2.1806349754333496, + "epoch": 0.8166781529979324, + "grad_norm": 5.2600297927856445, + "learning_rate": 7.373087628006106e-08, + "logits/chosen": -2.7628931999206543, + "logits/rejected": -2.751981258392334, + "logps/chosen": -83.48346710205078, + "logps/rejected": -92.15267181396484, + "loss": 0.6531, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.2904190421104431, + "rewards/margins": 0.09240089356899261, + "rewards/rejected": -0.38281992077827454, "step": 4740 }, { - "epoch": 0.82, - "grad_norm": 22.35210107859672, - "learning_rate": 3.6799212359358933e-07, - "logits/chosen": -1.2919436693191528, - "logits/rejected": -1.2553608417510986, - "logps/chosen": -206.98501586914062, - "logps/rejected": -273.5370788574219, - "loss": 0.54, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.5159308910369873, - "rewards/margins": 0.673417866230011, - "rewards/rejected": -2.1893489360809326, + "epoch": 0.8184011026878015, + "grad_norm": 5.543763637542725, + "learning_rate": 7.359842471871787e-08, + "logits/chosen": -2.7339413166046143, + "logits/rejected": -2.7162299156188965, + "logps/chosen": -83.31120300292969, + "logps/rejected": -92.21416473388672, + "loss": 0.6511, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.27919134497642517, + "rewards/margins": 0.09701523184776306, + "rewards/rejected": -0.37620657682418823, "step": 4750 }, { - "epoch": 0.82, - "grad_norm": 29.45848931001129, - "learning_rate": 3.6732879877501453e-07, - "logits/chosen": -1.2655035257339478, - "logits/rejected": -1.2089664936065674, - "logps/chosen": -215.909912109375, - "logps/rejected": -307.30560302734375, - "loss": 0.4823, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.6345545053482056, - "rewards/margins": 0.9228925704956055, - "rewards/rejected": -2.5574469566345215, + "epoch": 0.8201240523776706, + "grad_norm": 6.084500789642334, + "learning_rate": 7.346575975500291e-08, + "logits/chosen": -2.73199725151062, + "logits/rejected": -2.7106871604919434, + "logps/chosen": -84.28748321533203, + "logps/rejected": -91.7631607055664, + "loss": 0.657, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.3181541860103607, + "rewards/margins": 0.08389735966920853, + "rewards/rejected": -0.40205153822898865, "step": 4760 }, { - "epoch": 0.82, - "grad_norm": 19.064566618433858, - "learning_rate": 3.666644129430784e-07, - "logits/chosen": -1.3485455513000488, - "logits/rejected": -1.3007423877716064, - "logps/chosen": -227.8409423828125, - "logps/rejected": -299.4037170410156, - "loss": 0.5629, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.6823813915252686, - "rewards/margins": 0.7599068284034729, - "rewards/rejected": -2.4422881603240967, + "epoch": 0.8218470020675396, + "grad_norm": 5.495306491851807, + "learning_rate": 7.333288258861567e-08, + "logits/chosen": -2.7941107749938965, + "logits/rejected": -2.773500919342041, + "logps/chosen": -92.41073608398438, + "logps/rejected": -96.80774688720703, + "loss": 0.6574, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.3282061517238617, + "rewards/margins": 0.08787950128316879, + "rewards/rejected": -0.4160856604576111, "step": 4770 }, { - "epoch": 0.82, - "grad_norm": 22.624642669624826, - "learning_rate": 3.65998972105873e-07, - "logits/chosen": -1.280133605003357, - "logits/rejected": -1.228562593460083, - "logps/chosen": -196.01754760742188, - "logps/rejected": -289.71990966796875, - "loss": 0.461, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.4455887079238892, - "rewards/margins": 0.912562370300293, - "rewards/rejected": -2.3581509590148926, + "epoch": 0.8235699517574087, + "grad_norm": 6.325778007507324, + "learning_rate": 7.31997944211746e-08, + "logits/chosen": -2.6705031394958496, + "logits/rejected": -2.651705741882324, + "logps/chosen": -81.9596939086914, + "logps/rejected": -96.26366424560547, + "loss": 0.6408, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.30489587783813477, + "rewards/margins": 0.11830363422632217, + "rewards/rejected": -0.42319950461387634, "step": 4780 }, { - "epoch": 0.83, - "grad_norm": 17.630300335409032, - "learning_rate": 3.6533248228103114e-07, - "logits/chosen": -1.3750900030136108, - "logits/rejected": -1.324573278427124, - "logps/chosen": -203.55032348632812, - "logps/rejected": -268.24688720703125, - "loss": 0.528, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.4577271938323975, - "rewards/margins": 0.6970139741897583, - "rewards/rejected": -2.154741048812866, + "epoch": 0.8252929014472777, + "grad_norm": 4.711288928985596, + "learning_rate": 7.306649645620623e-08, + "logits/chosen": -2.7664904594421387, + "logits/rejected": -2.7429041862487793, + "logps/chosen": -91.65912628173828, + "logps/rejected": -91.39378356933594, + "loss": 0.6751, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.3386782109737396, + "rewards/margins": 0.0473986491560936, + "rewards/rejected": -0.3860768675804138, "step": 4790 }, { - "epoch": 0.83, - "grad_norm": 22.323338607470507, - "learning_rate": 3.646649494956717e-07, - "logits/chosen": -1.3112070560455322, - "logits/rejected": -1.2691413164138794, - "logps/chosen": -205.2784881591797, - "logps/rejected": -268.1100769042969, - "loss": 0.5805, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.5329288244247437, - "rewards/margins": 0.6451320648193359, - "rewards/rejected": -2.178061008453369, + "epoch": 0.8270158511371468, + "grad_norm": 5.897047996520996, + "learning_rate": 7.293298989913435e-08, + "logits/chosen": -2.6948139667510986, + "logits/rejected": -2.6794960498809814, + "logps/chosen": -82.49443054199219, + "logps/rejected": -88.50199127197266, + "loss": 0.6604, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.3049885630607605, + "rewards/margins": 0.07673478126525879, + "rewards/rejected": -0.3817233443260193, "step": 4800 }, { - "epoch": 0.83, - "eval_logits/chosen": -1.445318341255188, - "eval_logits/rejected": -1.4196213483810425, - "eval_logps/chosen": -201.05026245117188, - "eval_logps/rejected": -241.25576782226562, - "eval_loss": 0.6145854592323303, - "eval_rewards/accuracies": 0.6619423627853394, - "eval_rewards/chosen": -1.423464059829712, - "eval_rewards/margins": 0.35751983523368835, - "eval_rewards/rejected": -1.780983805656433, - "eval_runtime": 356.6955, - "eval_samples_per_second": 12.066, - "eval_steps_per_second": 1.508, + "epoch": 0.8270158511371468, + "eval_logits/chosen": -2.840977191925049, + "eval_logits/rejected": -2.8350658416748047, + "eval_logps/chosen": -83.65935516357422, + "eval_logps/rejected": -93.16956329345703, + "eval_loss": 0.6720952987670898, + "eval_rewards/accuracies": 0.6089683771133423, + "eval_rewards/chosen": -0.2494746446609497, + "eval_rewards/margins": 0.05041969567537308, + "eval_rewards/rejected": -0.2998943626880646, + "eval_runtime": 360.2892, + "eval_samples_per_second": 11.946, + "eval_steps_per_second": 1.493, "step": 4800 }, { - "epoch": 0.83, - "grad_norm": 23.1783795567982, - "learning_rate": 3.6399637978634497e-07, - "logits/chosen": -1.2973178625106812, - "logits/rejected": -1.234431505203247, - "logps/chosen": -206.18838500976562, - "logps/rejected": -280.7757263183594, - "loss": 0.5075, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.4898689985275269, - "rewards/margins": 0.7940649390220642, - "rewards/rejected": -2.2839341163635254, + "epoch": 0.8287388008270159, + "grad_norm": 5.640036106109619, + "learning_rate": 7.279927595726899e-08, + "logits/chosen": -2.7054238319396973, + "logits/rejected": -2.6721575260162354, + "logps/chosen": -88.3663558959961, + "logps/rejected": -93.75032043457031, + "loss": 0.6497, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.3115006685256958, + "rewards/margins": 0.1022612601518631, + "rewards/rejected": -0.4137619137763977, "step": 4810 }, { - "epoch": 0.83, - "grad_norm": 20.941205872084087, - "learning_rate": 3.6332677919897823e-07, - "logits/chosen": -1.330582857131958, - "logits/rejected": -1.2920982837677002, - "logps/chosen": -202.64508056640625, - "logps/rejected": -277.89532470703125, - "loss": 0.5228, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.514899492263794, - "rewards/margins": 0.7458511590957642, - "rewards/rejected": -2.2607505321502686, + "epoch": 0.8304617505168849, + "grad_norm": 5.218100547790527, + "learning_rate": 7.266535583979565e-08, + "logits/chosen": -2.7404797077178955, + "logits/rejected": -2.7312655448913574, + "logps/chosen": -82.75204467773438, + "logps/rejected": -92.75565338134766, + "loss": 0.6541, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.3159467577934265, + "rewards/margins": 0.09343425929546356, + "rewards/rejected": -0.40938109159469604, "step": 4820 }, { - "epoch": 0.83, - "grad_norm": 20.30033885452288, - "learning_rate": 3.626561537888214e-07, - "logits/chosen": -1.3852955102920532, - "logits/rejected": -1.3380589485168457, - "logps/chosen": -196.2224884033203, - "logps/rejected": -265.86505126953125, - "loss": 0.5619, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.426672101020813, - "rewards/margins": 0.7057501673698425, - "rewards/rejected": -2.1324222087860107, + "epoch": 0.832184700206754, + "grad_norm": 4.796080589294434, + "learning_rate": 7.253123075776428e-08, + "logits/chosen": -2.7532737255096436, + "logits/rejected": -2.731318950653076, + "logps/chosen": -86.41776275634766, + "logps/rejected": -91.73139953613281, + "loss": 0.6688, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.32840853929519653, + "rewards/margins": 0.06262843310832977, + "rewards/rejected": -0.3910369277000427, "step": 4830 }, { - "epoch": 0.83, - "grad_norm": 28.339668690193722, - "learning_rate": 3.6198450962039146e-07, - "logits/chosen": -1.3548475503921509, - "logits/rejected": -1.2958180904388428, - "logps/chosen": -199.49864196777344, - "logps/rejected": -276.6757507324219, - "loss": 0.5025, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.4332863092422485, - "rewards/margins": 0.809424102306366, - "rewards/rejected": -2.2427103519439697, + "epoch": 0.833907649896623, + "grad_norm": 5.210707664489746, + "learning_rate": 7.239690192407829e-08, + "logits/chosen": -2.7101588249206543, + "logits/rejected": -2.680236339569092, + "logps/chosen": -88.89765930175781, + "logps/rejected": -96.16397094726562, + "loss": 0.6465, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.32727310061454773, + "rewards/margins": 0.110235795378685, + "rewards/rejected": -0.43750882148742676, "step": 4840 }, { - "epoch": 0.84, - "grad_norm": 24.489009584175815, - "learning_rate": 3.6131185276741846e-07, - "logits/chosen": -1.4219049215316772, - "logits/rejected": -1.377803087234497, - "logps/chosen": -193.35806274414062, - "logps/rejected": -261.9024353027344, - "loss": 0.5377, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.392596960067749, - "rewards/margins": 0.6902645826339722, - "rewards/rejected": -2.0828614234924316, + "epoch": 0.8356305995864921, + "grad_norm": 5.94487190246582, + "learning_rate": 7.226237055348368e-08, + "logits/chosen": -2.7761588096618652, + "logits/rejected": -2.7590034008026123, + "logps/chosen": -85.37464141845703, + "logps/rejected": -95.1301498413086, + "loss": 0.6502, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.3124055862426758, + "rewards/margins": 0.10277421772480011, + "rewards/rejected": -0.4151798188686371, "step": 4850 }, { - "epoch": 0.84, - "grad_norm": 23.049416952706352, - "learning_rate": 3.6063818931278997e-07, - "logits/chosen": -1.438050627708435, - "logits/rejected": -1.3838953971862793, - "logps/chosen": -196.03994750976562, - "logps/rejected": -255.8987579345703, - "loss": 0.5318, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3760112524032593, - "rewards/margins": 0.6772761344909668, - "rewards/rejected": -2.0532872676849365, + "epoch": 0.8373535492763611, + "grad_norm": 6.092998504638672, + "learning_rate": 7.2127637862558e-08, + "logits/chosen": -2.783973217010498, + "logits/rejected": -2.7518773078918457, + "logps/chosen": -91.34198760986328, + "logps/rejected": -90.67448425292969, + "loss": 0.6636, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3288485109806061, + "rewards/margins": 0.07213427126407623, + "rewards/rejected": -0.4009827673435211, "step": 4860 }, { - "epoch": 0.84, - "grad_norm": 27.95119723700728, - "learning_rate": 3.599635253484967e-07, - "logits/chosen": -1.458106279373169, - "logits/rejected": -1.4050066471099854, - "logps/chosen": -196.61280822753906, - "logps/rejected": -271.3414611816406, - "loss": 0.5218, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3915178775787354, - "rewards/margins": 0.8070972561836243, - "rewards/rejected": -2.1986148357391357, + "epoch": 0.8390764989662302, + "grad_norm": 6.207915782928467, + "learning_rate": 7.199270506969934e-08, + "logits/chosen": -2.7969024181365967, + "logits/rejected": -2.771622657775879, + "logps/chosen": -91.00338745117188, + "logps/rejected": -94.34010314941406, + "loss": 0.6543, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.33533793687820435, + "rewards/margins": 0.09314209222793579, + "rewards/rejected": -0.4284800589084625, "step": 4870 }, { - "epoch": 0.84, - "grad_norm": 23.334821262369495, - "learning_rate": 3.592878669755767e-07, - "logits/chosen": -1.3905763626098633, - "logits/rejected": -1.33687424659729, - "logps/chosen": -179.8762664794922, - "logps/rejected": -240.846923828125, - "loss": 0.5348, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.26702082157135, - "rewards/margins": 0.6355078220367432, - "rewards/rejected": -1.9025285243988037, + "epoch": 0.8407994486560992, + "grad_norm": 5.539631366729736, + "learning_rate": 7.185757339511533e-08, + "logits/chosen": -2.6904730796813965, + "logits/rejected": -2.6575820446014404, + "logps/chosen": -84.82585906982422, + "logps/rejected": -90.90750885009766, + "loss": 0.6566, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.3164503872394562, + "rewards/margins": 0.08641557395458221, + "rewards/rejected": -0.4028659760951996, "step": 4880 }, { - "epoch": 0.84, - "grad_norm": 20.651381347651178, - "learning_rate": 3.586112203040607e-07, - "logits/chosen": -1.4436790943145752, - "logits/rejected": -1.3931138515472412, - "logps/chosen": -185.76181030273438, - "logps/rejected": -262.454345703125, - "loss": 0.5051, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3033283948898315, - "rewards/margins": 0.785824179649353, - "rewards/rejected": -2.0891528129577637, + "epoch": 0.8425223983459683, + "grad_norm": 5.719715595245361, + "learning_rate": 7.172224406081215e-08, + "logits/chosen": -2.7437620162963867, + "logits/rejected": -2.7225348949432373, + "logps/chosen": -88.09601593017578, + "logps/rejected": -96.74874114990234, + "loss": 0.6477, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.32675427198410034, + "rewards/margins": 0.1053960919380188, + "rewards/rejected": -0.4321504235267639, "step": 4890 }, { - "epoch": 0.84, - "grad_norm": 14.620800977419114, - "learning_rate": 3.5793359145291665e-07, - "logits/chosen": -1.4301611185073853, - "logits/rejected": -1.3711490631103516, - "logps/chosen": -181.84730529785156, - "logps/rejected": -252.3798065185547, - "loss": 0.537, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2864367961883545, - "rewards/margins": 0.7350460886955261, - "rewards/rejected": -2.0214829444885254, + "epoch": 0.8442453480358374, + "grad_norm": 5.439476490020752, + "learning_rate": 7.158671829058332e-08, + "logits/chosen": -2.719465732574463, + "logits/rejected": -2.6880860328674316, + "logps/chosen": -87.46734619140625, + "logps/rejected": -92.49640655517578, + "loss": 0.6664, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.34264951944351196, + "rewards/margins": 0.07987688481807709, + "rewards/rejected": -0.42252641916275024, "step": 4900 }, { - "epoch": 0.84, - "eval_logits/chosen": -1.5460282564163208, - "eval_logits/rejected": -1.5222222805023193, - "eval_logps/chosen": -179.59288024902344, - "eval_logps/rejected": -214.940185546875, - "eval_loss": 0.6194379925727844, - "eval_rewards/accuracies": 0.6556691527366638, - "eval_rewards/chosen": -1.2088903188705444, - "eval_rewards/margins": 0.30893754959106445, - "eval_rewards/rejected": -1.517828106880188, - "eval_runtime": 356.6033, - "eval_samples_per_second": 12.069, - "eval_steps_per_second": 1.509, + "epoch": 0.8442453480358374, + "eval_logits/chosen": -2.8323721885681152, + "eval_logits/rejected": -2.826402425765991, + "eval_logps/chosen": -84.92655181884766, + "eval_logps/rejected": -94.65947723388672, + "eval_loss": 0.6712405681610107, + "eval_rewards/accuracies": 0.604786217212677, + "eval_rewards/chosen": -0.2621465027332306, + "eval_rewards/margins": 0.052647095173597336, + "eval_rewards/rejected": -0.3147936165332794, + "eval_runtime": 360.6945, + "eval_samples_per_second": 11.933, + "eval_steps_per_second": 1.492, "step": 4900 }, { - "epoch": 0.85, - "grad_norm": 18.773623561195404, - "learning_rate": 3.5725498654999436e-07, - "logits/chosen": -1.572040319442749, - "logits/rejected": -1.511036992073059, - "logps/chosen": -181.8535919189453, - "logps/rejected": -260.3033142089844, - "loss": 0.5114, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2552320957183838, - "rewards/margins": 0.8234399557113647, - "rewards/rejected": -2.078671932220459, + "epoch": 0.8459682977257064, + "grad_norm": 6.13719367980957, + "learning_rate": 7.145099730999888e-08, + "logits/chosen": -2.8455302715301514, + "logits/rejected": -2.81510591506958, + "logps/chosen": -89.68118286132812, + "logps/rejected": -94.53526306152344, + "loss": 0.6571, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3334195017814636, + "rewards/margins": 0.08758939802646637, + "rewards/rejected": -0.4210088849067688, "step": 4910 }, { - "epoch": 0.85, - "grad_norm": 25.741760898047566, - "learning_rate": 3.5657541173197025e-07, - "logits/chosen": -1.3761519193649292, - "logits/rejected": -1.3280622959136963, - "logps/chosen": -193.3008575439453, - "logps/rejected": -278.3187561035156, - "loss": 0.494, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.394696593284607, - "rewards/margins": 0.8541049957275391, - "rewards/rejected": -2.2488017082214355, + "epoch": 0.8476912474155754, + "grad_norm": 5.903871536254883, + "learning_rate": 7.131508234639405e-08, + "logits/chosen": -2.683255195617676, + "logits/rejected": -2.6657886505126953, + "logps/chosen": -86.96060180664062, + "logps/rejected": -98.50045013427734, + "loss": 0.643, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3312074542045593, + "rewards/margins": 0.11914181709289551, + "rewards/rejected": -0.45034927129745483, "step": 4920 }, { - "epoch": 0.85, - "grad_norm": 27.217333252692796, - "learning_rate": 3.558948731442918e-07, - "logits/chosen": -1.5090538263320923, - "logits/rejected": -1.461111307144165, - "logps/chosen": -210.60617065429688, - "logps/rejected": -291.8194885253906, - "loss": 0.5621, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.5748202800750732, - "rewards/margins": 0.7634907960891724, - "rewards/rejected": -2.338311195373535, + "epoch": 0.8494141971054445, + "grad_norm": 5.683137893676758, + "learning_rate": 7.117897462885836e-08, + "logits/chosen": -2.853965997695923, + "logits/rejected": -2.8416576385498047, + "logps/chosen": -88.86614990234375, + "logps/rejected": -102.5183334350586, + "loss": 0.6575, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.35723018646240234, + "rewards/margins": 0.08792047947645187, + "rewards/rejected": -0.4451506733894348, "step": 4930 }, { - "epoch": 0.85, - "grad_norm": 22.65567566910284, - "learning_rate": 3.5521337694112177e-07, - "logits/chosen": -1.4714148044586182, - "logits/rejected": -1.4120241403579712, - "logps/chosen": -215.1482391357422, - "logps/rejected": -304.5157775878906, - "loss": 0.4672, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.592576265335083, - "rewards/margins": 0.9204828143119812, - "rewards/rejected": -2.513059139251709, + "epoch": 0.8511371467953136, + "grad_norm": 5.63087272644043, + "learning_rate": 7.104267538822435e-08, + "logits/chosen": -2.7871501445770264, + "logits/rejected": -2.758396625518799, + "logps/chosen": -90.38795471191406, + "logps/rejected": -97.97694396972656, + "loss": 0.6483, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.3446611762046814, + "rewards/margins": 0.10284022986888885, + "rewards/rejected": -0.44750142097473145, "step": 4940 }, { - "epoch": 0.85, - "grad_norm": 18.92994086706776, - "learning_rate": 3.5453092928528283e-07, - "logits/chosen": -1.2949804067611694, - "logits/rejected": -1.252745270729065, - "logps/chosen": -194.73434448242188, - "logps/rejected": -266.51446533203125, - "loss": 0.5488, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3993715047836304, - "rewards/margins": 0.7247661352157593, - "rewards/rejected": -2.1241374015808105, + "epoch": 0.8528600964851827, + "grad_norm": 6.1002044677734375, + "learning_rate": 7.090618585705657e-08, + "logits/chosen": -2.5923728942871094, + "logits/rejected": -2.578038215637207, + "logps/chosen": -85.99813079833984, + "logps/rejected": -94.92589569091797, + "loss": 0.6521, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.3117091953754425, + "rewards/margins": 0.09636653959751129, + "rewards/rejected": -0.4080757200717926, "step": 4950 }, { - "epoch": 0.85, - "grad_norm": 29.05218033955657, - "learning_rate": 3.538475363482017e-07, - "logits/chosen": -1.4200494289398193, - "logits/rejected": -1.379931926727295, - "logps/chosen": -191.14431762695312, - "logps/rejected": -275.96734619140625, - "loss": 0.4846, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.3891611099243164, - "rewards/margins": 0.8065220713615417, - "rewards/rejected": -2.195683240890503, + "epoch": 0.8545830461750517, + "grad_norm": 5.581972122192383, + "learning_rate": 7.076950726964034e-08, + "logits/chosen": -2.7105612754821777, + "logits/rejected": -2.7103323936462402, + "logps/chosen": -84.18190002441406, + "logps/rejected": -98.76054382324219, + "loss": 0.6481, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.3192813992500305, + "rewards/margins": 0.10413311421871185, + "rewards/rejected": -0.42341452836990356, "step": 4960 }, { - "epoch": 0.86, - "grad_norm": 25.53487607610273, - "learning_rate": 3.531632043098533e-07, - "logits/chosen": -1.3623136281967163, - "logits/rejected": -1.316384196281433, - "logps/chosen": -199.2927703857422, - "logps/rejected": -290.8288269042969, - "loss": 0.4895, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.4792590141296387, - "rewards/margins": 0.9106336832046509, - "rewards/rejected": -2.389892578125, + "epoch": 0.8563059958649207, + "grad_norm": 6.258480072021484, + "learning_rate": 7.063264086197066e-08, + "logits/chosen": -2.6861400604248047, + "logits/rejected": -2.6755423545837402, + "logps/chosen": -84.96774291992188, + "logps/rejected": -96.87010192871094, + "loss": 0.6457, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.33594292402267456, + "rewards/margins": 0.1142006665468216, + "rewards/rejected": -0.4501435160636902, "step": 4970 }, { - "epoch": 0.86, - "grad_norm": 18.067653883232882, - "learning_rate": 3.5247793935870493e-07, - "logits/chosen": -1.3661185503005981, - "logits/rejected": -1.318273663520813, - "logps/chosen": -203.88742065429688, - "logps/rejected": -304.10009765625, - "loss": 0.4466, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.510157823562622, - "rewards/margins": 0.9821771383285522, - "rewards/rejected": -2.492335081100464, + "epoch": 0.8580289455547898, + "grad_norm": 6.162121295928955, + "learning_rate": 7.049558787174099e-08, + "logits/chosen": -2.7526276111602783, + "logits/rejected": -2.7450337409973145, + "logps/chosen": -85.32099914550781, + "logps/rejected": -96.36564636230469, + "loss": 0.6561, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3246348798274994, + "rewards/margins": 0.09016862511634827, + "rewards/rejected": -0.41480350494384766, "step": 4980 }, { - "epoch": 0.86, - "grad_norm": 25.76809031221678, - "learning_rate": 3.5179174769166036e-07, - "logits/chosen": -1.2775933742523193, - "logits/rejected": -1.2376461029052734, - "logps/chosen": -229.0320587158203, - "logps/rejected": -312.44122314453125, - "loss": 0.5803, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.7728198766708374, - "rewards/margins": 0.8028677701950073, - "rewards/rejected": -2.575687885284424, + "epoch": 0.8597518952446589, + "grad_norm": 5.951625347137451, + "learning_rate": 7.035834953833208e-08, + "logits/chosen": -2.6703968048095703, + "logits/rejected": -2.6620566844940186, + "logps/chosen": -85.4616928100586, + "logps/rejected": -97.10552978515625, + "loss": 0.6581, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.3367730677127838, + "rewards/margins": 0.08541145920753479, + "rewards/rejected": -0.42218446731567383, "step": 4990 }, { - "epoch": 0.86, - "grad_norm": 24.778358441979794, - "learning_rate": 3.511046355140036e-07, - "logits/chosen": -1.2975661754608154, - "logits/rejected": -1.2417397499084473, - "logps/chosen": -212.7232666015625, - "logps/rejected": -299.6105041503906, - "loss": 0.5112, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.5695029497146606, - "rewards/margins": 0.8998052477836609, - "rewards/rejected": -2.4693078994750977, + "epoch": 0.8614748449345279, + "grad_norm": 6.248295783996582, + "learning_rate": 7.022092710280073e-08, + "logits/chosen": -2.6949782371520996, + "logits/rejected": -2.6747851371765137, + "logps/chosen": -88.69740295410156, + "logps/rejected": -96.07904052734375, + "loss": 0.6499, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.3289342522621155, + "rewards/margins": 0.10480217635631561, + "rewards/rejected": -0.4337364137172699, "step": 5000 }, { - "epoch": 0.86, - "eval_logits/chosen": -1.427557110786438, - "eval_logits/rejected": -1.4012691974639893, - "eval_logps/chosen": -209.61801147460938, - "eval_logps/rejected": -250.4540252685547, - "eval_loss": 0.6177005171775818, - "eval_rewards/accuracies": 0.6579925417900085, - "eval_rewards/chosen": -1.5091416835784912, - "eval_rewards/margins": 0.3638246953487396, - "eval_rewards/rejected": -1.8729661703109741, - "eval_runtime": 357.0332, - "eval_samples_per_second": 12.055, - "eval_steps_per_second": 1.507, + "epoch": 0.8614748449345279, + "eval_logits/chosen": -2.817162275314331, + "eval_logits/rejected": -2.8111133575439453, + "eval_logps/chosen": -85.77025604248047, + "eval_logps/rejected": -95.64825439453125, + "eval_loss": 0.6707120537757874, + "eval_rewards/accuracies": 0.5954925417900085, + "eval_rewards/chosen": -0.2705835700035095, + "eval_rewards/margins": 0.054097648710012436, + "eval_rewards/rejected": -0.32468119263648987, + "eval_runtime": 360.2067, + "eval_samples_per_second": 11.949, + "eval_steps_per_second": 1.494, "step": 5000 }, { - "epoch": 0.86, - "grad_norm": 23.044456102420437, - "learning_rate": 3.5041660903934306e-07, - "logits/chosen": -1.334160566329956, - "logits/rejected": -1.2778995037078857, - "logps/chosen": -218.86984252929688, - "logps/rejected": -302.5597229003906, - "loss": 0.4935, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.6585071086883545, - "rewards/margins": 0.8564162254333496, - "rewards/rejected": -2.514923572540283, + "epoch": 0.8631977946243969, + "grad_norm": 7.2535319328308105, + "learning_rate": 7.008332180786861e-08, + "logits/chosen": -2.6919608116149902, + "logits/rejected": -2.6655592918395996, + "logps/chosen": -89.97347259521484, + "logps/rejected": -97.08216094970703, + "loss": 0.6554, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.36922687292099, + "rewards/margins": 0.09045498073101044, + "rewards/rejected": -0.45968183875083923, "step": 5010 }, { - "epoch": 0.86, - "grad_norm": 31.632139485329414, - "learning_rate": 3.4972767448955516e-07, - "logits/chosen": -1.3136205673217773, - "logits/rejected": -1.2596207857131958, - "logps/chosen": -208.8480682373047, - "logps/rejected": -286.65570068359375, - "loss": 0.5453, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.5537054538726807, - "rewards/margins": 0.7929602265357971, - "rewards/rejected": -2.346665859222412, + "epoch": 0.864920744314266, + "grad_norm": 6.6150126457214355, + "learning_rate": 6.994553489791103e-08, + "logits/chosen": -2.6872353553771973, + "logits/rejected": -2.662476062774658, + "logps/chosen": -87.15821075439453, + "logps/rejected": -95.46469116210938, + "loss": 0.6541, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.3365879952907562, + "rewards/margins": 0.09809447824954987, + "rewards/rejected": -0.4346825182437897, "step": 5020 }, { - "epoch": 0.87, - "grad_norm": 28.18486571020162, - "learning_rate": 3.4903783809472793e-07, - "logits/chosen": -1.2829835414886475, - "logits/rejected": -1.240122675895691, - "logps/chosen": -198.78387451171875, - "logps/rejected": -277.5005187988281, - "loss": 0.5452, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.4618786573410034, - "rewards/margins": 0.7737663984298706, - "rewards/rejected": -2.235645055770874, + "epoch": 0.8666436940041351, + "grad_norm": 7.26581335067749, + "learning_rate": 6.980756761894559e-08, + "logits/chosen": -2.6435627937316895, + "logits/rejected": -2.6336257457733154, + "logps/chosen": -86.87774658203125, + "logps/rejected": -96.81118774414062, + "loss": 0.6585, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.34260568022727966, + "rewards/margins": 0.08591968566179276, + "rewards/rejected": -0.4285253882408142, "step": 5030 }, { - "epoch": 0.87, - "grad_norm": 24.06279497070965, - "learning_rate": 3.483471060931051e-07, - "logits/chosen": -1.50538170337677, - "logits/rejected": -1.4423930644989014, - "logps/chosen": -200.87606811523438, - "logps/rejected": -264.9050598144531, - "loss": 0.5298, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.4371707439422607, - "rewards/margins": 0.7078632116317749, - "rewards/rejected": -2.145033836364746, + "epoch": 0.8683666436940042, + "grad_norm": 6.305263519287109, + "learning_rate": 6.966942121862102e-08, + "logits/chosen": -2.8530077934265137, + "logits/rejected": -2.8140368461608887, + "logps/chosen": -92.75401306152344, + "logps/rejected": -93.6357421875, + "loss": 0.663, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3559655547142029, + "rewards/margins": 0.07592367380857468, + "rewards/rejected": -0.43188923597335815, "step": 5040 }, { - "epoch": 0.87, - "grad_norm": 22.362866544811133, - "learning_rate": 3.4765548473102936e-07, - "logits/chosen": -1.3779505491256714, - "logits/rejected": -1.329679250717163, - "logps/chosen": -202.9115753173828, - "logps/rejected": -277.89459228515625, - "loss": 0.534, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.4728914499282837, - "rewards/margins": 0.7591592073440552, - "rewards/rejected": -2.232050895690918, + "epoch": 0.8700895933838731, + "grad_norm": 6.878271102905273, + "learning_rate": 6.953109694620587e-08, + "logits/chosen": -2.733891725540161, + "logits/rejected": -2.7108194828033447, + "logps/chosen": -90.18342590332031, + "logps/rejected": -98.98524475097656, + "loss": 0.6534, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.34579095244407654, + "rewards/margins": 0.09699489176273346, + "rewards/rejected": -0.4427858293056488, "step": 5050 }, { - "epoch": 0.87, - "grad_norm": 23.051911448956087, - "learning_rate": 3.469629802628858e-07, - "logits/chosen": -1.3045955896377563, - "logits/rejected": -1.2757608890533447, - "logps/chosen": -192.4661865234375, - "logps/rejected": -255.61367797851562, - "loss": 0.5956, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.4202029705047607, - "rewards/margins": 0.6157991290092468, - "rewards/rejected": -2.0360023975372314, + "epoch": 0.8718125430737422, + "grad_norm": 5.571224212646484, + "learning_rate": 6.939259605257717e-08, + "logits/chosen": -2.597719430923462, + "logits/rejected": -2.5918784141540527, + "logps/chosen": -85.84352111816406, + "logps/rejected": -94.81729125976562, + "loss": 0.6632, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.3537163734436035, + "rewards/margins": 0.07410316169261932, + "rewards/rejected": -0.42781955003738403, "step": 5060 }, { - "epoch": 0.87, - "grad_norm": 24.88380616274711, - "learning_rate": 3.4626959895104585e-07, - "logits/chosen": -1.445326566696167, - "logits/rejected": -1.3971054553985596, - "logps/chosen": -176.08921813964844, - "logps/rejected": -235.4571990966797, - "loss": 0.5525, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2336560487747192, - "rewards/margins": 0.6401635408401489, - "rewards/rejected": -1.8738195896148682, + "epoch": 0.8735354927636113, + "grad_norm": 6.979639053344727, + "learning_rate": 6.925391979020918e-08, + "logits/chosen": -2.6971375942230225, + "logits/rejected": -2.67332124710083, + "logps/chosen": -88.00436401367188, + "logps/rejected": -93.44691467285156, + "loss": 0.6519, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.35256052017211914, + "rewards/margins": 0.10063686221837997, + "rewards/rejected": -0.4531974196434021, "step": 5070 }, { - "epoch": 0.88, - "grad_norm": 16.10357940315526, - "learning_rate": 3.4557534706580997e-07, - "logits/chosen": -1.690610647201538, - "logits/rejected": -1.6249040365219116, - "logps/chosen": -159.53741455078125, - "logps/rejected": -228.35623168945312, - "loss": 0.5054, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.020334005355835, - "rewards/margins": 0.7543589472770691, - "rewards/rejected": -1.7746931314468384, + "epoch": 0.8752584424534804, + "grad_norm": 5.353731155395508, + "learning_rate": 6.911506941316199e-08, + "logits/chosen": -2.883151054382324, + "logits/rejected": -2.8466620445251465, + "logps/chosen": -90.44625091552734, + "logps/rejected": -94.98185729980469, + "loss": 0.646, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3294186592102051, + "rewards/margins": 0.11147884279489517, + "rewards/rejected": -0.44089746475219727, "step": 5080 }, { - "epoch": 0.88, - "grad_norm": 22.84540156375547, - "learning_rate": 3.4488023088535144e-07, - "logits/chosen": -1.5469788312911987, - "logits/rejected": -1.4869216680526733, - "logps/chosen": -163.7722625732422, - "logps/rejected": -236.5236053466797, - "loss": 0.4949, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.09481680393219, - "rewards/margins": 0.7558841109275818, - "rewards/rejected": -1.8507009744644165, + "epoch": 0.8769813921433495, + "grad_norm": 6.316625595092773, + "learning_rate": 6.89760461770703e-08, + "logits/chosen": -2.7551684379577637, + "logits/rejected": -2.724618673324585, + "logps/chosen": -88.63999938964844, + "logps/rejected": -98.74547576904297, + "loss": 0.6384, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.3434928357601166, + "rewards/margins": 0.12895774841308594, + "rewards/rejected": -0.4724505543708801, "step": 5090 }, { - "epoch": 0.88, - "grad_norm": 24.117584748609755, - "learning_rate": 3.4418425669565946e-07, - "logits/chosen": -1.3648254871368408, - "logits/rejected": -1.3106800317764282, - "logps/chosen": -192.01718139648438, - "logps/rejected": -246.06515502929688, - "loss": 0.5746, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3694689273834229, - "rewards/margins": 0.5682665109634399, - "rewards/rejected": -1.9377353191375732, + "epoch": 0.8787043418332184, + "grad_norm": 8.141860008239746, + "learning_rate": 6.88368513391319e-08, + "logits/chosen": -2.5979244709014893, + "logits/rejected": -2.5645573139190674, + "logps/chosen": -91.8878402709961, + "logps/rejected": -96.7591323852539, + "loss": 0.6628, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.3682095408439636, + "rewards/margins": 0.07615621387958527, + "rewards/rejected": -0.44436579942703247, "step": 5100 }, { - "epoch": 0.88, - "eval_logits/chosen": -1.5572093725204468, - "eval_logits/rejected": -1.532787561416626, - "eval_logps/chosen": -180.94764709472656, - "eval_logps/rejected": -217.08363342285156, - "eval_loss": 0.6200332641601562, - "eval_rewards/accuracies": 0.6654275059700012, - "eval_rewards/chosen": -1.2224379777908325, - "eval_rewards/margins": 0.3168245851993561, - "eval_rewards/rejected": -1.5392626523971558, - "eval_runtime": 356.479, - "eval_samples_per_second": 12.074, - "eval_steps_per_second": 1.509, + "epoch": 0.8787043418332184, + "eval_logits/chosen": -2.8094429969787598, + "eval_logits/rejected": -2.8035061359405518, + "eval_logps/chosen": -87.14309692382812, + "eval_logps/rejected": -97.29234313964844, + "eval_loss": 0.6696966886520386, + "eval_rewards/accuracies": 0.5968866348266602, + "eval_rewards/chosen": -0.28431203961372375, + "eval_rewards/margins": 0.05681019648909569, + "eval_rewards/rejected": -0.34112221002578735, + "eval_runtime": 360.5514, + "eval_samples_per_second": 11.937, + "eval_steps_per_second": 1.492, "step": 5100 }, { - "epoch": 0.88, - "grad_norm": 28.379638712637114, - "learning_rate": 3.434874307904822e-07, - "logits/chosen": -1.4629642963409424, - "logits/rejected": -1.4055429697036743, - "logps/chosen": -198.49514770507812, - "logps/rejected": -260.3693542480469, - "loss": 0.5568, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.4061576128005981, - "rewards/margins": 0.6592377424240112, - "rewards/rejected": -2.0653955936431885, + "epoch": 0.8804272915230875, + "grad_norm": 7.131643772125244, + "learning_rate": 6.869748615809644e-08, + "logits/chosen": -2.7321696281433105, + "logits/rejected": -2.6981236934661865, + "logps/chosen": -95.87647247314453, + "logps/rejected": -99.60868835449219, + "loss": 0.6653, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3800361156463623, + "rewards/margins": 0.07765528559684753, + "rewards/rejected": -0.45769137144088745, "step": 5110 }, { - "epoch": 0.88, - "grad_norm": 33.84680091265571, - "learning_rate": 3.427897594712699e-07, - "logits/chosen": -1.5411012172698975, - "logits/rejected": -1.4923017024993896, - "logps/chosen": -190.82667541503906, - "logps/rejected": -243.14395141601562, - "loss": 0.5811, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.369838833808899, - "rewards/margins": 0.558884859085083, - "rewards/rejected": -1.928723931312561, + "epoch": 0.8821502412129566, + "grad_norm": 6.144846439361572, + "learning_rate": 6.855795189425398e-08, + "logits/chosen": -2.7883951663970947, + "logits/rejected": -2.7596137523651123, + "logps/chosen": -91.65258026123047, + "logps/rejected": -94.02896881103516, + "loss": 0.6695, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.37779197096824646, + "rewards/margins": 0.05963408201932907, + "rewards/rejected": -0.4374260902404785, "step": 5120 }, { - "epoch": 0.88, - "grad_norm": 21.2178772791051, - "learning_rate": 3.4209124904711805e-07, - "logits/chosen": -1.5400969982147217, - "logits/rejected": -1.481069803237915, - "logps/chosen": -191.25750732421875, - "logps/rejected": -274.0238037109375, - "loss": 0.4782, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3769677877426147, - "rewards/margins": 0.8635452389717102, - "rewards/rejected": -2.2405130863189697, + "epoch": 0.8838731909028257, + "grad_norm": 6.057769298553467, + "learning_rate": 6.841824980942361e-08, + "logits/chosen": -2.804337739944458, + "logits/rejected": -2.7736172676086426, + "logps/chosen": -90.45610046386719, + "logps/rejected": -95.85780334472656, + "loss": 0.6565, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.368899941444397, + "rewards/margins": 0.0899074450135231, + "rewards/rejected": -0.4588073790073395, "step": 5130 }, { - "epoch": 0.89, - "grad_norm": 27.959322283199082, - "learning_rate": 3.4139190583471025e-07, - "logits/chosen": -1.50569748878479, - "logits/rejected": -1.4533193111419678, - "logps/chosen": -185.509521484375, - "logps/rejected": -234.0798797607422, - "loss": 0.5815, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.2769523859024048, - "rewards/margins": 0.5343953967094421, - "rewards/rejected": -1.8113473653793335, + "epoch": 0.8855961405926946, + "grad_norm": 5.70902156829834, + "learning_rate": 6.827838116694204e-08, + "logits/chosen": -2.7242846488952637, + "logits/rejected": -2.6901285648345947, + "logps/chosen": -92.74956512451172, + "logps/rejected": -96.52131652832031, + "loss": 0.6574, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.3493127226829529, + "rewards/margins": 0.08618960529565811, + "rewards/rejected": -0.435502290725708, "step": 5140 }, { - "epoch": 0.89, - "grad_norm": 20.960498750902655, - "learning_rate": 3.4069173615826097e-07, - "logits/chosen": -1.5694390535354614, - "logits/rejected": -1.5354506969451904, - "logps/chosen": -168.58926391601562, - "logps/rejected": -225.9666748046875, - "loss": 0.5677, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.154682993888855, - "rewards/margins": 0.5564135313034058, - "rewards/rejected": -1.7110967636108398, + "epoch": 0.8873190902825637, + "grad_norm": 6.267994403839111, + "learning_rate": 6.81383472316522e-08, + "logits/chosen": -2.7582485675811768, + "logits/rejected": -2.749330997467041, + "logps/chosen": -86.73070526123047, + "logps/rejected": -98.75907897949219, + "loss": 0.6494, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.33600908517837524, + "rewards/margins": 0.10284741222858429, + "rewards/rejected": -0.4388565123081207, "step": 5150 }, { - "epoch": 0.89, - "grad_norm": 20.888551855019205, - "learning_rate": 3.399907463494585e-07, - "logits/chosen": -1.553257703781128, - "logits/rejected": -1.5001866817474365, - "logps/chosen": -163.6346435546875, - "logps/rejected": -215.9080810546875, - "loss": 0.5523, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.1223015785217285, - "rewards/margins": 0.5567636489868164, - "rewards/rejected": -1.6790653467178345, + "epoch": 0.8890420399724328, + "grad_norm": 6.955835819244385, + "learning_rate": 6.79981492698917e-08, + "logits/chosen": -2.7331669330596924, + "logits/rejected": -2.698927640914917, + "logps/chosen": -86.40743255615234, + "logps/rejected": -93.25131225585938, + "loss": 0.6508, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.34990599751472473, + "rewards/margins": 0.10248558223247528, + "rewards/rejected": -0.4523916244506836, "step": 5160 }, { - "epoch": 0.89, - "grad_norm": 22.44450995498008, - "learning_rate": 3.3928894274740773e-07, - "logits/chosen": -1.5365890264511108, - "logits/rejected": -1.4780817031860352, - "logps/chosen": -159.61709594726562, - "logps/rejected": -242.06234741210938, - "loss": 0.4865, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.0430219173431396, - "rewards/margins": 0.8591874837875366, - "rewards/rejected": -1.9022095203399658, + "epoch": 0.8907649896623019, + "grad_norm": 6.757615089416504, + "learning_rate": 6.785778854948155e-08, + "logits/chosen": -2.7200756072998047, + "logits/rejected": -2.6970667839050293, + "logps/chosen": -87.436767578125, + "logps/rejected": -98.19063568115234, + "loss": 0.6336, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3214018940925598, + "rewards/margins": 0.14203134179115295, + "rewards/rejected": -0.46343326568603516, "step": 5170 }, { - "epoch": 0.89, - "grad_norm": 32.65259207620487, - "learning_rate": 3.385863316985726e-07, - "logits/chosen": -1.5846903324127197, - "logits/rejected": -1.5513648986816406, - "logps/chosen": -196.44918823242188, - "logps/rejected": -248.993408203125, - "loss": 0.5864, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.4105911254882812, - "rewards/margins": 0.5344855785369873, - "rewards/rejected": -1.9450767040252686, + "epoch": 0.892487939352171, + "grad_norm": 7.490974426269531, + "learning_rate": 6.771726633971452e-08, + "logits/chosen": -2.8090243339538574, + "logits/rejected": -2.7990384101867676, + "logps/chosen": -94.57676696777344, + "logps/rejected": -101.28718566894531, + "loss": 0.664, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3917374312877655, + "rewards/margins": 0.07604822516441345, + "rewards/rejected": -0.46778565645217896, "step": 5180 }, { - "epoch": 0.89, - "grad_norm": 20.160509769449938, - "learning_rate": 3.3788291955671887e-07, - "logits/chosen": -1.4820839166641235, - "logits/rejected": -1.4463526010513306, - "logps/chosen": -182.4502716064453, - "logps/rejected": -242.9739990234375, - "loss": 0.5733, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.2962095737457275, - "rewards/margins": 0.5948411226272583, - "rewards/rejected": -1.8910505771636963, + "epoch": 0.8942108890420399, + "grad_norm": 7.311649799346924, + "learning_rate": 6.757658391134377e-08, + "logits/chosen": -2.691488027572632, + "logits/rejected": -2.6800453662872314, + "logps/chosen": -90.04177856445312, + "logps/rejected": -96.65260314941406, + "loss": 0.6734, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3720157742500305, + "rewards/margins": 0.05560746788978577, + "rewards/rejected": -0.4276232123374939, "step": 5190 }, { - "epoch": 0.9, - "grad_norm": 18.895506656949056, - "learning_rate": 3.371787126828568e-07, - "logits/chosen": -1.5754809379577637, - "logits/rejected": -1.5293941497802734, - "logps/chosen": -168.30630493164062, - "logps/rejected": -236.78439331054688, - "loss": 0.5138, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.1625381708145142, - "rewards/margins": 0.6649240255355835, - "rewards/rejected": -1.8274621963500977, + "epoch": 0.895933838731909, + "grad_norm": 5.600946426391602, + "learning_rate": 6.743574253657136e-08, + "logits/chosen": -2.757327079772949, + "logits/rejected": -2.7360892295837402, + "logps/chosen": -87.47918701171875, + "logps/rejected": -99.45861053466797, + "loss": 0.6513, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.35400596261024475, + "rewards/margins": 0.09994912892580032, + "rewards/rejected": -0.4539550244808197, "step": 5200 }, { - "epoch": 0.9, - "eval_logits/chosen": -1.6232234239578247, - "eval_logits/rejected": -1.6006020307540894, - "eval_logps/chosen": -162.8901824951172, - "eval_logps/rejected": -195.02578735351562, - "eval_loss": 0.6237266063690186, - "eval_rewards/accuracies": 0.6605483293533325, - "eval_rewards/chosen": -1.0418633222579956, - "eval_rewards/margins": 0.2768208086490631, - "eval_rewards/rejected": -1.3186841011047363, - "eval_runtime": 357.1428, - "eval_samples_per_second": 12.051, - "eval_steps_per_second": 1.506, + "epoch": 0.895933838731909, + "eval_logits/chosen": -2.8031022548675537, + "eval_logits/rejected": -2.7971508502960205, + "eval_logps/chosen": -87.38237762451172, + "eval_logps/rejected": -97.62223815917969, + "eval_loss": 0.6693427562713623, + "eval_rewards/accuracies": 0.5952602028846741, + "eval_rewards/chosen": -0.28670480847358704, + "eval_rewards/margins": 0.0577162504196167, + "eval_rewards/rejected": -0.3444210886955261, + "eval_runtime": 360.2553, + "eval_samples_per_second": 11.947, + "eval_steps_per_second": 1.493, "step": 5200 }, { - "epoch": 0.9, - "grad_norm": 20.636102998390292, - "learning_rate": 3.364737174451834e-07, - "logits/chosen": -1.5026520490646362, - "logits/rejected": -1.4632505178451538, - "logps/chosen": -176.74197387695312, - "logps/rejected": -229.8460693359375, - "loss": 0.5734, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.25114107131958, - "rewards/margins": 0.5474826097488403, - "rewards/rejected": -1.7986234426498413, + "epoch": 0.8976567884217781, + "grad_norm": 6.237802982330322, + "learning_rate": 6.729474348903667e-08, + "logits/chosen": -2.713639497756958, + "logits/rejected": -2.698694944381714, + "logps/chosen": -87.16832733154297, + "logps/rejected": -93.84297180175781, + "loss": 0.6605, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.35529080033302307, + "rewards/margins": 0.083133764564991, + "rewards/rejected": -0.43842458724975586, "step": 5210 }, { - "epoch": 0.9, - "grad_norm": 18.20246477650609, - "learning_rate": 3.3576794021902476e-07, - "logits/chosen": -1.5258533954620361, - "logits/rejected": -1.4866435527801514, - "logps/chosen": -168.38894653320312, - "logps/rejected": -228.6901092529297, - "loss": 0.5603, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.1579110622406006, - "rewards/margins": 0.5693751573562622, - "rewards/rejected": -1.7272861003875732, + "epoch": 0.8993797381116472, + "grad_norm": 5.577020168304443, + "learning_rate": 6.715358804380495e-08, + "logits/chosen": -2.7461514472961426, + "logits/rejected": -2.733440637588501, + "logps/chosen": -86.21382904052734, + "logps/rejected": -96.14476776123047, + "loss": 0.6684, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.3360275626182556, + "rewards/margins": 0.06560958921909332, + "rewards/rejected": -0.40163716673851013, "step": 5220 }, { - "epoch": 0.9, - "grad_norm": 25.25518148404087, - "learning_rate": 3.350613873867788e-07, - "logits/chosen": -1.4658780097961426, - "logits/rejected": -1.4225355386734009, - "logps/chosen": -174.45970153808594, - "logps/rejected": -261.13702392578125, - "loss": 0.4966, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.1921513080596924, - "rewards/margins": 0.858329176902771, - "rewards/rejected": -2.050480365753174, + "epoch": 0.9011026878015161, + "grad_norm": 6.821839809417725, + "learning_rate": 6.701227747735576e-08, + "logits/chosen": -2.6805355548858643, + "logits/rejected": -2.669785976409912, + "logps/chosen": -90.50724029541016, + "logps/rejected": -104.52873229980469, + "loss": 0.6384, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.3524440824985504, + "rewards/margins": 0.13172096014022827, + "rewards/rejected": -0.4841650128364563, "step": 5230 }, { - "epoch": 0.9, - "grad_norm": 22.578680932349204, - "learning_rate": 3.343540653378571e-07, - "logits/chosen": -1.4708452224731445, - "logits/rejected": -1.409401535987854, - "logps/chosen": -177.78860473632812, - "logps/rejected": -269.25457763671875, - "loss": 0.4701, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.2503529787063599, - "rewards/margins": 0.9091068506240845, - "rewards/rejected": -2.1594595909118652, + "epoch": 0.9028256374913852, + "grad_norm": 6.107116222381592, + "learning_rate": 6.687081306757142e-08, + "logits/chosen": -2.676337718963623, + "logits/rejected": -2.654982328414917, + "logps/chosen": -88.69650268554688, + "logps/rejected": -105.2641830444336, + "loss": 0.6248, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.3592351973056793, + "rewards/margins": 0.15993066132068634, + "rewards/rejected": -0.5191658735275269, "step": 5240 }, { - "epoch": 0.9, - "grad_norm": 21.94896818155034, - "learning_rate": 3.3364598046862754e-07, - "logits/chosen": -1.3917882442474365, - "logits/rejected": -1.3478825092315674, - "logps/chosen": -180.23175048828125, - "logps/rejected": -260.63482666015625, - "loss": 0.4965, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2926623821258545, - "rewards/margins": 0.8049648404121399, - "rewards/rejected": -2.0976271629333496, + "epoch": 0.9045485871812543, + "grad_norm": 5.749131679534912, + "learning_rate": 6.67291960937255e-08, + "logits/chosen": -2.6306498050689697, + "logits/rejected": -2.6170966625213623, + "logps/chosen": -87.65843200683594, + "logps/rejected": -96.63198852539062, + "loss": 0.6574, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36680710315704346, + "rewards/margins": 0.09071727842092514, + "rewards/rejected": -0.45752429962158203, "step": 5250 }, { - "epoch": 0.91, - "grad_norm": 42.50229156823177, - "learning_rate": 3.3293713918235594e-07, - "logits/chosen": -1.4157629013061523, - "logits/rejected": -1.3547483682632446, - "logps/chosen": -195.6414337158203, - "logps/rejected": -264.51416015625, - "loss": 0.5465, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.3889554738998413, - "rewards/margins": 0.741489827632904, - "rewards/rejected": -2.1304454803466797, + "epoch": 0.9062715368711234, + "grad_norm": 8.217474937438965, + "learning_rate": 6.658742783647119e-08, + "logits/chosen": -2.7182517051696777, + "logits/rejected": -2.6786859035491943, + "logps/chosen": -91.67625427246094, + "logps/rejected": -96.38301086425781, + "loss": 0.6523, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.34890884160995483, + "rewards/margins": 0.09981432557106018, + "rewards/rejected": -0.4487232267856598, "step": 5260 }, { - "epoch": 0.91, - "grad_norm": 20.377525883555307, - "learning_rate": 3.3222754788914875e-07, - "logits/chosen": -1.5662615299224854, - "logits/rejected": -1.526829719543457, - "logps/chosen": -177.85626220703125, - "logps/rejected": -250.97933959960938, - "loss": 0.5173, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.263085961341858, - "rewards/margins": 0.735329270362854, - "rewards/rejected": -1.998415231704712, + "epoch": 0.9079944865609925, + "grad_norm": 6.370172023773193, + "learning_rate": 6.644550957782975e-08, + "logits/chosen": -2.811823606491089, + "logits/rejected": -2.7978882789611816, + "logps/chosen": -88.22168731689453, + "logps/rejected": -98.79576110839844, + "loss": 0.6468, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.3666745722293854, + "rewards/margins": 0.10982397943735123, + "rewards/rejected": -0.4764985144138336, "step": 5270 }, { - "epoch": 0.91, - "grad_norm": 24.732098744846958, - "learning_rate": 3.315172130058946e-07, - "logits/chosen": -1.4817497730255127, - "logits/rejected": -1.4205673933029175, - "logps/chosen": -187.76739501953125, - "logps/rejected": -252.81613159179688, - "loss": 0.5296, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.3341633081436157, - "rewards/margins": 0.6874901056289673, - "rewards/rejected": -2.021653413772583, + "epoch": 0.9097174362508614, + "grad_norm": 6.619154930114746, + "learning_rate": 6.630344260117892e-08, + "logits/chosen": -2.715485095977783, + "logits/rejected": -2.6864123344421387, + "logps/chosen": -93.1703872680664, + "logps/rejected": -98.26622009277344, + "loss": 0.6578, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.3880094289779663, + "rewards/margins": 0.0879502147436142, + "rewards/rejected": -0.4759596884250641, "step": 5280 }, { - "epoch": 0.91, - "grad_norm": 26.624712530739988, - "learning_rate": 3.308061409562065e-07, - "logits/chosen": -1.4430485963821411, - "logits/rejected": -1.378722906112671, - "logps/chosen": -176.76901245117188, - "logps/rejected": -246.05923461914062, - "loss": 0.5194, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.209058165550232, - "rewards/margins": 0.7315285801887512, - "rewards/rejected": -1.940587043762207, + "epoch": 0.9114403859407305, + "grad_norm": 6.778704643249512, + "learning_rate": 6.61612281912413e-08, + "logits/chosen": -2.6618595123291016, + "logits/rejected": -2.6204473972320557, + "logps/chosen": -91.83499145507812, + "logps/rejected": -97.12114715576172, + "loss": 0.6565, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.35979264974594116, + "rewards/margins": 0.09102566540241241, + "rewards/rejected": -0.4508183002471924, "step": 5290 }, { - "epoch": 0.91, - "grad_norm": 16.5970454890233, - "learning_rate": 3.300943381703639e-07, - "logits/chosen": -1.4298001527786255, - "logits/rejected": -1.3858981132507324, - "logps/chosen": -189.7223358154297, - "logps/rejected": -266.050537109375, - "loss": 0.5094, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3463001251220703, - "rewards/margins": 0.7629293203353882, - "rewards/rejected": -2.109229564666748, + "epoch": 0.9131633356305996, + "grad_norm": 6.4996337890625, + "learning_rate": 6.601886763407278e-08, + "logits/chosen": -2.6700820922851562, + "logits/rejected": -2.6571602821350098, + "logps/chosen": -90.00111389160156, + "logps/rejected": -100.92208099365234, + "loss": 0.6475, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.34902939200401306, + "rewards/margins": 0.10871565341949463, + "rewards/rejected": -0.4577450752258301, "step": 5300 }, { - "epoch": 0.91, - "eval_logits/chosen": -1.5428049564361572, - "eval_logits/rejected": -1.5180394649505615, - "eval_logps/chosen": -187.3815460205078, - "eval_logps/rejected": -224.76116943359375, - "eval_loss": 0.6180873513221741, - "eval_rewards/accuracies": 0.6598513126373291, - "eval_rewards/chosen": -1.2867772579193115, - "eval_rewards/margins": 0.32926076650619507, - "eval_rewards/rejected": -1.6160376071929932, - "eval_runtime": 357.239, - "eval_samples_per_second": 12.048, - "eval_steps_per_second": 1.506, + "epoch": 0.9131633356305996, + "eval_logits/chosen": -2.7942566871643066, + "eval_logits/rejected": -2.7882344722747803, + "eval_logps/chosen": -87.72479248046875, + "eval_logps/rejected": -98.02127075195312, + "eval_loss": 0.6691641211509705, + "eval_rewards/accuracies": 0.5987453460693359, + "eval_rewards/chosen": -0.2901288866996765, + "eval_rewards/margins": 0.05828263610601425, + "eval_rewards/rejected": -0.34841153025627136, + "eval_runtime": 360.1225, + "eval_samples_per_second": 11.951, + "eval_steps_per_second": 1.494, "step": 5300 }, { - "epoch": 0.91, - "grad_norm": 24.46549230872405, - "learning_rate": 3.293818110852541e-07, - "logits/chosen": -1.5138168334960938, - "logits/rejected": -1.472394347190857, - "logps/chosen": -205.1785430908203, - "logps/rejected": -269.4486389160156, - "loss": 0.5451, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.4704174995422363, - "rewards/margins": 0.7003182172775269, - "rewards/rejected": -2.1707355976104736, + "epoch": 0.9148862853204687, + "grad_norm": 5.797268390655518, + "learning_rate": 6.587636221705082e-08, + "logits/chosen": -2.7919394969940186, + "logits/rejected": -2.7814671993255615, + "logps/chosen": -96.14761352539062, + "logps/rejected": -98.30670166015625, + "loss": 0.6632, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.3801385760307312, + "rewards/margins": 0.07896140217781067, + "rewards/rejected": -0.45910006761550903, "step": 5310 }, { - "epoch": 0.92, - "grad_norm": 28.551168644730765, - "learning_rate": 3.286685661443144e-07, - "logits/chosen": -1.4450080394744873, - "logits/rejected": -1.3684725761413574, - "logps/chosen": -212.12887573242188, - "logps/rejected": -286.6734924316406, - "loss": 0.5018, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.5459760427474976, - "rewards/margins": 0.8185034990310669, - "rewards/rejected": -2.3644795417785645, + "epoch": 0.9166092350103378, + "grad_norm": 6.824923038482666, + "learning_rate": 6.573371322886288e-08, + "logits/chosen": -2.7079017162323, + "logits/rejected": -2.6571567058563232, + "logps/chosen": -94.98011779785156, + "logps/rejected": -98.27288055419922, + "loss": 0.648, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.37426260113716125, + "rewards/margins": 0.10587947070598602, + "rewards/rejected": -0.48014211654663086, "step": 5320 }, { - "epoch": 0.92, - "grad_norm": 21.620111584157225, - "learning_rate": 3.2795460979747375e-07, - "logits/chosen": -1.3988605737686157, - "logits/rejected": -1.3602981567382812, - "logps/chosen": -198.0777130126953, - "logps/rejected": -299.89276123046875, - "loss": 0.4887, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.4722723960876465, - "rewards/margins": 0.9927380681037903, - "rewards/rejected": -2.465010643005371, + "epoch": 0.9183321847002067, + "grad_norm": 5.241591453552246, + "learning_rate": 6.559092195949476e-08, + "logits/chosen": -2.6528475284576416, + "logits/rejected": -2.654944896697998, + "logps/chosen": -88.13108825683594, + "logps/rejected": -101.67539978027344, + "loss": 0.6494, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3727738559246063, + "rewards/margins": 0.10972573608160019, + "rewards/rejected": -0.4824995994567871, "step": 5330 }, { - "epoch": 0.92, - "grad_norm": 25.423142113773764, - "learning_rate": 3.272399485010943e-07, - "logits/chosen": -1.431849479675293, - "logits/rejected": -1.3641244173049927, - "logps/chosen": -201.8954620361328, - "logps/rejected": -272.422607421875, - "loss": 0.5224, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.4554879665374756, - "rewards/margins": 0.7551096081733704, - "rewards/rejected": -2.210597515106201, + "epoch": 0.9200551343900758, + "grad_norm": 6.645832061767578, + "learning_rate": 6.544798970021886e-08, + "logits/chosen": -2.6900384426116943, + "logits/rejected": -2.6418492794036865, + "logps/chosen": -93.05720520019531, + "logps/rejected": -97.80923461914062, + "loss": 0.6524, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.3669526278972626, + "rewards/margins": 0.09738953411579132, + "rewards/rejected": -0.4643422067165375, "step": 5340 }, { - "epoch": 0.92, - "grad_norm": 21.415159819448192, - "learning_rate": 3.2652458871791326e-07, - "logits/chosen": -1.4087716341018677, - "logits/rejected": -1.361433982849121, - "logps/chosen": -191.4141082763672, - "logps/rejected": -257.26812744140625, - "loss": 0.5519, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3878710269927979, - "rewards/margins": 0.6652384996414185, - "rewards/rejected": -2.053109645843506, + "epoch": 0.9217780840799449, + "grad_norm": 6.164369583129883, + "learning_rate": 6.530491774358266e-08, + "logits/chosen": -2.642723560333252, + "logits/rejected": -2.6087820529937744, + "logps/chosen": -90.41754913330078, + "logps/rejected": -98.78402709960938, + "loss": 0.6562, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.377750962972641, + "rewards/margins": 0.09037534892559052, + "rewards/rejected": -0.4681262969970703, "step": 5350 }, { - "epoch": 0.92, - "grad_norm": 23.352557781202382, - "learning_rate": 3.2580853691698417e-07, - "logits/chosen": -1.5152844190597534, - "logits/rejected": -1.4675962924957275, - "logps/chosen": -186.24412536621094, - "logps/rejected": -270.291015625, - "loss": 0.5293, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3303719758987427, - "rewards/margins": 0.8078699111938477, - "rewards/rejected": -2.138241767883301, + "epoch": 0.923501033769814, + "grad_norm": 7.229325294494629, + "learning_rate": 6.516170738339683e-08, + "logits/chosen": -2.7518601417541504, + "logits/rejected": -2.7438347339630127, + "logps/chosen": -88.50611877441406, + "logps/rejected": -104.66204833984375, + "loss": 0.6399, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.35292014479637146, + "rewards/margins": 0.1288849264383316, + "rewards/rejected": -0.4818050265312195, "step": 5360 }, { - "epoch": 0.93, - "grad_norm": 27.08219440604054, - "learning_rate": 3.250917995736187e-07, - "logits/chosen": -1.4008272886276245, - "logits/rejected": -1.349577784538269, - "logps/chosen": -190.25677490234375, - "logps/rejected": -279.8938293457031, - "loss": 0.4707, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.383306622505188, - "rewards/margins": 0.8817272186279297, - "rewards/rejected": -2.2650339603424072, + "epoch": 0.9252239834596829, + "grad_norm": 7.459567546844482, + "learning_rate": 6.501835991472373e-08, + "logits/chosen": -2.6610798835754395, + "logits/rejected": -2.6450963020324707, + "logps/chosen": -87.69316101074219, + "logps/rejected": -96.68251037597656, + "loss": 0.6652, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.35768431425094604, + "rewards/margins": 0.07509956508874893, + "rewards/rejected": -0.43278390169143677, "step": 5370 }, { - "epoch": 0.93, - "grad_norm": 21.444506787758893, - "learning_rate": 3.2437438316932766e-07, - "logits/chosen": -1.4608103036880493, - "logits/rejected": -1.4044318199157715, - "logps/chosen": -212.57666015625, - "logps/rejected": -279.16912841796875, - "loss": 0.535, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.5518763065338135, - "rewards/margins": 0.7239239811897278, - "rewards/rejected": -2.2758002281188965, + "epoch": 0.926946933149552, + "grad_norm": 6.591744899749756, + "learning_rate": 6.487487663386553e-08, + "logits/chosen": -2.76351261138916, + "logits/rejected": -2.726323366165161, + "logps/chosen": -93.70985412597656, + "logps/rejected": -99.14210510253906, + "loss": 0.6465, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.3631172776222229, + "rewards/margins": 0.11200074106454849, + "rewards/rejected": -0.47511807084083557, "step": 5380 }, { - "epoch": 0.93, - "grad_norm": 22.582981599970143, - "learning_rate": 3.2365629419176294e-07, - "logits/chosen": -1.422620415687561, - "logits/rejected": -1.3589953184127808, - "logps/chosen": -210.6852569580078, - "logps/rejected": -277.79864501953125, - "loss": 0.5383, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.5212723016738892, - "rewards/margins": 0.7407953143119812, - "rewards/rejected": -2.2620673179626465, + "epoch": 0.9286698828394211, + "grad_norm": 6.302175521850586, + "learning_rate": 6.473125883835259e-08, + "logits/chosen": -2.6813721656799316, + "logits/rejected": -2.6422839164733887, + "logps/chosen": -98.21910858154297, + "logps/rejected": -98.19676208496094, + "loss": 0.6668, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.39651817083358765, + "rewards/margins": 0.06931675970554352, + "rewards/rejected": -0.46583491563796997, "step": 5390 }, { - "epoch": 0.93, - "grad_norm": 24.413482194523574, - "learning_rate": 3.229375391346585e-07, - "logits/chosen": -1.4233802556991577, - "logits/rejected": -1.3726685047149658, - "logps/chosen": -182.7910919189453, - "logps/rejected": -270.41156005859375, - "loss": 0.4865, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3170902729034424, - "rewards/margins": 0.8481825590133667, - "rewards/rejected": -2.1652729511260986, + "epoch": 0.9303928325292902, + "grad_norm": 7.194708824157715, + "learning_rate": 6.458750782693171e-08, + "logits/chosen": -2.6976680755615234, + "logits/rejected": -2.6807875633239746, + "logps/chosen": -88.2069091796875, + "logps/rejected": -101.64945983886719, + "loss": 0.6494, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.3710055947303772, + "rewards/margins": 0.10657094419002533, + "rewards/rejected": -0.4775765538215637, "step": 5400 }, { - "epoch": 0.93, - "eval_logits/chosen": -1.5443414449691772, - "eval_logits/rejected": -1.5196946859359741, - "eval_logps/chosen": -181.3465576171875, - "eval_logps/rejected": -217.53018188476562, - "eval_loss": 0.6221857070922852, - "eval_rewards/accuracies": 0.669842004776001, - "eval_rewards/chosen": -1.2264270782470703, - "eval_rewards/margins": 0.31730079650878906, - "eval_rewards/rejected": -1.543727993965149, - "eval_runtime": 357.1726, - "eval_samples_per_second": 12.05, - "eval_steps_per_second": 1.506, + "epoch": 0.9303928325292902, + "eval_logits/chosen": -2.788670778274536, + "eval_logits/rejected": -2.782686471939087, + "eval_logps/chosen": -88.10902404785156, + "eval_logps/rejected": -98.53675079345703, + "eval_loss": 0.6686705350875854, + "eval_rewards/accuracies": 0.6015334725379944, + "eval_rewards/chosen": -0.29397135972976685, + "eval_rewards/margins": 0.059594981372356415, + "eval_rewards/rejected": -0.35356634855270386, + "eval_runtime": 360.3657, + "eval_samples_per_second": 11.943, + "eval_steps_per_second": 1.493, "step": 5400 }, { - "epoch": 0.93, - "grad_norm": 33.90762588243634, - "learning_rate": 3.222181244977716e-07, - "logits/chosen": -1.4560267925262451, - "logits/rejected": -1.4195563793182373, - "logps/chosen": -192.35723876953125, - "logps/rejected": -252.6515350341797, - "loss": 0.5492, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.3573839664459229, - "rewards/margins": 0.6285881996154785, - "rewards/rejected": -1.9859724044799805, + "epoch": 0.9321157822191593, + "grad_norm": 7.650205612182617, + "learning_rate": 6.444362489955433e-08, + "logits/chosen": -2.709641933441162, + "logits/rejected": -2.6979405879974365, + "logps/chosen": -95.5630874633789, + "logps/rejected": -100.59437561035156, + "loss": 0.6632, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.38923701643943787, + "rewards/margins": 0.07583045214414597, + "rewards/rejected": -0.46506747603416443, "step": 5410 }, { - "epoch": 0.93, - "grad_norm": 25.43643547302594, - "learning_rate": 3.2149805678682415e-07, - "logits/chosen": -1.5208218097686768, - "logits/rejected": -1.4746044874191284, - "logps/chosen": -182.27589416503906, - "logps/rejected": -257.0049743652344, - "loss": 0.5247, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.2857364416122437, - "rewards/margins": 0.7472572326660156, - "rewards/rejected": -2.032993793487549, + "epoch": 0.9338387319090282, + "grad_norm": 7.145179748535156, + "learning_rate": 6.429961135736483e-08, + "logits/chosen": -2.7654125690460205, + "logits/rejected": -2.7453110218048096, + "logps/chosen": -91.00215911865234, + "logps/rejected": -99.94932556152344, + "loss": 0.6576, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.37283921241760254, + "rewards/margins": 0.08913250267505646, + "rewards/rejected": -0.4619717001914978, "step": 5420 }, { - "epoch": 0.94, - "grad_norm": 28.226137721023306, - "learning_rate": 3.207773425134441e-07, - "logits/chosen": -1.4794824123382568, - "logits/rejected": -1.4373642206192017, - "logps/chosen": -185.75901794433594, - "logps/rejected": -257.13983154296875, - "loss": 0.532, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3217830657958984, - "rewards/margins": 0.7321761846542358, - "rewards/rejected": -2.0539591312408447, + "epoch": 0.9355616815988973, + "grad_norm": 8.034782409667969, + "learning_rate": 6.415546850268881e-08, + "logits/chosen": -2.699660539627075, + "logits/rejected": -2.688311815261841, + "logps/chosen": -93.40458679199219, + "logps/rejected": -101.03926849365234, + "loss": 0.6553, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.39806491136550903, + "rewards/margins": 0.0948207750916481, + "rewards/rejected": -0.49288567900657654, "step": 5430 }, { - "epoch": 0.94, - "grad_norm": 21.150277719094746, - "learning_rate": 3.2005598819510586e-07, - "logits/chosen": -1.4646375179290771, - "logits/rejected": -1.4225250482559204, - "logps/chosen": -189.71932983398438, - "logps/rejected": -262.24505615234375, - "loss": 0.5265, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3352781534194946, - "rewards/margins": 0.7349643111228943, - "rewards/rejected": -2.070242404937744, + "epoch": 0.9372846312887664, + "grad_norm": 6.361593723297119, + "learning_rate": 6.401119763902118e-08, + "logits/chosen": -2.700340747833252, + "logits/rejected": -2.6840767860412598, + "logps/chosen": -93.75382232666016, + "logps/rejected": -105.26869201660156, + "loss": 0.6406, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.3755238950252533, + "rewards/margins": 0.12471933662891388, + "rewards/rejected": -0.5002433061599731, "step": 5440 }, { - "epoch": 0.94, - "grad_norm": 40.77880858071023, - "learning_rate": 3.193340003550722e-07, - "logits/chosen": -1.3812825679779053, - "logits/rejected": -1.338728427886963, - "logps/chosen": -192.5988006591797, - "logps/rejected": -267.5910949707031, - "loss": 0.525, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3704335689544678, - "rewards/margins": 0.7710081338882446, - "rewards/rejected": -2.141441822052002, + "epoch": 0.9390075809786355, + "grad_norm": 8.197189331054688, + "learning_rate": 6.386680007101444e-08, + "logits/chosen": -2.6286511421203613, + "logits/rejected": -2.618351936340332, + "logps/chosen": -95.17610168457031, + "logps/rejected": -103.09773254394531, + "loss": 0.6528, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.3959697186946869, + "rewards/margins": 0.1002323180437088, + "rewards/rejected": -0.4962020516395569, "step": 5450 }, { - "epoch": 0.94, - "grad_norm": 30.470434371984815, - "learning_rate": 3.186113855223348e-07, - "logits/chosen": -1.4694117307662964, - "logits/rejected": -1.4298455715179443, - "logps/chosen": -192.1426239013672, - "logps/rejected": -244.8093719482422, - "loss": 0.6017, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.3830455541610718, - "rewards/margins": 0.544926106929779, - "rewards/rejected": -1.9279718399047852, + "epoch": 0.9407305306685044, + "grad_norm": 7.160200595855713, + "learning_rate": 6.372227710446696e-08, + "logits/chosen": -2.70961594581604, + "logits/rejected": -2.6946840286254883, + "logps/chosen": -91.92683410644531, + "logps/rejected": -97.2591552734375, + "loss": 0.6656, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.38083431124687195, + "rewards/margins": 0.07164783775806427, + "rewards/rejected": -0.4524821639060974, "step": 5460 }, { - "epoch": 0.94, - "grad_norm": 19.744210255270794, - "learning_rate": 3.178881502315552e-07, - "logits/chosen": -1.469347357749939, - "logits/rejected": -1.4357693195343018, - "logps/chosen": -171.6509246826172, - "logps/rejected": -222.62744140625, - "loss": 0.5921, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.2030577659606934, - "rewards/margins": 0.5184643268585205, - "rewards/rejected": -1.7215220928192139, + "epoch": 0.9424534803583735, + "grad_norm": 7.070869445800781, + "learning_rate": 6.357763004631103e-08, + "logits/chosen": -2.66728138923645, + "logits/rejected": -2.6487419605255127, + "logps/chosen": -90.99283599853516, + "logps/rejected": -96.91069030761719, + "loss": 0.6711, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.3963419795036316, + "rewards/margins": 0.0679207444190979, + "rewards/rejected": -0.46426278352737427, "step": 5470 }, { - "epoch": 0.94, - "grad_norm": 21.30606288485756, - "learning_rate": 3.1716430102300573e-07, - "logits/chosen": -1.5191564559936523, - "logits/rejected": -1.4697355031967163, - "logps/chosen": -164.0375518798828, - "logps/rejected": -234.484619140625, - "loss": 0.5209, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.1111133098602295, - "rewards/margins": 0.7175213694572449, - "rewards/rejected": -1.8286346197128296, + "epoch": 0.9441764300482426, + "grad_norm": 5.866538047790527, + "learning_rate": 6.343286020460114e-08, + "logits/chosen": -2.695925235748291, + "logits/rejected": -2.6727564334869385, + "logps/chosen": -91.25939178466797, + "logps/rejected": -102.32554626464844, + "loss": 0.6423, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3831619322299957, + "rewards/margins": 0.1236589327454567, + "rewards/rejected": -0.506820797920227, "step": 5480 }, { - "epoch": 0.95, - "grad_norm": 19.015649280887946, - "learning_rate": 3.164398444425106e-07, - "logits/chosen": -1.4912570714950562, - "logits/rejected": -1.4432531595230103, - "logps/chosen": -171.32119750976562, - "logps/rejected": -226.35067749023438, - "loss": 0.5431, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.1578484773635864, - "rewards/margins": 0.5836814641952515, - "rewards/rejected": -1.7415298223495483, + "epoch": 0.9458993797381117, + "grad_norm": 7.357505798339844, + "learning_rate": 6.328796888850211e-08, + "logits/chosen": -2.6660616397857666, + "logits/rejected": -2.639159679412842, + "logps/chosen": -93.99574279785156, + "logps/rejected": -99.41559600830078, + "loss": 0.6594, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.38430607318878174, + "rewards/margins": 0.08772842586040497, + "rewards/rejected": -0.4720345139503479, "step": 5490 }, { - "epoch": 0.95, - "grad_norm": 25.985898470229376, - "learning_rate": 3.157147870413864e-07, - "logits/chosen": -1.5515010356903076, - "logits/rejected": -1.512731909751892, - "logps/chosen": -169.95750427246094, - "logps/rejected": -240.7258758544922, - "loss": 0.513, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.1616063117980957, - "rewards/margins": 0.6991836428642273, - "rewards/rejected": -1.8607898950576782, + "epoch": 0.9476223294279807, + "grad_norm": 7.893945217132568, + "learning_rate": 6.314295740827728e-08, + "logits/chosen": -2.7434821128845215, + "logits/rejected": -2.727250099182129, + "logps/chosen": -90.81407928466797, + "logps/rejected": -104.48640441894531, + "loss": 0.6412, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.37003234028816223, + "rewards/margins": 0.12817493081092834, + "rewards/rejected": -0.4982072412967682, "step": 5500 }, { - "epoch": 0.95, - "eval_logits/chosen": -1.5876429080963135, - "eval_logits/rejected": -1.5650734901428223, - "eval_logps/chosen": -172.41822814941406, - "eval_logps/rejected": -205.8068389892578, - "eval_loss": 0.6214230060577393, - "eval_rewards/accuracies": 0.6721654534339905, - "eval_rewards/chosen": -1.137143850326538, - "eval_rewards/margins": 0.289350688457489, - "eval_rewards/rejected": -1.4264944791793823, - "eval_runtime": 357.0862, - "eval_samples_per_second": 12.053, - "eval_steps_per_second": 1.507, + "epoch": 0.9476223294279807, + "eval_logits/chosen": -2.779411792755127, + "eval_logits/rejected": -2.773380756378174, + "eval_logps/chosen": -88.95333862304688, + "eval_logps/rejected": -99.52513122558594, + "eval_loss": 0.6681700944900513, + "eval_rewards/accuracies": 0.5996747016906738, + "eval_rewards/chosen": -0.3024143576622009, + "eval_rewards/margins": 0.061035752296447754, + "eval_rewards/rejected": -0.36345013976097107, + "eval_runtime": 359.9936, + "eval_samples_per_second": 11.956, + "eval_steps_per_second": 1.494, "step": 5500 }, { - "epoch": 0.95, - "grad_norm": 23.17137980170567, - "learning_rate": 3.1498913537638314e-07, - "logits/chosen": -1.471665620803833, - "logits/rejected": -1.4335734844207764, - "logps/chosen": -186.6552734375, - "logps/rejected": -242.85983276367188, - "loss": 0.5717, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3344801664352417, - "rewards/margins": 0.566752552986145, - "rewards/rejected": -1.9012327194213867, + "epoch": 0.9493452791178497, + "grad_norm": 6.8784027099609375, + "learning_rate": 6.299782707527664e-08, + "logits/chosen": -2.657125949859619, + "logits/rejected": -2.6420211791992188, + "logps/chosen": -93.97222900390625, + "logps/rejected": -100.09397888183594, + "loss": 0.6679, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.4074481427669525, + "rewards/margins": 0.0661727637052536, + "rewards/rejected": -0.47362083196640015, "step": 5510 }, { - "epoch": 0.95, - "grad_norm": 20.934842998393453, - "learning_rate": 3.142628960096246e-07, - "logits/chosen": -1.4280526638031006, - "logits/rejected": -1.3803586959838867, - "logps/chosen": -180.4522247314453, - "logps/rejected": -248.35739135742188, - "loss": 0.5239, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.246476411819458, - "rewards/margins": 0.7218848466873169, - "rewards/rejected": -1.9683609008789062, + "epoch": 0.9510682288077188, + "grad_norm": 7.177425384521484, + "learning_rate": 6.285257920192492e-08, + "logits/chosen": -2.6515066623687744, + "logits/rejected": -2.627837657928467, + "logps/chosen": -92.08159637451172, + "logps/rejected": -99.34222412109375, + "loss": 0.6451, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.36257150769233704, + "rewards/margins": 0.11540888249874115, + "rewards/rejected": -0.477980375289917, "step": 5520 }, { - "epoch": 0.95, - "grad_norm": 20.694293406108667, - "learning_rate": 3.135360755085493e-07, - "logits/chosen": -1.4679347276687622, - "logits/rejected": -1.4120423793792725, - "logps/chosen": -185.0923309326172, - "logps/rejected": -251.6171875, - "loss": 0.5145, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2520725727081299, - "rewards/margins": 0.7337436079978943, - "rewards/rejected": -1.9858160018920898, + "epoch": 0.9527911784975879, + "grad_norm": 7.7968549728393555, + "learning_rate": 6.270721510170987e-08, + "logits/chosen": -2.692481517791748, + "logits/rejected": -2.6599838733673096, + "logps/chosen": -98.7787857055664, + "logps/rejected": -100.84993743896484, + "loss": 0.6579, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.38872066140174866, + "rewards/margins": 0.08904504776000977, + "rewards/rejected": -0.4777657091617584, "step": 5530 }, { - "epoch": 0.95, - "grad_norm": 22.80776688821052, - "learning_rate": 3.12808680445851e-07, - "logits/chosen": -1.4971723556518555, - "logits/rejected": -1.477506399154663, - "logps/chosen": -170.65281677246094, - "logps/rejected": -243.3797149658203, - "loss": 0.5055, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.1831166744232178, - "rewards/margins": 0.683652400970459, - "rewards/rejected": -1.8667690753936768, + "epoch": 0.954514128187457, + "grad_norm": 7.713639736175537, + "learning_rate": 6.25617360891702e-08, + "logits/chosen": -2.675360679626465, + "logits/rejected": -2.6800215244293213, + "logps/chosen": -90.1320571899414, + "logps/rejected": -106.236083984375, + "loss": 0.6447, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.37784865498542786, + "rewards/margins": 0.1174989715218544, + "rewards/rejected": -0.4953475892543793, "step": 5540 }, { - "epoch": 0.96, - "grad_norm": 21.44049349168216, - "learning_rate": 3.1208071739941937e-07, - "logits/chosen": -1.3374189138412476, - "logits/rejected": -1.3035809993743896, - "logps/chosen": -188.96694946289062, - "logps/rejected": -238.453125, - "loss": 0.6158, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.3638770580291748, - "rewards/margins": 0.4801939129829407, - "rewards/rejected": -1.8440710306167603, + "epoch": 0.956237077877326, + "grad_norm": 6.391312122344971, + "learning_rate": 6.241614347988388e-08, + "logits/chosen": -2.556227922439575, + "logits/rejected": -2.541020154953003, + "logps/chosen": -92.57128143310547, + "logps/rejected": -100.6026840209961, + "loss": 0.6694, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.3995343744754791, + "rewards/margins": 0.0657266154885292, + "rewards/rejected": -0.46526098251342773, "step": 5550 }, { - "epoch": 0.96, - "grad_norm": 20.558896998020238, - "learning_rate": 3.113521929522802e-07, - "logits/chosen": -1.4649537801742554, - "logits/rejected": -1.4129371643066406, - "logps/chosen": -168.60670471191406, - "logps/rejected": -243.099365234375, - "loss": 0.5099, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.1673256158828735, - "rewards/margins": 0.7692097425460815, - "rewards/rejected": -1.9365352392196655, + "epoch": 0.957960027567195, + "grad_norm": 7.574127197265625, + "learning_rate": 6.227043859045603e-08, + "logits/chosen": -2.679574489593506, + "logits/rejected": -2.6547327041625977, + "logps/chosen": -89.28507995605469, + "logps/rejected": -96.2481918334961, + "loss": 0.6562, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.3739834725856781, + "rewards/margins": 0.09359666705131531, + "rewards/rejected": -0.4675801396369934, "step": 5560 }, { - "epoch": 0.96, - "grad_norm": 23.194806036236645, - "learning_rate": 3.10623113692536e-07, - "logits/chosen": -1.5026556253433228, - "logits/rejected": -1.4706186056137085, - "logps/chosen": -169.34744262695312, - "logps/rejected": -237.5296630859375, - "loss": 0.5547, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.1870052814483643, - "rewards/margins": 0.6190119385719299, - "rewards/rejected": -1.8060171604156494, + "epoch": 0.9596829772570641, + "grad_norm": 7.61766242980957, + "learning_rate": 6.212462273850721e-08, + "logits/chosen": -2.7123208045959473, + "logits/rejected": -2.7033305168151855, + "logps/chosen": -87.42466735839844, + "logps/rejected": -104.3492202758789, + "loss": 0.6509, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.36758172512054443, + "rewards/margins": 0.10654322057962418, + "rewards/rejected": -0.4741249680519104, "step": 5570 }, { - "epoch": 0.96, - "grad_norm": 17.077550864060385, - "learning_rate": 3.0989348621330695e-07, - "logits/chosen": -1.4042866230010986, - "logits/rejected": -1.3619954586029053, - "logps/chosen": -170.88133239746094, - "logps/rejected": -240.86929321289062, - "loss": 0.5208, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.1771609783172607, - "rewards/margins": 0.6939356923103333, - "rewards/rejected": -1.8710966110229492, + "epoch": 0.9614059269469332, + "grad_norm": 5.746629238128662, + "learning_rate": 6.197869724266139e-08, + "logits/chosen": -2.5936295986175537, + "logits/rejected": -2.5804390907287598, + "logps/chosen": -92.78849029541016, + "logps/rejected": -103.72259521484375, + "loss": 0.6507, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3961091935634613, + "rewards/margins": 0.10325653851032257, + "rewards/rejected": -0.49936574697494507, "step": 5580 }, { - "epoch": 0.96, - "grad_norm": 41.03513118380479, - "learning_rate": 3.091633171126704e-07, - "logits/chosen": -1.5033903121948242, - "logits/rejected": -1.4395246505737305, - "logps/chosen": -188.55575561523438, - "logps/rejected": -257.30426025390625, - "loss": 0.5208, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3172476291656494, - "rewards/margins": 0.7042714953422546, - "rewards/rejected": -2.021519184112549, + "epoch": 0.9631288766368022, + "grad_norm": 9.365887641906738, + "learning_rate": 6.183266342253406e-08, + "logits/chosen": -2.758596420288086, + "logits/rejected": -2.7179176807403564, + "logps/chosen": -95.80296325683594, + "logps/rejected": -103.4892578125, + "loss": 0.6583, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.389718234539032, + "rewards/margins": 0.09365256130695343, + "rewards/rejected": -0.483370840549469, "step": 5590 }, { - "epoch": 0.96, - "grad_norm": 33.09744249323354, - "learning_rate": 3.0843261299360164e-07, - "logits/chosen": -1.4485256671905518, - "logits/rejected": -1.4189153909683228, - "logps/chosen": -189.03610229492188, - "logps/rejected": -262.5235595703125, - "loss": 0.5474, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3653209209442139, - "rewards/margins": 0.6869848966598511, - "rewards/rejected": -2.0523059368133545, + "epoch": 0.9648518263266712, + "grad_norm": 7.137005805969238, + "learning_rate": 6.168652259872033e-08, + "logits/chosen": -2.7192625999450684, + "logits/rejected": -2.7171120643615723, + "logps/chosen": -91.826171875, + "logps/rejected": -106.61534118652344, + "loss": 0.6531, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.3932226002216339, + "rewards/margins": 0.09984948486089706, + "rewards/rejected": -0.4930720925331116, "step": 5600 }, { - "epoch": 0.96, - "eval_logits/chosen": -1.5346873998641968, - "eval_logits/rejected": -1.5108610391616821, - "eval_logps/chosen": -177.24856567382812, - "eval_logps/rejected": -212.6680450439453, - "eval_loss": 0.6200674772262573, - "eval_rewards/accuracies": 0.6689126491546631, - "eval_rewards/chosen": -1.185447096824646, - "eval_rewards/margins": 0.30965960025787354, - "eval_rewards/rejected": -1.49510657787323, - "eval_runtime": 357.0504, - "eval_samples_per_second": 12.054, - "eval_steps_per_second": 1.507, + "epoch": 0.9648518263266712, + "eval_logits/chosen": -2.774271011352539, + "eval_logits/rejected": -2.7682688236236572, + "eval_logps/chosen": -88.65845489501953, + "eval_logps/rejected": -99.27580261230469, + "eval_loss": 0.6679658889770508, + "eval_rewards/accuracies": 0.6045538783073425, + "eval_rewards/chosen": -0.2994656264781952, + "eval_rewards/margins": 0.06149120256304741, + "eval_rewards/rejected": -0.3609568178653717, + "eval_runtime": 360.2429, + "eval_samples_per_second": 11.947, + "eval_steps_per_second": 1.493, "step": 5600 }, { - "epoch": 0.97, - "grad_norm": 28.45755729486924, - "learning_rate": 3.077013804639144e-07, - "logits/chosen": -1.4699697494506836, - "logits/rejected": -1.4311306476593018, - "logps/chosen": -185.23690795898438, - "logps/rejected": -261.2013244628906, - "loss": 0.5124, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.328597068786621, - "rewards/margins": 0.7179481983184814, - "rewards/rejected": -2.0465452671051025, + "epoch": 0.9665747760165403, + "grad_norm": 6.414742469787598, + "learning_rate": 6.154027609278288e-08, + "logits/chosen": -2.728668451309204, + "logits/rejected": -2.712214231491089, + "logps/chosen": -90.20558166503906, + "logps/rejected": -106.26041412353516, + "loss": 0.6449, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.37805065512657166, + "rewards/margins": 0.11896820366382599, + "rewards/rejected": -0.4970189034938812, "step": 5610 }, { - "epoch": 0.97, - "grad_norm": 28.676584357098996, - "learning_rate": 3.069696261362008e-07, - "logits/chosen": -1.3878097534179688, - "logits/rejected": -1.3469122648239136, - "logps/chosen": -203.050537109375, - "logps/rejected": -258.06475830078125, - "loss": 0.5663, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.4804455041885376, - "rewards/margins": 0.5948622822761536, - "rewards/rejected": -2.075307607650757, + "epoch": 0.9682977257064094, + "grad_norm": 8.864455223083496, + "learning_rate": 6.139392522724017e-08, + "logits/chosen": -2.6541590690612793, + "logits/rejected": -2.633671998977661, + "logps/chosen": -96.14595031738281, + "logps/rejected": -100.43621063232422, + "loss": 0.6595, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.411262571811676, + "rewards/margins": 0.08785083144903183, + "rewards/rejected": -0.49911341071128845, "step": 5620 }, { - "epoch": 0.97, - "grad_norm": 31.893058353519617, - "learning_rate": 3.062373566277715e-07, - "logits/chosen": -1.441892385482788, - "logits/rejected": -1.3970489501953125, - "logps/chosen": -198.53103637695312, - "logps/rejected": -243.0982208251953, - "loss": 0.5993, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.4262107610702515, - "rewards/margins": 0.4929355978965759, - "rewards/rejected": -1.9191462993621826, + "epoch": 0.9700206753962785, + "grad_norm": 6.868548393249512, + "learning_rate": 6.12474713255543e-08, + "logits/chosen": -2.7163074016571045, + "logits/rejected": -2.688546657562256, + "logps/chosen": -95.85604858398438, + "logps/rejected": -97.7147216796875, + "loss": 0.6711, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3994835913181305, + "rewards/margins": 0.06611791998147964, + "rewards/rejected": -0.46560150384902954, "step": 5630 }, { - "epoch": 0.97, - "grad_norm": 32.04723618634895, - "learning_rate": 3.0550457856059596e-07, - "logits/chosen": -1.449190616607666, - "logits/rejected": -1.4080677032470703, - "logps/chosen": -168.21890258789062, - "logps/rejected": -238.8227081298828, - "loss": 0.5373, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.1427868604660034, - "rewards/margins": 0.6845923662185669, - "rewards/rejected": -1.8273794651031494, + "epoch": 0.9717436250861475, + "grad_norm": 7.378805160522461, + "learning_rate": 6.110091571211919e-08, + "logits/chosen": -2.6807336807250977, + "logits/rejected": -2.6671555042266846, + "logps/chosen": -90.3287124633789, + "logps/rejected": -102.43013763427734, + "loss": 0.6536, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.3638276755809784, + "rewards/margins": 0.09953577816486359, + "rewards/rejected": -0.4633634686470032, "step": 5640 }, { - "epoch": 0.97, - "grad_norm": 18.712266282878826, - "learning_rate": 3.047712985612428e-07, - "logits/chosen": -1.3978092670440674, - "logits/rejected": -1.3562982082366943, - "logps/chosen": -176.45343017578125, - "logps/rejected": -245.86154174804688, - "loss": 0.5429, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.2446025609970093, - "rewards/margins": 0.6920903921127319, - "rewards/rejected": -1.9366929531097412, + "epoch": 0.9734665747760165, + "grad_norm": 6.890263557434082, + "learning_rate": 6.095425971224856e-08, + "logits/chosen": -2.6029608249664307, + "logits/rejected": -2.595309019088745, + "logps/chosen": -89.53279876708984, + "logps/rejected": -102.14915466308594, + "loss": 0.6472, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.3752228021621704, + "rewards/margins": 0.12423346191644669, + "rewards/rejected": -0.4994562566280365, "step": 5650 }, { - "epoch": 0.98, - "grad_norm": 20.19600716165965, - "learning_rate": 3.040375232608194e-07, - "logits/chosen": -1.3913816213607788, - "logits/rejected": -1.3549675941467285, - "logps/chosen": -181.96463012695312, - "logps/rejected": -260.1955261230469, - "loss": 0.5016, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3012346029281616, - "rewards/margins": 0.7908525466918945, - "rewards/rejected": -2.0920872688293457, + "epoch": 0.9751895244658856, + "grad_norm": 7.2957963943481445, + "learning_rate": 6.080750465216388e-08, + "logits/chosen": -2.6116573810577393, + "logits/rejected": -2.6080095767974854, + "logps/chosen": -90.91778564453125, + "logps/rejected": -102.00828552246094, + "loss": 0.6466, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.3905393183231354, + "rewards/margins": 0.11953528970479965, + "rewards/rejected": -0.5100746154785156, "step": 5660 }, { - "epoch": 0.98, - "grad_norm": 21.953309399233106, - "learning_rate": 3.0330325929491245e-07, - "logits/chosen": -1.3647847175598145, - "logits/rejected": -1.3191179037094116, - "logps/chosen": -187.47390747070312, - "logps/rejected": -257.4412841796875, - "loss": 0.5044, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3347058296203613, - "rewards/margins": 0.7116448879241943, - "rewards/rejected": -2.0463504791259766, + "epoch": 0.9769124741557547, + "grad_norm": 7.507359981536865, + "learning_rate": 6.06606518589825e-08, + "logits/chosen": -2.648078680038452, + "logits/rejected": -2.6287286281585693, + "logps/chosen": -89.63655090332031, + "logps/rejected": -99.84394836425781, + "loss": 0.6459, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3559412956237793, + "rewards/margins": 0.11415255069732666, + "rewards/rejected": -0.47009381651878357, "step": 5670 }, { - "epoch": 0.98, - "grad_norm": 40.5185950103975, - "learning_rate": 3.0256851330352753e-07, - "logits/chosen": -1.3821312189102173, - "logits/rejected": -1.322158932685852, - "logps/chosen": -210.1196746826172, - "logps/rejected": -284.6582946777344, - "loss": 0.5185, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.5572584867477417, - "rewards/margins": 0.7730095982551575, - "rewards/rejected": -2.330268383026123, + "epoch": 0.9786354238456237, + "grad_norm": 7.136532306671143, + "learning_rate": 6.05137026607055e-08, + "logits/chosen": -2.692141532897949, + "logits/rejected": -2.654029369354248, + "logps/chosen": -94.61227416992188, + "logps/rejected": -103.2687759399414, + "loss": 0.6459, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.40233784914016724, + "rewards/margins": 0.1138642430305481, + "rewards/rejected": -0.5162020921707153, "step": 5680 }, { - "epoch": 0.98, - "grad_norm": 17.952104758420326, - "learning_rate": 3.0183329193102894e-07, - "logits/chosen": -1.4393055438995361, - "logits/rejected": -1.3816049098968506, - "logps/chosen": -200.3620147705078, - "logps/rejected": -273.0220642089844, - "loss": 0.5014, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.423686146736145, - "rewards/margins": 0.7898932695388794, - "rewards/rejected": -2.2135794162750244, + "epoch": 0.9803583735354927, + "grad_norm": 7.768203258514404, + "learning_rate": 6.036665838620579e-08, + "logits/chosen": -2.721024990081787, + "logits/rejected": -2.691474676132202, + "logps/chosen": -96.23184967041016, + "logps/rejected": -101.6405258178711, + "loss": 0.6441, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.38233327865600586, + "rewards/margins": 0.11741151660680771, + "rewards/rejected": -0.49974480271339417, "step": 5690 }, { - "epoch": 0.98, - "grad_norm": 33.97947710759319, - "learning_rate": 3.010976018260805e-07, - "logits/chosen": -1.289398431777954, - "logits/rejected": -1.246914267539978, - "logps/chosen": -189.9264678955078, - "logps/rejected": -259.13726806640625, - "loss": 0.5291, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.3215687274932861, - "rewards/margins": 0.7366959452629089, - "rewards/rejected": -2.05826473236084, + "epoch": 0.9820813232253618, + "grad_norm": 8.61497974395752, + "learning_rate": 6.021952036521611e-08, + "logits/chosen": -2.5390217304229736, + "logits/rejected": -2.5234203338623047, + "logps/chosen": -97.38616180419922, + "logps/rejected": -103.48213958740234, + "loss": 0.652, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.3960030674934387, + "rewards/margins": 0.10574601590633392, + "rewards/rejected": -0.5017490983009338, "step": 5700 }, { - "epoch": 0.98, - "eval_logits/chosen": -1.5449095964431763, - "eval_logits/rejected": -1.5208981037139893, - "eval_logps/chosen": -175.29298400878906, - "eval_logps/rejected": -211.0419921875, - "eval_loss": 0.6191110610961914, - "eval_rewards/accuracies": 0.6696096658706665, - "eval_rewards/chosen": -1.165891408920288, - "eval_rewards/margins": 0.3129545748233795, - "eval_rewards/rejected": -1.4788459539413452, - "eval_runtime": 356.9489, - "eval_samples_per_second": 12.058, - "eval_steps_per_second": 1.507, + "epoch": 0.9820813232253618, + "eval_logits/chosen": -2.7663583755493164, + "eval_logits/rejected": -2.760427951812744, + "eval_logps/chosen": -89.92341613769531, + "eval_logps/rejected": -100.78007507324219, + "eval_loss": 0.6671208739280701, + "eval_rewards/accuracies": 0.6040892004966736, + "eval_rewards/chosen": -0.3121151626110077, + "eval_rewards/margins": 0.06388425081968307, + "eval_rewards/rejected": -0.37599942088127136, + "eval_runtime": 360.2055, + "eval_samples_per_second": 11.949, + "eval_steps_per_second": 1.494, "step": 5700 }, { - "epoch": 0.98, - "grad_norm": 24.082058815741895, - "learning_rate": 3.003614496415843e-07, - "logits/chosen": -1.501319169998169, - "logits/rejected": -1.4549624919891357, - "logps/chosen": -180.39666748046875, - "logps/rejected": -245.5010223388672, - "loss": 0.5398, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.2527682781219482, - "rewards/margins": 0.6765493154525757, - "rewards/rejected": -1.9293174743652344, + "epoch": 0.9838042729152309, + "grad_norm": 7.650971412658691, + "learning_rate": 6.007228992831685e-08, + "logits/chosen": -2.7118613719940186, + "logits/rejected": -2.686074733734131, + "logps/chosen": -95.1128921508789, + "logps/rejected": -105.84073638916016, + "loss": 0.6409, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.399996817111969, + "rewards/margins": 0.1325821727514267, + "rewards/rejected": -0.5325790643692017, "step": 5710 }, { - "epoch": 0.99, - "grad_norm": 18.19243173514077, - "learning_rate": 2.996248420346211e-07, - "logits/chosen": -1.4630482196807861, - "logits/rejected": -1.40970778465271, - "logps/chosen": -167.19842529296875, - "logps/rejected": -250.3085479736328, - "loss": 0.467, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.1519994735717773, - "rewards/margins": 0.8528478741645813, - "rewards/rejected": -2.004847288131714, + "epoch": 0.9855272226051, + "grad_norm": 7.109969615936279, + "learning_rate": 5.992496840692423e-08, + "logits/chosen": -2.682396411895752, + "logits/rejected": -2.6602180004119873, + "logps/chosen": -91.23838806152344, + "logps/rejected": -103.3809585571289, + "loss": 0.6328, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39226388931274414, + "rewards/margins": 0.1431894153356552, + "rewards/rejected": -0.5354533195495605, "step": 5720 }, { - "epoch": 0.99, - "grad_norm": 33.44207769272372, - "learning_rate": 2.988877856663905e-07, - "logits/chosen": -1.5095856189727783, - "logits/rejected": -1.4771087169647217, - "logps/chosen": -190.52589416503906, - "logps/rejected": -250.38720703125, - "loss": 0.582, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.3880624771118164, - "rewards/margins": 0.597137987613678, - "rewards/rejected": -1.9852005243301392, + "epoch": 0.987250172294969, + "grad_norm": 6.852600574493408, + "learning_rate": 5.97775571332781e-08, + "logits/chosen": -2.7351083755493164, + "logits/rejected": -2.725668430328369, + "logps/chosen": -90.90850830078125, + "logps/rejected": -101.49790954589844, + "loss": 0.6515, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3918391466140747, + "rewards/margins": 0.10430805385112762, + "rewards/rejected": -0.49614715576171875, "step": 5730 }, { - "epoch": 0.99, - "grad_norm": 23.692568889129813, - "learning_rate": 2.9815028720214985e-07, - "logits/chosen": -1.4424539804458618, - "logits/rejected": -1.3772521018981934, - "logps/chosen": -192.9495849609375, - "logps/rejected": -276.883056640625, - "loss": 0.4764, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3374172449111938, - "rewards/margins": 0.8872700929641724, - "rewards/rejected": -2.224687099456787, + "epoch": 0.988973121984838, + "grad_norm": 6.903168201446533, + "learning_rate": 5.963005744042997e-08, + "logits/chosen": -2.6414573192596436, + "logits/rejected": -2.61142635345459, + "logps/chosen": -102.45942687988281, + "logps/rejected": -108.10794830322266, + "loss": 0.6517, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.432508647441864, + "rewards/margins": 0.10413438081741333, + "rewards/rejected": -0.5366430878639221, "step": 5740 }, { - "epoch": 0.99, - "grad_norm": 17.064609183815648, - "learning_rate": 2.974123533111545e-07, - "logits/chosen": -1.580055594444275, - "logits/rejected": -1.540281057357788, - "logps/chosen": -191.0297393798828, - "logps/rejected": -233.5084991455078, - "loss": 0.5938, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3438113927841187, - "rewards/margins": 0.48881006240844727, - "rewards/rejected": -1.8326218128204346, + "epoch": 0.9906960716747071, + "grad_norm": 7.210351467132568, + "learning_rate": 5.94824706622309e-08, + "logits/chosen": -2.7761495113372803, + "logits/rejected": -2.7487683296203613, + "logps/chosen": -96.80901336669922, + "logps/rejected": -99.75078582763672, + "loss": 0.6561, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.40144777297973633, + "rewards/margins": 0.09334973990917206, + "rewards/rejected": -0.4947974681854248, "step": 5750 }, { - "epoch": 0.99, - "grad_norm": 17.358709255280868, - "learning_rate": 2.9667399066659756e-07, - "logits/chosen": -1.5095783472061157, - "logits/rejected": -1.4547650814056396, - "logps/chosen": -172.166015625, - "logps/rejected": -241.0043182373047, - "loss": 0.511, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.1578861474990845, - "rewards/margins": 0.6968024373054504, - "rewards/rejected": -1.8546886444091797, + "epoch": 0.9924190213645762, + "grad_norm": 7.566880702972412, + "learning_rate": 5.933479813331951e-08, + "logits/chosen": -2.6539306640625, + "logits/rejected": -2.628377914428711, + "logps/chosen": -97.50645446777344, + "logps/rejected": -107.49908447265625, + "loss": 0.6541, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.410966694355011, + "rewards/margins": 0.10833420604467392, + "rewards/rejected": -0.5193008184432983, "step": 5760 }, { - "epoch": 0.99, - "grad_norm": 20.70958250669779, - "learning_rate": 2.959352059455492e-07, - "logits/chosen": -1.4510507583618164, - "logits/rejected": -1.3988720178604126, - "logps/chosen": -165.02476501464844, - "logps/rejected": -238.8711395263672, - "loss": 0.5043, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.1318988800048828, - "rewards/margins": 0.7165284156799316, - "rewards/rejected": -1.848427176475525, + "epoch": 0.9941419710544452, + "grad_norm": 7.751824855804443, + "learning_rate": 5.918704118910984e-08, + "logits/chosen": -2.6133615970611572, + "logits/rejected": -2.594430923461914, + "logps/chosen": -88.68501281738281, + "logps/rejected": -102.99845886230469, + "loss": 0.643, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3683035373687744, + "rewards/margins": 0.12121020257472992, + "rewards/rejected": -0.48951372504234314, "step": 5770 }, { - "epoch": 1.0, - "grad_norm": 26.960927481343827, - "learning_rate": 2.9519600582889655e-07, - "logits/chosen": -1.4297640323638916, - "logits/rejected": -1.378154993057251, - "logps/chosen": -178.6942138671875, - "logps/rejected": -261.24639892578125, - "loss": 0.4882, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.2625802755355835, - "rewards/margins": 0.8104287385940552, - "rewards/rejected": -2.0730090141296387, + "epoch": 0.9958649207443143, + "grad_norm": 7.475341320037842, + "learning_rate": 5.903920116577931e-08, + "logits/chosen": -2.6427292823791504, + "logits/rejected": -2.623394727706909, + "logps/chosen": -91.09282684326172, + "logps/rejected": -105.16011047363281, + "loss": 0.6413, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.38629263639450073, + "rewards/margins": 0.1256280243396759, + "rewards/rejected": -0.511920690536499, "step": 5780 }, { - "epoch": 1.0, - "grad_norm": 19.048196919185166, - "learning_rate": 2.944563970012831e-07, - "logits/chosen": -1.2947901487350464, - "logits/rejected": -1.236037254333496, - "logps/chosen": -189.5282440185547, - "logps/rejected": -269.74005126953125, - "loss": 0.4999, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.380592703819275, - "rewards/margins": 0.7776483297348022, - "rewards/rejected": -2.158240795135498, + "epoch": 0.9975878704341833, + "grad_norm": 6.766174793243408, + "learning_rate": 5.889127940025662e-08, + "logits/chosen": -2.583322048187256, + "logits/rejected": -2.55000638961792, + "logps/chosen": -87.63530731201172, + "logps/rejected": -101.72434997558594, + "loss": 0.6448, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3616120517253876, + "rewards/margins": 0.11629859358072281, + "rewards/rejected": -0.4779106080532074, "step": 5790 }, { - "epoch": 1.0, - "grad_norm": 24.450393947587344, - "learning_rate": 2.937163861510486e-07, - "logits/chosen": -1.3695513010025024, - "logits/rejected": -1.3113354444503784, - "logps/chosen": -209.8120880126953, - "logps/rejected": -304.4366149902344, - "loss": 0.496, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.5908751487731934, - "rewards/margins": 0.9317790865898132, - "rewards/rejected": -2.5226542949676514, + "epoch": 0.9993108201240524, + "grad_norm": 7.446423530578613, + "learning_rate": 5.874327723020972e-08, + "logits/chosen": -2.668041467666626, + "logits/rejected": -2.6409144401550293, + "logps/chosen": -89.25969696044922, + "logps/rejected": -104.5272445678711, + "loss": 0.6355, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.3850882649421692, + "rewards/margins": 0.1383116990327835, + "rewards/rejected": -0.5233998894691467, "step": 5800 }, { - "epoch": 1.0, - "eval_logits/chosen": -1.4435439109802246, - "eval_logits/rejected": -1.41628098487854, - "eval_logps/chosen": -210.4264678955078, - "eval_logps/rejected": -253.47520446777344, - "eval_loss": 0.6148089170455933, - "eval_rewards/accuracies": 0.6679832935333252, - "eval_rewards/chosen": -1.5172260999679565, - "eval_rewards/margins": 0.3859521150588989, - "eval_rewards/rejected": -1.903178334236145, - "eval_runtime": 356.8738, - "eval_samples_per_second": 12.06, - "eval_steps_per_second": 1.508, + "epoch": 0.9993108201240524, + "eval_logits/chosen": -2.754894495010376, + "eval_logits/rejected": -2.7488529682159424, + "eval_logps/chosen": -91.43659210205078, + "eval_logps/rejected": -102.54092407226562, + "eval_loss": 0.6662805080413818, + "eval_rewards/accuracies": 0.6057156324386597, + "eval_rewards/chosen": -0.3272469639778137, + "eval_rewards/margins": 0.06636104732751846, + "eval_rewards/rejected": -0.3936080038547516, + "eval_runtime": 359.9403, + "eval_samples_per_second": 11.958, + "eval_steps_per_second": 1.495, "step": 5800 }, { - "epoch": 1.0, - "grad_norm": 18.008006484022793, - "learning_rate": 2.9297597997016797e-07, - "logits/chosen": -1.4246017932891846, - "logits/rejected": -1.3686187267303467, - "logps/chosen": -203.05320739746094, - "logps/rejected": -305.4484558105469, - "loss": 0.4592, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.490220308303833, - "rewards/margins": 1.0287506580352783, - "rewards/rejected": -2.5189712047576904, + "epoch": 1.0010337698139213, + "grad_norm": 7.47341775894165, + "learning_rate": 5.85951959940336e-08, + "logits/chosen": -2.723707675933838, + "logits/rejected": -2.708026170730591, + "logps/chosen": -94.59834289550781, + "logps/rejected": -106.0676040649414, + "loss": 0.644, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.4052727222442627, + "rewards/margins": 0.11942293494939804, + "rewards/rejected": -0.524695634841919, "step": 5810 }, { - "epoch": 1.0, - "grad_norm": 24.397505186303498, - "learning_rate": 2.922351851541915e-07, - "logits/chosen": -1.4257746934890747, - "logits/rejected": -1.3525466918945312, - "logps/chosen": -195.74807739257812, - "logps/rejected": -306.99359130859375, - "loss": 0.3977, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -1.4273375272750854, - "rewards/margins": 1.1495184898376465, - "rewards/rejected": -2.5768561363220215, + "epoch": 1.0027567195037905, + "grad_norm": 6.446918487548828, + "learning_rate": 5.8447037030838295e-08, + "logits/chosen": -2.7201502323150635, + "logits/rejected": -2.6872313022613525, + "logps/chosen": -93.95738220214844, + "logps/rejected": -105.8100357055664, + "loss": 0.6283, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.4093136787414551, + "rewards/margins": 0.15548773109912872, + "rewards/rejected": -0.5648013949394226, "step": 5820 }, { - "epoch": 1.0, - "grad_norm": 34.799268956724895, - "learning_rate": 2.914940084021836e-07, - "logits/chosen": -1.297031283378601, - "logits/rejected": -1.231827974319458, - "logps/chosen": -198.88839721679688, - "logps/rejected": -312.55792236328125, - "loss": 0.442, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.4539258480072021, - "rewards/margins": 1.143164873123169, - "rewards/rejected": -2.59709095954895, + "epoch": 1.0044796691936595, + "grad_norm": 7.902465343475342, + "learning_rate": 5.829880168043672e-08, + "logits/chosen": -2.610410213470459, + "logits/rejected": -2.580620765686035, + "logps/chosen": -93.01567077636719, + "logps/rejected": -104.25047302246094, + "loss": 0.6459, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.39519113302230835, + "rewards/margins": 0.11866496503353119, + "rewards/rejected": -0.513856053352356, "step": 5830 }, { - "epoch": 1.01, - "grad_norm": 16.077132102945612, - "learning_rate": 2.907524564166628e-07, - "logits/chosen": -1.3520994186401367, - "logits/rejected": -1.2968170642852783, - "logps/chosen": -193.8484344482422, - "logps/rejected": -302.53302001953125, - "loss": 0.4416, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.414527177810669, - "rewards/margins": 1.065403699874878, - "rewards/rejected": -2.4799306392669678, + "epoch": 1.0062026188835287, + "grad_norm": 6.444275379180908, + "learning_rate": 5.8150491283332556e-08, + "logits/chosen": -2.639526128768921, + "logits/rejected": -2.6196341514587402, + "logps/chosen": -91.62089538574219, + "logps/rejected": -108.6521987915039, + "loss": 0.6335, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.392328679561615, + "rewards/margins": 0.14849011600017548, + "rewards/rejected": -0.5408187508583069, "step": 5840 }, { - "epoch": 1.01, - "grad_norm": 27.25246319010426, - "learning_rate": 2.9001053590354076e-07, - "logits/chosen": -1.410636067390442, - "logits/rejected": -1.3458845615386963, - "logps/chosen": -188.6303253173828, - "logps/rejected": -307.2320861816406, - "loss": 0.3943, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -1.3775842189788818, - "rewards/margins": 1.1499364376068115, - "rewards/rejected": -2.5275206565856934, + "epoch": 1.0079255685733977, + "grad_norm": 7.668493747711182, + "learning_rate": 5.800210718070815e-08, + "logits/chosen": -2.698667287826538, + "logits/rejected": -2.678873300552368, + "logps/chosen": -90.6561050415039, + "logps/rejected": -110.56428527832031, + "loss": 0.6251, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.3974132537841797, + "rewards/margins": 0.16324903070926666, + "rewards/rejected": -0.5606622695922852, "step": 5850 }, { - "epoch": 1.01, - "grad_norm": 18.14664982804058, - "learning_rate": 2.8926825357206176e-07, - "logits/chosen": -1.2408941984176636, - "logits/rejected": -1.1875782012939453, - "logps/chosen": -207.2787628173828, - "logps/rejected": -331.25421142578125, - "loss": 0.4206, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.5550321340560913, - "rewards/margins": 1.2251521348953247, - "rewards/rejected": -2.780184268951416, + "epoch": 1.0096485182632666, + "grad_norm": 6.586671352386475, + "learning_rate": 5.785365071441235e-08, + "logits/chosen": -2.565764904022217, + "logits/rejected": -2.555816650390625, + "logps/chosen": -93.14619445800781, + "logps/rejected": -110.57508850097656, + "loss": 0.6331, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.41372841596603394, + "rewards/margins": 0.15966841578483582, + "rewards/rejected": -0.5733968019485474, "step": 5860 }, { - "epoch": 1.01, - "grad_norm": 33.09906725415787, - "learning_rate": 2.885256161347421e-07, - "logits/chosen": -1.236800193786621, - "logits/rejected": -1.174392580986023, - "logps/chosen": -234.08859252929688, - "logps/rejected": -356.24066162109375, - "loss": 0.3899, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.7791268825531006, - "rewards/margins": 1.2331987619400024, - "rewards/rejected": -3.0123260021209717, + "epoch": 1.0113714679531358, + "grad_norm": 9.994606018066406, + "learning_rate": 5.7705123226948425e-08, + "logits/chosen": -2.6034083366394043, + "logits/rejected": -2.583834648132324, + "logps/chosen": -99.65670013427734, + "logps/rejected": -112.95194244384766, + "loss": 0.635, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.43475866317749023, + "rewards/margins": 0.14449259638786316, + "rewards/rejected": -0.5792513489723206, "step": 5870 }, { - "epoch": 1.01, - "grad_norm": 29.54869321861051, - "learning_rate": 2.877826303073094e-07, - "logits/chosen": -1.2946747541427612, - "logits/rejected": -1.2476125955581665, - "logps/chosen": -216.5059814453125, - "logps/rejected": -321.3334655761719, - "loss": 0.4526, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.6175800561904907, - "rewards/margins": 1.0643819570541382, - "rewards/rejected": -2.681962251663208, + "epoch": 1.0130944176430048, + "grad_norm": 8.589122772216797, + "learning_rate": 5.7556526061461874e-08, + "logits/chosen": -2.670046329498291, + "logits/rejected": -2.6585898399353027, + "logps/chosen": -94.46307373046875, + "logps/rejected": -103.35002136230469, + "loss": 0.6515, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.39721113443374634, + "rewards/margins": 0.10498642921447754, + "rewards/rejected": -0.5021975636482239, "step": 5880 }, { - "epoch": 1.01, - "grad_norm": 23.87246950772011, - "learning_rate": 2.870393028086416e-07, - "logits/chosen": -1.3654061555862427, - "logits/rejected": -1.3092124462127686, - "logps/chosen": -201.2548828125, - "logps/rejected": -316.09326171875, - "loss": 0.4342, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.461004614830017, - "rewards/margins": 1.118170142173767, - "rewards/rejected": -2.579174518585205, + "epoch": 1.014817367332874, + "grad_norm": 8.38769245147705, + "learning_rate": 5.740786056172833e-08, + "logits/chosen": -2.6955230236053467, + "logits/rejected": -2.681936502456665, + "logps/chosen": -96.41594696044922, + "logps/rejected": -111.12618255615234, + "loss": 0.6465, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.41250571608543396, + "rewards/margins": 0.11666438728570938, + "rewards/rejected": -0.529170036315918, "step": 5890 }, { - "epoch": 1.02, - "grad_norm": 22.072237513064117, - "learning_rate": 2.8629564036070663e-07, - "logits/chosen": -1.2765244245529175, - "logits/rejected": -1.2124745845794678, - "logps/chosen": -197.97320556640625, - "logps/rejected": -324.934326171875, - "loss": 0.3739, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.4230375289916992, - "rewards/margins": 1.2795560359954834, - "rewards/rejected": -2.7025935649871826, + "epoch": 1.016540317022743, + "grad_norm": 7.229883193969727, + "learning_rate": 5.7259128072141324e-08, + "logits/chosen": -2.609078884124756, + "logits/rejected": -2.5904786586761475, + "logps/chosen": -98.62474060058594, + "logps/rejected": -111.3572769165039, + "loss": 0.6362, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.42958712577819824, + "rewards/margins": 0.13729210197925568, + "rewards/rejected": -0.5668792128562927, "step": 5900 }, { - "epoch": 1.02, - "eval_logits/chosen": -1.371607780456543, - "eval_logits/rejected": -1.3429399728775024, - "eval_logps/chosen": -213.2480010986328, - "eval_logps/rejected": -259.2733459472656, - "eval_loss": 0.621561586856842, - "eval_rewards/accuracies": 0.6626393795013428, - "eval_rewards/chosen": -1.5454415082931519, - "eval_rewards/margins": 0.4157179594039917, - "eval_rewards/rejected": -1.961159348487854, - "eval_runtime": 356.8605, - "eval_samples_per_second": 12.061, - "eval_steps_per_second": 1.508, + "epoch": 1.016540317022743, + "eval_logits/chosen": -2.7389726638793945, + "eval_logits/rejected": -2.7328853607177734, + "eval_logps/chosen": -93.74750518798828, + "eval_logps/rejected": -105.1657943725586, + "eval_loss": 0.6653571724891663, + "eval_rewards/accuracies": 0.6043215394020081, + "eval_rewards/chosen": -0.3503560423851013, + "eval_rewards/margins": 0.06950072199106216, + "eval_rewards/rejected": -0.4198567569255829, + "eval_runtime": 360.5595, + "eval_samples_per_second": 11.937, + "eval_steps_per_second": 1.492, "step": 5900 }, { - "epoch": 1.02, - "grad_norm": 27.59715394921913, - "learning_rate": 2.855516496885011e-07, - "logits/chosen": -1.2443146705627441, - "logits/rejected": -1.2075556516647339, - "logps/chosen": -208.70321655273438, - "logps/rejected": -310.6231689453125, - "loss": 0.4883, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.5616544485092163, - "rewards/margins": 0.9676151275634766, - "rewards/rejected": -2.5292694568634033, + "epoch": 1.018263266712612, + "grad_norm": 8.26405143737793, + "learning_rate": 5.7110329937700216e-08, + "logits/chosen": -2.6024017333984375, + "logits/rejected": -2.6053740978240967, + "logps/chosen": -94.26959228515625, + "logps/rejected": -109.2395248413086, + "loss": 0.6541, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41757115721702576, + "rewards/margins": 0.09765584766864777, + "rewards/rejected": -0.5152269601821899, "step": 5910 }, { - "epoch": 1.02, - "grad_norm": 24.263714171924033, - "learning_rate": 2.848073375199901e-07, - "logits/chosen": -1.2384252548217773, - "logits/rejected": -1.181979775428772, - "logps/chosen": -219.75973510742188, - "logps/rejected": -325.8696594238281, - "loss": 0.4625, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.6830384731292725, - "rewards/margins": 1.0408580303192139, - "rewards/rejected": -2.7238965034484863, + "epoch": 1.019986216402481, + "grad_norm": 7.7567667961120605, + "learning_rate": 5.696146750399802e-08, + "logits/chosen": -2.6418375968933105, + "logits/rejected": -2.6241607666015625, + "logps/chosen": -94.14632415771484, + "logps/rejected": -106.68177795410156, + "loss": 0.6513, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4266262948513031, + "rewards/margins": 0.10511897504329681, + "rewards/rejected": -0.5317453145980835, "step": 5920 }, { - "epoch": 1.02, - "grad_norm": 20.723910025082144, - "learning_rate": 2.8406271058604574e-07, - "logits/chosen": -1.3165338039398193, - "logits/rejected": -1.2699096202850342, - "logps/chosen": -209.8754425048828, - "logps/rejected": -316.329833984375, - "loss": 0.4768, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.5963435173034668, - "rewards/margins": 1.05552077293396, - "rewards/rejected": -2.6518642902374268, + "epoch": 1.02170916609235, + "grad_norm": 6.675997734069824, + "learning_rate": 5.681254211720915e-08, + "logits/chosen": -2.659510374069214, + "logits/rejected": -2.6527857780456543, + "logps/chosen": -97.44358825683594, + "logps/rejected": -108.7308120727539, + "loss": 0.6548, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.47215375304222107, + "rewards/margins": 0.10348248481750488, + "rewards/rejected": -0.5756362676620483, "step": 5930 }, { - "epoch": 1.02, - "grad_norm": 35.79174059848847, - "learning_rate": 2.833177756203868e-07, - "logits/chosen": -1.3231611251831055, - "logits/rejected": -1.2533804178237915, - "logps/chosen": -185.56277465820312, - "logps/rejected": -296.2825622558594, - "loss": 0.4249, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.3131976127624512, - "rewards/margins": 1.1194097995758057, - "rewards/rejected": -2.4326071739196777, + "epoch": 1.0234321157822193, + "grad_norm": 7.2116899490356445, + "learning_rate": 5.6663555124077354e-08, + "logits/chosen": -2.6509711742401123, + "logits/rejected": -2.613877296447754, + "logps/chosen": -95.33317565917969, + "logps/rejected": -108.7905502319336, + "loss": 0.6332, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.41096001863479614, + "rewards/margins": 0.1462816298007965, + "rewards/rejected": -0.5572416186332703, "step": 5940 }, { - "epoch": 1.03, - "grad_norm": 17.76388707818442, - "learning_rate": 2.8257253935951754e-07, - "logits/chosen": -1.2369143962860107, - "logits/rejected": -1.1907278299331665, - "logps/chosen": -180.17445373535156, - "logps/rejected": -300.03424072265625, - "loss": 0.3913, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.2647721767425537, - "rewards/margins": 1.1959367990493774, - "rewards/rejected": -2.4607090950012207, + "epoch": 1.0251550654720882, + "grad_norm": 7.672728061676025, + "learning_rate": 5.651450787190351e-08, + "logits/chosen": -2.529658079147339, + "logits/rejected": -2.5256409645080566, + "logps/chosen": -95.80914306640625, + "logps/rejected": -106.40169525146484, + "loss": 0.6514, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.42103952169418335, + "rewards/margins": 0.10326598584651947, + "rewards/rejected": -0.5243055820465088, "step": 5950 }, { - "epoch": 1.03, - "grad_norm": 16.94373019662342, - "learning_rate": 2.818270085426668e-07, - "logits/chosen": -1.252617597579956, - "logits/rejected": -1.1776127815246582, - "logps/chosen": -212.00613403320312, - "logps/rejected": -300.90032958984375, - "loss": 0.4698, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.5370571613311768, - "rewards/margins": 0.9539899826049805, - "rewards/rejected": -2.4910473823547363, + "epoch": 1.0268780151619572, + "grad_norm": 7.341794967651367, + "learning_rate": 5.6365401708533353e-08, + "logits/chosen": -2.589254379272461, + "logits/rejected": -2.541999340057373, + "logps/chosen": -103.13002014160156, + "logps/rejected": -105.22901916503906, + "loss": 0.6673, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.4480670094490051, + "rewards/margins": 0.08621273934841156, + "rewards/rejected": -0.5342798233032227, "step": 5960 }, { - "epoch": 1.03, - "grad_norm": 18.34387313974363, - "learning_rate": 2.8108118991172715e-07, - "logits/chosen": -1.2002298831939697, - "logits/rejected": -1.1465680599212646, - "logps/chosen": -217.5402374267578, - "logps/rejected": -327.88226318359375, - "loss": 0.4467, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.656275987625122, - "rewards/margins": 1.1236770153045654, - "rewards/rejected": -2.7799527645111084, + "epoch": 1.0286009648518264, + "grad_norm": 8.059379577636719, + "learning_rate": 5.6216237982345426e-08, + "logits/chosen": -2.5731124877929688, + "logits/rejected": -2.5565097332000732, + "logps/chosen": -94.64891052246094, + "logps/rejected": -106.91328430175781, + "loss": 0.6368, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.42751556634902954, + "rewards/margins": 0.14247098565101624, + "rewards/rejected": -0.5699866414070129, "step": 5970 }, { - "epoch": 1.03, - "grad_norm": 36.226753471187656, - "learning_rate": 2.8033509021119396e-07, - "logits/chosen": -1.1955822706222534, - "logits/rejected": -1.1503514051437378, - "logps/chosen": -217.57955932617188, - "logps/rejected": -343.6628112792969, - "loss": 0.4415, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.6500743627548218, - "rewards/margins": 1.2213424444198608, - "rewards/rejected": -2.8714168071746826, + "epoch": 1.0303239145416954, + "grad_norm": 7.262861251831055, + "learning_rate": 5.606701804223879e-08, + "logits/chosen": -2.5789666175842285, + "logits/rejected": -2.5790278911590576, + "logps/chosen": -92.66761016845703, + "logps/rejected": -111.3015365600586, + "loss": 0.6325, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.40097588300704956, + "rewards/margins": 0.146892249584198, + "rewards/rejected": -0.5478681325912476, "step": 5980 }, { - "epoch": 1.03, - "grad_norm": 31.27940424068028, - "learning_rate": 2.795887161881043e-07, - "logits/chosen": -1.2698607444763184, - "logits/rejected": -1.196852207183838, - "logps/chosen": -225.42538452148438, - "logps/rejected": -327.5019226074219, - "loss": 0.4639, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.7037813663482666, - "rewards/margins": 1.0513932704925537, - "rewards/rejected": -2.7551746368408203, + "epoch": 1.0320468642315643, + "grad_norm": 7.83336877822876, + "learning_rate": 5.5917743237620865e-08, + "logits/chosen": -2.6466517448425293, + "logits/rejected": -2.613440990447998, + "logps/chosen": -97.26774597167969, + "logps/rejected": -106.04005432128906, + "loss": 0.6457, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.42194968461990356, + "rewards/margins": 0.11849778890609741, + "rewards/rejected": -0.540447473526001, "step": 5990 }, { - "epoch": 1.03, - "grad_norm": 25.514076708010567, - "learning_rate": 2.7884207459197585e-07, - "logits/chosen": -1.23202383518219, - "logits/rejected": -1.166017770767212, - "logps/chosen": -224.9745635986328, - "logps/rejected": -359.5087585449219, - "loss": 0.3835, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.7289674282073975, - "rewards/margins": 1.3435934782028198, - "rewards/rejected": -3.0725607872009277, + "epoch": 1.0337698139214335, + "grad_norm": 7.2638068199157715, + "learning_rate": 5.576841491839517e-08, + "logits/chosen": -2.626448154449463, + "logits/rejected": -2.6101455688476562, + "logps/chosen": -96.3993911743164, + "logps/rejected": -106.6590347290039, + "loss": 0.6587, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.44295287132263184, + "rewards/margins": 0.10090150684118271, + "rewards/rejected": -0.5438543558120728, "step": 6000 }, { - "epoch": 1.03, - "eval_logits/chosen": -1.3176610469818115, - "eval_logits/rejected": -1.2868660688400269, - "eval_logps/chosen": -241.43719482421875, - "eval_logps/rejected": -294.40496826171875, - "eval_loss": 0.6213955879211426, - "eval_rewards/accuracies": 0.6670538783073425, - "eval_rewards/chosen": -1.8273334503173828, - "eval_rewards/margins": 0.48514264822006226, - "eval_rewards/rejected": -2.3124759197235107, - "eval_runtime": 356.8191, - "eval_samples_per_second": 12.062, - "eval_steps_per_second": 1.508, + "epoch": 1.0337698139214335, + "eval_logits/chosen": -2.732137680053711, + "eval_logits/rejected": -2.7260348796844482, + "eval_logps/chosen": -93.24308013916016, + "eval_logps/rejected": -104.63257598876953, + "eval_loss": 0.6654485464096069, + "eval_rewards/accuracies": 0.6075743436813354, + "eval_rewards/chosen": -0.3453117907047272, + "eval_rewards/margins": 0.06921263039112091, + "eval_rewards/rejected": -0.4145244359970093, + "eval_runtime": 360.5197, + "eval_samples_per_second": 11.938, + "eval_steps_per_second": 1.492, "step": 6000 }, { - "epoch": 1.04, - "grad_norm": 23.69085635543719, - "learning_rate": 2.780951721747461e-07, - "logits/chosen": -1.243060827255249, - "logits/rejected": -1.194278359413147, - "logps/chosen": -225.7394561767578, - "logps/rejected": -336.02508544921875, - "loss": 0.4742, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.72724187374115, - "rewards/margins": 1.119065284729004, - "rewards/rejected": -2.8463072776794434, + "epoch": 1.0354927636113025, + "grad_norm": 8.994678497314453, + "learning_rate": 5.561903443494922e-08, + "logits/chosen": -2.632159948348999, + "logits/rejected": -2.616946220397949, + "logps/chosen": -97.70941162109375, + "logps/rejected": -104.2746810913086, + "loss": 0.6661, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4467419981956482, + "rewards/margins": 0.08194559812545776, + "rewards/rejected": -0.528687596321106, "step": 6010 }, { - "epoch": 1.04, - "grad_norm": 19.041771832370905, - "learning_rate": 2.7734801569071104e-07, - "logits/chosen": -1.4446563720703125, - "logits/rejected": -1.3703842163085938, - "logps/chosen": -204.45094299316406, - "logps/rejected": -318.0191345214844, - "loss": 0.4287, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.50087308883667, - "rewards/margins": 1.1805269718170166, - "rewards/rejected": -2.6814000606536865, + "epoch": 1.0372157133011717, + "grad_norm": 7.749782562255859, + "learning_rate": 5.546960313814221e-08, + "logits/chosen": -2.781144380569458, + "logits/rejected": -2.7513694763183594, + "logps/chosen": -99.9555435180664, + "logps/rejected": -105.5430908203125, + "loss": 0.6597, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.4556824564933777, + "rewards/margins": 0.1007528156042099, + "rewards/rejected": -0.556435227394104, "step": 6020 }, { - "epoch": 1.04, - "grad_norm": 22.204161933617108, - "learning_rate": 2.766006118964644e-07, - "logits/chosen": -1.1446921825408936, - "logits/rejected": -1.0945428609848022, - "logps/chosen": -206.8660125732422, - "logps/rejected": -310.853759765625, - "loss": 0.451, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.5705788135528564, - "rewards/margins": 1.0349514484405518, - "rewards/rejected": -2.605530261993408, + "epoch": 1.0389386629910407, + "grad_norm": 6.4451680183410645, + "learning_rate": 5.532012237929288e-08, + "logits/chosen": -2.4611928462982178, + "logits/rejected": -2.44938588142395, + "logps/chosen": -94.34858703613281, + "logps/rejected": -104.7852783203125, + "loss": 0.6572, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.4451059401035309, + "rewards/margins": 0.09958353638648987, + "rewards/rejected": -0.5446893572807312, "step": 6030 }, { - "epoch": 1.04, - "grad_norm": 27.633296335888442, - "learning_rate": 2.7585296755083615e-07, - "logits/chosen": -1.3180968761444092, - "logits/rejected": -1.2671663761138916, - "logps/chosen": -198.31124877929688, - "logps/rejected": -304.19488525390625, - "loss": 0.43, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.4460715055465698, - "rewards/margins": 1.0792269706726074, - "rewards/rejected": -2.525298595428467, + "epoch": 1.0406616126809096, + "grad_norm": 7.477050304412842, + "learning_rate": 5.517059351016723e-08, + "logits/chosen": -2.642655849456787, + "logits/rejected": -2.624969244003296, + "logps/chosen": -96.06242370605469, + "logps/rejected": -105.86860656738281, + "loss": 0.6458, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.4235418438911438, + "rewards/margins": 0.11838100850582123, + "rewards/rejected": -0.5419228076934814, "step": 6040 }, { - "epoch": 1.04, - "grad_norm": 19.23129899129365, - "learning_rate": 2.751050894148317e-07, - "logits/chosen": -1.235442876815796, - "logits/rejected": -1.174726963043213, - "logps/chosen": -212.7599334716797, - "logps/rejected": -319.74639892578125, - "loss": 0.4224, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.5342434644699097, - "rewards/margins": 1.1264307498931885, - "rewards/rejected": -2.6606743335723877, + "epoch": 1.0423845623707788, + "grad_norm": 8.111739158630371, + "learning_rate": 5.502101788296634e-08, + "logits/chosen": -2.5507383346557617, + "logits/rejected": -2.5269408226013184, + "logps/chosen": -104.08492279052734, + "logps/rejected": -112.29386901855469, + "loss": 0.6354, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.44742804765701294, + "rewards/margins": 0.13851603865623474, + "rewards/rejected": -0.5859440565109253, "step": 6050 }, { - "epoch": 1.04, - "grad_norm": 21.993509562945054, - "learning_rate": 2.743569842515707e-07, - "logits/chosen": -1.2447845935821533, - "logits/rejected": -1.1827826499938965, - "logps/chosen": -215.64529418945312, - "logps/rejected": -319.7334899902344, - "loss": 0.4931, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.6215600967407227, - "rewards/margins": 1.0563386678695679, - "rewards/rejected": -2.67789888381958, + "epoch": 1.0441075120606478, + "grad_norm": 7.987109661102295, + "learning_rate": 5.487139685031413e-08, + "logits/chosen": -2.6121532917022705, + "logits/rejected": -2.590766429901123, + "logps/chosen": -98.3913803100586, + "logps/rejected": -106.94645690917969, + "loss": 0.6555, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.44896596670150757, + "rewards/margins": 0.10085372626781464, + "rewards/rejected": -0.5498196482658386, "step": 6060 }, { - "epoch": 1.05, - "grad_norm": 31.01800262084724, - "learning_rate": 2.7360865882622556e-07, - "logits/chosen": -1.2382781505584717, - "logits/rejected": -1.1739325523376465, - "logps/chosen": -226.4764404296875, - "logps/rejected": -340.14373779296875, - "loss": 0.4489, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.7254207134246826, - "rewards/margins": 1.1591459512710571, - "rewards/rejected": -2.8845667839050293, + "epoch": 1.045830461750517, + "grad_norm": 7.485147476196289, + "learning_rate": 5.4721731765245116e-08, + "logits/chosen": -2.6229617595672607, + "logits/rejected": -2.601656675338745, + "logps/chosen": -96.12904357910156, + "logps/rejected": -107.45579528808594, + "loss": 0.6375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4218669831752777, + "rewards/margins": 0.1354655772447586, + "rewards/rejected": -0.5573326349258423, "step": 6070 }, { - "epoch": 1.05, - "grad_norm": 23.645653500298494, - "learning_rate": 2.728601199059609e-07, - "logits/chosen": -1.2225624322891235, - "logits/rejected": -1.1666558980941772, - "logps/chosen": -230.73385620117188, - "logps/rejected": -352.7666320800781, - "loss": 0.4109, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.7167644500732422, - "rewards/margins": 1.275223970413208, - "rewards/rejected": -2.9919886589050293, + "epoch": 1.047553411440386, + "grad_norm": 7.552558422088623, + "learning_rate": 5.4572023981192184e-08, + "logits/chosen": -2.5975961685180664, + "logits/rejected": -2.579049587249756, + "logps/chosen": -102.36415100097656, + "logps/rejected": -110.1705322265625, + "loss": 0.6395, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.4328855872154236, + "rewards/margins": 0.13304778933525085, + "rewards/rejected": -0.565933346748352, "step": 6080 }, { - "epoch": 1.05, - "grad_norm": 26.05274957772241, - "learning_rate": 2.7211137425987175e-07, - "logits/chosen": -1.2456872463226318, - "logits/rejected": -1.1820614337921143, - "logps/chosen": -225.69967651367188, - "logps/rejected": -366.41064453125, - "loss": 0.3763, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.7430721521377563, - "rewards/margins": 1.407641887664795, - "rewards/rejected": -3.150714159011841, + "epoch": 1.049276361130255, + "grad_norm": 7.410242080688477, + "learning_rate": 5.442227485197435e-08, + "logits/chosen": -2.657362461090088, + "logits/rejected": -2.6392769813537598, + "logps/chosen": -92.25110626220703, + "logps/rejected": -108.11601257324219, + "loss": 0.626, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.4084300100803375, + "rewards/margins": 0.1592157930135727, + "rewards/rejected": -0.5676458477973938, "step": 6090 }, { - "epoch": 1.05, - "grad_norm": 23.260564144653117, - "learning_rate": 2.713624286589227e-07, - "logits/chosen": -1.1914881467819214, - "logits/rejected": -1.1363308429718018, - "logps/chosen": -247.3178253173828, - "logps/rejected": -392.1295471191406, - "loss": 0.3822, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.905500054359436, - "rewards/margins": 1.4772017002105713, - "rewards/rejected": -3.382701873779297, + "epoch": 1.050999310820124, + "grad_norm": 7.989211082458496, + "learning_rate": 5.4272485731784536e-08, + "logits/chosen": -2.6268820762634277, + "logits/rejected": -2.6230292320251465, + "logps/chosen": -101.92762756347656, + "logps/rejected": -113.89131164550781, + "loss": 0.6337, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.45144420862197876, + "rewards/margins": 0.14876195788383484, + "rewards/rejected": -0.6002060770988464, "step": 6100 }, { - "epoch": 1.05, - "eval_logits/chosen": -1.247104287147522, - "eval_logits/rejected": -1.2163046598434448, - "eval_logps/chosen": -258.7976379394531, - "eval_logps/rejected": -313.2447509765625, - "eval_loss": 0.6230133771896362, - "eval_rewards/accuracies": 0.6710036993026733, - "eval_rewards/chosen": -2.0009379386901855, - "eval_rewards/margins": 0.49993589520454407, - "eval_rewards/rejected": -2.5008738040924072, - "eval_runtime": 356.9611, - "eval_samples_per_second": 12.057, - "eval_steps_per_second": 1.507, + "epoch": 1.050999310820124, + "eval_logits/chosen": -2.7237114906311035, + "eval_logits/rejected": -2.717660903930664, + "eval_logps/chosen": -93.63307189941406, + "eval_logps/rejected": -105.14704895019531, + "eval_loss": 0.664944052696228, + "eval_rewards/accuracies": 0.6078066825866699, + "eval_rewards/chosen": -0.34921181201934814, + "eval_rewards/margins": 0.07045748829841614, + "eval_rewards/rejected": -0.4196692109107971, + "eval_runtime": 360.0647, + "eval_samples_per_second": 11.953, + "eval_steps_per_second": 1.494, "step": 6100 }, { - "epoch": 1.05, - "grad_norm": 20.111788335085762, - "learning_rate": 2.7061328987588626e-07, - "logits/chosen": -1.1539726257324219, - "logits/rejected": -1.087749719619751, - "logps/chosen": -263.44146728515625, - "logps/rejected": -398.18914794921875, - "loss": 0.4009, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -2.117790699005127, - "rewards/margins": 1.3292558193206787, - "rewards/rejected": -3.4470467567443848, + "epoch": 1.052722260509993, + "grad_norm": 7.026992321014404, + "learning_rate": 5.4122657975177254e-08, + "logits/chosen": -2.627516508102417, + "logits/rejected": -2.6128859519958496, + "logps/chosen": -96.61692810058594, + "logps/rejected": -112.77938079833984, + "loss": 0.635, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.44917988777160645, + "rewards/margins": 0.14372874796390533, + "rewards/rejected": -0.5929085612297058, "step": 6110 }, { - "epoch": 1.05, - "grad_norm": 39.46202657579613, - "learning_rate": 2.6986396468528236e-07, - "logits/chosen": -1.2154873609542847, - "logits/rejected": -1.1202675104141235, - "logps/chosen": -255.0636444091797, - "logps/rejected": -415.2186584472656, - "loss": 0.3857, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -2.007509469985962, - "rewards/margins": 1.6099491119384766, - "rewards/rejected": -3.6174583435058594, + "epoch": 1.0544452101998623, + "grad_norm": 7.159167766571045, + "learning_rate": 5.397279293705648e-08, + "logits/chosen": -2.726543664932251, + "logits/rejected": -2.680231809616089, + "logps/chosen": -95.74651336669922, + "logps/rejected": -113.47621154785156, + "loss": 0.6163, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.41424790024757385, + "rewards/margins": 0.18573865294456482, + "rewards/rejected": -0.5999865531921387, "step": 6120 }, { - "epoch": 1.06, - "grad_norm": 16.111988424606817, - "learning_rate": 2.6911445986331634e-07, - "logits/chosen": -1.1826080083847046, - "logits/rejected": -1.1169893741607666, - "logps/chosen": -239.3343048095703, - "logps/rejected": -376.82037353515625, - "loss": 0.3942, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.849416971206665, - "rewards/margins": 1.3795721530914307, - "rewards/rejected": -3.2289886474609375, + "epoch": 1.0561681598897312, + "grad_norm": 7.718677997589111, + "learning_rate": 5.3822891972663266e-08, + "logits/chosen": -2.6282782554626465, + "logits/rejected": -2.604907989501953, + "logps/chosen": -98.44243621826172, + "logps/rejected": -112.24897766113281, + "loss": 0.6371, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.44050803780555725, + "rewards/margins": 0.14252620935440063, + "rewards/rejected": -0.5830342769622803, "step": 6130 }, { - "epoch": 1.06, - "grad_norm": 27.334786962517168, - "learning_rate": 2.68364782187818e-07, - "logits/chosen": -1.2877238988876343, - "logits/rejected": -1.229827642440796, - "logps/chosen": -210.6566619873047, - "logps/rejected": -323.015869140625, - "loss": 0.4396, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.5760023593902588, - "rewards/margins": 1.1295536756515503, - "rewards/rejected": -2.7055559158325195, + "epoch": 1.0578911095796002, + "grad_norm": 8.668318748474121, + "learning_rate": 5.36729564375636e-08, + "logits/chosen": -2.653858184814453, + "logits/rejected": -2.6416361331939697, + "logps/chosen": -97.1240005493164, + "logps/rejected": -110.01481628417969, + "loss": 0.6425, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.44045019149780273, + "rewards/margins": 0.13490334153175354, + "rewards/rejected": -0.5753535032272339, "step": 6140 }, { - "epoch": 1.06, - "grad_norm": 23.13483950909933, - "learning_rate": 2.6761493843818027e-07, - "logits/chosen": -1.24057936668396, - "logits/rejected": -1.1909449100494385, - "logps/chosen": -211.8172607421875, - "logps/rejected": -319.30926513671875, - "loss": 0.4605, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.5882833003997803, - "rewards/margins": 1.1051770448684692, - "rewards/rejected": -2.693460464477539, + "epoch": 1.0596140592694694, + "grad_norm": 8.170172691345215, + "learning_rate": 5.352298768763606e-08, + "logits/chosen": -2.5736560821533203, + "logits/rejected": -2.560098171234131, + "logps/chosen": -98.64427185058594, + "logps/rejected": -106.1328125, + "loss": 0.6553, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.45636358857154846, + "rewards/margins": 0.10507573932409286, + "rewards/rejected": -0.5614393949508667, "step": 6150 }, { - "epoch": 1.06, - "grad_norm": 24.089174586100004, - "learning_rate": 2.66864935395298e-07, - "logits/chosen": -1.1712977886199951, - "logits/rejected": -1.1290233135223389, - "logps/chosen": -205.40530395507812, - "logps/rejected": -303.6624450683594, - "loss": 0.4751, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.5445663928985596, - "rewards/margins": 0.9666827321052551, - "rewards/rejected": -2.51124906539917, + "epoch": 1.0613370089593384, + "grad_norm": 8.330756187438965, + "learning_rate": 5.33729870790596e-08, + "logits/chosen": -2.523110866546631, + "logits/rejected": -2.5138514041900635, + "logps/chosen": -93.60323333740234, + "logps/rejected": -106.94438171386719, + "loss": 0.6474, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.4262743890285492, + "rewards/margins": 0.11767958104610443, + "rewards/rejected": -0.5439538955688477, "step": 6160 }, { - "epoch": 1.06, - "grad_norm": 23.745589448999045, - "learning_rate": 2.661147798415063e-07, - "logits/chosen": -1.3031284809112549, - "logits/rejected": -1.2508373260498047, - "logps/chosen": -230.56103515625, - "logps/rejected": -357.1623840332031, - "loss": 0.4073, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.7110252380371094, - "rewards/margins": 1.2960346937179565, - "rewards/rejected": -3.0070605278015137, + "epoch": 1.0630599586492075, + "grad_norm": 6.593761920928955, + "learning_rate": 5.322295596830125e-08, + "logits/chosen": -2.647068500518799, + "logits/rejected": -2.6324515342712402, + "logps/chosen": -108.75308990478516, + "logps/rejected": -118.23289489746094, + "loss": 0.6441, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.4929245412349701, + "rewards/margins": 0.12468580156564713, + "rewards/rejected": -0.6176103353500366, "step": 6170 }, { - "epoch": 1.06, - "grad_norm": 23.45984720617789, - "learning_rate": 2.6536447856051964e-07, - "logits/chosen": -1.2978737354278564, - "logits/rejected": -1.2452610731124878, - "logps/chosen": -238.743408203125, - "logps/rejected": -339.7936706542969, - "loss": 0.4798, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.805580735206604, - "rewards/margins": 1.062530517578125, - "rewards/rejected": -2.8681111335754395, + "epoch": 1.0647829083390765, + "grad_norm": 9.397297859191895, + "learning_rate": 5.3072895712103925e-08, + "logits/chosen": -2.6558566093444824, + "logits/rejected": -2.633418560028076, + "logps/chosen": -103.27950286865234, + "logps/rejected": -110.69441223144531, + "loss": 0.6459, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.4512435793876648, + "rewards/margins": 0.12590502202510834, + "rewards/rejected": -0.5771486163139343, "step": 6180 }, { - "epoch": 1.07, - "grad_norm": 24.825020766575623, - "learning_rate": 2.646140383373704e-07, - "logits/chosen": -1.304811716079712, - "logits/rejected": -1.2447645664215088, - "logps/chosen": -218.7591552734375, - "logps/rejected": -337.76025390625, - "loss": 0.394, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.6060594320297241, - "rewards/margins": 1.2416613101959229, - "rewards/rejected": -2.8477206230163574, + "epoch": 1.0665058580289455, + "grad_norm": 8.016385078430176, + "learning_rate": 5.292280766747408e-08, + "logits/chosen": -2.631197690963745, + "logits/rejected": -2.607970952987671, + "logps/chosen": -103.00118255615234, + "logps/rejected": -114.66705322265625, + "loss": 0.631, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.4482520520687103, + "rewards/margins": 0.16839279234409332, + "rewards/rejected": -0.6166448593139648, "step": 6190 }, { - "epoch": 1.07, - "grad_norm": 22.348892489607106, - "learning_rate": 2.6386346595834716e-07, - "logits/chosen": -1.2410696744918823, - "logits/rejected": -1.1765029430389404, - "logps/chosen": -213.6040802001953, - "logps/rejected": -335.80865478515625, - "loss": 0.4249, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.5910637378692627, - "rewards/margins": 1.2148936986923218, - "rewards/rejected": -2.805957317352295, + "epoch": 1.0682288077188147, + "grad_norm": 8.2015962600708, + "learning_rate": 5.277269319166944e-08, + "logits/chosen": -2.5226476192474365, + "logits/rejected": -2.503005027770996, + "logps/chosen": -99.58888244628906, + "logps/rejected": -114.26560974121094, + "loss": 0.6372, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.45074397325515747, + "rewards/margins": 0.13954012095928192, + "rewards/rejected": -0.5902840495109558, "step": 6200 }, { - "epoch": 1.07, - "eval_logits/chosen": -1.446341872215271, - "eval_logits/rejected": -1.418765902519226, - "eval_logps/chosen": -210.3596954345703, - "eval_logps/rejected": -255.79803466796875, - "eval_loss": 0.6216332912445068, - "eval_rewards/accuracies": 0.6656598448753357, - "eval_rewards/chosen": -1.5165584087371826, - "eval_rewards/margins": 0.40984830260276794, - "eval_rewards/rejected": -1.9264066219329834, - "eval_runtime": 356.8025, - "eval_samples_per_second": 12.063, - "eval_steps_per_second": 1.508, + "epoch": 1.0682288077188147, + "eval_logits/chosen": -2.714384078979492, + "eval_logits/rejected": -2.7082998752593994, + "eval_logps/chosen": -95.46115112304688, + "eval_logps/rejected": -107.26507568359375, + "eval_loss": 0.6640035510063171, + "eval_rewards/accuracies": 0.6089683771133423, + "eval_rewards/chosen": -0.36749252676963806, + "eval_rewards/margins": 0.07335695624351501, + "eval_rewards/rejected": -0.4408494830131531, + "eval_runtime": 360.7146, + "eval_samples_per_second": 11.932, + "eval_steps_per_second": 1.491, "step": 6200 }, { - "epoch": 1.07, - "grad_norm": 34.90631179192167, - "learning_rate": 2.631127682109338e-07, - "logits/chosen": -1.3385263681411743, - "logits/rejected": -1.277630090713501, - "logps/chosen": -212.1560516357422, - "logps/rejected": -317.8212890625, - "loss": 0.4505, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.5736181735992432, - "rewards/margins": 1.0757099390029907, - "rewards/rejected": -2.6493279933929443, + "epoch": 1.0699517574086836, + "grad_norm": 8.162054061889648, + "learning_rate": 5.2622553642186765e-08, + "logits/chosen": -2.62589693069458, + "logits/rejected": -2.5983972549438477, + "logps/chosen": -102.60786437988281, + "logps/rejected": -114.27645111083984, + "loss": 0.6398, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.47782596945762634, + "rewards/margins": 0.13597813248634338, + "rewards/rejected": -0.6138041615486145, "step": 6210 }, { - "epoch": 1.07, - "grad_norm": 28.897418625202086, - "learning_rate": 2.6236195188374797e-07, - "logits/chosen": -1.3002517223358154, - "logits/rejected": -1.2492867708206177, - "logps/chosen": -213.46377563476562, - "logps/rejected": -318.5871887207031, - "loss": 0.4586, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.6133520603179932, - "rewards/margins": 1.047262191772461, - "rewards/rejected": -2.660614252090454, + "epoch": 1.0716747070985528, + "grad_norm": 7.845183372497559, + "learning_rate": 5.24723903767496e-08, + "logits/chosen": -2.613713026046753, + "logits/rejected": -2.5951294898986816, + "logps/chosen": -97.50817108154297, + "logps/rejected": -110.98738861083984, + "loss": 0.6403, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.4536914825439453, + "rewards/margins": 0.1307128369808197, + "rewards/rejected": -0.5844042897224426, "step": 6220 }, { - "epoch": 1.07, - "grad_norm": 31.35968444075051, - "learning_rate": 2.616110237664793e-07, - "logits/chosen": -1.427841067314148, - "logits/rejected": -1.3521463871002197, - "logps/chosen": -208.4129180908203, - "logps/rejected": -360.6506042480469, - "loss": 0.3774, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.495847225189209, - "rewards/margins": 1.5452907085418701, - "rewards/rejected": -3.041138172149658, + "epoch": 1.0733976567884218, + "grad_norm": 7.181242942810059, + "learning_rate": 5.232220475329586e-08, + "logits/chosen": -2.747974395751953, + "logits/rejected": -2.729741096496582, + "logps/chosen": -101.5085220336914, + "logps/rejected": -120.6954116821289, + "loss": 0.6058, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4267581105232239, + "rewards/margins": 0.2146727293729782, + "rewards/rejected": -0.6414308547973633, "step": 6230 }, { - "epoch": 1.08, - "grad_norm": 29.527066343948846, - "learning_rate": 2.6085999064982873e-07, - "logits/chosen": -1.2126820087432861, - "logits/rejected": -1.143293023109436, - "logps/chosen": -223.4793701171875, - "logps/rejected": -341.6191711425781, - "loss": 0.4529, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.6712300777435303, - "rewards/margins": 1.2077951431274414, - "rewards/rejected": -2.879025459289551, + "epoch": 1.0751206064782908, + "grad_norm": 8.449043273925781, + "learning_rate": 5.217199812996574e-08, + "logits/chosen": -2.564089059829712, + "logits/rejected": -2.5353121757507324, + "logps/chosen": -102.2982406616211, + "logps/rejected": -113.02010345458984, + "loss": 0.6421, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.4591143727302551, + "rewards/margins": 0.13406124711036682, + "rewards/rejected": -0.5931755900382996, "step": 6240 }, { - "epoch": 1.08, - "grad_norm": 19.581487431832702, - "learning_rate": 2.601088593254465e-07, - "logits/chosen": -1.3335120677947998, - "logits/rejected": -1.2712721824645996, - "logps/chosen": -218.8231658935547, - "logps/rejected": -323.0040283203125, - "loss": 0.4925, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.6310733556747437, - "rewards/margins": 1.0603678226470947, - "rewards/rejected": -2.691441059112549, + "epoch": 1.07684355616816, + "grad_norm": 17.723363876342773, + "learning_rate": 5.202177186508929e-08, + "logits/chosen": -2.6494898796081543, + "logits/rejected": -2.627321720123291, + "logps/chosen": -100.54495239257812, + "logps/rejected": -107.69929504394531, + "loss": 0.6615, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4481717050075531, + "rewards/margins": 0.08987609297037125, + "rewards/rejected": -0.5380477905273438, "step": 6250 }, { - "epoch": 1.08, - "grad_norm": 36.934927120147826, - "learning_rate": 2.59357636585871e-07, - "logits/chosen": -1.232089877128601, - "logits/rejected": -1.1848324537277222, - "logps/chosen": -211.7543487548828, - "logps/rejected": -297.58697509765625, - "loss": 0.4911, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.578609824180603, - "rewards/margins": 0.8933493494987488, - "rewards/rejected": -2.471959352493286, + "epoch": 1.078566505858029, + "grad_norm": 7.934974193572998, + "learning_rate": 5.18715273171742e-08, + "logits/chosen": -2.52590012550354, + "logits/rejected": -2.506361722946167, + "logps/chosen": -103.57615661621094, + "logps/rejected": -110.61024475097656, + "loss": 0.6537, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.49651750922203064, + "rewards/margins": 0.10577519983053207, + "rewards/rejected": -0.6022926568984985, "step": 6260 }, { - "epoch": 1.08, - "grad_norm": 22.671481469710727, - "learning_rate": 2.5860632922446737e-07, - "logits/chosen": -1.5191317796707153, - "logits/rejected": -1.4832508563995361, - "logps/chosen": -200.9024658203125, - "logps/rejected": -307.0449523925781, - "loss": 0.4744, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.479931116104126, - "rewards/margins": 1.0443861484527588, - "rewards/rejected": -2.524317502975464, + "epoch": 1.080289455547898, + "grad_norm": 7.386178493499756, + "learning_rate": 5.1721265844893467e-08, + "logits/chosen": -2.783388137817383, + "logits/rejected": -2.782752513885498, + "logps/chosen": -99.8791275024414, + "logps/rejected": -114.23184967041016, + "loss": 0.6441, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.4697844386100769, + "rewards/margins": 0.12615224719047546, + "rewards/rejected": -0.5959367156028748, "step": 6270 }, { - "epoch": 1.08, - "grad_norm": 22.47288317464536, - "learning_rate": 2.578549440353659e-07, - "logits/chosen": -1.2445075511932373, - "logits/rejected": -1.1983692646026611, - "logps/chosen": -185.7981414794922, - "logps/rejected": -284.7179260253906, - "loss": 0.4304, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.3140432834625244, - "rewards/margins": 1.0069011449813843, - "rewards/rejected": -2.320944309234619, + "epoch": 1.082012405237767, + "grad_norm": 9.967061996459961, + "learning_rate": 5.157098880707318e-08, + "logits/chosen": -2.4792990684509277, + "logits/rejected": -2.457472562789917, + "logps/chosen": -101.3305892944336, + "logps/rejected": -111.230224609375, + "loss": 0.6487, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.4691835343837738, + "rewards/margins": 0.11668656766414642, + "rewards/rejected": -0.5858700275421143, "step": 6280 }, { - "epoch": 1.08, - "grad_norm": 26.874037522701656, - "learning_rate": 2.571034878134007e-07, - "logits/chosen": -1.3063162565231323, - "logits/rejected": -1.2508251667022705, - "logps/chosen": -197.89691162109375, - "logps/rejected": -304.0697937011719, - "loss": 0.4268, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.465261459350586, - "rewards/margins": 1.0520577430725098, - "rewards/rejected": -2.5173192024230957, + "epoch": 1.083735354927636, + "grad_norm": 7.863672256469727, + "learning_rate": 5.1420697562680136e-08, + "logits/chosen": -2.5599868297576904, + "logits/rejected": -2.5322113037109375, + "logps/chosen": -95.88512420654297, + "logps/rejected": -111.76493072509766, + "loss": 0.6337, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.44506701827049255, + "rewards/margins": 0.1488201916217804, + "rewards/rejected": -0.593887209892273, "step": 6290 }, { - "epoch": 1.09, - "grad_norm": 34.25929825482152, - "learning_rate": 2.5635196735404816e-07, - "logits/chosen": -1.327014446258545, - "logits/rejected": -1.273252248764038, - "logps/chosen": -216.51626586914062, - "logps/rejected": -310.141845703125, - "loss": 0.4731, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.640974760055542, - "rewards/margins": 0.9403258562088013, - "rewards/rejected": -2.581300973892212, + "epoch": 1.0854583046175053, + "grad_norm": 8.78534984588623, + "learning_rate": 5.1270393470809636e-08, + "logits/chosen": -2.62137508392334, + "logits/rejected": -2.596475839614868, + "logps/chosen": -103.24137878417969, + "logps/rejected": -113.2978515625, + "loss": 0.6555, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5080513954162598, + "rewards/margins": 0.10447593778371811, + "rewards/rejected": -0.6125272512435913, "step": 6300 }, { - "epoch": 1.09, - "eval_logits/chosen": -1.4054533243179321, - "eval_logits/rejected": -1.3767914772033691, - "eval_logps/chosen": -229.14906311035156, - "eval_logps/rejected": -278.4627685546875, - "eval_loss": 0.6205867528915405, - "eval_rewards/accuracies": 0.6654275059700012, - "eval_rewards/chosen": -1.7044522762298584, - "eval_rewards/margins": 0.4486016631126404, - "eval_rewards/rejected": -2.1530539989471436, - "eval_runtime": 356.7832, - "eval_samples_per_second": 12.063, - "eval_steps_per_second": 1.508, + "epoch": 1.0854583046175053, + "eval_logits/chosen": -2.7071239948272705, + "eval_logits/rejected": -2.7009353637695312, + "eval_logps/chosen": -96.7947998046875, + "eval_logps/rejected": -108.81401062011719, + "eval_loss": 0.6633469462394714, + "eval_rewards/accuracies": 0.6110594868659973, + "eval_rewards/chosen": -0.38082900643348694, + "eval_rewards/margins": 0.07550989836454391, + "eval_rewards/rejected": -0.45633891224861145, + "eval_runtime": 360.6678, + "eval_samples_per_second": 11.933, + "eval_steps_per_second": 1.492, "step": 6300 }, { - "epoch": 1.09, - "grad_norm": 30.63136108992965, - "learning_rate": 2.5560038945336583e-07, - "logits/chosen": -1.2807663679122925, - "logits/rejected": -1.213196039199829, - "logps/chosen": -209.709716796875, - "logps/rejected": -319.9820861816406, - "loss": 0.4534, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.5926458835601807, - "rewards/margins": 1.0916332006454468, - "rewards/rejected": -2.684279203414917, + "epoch": 1.0871812543073742, + "grad_norm": 10.046746253967285, + "learning_rate": 5.112007789067316e-08, + "logits/chosen": -2.604231119155884, + "logits/rejected": -2.5762839317321777, + "logps/chosen": -96.14618682861328, + "logps/rejected": -107.18050384521484, + "loss": 0.6572, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.456892728805542, + "rewards/margins": 0.09939368069171906, + "rewards/rejected": -0.5562864542007446, "step": 6310 }, { - "epoch": 1.09, - "grad_norm": 18.736489627828945, - "learning_rate": 2.548487609079305e-07, - "logits/chosen": -1.2793110609054565, - "logits/rejected": -1.2298452854156494, - "logps/chosen": -228.3084716796875, - "logps/rejected": -330.50390625, - "loss": 0.4929, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.7359955310821533, - "rewards/margins": 1.0360920429229736, - "rewards/rejected": -2.772087574005127, + "epoch": 1.0889042039972432, + "grad_norm": 8.157792091369629, + "learning_rate": 5.09697521815861e-08, + "logits/chosen": -2.563054323196411, + "logits/rejected": -2.5432045459747314, + "logps/chosen": -103.923095703125, + "logps/rejected": -114.36000061035156, + "loss": 0.6489, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.4920113682746887, + "rewards/margins": 0.11835269629955292, + "rewards/rejected": -0.6103640794754028, "step": 6320 }, { - "epoch": 1.09, - "grad_norm": 24.122550114272617, - "learning_rate": 2.5409708851477687e-07, - "logits/chosen": -1.316935658454895, - "logits/rejected": -1.2505112886428833, - "logps/chosen": -206.8124237060547, - "logps/rejected": -340.0397644042969, - "loss": 0.3752, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.5354225635528564, - "rewards/margins": 1.3291146755218506, - "rewards/rejected": -2.864537477493286, + "epoch": 1.0906271536871124, + "grad_norm": 8.477439880371094, + "learning_rate": 5.0819417702955367e-08, + "logits/chosen": -2.5750272274017334, + "logits/rejected": -2.5546774864196777, + "logps/chosen": -98.45436096191406, + "logps/rejected": -118.94361877441406, + "loss": 0.6105, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.4516433775424957, + "rewards/margins": 0.20171110332012177, + "rewards/rejected": -0.6533544659614563, "step": 6330 }, { - "epoch": 1.09, - "grad_norm": 24.00191172899993, - "learning_rate": 2.533453790713363e-07, - "logits/chosen": -1.3309152126312256, - "logits/rejected": -1.2744200229644775, - "logps/chosen": -205.3987274169922, - "logps/rejected": -318.1818542480469, - "loss": 0.4432, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.4982860088348389, - "rewards/margins": 1.1549437046051025, - "rewards/rejected": -2.6532297134399414, + "epoch": 1.0923501033769814, + "grad_norm": 7.878482818603516, + "learning_rate": 5.066907581426726e-08, + "logits/chosen": -2.5957183837890625, + "logits/rejected": -2.5751445293426514, + "logps/chosen": -101.18866729736328, + "logps/rejected": -111.032470703125, + "loss": 0.6435, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.456013023853302, + "rewards/margins": 0.12570513784885406, + "rewards/rejected": -0.5817180871963501, "step": 6340 }, { - "epoch": 1.09, - "grad_norm": 37.256774507075455, - "learning_rate": 2.5259363937537523e-07, - "logits/chosen": -1.2830774784088135, - "logits/rejected": -1.2399280071258545, - "logps/chosen": -215.0625, - "logps/rejected": -328.13604736328125, - "loss": 0.4306, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.5956099033355713, - "rewards/margins": 1.1229556798934937, - "rewards/rejected": -2.7185654640197754, + "epoch": 1.0940730530668505, + "grad_norm": 7.387296676635742, + "learning_rate": 5.051872787507505e-08, + "logits/chosen": -2.560865879058838, + "logits/rejected": -2.5533196926116943, + "logps/chosen": -101.19855499267578, + "logps/rejected": -116.6091079711914, + "loss": 0.6344, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.45683735609054565, + "rewards/margins": 0.1462439000606537, + "rewards/rejected": -0.6030812859535217, "step": 6350 }, { - "epoch": 1.1, - "grad_norm": 24.16860767130257, - "learning_rate": 2.5184187622493356e-07, - "logits/chosen": -1.2492659091949463, - "logits/rejected": -1.1924443244934082, - "logps/chosen": -213.53012084960938, - "logps/rejected": -354.2420959472656, - "loss": 0.3785, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -1.6007035970687866, - "rewards/margins": 1.3961331844329834, - "rewards/rejected": -2.9968369007110596, + "epoch": 1.0957960027567195, + "grad_norm": 8.644234657287598, + "learning_rate": 5.036837524498672e-08, + "logits/chosen": -2.5833213329315186, + "logits/rejected": -2.5798540115356445, + "logps/chosen": -97.86479187011719, + "logps/rejected": -113.30020904541016, + "loss": 0.6367, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4439619481563568, + "rewards/margins": 0.14320224523544312, + "rewards/rejected": -0.5871641635894775, "step": 6360 }, { - "epoch": 1.1, - "grad_norm": 30.09884792748387, - "learning_rate": 2.510900964182635e-07, - "logits/chosen": -1.2614082098007202, - "logits/rejected": -1.2243916988372803, - "logps/chosen": -221.001708984375, - "logps/rejected": -329.3047180175781, - "loss": 0.4579, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.6786903142929077, - "rewards/margins": 1.0699204206466675, - "rewards/rejected": -2.7486109733581543, + "epoch": 1.0975189524465885, + "grad_norm": 8.76852798461914, + "learning_rate": 5.021801928365269e-08, + "logits/chosen": -2.6103196144104004, + "logits/rejected": -2.6085126399993896, + "logps/chosen": -99.3055191040039, + "logps/rejected": -111.53495025634766, + "loss": 0.6546, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.4616120457649231, + "rewards/margins": 0.10931645333766937, + "rewards/rejected": -0.5709284543991089, "step": 6370 }, { - "epoch": 1.1, - "grad_norm": 26.54672845989167, - "learning_rate": 2.503383067537674e-07, - "logits/chosen": -1.3264938592910767, - "logits/rejected": -1.2688075304031372, - "logps/chosen": -207.5281524658203, - "logps/rejected": -336.74346923828125, - "loss": 0.3866, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -1.5481336116790771, - "rewards/margins": 1.2540295124053955, - "rewards/rejected": -2.8021631240844727, + "epoch": 1.0992419021364577, + "grad_norm": 9.487778663635254, + "learning_rate": 5.006766135075349e-08, + "logits/chosen": -2.643077850341797, + "logits/rejected": -2.6314942836761475, + "logps/chosen": -101.48836517333984, + "logps/rejected": -119.13397216796875, + "loss": 0.6386, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.48753833770751953, + "rewards/margins": 0.13859817385673523, + "rewards/rejected": -0.6261364817619324, "step": 6380 }, { - "epoch": 1.1, - "grad_norm": 26.92277033670455, - "learning_rate": 2.495865140299374e-07, - "logits/chosen": -1.364383578300476, - "logits/rejected": -1.2942759990692139, - "logps/chosen": -213.89453125, - "logps/rejected": -337.72857666015625, - "loss": 0.4155, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.5782573223114014, - "rewards/margins": 1.2891353368759155, - "rewards/rejected": -2.8673927783966064, + "epoch": 1.1009648518263266, + "grad_norm": 7.828840732574463, + "learning_rate": 4.991730280598747e-08, + "logits/chosen": -2.682213544845581, + "logits/rejected": -2.6493566036224365, + "logps/chosen": -101.81409454345703, + "logps/rejected": -111.9026870727539, + "loss": 0.6355, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.45730310678482056, + "rewards/margins": 0.1517023742198944, + "rewards/rejected": -0.6090055108070374, "step": 6390 }, { - "epoch": 1.1, - "grad_norm": 26.35121867365253, - "learning_rate": 2.4883472504529284e-07, - "logits/chosen": -1.2807561159133911, - "logits/rejected": -1.2216075658798218, - "logps/chosen": -221.13613891601562, - "logps/rejected": -342.654296875, - "loss": 0.4089, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.6344629526138306, - "rewards/margins": 1.2270433902740479, - "rewards/rejected": -2.8615059852600098, + "epoch": 1.1026878015161956, + "grad_norm": 8.809715270996094, + "learning_rate": 4.976694500905857e-08, + "logits/chosen": -2.61970591545105, + "logits/rejected": -2.5968809127807617, + "logps/chosen": -103.23225402832031, + "logps/rejected": -115.15157318115234, + "loss": 0.6406, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.45538854598999023, + "rewards/margins": 0.13073064386844635, + "rewards/rejected": -0.5861191749572754, "step": 6400 }, { - "epoch": 1.1, - "eval_logits/chosen": -1.32829749584198, - "eval_logits/rejected": -1.2985448837280273, - "eval_logps/chosen": -253.03562927246094, - "eval_logps/rejected": -306.45611572265625, - "eval_loss": 0.6263204216957092, - "eval_rewards/accuracies": 0.6642658114433289, - "eval_rewards/chosen": -1.9433181285858154, - "eval_rewards/margins": 0.48966917395591736, - "eval_rewards/rejected": -2.4329869747161865, - "eval_runtime": 356.7188, - "eval_samples_per_second": 12.066, - "eval_steps_per_second": 1.508, + "epoch": 1.1026878015161956, + "eval_logits/chosen": -2.700328826904297, + "eval_logits/rejected": -2.6940526962280273, + "eval_logps/chosen": -97.13938903808594, + "eval_logps/rejected": -109.29045104980469, + "eval_loss": 0.6628761291503906, + "eval_rewards/accuracies": 0.6108271479606628, + "eval_rewards/chosen": -0.3842748999595642, + "eval_rewards/margins": 0.07682836055755615, + "eval_rewards/rejected": -0.461103230714798, + "eval_runtime": 360.0734, + "eval_samples_per_second": 11.953, + "eval_steps_per_second": 1.494, "step": 6400 }, { - "epoch": 1.1, - "grad_norm": 35.49752834329857, - "learning_rate": 2.480829465983194e-07, - "logits/chosen": -1.3197977542877197, - "logits/rejected": -1.2659125328063965, - "logps/chosen": -263.07965087890625, - "logps/rejected": -380.5269470214844, - "loss": 0.4862, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -2.0871684551239014, - "rewards/margins": 1.2009137868881226, - "rewards/rejected": -3.2880821228027344, + "epoch": 1.1044107512060648, + "grad_norm": 9.525028228759766, + "learning_rate": 4.961658931966387e-08, + "logits/chosen": -2.704301357269287, + "logits/rejected": -2.6803505420684814, + "logps/chosen": -102.91910552978516, + "logps/rejected": -113.65352630615234, + "loss": 0.6471, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.4855470657348633, + "rewards/margins": 0.1335912048816681, + "rewards/rejected": -0.619138240814209, "step": 6410 }, { - "epoch": 1.11, - "grad_norm": 32.67436318673992, - "learning_rate": 2.473311854874075e-07, - "logits/chosen": -1.321010947227478, - "logits/rejected": -1.269235372543335, - "logps/chosen": -245.9833984375, - "logps/rejected": -353.6158142089844, - "loss": 0.4987, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.912111520767212, - "rewards/margins": 1.085077166557312, - "rewards/rejected": -2.9971888065338135, + "epoch": 1.1061337008959338, + "grad_norm": 9.287670135498047, + "learning_rate": 4.94662370974815e-08, + "logits/chosen": -2.645318031311035, + "logits/rejected": -2.6280157566070557, + "logps/chosen": -103.72320556640625, + "logps/rejected": -114.6949691772461, + "loss": 0.6492, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.48932743072509766, + "rewards/margins": 0.11837329715490341, + "rewards/rejected": -0.6077008247375488, "step": 6420 }, { - "epoch": 1.11, - "grad_norm": 23.752696598571593, - "learning_rate": 2.4657944851079076e-07, - "logits/chosen": -1.2947794198989868, - "logits/rejected": -1.2474058866500854, - "logps/chosen": -207.3226776123047, - "logps/rejected": -308.37408447265625, - "loss": 0.4638, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.5694684982299805, - "rewards/margins": 1.0322576761245728, - "rewards/rejected": -2.6017260551452637, + "epoch": 1.107856650585803, + "grad_norm": 9.23127555847168, + "learning_rate": 4.9315889702158156e-08, + "logits/chosen": -2.558070182800293, + "logits/rejected": -2.5443100929260254, + "logps/chosen": -96.07037353515625, + "logps/rejected": -101.66596221923828, + "loss": 0.666, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.45678821206092834, + "rewards/margins": 0.07751981914043427, + "rewards/rejected": -0.5343080163002014, "step": 6430 }, { - "epoch": 1.11, - "grad_norm": 23.030575066990085, - "learning_rate": 2.458277424664845e-07, - "logits/chosen": -1.339413046836853, - "logits/rejected": -1.2803711891174316, - "logps/chosen": -209.6329345703125, - "logps/rejected": -334.5424499511719, - "loss": 0.3994, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.5639805793762207, - "rewards/margins": 1.2495237588882446, - "rewards/rejected": -2.813504219055176, + "epoch": 1.109579600275672, + "grad_norm": 8.366679191589355, + "learning_rate": 4.9165548493296894e-08, + "logits/chosen": -2.568415880203247, + "logits/rejected": -2.551879405975342, + "logps/chosen": -102.79166412353516, + "logps/rejected": -113.081787109375, + "loss": 0.6582, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.49532589316368103, + "rewards/margins": 0.10356497764587402, + "rewards/rejected": -0.5988908410072327, "step": 6440 }, { - "epoch": 1.11, - "grad_norm": 28.704711169531485, - "learning_rate": 2.450760741522244e-07, - "logits/chosen": -1.3053383827209473, - "logits/rejected": -1.2392146587371826, - "logps/chosen": -219.56298828125, - "logps/rejected": -331.5205078125, - "loss": 0.4508, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.632362723350525, - "rewards/margins": 1.160873532295227, - "rewards/rejected": -2.793236255645752, + "epoch": 1.111302549965541, + "grad_norm": 8.183588981628418, + "learning_rate": 4.9015214830444874e-08, + "logits/chosen": -2.582038402557373, + "logits/rejected": -2.550384759902954, + "logps/chosen": -102.41912841796875, + "logps/rejected": -114.42729187011719, + "loss": 0.6341, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.4609449505805969, + "rewards/margins": 0.16104629635810852, + "rewards/rejected": -0.6219911575317383, "step": 6450 }, { - "epoch": 1.11, - "grad_norm": 32.40806253150715, - "learning_rate": 2.443244503654047e-07, - "logits/chosen": -1.2578837871551514, - "logits/rejected": -1.2153687477111816, - "logps/chosen": -222.15573120117188, - "logps/rejected": -370.3708190917969, - "loss": 0.3719, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.7027851343154907, - "rewards/margins": 1.4216923713684082, - "rewards/rejected": -3.1244776248931885, + "epoch": 1.11302549965541, + "grad_norm": 9.3693208694458, + "learning_rate": 4.886489007308094e-08, + "logits/chosen": -2.539620876312256, + "logits/rejected": -2.5406689643859863, + "logps/chosen": -99.10316467285156, + "logps/rejected": -126.43525695800781, + "loss": 0.6064, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.4722733497619629, + "rewards/margins": 0.21284833550453186, + "rewards/rejected": -0.6851217150688171, "step": 6460 }, { - "epoch": 1.11, - "grad_norm": 27.914141548982084, - "learning_rate": 2.4357287790301755e-07, - "logits/chosen": -1.2337547540664673, - "logits/rejected": -1.178056001663208, - "logps/chosen": -217.49169921875, - "logps/rejected": -328.74554443359375, - "loss": 0.4362, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.6474285125732422, - "rewards/margins": 1.122659683227539, - "rewards/rejected": -2.7700884342193604, + "epoch": 1.114748449345279, + "grad_norm": 8.055909156799316, + "learning_rate": 4.8714575580603515e-08, + "logits/chosen": -2.51084303855896, + "logits/rejected": -2.4892048835754395, + "logps/chosen": -98.69096374511719, + "logps/rejected": -110.8814926147461, + "loss": 0.6468, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.4592776298522949, + "rewards/margins": 0.13193976879119873, + "rewards/rejected": -0.5912173986434937, "step": 6470 }, { - "epoch": 1.12, - "grad_norm": 31.48390506247595, - "learning_rate": 2.428213635615902e-07, - "logits/chosen": -1.3232189416885376, - "logits/rejected": -1.2600330114364624, - "logps/chosen": -222.95278930664062, - "logps/rejected": -329.962158203125, - "loss": 0.4351, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.6635528802871704, - "rewards/margins": 1.1314866542816162, - "rewards/rejected": -2.795039415359497, + "epoch": 1.1164713990351482, + "grad_norm": 9.206692695617676, + "learning_rate": 4.856427271231805e-08, + "logits/chosen": -2.5963027477264404, + "logits/rejected": -2.5617220401763916, + "logps/chosen": -100.11074829101562, + "logps/rejected": -107.81233215332031, + "loss": 0.639, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.43541914224624634, + "rewards/margins": 0.13812783360481262, + "rewards/rejected": -0.5735469460487366, "step": 6480 }, { - "epoch": 1.12, - "grad_norm": 20.030188531568935, - "learning_rate": 2.420699141371251e-07, - "logits/chosen": -1.4895018339157104, - "logits/rejected": -1.4343178272247314, - "logps/chosen": -234.48953247070312, - "logps/rejected": -364.4090881347656, - "loss": 0.4536, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.8089115619659424, - "rewards/margins": 1.2942219972610474, - "rewards/rejected": -3.1031336784362793, + "epoch": 1.1181943487250172, + "grad_norm": 8.638041496276855, + "learning_rate": 4.841398282742503e-08, + "logits/chosen": -2.7634589672088623, + "logits/rejected": -2.751997947692871, + "logps/chosen": -103.34661865234375, + "logps/rejected": -116.1749038696289, + "loss": 0.6466, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.49726349115371704, + "rewards/margins": 0.12319080531597137, + "rewards/rejected": -0.6204543113708496, "step": 6490 }, { - "epoch": 1.12, - "grad_norm": 27.831603619813574, - "learning_rate": 2.41318536425037e-07, - "logits/chosen": -1.3722602128982544, - "logits/rejected": -1.3266656398773193, - "logps/chosen": -211.3208770751953, - "logps/rejected": -318.9163818359375, - "loss": 0.4055, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.551661491394043, - "rewards/margins": 1.1104365587234497, - "rewards/rejected": -2.6620981693267822, + "epoch": 1.1199172984148862, + "grad_norm": 9.092048645019531, + "learning_rate": 4.8263707285007393e-08, + "logits/chosen": -2.607485294342041, + "logits/rejected": -2.6008620262145996, + "logps/chosen": -104.70477294921875, + "logps/rejected": -114.68504333496094, + "loss": 0.6445, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.48498040437698364, + "rewards/margins": 0.13475123047828674, + "rewards/rejected": -0.6197316646575928, "step": 6500 }, { - "epoch": 1.12, - "eval_logits/chosen": -1.4495809078216553, - "eval_logits/rejected": -1.4227662086486816, - "eval_logps/chosen": -220.26852416992188, - "eval_logps/rejected": -266.0024108886719, - "eval_loss": 0.6262578964233398, - "eval_rewards/accuracies": 0.6656598448753357, - "eval_rewards/chosen": -1.6156466007232666, - "eval_rewards/margins": 0.4128037095069885, - "eval_rewards/rejected": -2.0284502506256104, - "eval_runtime": 356.7396, - "eval_samples_per_second": 12.065, - "eval_steps_per_second": 1.508, + "epoch": 1.1199172984148862, + "eval_logits/chosen": -2.6922686100006104, + "eval_logits/rejected": -2.686023235321045, + "eval_logps/chosen": -97.65068817138672, + "eval_logps/rejected": -109.87677001953125, + "eval_loss": 0.6626461148262024, + "eval_rewards/accuracies": 0.6096654534339905, + "eval_rewards/chosen": -0.38938793540000916, + "eval_rewards/margins": 0.0775785744190216, + "eval_rewards/rejected": -0.46696653962135315, + "eval_runtime": 359.8452, + "eval_samples_per_second": 11.961, + "eval_steps_per_second": 1.495, "step": 6500 }, { - "epoch": 1.12, - "grad_norm": 30.446262958118353, - "learning_rate": 2.4056723722009243e-07, - "logits/chosen": -1.3711057901382446, - "logits/rejected": -1.291212797164917, - "logps/chosen": -215.440673828125, - "logps/rejected": -331.94805908203125, - "loss": 0.4238, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.5865962505340576, - "rewards/margins": 1.203033447265625, - "rewards/rejected": -2.7896294593811035, + "epoch": 1.1216402481047554, + "grad_norm": 8.9795560836792, + "learning_rate": 4.811344744401849e-08, + "logits/chosen": -2.6200554370880127, + "logits/rejected": -2.576789617538452, + "logps/chosen": -104.20268249511719, + "logps/rejected": -114.16617584228516, + "loss": 0.6395, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.4741840362548828, + "rewards/margins": 0.1373899281024933, + "rewards/rejected": -0.6115739941596985, "step": 6510 }, { - "epoch": 1.12, - "grad_norm": 22.937766573953056, - "learning_rate": 2.39816023316348e-07, - "logits/chosen": -1.3640129566192627, - "logits/rejected": -1.3016841411590576, - "logps/chosen": -203.5388641357422, - "logps/rejected": -329.3097839355469, - "loss": 0.3885, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -1.499146819114685, - "rewards/margins": 1.2792606353759766, - "rewards/rejected": -2.778407573699951, + "epoch": 1.1233631977946243, + "grad_norm": 10.132994651794434, + "learning_rate": 4.796320466326961e-08, + "logits/chosen": -2.5829601287841797, + "logits/rejected": -2.55981707572937, + "logps/chosen": -98.28492736816406, + "logps/rejected": -107.94759368896484, + "loss": 0.6482, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.446734756231308, + "rewards/margins": 0.11816434562206268, + "rewards/rejected": -0.5648991465568542, "step": 6520 }, { - "epoch": 1.13, - "grad_norm": 20.155288956338, - "learning_rate": 2.3906490150708894e-07, - "logits/chosen": -1.3035330772399902, - "logits/rejected": -1.2258248329162598, - "logps/chosen": -207.00711059570312, - "logps/rejected": -357.16375732421875, - "loss": 0.3594, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.533954381942749, - "rewards/margins": 1.5234779119491577, - "rewards/rejected": -3.057432174682617, + "epoch": 1.1250861474844935, + "grad_norm": 8.41073226928711, + "learning_rate": 4.7812980301417786e-08, + "logits/chosen": -2.5432591438293457, + "logits/rejected": -2.5165724754333496, + "logps/chosen": -100.28233337402344, + "logps/rejected": -114.78585052490234, + "loss": 0.6264, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4665835499763489, + "rewards/margins": 0.16683372855186462, + "rewards/rejected": -0.6334173083305359, "step": 6530 }, { - "epoch": 1.13, - "grad_norm": 38.1595216556655, - "learning_rate": 2.3831387858476739e-07, - "logits/chosen": -1.3005788326263428, - "logits/rejected": -1.2370128631591797, - "logps/chosen": -241.73861694335938, - "logps/rejected": -354.9060363769531, - "loss": 0.4724, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.8435420989990234, - "rewards/margins": 1.1428686380386353, - "rewards/rejected": -2.986410617828369, + "epoch": 1.1268090971743625, + "grad_norm": 7.00799036026001, + "learning_rate": 4.766277571695348e-08, + "logits/chosen": -2.590916633605957, + "logits/rejected": -2.5647997856140137, + "logps/chosen": -109.76686096191406, + "logps/rejected": -120.7196273803711, + "loss": 0.6495, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5239251255989075, + "rewards/margins": 0.12022165954113007, + "rewards/rejected": -0.644146740436554, "step": 6540 }, { - "epoch": 1.13, - "grad_norm": 32.72488280276923, - "learning_rate": 2.3756296134094176e-07, - "logits/chosen": -1.2309355735778809, - "logits/rejected": -1.1715677976608276, - "logps/chosen": -238.2613067626953, - "logps/rejected": -352.8631286621094, - "loss": 0.45, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.8673267364501953, - "rewards/margins": 1.1446090936660767, - "rewards/rejected": -3.0119359493255615, + "epoch": 1.1285320468642315, + "grad_norm": 9.350703239440918, + "learning_rate": 4.751259226818835e-08, + "logits/chosen": -2.505497455596924, + "logits/rejected": -2.483640432357788, + "logps/chosen": -99.55348205566406, + "logps/rejected": -111.34088134765625, + "loss": 0.652, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.4800754487514496, + "rewards/margins": 0.11636264622211456, + "rewards/rejected": -0.5964381098747253, "step": 6550 }, { - "epoch": 1.13, - "grad_norm": 21.509539455997288, - "learning_rate": 2.368121565662142e-07, - "logits/chosen": -1.372521162033081, - "logits/rejected": -1.3001985549926758, - "logps/chosen": -221.03701782226562, - "logps/rejected": -343.4317626953125, - "loss": 0.4243, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.6504312753677368, - "rewards/margins": 1.2656335830688477, - "rewards/rejected": -2.916064739227295, + "epoch": 1.1302549965541007, + "grad_norm": 8.498332023620605, + "learning_rate": 4.736243131324284e-08, + "logits/chosen": -2.617701292037964, + "logits/rejected": -2.5795741081237793, + "logps/chosen": -102.66838073730469, + "logps/rejected": -115.81105041503906, + "loss": 0.6255, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.46664175391197205, + "rewards/margins": 0.17283225059509277, + "rewards/rejected": -0.6394740343093872, "step": 6560 }, { - "epoch": 1.13, - "grad_norm": 26.05014427851963, - "learning_rate": 2.3606147105017037e-07, - "logits/chosen": -1.3940800428390503, - "logits/rejected": -1.323072910308838, - "logps/chosen": -222.31124877929688, - "logps/rejected": -340.05926513671875, - "loss": 0.4147, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.6877048015594482, - "rewards/margins": 1.1861732006072998, - "rewards/rejected": -2.873878002166748, + "epoch": 1.1319779462439696, + "grad_norm": 9.389053344726562, + "learning_rate": 4.7212294210034075e-08, + "logits/chosen": -2.6099629402160645, + "logits/rejected": -2.5814719200134277, + "logps/chosen": -101.36234283447266, + "logps/rejected": -114.8719711303711, + "loss": 0.6357, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.477983295917511, + "rewards/margins": 0.14376316964626312, + "rewards/rejected": -0.6217464208602905, "step": 6570 }, { - "epoch": 1.13, - "grad_norm": 27.019728234474087, - "learning_rate": 2.3531091158131702e-07, - "logits/chosen": -1.4203673601150513, - "logits/rejected": -1.349675178527832, - "logps/chosen": -210.45614624023438, - "logps/rejected": -316.55548095703125, - "loss": 0.4356, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.5541622638702393, - "rewards/margins": 1.1116034984588623, - "rewards/rejected": -2.6657655239105225, + "epoch": 1.1337008959338388, + "grad_norm": 8.522909164428711, + "learning_rate": 4.70621823162634e-08, + "logits/chosen": -2.620063304901123, + "logits/rejected": -2.5842626094818115, + "logps/chosen": -103.08598327636719, + "logps/rejected": -108.29972839355469, + "loss": 0.6546, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.48057666420936584, + "rewards/margins": 0.10240229219198227, + "rewards/rejected": -0.5829789042472839, "step": 6580 }, { - "epoch": 1.14, - "grad_norm": 20.654019350018757, - "learning_rate": 2.3456048494702133e-07, - "logits/chosen": -1.360848069190979, - "logits/rejected": -1.2936543226242065, - "logps/chosen": -213.0439453125, - "logps/rejected": -346.81463623046875, - "loss": 0.4131, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.6021496057510376, - "rewards/margins": 1.3312013149261475, - "rewards/rejected": -2.9333510398864746, + "epoch": 1.1354238456237078, + "grad_norm": 10.206613540649414, + "learning_rate": 4.6912096989404264e-08, + "logits/chosen": -2.57188081741333, + "logits/rejected": -2.5505259037017822, + "logps/chosen": -99.1366958618164, + "logps/rejected": -112.9898681640625, + "loss": 0.6419, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.4629674553871155, + "rewards/margins": 0.13212160766124725, + "rewards/rejected": -0.5950890779495239, "step": 6590 }, { - "epoch": 1.14, - "grad_norm": 39.76165527082194, - "learning_rate": 2.3381019793344897e-07, - "logits/chosen": -1.4293988943099976, - "logits/rejected": -1.3686877489089966, - "logps/chosen": -219.72946166992188, - "logps/rejected": -342.87799072265625, - "loss": 0.4373, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.6410694122314453, - "rewards/margins": 1.252076268196106, - "rewards/rejected": -2.893145799636841, + "epoch": 1.1371467953135768, + "grad_norm": 9.817237854003906, + "learning_rate": 4.6762039586689795e-08, + "logits/chosen": -2.66290020942688, + "logits/rejected": -2.647390604019165, + "logps/chosen": -102.1463394165039, + "logps/rejected": -112.91239929199219, + "loss": 0.6438, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4651399552822113, + "rewards/margins": 0.12810549139976501, + "rewards/rejected": -0.5932454466819763, "step": 6600 }, { - "epoch": 1.14, - "eval_logits/chosen": -1.4153751134872437, - "eval_logits/rejected": -1.3869836330413818, - "eval_logps/chosen": -250.33335876464844, - "eval_logps/rejected": -302.05145263671875, - "eval_loss": 0.6318928003311157, - "eval_rewards/accuracies": 0.6614776849746704, - "eval_rewards/chosen": -1.9162949323654175, - "eval_rewards/margins": 0.4726457893848419, - "eval_rewards/rejected": -2.3889405727386475, - "eval_runtime": 356.6448, - "eval_samples_per_second": 12.068, - "eval_steps_per_second": 1.509, + "epoch": 1.1371467953135768, + "eval_logits/chosen": -2.687708854675293, + "eval_logits/rejected": -2.6813712120056152, + "eval_logps/chosen": -97.78389739990234, + "eval_logps/rejected": -110.0129165649414, + "eval_loss": 0.6627377271652222, + "eval_rewards/accuracies": 0.607342004776001, + "eval_rewards/chosen": -0.3907199501991272, + "eval_rewards/margins": 0.07760793715715408, + "eval_rewards/rejected": -0.4683278203010559, + "eval_runtime": 359.7714, + "eval_samples_per_second": 11.963, + "eval_steps_per_second": 1.495, "step": 6600 }, { - "epoch": 1.14, - "grad_norm": 18.9597145646942, - "learning_rate": 2.3306005732550337e-07, - "logits/chosen": -1.3483235836029053, - "logits/rejected": -1.287246584892273, - "logps/chosen": -246.5947265625, - "logps/rejected": -377.0596618652344, - "loss": 0.3971, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.9461562633514404, - "rewards/margins": 1.3069889545440674, - "rewards/rejected": -3.253145217895508, + "epoch": 1.138869745003446, + "grad_norm": 8.432068824768066, + "learning_rate": 4.661201146510068e-08, + "logits/chosen": -2.617776393890381, + "logits/rejected": -2.600355863571167, + "logps/chosen": -98.23858642578125, + "logps/rejected": -113.22637939453125, + "loss": 0.6321, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.46272364258766174, + "rewards/margins": 0.15199866890907288, + "rewards/rejected": -0.6147223114967346, "step": 6610 }, { - "epoch": 1.14, - "grad_norm": 34.30473163237968, - "learning_rate": 2.3231006990676365e-07, - "logits/chosen": -1.3247897624969482, - "logits/rejected": -1.2637712955474854, - "logps/chosen": -252.6153106689453, - "logps/rejected": -367.00933837890625, - "loss": 0.4838, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.0022428035736084, - "rewards/margins": 1.1463488340377808, - "rewards/rejected": -3.148591995239258, + "epoch": 1.140592694693315, + "grad_norm": 9.28195571899414, + "learning_rate": 4.646201398135273e-08, + "logits/chosen": -2.5815634727478027, + "logits/rejected": -2.560616970062256, + "logps/chosen": -104.71697998046875, + "logps/rejected": -115.4776382446289, + "loss": 0.6548, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.523261308670044, + "rewards/margins": 0.10992765426635742, + "rewards/rejected": -0.6331889629364014, "step": 6620 }, { - "epoch": 1.14, - "grad_norm": 27.70016442005097, - "learning_rate": 2.3156024245942394e-07, - "logits/chosen": -1.3318690061569214, - "logits/rejected": -1.2696187496185303, - "logps/chosen": -205.23562622070312, - "logps/rejected": -317.63372802734375, - "loss": 0.4009, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.5194008350372314, - "rewards/margins": 1.1651160717010498, - "rewards/rejected": -2.6845173835754395, + "epoch": 1.1423156443831841, + "grad_norm": 8.086362838745117, + "learning_rate": 4.6312048491884784e-08, + "logits/chosen": -2.5562520027160645, + "logits/rejected": -2.5318968296051025, + "logps/chosen": -100.54441833496094, + "logps/rejected": -106.32527923583984, + "loss": 0.6566, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.47232919931411743, + "rewards/margins": 0.0990000069141388, + "rewards/rejected": -0.5713291764259338, "step": 6630 }, { - "epoch": 1.14, - "grad_norm": 19.533684280783113, - "learning_rate": 2.3081058176423148e-07, - "logits/chosen": -1.4036105871200562, - "logits/rejected": -1.3376753330230713, - "logps/chosen": -224.6147003173828, - "logps/rejected": -337.85577392578125, - "loss": 0.4359, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.7151901721954346, - "rewards/margins": 1.1676080226898193, - "rewards/rejected": -2.8827977180480957, + "epoch": 1.144038594073053, + "grad_norm": 7.732243061065674, + "learning_rate": 4.6162116352846295e-08, + "logits/chosen": -2.6114213466644287, + "logits/rejected": -2.577406644821167, + "logps/chosen": -102.87176513671875, + "logps/rejected": -110.48823547363281, + "loss": 0.6529, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.49765920639038086, + "rewards/margins": 0.11124183237552643, + "rewards/rejected": -0.6089010238647461, "step": 6640 }, { - "epoch": 1.15, - "grad_norm": 25.725355566297086, - "learning_rate": 2.300610946004256e-07, - "logits/chosen": -1.449748158454895, - "logits/rejected": -1.3810780048370361, - "logps/chosen": -218.1935577392578, - "logps/rejected": -359.31268310546875, - "loss": 0.3906, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.6608730554580688, - "rewards/margins": 1.3988673686981201, - "rewards/rejected": -3.0597405433654785, + "epoch": 1.145761543762922, + "grad_norm": 9.960616111755371, + "learning_rate": 4.6012218920085124e-08, + "logits/chosen": -2.6517696380615234, + "logits/rejected": -2.6302433013916016, + "logps/chosen": -99.01194763183594, + "logps/rejected": -117.2528076171875, + "loss": 0.6276, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.46898430585861206, + "rewards/margins": 0.17000864446163177, + "rewards/rejected": -0.638992965221405, "step": 6650 }, { - "epoch": 1.15, - "grad_norm": 22.25653908814933, - "learning_rate": 2.2931178774567662e-07, - "logits/chosen": -1.4511274099349976, - "logits/rejected": -1.389211654663086, - "logps/chosen": -204.99295043945312, - "logps/rejected": -340.06597900390625, - "loss": 0.4019, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.5325791835784912, - "rewards/margins": 1.326608657836914, - "rewards/rejected": -2.859187602996826, + "epoch": 1.1474844934527912, + "grad_norm": 7.823163986206055, + "learning_rate": 4.586235754913532e-08, + "logits/chosen": -2.6741387844085693, + "logits/rejected": -2.6565566062927246, + "logps/chosen": -98.02713775634766, + "logps/rejected": -115.93632507324219, + "loss": 0.6311, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.46266403794288635, + "rewards/margins": 0.1550600379705429, + "rewards/rejected": -0.6177240610122681, "step": 6660 }, { - "epoch": 1.15, - "grad_norm": 25.109705271958312, - "learning_rate": 2.285626679760239e-07, - "logits/chosen": -1.3574293851852417, - "logits/rejected": -1.300843596458435, - "logps/chosen": -223.3512420654297, - "logps/rejected": -382.12548828125, - "loss": 0.3859, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -1.7345237731933594, - "rewards/margins": 1.5338653326034546, - "rewards/rejected": -3.2683892250061035, + "epoch": 1.1492074431426602, + "grad_norm": 9.069050788879395, + "learning_rate": 4.5712533595204785e-08, + "logits/chosen": -2.5663323402404785, + "logits/rejected": -2.5679333209991455, + "logps/chosen": -100.11381530761719, + "logps/rejected": -125.8458480834961, + "loss": 0.6122, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5020694732666016, + "rewards/margins": 0.20346996188163757, + "rewards/rejected": -0.7055394649505615, "step": 6670 }, { - "epoch": 1.15, - "grad_norm": 36.664785759024156, - "learning_rate": 2.278137420658154e-07, - "logits/chosen": -1.3482401371002197, - "logits/rejected": -1.2863205671310425, - "logps/chosen": -229.06997680664062, - "logps/rejected": -332.17108154296875, - "loss": 0.4906, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.7523906230926514, - "rewards/margins": 1.0489892959594727, - "rewards/rejected": -2.8013803958892822, + "epoch": 1.1509303928325294, + "grad_norm": 9.263232231140137, + "learning_rate": 4.5562748413163086e-08, + "logits/chosen": -2.5756216049194336, + "logits/rejected": -2.546689748764038, + "logps/chosen": -101.28739166259766, + "logps/rejected": -112.49143981933594, + "loss": 0.6429, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4746217131614685, + "rewards/margins": 0.12963800132274628, + "rewards/rejected": -0.604259729385376, "step": 6680 }, { - "epoch": 1.15, - "grad_norm": 21.93167754697323, - "learning_rate": 2.270650167876456e-07, - "logits/chosen": -1.3556606769561768, - "logits/rejected": -1.294721245765686, - "logps/chosen": -201.72213745117188, - "logps/rejected": -337.44561767578125, - "loss": 0.3821, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.4457823038101196, - "rewards/margins": 1.3707009553909302, - "rewards/rejected": -2.81648325920105, + "epoch": 1.1526533425223984, + "grad_norm": 7.940311431884766, + "learning_rate": 4.5413003357529115e-08, + "logits/chosen": -2.5793943405151367, + "logits/rejected": -2.559263229370117, + "logps/chosen": -101.38301086425781, + "logps/rejected": -119.2028579711914, + "loss": 0.6141, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4422333240509033, + "rewards/margins": 0.19144585728645325, + "rewards/rejected": -0.6336792707443237, "step": 6690 }, { - "epoch": 1.15, - "grad_norm": 45.51069055649989, - "learning_rate": 2.2631649891229502e-07, - "logits/chosen": -1.3424584865570068, - "logits/rejected": -1.2963857650756836, - "logps/chosen": -230.5055694580078, - "logps/rejected": -343.5107421875, - "loss": 0.4568, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.7437101602554321, - "rewards/margins": 1.1325562000274658, - "rewards/rejected": -2.8762667179107666, + "epoch": 1.1543762922122673, + "grad_norm": 9.524909973144531, + "learning_rate": 4.5263299782459e-08, + "logits/chosen": -2.581721544265747, + "logits/rejected": -2.5713274478912354, + "logps/chosen": -104.95369720458984, + "logps/rejected": -118.57572937011719, + "loss": 0.6411, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4882546365261078, + "rewards/margins": 0.13852766156196594, + "rewards/rejected": -0.6267822980880737, "step": 6700 }, { - "epoch": 1.15, - "eval_logits/chosen": -1.4418696165084839, - "eval_logits/rejected": -1.4138153791427612, - "eval_logps/chosen": -229.56253051757812, - "eval_logps/rejected": -278.3695983886719, - "eval_loss": 0.6346877813339233, - "eval_rewards/accuracies": 0.6575278639793396, - "eval_rewards/chosen": -1.7085868120193481, - "eval_rewards/margins": 0.44353532791137695, - "eval_rewards/rejected": -2.1521220207214355, - "eval_runtime": 356.6768, - "eval_samples_per_second": 12.067, - "eval_steps_per_second": 1.508, + "epoch": 1.1543762922122673, + "eval_logits/chosen": -2.6791462898254395, + "eval_logits/rejected": -2.672869920730591, + "eval_logps/chosen": -98.66951751708984, + "eval_logps/rejected": -111.08659362792969, + "eval_loss": 0.6621683835983276, + "eval_rewards/accuracies": 0.6122211813926697, + "eval_rewards/chosen": -0.39957618713378906, + "eval_rewards/margins": 0.07948849350214005, + "eval_rewards/rejected": -0.4790646433830261, + "eval_runtime": 359.782, + "eval_samples_per_second": 11.963, + "eval_steps_per_second": 1.495, "step": 6700 }, { - "epoch": 1.16, - "grad_norm": 25.6076080920141, - "learning_rate": 2.2556819520866828e-07, - "logits/chosen": -1.3505706787109375, - "logits/rejected": -1.2714554071426392, - "logps/chosen": -215.65658569335938, - "logps/rejected": -377.618408203125, - "loss": 0.3473, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.5876433849334717, - "rewards/margins": 1.653158187866211, - "rewards/rejected": -3.2408013343811035, + "epoch": 1.1560992419021365, + "grad_norm": 7.029930591583252, + "learning_rate": 4.5113639041733654e-08, + "logits/chosen": -2.5752780437469482, + "logits/rejected": -2.5508055686950684, + "logps/chosen": -105.0222396850586, + "logps/rejected": -118.96745300292969, + "loss": 0.6333, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.48097047209739685, + "rewards/margins": 0.17322038114070892, + "rewards/rejected": -0.6541908383369446, "step": 6710 }, { - "epoch": 1.16, - "grad_norm": 25.373104312017844, - "learning_rate": 2.2482011244373357e-07, - "logits/chosen": -1.3233754634857178, - "logits/rejected": -1.2596690654754639, - "logps/chosen": -218.81887817382812, - "logps/rejected": -363.94854736328125, - "loss": 0.3888, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.674155592918396, - "rewards/margins": 1.4498602151870728, - "rewards/rejected": -3.1240158081054688, + "epoch": 1.1578221915920055, + "grad_norm": 9.368919372558594, + "learning_rate": 4.496402248874671e-08, + "logits/chosen": -2.564828634262085, + "logits/rejected": -2.550750255584717, + "logps/chosen": -97.69122314453125, + "logps/rejected": -116.96647644042969, + "loss": 0.6179, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.46276330947875977, + "rewards/margins": 0.19141033291816711, + "rewards/rejected": -0.6541736721992493, "step": 6720 }, { - "epoch": 1.16, - "grad_norm": 36.35012540803045, - "learning_rate": 2.2407225738246074e-07, - "logits/chosen": -1.2628940343856812, - "logits/rejected": -1.2141722440719604, - "logps/chosen": -245.4576416015625, - "logps/rejected": -356.5597229003906, - "loss": 0.4927, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.8878835439682007, - "rewards/margins": 1.1251258850097656, - "rewards/rejected": -3.013009548187256, + "epoch": 1.1595451412818747, + "grad_norm": 9.316107749938965, + "learning_rate": 4.4814451476492146e-08, + "logits/chosen": -2.524674892425537, + "logits/rejected": -2.5094120502471924, + "logps/chosen": -105.376708984375, + "logps/rejected": -116.1399154663086, + "loss": 0.6477, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4867584705352783, + "rewards/margins": 0.12187260389328003, + "rewards/rejected": -0.6086310744285583, "step": 6730 }, { - "epoch": 1.16, - "grad_norm": 30.43443111435983, - "learning_rate": 2.233246367877609e-07, - "logits/chosen": -1.3312593698501587, - "logits/rejected": -1.2748968601226807, - "logps/chosen": -217.5796356201172, - "logps/rejected": -356.3081970214844, - "loss": 0.4072, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.6314865350723267, - "rewards/margins": 1.3706390857696533, - "rewards/rejected": -3.0021252632141113, + "epoch": 1.1612680909717437, + "grad_norm": 10.971795082092285, + "learning_rate": 4.466492735755218e-08, + "logits/chosen": -2.587489128112793, + "logits/rejected": -2.579017400741577, + "logps/chosen": -102.76058197021484, + "logps/rejected": -120.98915100097656, + "loss": 0.627, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.48327407240867615, + "rewards/margins": 0.16570714116096497, + "rewards/rejected": -0.6489812731742859, "step": 6740 }, { - "epoch": 1.16, - "grad_norm": 19.110570274965703, - "learning_rate": 2.2257725742042438e-07, - "logits/chosen": -1.3627091646194458, - "logits/rejected": -1.3030933141708374, - "logps/chosen": -224.2344512939453, - "logps/rejected": -367.8368225097656, - "loss": 0.4003, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.7180770635604858, - "rewards/margins": 1.4236048460006714, - "rewards/rejected": -3.1416821479797363, + "epoch": 1.1629910406616126, + "grad_norm": 9.019688606262207, + "learning_rate": 4.4515451484084875e-08, + "logits/chosen": -2.622640609741211, + "logits/rejected": -2.608363151550293, + "logps/chosen": -104.0699462890625, + "logps/rejected": -121.1629409790039, + "loss": 0.6334, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5161177515983582, + "rewards/margins": 0.15863685309886932, + "rewards/rejected": -0.6747546195983887, "step": 6750 }, { - "epoch": 1.16, - "grad_norm": 48.40608306760337, - "learning_rate": 2.2183012603906066e-07, - "logits/chosen": -1.312281608581543, - "logits/rejected": -1.2430318593978882, - "logps/chosen": -216.72750854492188, - "logps/rejected": -338.98541259765625, - "loss": 0.4699, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.6467088460922241, - "rewards/margins": 1.255919337272644, - "rewards/rejected": -2.902627944946289, + "epoch": 1.1647139903514818, + "grad_norm": 9.12403678894043, + "learning_rate": 4.436602520781213e-08, + "logits/chosen": -2.5706498622894287, + "logits/rejected": -2.5449025630950928, + "logps/chosen": -101.08219146728516, + "logps/rejected": -112.35832214355469, + "loss": 0.6398, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4902876317501068, + "rewards/margins": 0.14592550694942474, + "rewards/rejected": -0.6362131834030151, "step": 6760 }, { - "epoch": 1.17, - "grad_norm": 41.406179970761336, - "learning_rate": 2.2108324940003606e-07, - "logits/chosen": -1.3574762344360352, - "logits/rejected": -1.3120397329330444, - "logps/chosen": -211.9196014404297, - "logps/rejected": -334.21173095703125, - "loss": 0.433, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.5811679363250732, - "rewards/margins": 1.2045514583587646, - "rewards/rejected": -2.7857189178466797, + "epoch": 1.1664369400413508, + "grad_norm": 8.566431999206543, + "learning_rate": 4.4216649880007214e-08, + "logits/chosen": -2.586684465408325, + "logits/rejected": -2.579789876937866, + "logps/chosen": -100.56122589111328, + "logps/rejected": -118.85711669921875, + "loss": 0.6268, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.46726298332214355, + "rewards/margins": 0.1650046408176422, + "rewards/rejected": -0.6322677135467529, "step": 6770 }, { - "epoch": 1.17, - "grad_norm": 35.196901180748505, - "learning_rate": 2.2033663425741378e-07, - "logits/chosen": -1.3661503791809082, - "logits/rejected": -1.2911349534988403, - "logps/chosen": -219.1353302001953, - "logps/rejected": -340.84149169921875, - "loss": 0.4354, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.6264568567276, - "rewards/margins": 1.2487623691558838, - "rewards/rejected": -2.8752193450927734, + "epoch": 1.1681598897312198, + "grad_norm": 10.997673988342285, + "learning_rate": 4.4067326851482754e-08, + "logits/chosen": -2.5771098136901855, + "logits/rejected": -2.5475375652313232, + "logps/chosen": -108.99159240722656, + "logps/rejected": -116.57635498046875, + "loss": 0.6568, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5248969197273254, + "rewards/margins": 0.1076025515794754, + "rewards/rejected": -0.6324995756149292, "step": 6780 }, { - "epoch": 1.17, - "grad_norm": 30.125071463453715, - "learning_rate": 2.1959028736289184e-07, - "logits/chosen": -1.3736763000488281, - "logits/rejected": -1.3135449886322021, - "logps/chosen": -205.2973175048828, - "logps/rejected": -336.26531982421875, - "loss": 0.4184, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.5477122068405151, - "rewards/margins": 1.2838819026947021, - "rewards/rejected": -2.8315939903259277, + "epoch": 1.169882839421089, + "grad_norm": 8.650938987731934, + "learning_rate": 4.391805747257837e-08, + "logits/chosen": -2.5931596755981445, + "logits/rejected": -2.577589511871338, + "logps/chosen": -98.49267578125, + "logps/rejected": -118.1158218383789, + "loss": 0.6256, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.47968751192092896, + "rewards/margins": 0.17027659714221954, + "rewards/rejected": -0.6499640941619873, "step": 6790 }, { - "epoch": 1.17, - "grad_norm": 20.154181246091905, - "learning_rate": 2.1884421546574288e-07, - "logits/chosen": -1.2408316135406494, - "logits/rejected": -1.166013240814209, - "logps/chosen": -215.8163299560547, - "logps/rejected": -352.4253234863281, - "loss": 0.396, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -1.634212851524353, - "rewards/margins": 1.3892707824707031, - "rewards/rejected": -3.0234837532043457, + "epoch": 1.171605789110958, + "grad_norm": 6.909446716308594, + "learning_rate": 4.3768843093148576e-08, + "logits/chosen": -2.5084850788116455, + "logits/rejected": -2.4819085597991943, + "logps/chosen": -101.25658416748047, + "logps/rejected": -116.91668701171875, + "loss": 0.6224, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.4883821904659271, + "rewards/margins": 0.17982549965381622, + "rewards/rejected": -0.6682077050209045, "step": 6800 }, { - "epoch": 1.17, - "eval_logits/chosen": -1.4073745012283325, - "eval_logits/rejected": -1.3791805505752563, - "eval_logps/chosen": -242.52587890625, - "eval_logps/rejected": -293.1243896484375, - "eval_loss": 0.6304371356964111, - "eval_rewards/accuracies": 0.669377326965332, - "eval_rewards/chosen": -1.8382201194763184, - "eval_rewards/margins": 0.4614499807357788, - "eval_rewards/rejected": -2.2996702194213867, - "eval_runtime": 356.7418, - "eval_samples_per_second": 12.065, - "eval_steps_per_second": 1.508, + "epoch": 1.171605789110958, + "eval_logits/chosen": -2.66884446144104, + "eval_logits/rejected": -2.662496566772461, + "eval_logps/chosen": -100.33702850341797, + "eval_logps/rejected": -112.9988021850586, + "eval_loss": 0.6614071130752563, + "eval_rewards/accuracies": 0.6115241646766663, + "eval_rewards/chosen": -0.4162512421607971, + "eval_rewards/margins": 0.08193553239107132, + "eval_rewards/rejected": -0.4981868267059326, + "eval_runtime": 359.807, + "eval_samples_per_second": 11.962, + "eval_steps_per_second": 1.495, "step": 6800 }, { - "epoch": 1.17, - "grad_norm": 40.48439094660076, - "learning_rate": 2.1809842531275234e-07, - "logits/chosen": -1.3060812950134277, - "logits/rejected": -1.2478914260864258, - "logps/chosen": -250.30685424804688, - "logps/rejected": -360.6299133300781, - "loss": 0.458, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.9459367990493774, - "rewards/margins": 1.1430813074111938, - "rewards/rejected": -3.0890181064605713, + "epoch": 1.173328738800827, + "grad_norm": 8.090224266052246, + "learning_rate": 4.361968506255046e-08, + "logits/chosen": -2.56339168548584, + "logits/rejected": -2.5459580421447754, + "logps/chosen": -104.08646392822266, + "logps/rejected": -118.2791519165039, + "loss": 0.6255, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.48382797837257385, + "rewards/margins": 0.18149612843990326, + "rewards/rejected": -0.6653240919113159, "step": 6810 }, { - "epoch": 1.18, - "grad_norm": 22.745532083063072, - "learning_rate": 2.173529236481581e-07, - "logits/chosen": -1.3810464143753052, - "logits/rejected": -1.3168919086456299, - "logps/chosen": -248.2955322265625, - "logps/rejected": -380.9482421875, - "loss": 0.4335, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.9328635931015015, - "rewards/margins": 1.3468307256698608, - "rewards/rejected": -3.279694080352783, + "epoch": 1.175051688490696, + "grad_norm": 10.138392448425293, + "learning_rate": 4.347058472963162e-08, + "logits/chosen": -2.6586554050445557, + "logits/rejected": -2.630859136581421, + "logps/chosen": -106.34110260009766, + "logps/rejected": -117.0941390991211, + "loss": 0.6443, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5131762027740479, + "rewards/margins": 0.1275903284549713, + "rewards/rejected": -0.6407665014266968, "step": 6820 }, { - "epoch": 1.18, - "grad_norm": 29.05046249095613, - "learning_rate": 2.1660771721358898e-07, - "logits/chosen": -1.4409806728363037, - "logits/rejected": -1.3872400522232056, - "logps/chosen": -220.9757080078125, - "logps/rejected": -354.67041015625, - "loss": 0.4077, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.6707779169082642, - "rewards/margins": 1.3462539911270142, - "rewards/rejected": -3.0170319080352783, + "epoch": 1.176774638180565, + "grad_norm": 8.95630168914795, + "learning_rate": 4.3321543442717796e-08, + "logits/chosen": -2.6672542095184326, + "logits/rejected": -2.660428285598755, + "logps/chosen": -102.14369201660156, + "logps/rejected": -114.99869537353516, + "loss": 0.6439, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4824022650718689, + "rewards/margins": 0.1374589502811432, + "rewards/rejected": -0.6198612451553345, "step": 6830 }, { - "epoch": 1.18, - "grad_norm": 26.092849893502574, - "learning_rate": 2.1586281274800433e-07, - "logits/chosen": -1.4010366201400757, - "logits/rejected": -1.3389381170272827, - "logps/chosen": -229.3477020263672, - "logps/rejected": -351.7111511230469, - "loss": 0.4488, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.7241207361221313, - "rewards/margins": 1.239141583442688, - "rewards/rejected": -2.9632620811462402, + "epoch": 1.1784975878704342, + "grad_norm": 8.868681907653809, + "learning_rate": 4.3172562549600866e-08, + "logits/chosen": -2.621293067932129, + "logits/rejected": -2.5976157188415527, + "logps/chosen": -108.63401794433594, + "logps/rejected": -121.0826644897461, + "loss": 0.6456, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5170341730117798, + "rewards/margins": 0.1398032307624817, + "rewards/rejected": -0.6568374633789062, "step": 6840 }, { - "epoch": 1.18, - "grad_norm": 31.755734616992513, - "learning_rate": 2.151182169876325e-07, - "logits/chosen": -1.3103221654891968, - "logits/rejected": -1.243399739265442, - "logps/chosen": -210.047607421875, - "logps/rejected": -330.6673278808594, - "loss": 0.438, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.559027075767517, - "rewards/margins": 1.2114872932434082, - "rewards/rejected": -2.7705142498016357, + "epoch": 1.1802205375603032, + "grad_norm": 8.628061294555664, + "learning_rate": 4.3023643397526496e-08, + "logits/chosen": -2.4805750846862793, + "logits/rejected": -2.4513936042785645, + "logps/chosen": -103.87449645996094, + "logps/rejected": -119.28651428222656, + "loss": 0.6333, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.49710139632225037, + "rewards/margins": 0.15935452282428741, + "rewards/rejected": -0.656455934047699, "step": 6850 }, { - "epoch": 1.18, - "grad_norm": 23.745751034555475, - "learning_rate": 2.143739366659102e-07, - "logits/chosen": -1.4120018482208252, - "logits/rejected": -1.3412996530532837, - "logps/chosen": -229.89297485351562, - "logps/rejected": -333.34490966796875, - "loss": 0.4456, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.707727074623108, - "rewards/margins": 1.1025744676589966, - "rewards/rejected": -2.8103013038635254, + "epoch": 1.1819434872501722, + "grad_norm": 9.598288536071777, + "learning_rate": 4.287478733318204e-08, + "logits/chosen": -2.5880210399627686, + "logits/rejected": -2.553973913192749, + "logps/chosen": -112.11553955078125, + "logps/rejected": -120.29103088378906, + "loss": 0.6355, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5300083756446838, + "rewards/margins": 0.14968501031398773, + "rewards/rejected": -0.6796934008598328, "step": 6860 }, { - "epoch": 1.18, - "grad_norm": 24.98697965563167, - "learning_rate": 2.1362997851342186e-07, - "logits/chosen": -1.300405502319336, - "logits/rejected": -1.2553117275238037, - "logps/chosen": -233.50424194335938, - "logps/rejected": -346.513671875, - "loss": 0.4593, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.8242870569229126, - "rewards/margins": 1.1232497692108154, - "rewards/rejected": -2.9475369453430176, + "epoch": 1.1836664369400414, + "grad_norm": 9.507317543029785, + "learning_rate": 4.272599570268437e-08, + "logits/chosen": -2.5000433921813965, + "logits/rejected": -2.4891629219055176, + "logps/chosen": -104.8589859008789, + "logps/rejected": -119.49824523925781, + "loss": 0.6399, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5377354621887207, + "rewards/margins": 0.13954982161521912, + "rewards/rejected": -0.6772853136062622, "step": 6870 }, { - "epoch": 1.19, - "grad_norm": 28.460821723801782, - "learning_rate": 2.1288634925783817e-07, - "logits/chosen": -1.3697658777236938, - "logits/rejected": -1.2964236736297607, - "logps/chosen": -220.921630859375, - "logps/rejected": -353.2587890625, - "loss": 0.3797, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.6666065454483032, - "rewards/margins": 1.3543717861175537, - "rewards/rejected": -3.0209782123565674, + "epoch": 1.1853893866299103, + "grad_norm": 8.982834815979004, + "learning_rate": 4.257726985156763e-08, + "logits/chosen": -2.6045875549316406, + "logits/rejected": -2.5753495693206787, + "logps/chosen": -107.98268127441406, + "logps/rejected": -120.47676086425781, + "loss": 0.6346, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5371975302696228, + "rewards/margins": 0.15595701336860657, + "rewards/rejected": -0.6931546330451965, "step": 6880 }, { - "epoch": 1.19, - "grad_norm": 28.677786254784447, - "learning_rate": 2.121430556238559e-07, - "logits/chosen": -1.3057619333267212, - "logits/rejected": -1.2380374670028687, - "logps/chosen": -215.5825958251953, - "logps/rejected": -367.3506774902344, - "loss": 0.3409, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -1.6328455209732056, - "rewards/margins": 1.516847014427185, - "rewards/rejected": -3.1496922969818115, + "epoch": 1.1871123363197795, + "grad_norm": 8.530778884887695, + "learning_rate": 4.2428611124771177e-08, + "logits/chosen": -2.5295612812042236, + "logits/rejected": -2.5056262016296387, + "logps/chosen": -101.05653381347656, + "logps/rejected": -117.50874328613281, + "loss": 0.633, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.48746204376220703, + "rewards/margins": 0.16373209655284882, + "rewards/rejected": -0.6511940956115723, "step": 6890 }, { - "epoch": 1.19, - "grad_norm": 43.60415706308666, - "learning_rate": 2.1140010433313642e-07, - "logits/chosen": -1.3161351680755615, - "logits/rejected": -1.2549692392349243, - "logps/chosen": -244.4730682373047, - "logps/rejected": -367.92681884765625, - "loss": 0.4312, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.9093977212905884, - "rewards/margins": 1.235694169998169, - "rewards/rejected": -3.1450917720794678, + "epoch": 1.1888352860096485, + "grad_norm": 9.275869369506836, + "learning_rate": 4.2280020866627286e-08, + "logits/chosen": -2.5908186435699463, + "logits/rejected": -2.5705745220184326, + "logps/chosen": -104.81771087646484, + "logps/rejected": -118.31611633300781, + "loss": 0.6437, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5126312971115112, + "rewards/margins": 0.13616414368152618, + "rewards/rejected": -0.648795485496521, "step": 6900 }, { - "epoch": 1.19, - "eval_logits/chosen": -1.3852744102478027, - "eval_logits/rejected": -1.3564972877502441, - "eval_logps/chosen": -266.2965393066406, - "eval_logps/rejected": -320.25164794921875, - "eval_loss": 0.6330453157424927, - "eval_rewards/accuracies": 0.6644981503486633, - "eval_rewards/chosen": -2.0759267807006836, - "eval_rewards/margins": 0.4950157105922699, - "eval_rewards/rejected": -2.5709426403045654, - "eval_runtime": 356.8834, - "eval_samples_per_second": 12.06, - "eval_steps_per_second": 1.507, + "epoch": 1.1888352860096485, + "eval_logits/chosen": -2.6617848873138428, + "eval_logits/rejected": -2.655430793762207, + "eval_logps/chosen": -101.02922821044922, + "eval_logps/rejected": -113.82203674316406, + "eval_loss": 0.6610434055328369, + "eval_rewards/accuracies": 0.6105948090553284, + "eval_rewards/chosen": -0.4231734275817871, + "eval_rewards/margins": 0.08324573189020157, + "eval_rewards/rejected": -0.5064191222190857, + "eval_runtime": 360.3467, + "eval_samples_per_second": 11.944, + "eval_steps_per_second": 1.493, "step": 6900 }, { - "epoch": 1.19, - "grad_norm": 33.10823667565031, - "learning_rate": 2.1065750210424572e-07, - "logits/chosen": -1.3516546487808228, - "logits/rejected": -1.280491828918457, - "logps/chosen": -246.9800262451172, - "logps/rejected": -387.73577880859375, - "loss": 0.4107, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.9011551141738892, - "rewards/margins": 1.4241554737091064, - "rewards/rejected": -3.3253109455108643, + "epoch": 1.1905582356995175, + "grad_norm": 9.048493385314941, + "learning_rate": 4.213150042084914e-08, + "logits/chosen": -2.6164145469665527, + "logits/rejected": -2.593142032623291, + "logps/chosen": -106.26557922363281, + "logps/rejected": -123.49296569824219, + "loss": 0.6194, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.49379563331604004, + "rewards/margins": 0.18906860053539276, + "rewards/rejected": -0.6828643083572388, "step": 6910 }, { - "epoch": 1.19, - "grad_norm": 31.592188794374742, - "learning_rate": 2.099152556525926e-07, - "logits/chosen": -1.4136111736297607, - "logits/rejected": -1.3561625480651855, - "logps/chosen": -256.72418212890625, - "logps/rejected": -363.986083984375, - "loss": 0.4662, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.0063765048980713, - "rewards/margins": 1.0958820581436157, - "rewards/rejected": -3.1022586822509766, + "epoch": 1.1922811853893867, + "grad_norm": 9.733365058898926, + "learning_rate": 4.198305113051852e-08, + "logits/chosen": -2.6699588298797607, + "logits/rejected": -2.6404507160186768, + "logps/chosen": -109.824951171875, + "logps/rejected": -120.6965560913086, + "loss": 0.6499, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.537209689617157, + "rewards/margins": 0.13205452263355255, + "rewards/rejected": -0.6692641973495483, "step": 6920 }, { - "epoch": 1.19, - "grad_norm": 33.58926677055865, - "learning_rate": 2.0917337169036924e-07, - "logits/chosen": -1.2991350889205933, - "logits/rejected": -1.224484920501709, - "logps/chosen": -217.6683807373047, - "logps/rejected": -366.7851867675781, - "loss": 0.3657, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -1.6003036499023438, - "rewards/margins": 1.525007724761963, - "rewards/rejected": -3.1253113746643066, + "epoch": 1.1940041350792556, + "grad_norm": 9.200611114501953, + "learning_rate": 4.183467433807385e-08, + "logits/chosen": -2.495748519897461, + "logits/rejected": -2.4691426753997803, + "logps/chosen": -107.3687744140625, + "logps/rejected": -122.67182922363281, + "loss": 0.6207, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.4973491132259369, + "rewards/margins": 0.18685005605220795, + "rewards/rejected": -0.6841990947723389, "step": 6930 }, { - "epoch": 1.2, - "grad_norm": 40.25747613620437, - "learning_rate": 2.0843185692648911e-07, - "logits/chosen": -1.3118326663970947, - "logits/rejected": -1.2168563604354858, - "logps/chosen": -208.8804168701172, - "logps/rejected": -361.11444091796875, - "loss": 0.3828, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.509263038635254, - "rewards/margins": 1.596047043800354, - "rewards/rejected": -3.1053099632263184, + "epoch": 1.1957270847691248, + "grad_norm": 11.490638732910156, + "learning_rate": 4.168637138529783e-08, + "logits/chosen": -2.492746591567993, + "logits/rejected": -2.450110673904419, + "logps/chosen": -106.14799499511719, + "logps/rejected": -115.3590316772461, + "loss": 0.6308, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4818505346775055, + "rewards/margins": 0.16582930088043213, + "rewards/rejected": -0.64767986536026, "step": 6940 }, { - "epoch": 1.2, - "grad_norm": 24.84940318081975, - "learning_rate": 2.076907180665276e-07, - "logits/chosen": -1.3450183868408203, - "logits/rejected": -1.2781970500946045, - "logps/chosen": -219.59927368164062, - "logps/rejected": -362.3084411621094, - "loss": 0.3846, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.657480239868164, - "rewards/margins": 1.4361209869384766, - "rewards/rejected": -3.0936012268066406, + "epoch": 1.1974500344589938, + "grad_norm": 8.843438148498535, + "learning_rate": 4.153814361330552e-08, + "logits/chosen": -2.5467190742492676, + "logits/rejected": -2.530759572982788, + "logps/chosen": -103.19229888916016, + "logps/rejected": -122.7353515625, + "loss": 0.6129, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.4933549761772156, + "rewards/margins": 0.20461630821228027, + "rewards/rejected": -0.6979712247848511, "step": 6950 }, { - "epoch": 1.2, - "grad_norm": 30.803786620950167, - "learning_rate": 2.0694996181266027e-07, - "logits/chosen": -1.5233880281448364, - "logits/rejected": -1.4538953304290771, - "logps/chosen": -233.41616821289062, - "logps/rejected": -332.0858459472656, - "loss": 0.4962, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.781266450881958, - "rewards/margins": 1.062042474746704, - "rewards/rejected": -2.843308925628662, + "epoch": 1.1991729841488628, + "grad_norm": 9.876995086669922, + "learning_rate": 4.138999236253205e-08, + "logits/chosen": -2.7103123664855957, + "logits/rejected": -2.674586057662964, + "logps/chosen": -109.5539321899414, + "logps/rejected": -114.24405670166016, + "loss": 0.648, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.542354941368103, + "rewards/margins": 0.12227793782949448, + "rewards/rejected": -0.6646329164505005, "step": 6960 }, { - "epoch": 1.2, - "grad_norm": 27.683191395647043, - "learning_rate": 2.062095948636031e-07, - "logits/chosen": -1.4839107990264893, - "logits/rejected": -1.41178297996521, - "logps/chosen": -189.777587890625, - "logps/rejected": -317.9434509277344, - "loss": 0.3784, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -1.3524045944213867, - "rewards/margins": 1.3231462240219116, - "rewards/rejected": -2.675550937652588, + "epoch": 1.200895933838732, + "grad_norm": 12.366546630859375, + "learning_rate": 4.1241918972720626e-08, + "logits/chosen": -2.6229166984558105, + "logits/rejected": -2.5920138359069824, + "logps/chosen": -104.89115142822266, + "logps/rejected": -114.06379699707031, + "loss": 0.644, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5034934282302856, + "rewards/margins": 0.13324648141860962, + "rewards/rejected": -0.63673996925354, "step": 6970 }, { - "epoch": 1.2, - "grad_norm": 22.494868613952, - "learning_rate": 2.0546962391455128e-07, - "logits/chosen": -1.4198600053787231, - "logits/rejected": -1.3551172018051147, - "logps/chosen": -198.3609161376953, - "logps/rejected": -321.55157470703125, - "loss": 0.4203, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -1.4413096904754639, - "rewards/margins": 1.244518518447876, - "rewards/rejected": -2.685828447341919, + "epoch": 1.202618883528601, + "grad_norm": 8.891714096069336, + "learning_rate": 4.1093924782910256e-08, + "logits/chosen": -2.532543420791626, + "logits/rejected": -2.5042622089385986, + "logps/chosen": -105.50868225097656, + "logps/rejected": -124.03361511230469, + "loss": 0.6174, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5128332376480103, + "rewards/margins": 0.19743052124977112, + "rewards/rejected": -0.7102638483047485, "step": 6980 }, { - "epoch": 1.2, - "grad_norm": 36.82272039938233, - "learning_rate": 2.0473005565711924e-07, - "logits/chosen": -1.335599422454834, - "logits/rejected": -1.2768608331680298, - "logps/chosen": -214.334716796875, - "logps/rejected": -331.76312255859375, - "loss": 0.4589, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.5734632015228271, - "rewards/margins": 1.201757550239563, - "rewards/rejected": -2.7752208709716797, + "epoch": 1.20434183321847, + "grad_norm": 9.686083793640137, + "learning_rate": 4.094601113142385e-08, + "logits/chosen": -2.4580886363983154, + "logits/rejected": -2.447352170944214, + "logps/chosen": -110.0320053100586, + "logps/rejected": -122.3864517211914, + "loss": 0.6381, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5304861068725586, + "rewards/margins": 0.15075218677520752, + "rewards/rejected": -0.6812382936477661, "step": 6990 }, { - "epoch": 1.21, - "grad_norm": 28.270864329924283, - "learning_rate": 2.039908967792795e-07, - "logits/chosen": -1.5961410999298096, - "logits/rejected": -1.521150827407837, - "logps/chosen": -228.6524200439453, - "logps/rejected": -364.3278503417969, - "loss": 0.4144, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.7237600088119507, - "rewards/margins": 1.3727467060089111, - "rewards/rejected": -3.0965065956115723, + "epoch": 1.206064782908339, + "grad_norm": 9.574707984924316, + "learning_rate": 4.07981793558559e-08, + "logits/chosen": -2.7277112007141113, + "logits/rejected": -2.700732707977295, + "logps/chosen": -114.8912582397461, + "logps/rejected": -130.61427307128906, + "loss": 0.6268, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5859689712524414, + "rewards/margins": 0.17327004671096802, + "rewards/rejected": -0.7592389583587646, "step": 7000 }, { - "epoch": 1.21, - "eval_logits/chosen": -1.5385133028030396, - "eval_logits/rejected": -1.5128390789031982, - "eval_logps/chosen": -213.44802856445312, - "eval_logps/rejected": -257.9127502441406, - "eval_loss": 0.630026638507843, - "eval_rewards/accuracies": 0.6586896181106567, - "eval_rewards/chosen": -1.547441840171814, - "eval_rewards/margins": 0.40011176466941833, - "eval_rewards/rejected": -1.9475535154342651, - "eval_runtime": 356.7968, - "eval_samples_per_second": 12.063, - "eval_steps_per_second": 1.508, + "epoch": 1.206064782908339, + "eval_logits/chosen": -2.6553220748901367, + "eval_logits/rejected": -2.648987054824829, + "eval_logps/chosen": -102.90448760986328, + "eval_logps/rejected": -115.96162414550781, + "eval_loss": 0.660365104675293, + "eval_rewards/accuracies": 0.6089683771133423, + "eval_rewards/chosen": -0.44192585349082947, + "eval_rewards/margins": 0.08588908612728119, + "eval_rewards/rejected": -0.5278149247169495, + "eval_runtime": 359.8432, + "eval_samples_per_second": 11.961, + "eval_steps_per_second": 1.495, "step": 7000 }, { - "epoch": 1.21, - "grad_norm": 19.18466438304857, - "learning_rate": 2.0325215396530289e-07, - "logits/chosen": -1.4519102573394775, - "logits/rejected": -1.3836629390716553, - "logps/chosen": -213.21701049804688, - "logps/rejected": -340.19964599609375, - "loss": 0.4318, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.5678694248199463, - "rewards/margins": 1.305743932723999, - "rewards/rejected": -2.8736133575439453, + "epoch": 1.207787732598208, + "grad_norm": 7.930838108062744, + "learning_rate": 4.065043079306057e-08, + "logits/chosen": -2.558248996734619, + "logits/rejected": -2.5334038734436035, + "logps/chosen": -110.71012115478516, + "logps/rejected": -125.1780014038086, + "loss": 0.625, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5428487062454224, + "rewards/margins": 0.1803073137998581, + "rewards/rejected": -0.7231560349464417, "step": 7010 }, { - "epoch": 1.21, - "grad_norm": 27.94478473053083, - "learning_rate": 2.025138338956974e-07, - "logits/chosen": -1.4114625453948975, - "logits/rejected": -1.349818229675293, - "logps/chosen": -198.73695373535156, - "logps/rejected": -307.34136962890625, - "loss": 0.445, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.4743351936340332, - "rewards/margins": 1.0755380392074585, - "rewards/rejected": -2.549873113632202, + "epoch": 1.2095106822880772, + "grad_norm": 10.088218688964844, + "learning_rate": 4.050276677913948e-08, + "logits/chosen": -2.539398670196533, + "logits/rejected": -2.514713764190674, + "logps/chosen": -104.53022766113281, + "logps/rejected": -120.55018615722656, + "loss": 0.6357, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5319541692733765, + "rewards/margins": 0.14996173977851868, + "rewards/rejected": -0.6819158792495728, "step": 7020 }, { - "epoch": 1.21, - "grad_norm": 41.933851784209644, - "learning_rate": 2.0177594324714838e-07, - "logits/chosen": -1.4608399868011475, - "logits/rejected": -1.397789716720581, - "logps/chosen": -205.079345703125, - "logps/rejected": -330.39312744140625, - "loss": 0.4361, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.519981026649475, - "rewards/margins": 1.2328134775161743, - "rewards/rejected": -2.7527945041656494, + "epoch": 1.2112336319779462, + "grad_norm": 10.294520378112793, + "learning_rate": 4.0355188649429677e-08, + "logits/chosen": -2.5818893909454346, + "logits/rejected": -2.5657174587249756, + "logps/chosen": -108.9336929321289, + "logps/rejected": -125.43675231933594, + "loss": 0.6416, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5583351254463196, + "rewards/margins": 0.1448930948972702, + "rewards/rejected": -0.7032281756401062, "step": 7030 }, { - "epoch": 1.21, - "grad_norm": 27.891320371971226, - "learning_rate": 2.0103848869245764e-07, - "logits/chosen": -1.3869388103485107, - "logits/rejected": -1.3218698501586914, - "logps/chosen": -200.9859619140625, - "logps/rejected": -331.20611572265625, - "loss": 0.3926, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.4491612911224365, - "rewards/margins": 1.297973871231079, - "rewards/rejected": -2.7471349239349365, + "epoch": 1.2129565816678154, + "grad_norm": 12.805583953857422, + "learning_rate": 4.020769773849153e-08, + "logits/chosen": -2.526520013809204, + "logits/rejected": -2.5062756538391113, + "logps/chosen": -110.2097396850586, + "logps/rejected": -127.39622497558594, + "loss": 0.6268, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5413297414779663, + "rewards/margins": 0.16765829920768738, + "rewards/rejected": -0.7089880704879761, "step": 7040 }, { - "epoch": 1.21, - "grad_norm": 24.036317739572887, - "learning_rate": 2.0030147690048374e-07, - "logits/chosen": -1.3576328754425049, - "logits/rejected": -1.3013206720352173, - "logps/chosen": -198.55532836914062, - "logps/rejected": -319.38226318359375, - "loss": 0.4474, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.4711755514144897, - "rewards/margins": 1.1891216039657593, - "rewards/rejected": -2.660297155380249, + "epoch": 1.2146795313576844, + "grad_norm": 10.435907363891602, + "learning_rate": 4.0060295380096745e-08, + "logits/chosen": -2.502941608428955, + "logits/rejected": -2.483719825744629, + "logps/chosen": -104.99825286865234, + "logps/rejected": -122.9035415649414, + "loss": 0.6332, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5354247689247131, + "rewards/margins": 0.15982839465141296, + "rewards/rejected": -0.6952531337738037, "step": 7050 }, { - "epoch": 1.22, - "grad_norm": 36.02994677091849, - "learning_rate": 1.995649145360809e-07, - "logits/chosen": -1.4678010940551758, - "logits/rejected": -1.4199771881103516, - "logps/chosen": -212.34457397460938, - "logps/rejected": -316.0308532714844, - "loss": 0.4659, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.5913760662078857, - "rewards/margins": 1.0398099422454834, - "rewards/rejected": -2.631186008453369, + "epoch": 1.2164024810475533, + "grad_norm": 10.906390190124512, + "learning_rate": 3.991298290721618e-08, + "logits/chosen": -2.6106855869293213, + "logits/rejected": -2.597623586654663, + "logps/chosen": -112.033447265625, + "logps/rejected": -123.20802307128906, + "loss": 0.6542, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5882318019866943, + "rewards/margins": 0.11472143977880478, + "rewards/rejected": -0.7029532194137573, "step": 7060 }, { - "epoch": 1.22, - "grad_norm": 65.30902966804867, - "learning_rate": 1.988288082600392e-07, - "logits/chosen": -1.3991708755493164, - "logits/rejected": -1.337200403213501, - "logps/chosen": -218.6618194580078, - "logps/rejected": -320.5107421875, - "loss": 0.5368, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.640601396560669, - "rewards/margins": 1.0321669578552246, - "rewards/rejected": -2.6727681159973145, + "epoch": 1.2181254307374225, + "grad_norm": 11.68134880065918, + "learning_rate": 3.976576165200784e-08, + "logits/chosen": -2.562283992767334, + "logits/rejected": -2.5388481616973877, + "logps/chosen": -113.50862121582031, + "logps/rejected": -120.29753112792969, + "loss": 0.6744, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.5890558958053589, + "rewards/margins": 0.08144049346446991, + "rewards/rejected": -0.67049640417099, "step": 7070 }, { - "epoch": 1.22, - "grad_norm": 29.064832805753056, - "learning_rate": 1.980931647290246e-07, - "logits/chosen": -1.4547747373580933, - "logits/rejected": -1.3819966316223145, - "logps/chosen": -202.14923095703125, - "logps/rejected": -324.9191589355469, - "loss": 0.4112, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.4868186712265015, - "rewards/margins": 1.2228963375091553, - "rewards/rejected": -2.709714889526367, + "epoch": 1.2198483804272915, + "grad_norm": 8.486589431762695, + "learning_rate": 3.961863294580492e-08, + "logits/chosen": -2.6284613609313965, + "logits/rejected": -2.600008487701416, + "logps/chosen": -107.27657318115234, + "logps/rejected": -123.54029846191406, + "loss": 0.631, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5380843281745911, + "rewards/margins": 0.1576683223247528, + "rewards/rejected": -0.6957526803016663, "step": 7080 }, { - "epoch": 1.22, - "grad_norm": 21.70626173606062, - "learning_rate": 1.97357990595518e-07, - "logits/chosen": -1.5178253650665283, - "logits/rejected": -1.4573280811309814, - "logps/chosen": -204.9860382080078, - "logps/rejected": -340.4731750488281, - "loss": 0.4111, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.5147212743759155, - "rewards/margins": 1.3682196140289307, - "rewards/rejected": -2.8829410076141357, + "epoch": 1.2215713301171607, + "grad_norm": 8.582545280456543, + "learning_rate": 3.94715981191036e-08, + "logits/chosen": -2.6594293117523193, + "logits/rejected": -2.648655891418457, + "logps/chosen": -110.38819885253906, + "logps/rejected": -125.69108581542969, + "loss": 0.631, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5687033534049988, + "rewards/margins": 0.16610802710056305, + "rewards/rejected": -0.7348113059997559, "step": 7090 }, { - "epoch": 1.22, - "grad_norm": 38.87350809965092, - "learning_rate": 1.9662329250775586e-07, - "logits/chosen": -1.3815619945526123, - "logits/rejected": -1.3237214088439941, - "logps/chosen": -205.8621826171875, - "logps/rejected": -319.4546203613281, - "loss": 0.4501, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.4879212379455566, - "rewards/margins": 1.1380326747894287, - "rewards/rejected": -2.6259539127349854, + "epoch": 1.2232942798070296, + "grad_norm": 8.429627418518066, + "learning_rate": 3.932465850155117e-08, + "logits/chosen": -2.528796434402466, + "logits/rejected": -2.508687973022461, + "logps/chosen": -110.3732681274414, + "logps/rejected": -126.6390151977539, + "loss": 0.6303, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5329998135566711, + "rewards/margins": 0.16490605473518372, + "rewards/rejected": -0.6979058980941772, "step": 7100 }, { - "epoch": 1.22, - "eval_logits/chosen": -1.4833621978759766, - "eval_logits/rejected": -1.4578860998153687, - "eval_logps/chosen": -215.61434936523438, - "eval_logps/rejected": -259.6932373046875, - "eval_loss": 0.6319575309753418, - "eval_rewards/accuracies": 0.6510223150253296, - "eval_rewards/chosen": -1.5691050291061401, - "eval_rewards/margins": 0.39625340700149536, - "eval_rewards/rejected": -1.9653586149215698, - "eval_runtime": 356.9358, - "eval_samples_per_second": 12.058, - "eval_steps_per_second": 1.507, + "epoch": 1.2232942798070296, + "eval_logits/chosen": -2.6505863666534424, + "eval_logits/rejected": -2.6443142890930176, + "eval_logps/chosen": -102.50411987304688, + "eval_logps/rejected": -115.5604019165039, + "eval_loss": 0.6603997349739075, + "eval_rewards/accuracies": 0.6129181981086731, + "eval_rewards/chosen": -0.4379221498966217, + "eval_rewards/margins": 0.08588062226772308, + "eval_rewards/rejected": -0.5238028168678284, + "eval_runtime": 360.333, + "eval_samples_per_second": 11.945, + "eval_steps_per_second": 1.493, "step": 7100 }, { - "epoch": 1.23, - "grad_norm": 30.114994381758486, - "learning_rate": 1.9588907710966943e-07, - "logits/chosen": -1.3856322765350342, - "logits/rejected": -1.3147612810134888, - "logps/chosen": -195.54681396484375, - "logps/rejected": -312.0347595214844, - "loss": 0.4228, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.4208539724349976, - "rewards/margins": 1.1898590326309204, - "rewards/rejected": -2.610713243484497, + "epoch": 1.2250172294968986, + "grad_norm": 9.6629056930542, + "learning_rate": 3.9177815421933884e-08, + "logits/chosen": -2.572434663772583, + "logits/rejected": -2.5384533405303955, + "logps/chosen": -108.16790771484375, + "logps/rejected": -114.19828796386719, + "loss": 0.6664, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5469506978988647, + "rewards/margins": 0.08520477265119553, + "rewards/rejected": -0.6321554183959961, "step": 7110 }, { - "epoch": 1.23, - "grad_norm": 44.292714805439296, - "learning_rate": 1.951553510408252e-07, - "logits/chosen": -1.3800169229507446, - "logits/rejected": -1.3161985874176025, - "logps/chosen": -223.2345428466797, - "logps/rejected": -308.5866394042969, - "loss": 0.5205, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.612497329711914, - "rewards/margins": 0.9479106068611145, - "rewards/rejected": -2.560408115386963, + "epoch": 1.2267401791867678, + "grad_norm": 10.517024040222168, + "learning_rate": 3.903107020816504e-08, + "logits/chosen": -2.561241626739502, + "logits/rejected": -2.52276873588562, + "logps/chosen": -116.26021575927734, + "logps/rejected": -118.6120834350586, + "loss": 0.6523, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5427398085594177, + "rewards/margins": 0.11778402328491211, + "rewards/rejected": -0.6605237722396851, "step": 7120 }, { - "epoch": 1.23, - "grad_norm": 31.181918195201007, - "learning_rate": 1.944221209363643e-07, - "logits/chosen": -1.300041913986206, - "logits/rejected": -1.2450910806655884, - "logps/chosen": -205.53591918945312, - "logps/rejected": -322.80950927734375, - "loss": 0.4277, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.4920589923858643, - "rewards/margins": 1.2042802572250366, - "rewards/rejected": -2.6963393688201904, + "epoch": 1.2284631288766368, + "grad_norm": 9.681543350219727, + "learning_rate": 3.8884424187272866e-08, + "logits/chosen": -2.455671787261963, + "logits/rejected": -2.4319849014282227, + "logps/chosen": -106.87947082519531, + "logps/rejected": -119.38846588134766, + "loss": 0.6352, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5053822994232178, + "rewards/margins": 0.15671773254871368, + "rewards/rejected": -0.662100076675415, "step": 7130 }, { - "epoch": 1.23, - "grad_norm": 33.82376669358963, - "learning_rate": 1.9368939342694328e-07, - "logits/chosen": -1.4221440553665161, - "logits/rejected": -1.382880449295044, - "logps/chosen": -190.2220458984375, - "logps/rejected": -294.0387878417969, - "loss": 0.4669, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.38909912109375, - "rewards/margins": 1.0321005582809448, - "rewards/rejected": -2.421199321746826, + "epoch": 1.230186078566506, + "grad_norm": 8.6825590133667, + "learning_rate": 3.873787868538866e-08, + "logits/chosen": -2.5538275241851807, + "logits/rejected": -2.548265218734741, + "logps/chosen": -104.10013580322266, + "logps/rejected": -119.7091293334961, + "loss": 0.6397, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5278245806694031, + "rewards/margins": 0.14982108771800995, + "rewards/rejected": -0.6776455640792847, "step": 7140 }, { - "epoch": 1.23, - "grad_norm": 41.46398078974056, - "learning_rate": 1.9295717513867324e-07, - "logits/chosen": -1.5011231899261475, - "logits/rejected": -1.4463145732879639, - "logps/chosen": -224.45938110351562, - "logps/rejected": -333.4562683105469, - "loss": 0.4627, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.6809113025665283, - "rewards/margins": 1.1032178401947021, - "rewards/rejected": -2.7841289043426514, + "epoch": 1.231909028256375, + "grad_norm": 10.221883773803711, + "learning_rate": 3.8591435027734646e-08, + "logits/chosen": -2.628955364227295, + "logits/rejected": -2.6071934700012207, + "logps/chosen": -117.13688659667969, + "logps/rejected": -128.465087890625, + "loss": 0.6497, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6074936985969543, + "rewards/margins": 0.12625807523727417, + "rewards/rejected": -0.7337517738342285, "step": 7150 }, { - "epoch": 1.23, - "grad_norm": 43.52157134513151, - "learning_rate": 1.9222547269306068e-07, - "logits/chosen": -1.415351152420044, - "logits/rejected": -1.3425204753875732, - "logps/chosen": -192.447021484375, - "logps/rejected": -316.6680908203125, - "loss": 0.4306, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3883023262023926, - "rewards/margins": 1.2378036975860596, - "rewards/rejected": -2.6261062622070312, + "epoch": 1.233631977946244, + "grad_norm": 9.913213729858398, + "learning_rate": 3.844509453861214e-08, + "logits/chosen": -2.545468807220459, + "logits/rejected": -2.511420965194702, + "logps/chosen": -106.26422119140625, + "logps/rejected": -124.04301452636719, + "loss": 0.6263, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5260246396064758, + "rewards/margins": 0.17362895607948303, + "rewards/rejected": -0.6996536254882812, "step": 7160 }, { - "epoch": 1.24, - "grad_norm": 28.198520340260067, - "learning_rate": 1.9149429270694705e-07, - "logits/chosen": -1.4002097845077515, - "logits/rejected": -1.3377676010131836, - "logps/chosen": -201.3988494873047, - "logps/rejected": -307.12530517578125, - "loss": 0.4632, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.4582065343856812, - "rewards/margins": 1.0752780437469482, - "rewards/rejected": -2.533484697341919, + "epoch": 1.235354927636113, + "grad_norm": 10.441494941711426, + "learning_rate": 3.829885854138941e-08, + "logits/chosen": -2.511805772781372, + "logits/rejected": -2.4859747886657715, + "logps/chosen": -110.22599029541016, + "logps/rejected": -122.40694427490234, + "loss": 0.6471, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5465968251228333, + "rewards/margins": 0.13938425481319427, + "rewards/rejected": -0.6859810948371887, "step": 7170 }, { - "epoch": 1.24, - "grad_norm": 36.940975044182615, - "learning_rate": 1.9076364179244937e-07, - "logits/chosen": -1.519090175628662, - "logits/rejected": -1.4556185007095337, - "logps/chosen": -192.38104248046875, - "logps/rejected": -318.6177673339844, - "loss": 0.3814, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.3707835674285889, - "rewards/margins": 1.2708237171173096, - "rewards/rejected": -2.6416070461273193, + "epoch": 1.237077877325982, + "grad_norm": 8.994463920593262, + "learning_rate": 3.815272835848987e-08, + "logits/chosen": -2.636838436126709, + "logits/rejected": -2.6227402687072754, + "logps/chosen": -106.834228515625, + "logps/rejected": -122.57475280761719, + "loss": 0.6284, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5153202414512634, + "rewards/margins": 0.16570481657981873, + "rewards/rejected": -0.6810250282287598, "step": 7180 }, { - "epoch": 1.24, - "grad_norm": 29.51979419688918, - "learning_rate": 1.900335265568999e-07, - "logits/chosen": -1.2953803539276123, - "logits/rejected": -1.22904372215271, - "logps/chosen": -216.4315643310547, - "logps/rejected": -340.22052001953125, - "loss": 0.4171, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.637787103652954, - "rewards/margins": 1.2283713817596436, - "rewards/rejected": -2.8661584854125977, + "epoch": 1.2388008270158513, + "grad_norm": 8.581153869628906, + "learning_rate": 3.8006705311379985e-08, + "logits/chosen": -2.488649845123291, + "logits/rejected": -2.4648635387420654, + "logps/chosen": -105.9777603149414, + "logps/rejected": -123.8721923828125, + "loss": 0.6289, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5330722332000732, + "rewards/margins": 0.1694321185350418, + "rewards/rejected": -0.7025043368339539, "step": 7190 }, { - "epoch": 1.24, - "grad_norm": 43.92936652185145, - "learning_rate": 1.893039536027872e-07, - "logits/chosen": -1.2936763763427734, - "logits/rejected": -1.2286694049835205, - "logps/chosen": -226.12228393554688, - "logps/rejected": -365.41058349609375, - "loss": 0.4303, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.7303447723388672, - "rewards/margins": 1.3623626232147217, - "rewards/rejected": -3.092707395553589, + "epoch": 1.2405237767057202, + "grad_norm": 10.977418899536133, + "learning_rate": 3.7860790720557445e-08, + "logits/chosen": -2.5012497901916504, + "logits/rejected": -2.4833712577819824, + "logps/chosen": -108.4955825805664, + "logps/rejected": -130.1787872314453, + "loss": 0.6251, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5539822578430176, + "rewards/margins": 0.18606498837471008, + "rewards/rejected": -0.74004727602005, "step": 7200 }, { - "epoch": 1.24, - "eval_logits/chosen": -1.436883568763733, - "eval_logits/rejected": -1.4103857278823853, - "eval_logps/chosen": -236.11033630371094, - "eval_logps/rejected": -283.7571105957031, - "eval_loss": 0.632332980632782, - "eval_rewards/accuracies": 0.6538103818893433, - "eval_rewards/chosen": -1.7740648984909058, - "eval_rewards/margins": 0.431932270526886, - "eval_rewards/rejected": -2.2059972286224365, - "eval_runtime": 356.7528, - "eval_samples_per_second": 12.064, - "eval_steps_per_second": 1.508, + "epoch": 1.2405237767057202, + "eval_logits/chosen": -2.6447579860687256, + "eval_logits/rejected": -2.6383352279663086, + "eval_logps/chosen": -103.08140563964844, + "eval_logps/rejected": -116.27261352539062, + "eval_loss": 0.659978449344635, + "eval_rewards/accuracies": 0.6101301312446594, + "eval_rewards/chosen": -0.4436950981616974, + "eval_rewards/margins": 0.08722980320453644, + "eval_rewards/rejected": -0.530924916267395, + "eval_runtime": 359.9641, + "eval_samples_per_second": 11.957, + "eval_steps_per_second": 1.495, "step": 7200 }, { - "epoch": 1.24, - "grad_norm": 35.504557595434676, - "learning_rate": 1.885749295276955e-07, - "logits/chosen": -1.4118075370788574, - "logits/rejected": -1.362586498260498, - "logps/chosen": -243.8560028076172, - "logps/rejected": -341.7215576171875, - "loss": 0.4936, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.8682628870010376, - "rewards/margins": 1.0187652111053467, - "rewards/rejected": -2.887028217315674, + "epoch": 1.2422467263955892, + "grad_norm": 10.23493766784668, + "learning_rate": 3.77149859055391e-08, + "logits/chosen": -2.6112637519836426, + "logits/rejected": -2.5924153327941895, + "logps/chosen": -115.63868713378906, + "logps/rejected": -121.07205963134766, + "loss": 0.6621, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.5860998630523682, + "rewards/margins": 0.09432484954595566, + "rewards/rejected": -0.680424690246582, "step": 7210 }, { - "epoch": 1.24, - "grad_norm": 38.49322970172763, - "learning_rate": 1.8784646092424572e-07, - "logits/chosen": -1.2949811220169067, - "logits/rejected": -1.219310998916626, - "logps/chosen": -222.8941650390625, - "logps/rejected": -352.58172607421875, - "loss": 0.4373, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.6838912963867188, - "rewards/margins": 1.325777292251587, - "rewards/rejected": -3.0096685886383057, + "epoch": 1.2439696760854584, + "grad_norm": 9.584635734558105, + "learning_rate": 3.756929218484914e-08, + "logits/chosen": -2.4883735179901123, + "logits/rejected": -2.4554367065429688, + "logps/chosen": -110.43601989746094, + "logps/rejected": -120.67839050292969, + "loss": 0.6509, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.559060275554657, + "rewards/margins": 0.13107997179031372, + "rewards/rejected": -0.6901403069496155, "step": 7220 }, { - "epoch": 1.25, - "grad_norm": 28.566213465067236, - "learning_rate": 1.8711855438003543e-07, - "logits/chosen": -1.3604927062988281, - "logits/rejected": -1.2994263172149658, - "logps/chosen": -208.6522216796875, - "logps/rejected": -332.7591857910156, - "loss": 0.4031, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.5445417165756226, - "rewards/margins": 1.2535761594772339, - "rewards/rejected": -2.7981178760528564, + "epoch": 1.2456926257753274, + "grad_norm": 12.276067733764648, + "learning_rate": 3.7423710876007084e-08, + "logits/chosen": -2.5233511924743652, + "logits/rejected": -2.5052595138549805, + "logps/chosen": -111.4328384399414, + "logps/rejected": -119.4492416381836, + "loss": 0.6636, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5721146464347839, + "rewards/margins": 0.09257705509662628, + "rewards/rejected": -0.664691686630249, "step": 7230 }, { - "epoch": 1.25, - "grad_norm": 24.11769012327145, - "learning_rate": 1.8639121647757976e-07, - "logits/chosen": -1.3791191577911377, - "logits/rejected": -1.3320530652999878, - "logps/chosen": -218.4418182373047, - "logps/rejected": -334.22833251953125, - "loss": 0.4437, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.609986662864685, - "rewards/margins": 1.184805154800415, - "rewards/rejected": -2.7947916984558105, + "epoch": 1.2474155754651963, + "grad_norm": 10.505390167236328, + "learning_rate": 3.727824329551595e-08, + "logits/chosen": -2.5546631813049316, + "logits/rejected": -2.543801784515381, + "logps/chosen": -112.68693542480469, + "logps/rejected": -124.9740982055664, + "loss": 0.6382, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5524017214775085, + "rewards/margins": 0.14962348341941833, + "rewards/rejected": -0.7020251154899597, "step": 7240 }, { - "epoch": 1.25, - "grad_norm": 28.06311795428957, - "learning_rate": 1.8566445379425116e-07, - "logits/chosen": -1.4544193744659424, - "logits/rejected": -1.3801645040512085, - "logps/chosen": -202.21969604492188, - "logps/rejected": -322.6747131347656, - "loss": 0.4007, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.4765223264694214, - "rewards/margins": 1.2214938402175903, - "rewards/rejected": -2.69801664352417, + "epoch": 1.2491385251550655, + "grad_norm": 9.379290580749512, + "learning_rate": 3.713289075885023e-08, + "logits/chosen": -2.602900743484497, + "logits/rejected": -2.574934959411621, + "logps/chosen": -111.77998352050781, + "logps/rejected": -121.20411682128906, + "loss": 0.6556, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5721232295036316, + "rewards/margins": 0.11137218773365021, + "rewards/rejected": -0.6834954023361206, "step": 7250 }, { - "epoch": 1.25, - "grad_norm": 28.029045217779828, - "learning_rate": 1.8493827290222068e-07, - "logits/chosen": -1.4240261316299438, - "logits/rejected": -1.3594194650650024, - "logps/chosen": -222.39035034179688, - "logps/rejected": -343.7870788574219, - "loss": 0.446, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.6872167587280273, - "rewards/margins": 1.2270071506500244, - "rewards/rejected": -2.9142239093780518, + "epoch": 1.2508614748449345, + "grad_norm": 9.753902435302734, + "learning_rate": 3.698765458044414e-08, + "logits/chosen": -2.591021776199341, + "logits/rejected": -2.5614984035491943, + "logps/chosen": -108.41435241699219, + "logps/rejected": -123.48286437988281, + "loss": 0.6298, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5472859144210815, + "rewards/margins": 0.16381797194480896, + "rewards/rejected": -0.7111038565635681, "step": 7260 }, { - "epoch": 1.25, - "grad_norm": 33.35599192177956, - "learning_rate": 1.84212680368398e-07, - "logits/chosen": -1.4141993522644043, - "logits/rejected": -1.3509438037872314, - "logps/chosen": -217.01632690429688, - "logps/rejected": -327.6526794433594, - "loss": 0.4629, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.6272687911987305, - "rewards/margins": 1.1230875253677368, - "rewards/rejected": -2.7503561973571777, + "epoch": 1.2525844245348035, + "grad_norm": 9.254847526550293, + "learning_rate": 3.6842536073679596e-08, + "logits/chosen": -2.6056957244873047, + "logits/rejected": -2.577763080596924, + "logps/chosen": -106.95680236816406, + "logps/rejected": -119.10015869140625, + "loss": 0.6407, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5264113545417786, + "rewards/margins": 0.13823921978473663, + "rewards/rejected": -0.6646506190299988, "step": 7270 }, { - "epoch": 1.25, - "grad_norm": 38.40058672670466, - "learning_rate": 1.834876827543721e-07, - "logits/chosen": -1.4696061611175537, - "logits/rejected": -1.3933039903640747, - "logps/chosen": -214.4224090576172, - "logps/rejected": -348.7910461425781, - "loss": 0.4019, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.5783621072769165, - "rewards/margins": 1.3805334568023682, - "rewards/rejected": -2.958895444869995, + "epoch": 1.2543073742246726, + "grad_norm": 9.7710542678833, + "learning_rate": 3.669753655087442e-08, + "logits/chosen": -2.639329433441162, + "logits/rejected": -2.6067185401916504, + "logps/chosen": -107.37776184082031, + "logps/rejected": -121.13716888427734, + "loss": 0.6253, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5080801844596863, + "rewards/margins": 0.17418815195560455, + "rewards/rejected": -0.6822682619094849, "step": 7280 }, { - "epoch": 1.26, - "grad_norm": 46.97048710720416, - "learning_rate": 1.8276328661635248e-07, - "logits/chosen": -1.2667840719223022, - "logits/rejected": -1.2175432443618774, - "logps/chosen": -230.58251953125, - "logps/rejected": -343.7873840332031, - "loss": 0.4282, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.757728934288025, - "rewards/margins": 1.1217601299285889, - "rewards/rejected": -2.879488945007324, + "epoch": 1.2560303239145416, + "grad_norm": 9.96068286895752, + "learning_rate": 3.65526573232705e-08, + "logits/chosen": -2.443098545074463, + "logits/rejected": -2.4309229850769043, + "logps/chosen": -109.87693786621094, + "logps/rejected": -120.99009704589844, + "loss": 0.6564, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5508560538291931, + "rewards/margins": 0.10041414201259613, + "rewards/rejected": -0.6512702107429504, "step": 7290 }, { - "epoch": 1.26, - "grad_norm": 23.37837488393363, - "learning_rate": 1.8203949850510903e-07, - "logits/chosen": -1.1985424757003784, - "logits/rejected": -1.151474952697754, - "logps/chosen": -231.79354858398438, - "logps/rejected": -341.9132080078125, - "loss": 0.4717, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.8153043985366821, - "rewards/margins": 1.083820104598999, - "rewards/rejected": -2.8991243839263916, + "epoch": 1.2577532736044108, + "grad_norm": 10.196106910705566, + "learning_rate": 3.6407899701021807e-08, + "logits/chosen": -2.3989205360412598, + "logits/rejected": -2.3831562995910645, + "logps/chosen": -101.35286712646484, + "logps/rejected": -114.70906066894531, + "loss": 0.6531, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5109211206436157, + "rewards/margins": 0.11609502881765366, + "rewards/rejected": -0.6270160675048828, "step": 7300 }, { - "epoch": 1.26, - "eval_logits/chosen": -1.4253735542297363, - "eval_logits/rejected": -1.3984841108322144, - "eval_logps/chosen": -244.4295196533203, - "eval_logps/rejected": -294.37445068359375, - "eval_loss": 0.6293808221817017, - "eval_rewards/accuracies": 0.6668215394020081, - "eval_rewards/chosen": -1.857256531715393, - "eval_rewards/margins": 0.45491406321525574, - "eval_rewards/rejected": -2.3121707439422607, - "eval_runtime": 357.0293, - "eval_samples_per_second": 12.055, - "eval_steps_per_second": 1.507, + "epoch": 1.2577532736044108, + "eval_logits/chosen": -2.642998218536377, + "eval_logits/rejected": -2.636584997177124, + "eval_logps/chosen": -102.09992218017578, + "eval_logps/rejected": -115.19979095458984, + "eval_loss": 0.6601964235305786, + "eval_rewards/accuracies": 0.6124535202980042, + "eval_rewards/chosen": -0.4338802695274353, + "eval_rewards/margins": 0.08631633222103119, + "eval_rewards/rejected": -0.5201966166496277, + "eval_runtime": 359.6659, + "eval_samples_per_second": 11.967, + "eval_steps_per_second": 1.496, "step": 7300 }, { - "epoch": 1.26, - "grad_norm": 32.28579120461566, - "learning_rate": 1.8131632496591348e-07, - "logits/chosen": -1.354773759841919, - "logits/rejected": -1.288698673248291, - "logps/chosen": -231.0813446044922, - "logps/rejected": -361.4246520996094, - "loss": 0.4156, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.7601534128189087, - "rewards/margins": 1.3353766202926636, - "rewards/rejected": -3.0955300331115723, + "epoch": 1.2594762232942798, + "grad_norm": 9.084516525268555, + "learning_rate": 3.6263264993182695e-08, + "logits/chosen": -2.5876553058624268, + "logits/rejected": -2.564537286758423, + "logps/chosen": -109.34537506103516, + "logps/rejected": -122.04302978515625, + "loss": 0.6359, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5428635478019714, + "rewards/margins": 0.15870937705039978, + "rewards/rejected": -0.7015729546546936, "step": 7310 }, { - "epoch": 1.26, - "grad_norm": 26.521159677348862, - "learning_rate": 1.8059377253847973e-07, - "logits/chosen": -1.374133825302124, - "logits/rejected": -1.314866304397583, - "logps/chosen": -226.6112823486328, - "logps/rejected": -341.32647705078125, - "loss": 0.478, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.72268545627594, - "rewards/margins": 1.179290533065796, - "rewards/rejected": -2.9019761085510254, + "epoch": 1.2611991729841487, + "grad_norm": 8.735158920288086, + "learning_rate": 3.6118754507695946e-08, + "logits/chosen": -2.5970218181610107, + "logits/rejected": -2.574091672897339, + "logps/chosen": -106.69927978515625, + "logps/rejected": -115.67286682128906, + "loss": 0.6509, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.5234571695327759, + "rewards/margins": 0.12171218544244766, + "rewards/rejected": -0.6451693773269653, "step": 7320 }, { - "epoch": 1.26, - "grad_norm": 42.16253795931932, - "learning_rate": 1.7987184775690508e-07, - "logits/chosen": -1.2531036138534546, - "logits/rejected": -1.1840673685073853, - "logps/chosen": -223.2836151123047, - "logps/rejected": -365.69842529296875, - "loss": 0.3926, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.7025184631347656, - "rewards/margins": 1.438976526260376, - "rewards/rejected": -3.1414952278137207, + "epoch": 1.262922122674018, + "grad_norm": 9.100055694580078, + "learning_rate": 3.597436955138102e-08, + "logits/chosen": -2.496898651123047, + "logits/rejected": -2.4709010124206543, + "logps/chosen": -103.43009948730469, + "logps/rejected": -122.0030288696289, + "loss": 0.6194, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5036752223968506, + "rewards/margins": 0.20063956081867218, + "rewards/rejected": -0.7043147683143616, "step": 7330 }, { - "epoch": 1.26, - "grad_norm": 24.413820614134853, - "learning_rate": 1.7915055714961092e-07, - "logits/chosen": -1.3367866277694702, - "logits/rejected": -1.274552822113037, - "logps/chosen": -241.989501953125, - "logps/rejected": -350.8983459472656, - "loss": 0.463, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.8661409616470337, - "rewards/margins": 1.1037722826004028, - "rewards/rejected": -2.9699134826660156, + "epoch": 1.264645072363887, + "grad_norm": 9.007047653198242, + "learning_rate": 3.583011142992218e-08, + "logits/chosen": -2.5679678916931152, + "logits/rejected": -2.543445348739624, + "logps/chosen": -109.4923095703125, + "logps/rejected": -120.74739074707031, + "loss": 0.6527, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5411123633384705, + "rewards/margins": 0.12720635533332825, + "rewards/rejected": -0.6683186888694763, "step": 7340 }, { - "epoch": 1.27, - "grad_norm": 27.114867030708584, - "learning_rate": 1.7842990723928376e-07, - "logits/chosen": -1.4280154705047607, - "logits/rejected": -1.3533068895339966, - "logps/chosen": -203.71188354492188, - "logps/rejected": -342.64697265625, - "loss": 0.3656, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -1.4681717157363892, - "rewards/margins": 1.4406144618988037, - "rewards/rejected": -2.9087860584259033, + "epoch": 1.266368022053756, + "grad_norm": 9.416090965270996, + "learning_rate": 3.568598144785675e-08, + "logits/chosen": -2.6188342571258545, + "logits/rejected": -2.5921006202697754, + "logps/chosen": -108.1654052734375, + "logps/rejected": -119.23826599121094, + "loss": 0.633, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5124603509902954, + "rewards/margins": 0.16223213076591492, + "rewards/rejected": -0.6746925115585327, "step": 7350 }, { - "epoch": 1.27, - "grad_norm": 43.36655567777902, - "learning_rate": 1.7770990454281605e-07, - "logits/chosen": -1.3013639450073242, - "logits/rejected": -1.2412099838256836, - "logps/chosen": -235.2455291748047, - "logps/rejected": -366.5183410644531, - "loss": 0.4174, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.8094427585601807, - "rewards/margins": 1.334092140197754, - "rewards/rejected": -3.1435346603393555, + "epoch": 1.268090971743625, + "grad_norm": 11.118308067321777, + "learning_rate": 3.5541980908563216e-08, + "logits/chosen": -2.5278282165527344, + "logits/rejected": -2.509023666381836, + "logps/chosen": -108.90681457519531, + "logps/rejected": -120.905517578125, + "loss": 0.6411, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.546089231967926, + "rewards/margins": 0.14111468195915222, + "rewards/rejected": -0.6872037649154663, "step": 7360 }, { - "epoch": 1.27, - "grad_norm": 43.02629441915015, - "learning_rate": 1.7699055557124791e-07, - "logits/chosen": -1.2064440250396729, - "logits/rejected": -1.1509660482406616, - "logps/chosen": -230.65185546875, - "logps/rejected": -360.70745849609375, - "loss": 0.4244, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.7785717248916626, - "rewards/margins": 1.316173791885376, - "rewards/rejected": -3.094745635986328, + "epoch": 1.269813921433494, + "grad_norm": 9.836128234863281, + "learning_rate": 3.539811111424959e-08, + "logits/chosen": -2.440730571746826, + "logits/rejected": -2.4208149909973145, + "logps/chosen": -104.86732482910156, + "logps/rejected": -113.34352111816406, + "loss": 0.6599, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.5206080675125122, + "rewards/margins": 0.10041671991348267, + "rewards/rejected": -0.6210247278213501, "step": 7370 }, { - "epoch": 1.27, - "grad_norm": 44.50263470615816, - "learning_rate": 1.7627186682970723e-07, - "logits/chosen": -1.269676923751831, - "logits/rejected": -1.2101144790649414, - "logps/chosen": -239.9287109375, - "logps/rejected": -365.6208190917969, - "loss": 0.4291, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.8677568435668945, - "rewards/margins": 1.2519527673721313, - "rewards/rejected": -3.1197097301483154, + "epoch": 1.2715368711233632, + "grad_norm": 9.010052680969238, + "learning_rate": 3.525437336594145e-08, + "logits/chosen": -2.530202865600586, + "logits/rejected": -2.5104517936706543, + "logps/chosen": -105.98480224609375, + "logps/rejected": -123.57877349853516, + "loss": 0.627, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5281702280044556, + "rewards/margins": 0.17074742913246155, + "rewards/rejected": -0.6989176869392395, "step": 7380 }, { - "epoch": 1.27, - "grad_norm": 58.270680983383755, - "learning_rate": 1.755538448173518e-07, - "logits/chosen": -1.2635023593902588, - "logits/rejected": -1.208660364151001, - "logps/chosen": -237.5902557373047, - "logps/rejected": -356.22052001953125, - "loss": 0.4469, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.8537395000457764, - "rewards/margins": 1.17844557762146, - "rewards/rejected": -3.0321850776672363, + "epoch": 1.2732598208132322, + "grad_norm": 10.061474800109863, + "learning_rate": 3.511076896347036e-08, + "logits/chosen": -2.501349687576294, + "logits/rejected": -2.4825596809387207, + "logps/chosen": -104.70169830322266, + "logps/rejected": -120.3403549194336, + "loss": 0.6387, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5248795747756958, + "rewards/margins": 0.1483328640460968, + "rewards/rejected": -0.6732124090194702, "step": 7390 }, { - "epoch": 1.27, - "grad_norm": 19.321168110421592, - "learning_rate": 1.7483649602730987e-07, - "logits/chosen": -1.2944018840789795, - "logits/rejected": -1.2126576900482178, - "logps/chosen": -228.0514373779297, - "logps/rejected": -363.4169006347656, - "loss": 0.3908, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.7260242700576782, - "rewards/margins": 1.4026000499725342, - "rewards/rejected": -3.128624439239502, + "epoch": 1.2749827705031014, + "grad_norm": 8.739763259887695, + "learning_rate": 3.4967299205461974e-08, + "logits/chosen": -2.5027923583984375, + "logits/rejected": -2.4664530754089355, + "logps/chosen": -111.92269134521484, + "logps/rejected": -119.95565032958984, + "loss": 0.6456, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5647163391113281, + "rewards/margins": 0.12913253903388977, + "rewards/rejected": -0.6938489079475403, "step": 7400 }, { - "epoch": 1.27, - "eval_logits/chosen": -1.4501019716262817, - "eval_logits/rejected": -1.423465609550476, - "eval_logps/chosen": -227.0261688232422, - "eval_logps/rejected": -274.0572204589844, - "eval_loss": 0.630664587020874, - "eval_rewards/accuracies": 0.6568308472633362, - "eval_rewards/chosen": -1.683223009109497, - "eval_rewards/margins": 0.4257754683494568, - "eval_rewards/rejected": -2.1089982986450195, - "eval_runtime": 357.4983, - "eval_samples_per_second": 12.039, - "eval_steps_per_second": 1.505, + "epoch": 1.2749827705031014, + "eval_logits/chosen": -2.6409223079681396, + "eval_logits/rejected": -2.634524345397949, + "eval_logps/chosen": -101.84141540527344, + "eval_logps/rejected": -114.98131561279297, + "eval_loss": 0.6600046753883362, + "eval_rewards/accuracies": 0.6124535202980042, + "eval_rewards/chosen": -0.4312951862812042, + "eval_rewards/margins": 0.08671677857637405, + "eval_rewards/rejected": -0.5180119276046753, + "eval_runtime": 360.4764, + "eval_samples_per_second": 11.94, + "eval_steps_per_second": 1.492, "step": 7400 }, { - "epoch": 1.28, - "grad_norm": 45.11638691089592, - "learning_rate": 1.741198269466219e-07, - "logits/chosen": -1.2776044607162476, - "logits/rejected": -1.2083661556243896, - "logps/chosen": -218.230712890625, - "logps/rejected": -343.81890869140625, - "loss": 0.4103, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.6449800729751587, - "rewards/margins": 1.2632148265838623, - "rewards/rejected": -2.9081950187683105, + "epoch": 1.2767057201929704, + "grad_norm": 10.62580394744873, + "learning_rate": 3.482396538932438e-08, + "logits/chosen": -2.479602336883545, + "logits/rejected": -2.4505069255828857, + "logps/chosen": -105.51661682128906, + "logps/rejected": -118.50335693359375, + "loss": 0.6429, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5176024436950684, + "rewards/margins": 0.137213796377182, + "rewards/rejected": -0.6548162698745728, "step": 7410 }, { - "epoch": 1.28, - "grad_norm": 42.030823379925565, - "learning_rate": 1.7340384405618134e-07, - "logits/chosen": -1.2458035945892334, - "logits/rejected": -1.1925244331359863, - "logps/chosen": -207.9458770751953, - "logps/rejected": -318.7289123535156, - "loss": 0.4746, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.5179194211959839, - "rewards/margins": 1.1385493278503418, - "rewards/rejected": -2.6564688682556152, + "epoch": 1.2784286698828393, + "grad_norm": 9.140203475952148, + "learning_rate": 3.4680768811236266e-08, + "logits/chosen": -2.4199485778808594, + "logits/rejected": -2.3975508213043213, + "logps/chosen": -105.53422546386719, + "logps/rejected": -115.3234634399414, + "loss": 0.6455, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4936627447605133, + "rewards/margins": 0.1288955807685852, + "rewards/rejected": -0.6225583553314209, "step": 7420 }, { - "epoch": 1.28, - "grad_norm": 31.232966428180802, - "learning_rate": 1.7268855383067683e-07, - "logits/chosen": -1.2855768203735352, - "logits/rejected": -1.2198007106781006, - "logps/chosen": -232.55398559570312, - "logps/rejected": -353.21661376953125, - "loss": 0.4445, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.7915750741958618, - "rewards/margins": 1.2219661474227905, - "rewards/rejected": -3.0135409832000732, + "epoch": 1.2801516195727085, + "grad_norm": 11.094929695129395, + "learning_rate": 3.4537710766135366e-08, + "logits/chosen": -2.470757246017456, + "logits/rejected": -2.44372296333313, + "logps/chosen": -109.74659729003906, + "logps/rejected": -123.33604431152344, + "loss": 0.6375, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5634235739707947, + "rewards/margins": 0.15116463601589203, + "rewards/rejected": -0.7145882248878479, "step": 7430 }, { - "epoch": 1.28, - "grad_norm": 34.93396652920407, - "learning_rate": 1.7197396273853276e-07, - "logits/chosen": -1.4023360013961792, - "logits/rejected": -1.343386173248291, - "logps/chosen": -240.6739044189453, - "logps/rejected": -337.22491455078125, - "loss": 0.5101, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.8467438220977783, - "rewards/margins": 0.9919818043708801, - "rewards/rejected": -2.8387253284454346, + "epoch": 1.2818745692625775, + "grad_norm": 9.100417137145996, + "learning_rate": 3.439479254770655e-08, + "logits/chosen": -2.5642802715301514, + "logits/rejected": -2.536123752593994, + "logps/chosen": -114.7548828125, + "logps/rejected": -126.5836181640625, + "loss": 0.6416, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5875005722045898, + "rewards/margins": 0.14469286799430847, + "rewards/rejected": -0.7321933507919312, "step": 7440 }, { - "epoch": 1.28, - "grad_norm": 27.19873025751527, - "learning_rate": 1.7126007724185165e-07, - "logits/chosen": -1.5503208637237549, - "logits/rejected": -1.4830740690231323, - "logps/chosen": -199.37518310546875, - "logps/rejected": -305.76214599609375, - "loss": 0.4474, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.4345240592956543, - "rewards/margins": 1.0972298383712769, - "rewards/rejected": -2.5317540168762207, + "epoch": 1.2835975189524467, + "grad_norm": 9.80208969116211, + "learning_rate": 3.425201544837033e-08, + "logits/chosen": -2.6381442546844482, + "logits/rejected": -2.6124885082244873, + "logps/chosen": -109.50288391113281, + "logps/rejected": -119.41923522949219, + "loss": 0.6478, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.535770833492279, + "rewards/margins": 0.13235479593276978, + "rewards/rejected": -0.6681256294250488, "step": 7450 }, { - "epoch": 1.29, - "grad_norm": 27.77498676919186, - "learning_rate": 1.7054690379635477e-07, - "logits/chosen": -1.3472172021865845, - "logits/rejected": -1.3040322065353394, - "logps/chosen": -191.5806427001953, - "logps/rejected": -319.0374755859375, - "loss": 0.4022, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.4145309925079346, - "rewards/margins": 1.2461668252944946, - "rewards/rejected": -2.6606979370117188, + "epoch": 1.2853204686423156, + "grad_norm": 9.576980590820312, + "learning_rate": 3.410938075927096e-08, + "logits/chosen": -2.4544360637664795, + "logits/rejected": -2.457080841064453, + "logps/chosen": -101.24821472167969, + "logps/rejected": -119.45255279541016, + "loss": 0.6353, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5108596682548523, + "rewards/margins": 0.15377506613731384, + "rewards/rejected": -0.6646348237991333, "step": 7460 }, { - "epoch": 1.29, - "grad_norm": 33.696980703475546, - "learning_rate": 1.698344488513247e-07, - "logits/chosen": -1.4441345930099487, - "logits/rejected": -1.3981006145477295, - "logps/chosen": -196.69949340820312, - "logps/rejected": -293.2108459472656, - "loss": 0.4734, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.42051100730896, - "rewards/margins": 0.998548150062561, - "rewards/rejected": -2.4190590381622314, + "epoch": 1.2870434183321846, + "grad_norm": 10.5313720703125, + "learning_rate": 3.396688977026494e-08, + "logits/chosen": -2.5422723293304443, + "logits/rejected": -2.5268795490264893, + "logps/chosen": -106.96131896972656, + "logps/rejected": -115.691650390625, + "loss": 0.6525, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5230668783187866, + "rewards/margins": 0.12071572244167328, + "rewards/rejected": -0.6437825560569763, "step": 7470 }, { - "epoch": 1.29, - "grad_norm": 35.61398182570765, - "learning_rate": 1.691227188495461e-07, - "logits/chosen": -1.3656269311904907, - "logits/rejected": -1.313783884048462, - "logps/chosen": -199.72702026367188, - "logps/rejected": -283.9599609375, - "loss": 0.5024, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.4509938955307007, - "rewards/margins": 0.8966015577316284, - "rewards/rejected": -2.3475959300994873, + "epoch": 1.2887663680220538, + "grad_norm": 9.148719787597656, + "learning_rate": 3.382454376990922e-08, + "logits/chosen": -2.500944137573242, + "logits/rejected": -2.475217580795288, + "logps/chosen": -105.0721206665039, + "logps/rejected": -112.06779479980469, + "loss": 0.6535, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.5043980479240417, + "rewards/margins": 0.12430262565612793, + "rewards/rejected": -0.6287007331848145, "step": 7480 }, { - "epoch": 1.29, - "grad_norm": 27.73975502601125, - "learning_rate": 1.684117202272485e-07, - "logits/chosen": -1.3349004983901978, - "logits/rejected": -1.287638545036316, - "logps/chosen": -206.85690307617188, - "logps/rejected": -317.2720031738281, - "loss": 0.4389, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.5430989265441895, - "rewards/margins": 1.098320484161377, - "rewards/rejected": -2.6414191722869873, + "epoch": 1.2904893177119228, + "grad_norm": 9.421043395996094, + "learning_rate": 3.36823440454497e-08, + "logits/chosen": -2.4685792922973633, + "logits/rejected": -2.4556145668029785, + "logps/chosen": -106.70912170410156, + "logps/rejected": -123.2634048461914, + "loss": 0.629, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5414863228797913, + "rewards/margins": 0.15967078506946564, + "rewards/rejected": -0.7011570334434509, "step": 7490 }, { - "epoch": 1.29, - "grad_norm": 28.229250472829, - "learning_rate": 1.6770145941404696e-07, - "logits/chosen": -1.3574326038360596, - "logits/rejected": -1.2926125526428223, - "logps/chosen": -197.47386169433594, - "logps/rejected": -314.25897216796875, - "loss": 0.4618, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.4602476358413696, - "rewards/margins": 1.162480115890503, - "rewards/rejected": -2.622727632522583, + "epoch": 1.292212267401792, + "grad_norm": 8.091127395629883, + "learning_rate": 3.3540291882809394e-08, + "logits/chosen": -2.4892683029174805, + "logits/rejected": -2.465965509414673, + "logps/chosen": -104.45188903808594, + "logps/rejected": -118.73978424072266, + "loss": 0.6455, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5297764539718628, + "rewards/margins": 0.1378394067287445, + "rewards/rejected": -0.6676157712936401, "step": 7500 }, { - "epoch": 1.29, - "eval_logits/chosen": -1.5060161352157593, - "eval_logits/rejected": -1.481170654296875, - "eval_logps/chosen": -211.69110107421875, - "eval_logps/rejected": -254.75897216796875, - "eval_loss": 0.627605676651001, - "eval_rewards/accuracies": 0.6531133651733398, - "eval_rewards/chosen": -1.5298728942871094, - "eval_rewards/margins": 0.38614320755004883, - "eval_rewards/rejected": -1.916015863418579, - "eval_runtime": 357.4881, - "eval_samples_per_second": 12.04, - "eval_steps_per_second": 1.505, + "epoch": 1.292212267401792, + "eval_logits/chosen": -2.635746955871582, + "eval_logits/rejected": -2.629236936569214, + "eval_logps/chosen": -101.78617095947266, + "eval_logps/rejected": -114.98066711425781, + "eval_loss": 0.6597474813461304, + "eval_rewards/accuracies": 0.6147769689559937, + "eval_rewards/chosen": -0.4307427406311035, + "eval_rewards/margins": 0.0872626081109047, + "eval_rewards/rejected": -0.51800537109375, + "eval_runtime": 359.6591, + "eval_samples_per_second": 11.967, + "eval_steps_per_second": 1.496, "step": 7500 }, { - "epoch": 1.29, - "grad_norm": 29.587971860400398, - "learning_rate": 1.669919428328847e-07, - "logits/chosen": -1.394683599472046, - "logits/rejected": -1.3351951837539673, - "logps/chosen": -214.37551879882812, - "logps/rejected": -310.74652099609375, - "loss": 0.4476, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.5710610151290894, - "rewards/margins": 1.0244272947311401, - "rewards/rejected": -2.5954883098602295, + "epoch": 1.293935217091661, + "grad_norm": 10.538162231445312, + "learning_rate": 3.339838856657694e-08, + "logits/chosen": -2.5177297592163086, + "logits/rejected": -2.4895987510681152, + "logps/chosen": -113.8686752319336, + "logps/rejected": -116.67778015136719, + "loss": 0.6629, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5660760998725891, + "rewards/margins": 0.08849167078733444, + "rewards/rejected": -0.6545677185058594, "step": 7510 }, { - "epoch": 1.3, - "grad_norm": 25.2120659878252, - "learning_rate": 1.6628317689997498e-07, - "logits/chosen": -1.3550820350646973, - "logits/rejected": -1.3013880252838135, - "logps/chosen": -199.8297119140625, - "logps/rejected": -322.51141357421875, - "loss": 0.4, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -1.4692803621292114, - "rewards/margins": 1.217524766921997, - "rewards/rejected": -2.686805248260498, + "epoch": 1.29565816678153, + "grad_norm": 10.612116813659668, + "learning_rate": 3.3256635379995e-08, + "logits/chosen": -2.4990150928497314, + "logits/rejected": -2.479405641555786, + "logps/chosen": -103.64764404296875, + "logps/rejected": -122.71256256103516, + "loss": 0.6233, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5071987509727478, + "rewards/margins": 0.18135200440883636, + "rewards/rejected": -0.6885508298873901, "step": 7520 }, { - "epoch": 1.3, - "grad_norm": 17.580126618538177, - "learning_rate": 1.6557516802474247e-07, - "logits/chosen": -1.2875080108642578, - "logits/rejected": -1.237430453300476, - "logps/chosen": -204.1475830078125, - "logps/rejected": -334.02349853515625, - "loss": 0.4147, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.5195860862731934, - "rewards/margins": 1.2862619161605835, - "rewards/rejected": -2.8058478832244873, + "epoch": 1.297381116471399, + "grad_norm": 9.325216293334961, + "learning_rate": 3.311503360494849e-08, + "logits/chosen": -2.437972068786621, + "logits/rejected": -2.434144973754883, + "logps/chosen": -103.4576187133789, + "logps/rejected": -120.95352935791016, + "loss": 0.6315, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5124028921127319, + "rewards/margins": 0.16277670860290527, + "rewards/rejected": -0.6751796007156372, "step": 7530 }, { - "epoch": 1.3, - "grad_norm": 24.405271336303738, - "learning_rate": 1.6486792260976618e-07, - "logits/chosen": -1.4056943655014038, - "logits/rejected": -1.3522775173187256, - "logps/chosen": -208.80911254882812, - "logps/rejected": -347.8929138183594, - "loss": 0.3834, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.5729515552520752, - "rewards/margins": 1.3793232440948486, - "rewards/rejected": -2.952274799346924, + "epoch": 1.299104066161268, + "grad_norm": 8.740460395812988, + "learning_rate": 3.297358452195324e-08, + "logits/chosen": -2.5994045734405518, + "logits/rejected": -2.592708110809326, + "logps/chosen": -99.10334014892578, + "logps/rejected": -118.64372253417969, + "loss": 0.6199, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.47562798857688904, + "rewards/margins": 0.18405228853225708, + "rewards/rejected": -0.6596802473068237, "step": 7540 }, { - "epoch": 1.3, - "grad_norm": 29.108508549211408, - "learning_rate": 1.6416144705072072e-07, - "logits/chosen": -1.2879887819290161, - "logits/rejected": -1.2317047119140625, - "logps/chosen": -233.01010131835938, - "logps/rejected": -367.2439270019531, - "loss": 0.4572, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.8120092153549194, - "rewards/margins": 1.3423044681549072, - "rewards/rejected": -3.154313564300537, + "epoch": 1.3008270158511372, + "grad_norm": 8.941034317016602, + "learning_rate": 3.283228941014414e-08, + "logits/chosen": -2.51711106300354, + "logits/rejected": -2.498652935028076, + "logps/chosen": -102.238525390625, + "logps/rejected": -120.69661712646484, + "loss": 0.6221, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5044776797294617, + "rewards/margins": 0.18435628712177277, + "rewards/rejected": -0.688834011554718, "step": 7550 }, { - "epoch": 1.3, - "grad_norm": 42.99955943761263, - "learning_rate": 1.6345574773631898e-07, - "logits/chosen": -1.388718843460083, - "logits/rejected": -1.3253867626190186, - "logps/chosen": -227.2410888671875, - "logps/rejected": -355.32391357421875, - "loss": 0.4386, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.7468681335449219, - "rewards/margins": 1.2529178857803345, - "rewards/rejected": -2.999785900115967, + "epoch": 1.3025499655410062, + "grad_norm": 9.605469703674316, + "learning_rate": 3.2691149547263794e-08, + "logits/chosen": -2.6023619174957275, + "logits/rejected": -2.582151412963867, + "logps/chosen": -99.77006530761719, + "logps/rejected": -122.7721939086914, + "loss": 0.6113, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4719509482383728, + "rewards/margins": 0.20214197039604187, + "rewards/rejected": -0.6740928888320923, "step": 7560 }, { - "epoch": 1.3, - "grad_norm": 38.1555696234109, - "learning_rate": 1.6275083104825414e-07, - "logits/chosen": -1.3410319089889526, - "logits/rejected": -1.2803980112075806, - "logps/chosen": -247.3603515625, - "logps/rejected": -375.22528076171875, - "loss": 0.4186, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.9059759378433228, - "rewards/margins": 1.3241702318191528, - "rewards/rejected": -3.2301464080810547, + "epoch": 1.3042729152308752, + "grad_norm": 10.741072654724121, + "learning_rate": 3.255016620965082e-08, + "logits/chosen": -2.588242292404175, + "logits/rejected": -2.5711989402770996, + "logps/chosen": -111.654052734375, + "logps/rejected": -124.45565032958984, + "loss": 0.6266, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5485700368881226, + "rewards/margins": 0.17345201969146729, + "rewards/rejected": -0.7220219969749451, "step": 7570 }, { - "epoch": 1.31, - "grad_norm": 41.061661152022474, - "learning_rate": 1.6204670336114224e-07, - "logits/chosen": -1.2776286602020264, - "logits/rejected": -1.2231152057647705, - "logps/chosen": -242.25015258789062, - "logps/rejected": -361.8536682128906, - "loss": 0.4535, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.8624213933944702, - "rewards/margins": 1.2097980976104736, - "rewards/rejected": -3.0722193717956543, + "epoch": 1.3059958649207444, + "grad_norm": 10.163570404052734, + "learning_rate": 3.240934067222845e-08, + "logits/chosen": -2.539912223815918, + "logits/rejected": -2.5225236415863037, + "logps/chosen": -103.22077941894531, + "logps/rejected": -120.7096939086914, + "loss": 0.6187, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.47207051515579224, + "rewards/margins": 0.1884404718875885, + "rewards/rejected": -0.6605108976364136, "step": 7580 }, { - "epoch": 1.31, - "grad_norm": 19.57994826653458, - "learning_rate": 1.6134337104246395e-07, - "logits/chosen": -1.3166749477386475, - "logits/rejected": -1.225110650062561, - "logps/chosen": -244.9367218017578, - "logps/rejected": -403.8605041503906, - "loss": 0.3392, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -1.8664249181747437, - "rewards/margins": 1.658355474472046, - "rewards/rejected": -3.524780750274658, + "epoch": 1.3077188146106133, + "grad_norm": 9.670416831970215, + "learning_rate": 3.226867420849279e-08, + "logits/chosen": -2.5681509971618652, + "logits/rejected": -2.526566505432129, + "logps/chosen": -115.42330169677734, + "logps/rejected": -125.2770004272461, + "loss": 0.6329, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5710044503211975, + "rewards/margins": 0.1676451563835144, + "rewards/rejected": -0.7386496067047119, "step": 7590 }, { - "epoch": 1.31, - "grad_norm": 46.40887691060314, - "learning_rate": 1.6064084045250786e-07, - "logits/chosen": -1.3110687732696533, - "logits/rejected": -1.2509706020355225, - "logps/chosen": -266.95465087890625, - "logps/rejected": -382.99432373046875, - "loss": 0.5019, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.144411563873291, - "rewards/margins": 1.1999857425689697, - "rewards/rejected": -3.3443970680236816, + "epoch": 1.3094417643004825, + "grad_norm": 11.190610885620117, + "learning_rate": 3.2128168090501575e-08, + "logits/chosen": -2.5398783683776855, + "logits/rejected": -2.513364791870117, + "logps/chosen": -111.20823669433594, + "logps/rejected": -114.00135803222656, + "loss": 0.6762, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.5867103934288025, + "rewards/margins": 0.06733911484479904, + "rewards/rejected": -0.6540495157241821, "step": 7600 }, { - "epoch": 1.31, - "eval_logits/chosen": -1.4277472496032715, - "eval_logits/rejected": -1.4007766246795654, - "eval_logps/chosen": -242.9215087890625, - "eval_logps/rejected": -292.66485595703125, - "eval_loss": 0.6300765872001648, - "eval_rewards/accuracies": 0.6624070405960083, - "eval_rewards/chosen": -1.8421767950057983, - "eval_rewards/margins": 0.4528978765010834, - "eval_rewards/rejected": -2.295074462890625, - "eval_runtime": 357.4474, - "eval_samples_per_second": 12.041, - "eval_steps_per_second": 1.505, + "epoch": 1.3094417643004825, + "eval_logits/chosen": -2.62813401222229, + "eval_logits/rejected": -2.621640682220459, + "eval_logps/chosen": -102.6287612915039, + "eval_logps/rejected": -115.96487426757812, + "eval_loss": 0.6593355536460876, + "eval_rewards/accuracies": 0.6117565035820007, + "eval_rewards/chosen": -0.4391685724258423, + "eval_rewards/margins": 0.08867882192134857, + "eval_rewards/rejected": -0.5278474688529968, + "eval_runtime": 359.5884, + "eval_samples_per_second": 11.969, + "eval_steps_per_second": 1.496, "step": 7600 }, { - "epoch": 1.31, - "grad_norm": 27.39761729744123, - "learning_rate": 1.5993911794431197e-07, - "logits/chosen": -1.3395607471466064, - "logits/rejected": -1.275943398475647, - "logps/chosen": -216.5625457763672, - "logps/rejected": -342.125244140625, - "loss": 0.4343, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.63640558719635, - "rewards/margins": 1.2829220294952393, - "rewards/rejected": -2.9193274974823, + "epoch": 1.3111647139903515, + "grad_norm": 7.892940998077393, + "learning_rate": 3.1987823588862395e-08, + "logits/chosen": -2.494873285293579, + "logits/rejected": -2.4665284156799316, + "logps/chosen": -105.79533386230469, + "logps/rejected": -118.2354965209961, + "loss": 0.6389, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5288399457931519, + "rewards/margins": 0.1512880027294159, + "rewards/rejected": -0.6801279783248901, "step": 7610 }, { - "epoch": 1.31, - "grad_norm": 28.677466981159814, - "learning_rate": 1.5923820986360703e-07, - "logits/chosen": -1.4301960468292236, - "logits/rejected": -1.3750264644622803, - "logps/chosen": -204.70155334472656, - "logps/rejected": -306.2242431640625, - "loss": 0.4617, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.4719994068145752, - "rewards/margins": 1.0525176525115967, - "rewards/rejected": -2.524517059326172, + "epoch": 1.3128876636802205, + "grad_norm": 9.36888599395752, + "learning_rate": 3.18476419727214e-08, + "logits/chosen": -2.5487782955169678, + "logits/rejected": -2.52319073677063, + "logps/chosen": -107.84880065917969, + "logps/rejected": -116.66322326660156, + "loss": 0.6477, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5033544301986694, + "rewards/margins": 0.1253475844860077, + "rewards/rejected": -0.6287019848823547, "step": 7620 }, { - "epoch": 1.31, - "grad_norm": 30.824980394711297, - "learning_rate": 1.585381225487588e-07, - "logits/chosen": -1.3620095252990723, - "logits/rejected": -1.3211814165115356, - "logps/chosen": -199.6609344482422, - "logps/rejected": -317.6810607910156, - "loss": 0.4326, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.49845290184021, - "rewards/margins": 1.1381967067718506, - "rewards/rejected": -2.6366496086120605, + "epoch": 1.3146106133700897, + "grad_norm": 11.694605827331543, + "learning_rate": 3.1707624509751754e-08, + "logits/chosen": -2.480271339416504, + "logits/rejected": -2.4887309074401855, + "logps/chosen": -99.38700866699219, + "logps/rejected": -120.13069152832031, + "loss": 0.6284, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.495540052652359, + "rewards/margins": 0.16540366411209106, + "rewards/rejected": -0.6609436869621277, "step": 7630 }, { - "epoch": 1.32, - "grad_norm": 40.129932359839145, - "learning_rate": 1.5783886233071074e-07, - "logits/chosen": -1.281798243522644, - "logits/rejected": -1.2172118425369263, - "logps/chosen": -226.0537872314453, - "logps/rejected": -354.72576904296875, - "loss": 0.423, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.6815427541732788, - "rewards/margins": 1.3418070077896118, - "rewards/rejected": -3.0233497619628906, + "epoch": 1.3163335630599586, + "grad_norm": 13.094701766967773, + "learning_rate": 3.156777246614215e-08, + "logits/chosen": -2.44416880607605, + "logits/rejected": -2.4228973388671875, + "logps/chosen": -111.28560638427734, + "logps/rejected": -125.09645080566406, + "loss": 0.6179, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5336799025535583, + "rewards/margins": 0.19319351017475128, + "rewards/rejected": -0.7268733978271484, "step": 7640 }, { - "epoch": 1.32, - "grad_norm": 34.0153156124325, - "learning_rate": 1.5714043553292683e-07, - "logits/chosen": -1.3627344369888306, - "logits/rejected": -1.304088830947876, - "logps/chosen": -245.76119995117188, - "logps/rejected": -363.3336486816406, - "loss": 0.4816, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.8740276098251343, - "rewards/margins": 1.2018343210220337, - "rewards/rejected": -3.075861692428589, + "epoch": 1.3180565127498278, + "grad_norm": 8.995244026184082, + "learning_rate": 3.1428087106585365e-08, + "logits/chosen": -2.565865993499756, + "logits/rejected": -2.5419039726257324, + "logps/chosen": -116.0489273071289, + "logps/rejected": -128.86428833007812, + "loss": 0.6364, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5765261650085449, + "rewards/margins": 0.15430589020252228, + "rewards/rejected": -0.730832040309906, "step": 7650 }, { - "epoch": 1.32, - "grad_norm": 27.126137701003756, - "learning_rate": 1.564428484713345e-07, - "logits/chosen": -1.36992609500885, - "logits/rejected": -1.2936866283416748, - "logps/chosen": -223.11538696289062, - "logps/rejected": -359.9754333496094, - "loss": 0.3757, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.6609264612197876, - "rewards/margins": 1.3914331197738647, - "rewards/rejected": -3.0523598194122314, + "epoch": 1.3197794624396968, + "grad_norm": 10.541084289550781, + "learning_rate": 3.12885696942669e-08, + "logits/chosen": -2.5416741371154785, + "logits/rejected": -2.5064024925231934, + "logps/chosen": -111.7835922241211, + "logps/rejected": -125.22023010253906, + "loss": 0.635, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5475324392318726, + "rewards/margins": 0.1569490134716034, + "rewards/rejected": -0.7044814825057983, "step": 7660 }, { - "epoch": 1.32, - "grad_norm": 34.38929359333108, - "learning_rate": 1.5574610745426704e-07, - "logits/chosen": -1.3428263664245605, - "logits/rejected": -1.283569097518921, - "logps/chosen": -209.7373046875, - "logps/rejected": -315.8505859375, - "loss": 0.5005, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.5220474004745483, - "rewards/margins": 1.107168436050415, - "rewards/rejected": -2.629215955734253, + "epoch": 1.3215024121295658, + "grad_norm": 12.941953659057617, + "learning_rate": 3.114922149085341e-08, + "logits/chosen": -2.492443799972534, + "logits/rejected": -2.467200994491577, + "logps/chosen": -110.19084167480469, + "logps/rejected": -119.7401123046875, + "loss": 0.6406, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.526465117931366, + "rewards/margins": 0.14155793190002441, + "rewards/rejected": -0.6680231690406799, "step": 7670 }, { - "epoch": 1.32, - "grad_norm": 26.0642265781016, - "learning_rate": 1.5505021878240732e-07, - "logits/chosen": -1.413971185684204, - "logits/rejected": -1.3582481145858765, - "logps/chosen": -209.92529296875, - "logps/rejected": -324.2261047363281, - "loss": 0.4226, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.5928831100463867, - "rewards/margins": 1.1526020765304565, - "rewards/rejected": -2.745485305786133, + "epoch": 1.323225361819435, + "grad_norm": 8.414935111999512, + "learning_rate": 3.101004375648146e-08, + "logits/chosen": -2.5556654930114746, + "logits/rejected": -2.5384323596954346, + "logps/chosen": -103.7856674194336, + "logps/rejected": -119.1416244506836, + "loss": 0.6285, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5314931869506836, + "rewards/margins": 0.1628451645374298, + "rewards/rejected": -0.6943383812904358, "step": 7680 }, { - "epoch": 1.32, - "grad_norm": 21.73140455009811, - "learning_rate": 1.543551887487301e-07, - "logits/chosen": -1.5031044483184814, - "logits/rejected": -1.4207048416137695, - "logps/chosen": -188.10305786132812, - "logps/rejected": -301.28973388671875, - "loss": 0.3942, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.3312842845916748, - "rewards/margins": 1.1913520097732544, - "rewards/rejected": -2.5226359367370605, + "epoch": 1.324948311509304, + "grad_norm": 8.739034652709961, + "learning_rate": 3.087103774974602e-08, + "logits/chosen": -2.64532732963562, + "logits/rejected": -2.6009573936462402, + "logps/chosen": -101.80657958984375, + "logps/rejected": -114.46602630615234, + "loss": 0.6212, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4682212471961975, + "rewards/margins": 0.18588753044605255, + "rewards/rejected": -0.6541087031364441, "step": 7690 }, { - "epoch": 1.33, - "grad_norm": 21.539140223042764, - "learning_rate": 1.5366102363844552e-07, - "logits/chosen": -1.389103889465332, - "logits/rejected": -1.3187581300735474, - "logps/chosen": -202.11195373535156, - "logps/rejected": -320.7894592285156, - "loss": 0.4239, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.4840246438980103, - "rewards/margins": 1.1871830224990845, - "rewards/rejected": -2.6712074279785156, + "epoch": 1.3266712611991731, + "grad_norm": 8.63172435760498, + "learning_rate": 3.07322047276891e-08, + "logits/chosen": -2.512674570083618, + "logits/rejected": -2.48405385017395, + "logps/chosen": -105.63853454589844, + "logps/rejected": -120.13700866699219, + "loss": 0.6365, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5192811489105225, + "rewards/margins": 0.14506670832633972, + "rewards/rejected": -0.6643478274345398, "step": 7700 }, { - "epoch": 1.33, - "eval_logits/chosen": -1.4800820350646973, - "eval_logits/rejected": -1.4540327787399292, - "eval_logps/chosen": -219.68116760253906, - "eval_logps/rejected": -265.55712890625, - "eval_loss": 0.6266021728515625, - "eval_rewards/accuracies": 0.663336455821991, - "eval_rewards/chosen": -1.6097729206085205, - "eval_rewards/margins": 0.414224237203598, - "eval_rewards/rejected": -2.0239975452423096, - "eval_runtime": 357.1774, - "eval_samples_per_second": 12.05, - "eval_steps_per_second": 1.506, + "epoch": 1.3266712611991731, + "eval_logits/chosen": -2.6237244606018066, + "eval_logits/rejected": -2.617231845855713, + "eval_logps/chosen": -102.73433685302734, + "eval_logps/rejected": -116.12883758544922, + "eval_loss": 0.659196138381958, + "eval_rewards/accuracies": 0.6157063245773315, + "eval_rewards/chosen": -0.44022446870803833, + "eval_rewards/margins": 0.08926267176866531, + "eval_rewards/rejected": -0.529487133026123, + "eval_runtime": 359.8583, + "eval_samples_per_second": 11.96, + "eval_steps_per_second": 1.495, "step": 7700 }, { - "epoch": 1.33, - "grad_norm": 30.088144432586212, - "learning_rate": 1.5296772972894212e-07, - "logits/chosen": -1.4096615314483643, - "logits/rejected": -1.3569542169570923, - "logps/chosen": -206.3859100341797, - "logps/rejected": -320.184326171875, - "loss": 0.4, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -1.5484488010406494, - "rewards/margins": 1.1439803838729858, - "rewards/rejected": -2.6924290657043457, + "epoch": 1.328394210889042, + "grad_norm": 9.414764404296875, + "learning_rate": 3.0593545945788426e-08, + "logits/chosen": -2.5537688732147217, + "logits/rejected": -2.5392956733703613, + "logps/chosen": -111.46722412109375, + "logps/rejected": -122.5143814086914, + "loss": 0.6532, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5989468693733215, + "rewards/margins": 0.11693338304758072, + "rewards/rejected": -0.7158802151679993, "step": 7710 }, { - "epoch": 1.33, - "grad_norm": 40.080885647002745, - "learning_rate": 1.5227531328972995e-07, - "logits/chosen": -1.3759911060333252, - "logits/rejected": -1.3137165307998657, - "logps/chosen": -219.1600799560547, - "logps/rejected": -328.873046875, - "loss": 0.4476, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.6303611993789673, - "rewards/margins": 1.1502439975738525, - "rewards/rejected": -2.7806053161621094, + "epoch": 1.330117160578911, + "grad_norm": 10.093575477600098, + "learning_rate": 3.045506265794599e-08, + "logits/chosen": -2.5286622047424316, + "logits/rejected": -2.496185064315796, + "logps/chosen": -113.056884765625, + "logps/rejected": -122.25138092041016, + "loss": 0.6378, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5691434144973755, + "rewards/margins": 0.14525368809700012, + "rewards/rejected": -0.714397132396698, "step": 7720 }, { - "epoch": 1.33, - "grad_norm": 35.80033306298759, - "learning_rate": 1.5158378058238442e-07, - "logits/chosen": -1.3037515878677368, - "logits/rejected": -1.245792031288147, - "logps/chosen": -219.6901397705078, - "logps/rejected": -339.2084655761719, - "loss": 0.4158, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -1.657705545425415, - "rewards/margins": 1.2117184400558472, - "rewards/rejected": -2.8694233894348145, + "epoch": 1.33184011026878, + "grad_norm": 9.999787330627441, + "learning_rate": 3.0316756116476885e-08, + "logits/chosen": -2.489351749420166, + "logits/rejected": -2.4682085514068604, + "logps/chosen": -105.78633880615234, + "logps/rejected": -120.046875, + "loss": 0.6301, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.518639862537384, + "rewards/margins": 0.15913304686546326, + "rewards/rejected": -0.6777728796005249, "step": 7730 }, { - "epoch": 1.33, - "grad_norm": 33.772757774329904, - "learning_rate": 1.5089313786048885e-07, - "logits/chosen": -1.282684564590454, - "logits/rejected": -1.222401738166809, - "logps/chosen": -228.92276000976562, - "logps/rejected": -378.69622802734375, - "loss": 0.3819, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.7724063396453857, - "rewards/margins": 1.4769179821014404, - "rewards/rejected": -3.2493247985839844, + "epoch": 1.3335630599586492, + "grad_norm": 9.254657745361328, + "learning_rate": 3.017862757209777e-08, + "logits/chosen": -2.481689453125, + "logits/rejected": -2.4662909507751465, + "logps/chosen": -104.68952941894531, + "logps/rejected": -126.27779388427734, + "loss": 0.6176, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5298460721969604, + "rewards/margins": 0.19512736797332764, + "rewards/rejected": -0.7249733805656433, "step": 7740 }, { - "epoch": 1.34, - "grad_norm": 28.25568491451203, - "learning_rate": 1.5020339136957877e-07, - "logits/chosen": -1.3118457794189453, - "logits/rejected": -1.2334351539611816, - "logps/chosen": -243.1823272705078, - "logps/rejected": -393.16705322265625, - "loss": 0.3772, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.892716407775879, - "rewards/margins": 1.5171799659729004, - "rewards/rejected": -3.4098963737487793, + "epoch": 1.3352860096485184, + "grad_norm": 9.561819076538086, + "learning_rate": 3.004067827391575e-08, + "logits/chosen": -2.4999566078186035, + "logits/rejected": -2.4698214530944824, + "logps/chosen": -110.5082778930664, + "logps/rejected": -127.4419174194336, + "loss": 0.6253, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5656547546386719, + "rewards/margins": 0.18680861592292786, + "rewards/rejected": -0.7524634599685669, "step": 7750 }, { - "epoch": 1.34, - "grad_norm": 34.5208633023336, - "learning_rate": 1.4951454734708458e-07, - "logits/chosen": -1.2015626430511475, - "logits/rejected": -1.1355557441711426, - "logps/chosen": -221.8160400390625, - "logps/rejected": -374.62139892578125, - "loss": 0.3608, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.7132511138916016, - "rewards/margins": 1.516405701637268, - "rewards/rejected": -3.229656934738159, + "epoch": 1.3370089593383874, + "grad_norm": 8.614842414855957, + "learning_rate": 2.990290946941691e-08, + "logits/chosen": -2.4035215377807617, + "logits/rejected": -2.3857004642486572, + "logps/chosen": -103.11820220947266, + "logps/rejected": -124.1723861694336, + "loss": 0.6166, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.526114821434021, + "rewards/margins": 0.19871024787425995, + "rewards/rejected": -0.7248250246047974, "step": 7760 }, { - "epoch": 1.34, - "grad_norm": 32.49208419213271, - "learning_rate": 1.4882661202227597e-07, - "logits/chosen": -1.256168007850647, - "logits/rejected": -1.1954753398895264, - "logps/chosen": -244.3036346435547, - "logps/rejected": -356.46368408203125, - "loss": 0.4625, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.9218246936798096, - "rewards/margins": 1.1179141998291016, - "rewards/rejected": -3.039738655090332, + "epoch": 1.3387319090282563, + "grad_norm": 9.305093765258789, + "learning_rate": 2.9765322404455194e-08, + "logits/chosen": -2.4598212242126465, + "logits/rejected": -2.4356131553649902, + "logps/chosen": -104.67939758300781, + "logps/rejected": -120.8833236694336, + "loss": 0.6307, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5256319642066956, + "rewards/margins": 0.15807506442070007, + "rewards/rejected": -0.683707058429718, "step": 7770 }, { - "epoch": 1.34, - "grad_norm": 33.49580205257945, - "learning_rate": 1.48139591616205e-07, - "logits/chosen": -1.3774831295013428, - "logits/rejected": -1.3217271566390991, - "logps/chosen": -245.13168334960938, - "logps/rejected": -392.8816223144531, - "loss": 0.3819, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.9360601902008057, - "rewards/margins": 1.4542224407196045, - "rewards/rejected": -3.390282392501831, + "epoch": 1.3404548587181253, + "grad_norm": 10.675363540649414, + "learning_rate": 2.9627918323241004e-08, + "logits/chosen": -2.6021933555603027, + "logits/rejected": -2.58845853805542, + "logps/chosen": -109.0901107788086, + "logps/rejected": -123.86561584472656, + "loss": 0.6522, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5755112767219543, + "rewards/margins": 0.12428200244903564, + "rewards/rejected": -0.69979327917099, "step": 7780 }, { - "epoch": 1.34, - "grad_norm": 31.546470528457153, - "learning_rate": 1.4745349234165016e-07, - "logits/chosen": -1.318555235862732, - "logits/rejected": -1.2555335760116577, - "logps/chosen": -245.47787475585938, - "logps/rejected": -402.24993896484375, - "loss": 0.36, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.9392179250717163, - "rewards/margins": 1.5643069744110107, - "rewards/rejected": -3.5035252571105957, + "epoch": 1.3421778084079945, + "grad_norm": 11.858165740966797, + "learning_rate": 2.9490698468330034e-08, + "logits/chosen": -2.536799192428589, + "logits/rejected": -2.523071765899658, + "logps/chosen": -106.40971374511719, + "logps/rejected": -124.10882568359375, + "loss": 0.6297, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5483931303024292, + "rewards/margins": 0.17372551560401917, + "rewards/rejected": -0.7221185564994812, "step": 7790 }, { - "epoch": 1.34, - "grad_norm": 33.94781782883591, - "learning_rate": 1.4676832040305984e-07, - "logits/chosen": -1.3638694286346436, - "logits/rejected": -1.3124583959579468, - "logps/chosen": -240.8196258544922, - "logps/rejected": -377.0190124511719, - "loss": 0.4156, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.868431806564331, - "rewards/margins": 1.3547109365463257, - "rewards/rejected": -3.223142623901367, + "epoch": 1.3439007580978635, + "grad_norm": 9.725826263427734, + "learning_rate": 2.9353664080611968e-08, + "logits/chosen": -2.563347339630127, + "logits/rejected": -2.557152032852173, + "logps/chosen": -110.21385192871094, + "logps/rejected": -129.93624877929688, + "loss": 0.6211, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.562333881855011, + "rewards/margins": 0.18971523642539978, + "rewards/rejected": -0.7520490884780884, "step": 7800 }, { - "epoch": 1.34, - "eval_logits/chosen": -1.3900244235992432, - "eval_logits/rejected": -1.3618897199630737, - "eval_logps/chosen": -258.3906555175781, - "eval_logps/rejected": -311.4806823730469, - "eval_loss": 0.6327470541000366, - "eval_rewards/accuracies": 0.6638011336326599, - "eval_rewards/chosen": -1.9968681335449219, - "eval_rewards/margins": 0.4863649308681488, - "eval_rewards/rejected": -2.4832329750061035, - "eval_runtime": 355.9596, - "eval_samples_per_second": 12.091, - "eval_steps_per_second": 1.511, + "epoch": 1.3439007580978635, + "eval_logits/chosen": -2.6180360317230225, + "eval_logits/rejected": -2.6114888191223145, + "eval_logps/chosen": -103.54806518554688, + "eval_logps/rejected": -117.0740737915039, + "eval_loss": 0.658841073513031, + "eval_rewards/accuracies": 0.6194238066673279, + "eval_rewards/chosen": -0.4483616352081299, + "eval_rewards/margins": 0.0905778631567955, + "eval_rewards/rejected": -0.5389395952224731, + "eval_runtime": 359.6595, + "eval_samples_per_second": 11.967, + "eval_steps_per_second": 1.496, "step": 7800 }, { - "epoch": 1.35, - "grad_norm": 53.82974487314594, - "learning_rate": 1.4608408199649686e-07, - "logits/chosen": -1.3559496402740479, - "logits/rejected": -1.285172462463379, - "logps/chosen": -246.6414031982422, - "logps/rejected": -362.58270263671875, - "loss": 0.4629, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.8763376474380493, - "rewards/margins": 1.2023292779922485, - "rewards/rejected": -3.078667163848877, + "epoch": 1.3456237077877327, + "grad_norm": 10.2208890914917, + "learning_rate": 2.9216816399299372e-08, + "logits/chosen": -2.583174228668213, + "logits/rejected": -2.555354595184326, + "logps/chosen": -111.4613265991211, + "logps/rejected": -122.44720458984375, + "loss": 0.6357, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5244373083114624, + "rewards/margins": 0.15247417986392975, + "rewards/rejected": -0.6769115328788757, "step": 7810 }, { - "epoch": 1.35, - "grad_norm": 40.516669727296595, - "learning_rate": 1.4540078330958167e-07, - "logits/chosen": -1.336315393447876, - "logits/rejected": -1.2665674686431885, - "logps/chosen": -243.50302124023438, - "logps/rejected": -392.2691650390625, - "loss": 0.4179, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.8713070154190063, - "rewards/margins": 1.4982094764709473, - "rewards/rejected": -3.369516372680664, + "epoch": 1.3473466574776016, + "grad_norm": 7.568568706512451, + "learning_rate": 2.908015666191633e-08, + "logits/chosen": -2.5538055896759033, + "logits/rejected": -2.5238800048828125, + "logps/chosen": -105.6512680053711, + "logps/rejected": -123.7880630493164, + "loss": 0.6232, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.49285778403282166, + "rewards/margins": 0.19189631938934326, + "rewards/rejected": -0.6847540736198425, "step": 7820 }, { - "epoch": 1.35, - "grad_norm": 32.90176835421052, - "learning_rate": 1.4471843052143696e-07, - "logits/chosen": -1.3154162168502808, - "logits/rejected": -1.2652655839920044, - "logps/chosen": -231.88034057617188, - "logps/rejected": -358.99169921875, - "loss": 0.4525, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.821041464805603, - "rewards/margins": 1.234797716140747, - "rewards/rejected": -3.0558390617370605, + "epoch": 1.3490696071674706, + "grad_norm": 10.060046195983887, + "learning_rate": 2.894368610428739e-08, + "logits/chosen": -2.4695115089416504, + "logits/rejected": -2.458768367767334, + "logps/chosen": -105.97291564941406, + "logps/rejected": -125.56675720214844, + "loss": 0.6339, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5617285966873169, + "rewards/margins": 0.15965981781482697, + "rewards/rejected": -0.7213884592056274, "step": 7830 }, { - "epoch": 1.35, - "grad_norm": 28.22131039387213, - "learning_rate": 1.440370298026315e-07, - "logits/chosen": -1.2927907705307007, - "logits/rejected": -1.23340904712677, - "logps/chosen": -216.0460662841797, - "logps/rejected": -339.61273193359375, - "loss": 0.412, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.6249713897705078, - "rewards/margins": 1.2290581464767456, - "rewards/rejected": -2.8540291786193848, + "epoch": 1.3507925568573398, + "grad_norm": 8.617898941040039, + "learning_rate": 2.8807405960526297e-08, + "logits/chosen": -2.4485971927642822, + "logits/rejected": -2.427280902862549, + "logps/chosen": -108.01090240478516, + "logps/rejected": -126.32137298583984, + "loss": 0.6278, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5444341897964478, + "rewards/margins": 0.1764529049396515, + "rewards/rejected": -0.7208870649337769, "step": 7840 }, { - "epoch": 1.35, - "grad_norm": 36.753366417631874, - "learning_rate": 1.4335658731512451e-07, - "logits/chosen": -1.301358699798584, - "logits/rejected": -1.2169835567474365, - "logps/chosen": -216.15402221679688, - "logps/rejected": -345.2431945800781, - "loss": 0.3962, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.6059744358062744, - "rewards/margins": 1.3626439571380615, - "rewards/rejected": -2.968618392944336, + "epoch": 1.3525155065472088, + "grad_norm": 10.258645057678223, + "learning_rate": 2.8671317463024904e-08, + "logits/chosen": -2.4429070949554443, + "logits/rejected": -2.4116177558898926, + "logps/chosen": -110.6179428100586, + "logps/rejected": -117.56571197509766, + "loss": 0.6424, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5504086017608643, + "rewards/margins": 0.14148317277431488, + "rewards/rejected": -0.6918917894363403, "step": 7850 }, { - "epoch": 1.35, - "grad_norm": 25.514074332943455, - "learning_rate": 1.4267710921220973e-07, - "logits/chosen": -1.3115109205245972, - "logits/rejected": -1.2281205654144287, - "logps/chosen": -219.056396484375, - "logps/rejected": -366.49932861328125, - "loss": 0.3504, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.6618436574935913, - "rewards/margins": 1.4767673015594482, - "rewards/rejected": -3.138611078262329, + "epoch": 1.354238456237078, + "grad_norm": 8.191166877746582, + "learning_rate": 2.8535421842441948e-08, + "logits/chosen": -2.4752354621887207, + "logits/rejected": -2.4408156871795654, + "logps/chosen": -107.2601089477539, + "logps/rejected": -127.4552993774414, + "loss": 0.6149, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5436367392539978, + "rewards/margins": 0.2043880671262741, + "rewards/rejected": -0.7480248212814331, "step": 7860 }, { - "epoch": 1.36, - "grad_norm": 51.13665015949875, - "learning_rate": 1.4199860163846007e-07, - "logits/chosen": -1.3125016689300537, - "logits/rejected": -1.251068353652954, - "logps/chosen": -239.65835571289062, - "logps/rejected": -367.1151428222656, - "loss": 0.4608, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.8398945331573486, - "rewards/margins": 1.2877471446990967, - "rewards/rejected": -3.1276419162750244, + "epoch": 1.355961405926947, + "grad_norm": 9.708309173583984, + "learning_rate": 2.8399720327692013e-08, + "logits/chosen": -2.4959757328033447, + "logits/rejected": -2.4798731803894043, + "logps/chosen": -113.02349853515625, + "logps/rejected": -128.7800750732422, + "loss": 0.6297, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5734953880310059, + "rewards/margins": 0.1706721931695938, + "rewards/rejected": -0.7441675662994385, "step": 7870 }, { - "epoch": 1.36, - "grad_norm": 46.23192498635243, - "learning_rate": 1.4132107072967165e-07, - "logits/chosen": -1.3768285512924194, - "logits/rejected": -1.3229854106903076, - "logps/chosen": -240.4175567626953, - "logps/rejected": -354.0284118652344, - "loss": 0.4709, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.8730905055999756, - "rewards/margins": 1.1423285007476807, - "rewards/rejected": -3.0154192447662354, + "epoch": 1.3576843556168159, + "grad_norm": 11.513443946838379, + "learning_rate": 2.826421414593433e-08, + "logits/chosen": -2.5580294132232666, + "logits/rejected": -2.5429983139038086, + "logps/chosen": -108.66941833496094, + "logps/rejected": -122.9479751586914, + "loss": 0.6411, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5555582642555237, + "rewards/margins": 0.1487930566072464, + "rewards/rejected": -0.7043513059616089, "step": 7880 }, { - "epoch": 1.36, - "grad_norm": 32.40033693942687, - "learning_rate": 1.406445226128088e-07, - "logits/chosen": -1.340899109840393, - "logits/rejected": -1.2813217639923096, - "logps/chosen": -226.41830444335938, - "logps/rejected": -349.4949645996094, - "loss": 0.4501, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.720557451248169, - "rewards/margins": 1.221872329711914, - "rewards/rejected": -2.942429542541504, + "epoch": 1.359407305306685, + "grad_norm": 10.203879356384277, + "learning_rate": 2.812890452256176e-08, + "logits/chosen": -2.5020322799682617, + "logits/rejected": -2.482840061187744, + "logps/chosen": -108.50390625, + "logps/rejected": -126.86137390136719, + "loss": 0.6284, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5412786602973938, + "rewards/margins": 0.1748257875442505, + "rewards/rejected": -0.7161044478416443, "step": 7890 }, { - "epoch": 1.36, - "grad_norm": 38.482928631188805, - "learning_rate": 1.399689634059479e-07, - "logits/chosen": -1.3165191411972046, - "logits/rejected": -1.2694923877716064, - "logps/chosen": -227.16726684570312, - "logps/rejected": -357.72381591796875, - "loss": 0.418, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.7499094009399414, - "rewards/margins": 1.291032075881958, - "rewards/rejected": -3.0409417152404785, + "epoch": 1.361130254996554, + "grad_norm": 10.851054191589355, + "learning_rate": 2.7993792681189583e-08, + "logits/chosen": -2.4921271800994873, + "logits/rejected": -2.488858699798584, + "logps/chosen": -109.33931732177734, + "logps/rejected": -126.16691589355469, + "loss": 0.641, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.5716311931610107, + "rewards/margins": 0.15354785323143005, + "rewards/rejected": -0.7251790761947632, "step": 7900 }, { - "epoch": 1.36, - "eval_logits/chosen": -1.4474836587905884, - "eval_logits/rejected": -1.420722484588623, - "eval_logps/chosen": -235.39988708496094, - "eval_logps/rejected": -283.75970458984375, - "eval_loss": 0.6320576071739197, - "eval_rewards/accuracies": 0.6577602028846741, - "eval_rewards/chosen": -1.7669605016708374, - "eval_rewards/margins": 0.4390629529953003, - "eval_rewards/rejected": -2.206023693084717, - "eval_runtime": 357.7699, - "eval_samples_per_second": 12.03, - "eval_steps_per_second": 1.504, + "epoch": 1.361130254996554, + "eval_logits/chosen": -2.6142539978027344, + "eval_logits/rejected": -2.6076812744140625, + "eval_logps/chosen": -104.2408676147461, + "eval_logps/rejected": -117.97350311279297, + "eval_loss": 0.6580982804298401, + "eval_rewards/accuracies": 0.6217471957206726, + "eval_rewards/chosen": -0.45528969168663025, + "eval_rewards/margins": 0.0926441103219986, + "eval_rewards/rejected": -0.54793381690979, + "eval_runtime": 359.2482, + "eval_samples_per_second": 11.981, + "eval_steps_per_second": 1.498, "step": 7900 }, { - "epoch": 1.36, - "grad_norm": 43.10989046415876, - "learning_rate": 1.3929439921822334e-07, - "logits/chosen": -1.3463201522827148, - "logits/rejected": -1.282036542892456, - "logps/chosen": -232.250732421875, - "logps/rejected": -347.2474060058594, - "loss": 0.4703, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.7544885873794556, - "rewards/margins": 1.1816383600234985, - "rewards/rejected": -2.936127185821533, + "epoch": 1.3628532046864232, + "grad_norm": 10.77269172668457, + "learning_rate": 2.7858879843644666e-08, + "logits/chosen": -2.495147705078125, + "logits/rejected": -2.472882032394409, + "logps/chosen": -114.080078125, + "logps/rejected": -127.13459777832031, + "loss": 0.637, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.572566568851471, + "rewards/margins": 0.16227777302265167, + "rewards/rejected": -0.7348443269729614, "step": 7910 }, { - "epoch": 1.36, - "grad_norm": 22.98150967323039, - "learning_rate": 1.3862083614977067e-07, - "logits/chosen": -1.3695622682571411, - "logits/rejected": -1.3146297931671143, - "logps/chosen": -206.7174072265625, - "logps/rejected": -311.8243103027344, - "loss": 0.4652, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.519517421722412, - "rewards/margins": 1.0840694904327393, - "rewards/rejected": -2.6035869121551514, + "epoch": 1.3645761543762922, + "grad_norm": 9.462078094482422, + "learning_rate": 2.7724167229954133e-08, + "logits/chosen": -2.4954214096069336, + "logits/rejected": -2.4742679595947266, + "logps/chosen": -109.8337631225586, + "logps/rejected": -122.7774658203125, + "loss": 0.6309, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5501305460929871, + "rewards/margins": 0.16303303837776184, + "rewards/rejected": -0.7131635546684265, "step": 7920 }, { - "epoch": 1.37, - "grad_norm": 26.732976356570454, - "learning_rate": 1.3794828029167267e-07, - "logits/chosen": -1.4295904636383057, - "logits/rejected": -1.3580360412597656, - "logps/chosen": -213.1838836669922, - "logps/rejected": -333.4698181152344, - "loss": 0.4051, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.5487329959869385, - "rewards/margins": 1.2622630596160889, - "rewards/rejected": -2.8109960556030273, + "epoch": 1.3662991040661612, + "grad_norm": 12.405163764953613, + "learning_rate": 2.758965605833453e-08, + "logits/chosen": -2.545825481414795, + "logits/rejected": -2.5186660289764404, + "logps/chosen": -116.6957778930664, + "logps/rejected": -126.533203125, + "loss": 0.6346, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5836119651794434, + "rewards/margins": 0.1577313393354416, + "rewards/rejected": -0.7413433194160461, "step": 7930 }, { - "epoch": 1.37, - "grad_norm": 39.35835071533213, - "learning_rate": 1.3727673772590376e-07, - "logits/chosen": -1.3716719150543213, - "logits/rejected": -1.3115837574005127, - "logps/chosen": -209.64309692382812, - "logps/rejected": -334.11761474609375, - "loss": 0.4165, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.5110173225402832, - "rewards/margins": 1.2777800559997559, - "rewards/rejected": -2.788797378540039, + "epoch": 1.3680220537560304, + "grad_norm": 11.84246826171875, + "learning_rate": 2.745534754518075e-08, + "logits/chosen": -2.450265407562256, + "logits/rejected": -2.4336650371551514, + "logps/chosen": -114.76383209228516, + "logps/rejected": -131.10342407226562, + "loss": 0.6262, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5620352029800415, + "rewards/margins": 0.1967960149049759, + "rewards/rejected": -0.7588313221931458, "step": 7940 }, { - "epoch": 1.37, - "grad_norm": 23.363749701913548, - "learning_rate": 1.3660621452527505e-07, - "logits/chosen": -1.308699369430542, - "logits/rejected": -1.2606542110443115, - "logps/chosen": -190.32920837402344, - "logps/rejected": -315.5460205078125, - "loss": 0.4323, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.3881752490997314, - "rewards/margins": 1.234622597694397, - "rewards/rejected": -2.6227974891662598, + "epoch": 1.3697450034458993, + "grad_norm": 10.144691467285156, + "learning_rate": 2.732124290505501e-08, + "logits/chosen": -2.4120466709136963, + "logits/rejected": -2.4035420417785645, + "logps/chosen": -106.06107330322266, + "logps/rejected": -125.48957824707031, + "loss": 0.6304, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5455502271652222, + "rewards/margins": 0.17666089534759521, + "rewards/rejected": -0.7222111821174622, "step": 7950 }, { - "epoch": 1.37, - "grad_norm": 30.69306170978758, - "learning_rate": 1.3593671675337954e-07, - "logits/chosen": -1.335451364517212, - "logits/rejected": -1.2736941576004028, - "logps/chosen": -200.57716369628906, - "logps/rejected": -317.3564453125, - "loss": 0.4216, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.5011732578277588, - "rewards/margins": 1.1477513313293457, - "rewards/rejected": -2.6489245891571045, + "epoch": 1.3714679531357685, + "grad_norm": 9.811049461364746, + "learning_rate": 2.7187343350675906e-08, + "logits/chosen": -2.429908514022827, + "logits/rejected": -2.4045462608337402, + "logps/chosen": -106.56965637207031, + "logps/rejected": -118.9483871459961, + "loss": 0.6675, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5609061121940613, + "rewards/margins": 0.10372792184352875, + "rewards/rejected": -0.6646339297294617, "step": 7960 }, { - "epoch": 1.37, - "grad_norm": 30.826633710533333, - "learning_rate": 1.3526825046453706e-07, - "logits/chosen": -1.3753823041915894, - "logits/rejected": -1.3085488080978394, - "logps/chosen": -217.94619750976562, - "logps/rejected": -333.4029846191406, - "loss": 0.4507, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.6257708072662354, - "rewards/margins": 1.1876569986343384, - "rewards/rejected": -2.8134281635284424, + "epoch": 1.3731909028256375, + "grad_norm": 8.188979148864746, + "learning_rate": 2.705365009290741e-08, + "logits/chosen": -2.5196266174316406, + "logits/rejected": -2.4908502101898193, + "logps/chosen": -115.40313720703125, + "logps/rejected": -124.7549057006836, + "loss": 0.6544, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.6000576615333557, + "rewards/margins": 0.1267736852169037, + "rewards/rejected": -0.7268313765525818, "step": 7970 }, { - "epoch": 1.37, - "grad_norm": 32.72955968672792, - "learning_rate": 1.3460082170373987e-07, - "logits/chosen": -1.398342490196228, - "logits/rejected": -1.3425318002700806, - "logps/chosen": -230.95849609375, - "logps/rejected": -356.80609130859375, - "loss": 0.4128, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.740744948387146, - "rewards/margins": 1.2664308547973633, - "rewards/rejected": -3.0071756839752197, + "epoch": 1.3749138525155065, + "grad_norm": 12.60647201538086, + "learning_rate": 2.6920164340747976e-08, + "logits/chosen": -2.567558765411377, + "logits/rejected": -2.553401470184326, + "logps/chosen": -111.7811508178711, + "logps/rejected": -127.6240234375, + "loss": 0.6324, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5490156412124634, + "rewards/margins": 0.16616667807102203, + "rewards/rejected": -0.715182363986969, "step": 7980 }, { - "epoch": 1.38, - "grad_norm": 37.50249892999267, - "learning_rate": 1.339344365065973e-07, - "logits/chosen": -1.3826172351837158, - "logits/rejected": -1.3275426626205444, - "logps/chosen": -234.38742065429688, - "logps/rejected": -364.55828857421875, - "loss": 0.4381, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.8373771905899048, - "rewards/margins": 1.2917721271514893, - "rewards/rejected": -3.1291489601135254, + "epoch": 1.3766368022053757, + "grad_norm": 9.503989219665527, + "learning_rate": 2.678688730131946e-08, + "logits/chosen": -2.581275701522827, + "logits/rejected": -2.568692445755005, + "logps/chosen": -105.19342041015625, + "logps/rejected": -123.72818756103516, + "loss": 0.6307, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5452620387077332, + "rewards/margins": 0.17543041706085205, + "rewards/rejected": -0.7206924557685852, "step": 7990 }, { - "epoch": 1.38, - "grad_norm": 35.27460341174213, - "learning_rate": 1.3326910089928246e-07, - "logits/chosen": -1.2450647354125977, - "logits/rejected": -1.1912448406219482, - "logps/chosen": -227.9076385498047, - "logps/rejected": -362.02349853515625, - "loss": 0.4084, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.7342026233673096, - "rewards/margins": 1.3236587047576904, - "rewards/rejected": -3.057861804962158, + "epoch": 1.3783597518952446, + "grad_norm": 10.40451717376709, + "learning_rate": 2.665382017985649e-08, + "logits/chosen": -2.4500977993011475, + "logits/rejected": -2.439554214477539, + "logps/chosen": -107.6703109741211, + "logps/rejected": -127.6657485961914, + "loss": 0.6228, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5315279364585876, + "rewards/margins": 0.18270538747310638, + "rewards/rejected": -0.7142332792282104, "step": 8000 }, { - "epoch": 1.38, - "eval_logits/chosen": -1.4088190793991089, - "eval_logits/rejected": -1.381564974784851, - "eval_logps/chosen": -247.23068237304688, - "eval_logps/rejected": -297.6674499511719, - "eval_loss": 0.631807804107666, - "eval_rewards/accuracies": 0.6638011336326599, - "eval_rewards/chosen": -1.8852684497833252, - "eval_rewards/margins": 0.45983266830444336, - "eval_rewards/rejected": -2.3451011180877686, - "eval_runtime": 357.6561, - "eval_samples_per_second": 12.034, - "eval_steps_per_second": 1.504, + "epoch": 1.3783597518952446, + "eval_logits/chosen": -2.610853910446167, + "eval_logits/rejected": -2.604253053665161, + "eval_logps/chosen": -104.54552459716797, + "eval_logps/rejected": -118.3795166015625, + "eval_loss": 0.6577586531639099, + "eval_rewards/accuracies": 0.6215148568153381, + "eval_rewards/chosen": -0.4583362340927124, + "eval_rewards/margins": 0.0936574935913086, + "eval_rewards/rejected": -0.5519937872886658, + "eval_runtime": 359.5438, + "eval_samples_per_second": 11.971, + "eval_steps_per_second": 1.496, "step": 8000 }, { - "epoch": 1.38, - "grad_norm": 33.54268692406238, - "learning_rate": 1.3260482089847603e-07, - "logits/chosen": -1.2896820306777954, - "logits/rejected": -1.2158801555633545, - "logps/chosen": -234.7473907470703, - "logps/rejected": -369.5241394042969, - "loss": 0.4207, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.7761294841766357, - "rewards/margins": 1.3950544595718384, - "rewards/rejected": -3.1711838245391846, + "epoch": 1.3800827015851138, + "grad_norm": 9.779748916625977, + "learning_rate": 2.6520964179695206e-08, + "logits/chosen": -2.483043909072876, + "logits/rejected": -2.461374521255493, + "logps/chosen": -112.08184814453125, + "logps/rejected": -120.54051208496094, + "loss": 0.6427, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5492201447486877, + "rewards/margins": 0.1319514662027359, + "rewards/rejected": -0.6811715960502625, "step": 8010 }, { - "epoch": 1.38, - "grad_norm": 28.405016653483855, - "learning_rate": 1.3194160251131365e-07, - "logits/chosen": -1.3419923782348633, - "logits/rejected": -1.257868766784668, - "logps/chosen": -242.85787963867188, - "logps/rejected": -379.0827331542969, - "loss": 0.4081, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.826235055923462, - "rewards/margins": 1.417443037033081, - "rewards/rejected": -3.243677854537964, + "epoch": 1.3818056512749828, + "grad_norm": 10.46135139465332, + "learning_rate": 2.638832050226273e-08, + "logits/chosen": -2.4984774589538574, + "logits/rejected": -2.4609427452087402, + "logps/chosen": -120.8825454711914, + "logps/rejected": -130.7781219482422, + "loss": 0.6356, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6064472198486328, + "rewards/margins": 0.153905987739563, + "rewards/rejected": -0.7603532075881958, "step": 8020 }, { - "epoch": 1.38, - "grad_norm": 39.15936163505855, - "learning_rate": 1.3127945173532988e-07, - "logits/chosen": -1.3448692560195923, - "logits/rejected": -1.284053087234497, - "logps/chosen": -212.48580932617188, - "logps/rejected": -355.14935302734375, - "loss": 0.4275, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.6137079000473022, - "rewards/margins": 1.4035985469818115, - "rewards/rejected": -3.017306327819824, + "epoch": 1.3835286009648518, + "grad_norm": 9.138134002685547, + "learning_rate": 2.6255890347065978e-08, + "logits/chosen": -2.489896297454834, + "logits/rejected": -2.475051164627075, + "logps/chosen": -104.41996002197266, + "logps/rejected": -124.8290786743164, + "loss": 0.6254, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5327662229537964, + "rewards/margins": 0.18135295808315277, + "rewards/rejected": -0.7141191959381104, "step": 8030 }, { - "epoch": 1.39, - "grad_norm": 33.82521625803319, - "learning_rate": 1.3061837455840538e-07, - "logits/chosen": -1.3016248941421509, - "logits/rejected": -1.2252373695373535, - "logps/chosen": -226.1641082763672, - "logps/rejected": -372.94171142578125, - "loss": 0.3642, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.7177082300186157, - "rewards/margins": 1.4918298721313477, - "rewards/rejected": -3.209538221359253, + "epoch": 1.385251550654721, + "grad_norm": 9.862136840820312, + "learning_rate": 2.6123674911681077e-08, + "logits/chosen": -2.454495668411255, + "logits/rejected": -2.4251887798309326, + "logps/chosen": -113.9755630493164, + "logps/rejected": -127.5917739868164, + "loss": 0.6385, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5954891443252563, + "rewards/margins": 0.16014710068702698, + "rewards/rejected": -0.7556362748146057, "step": 8040 }, { - "epoch": 1.39, - "grad_norm": 23.926808236247748, - "learning_rate": 1.2995837695871188e-07, - "logits/chosen": -1.3715155124664307, - "logits/rejected": -1.3059993982315063, - "logps/chosen": -211.1027374267578, - "logps/rejected": -356.53533935546875, - "loss": 0.4153, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.5601820945739746, - "rewards/margins": 1.4470088481903076, - "rewards/rejected": -3.0071911811828613, + "epoch": 1.38697450034459, + "grad_norm": 8.39928913116455, + "learning_rate": 2.5991675391742373e-08, + "logits/chosen": -2.5200531482696533, + "logits/rejected": -2.5020241737365723, + "logps/chosen": -109.97434997558594, + "logps/rejected": -128.35328674316406, + "loss": 0.629, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5486682653427124, + "rewards/margins": 0.1764025092124939, + "rewards/rejected": -0.7250707745552063, "step": 8050 }, { - "epoch": 1.39, - "grad_norm": 31.557939346164492, - "learning_rate": 1.2929946490465855e-07, - "logits/chosen": -1.4260159730911255, - "logits/rejected": -1.3587584495544434, - "logps/chosen": -217.4468536376953, - "logps/rejected": -329.12548828125, - "loss": 0.4882, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.6211131811141968, - "rewards/margins": 1.1693998575210571, - "rewards/rejected": -2.7905125617980957, + "epoch": 1.388697450034459, + "grad_norm": 9.907453536987305, + "learning_rate": 2.5859892980931707e-08, + "logits/chosen": -2.5607526302337646, + "logits/rejected": -2.5263822078704834, + "logps/chosen": -115.3295669555664, + "logps/rejected": -122.81480407714844, + "loss": 0.6529, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5999096632003784, + "rewards/margins": 0.12736575305461884, + "rewards/rejected": -0.7272753119468689, "step": 8060 }, { - "epoch": 1.39, - "grad_norm": 40.190393995057335, - "learning_rate": 1.2864164435483777e-07, - "logits/chosen": -1.354252815246582, - "logits/rejected": -1.2852472066879272, - "logps/chosen": -220.60400390625, - "logps/rejected": -329.6341857910156, - "loss": 0.4492, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.6716521978378296, - "rewards/margins": 1.1254643201828003, - "rewards/rejected": -2.79711651802063, + "epoch": 1.390420399724328, + "grad_norm": 8.719491004943848, + "learning_rate": 2.5728328870967553e-08, + "logits/chosen": -2.472240924835205, + "logits/rejected": -2.443647623062134, + "logps/chosen": -111.67752838134766, + "logps/rejected": -123.70146179199219, + "loss": 0.6365, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5821923017501831, + "rewards/margins": 0.15517497062683105, + "rewards/rejected": -0.7373672723770142, "step": 8070 }, { - "epoch": 1.39, - "grad_norm": 32.42169318224056, - "learning_rate": 1.2798492125797145e-07, - "logits/chosen": -1.3571466207504272, - "logits/rejected": -1.3098504543304443, - "logps/chosen": -203.21746826171875, - "logps/rejected": -327.30889892578125, - "loss": 0.4336, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.5024254322052002, - "rewards/margins": 1.1913418769836426, - "rewards/rejected": -2.6937670707702637, + "epoch": 1.392143349414197, + "grad_norm": 9.55966567993164, + "learning_rate": 2.5596984251594288e-08, + "logits/chosen": -2.458172559738159, + "logits/rejected": -2.45215106010437, + "logps/chosen": -105.4540786743164, + "logps/rejected": -128.3647003173828, + "loss": 0.6257, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5246763229370117, + "rewards/margins": 0.179347425699234, + "rewards/rejected": -0.7040236592292786, "step": 8080 }, { - "epoch": 1.39, - "grad_norm": 23.167056574146898, - "learning_rate": 1.273293015528571e-07, - "logits/chosen": -1.3101780414581299, - "logits/rejected": -1.2438642978668213, - "logps/chosen": -202.09756469726562, - "logps/rejected": -327.48089599609375, - "loss": 0.4286, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.490017056465149, - "rewards/margins": 1.2703895568847656, - "rewards/rejected": -2.760406732559204, + "epoch": 1.3938662991040662, + "grad_norm": 8.97030258178711, + "learning_rate": 2.546586031057142e-08, + "logits/chosen": -2.441591262817383, + "logits/rejected": -2.415339946746826, + "logps/chosen": -109.4200439453125, + "logps/rejected": -126.6836166381836, + "loss": 0.6242, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5630167722702026, + "rewards/margins": 0.18928496539592743, + "rewards/rejected": -0.752301812171936, "step": 8090 }, { - "epoch": 1.4, - "grad_norm": 41.21644559684058, - "learning_rate": 1.2667479116831436e-07, - "logits/chosen": -1.3472046852111816, - "logits/rejected": -1.3043591976165771, - "logps/chosen": -227.7296905517578, - "logps/rejected": -334.98541259765625, - "loss": 0.4616, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.736731767654419, - "rewards/margins": 1.0323158502578735, - "rewards/rejected": -2.769047498703003, + "epoch": 1.3955892487939352, + "grad_norm": 9.099763870239258, + "learning_rate": 2.5334958233662874e-08, + "logits/chosen": -2.4939823150634766, + "logits/rejected": -2.486513137817383, + "logps/chosen": -112.15140533447266, + "logps/rejected": -130.70358276367188, + "loss": 0.641, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5806952714920044, + "rewards/margins": 0.1454145610332489, + "rewards/rejected": -0.7261097431182861, "step": 8100 }, { - "epoch": 1.4, - "eval_logits/chosen": -1.4580535888671875, - "eval_logits/rejected": -1.4319345951080322, - "eval_logps/chosen": -226.49224853515625, - "eval_logps/rejected": -272.92999267578125, - "eval_loss": 0.6337063908576965, - "eval_rewards/accuracies": 0.6563661694526672, - "eval_rewards/chosen": -1.6778842210769653, - "eval_rewards/margins": 0.41984203457832336, - "eval_rewards/rejected": -2.097726345062256, - "eval_runtime": 356.5742, - "eval_samples_per_second": 12.07, - "eval_steps_per_second": 1.509, + "epoch": 1.3955892487939352, + "eval_logits/chosen": -2.6063179969787598, + "eval_logits/rejected": -2.599682092666626, + "eval_logps/chosen": -105.29100799560547, + "eval_logps/rejected": -119.1444320678711, + "eval_loss": 0.6578975915908813, + "eval_rewards/accuracies": 0.6177973747253418, + "eval_rewards/chosen": -0.46579110622406006, + "eval_rewards/margins": 0.09385194629430771, + "eval_rewards/rejected": -0.559643030166626, + "eval_runtime": 359.7237, + "eval_samples_per_second": 11.965, + "eval_steps_per_second": 1.496, "step": 8100 }, { - "epoch": 1.4, - "grad_norm": 43.64122789141382, - "learning_rate": 1.2602139602313066e-07, - "logits/chosen": -1.3520846366882324, - "logits/rejected": -1.2789603471755981, - "logps/chosen": -220.7334442138672, - "logps/rejected": -337.8800354003906, - "loss": 0.4345, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.6468912363052368, - "rewards/margins": 1.221639633178711, - "rewards/rejected": -2.8685309886932373, + "epoch": 1.3973121984838044, + "grad_norm": 10.305115699768066, + "learning_rate": 2.5204279204626135e-08, + "logits/chosen": -2.5142409801483154, + "logits/rejected": -2.4806110858917236, + "logps/chosen": -113.26153564453125, + "logps/rejected": -122.00248718261719, + "loss": 0.6423, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5719997882843018, + "rewards/margins": 0.13758215308189392, + "rewards/rejected": -0.7095819711685181, "step": 8110 }, { - "epoch": 1.4, - "grad_norm": 46.7301047333346, - "learning_rate": 1.2536912202600908e-07, - "logits/chosen": -1.3374398946762085, - "logits/rejected": -1.2761024236679077, - "logps/chosen": -215.7982940673828, - "logps/rejected": -333.050048828125, - "loss": 0.4249, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.617179274559021, - "rewards/margins": 1.1847528219223022, - "rewards/rejected": -2.8019323348999023, + "epoch": 1.3990351481736734, + "grad_norm": 11.399801254272461, + "learning_rate": 2.5073824405201815e-08, + "logits/chosen": -2.4770634174346924, + "logits/rejected": -2.4584784507751465, + "logps/chosen": -108.218994140625, + "logps/rejected": -122.0374984741211, + "loss": 0.6382, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5411659479141235, + "rewards/margins": 0.15037959814071655, + "rewards/rejected": -0.6915455460548401, "step": 8120 }, { - "epoch": 1.4, - "grad_norm": 42.0819810676, - "learning_rate": 1.2471797507551323e-07, - "logits/chosen": -1.3652501106262207, - "logits/rejected": -1.310935139656067, - "logps/chosen": -208.43954467773438, - "logps/rejected": -316.9969787597656, - "loss": 0.4354, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.536388635635376, - "rewards/margins": 1.1277352571487427, - "rewards/rejected": -2.664124011993408, + "epoch": 1.4007580978635423, + "grad_norm": 9.684117317199707, + "learning_rate": 2.494359501510265e-08, + "logits/chosen": -2.4873130321502686, + "logits/rejected": -2.4719839096069336, + "logps/chosen": -108.14006042480469, + "logps/rejected": -119.9603042602539, + "loss": 0.6353, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5331994891166687, + "rewards/margins": 0.16038167476654053, + "rewards/rejected": -0.6935811638832092, "step": 8130 }, { - "epoch": 1.4, - "grad_norm": 32.76814476973973, - "learning_rate": 1.2406796106001526e-07, - "logits/chosen": -1.3102951049804688, - "logits/rejected": -1.2481873035430908, - "logps/chosen": -213.14266967773438, - "logps/rejected": -348.0755615234375, - "loss": 0.4184, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.6040756702423096, - "rewards/margins": 1.3488930463790894, - "rewards/rejected": -2.9529685974121094, + "epoch": 1.4024810475534115, + "grad_norm": 10.679859161376953, + "learning_rate": 2.4813592212003055e-08, + "logits/chosen": -2.4541687965393066, + "logits/rejected": -2.4306445121765137, + "logps/chosen": -111.33210754394531, + "logps/rejected": -127.0435791015625, + "loss": 0.6391, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5860517621040344, + "rewards/margins": 0.15630805492401123, + "rewards/rejected": -0.7423598170280457, "step": 8140 }, { - "epoch": 1.4, - "grad_norm": 20.86333145951887, - "learning_rate": 1.2341908585764197e-07, - "logits/chosen": -1.3613207340240479, - "logits/rejected": -1.2992537021636963, - "logps/chosen": -226.6147918701172, - "logps/rejected": -361.0736083984375, - "loss": 0.4225, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.7447595596313477, - "rewards/margins": 1.3353623151779175, - "rewards/rejected": -3.0801219940185547, + "epoch": 1.4042039972432805, + "grad_norm": 10.034843444824219, + "learning_rate": 2.4683817171528393e-08, + "logits/chosen": -2.5279853343963623, + "logits/rejected": -2.508397340774536, + "logps/chosen": -109.01942443847656, + "logps/rejected": -127.464111328125, + "loss": 0.6338, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5687164068222046, + "rewards/margins": 0.17518633604049683, + "rewards/rejected": -0.7439028024673462, "step": 8150 }, { - "epoch": 1.41, - "grad_norm": 33.69365894701024, - "learning_rate": 1.2277135533622173e-07, - "logits/chosen": -1.3138097524642944, - "logits/rejected": -1.241779088973999, - "logps/chosen": -218.9955291748047, - "logps/rejected": -367.7317810058594, - "loss": 0.3696, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.6299861669540405, - "rewards/margins": 1.524584174156189, - "rewards/rejected": -3.1545703411102295, + "epoch": 1.4059269469331497, + "grad_norm": 8.829487800598145, + "learning_rate": 2.4554271067244347e-08, + "logits/chosen": -2.4655117988586426, + "logits/rejected": -2.4463486671447754, + "logps/chosen": -113.00874328613281, + "logps/rejected": -130.19166564941406, + "loss": 0.6101, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5699566006660461, + "rewards/margins": 0.2090790569782257, + "rewards/rejected": -0.7790356874465942, "step": 8160 }, { - "epoch": 1.41, - "grad_norm": 41.9578809865992, - "learning_rate": 1.2212477535323158e-07, - "logits/chosen": -1.3314152956008911, - "logits/rejected": -1.261887550354004, - "logps/chosen": -240.8853759765625, - "logps/rejected": -356.1999816894531, - "loss": 0.4335, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.8119796514511108, - "rewards/margins": 1.2307933568954468, - "rewards/rejected": -3.0427732467651367, + "epoch": 1.4076498966230186, + "grad_norm": 9.293244361877441, + "learning_rate": 2.4424955070646314e-08, + "logits/chosen": -2.533050537109375, + "logits/rejected": -2.508591890335083, + "logps/chosen": -114.20674896240234, + "logps/rejected": -123.04630279541016, + "loss": 0.6294, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5449591875076294, + "rewards/margins": 0.1659817099571228, + "rewards/rejected": -0.7109408378601074, "step": 8170 }, { - "epoch": 1.41, - "grad_norm": 37.15151945333557, - "learning_rate": 1.2147935175574403e-07, - "logits/chosen": -1.336161732673645, - "logits/rejected": -1.2775371074676514, - "logps/chosen": -241.3358917236328, - "logps/rejected": -364.7397766113281, - "loss": 0.4214, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.850393295288086, - "rewards/margins": 1.2469345331192017, - "rewards/rejected": -3.0973281860351562, + "epoch": 1.4093728463128876, + "grad_norm": 9.863910675048828, + "learning_rate": 2.4295870351148807e-08, + "logits/chosen": -2.516082763671875, + "logits/rejected": -2.4976694583892822, + "logps/chosen": -117.4303970336914, + "logps/rejected": -127.5738754272461, + "loss": 0.6541, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6113814115524292, + "rewards/margins": 0.11424535512924194, + "rewards/rejected": -0.7256268262863159, "step": 8180 }, { - "epoch": 1.41, - "grad_norm": 35.78807424327242, - "learning_rate": 1.208350903803745e-07, - "logits/chosen": -1.290093183517456, - "logits/rejected": -1.2244975566864014, - "logps/chosen": -240.7860870361328, - "logps/rejected": -369.93792724609375, - "loss": 0.4527, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.8283048868179321, - "rewards/margins": 1.3327312469482422, - "rewards/rejected": -3.1610360145568848, + "epoch": 1.4110957960027566, + "grad_norm": 10.611021041870117, + "learning_rate": 2.41670180760749e-08, + "logits/chosen": -2.48005747795105, + "logits/rejected": -2.4579241275787354, + "logps/chosen": -118.31016540527344, + "logps/rejected": -127.42778015136719, + "loss": 0.6571, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6033154726028442, + "rewards/margins": 0.13233919441699982, + "rewards/rejected": -0.7356546521186829, "step": 8190 }, { - "epoch": 1.41, - "grad_norm": 36.986247487672124, - "learning_rate": 1.2019199705322793e-07, - "logits/chosen": -1.3099769353866577, - "logits/rejected": -1.244363784790039, - "logps/chosen": -241.85092163085938, - "logps/rejected": -364.93157958984375, - "loss": 0.4033, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.868650197982788, - "rewards/margins": 1.2554208040237427, - "rewards/rejected": -3.1240711212158203, + "epoch": 1.4128187456926258, + "grad_norm": 11.68910026550293, + "learning_rate": 2.4038399410645588e-08, + "logits/chosen": -2.50583553314209, + "logits/rejected": -2.475226640701294, + "logps/chosen": -116.03187561035156, + "logps/rejected": -126.24421691894531, + "loss": 0.6504, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6103672981262207, + "rewards/margins": 0.12656204402446747, + "rewards/rejected": -0.7369293570518494, "step": 8200 }, { - "epoch": 1.41, - "eval_logits/chosen": -1.4116390943527222, - "eval_logits/rejected": -1.3845247030258179, - "eval_logps/chosen": -245.81504821777344, - "eval_logps/rejected": -296.2736511230469, - "eval_loss": 0.6331018805503845, - "eval_rewards/accuracies": 0.6638011336326599, - "eval_rewards/chosen": -1.8711119890213013, - "eval_rewards/margins": 0.460050493478775, - "eval_rewards/rejected": -2.331162452697754, - "eval_runtime": 357.1978, - "eval_samples_per_second": 12.049, - "eval_steps_per_second": 1.506, + "epoch": 1.4128187456926258, + "eval_logits/chosen": -2.6040306091308594, + "eval_logits/rejected": -2.597447633743286, + "eval_logps/chosen": -105.78633880615234, + "eval_logps/rejected": -119.84130859375, + "eval_loss": 0.6571367979049683, + "eval_rewards/accuracies": 0.6212825179100037, + "eval_rewards/chosen": -0.47074440121650696, + "eval_rewards/margins": 0.09586748480796814, + "eval_rewards/rejected": -0.5666118860244751, + "eval_runtime": 359.4352, + "eval_samples_per_second": 11.974, + "eval_steps_per_second": 1.497, "step": 8200 }, { - "epoch": 1.41, - "grad_norm": 29.593005359208316, - "learning_rate": 1.1955007758984717e-07, - "logits/chosen": -1.2003768682479858, - "logits/rejected": -1.1418938636779785, - "logps/chosen": -232.4004669189453, - "logps/rejected": -356.89031982421875, - "loss": 0.4207, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.7730633020401, - "rewards/margins": 1.2342889308929443, - "rewards/rejected": -3.007352113723755, + "epoch": 1.414541695382495, + "grad_norm": 11.93120288848877, + "learning_rate": 2.3910015517969434e-08, + "logits/chosen": -2.3909761905670166, + "logits/rejected": -2.373305082321167, + "logps/chosen": -114.6285629272461, + "logps/rejected": -131.95509338378906, + "loss": 0.6334, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5952364802360535, + "rewards/margins": 0.16276808083057404, + "rewards/rejected": -0.7580045461654663, "step": 8210 }, { - "epoch": 1.42, - "grad_norm": 28.630103927734417, - "learning_rate": 1.1890933779515897e-07, - "logits/chosen": -1.2915620803833008, - "logits/rejected": -1.2165160179138184, - "logps/chosen": -236.4958953857422, - "logps/rejected": -377.47528076171875, - "loss": 0.3975, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.8341341018676758, - "rewards/margins": 1.4102541208267212, - "rewards/rejected": -3.2443878650665283, + "epoch": 1.416264645072364, + "grad_norm": 10.773359298706055, + "learning_rate": 2.3781867559031794e-08, + "logits/chosen": -2.5130276679992676, + "logits/rejected": -2.4877219200134277, + "logps/chosen": -110.43150329589844, + "logps/rejected": -126.22157287597656, + "loss": 0.6344, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5733839869499207, + "rewards/margins": 0.15835942327976227, + "rewards/rejected": -0.7317434549331665, "step": 8220 }, { - "epoch": 1.42, - "grad_norm": 19.561231715377982, - "learning_rate": 1.1826978346342301e-07, - "logits/chosen": -1.2921059131622314, - "logits/rejected": -1.229309320449829, - "logps/chosen": -233.7902069091797, - "logps/rejected": -382.6428527832031, - "loss": 0.3569, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -1.7854340076446533, - "rewards/margins": 1.527852177619934, - "rewards/rejected": -3.313286304473877, + "epoch": 1.417987594762233, + "grad_norm": 9.052794456481934, + "learning_rate": 2.3653956692684602e-08, + "logits/chosen": -2.477832794189453, + "logits/rejected": -2.462078332901001, + "logps/chosen": -115.74124908447266, + "logps/rejected": -124.70304870605469, + "loss": 0.6514, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.6049267053604126, + "rewards/margins": 0.1287112683057785, + "rewards/rejected": -0.7336378693580627, "step": 8230 }, { - "epoch": 1.42, - "grad_norm": 41.33964598646989, - "learning_rate": 1.1763142037817805e-07, - "logits/chosen": -1.3490978479385376, - "logits/rejected": -1.2747749090194702, - "logps/chosen": -261.16094970703125, - "logps/rejected": -395.5751953125, - "loss": 0.3843, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -2.049976348876953, - "rewards/margins": 1.3874568939208984, - "rewards/rejected": -3.4374337196350098, + "epoch": 1.4197105444521019, + "grad_norm": 11.35895824432373, + "learning_rate": 2.352628407563561e-08, + "logits/chosen": -2.576815128326416, + "logits/rejected": -2.551685094833374, + "logps/chosen": -116.01173400878906, + "logps/rejected": -126.32490539550781, + "loss": 0.6397, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5983807444572449, + "rewards/margins": 0.14644809067249298, + "rewards/rejected": -0.7448288202285767, "step": 8240 }, { - "epoch": 1.42, - "grad_norm": 27.38502220196457, - "learning_rate": 1.1699425431219079e-07, - "logits/chosen": -1.2738348245620728, - "logits/rejected": -1.215987205505371, - "logps/chosen": -255.5185546875, - "logps/rejected": -400.17083740234375, - "loss": 0.4124, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.9787349700927734, - "rewards/margins": 1.4798691272735596, - "rewards/rejected": -3.458604335784912, + "epoch": 1.421433494141971, + "grad_norm": 9.576996803283691, + "learning_rate": 2.339885086243816e-08, + "logits/chosen": -2.509434223175049, + "logits/rejected": -2.4891815185546875, + "logps/chosen": -113.9859390258789, + "logps/rejected": -128.5716094970703, + "loss": 0.6249, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5633655786514282, + "rewards/margins": 0.17911618947982788, + "rewards/rejected": -0.7424817681312561, "step": 8250 }, { - "epoch": 1.42, - "grad_norm": 35.93771365818128, - "learning_rate": 1.1635829102740294e-07, - "logits/chosen": -1.3693095445632935, - "logits/rejected": -1.3095340728759766, - "logps/chosen": -248.7408905029297, - "logps/rejected": -385.82574462890625, - "loss": 0.4401, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.9232066869735718, - "rewards/margins": 1.382420539855957, - "rewards/rejected": -3.3056271076202393, + "epoch": 1.42315644383184, + "grad_norm": 9.92259407043457, + "learning_rate": 2.3271658205480586e-08, + "logits/chosen": -2.587524890899658, + "logits/rejected": -2.5685718059539795, + "logps/chosen": -113.36087799072266, + "logps/rejected": -133.5902099609375, + "loss": 0.6148, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5694628953933716, + "rewards/margins": 0.21361835300922394, + "rewards/rejected": -0.7830812335014343, "step": 8260 }, { - "epoch": 1.42, - "grad_norm": 32.99199105515522, - "learning_rate": 1.1572353627487948e-07, - "logits/chosen": -1.3601871728897095, - "logits/rejected": -1.3054234981536865, - "logps/chosen": -243.6525421142578, - "logps/rejected": -381.771240234375, - "loss": 0.4241, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.9026378393173218, - "rewards/margins": 1.3616206645965576, - "rewards/rejected": -3.264258623123169, + "epoch": 1.4248793935217092, + "grad_norm": 11.390279769897461, + "learning_rate": 2.3144707254975898e-08, + "logits/chosen": -2.5706088542938232, + "logits/rejected": -2.5607664585113525, + "logps/chosen": -112.4377670288086, + "logps/rejected": -129.7074432373047, + "loss": 0.6412, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5904213190078735, + "rewards/margins": 0.15297070145606995, + "rewards/rejected": -0.7433920502662659, "step": 8270 }, { - "epoch": 1.43, - "grad_norm": 40.84568027678274, - "learning_rate": 1.1508999579475654e-07, - "logits/chosen": -1.321771502494812, - "logits/rejected": -1.2765603065490723, - "logps/chosen": -239.63668823242188, - "logps/rejected": -357.2843933105469, - "loss": 0.453, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.8728437423706055, - "rewards/margins": 1.1715965270996094, - "rewards/rejected": -3.044440269470215, + "epoch": 1.4266023432115782, + "grad_norm": 9.951886177062988, + "learning_rate": 2.3017999158951305e-08, + "logits/chosen": -2.4798500537872314, + "logits/rejected": -2.4695355892181396, + "logps/chosen": -115.2122573852539, + "logps/rejected": -127.40425872802734, + "loss": 0.6564, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6283519864082336, + "rewards/margins": 0.11694036424160004, + "rewards/rejected": -0.7452923059463501, "step": 8280 }, { - "epoch": 1.43, - "grad_norm": 27.9600215465676, - "learning_rate": 1.1445767531618944e-07, - "logits/chosen": -1.2803277969360352, - "logits/rejected": -1.1913877725601196, - "logps/chosen": -233.3045196533203, - "logps/rejected": -352.5013732910156, - "loss": 0.4225, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.7336339950561523, - "rewards/margins": 1.2495338916778564, - "rewards/rejected": -2.983167886734009, + "epoch": 1.4283252929014472, + "grad_norm": 9.370098114013672, + "learning_rate": 2.2891535063237886e-08, + "logits/chosen": -2.4599788188934326, + "logits/rejected": -2.414714813232422, + "logps/chosen": -116.16448974609375, + "logps/rejected": -124.69068908691406, + "loss": 0.6398, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5622825622558594, + "rewards/margins": 0.14268946647644043, + "rewards/rejected": -0.704971969127655, "step": 8290 }, { - "epoch": 1.43, - "grad_norm": 25.002008446127316, - "learning_rate": 1.1382658055730096e-07, - "logits/chosen": -1.419141173362732, - "logits/rejected": -1.3513821363449097, - "logps/chosen": -243.5513458251953, - "logps/rejected": -379.9726867675781, - "loss": 0.4659, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.8599157333374023, - "rewards/margins": 1.3946608304977417, - "rewards/rejected": -3.2545769214630127, + "epoch": 1.4300482425913164, + "grad_norm": 8.352498054504395, + "learning_rate": 2.2765316111460193e-08, + "logits/chosen": -2.627147674560547, + "logits/rejected": -2.605398178100586, + "logps/chosen": -116.4769058227539, + "logps/rejected": -127.98457336425781, + "loss": 0.6472, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5889750719070435, + "rewards/margins": 0.14557085931301117, + "rewards/rejected": -0.7345459461212158, "step": 8300 }, { - "epoch": 1.43, - "eval_logits/chosen": -1.4014118909835815, - "eval_logits/rejected": -1.3744704723358154, - "eval_logps/chosen": -253.2737579345703, - "eval_logps/rejected": -304.1915588378906, - "eval_loss": 0.6337563991546631, - "eval_rewards/accuracies": 0.6642658114433289, - "eval_rewards/chosen": -1.9456990957260132, - "eval_rewards/margins": 0.4646424651145935, - "eval_rewards/rejected": -2.410341739654541, - "eval_runtime": 357.7816, - "eval_samples_per_second": 12.03, - "eval_steps_per_second": 1.504, + "epoch": 1.4300482425913164, + "eval_logits/chosen": -2.6018471717834473, + "eval_logits/rejected": -2.5952625274658203, + "eval_logps/chosen": -105.32196044921875, + "eval_logps/rejected": -119.30452728271484, + "eval_loss": 0.6573455929756165, + "eval_rewards/accuracies": 0.6217471957206726, + "eval_rewards/chosen": -0.46610066294670105, + "eval_rewards/margins": 0.0951433777809143, + "eval_rewards/rejected": -0.5612440705299377, + "eval_runtime": 359.5035, + "eval_samples_per_second": 11.972, + "eval_steps_per_second": 1.497, "step": 8300 }, { - "epoch": 1.43, - "grad_norm": 42.89349489877897, - "learning_rate": 1.1319671722512958e-07, - "logits/chosen": -1.2304198741912842, - "logits/rejected": -1.1626794338226318, - "logps/chosen": -231.46615600585938, - "logps/rejected": -345.99493408203125, - "loss": 0.4309, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.7518718242645264, - "rewards/margins": 1.1967473030090332, - "rewards/rejected": -2.9486191272735596, + "epoch": 1.4317711922811853, + "grad_norm": 12.541621208190918, + "learning_rate": 2.2639343445025914e-08, + "logits/chosen": -2.4146697521209717, + "logits/rejected": -2.3866145610809326, + "logps/chosen": -108.785888671875, + "logps/rejected": -118.2060317993164, + "loss": 0.6407, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5251675844192505, + "rewards/margins": 0.1453622281551361, + "rewards/rejected": -0.6705297231674194, "step": 8310 }, { - "epoch": 1.43, - "grad_norm": 42.24768978841262, - "learning_rate": 1.1256809101557793e-07, - "logits/chosen": -1.3194677829742432, - "logits/rejected": -1.263414740562439, - "logps/chosen": -216.904052734375, - "logps/rejected": -355.9466552734375, - "loss": 0.4015, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.644235372543335, - "rewards/margins": 1.3747762441635132, - "rewards/rejected": -3.0190117359161377, + "epoch": 1.4334941419710545, + "grad_norm": 9.450784683227539, + "learning_rate": 2.2513618203115585e-08, + "logits/chosen": -2.483109951019287, + "logits/rejected": -2.468421220779419, + "logps/chosen": -104.41475677490234, + "logps/rejected": -123.78426361083984, + "loss": 0.6265, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5190132260322571, + "rewards/margins": 0.17828358709812164, + "rewards/rejected": -0.6972967982292175, "step": 8320 }, { - "epoch": 1.44, - "grad_norm": 24.459920685245006, - "learning_rate": 1.1194070761336133e-07, - "logits/chosen": -1.3198477029800415, - "logits/rejected": -1.2680397033691406, - "logps/chosen": -233.9806365966797, - "logps/rejected": -365.51239013671875, - "loss": 0.4175, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.8538386821746826, - "rewards/margins": 1.262743592262268, - "rewards/rejected": -3.1165823936462402, + "epoch": 1.4352170916609235, + "grad_norm": 10.334905624389648, + "learning_rate": 2.2388141522672265e-08, + "logits/chosen": -2.497607469558716, + "logits/rejected": -2.4882454872131348, + "logps/chosen": -105.68269348144531, + "logps/rejected": -125.55326843261719, + "loss": 0.6422, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5706783533096313, + "rewards/margins": 0.14622056484222412, + "rewards/rejected": -0.7168989181518555, "step": 8330 }, { - "epoch": 1.44, - "grad_norm": 44.12413238893927, - "learning_rate": 1.1131457269195598e-07, - "logits/chosen": -1.3826789855957031, - "logits/rejected": -1.3336975574493408, - "logps/chosen": -241.165283203125, - "logps/rejected": -356.74481201171875, - "loss": 0.4828, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.8674249649047852, - "rewards/margins": 1.1834924221038818, - "rewards/rejected": -3.050917387008667, + "epoch": 1.4369400413507925, + "grad_norm": 9.96259593963623, + "learning_rate": 2.22629145383912e-08, + "logits/chosen": -2.5489068031311035, + "logits/rejected": -2.5338613986968994, + "logps/chosen": -111.05059814453125, + "logps/rejected": -123.3394546508789, + "loss": 0.6394, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.566070556640625, + "rewards/margins": 0.15070785582065582, + "rewards/rejected": -0.7167783975601196, "step": 8340 }, { - "epoch": 1.44, - "grad_norm": 27.88048778386721, - "learning_rate": 1.106896919135483e-07, - "logits/chosen": -1.212838888168335, - "logits/rejected": -1.1588201522827148, - "logps/chosen": -237.91293334960938, - "logps/rejected": -359.9263000488281, - "loss": 0.4398, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.847447395324707, - "rewards/margins": 1.225775122642517, - "rewards/rejected": -3.0732228755950928, + "epoch": 1.4386629910406616, + "grad_norm": 10.026483535766602, + "learning_rate": 2.213793838270966e-08, + "logits/chosen": -2.3980913162231445, + "logits/rejected": -2.3814735412597656, + "logps/chosen": -115.02793884277344, + "logps/rejected": -126.07624816894531, + "loss": 0.6536, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.6185691356658936, + "rewards/margins": 0.11624781787395477, + "rewards/rejected": -0.7348170280456543, "step": 8350 }, { - "epoch": 1.44, - "grad_norm": 41.137677010372954, - "learning_rate": 1.1006607092898326e-07, - "logits/chosen": -1.2542845010757446, - "logits/rejected": -1.1702654361724854, - "logps/chosen": -207.97622680664062, - "logps/rejected": -355.0627746582031, - "loss": 0.3657, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.5343948602676392, - "rewards/margins": 1.4972972869873047, - "rewards/rejected": -3.0316920280456543, + "epoch": 1.4403859407305306, + "grad_norm": 11.445932388305664, + "learning_rate": 2.2013214185796653e-08, + "logits/chosen": -2.3885562419891357, + "logits/rejected": -2.355684757232666, + "logps/chosen": -111.50667572021484, + "logps/rejected": -127.6391372680664, + "loss": 0.6232, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5695258975028992, + "rewards/margins": 0.18785393238067627, + "rewards/rejected": -0.7573798894882202, "step": 8360 }, { - "epoch": 1.44, - "grad_norm": 30.172641899374565, - "learning_rate": 1.0944371537771347e-07, - "logits/chosen": -1.3405089378356934, - "logits/rejected": -1.2776302099227905, - "logps/chosen": -218.90786743164062, - "logps/rejected": -363.3168640136719, - "loss": 0.3823, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.6786830425262451, - "rewards/margins": 1.4186073541641235, - "rewards/rejected": -3.097290277481079, + "epoch": 1.4421088904203998, + "grad_norm": 10.501673698425293, + "learning_rate": 2.1888743075542692e-08, + "logits/chosen": -2.4620909690856934, + "logits/rejected": -2.44533109664917, + "logps/chosen": -113.21112060546875, + "logps/rejected": -130.18067932128906, + "loss": 0.6461, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6215483546257019, + "rewards/margins": 0.14412672817707062, + "rewards/rejected": -0.7656751871109009, "step": 8370 }, { - "epoch": 1.44, - "grad_norm": 26.55031255402569, - "learning_rate": 1.0882263088774809e-07, - "logits/chosen": -1.4416921138763428, - "logits/rejected": -1.3774442672729492, - "logps/chosen": -203.69728088378906, - "logps/rejected": -338.6905212402344, - "loss": 0.4202, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.5246427059173584, - "rewards/margins": 1.3209424018859863, - "rewards/rejected": -2.845585346221924, + "epoch": 1.4438318401102688, + "grad_norm": 8.78179931640625, + "learning_rate": 2.1764526177549618e-08, + "logits/chosen": -2.5779061317443848, + "logits/rejected": -2.5591421127319336, + "logps/chosen": -104.83152770996094, + "logps/rejected": -125.90281677246094, + "loss": 0.6241, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5359030961990356, + "rewards/margins": 0.1816587895154953, + "rewards/rejected": -0.7175619602203369, "step": 8380 }, { - "epoch": 1.45, - "grad_norm": 35.13759701459188, - "learning_rate": 1.0820282307560196e-07, - "logits/chosen": -1.4239284992218018, - "logits/rejected": -1.3506910800933838, - "logps/chosen": -219.5550537109375, - "logps/rejected": -363.826171875, - "loss": 0.3804, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.6286277770996094, - "rewards/margins": 1.4488070011138916, - "rewards/rejected": -3.077434778213501, + "epoch": 1.4455547898001377, + "grad_norm": 8.149552345275879, + "learning_rate": 2.1640564615120394e-08, + "logits/chosen": -2.5387609004974365, + "logits/rejected": -2.508159637451172, + "logps/chosen": -111.774658203125, + "logps/rejected": -135.379638671875, + "loss": 0.5986, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5507842302322388, + "rewards/margins": 0.241925448179245, + "rewards/rejected": -0.7927096486091614, "step": 8390 }, { - "epoch": 1.45, - "grad_norm": 28.883468549799368, - "learning_rate": 1.075842975462449e-07, - "logits/chosen": -1.3790721893310547, - "logits/rejected": -1.3156163692474365, - "logps/chosen": -207.9996337890625, - "logps/rejected": -343.63677978515625, - "loss": 0.4254, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.5770851373672485, - "rewards/margins": 1.351677656173706, - "rewards/rejected": -2.928762912750244, + "epoch": 1.447277739490007, + "grad_norm": 10.88570499420166, + "learning_rate": 2.151685950924898e-08, + "logits/chosen": -2.5128719806671143, + "logits/rejected": -2.4928791522979736, + "logps/chosen": -104.6343765258789, + "logps/rejected": -121.88565826416016, + "loss": 0.6298, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5432831645011902, + "rewards/margins": 0.1677609086036682, + "rewards/rejected": -0.7110441327095032, "step": 8400 }, { - "epoch": 1.45, - "eval_logits/chosen": -1.4531240463256836, - "eval_logits/rejected": -1.4271684885025024, - "eval_logps/chosen": -233.58177185058594, - "eval_logps/rejected": -281.2073974609375, - "eval_loss": 0.6341521143913269, - "eval_rewards/accuracies": 0.6589219570159912, - "eval_rewards/chosen": -1.7487791776657104, - "eval_rewards/margins": 0.4317210614681244, - "eval_rewards/rejected": -2.1805002689361572, - "eval_runtime": 357.7383, - "eval_samples_per_second": 12.031, - "eval_steps_per_second": 1.504, + "epoch": 1.447277739490007, + "eval_logits/chosen": -2.599398374557495, + "eval_logits/rejected": -2.5927748680114746, + "eval_logps/chosen": -104.80558776855469, + "eval_logps/rejected": -118.77681732177734, + "eval_loss": 0.6573162078857422, + "eval_rewards/accuracies": 0.6205855011940002, + "eval_rewards/chosen": -0.460936963558197, + "eval_rewards/margins": 0.09503000974655151, + "eval_rewards/rejected": -0.5559669733047485, + "eval_runtime": 359.7437, + "eval_samples_per_second": 11.964, + "eval_steps_per_second": 1.496, "step": 8400 }, { - "epoch": 1.45, - "grad_norm": 26.87906657018289, - "learning_rate": 1.0696705989305085e-07, - "logits/chosen": -1.3252408504486084, - "logits/rejected": -1.2531208992004395, - "logps/chosen": -228.47628784179688, - "logps/rejected": -369.2504577636719, - "loss": 0.4096, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.7083253860473633, - "rewards/margins": 1.466355323791504, - "rewards/rejected": -3.1746809482574463, + "epoch": 1.449000689179876, + "grad_norm": 8.112527847290039, + "learning_rate": 2.1393411978610172e-08, + "logits/chosen": -2.4637961387634277, + "logits/rejected": -2.4320006370544434, + "logps/chosen": -113.9928207397461, + "logps/rejected": -131.36863708496094, + "loss": 0.6049, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5633238554000854, + "rewards/margins": 0.23221902549266815, + "rewards/rejected": -0.7955428957939148, "step": 8410 }, { - "epoch": 1.45, - "grad_norm": 29.144216830031528, - "learning_rate": 1.0635111569774755e-07, - "logits/chosen": -1.2288157939910889, - "logits/rejected": -1.1764132976531982, - "logps/chosen": -192.9091796875, - "logps/rejected": -327.469482421875, - "loss": 0.3615, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -1.412838339805603, - "rewards/margins": 1.3490186929702759, - "rewards/rejected": -2.761857032775879, + "epoch": 1.450723638869745, + "grad_norm": 10.700355529785156, + "learning_rate": 2.1270223139549508e-08, + "logits/chosen": -2.3602561950683594, + "logits/rejected": -2.3470911979675293, + "logps/chosen": -106.328369140625, + "logps/rejected": -125.5106430053711, + "loss": 0.6249, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5467656254768372, + "rewards/margins": 0.19570481777191162, + "rewards/rejected": -0.7424705624580383, "step": 8420 }, { - "epoch": 1.45, - "grad_norm": 29.070566266372918, - "learning_rate": 1.0573647053036552e-07, - "logits/chosen": -1.3514432907104492, - "logits/rejected": -1.294538974761963, - "logps/chosen": -215.29830932617188, - "logps/rejected": -337.3027038574219, - "loss": 0.4437, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.6178497076034546, - "rewards/margins": 1.188742995262146, - "rewards/rejected": -2.8065924644470215, + "epoch": 1.452446588559614, + "grad_norm": 11.980216979980469, + "learning_rate": 2.1147294106073104e-08, + "logits/chosen": -2.4795050621032715, + "logits/rejected": -2.4644033908843994, + "logps/chosen": -109.48161315917969, + "logps/rejected": -128.14706420898438, + "loss": 0.638, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5599022507667542, + "rewards/margins": 0.15501144528388977, + "rewards/rejected": -0.7149137258529663, "step": 8430 }, { - "epoch": 1.45, - "grad_norm": 31.754197528307863, - "learning_rate": 1.0512312994918865e-07, - "logits/chosen": -1.387795329093933, - "logits/rejected": -1.3368542194366455, - "logps/chosen": -220.80947875976562, - "logps/rejected": -341.77264404296875, - "loss": 0.4488, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.6601117849349976, - "rewards/margins": 1.242388129234314, - "rewards/rejected": -2.9024999141693115, + "epoch": 1.454169538249483, + "grad_norm": 10.321578025817871, + "learning_rate": 2.102462598983773e-08, + "logits/chosen": -2.5045294761657715, + "logits/rejected": -2.489931583404541, + "logps/chosen": -116.15422058105469, + "logps/rejected": -124.9060287475586, + "loss": 0.6552, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6136160492897034, + "rewards/margins": 0.12014230340719223, + "rewards/rejected": -0.7337583303451538, "step": 8440 }, { - "epoch": 1.46, - "grad_norm": 25.669279884647963, - "learning_rate": 1.0451109950070275e-07, - "logits/chosen": -1.2506482601165771, - "logits/rejected": -1.2008402347564697, - "logps/chosen": -225.53250122070312, - "logps/rejected": -366.031982421875, - "loss": 0.4143, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.7753582000732422, - "rewards/margins": 1.3562324047088623, - "rewards/rejected": -3.1315910816192627, + "epoch": 1.4558924879393522, + "grad_norm": 10.292564392089844, + "learning_rate": 2.090221990014055e-08, + "logits/chosen": -2.392467498779297, + "logits/rejected": -2.3858132362365723, + "logps/chosen": -105.68475341796875, + "logps/rejected": -129.52902221679688, + "loss": 0.6235, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5767289996147156, + "rewards/margins": 0.1895083636045456, + "rewards/rejected": -0.7662373781204224, "step": 8450 }, { - "epoch": 1.46, - "grad_norm": 25.414444784217327, - "learning_rate": 1.039003847195466e-07, - "logits/chosen": -1.3718957901000977, - "logits/rejected": -1.3084933757781982, - "logps/chosen": -226.52572631835938, - "logps/rejected": -355.208251953125, - "loss": 0.378, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -1.7372535467147827, - "rewards/margins": 1.277478814125061, - "rewards/rejected": -3.0147323608398438, + "epoch": 1.4576154376292212, + "grad_norm": 10.444774627685547, + "learning_rate": 2.078007694390932e-08, + "logits/chosen": -2.5114336013793945, + "logits/rejected": -2.4861104488372803, + "logps/chosen": -110.15858459472656, + "logps/rejected": -128.76162719726562, + "loss": 0.6275, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5735193490982056, + "rewards/margins": 0.1767922192811966, + "rewards/rejected": -0.750311553478241, "step": 8460 }, { - "epoch": 1.46, - "grad_norm": 47.176323143328105, - "learning_rate": 1.0329099112846071e-07, - "logits/chosen": -1.3328293561935425, - "logits/rejected": -1.2753039598464966, - "logps/chosen": -257.13079833984375, - "logps/rejected": -379.00323486328125, - "loss": 0.4915, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.010730504989624, - "rewards/margins": 1.2479288578033447, - "rewards/rejected": -3.2586593627929688, + "epoch": 1.4593383873190904, + "grad_norm": 10.15573787689209, + "learning_rate": 2.0658198225692143e-08, + "logits/chosen": -2.4851930141448975, + "logits/rejected": -2.4597885608673096, + "logps/chosen": -116.81849670410156, + "logps/rejected": -128.00364685058594, + "loss": 0.6459, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6075204014778137, + "rewards/margins": 0.14105847477912903, + "rewards/rejected": -0.7485788464546204, "step": 8470 }, { - "epoch": 1.46, - "grad_norm": 32.56367895113138, - "learning_rate": 1.0268292423823838e-07, - "logits/chosen": -1.3693746328353882, - "logits/rejected": -1.2976529598236084, - "logps/chosen": -223.7007598876953, - "logps/rejected": -357.4267272949219, - "loss": 0.439, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.7069003582000732, - "rewards/margins": 1.3514857292175293, - "rewards/rejected": -3.0583860874176025, + "epoch": 1.4610613370089593, + "grad_norm": 10.597168922424316, + "learning_rate": 2.0536584847647675e-08, + "logits/chosen": -2.478397846221924, + "logits/rejected": -2.4498822689056396, + "logps/chosen": -112.1146011352539, + "logps/rejected": -126.52950286865234, + "loss": 0.6381, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5907739996910095, + "rewards/margins": 0.1586400866508484, + "rewards/rejected": -0.7494140863418579, "step": 8480 }, { - "epoch": 1.46, - "grad_norm": 27.548404093695904, - "learning_rate": 1.020761895476753e-07, - "logits/chosen": -1.4502463340759277, - "logits/rejected": -1.3987720012664795, - "logps/chosen": -210.2566375732422, - "logps/rejected": -340.35089111328125, - "loss": 0.4054, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.5552849769592285, - "rewards/margins": 1.2762036323547363, - "rewards/rejected": -2.831489086151123, + "epoch": 1.4627842866988283, + "grad_norm": 7.842881202697754, + "learning_rate": 2.041523790953506e-08, + "logits/chosen": -2.5317792892456055, + "logits/rejected": -2.5239269733428955, + "logps/chosen": -111.8382339477539, + "logps/rejected": -130.12106323242188, + "loss": 0.6354, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5709823369979858, + "rewards/margins": 0.15821108222007751, + "rewards/rejected": -0.7291934490203857, "step": 8490 }, { - "epoch": 1.46, - "grad_norm": 25.509139166950433, - "learning_rate": 1.0147079254352001e-07, - "logits/chosen": -1.3013901710510254, - "logits/rejected": -1.2501300573349, - "logps/chosen": -205.6046142578125, - "logps/rejected": -333.3544921875, - "loss": 0.4177, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.5129337310791016, - "rewards/margins": 1.2862192392349243, - "rewards/rejected": -2.7991526126861572, + "epoch": 1.4645072363886975, + "grad_norm": 9.383299827575684, + "learning_rate": 2.0294158508704e-08, + "logits/chosen": -2.409794569015503, + "logits/rejected": -2.395017147064209, + "logps/chosen": -109.6671142578125, + "logps/rejected": -127.86033630371094, + "loss": 0.6207, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5536033511161804, + "rewards/margins": 0.19054511189460754, + "rewards/rejected": -0.7441484928131104, "step": 8500 }, { - "epoch": 1.46, - "eval_logits/chosen": -1.4731091260910034, - "eval_logits/rejected": -1.4476723670959473, - "eval_logps/chosen": -229.227783203125, - "eval_logps/rejected": -275.5844421386719, - "eval_loss": 0.6337706446647644, - "eval_rewards/accuracies": 0.6589219570159912, - "eval_rewards/chosen": -1.705239176750183, - "eval_rewards/margins": 0.4190312325954437, - "eval_rewards/rejected": -2.124270439147949, - "eval_runtime": 357.6856, - "eval_samples_per_second": 12.033, - "eval_steps_per_second": 1.504, + "epoch": 1.4645072363886975, + "eval_logits/chosen": -2.595156669616699, + "eval_logits/rejected": -2.5885396003723145, + "eval_logps/chosen": -104.50138092041016, + "eval_logps/rejected": -118.48872375488281, + "eval_loss": 0.6572505235671997, + "eval_rewards/accuracies": 0.6180297136306763, + "eval_rewards/chosen": -0.45789483189582825, + "eval_rewards/margins": 0.09519128501415253, + "eval_rewards/rejected": -0.5530860424041748, + "eval_runtime": 359.686, + "eval_samples_per_second": 11.966, + "eval_steps_per_second": 1.496, "step": 8500 }, { - "epoch": 1.47, - "grad_norm": 42.02138934399011, - "learning_rate": 1.008667387004242e-07, - "logits/chosen": -1.3346723318099976, - "logits/rejected": -1.2678366899490356, - "logps/chosen": -221.626953125, - "logps/rejected": -353.46221923828125, - "loss": 0.3852, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -1.6379005908966064, - "rewards/margins": 1.381575345993042, - "rewards/rejected": -3.0194761753082275, + "epoch": 1.4662301860785665, + "grad_norm": 11.489233016967773, + "learning_rate": 2.017334774008484e-08, + "logits/chosen": -2.4692511558532715, + "logits/rejected": -2.4383511543273926, + "logps/chosen": -114.93684387207031, + "logps/rejected": -128.41346740722656, + "loss": 0.6178, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5708756446838379, + "rewards/margins": 0.19799299538135529, + "rewards/rejected": -0.7688685655593872, "step": 8510 }, { - "epoch": 1.47, - "grad_norm": 31.896788173795407, - "learning_rate": 1.002640334808933e-07, - "logits/chosen": -1.3286793231964111, - "logits/rejected": -1.2712717056274414, - "logps/chosen": -226.07302856445312, - "logps/rejected": -338.1138000488281, - "loss": 0.4501, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.7251167297363281, - "rewards/margins": 1.1692787408828735, - "rewards/rejected": -2.8943958282470703, + "epoch": 1.4679531357684357, + "grad_norm": 10.60742473602295, + "learning_rate": 2.0052806696178658e-08, + "logits/chosen": -2.4626574516296387, + "logits/rejected": -2.4323716163635254, + "logps/chosen": -109.2204360961914, + "logps/rejected": -121.94303894042969, + "loss": 0.6338, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5565052032470703, + "rewards/margins": 0.1759699583053589, + "rewards/rejected": -0.7324752807617188, "step": 8520 }, { - "epoch": 1.47, - "grad_norm": 30.87059976350335, - "learning_rate": 9.9662682335237e-08, - "logits/chosen": -1.3089802265167236, - "logits/rejected": -1.2482343912124634, - "logps/chosen": -220.247314453125, - "logps/rejected": -340.78228759765625, - "loss": 0.4195, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.6637243032455444, - "rewards/margins": 1.2299330234527588, - "rewards/rejected": -2.8936572074890137, + "epoch": 1.4696760854583046, + "grad_norm": 9.371819496154785, + "learning_rate": 1.99325364670474e-08, + "logits/chosen": -2.4444479942321777, + "logits/rejected": -2.427464246749878, + "logps/chosen": -113.06219482421875, + "logps/rejected": -122.6248550415039, + "loss": 0.6539, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5917520523071289, + "rewards/margins": 0.12022414058446884, + "rewards/rejected": -0.7119762301445007, "step": 8530 }, { - "epoch": 1.47, - "grad_norm": 29.57486542104792, - "learning_rate": 9.906269070152004e-08, - "logits/chosen": -1.4429051876068115, - "logits/rejected": -1.3985341787338257, - "logps/chosen": -219.606689453125, - "logps/rejected": -326.71759033203125, - "loss": 0.4908, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.6491502523422241, - "rewards/margins": 1.0737760066986084, - "rewards/rejected": -2.722926378250122, + "epoch": 1.4713990351481736, + "grad_norm": 9.851543426513672, + "learning_rate": 1.9812538140304008e-08, + "logits/chosen": -2.568233013153076, + "logits/rejected": -2.5535032749176025, + "logps/chosen": -110.19102478027344, + "logps/rejected": -123.43714904785156, + "loss": 0.6447, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.554968535900116, + "rewards/margins": 0.13506591320037842, + "rewards/rejected": -0.6900344491004944, "step": 8540 }, { - "epoch": 1.47, - "grad_norm": 25.587032990412716, - "learning_rate": 9.846406400551308e-08, - "logits/chosen": -1.3666541576385498, - "logits/rejected": -1.3007842302322388, - "logps/chosen": -233.0788116455078, - "logps/rejected": -375.75469970703125, - "loss": 0.3971, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.7926390171051025, - "rewards/margins": 1.4118077754974365, - "rewards/rejected": -3.204446792602539, + "epoch": 1.4731219848380428, + "grad_norm": 9.308712005615234, + "learning_rate": 1.9692812801102615e-08, + "logits/chosen": -2.4927818775177, + "logits/rejected": -2.48159122467041, + "logps/chosen": -111.54307556152344, + "logps/rejected": -132.26345825195312, + "loss": 0.6221, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5770515203475952, + "rewards/margins": 0.1922859102487564, + "rewards/rejected": -0.7693374752998352, "step": 8550 }, { - "epoch": 1.47, - "grad_norm": 29.854174805438845, - "learning_rate": 9.786680766064318e-08, - "logits/chosen": -1.4583765268325806, - "logits/rejected": -1.3929063081741333, - "logps/chosen": -231.9521942138672, - "logps/rejected": -365.8257751464844, - "loss": 0.4261, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.7775615453720093, - "rewards/margins": 1.3575212955474854, - "rewards/rejected": -3.135082721710205, + "epoch": 1.4748449345279118, + "grad_norm": 9.304411888122559, + "learning_rate": 1.9573361532128635e-08, + "logits/chosen": -2.578611373901367, + "logits/rejected": -2.5568089485168457, + "logps/chosen": -111.9488525390625, + "logps/rejected": -129.10482788085938, + "loss": 0.6248, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5774158239364624, + "rewards/margins": 0.19028854370117188, + "rewards/rejected": -0.7677043080329895, "step": 8560 }, { - "epoch": 1.48, - "grad_norm": 31.030686033718272, - "learning_rate": 9.727092706794554e-08, - "logits/chosen": -1.3249984979629517, - "logits/rejected": -1.2678711414337158, - "logps/chosen": -229.1342010498047, - "logps/rejected": -341.3080749511719, - "loss": 0.4511, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.7480465173721313, - "rewards/margins": 1.145034670829773, - "rewards/rejected": -2.8930811882019043, + "epoch": 1.476567884217781, + "grad_norm": 10.689298629760742, + "learning_rate": 1.9454185413589108e-08, + "logits/chosen": -2.439042568206787, + "logits/rejected": -2.4161269664764404, + "logps/chosen": -112.94194030761719, + "logps/rejected": -128.4320068359375, + "loss": 0.6274, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.585848867893219, + "rewards/margins": 0.17812153697013855, + "rewards/rejected": -0.7639704346656799, "step": 8570 }, { - "epoch": 1.48, - "grad_norm": 32.81944513147602, - "learning_rate": 9.667642761601433e-08, - "logits/chosen": -1.418872594833374, - "logits/rejected": -1.3541626930236816, - "logps/chosen": -211.55941772460938, - "logps/rejected": -351.56304931640625, - "loss": 0.3834, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.5743210315704346, - "rewards/margins": 1.3962310552597046, - "rewards/rejected": -2.9705519676208496, + "epoch": 1.47829083390765, + "grad_norm": 10.238391876220703, + "learning_rate": 1.9335285523202867e-08, + "logits/chosen": -2.5246376991271973, + "logits/rejected": -2.5036513805389404, + "logps/chosen": -106.84709167480469, + "logps/rejected": -129.79830932617188, + "loss": 0.6038, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5271083116531372, + "rewards/margins": 0.22541937232017517, + "rewards/rejected": -0.7525277137756348, "step": 8580 }, { - "epoch": 1.48, - "grad_norm": 28.302147205613945, - "learning_rate": 9.608331468095377e-08, - "logits/chosen": -1.3961126804351807, - "logits/rejected": -1.319726586341858, - "logps/chosen": -209.43771362304688, - "logps/rejected": -345.10186767578125, - "loss": 0.358, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.524304986000061, - "rewards/margins": 1.3930083513259888, - "rewards/rejected": -2.91731333732605, + "epoch": 1.480013783597519, + "grad_norm": 8.995641708374023, + "learning_rate": 1.9216662936190753e-08, + "logits/chosen": -2.5343661308288574, + "logits/rejected": -2.506255626678467, + "logps/chosen": -107.98847961425781, + "logps/rejected": -123.33349609375, + "loss": 0.617, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5099393129348755, + "rewards/margins": 0.18957418203353882, + "rewards/rejected": -0.6995135545730591, "step": 8590 }, { - "epoch": 1.48, - "grad_norm": 22.424385790342782, - "learning_rate": 9.549159362632986e-08, - "logits/chosen": -1.3214257955551147, - "logits/rejected": -1.26907217502594, - "logps/chosen": -230.94613647460938, - "logps/rejected": -337.48309326171875, - "loss": 0.4537, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.7750024795532227, - "rewards/margins": 1.082525372505188, - "rewards/rejected": -2.8575279712677, + "epoch": 1.481736733287388, + "grad_norm": 10.315347671508789, + "learning_rate": 1.909831872526597e-08, + "logits/chosen": -2.4555325508117676, + "logits/rejected": -2.4294888973236084, + "logps/chosen": -112.75224304199219, + "logps/rejected": -121.38525390625, + "loss": 0.6661, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5931397676467896, + "rewards/margins": 0.10323586314916611, + "rewards/rejected": -0.6963757276535034, "step": 8600 }, { - "epoch": 1.48, - "eval_logits/chosen": -1.4457244873046875, - "eval_logits/rejected": -1.4196789264678955, - "eval_logps/chosen": -243.827392578125, - "eval_logps/rejected": -292.89404296875, - "eval_loss": 0.63252854347229, - "eval_rewards/accuracies": 0.6677509546279907, - "eval_rewards/chosen": -1.8512355089187622, - "eval_rewards/margins": 0.4461313486099243, - "eval_rewards/rejected": -2.2973668575286865, - "eval_runtime": 357.6199, - "eval_samples_per_second": 12.035, - "eval_steps_per_second": 1.504, + "epoch": 1.481736733287388, + "eval_logits/chosen": -2.591291904449463, + "eval_logits/rejected": -2.5845961570739746, + "eval_logps/chosen": -105.10528564453125, + "eval_logps/rejected": -119.16321563720703, + "eval_loss": 0.6571276187896729, + "eval_rewards/accuracies": 0.6203531622886658, + "eval_rewards/chosen": -0.46393388509750366, + "eval_rewards/margins": 0.09589700400829315, + "eval_rewards/rejected": -0.559830904006958, + "eval_runtime": 359.5964, + "eval_samples_per_second": 11.969, + "eval_steps_per_second": 1.496, "step": 8600 }, { - "epoch": 1.48, - "grad_norm": 33.10659637068871, - "learning_rate": 9.490126980312165e-08, - "logits/chosen": -1.3417989015579224, - "logits/rejected": -1.2814117670059204, - "logps/chosen": -232.3809814453125, - "logps/rejected": -360.255615234375, - "loss": 0.4184, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.7825310230255127, - "rewards/margins": 1.3146696090698242, - "rewards/rejected": -3.097200632095337, + "epoch": 1.483459682977257, + "grad_norm": 9.687759399414062, + "learning_rate": 1.898025396062433e-08, + "logits/chosen": -2.4798989295959473, + "logits/rejected": -2.459146499633789, + "logps/chosen": -111.55943298339844, + "logps/rejected": -124.44791412353516, + "loss": 0.6374, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5741453766822815, + "rewards/margins": 0.1647716462612152, + "rewards/rejected": -0.7389170527458191, "step": 8610 }, { - "epoch": 1.49, - "grad_norm": 29.42647146776832, - "learning_rate": 9.431234854967291e-08, - "logits/chosen": -1.2606632709503174, - "logits/rejected": -1.2125203609466553, - "logps/chosen": -236.4669189453125, - "logps/rejected": -357.4127502441406, - "loss": 0.4326, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.8169389963150024, - "rewards/margins": 1.2449721097946167, - "rewards/rejected": -3.061911106109619, + "epoch": 1.4851826326671262, + "grad_norm": 9.408075332641602, + "learning_rate": 1.886246970993458e-08, + "logits/chosen": -2.4093403816223145, + "logits/rejected": -2.394554853439331, + "logps/chosen": -114.43864440917969, + "logps/rejected": -121.6338882446289, + "loss": 0.6598, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5966447591781616, + "rewards/margins": 0.10718707740306854, + "rewards/rejected": -0.7038318514823914, "step": 8620 }, { - "epoch": 1.49, - "grad_norm": 23.619846728441956, - "learning_rate": 9.372483519164398e-08, - "logits/chosen": -1.2358766794204712, - "logits/rejected": -1.1763203144073486, - "logps/chosen": -209.49526977539062, - "logps/rejected": -350.7640075683594, - "loss": 0.3756, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.5778772830963135, - "rewards/margins": 1.4093445539474487, - "rewards/rejected": -2.9872217178344727, + "epoch": 1.4869055823569952, + "grad_norm": 10.718470573425293, + "learning_rate": 1.8744967038328796e-08, + "logits/chosen": -2.3900675773620605, + "logits/rejected": -2.3785276412963867, + "logps/chosen": -103.84339904785156, + "logps/rejected": -122.72688293457031, + "loss": 0.6201, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5213834643363953, + "rewards/margins": 0.1853039264678955, + "rewards/rejected": -0.7066873908042908, "step": 8630 }, { - "epoch": 1.49, - "grad_norm": 23.10386102267089, - "learning_rate": 9.313873504196313e-08, - "logits/chosen": -1.4057111740112305, - "logits/rejected": -1.3474836349487305, - "logps/chosen": -225.91744995117188, - "logps/rejected": -339.12139892578125, - "loss": 0.4632, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.7019745111465454, - "rewards/margins": 1.161685585975647, - "rewards/rejected": -2.8636600971221924, + "epoch": 1.4886285320468642, + "grad_norm": 9.722565650939941, + "learning_rate": 1.8627747008392626e-08, + "logits/chosen": -2.521705150604248, + "logits/rejected": -2.4955756664276123, + "logps/chosen": -114.07643127441406, + "logps/rejected": -121.75555419921875, + "loss": 0.6665, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5833483934402466, + "rewards/margins": 0.10640629380941391, + "rewards/rejected": -0.6897546648979187, "step": 8640 }, { - "epoch": 1.49, - "grad_norm": 51.94922788167167, - "learning_rate": 9.255405340077949e-08, - "logits/chosen": -1.321274757385254, - "logits/rejected": -1.256255865097046, - "logps/chosen": -225.003662109375, - "logps/rejected": -348.2788391113281, - "loss": 0.458, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.700404405593872, - "rewards/margins": 1.241997480392456, - "rewards/rejected": -2.9424021244049072, + "epoch": 1.4903514817367332, + "grad_norm": 9.940096855163574, + "learning_rate": 1.8510810680155898e-08, + "logits/chosen": -2.43583607673645, + "logits/rejected": -2.4159131050109863, + "logps/chosen": -109.96278381347656, + "logps/rejected": -126.08406829833984, + "loss": 0.6303, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5500949621200562, + "rewards/margins": 0.17001654207706451, + "rewards/rejected": -0.7201114892959595, "step": 8650 }, { - "epoch": 1.49, - "grad_norm": 32.91948860177697, - "learning_rate": 9.197079555541379e-08, - "logits/chosen": -1.3428630828857422, - "logits/rejected": -1.2893078327178955, - "logps/chosen": -228.0025177001953, - "logps/rejected": -354.33404541015625, - "loss": 0.46, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.7465832233428955, - "rewards/margins": 1.2794687747955322, - "rewards/rejected": -3.0260515213012695, + "epoch": 1.4920744314266023, + "grad_norm": 9.738779067993164, + "learning_rate": 1.8394159111082756e-08, + "logits/chosen": -2.447446346282959, + "logits/rejected": -2.4335532188415527, + "logps/chosen": -110.02061462402344, + "logps/rejected": -126.77510833740234, + "loss": 0.629, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.5665866732597351, + "rewards/margins": 0.18393734097480774, + "rewards/rejected": -0.75052410364151, "step": 8660 }, { - "epoch": 1.49, - "grad_norm": 23.89924519243948, - "learning_rate": 9.138896678031202e-08, - "logits/chosen": -1.4497371912002563, - "logits/rejected": -1.3866420984268188, - "logps/chosen": -211.95297241210938, - "logps/rejected": -345.5428466796875, - "loss": 0.4208, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.5996501445770264, - "rewards/margins": 1.3280017375946045, - "rewards/rejected": -2.927651882171631, + "epoch": 1.4937973811164715, + "grad_norm": 8.617897987365723, + "learning_rate": 1.8277793356062403e-08, + "logits/chosen": -2.5401453971862793, + "logits/rejected": -2.523151397705078, + "logps/chosen": -111.16178131103516, + "logps/rejected": -129.0643768310547, + "loss": 0.633, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5916589498519897, + "rewards/margins": 0.17111726105213165, + "rewards/rejected": -0.762776255607605, "step": 8670 }, { - "epoch": 1.5, - "grad_norm": 25.326870727709025, - "learning_rate": 9.080857233699624e-08, - "logits/chosen": -1.364079236984253, - "logits/rejected": -1.323317289352417, - "logps/chosen": -223.8002166748047, - "logps/rejected": -334.2272644042969, - "loss": 0.4608, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.7312272787094116, - "rewards/margins": 1.0833412408828735, - "rewards/rejected": -2.814568281173706, + "epoch": 1.4955203308063405, + "grad_norm": 11.4242525100708, + "learning_rate": 1.8161714467399248e-08, + "logits/chosen": -2.4823286533355713, + "logits/rejected": -2.4764370918273926, + "logps/chosen": -108.46478271484375, + "logps/rejected": -122.68070983886719, + "loss": 0.6517, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5778648257255554, + "rewards/margins": 0.121099554002285, + "rewards/rejected": -0.6989644765853882, "step": 8680 }, { - "epoch": 1.5, - "grad_norm": 28.8350727770196, - "learning_rate": 9.022961747401841e-08, - "logits/chosen": -1.385801911354065, - "logits/rejected": -1.323319911956787, - "logps/chosen": -221.8018341064453, - "logps/rejected": -329.33428955078125, - "loss": 0.4513, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.66269850730896, - "rewards/margins": 1.1479780673980713, - "rewards/rejected": -2.8106765747070312, + "epoch": 1.4972432804962095, + "grad_norm": 10.568251609802246, + "learning_rate": 1.8045923494803683e-08, + "logits/chosen": -2.504668951034546, + "logits/rejected": -2.47914457321167, + "logps/chosen": -115.25125885009766, + "logps/rejected": -117.3531723022461, + "loss": 0.6705, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.5971495509147644, + "rewards/margins": 0.093296580016613, + "rewards/rejected": -0.6904462575912476, "step": 8690 }, { - "epoch": 1.5, - "grad_norm": 27.89499138401969, - "learning_rate": 8.96521074269117e-08, - "logits/chosen": -1.3936054706573486, - "logits/rejected": -1.3266212940216064, - "logps/chosen": -221.9488067626953, - "logps/rejected": -332.57354736328125, - "loss": 0.4176, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.654510498046875, - "rewards/margins": 1.1451376676559448, - "rewards/rejected": -2.7996482849121094, + "epoch": 1.4989662301860784, + "grad_norm": 9.535359382629395, + "learning_rate": 1.793042148538234e-08, + "logits/chosen": -2.50766921043396, + "logits/rejected": -2.476576089859009, + "logps/chosen": -115.72029113769531, + "logps/rejected": -124.52447509765625, + "loss": 0.6475, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5922280550003052, + "rewards/margins": 0.12659205496311188, + "rewards/rejected": -0.7188200950622559, "step": 8700 }, { - "epoch": 1.5, - "eval_logits/chosen": -1.4750956296920776, - "eval_logits/rejected": -1.4490503072738647, - "eval_logps/chosen": -231.7505340576172, - "eval_logps/rejected": -279.62408447265625, - "eval_loss": 0.6307649612426758, - "eval_rewards/accuracies": 0.6654275059700012, - "eval_rewards/chosen": -1.7304668426513672, - "eval_rewards/margins": 0.4342002868652344, - "eval_rewards/rejected": -2.1646668910980225, - "eval_runtime": 356.447, - "eval_samples_per_second": 12.075, - "eval_steps_per_second": 1.509, + "epoch": 1.4989662301860784, + "eval_logits/chosen": -2.5911717414855957, + "eval_logits/rejected": -2.584584951400757, + "eval_logps/chosen": -104.41332244873047, + "eval_logps/rejected": -118.42513275146484, + "eval_loss": 0.6571927666664124, + "eval_rewards/accuracies": 0.6189591288566589, + "eval_rewards/chosen": -0.4570142924785614, + "eval_rewards/margins": 0.09543582797050476, + "eval_rewards/rejected": -0.5524501204490662, + "eval_runtime": 359.4233, + "eval_samples_per_second": 11.975, + "eval_steps_per_second": 1.497, "step": 8700 }, { - "epoch": 1.5, - "grad_norm": 44.13104620276346, - "learning_rate": 8.907604741814403e-08, - "logits/chosen": -1.3480675220489502, - "logits/rejected": -1.3072960376739502, - "logps/chosen": -227.20175170898438, - "logps/rejected": -328.07940673828125, - "loss": 0.4921, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.7436609268188477, - "rewards/margins": 1.0057998895645142, - "rewards/rejected": -2.7494606971740723, + "epoch": 1.5006891798759476, + "grad_norm": 13.408405303955078, + "learning_rate": 1.781520948362881e-08, + "logits/chosen": -2.4691367149353027, + "logits/rejected": -2.4566128253936768, + "logps/chosen": -111.83473205566406, + "logps/rejected": -123.60860443115234, + "loss": 0.6608, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.5899911522865295, + "rewards/margins": 0.1146332398056984, + "rewards/rejected": -0.7046244144439697, "step": 8710 }, { - "epoch": 1.5, - "grad_norm": 40.22151350765789, - "learning_rate": 8.850144265707039e-08, - "logits/chosen": -1.3768417835235596, - "logits/rejected": -1.312811255455017, - "logps/chosen": -228.0986328125, - "logps/rejected": -353.222412109375, - "loss": 0.3894, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.717865228652954, - "rewards/margins": 1.2698724269866943, - "rewards/rejected": -2.9877376556396484, + "epoch": 1.5024121295658168, + "grad_norm": 9.210076332092285, + "learning_rate": 1.7700288531414077e-08, + "logits/chosen": -2.479508876800537, + "logits/rejected": -2.454876184463501, + "logps/chosen": -111.58013916015625, + "logps/rejected": -125.40617370605469, + "loss": 0.6349, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5523253679275513, + "rewards/margins": 0.1571783721446991, + "rewards/rejected": -0.7095038294792175, "step": 8720 }, { - "epoch": 1.5, - "grad_norm": 34.383596796071096, - "learning_rate": 8.792829833988588e-08, - "logits/chosen": -1.3603075742721558, - "logits/rejected": -1.2962515354156494, - "logps/chosen": -222.20852661132812, - "logps/rejected": -350.58489990234375, - "loss": 0.4555, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.699462890625, - "rewards/margins": 1.288184404373169, - "rewards/rejected": -2.987647533416748, + "epoch": 1.5041350792556858, + "grad_norm": 10.0499267578125, + "learning_rate": 1.7585659667977177e-08, + "logits/chosen": -2.494037389755249, + "logits/rejected": -2.4638280868530273, + "logps/chosen": -107.62284088134766, + "logps/rejected": -125.51090240478516, + "loss": 0.6281, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5535154342651367, + "rewards/margins": 0.1831100732088089, + "rewards/rejected": -0.7366255521774292, "step": 8730 }, { - "epoch": 1.51, - "grad_norm": 43.757133699793904, - "learning_rate": 8.735661964957869e-08, - "logits/chosen": -1.3438574075698853, - "logits/rejected": -1.2943146228790283, - "logps/chosen": -223.5640411376953, - "logps/rejected": -365.6803894042969, - "loss": 0.3991, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.721771001815796, - "rewards/margins": 1.4051529169082642, - "rewards/rejected": -3.1269240379333496, + "epoch": 1.5058580289455548, + "grad_norm": 10.449970245361328, + "learning_rate": 1.747132392991574e-08, + "logits/chosen": -2.4582109451293945, + "logits/rejected": -2.4484829902648926, + "logps/chosen": -110.2053451538086, + "logps/rejected": -129.15306091308594, + "loss": 0.6306, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5880126357078552, + "rewards/margins": 0.17351998388767242, + "rewards/rejected": -0.7615326046943665, "step": 8740 }, { - "epoch": 1.51, - "grad_norm": 33.49609695850668, - "learning_rate": 8.678641175588324e-08, - "logits/chosen": -1.3823951482772827, - "logits/rejected": -1.3145440816879272, - "logps/chosen": -226.49917602539062, - "logps/rejected": -367.0007019042969, - "loss": 0.4057, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.7283353805541992, - "rewards/margins": 1.4096132516860962, - "rewards/rejected": -3.137948513031006, + "epoch": 1.5075809786354237, + "grad_norm": 10.32069206237793, + "learning_rate": 1.735728235117665e-08, + "logits/chosen": -2.501255512237549, + "logits/rejected": -2.4790732860565186, + "logps/chosen": -108.33027648925781, + "logps/rejected": -128.27000427246094, + "loss": 0.6158, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.54632169008255, + "rewards/margins": 0.20436473190784454, + "rewards/rejected": -0.7506864666938782, "step": 8750 }, { - "epoch": 1.51, - "grad_norm": 24.899312092376913, - "learning_rate": 8.62176798152335e-08, - "logits/chosen": -1.3334381580352783, - "logits/rejected": -1.2971229553222656, - "logps/chosen": -219.86520385742188, - "logps/rejected": -323.74310302734375, - "loss": 0.4992, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.6835765838623047, - "rewards/margins": 1.002506971359253, - "rewards/rejected": -2.6860833168029785, + "epoch": 1.509303928325293, + "grad_norm": 8.917093276977539, + "learning_rate": 1.7243535963046702e-08, + "logits/chosen": -2.444776773452759, + "logits/rejected": -2.4357781410217285, + "logps/chosen": -104.23927307128906, + "logps/rejected": -127.4343032836914, + "loss": 0.6182, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5274238586425781, + "rewards/margins": 0.19519765675067902, + "rewards/rejected": -0.7226213812828064, "step": 8760 }, { - "epoch": 1.51, - "grad_norm": 24.892210563053087, - "learning_rate": 8.565042897071606e-08, - "logits/chosen": -1.3791553974151611, - "logits/rejected": -1.3129332065582275, - "logps/chosen": -221.7880096435547, - "logps/rejected": -350.95306396484375, - "loss": 0.4061, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.6673253774642944, - "rewards/margins": 1.3446027040481567, - "rewards/rejected": -3.011927843093872, + "epoch": 1.5110268780151621, + "grad_norm": 9.523360252380371, + "learning_rate": 1.7130085794143213e-08, + "logits/chosen": -2.4868338108062744, + "logits/rejected": -2.4598746299743652, + "logps/chosen": -113.57242584228516, + "logps/rejected": -126.35479736328125, + "loss": 0.633, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5849705338478088, + "rewards/margins": 0.18097633123397827, + "rewards/rejected": -0.7659467458724976, "step": 8770 }, { - "epoch": 1.51, - "grad_norm": 29.916856004909253, - "learning_rate": 8.508466435202402e-08, - "logits/chosen": -1.413527488708496, - "logits/rejected": -1.3698149919509888, - "logps/chosen": -221.784912109375, - "logps/rejected": -348.02508544921875, - "loss": 0.4088, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.7269346714019775, - "rewards/margins": 1.2188551425933838, - "rewards/rejected": -2.9457898139953613, + "epoch": 1.512749827705031, + "grad_norm": 9.518341064453125, + "learning_rate": 1.7016932870404804e-08, + "logits/chosen": -2.508802652359009, + "logits/rejected": -2.5059142112731934, + "logps/chosen": -106.00608825683594, + "logps/rejected": -129.78359985351562, + "loss": 0.6202, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5691240429878235, + "rewards/margins": 0.19421690702438354, + "rewards/rejected": -0.763340950012207, "step": 8780 }, { - "epoch": 1.51, - "grad_norm": 33.97524926089677, - "learning_rate": 8.452039107541042e-08, - "logits/chosen": -1.402840256690979, - "logits/rejected": -1.333287000656128, - "logps/chosen": -228.7657928466797, - "logps/rejected": -360.1128845214844, - "loss": 0.4378, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.7385919094085693, - "rewards/margins": 1.3424417972564697, - "rewards/rejected": -3.08103346824646, + "epoch": 1.5144727773949, + "grad_norm": 12.33554744720459, + "learning_rate": 1.6904078215082085e-08, + "logits/chosen": -2.5131776332855225, + "logits/rejected": -2.4882612228393555, + "logps/chosen": -113.7440414428711, + "logps/rejected": -125.62518310546875, + "loss": 0.6415, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5882024765014648, + "rewards/margins": 0.1477518081665039, + "rewards/rejected": -0.7359542846679688, "step": 8790 }, { - "epoch": 1.52, - "grad_norm": 35.70383300311999, - "learning_rate": 8.395761424364193e-08, - "logits/chosen": -1.3199231624603271, - "logits/rejected": -1.2492494583129883, - "logps/chosen": -216.8665771484375, - "logps/rejected": -342.27081298828125, - "loss": 0.4486, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.6186439990997314, - "rewards/margins": 1.2790682315826416, - "rewards/rejected": -2.897711992263794, + "epoch": 1.516195727084769, + "grad_norm": 11.471351623535156, + "learning_rate": 1.6791522848728385e-08, + "logits/chosen": -2.4327633380889893, + "logits/rejected": -2.402374029159546, + "logps/chosen": -109.92652893066406, + "logps/rejected": -120.5476303100586, + "loss": 0.6476, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5492741465568542, + "rewards/margins": 0.13125640153884888, + "rewards/rejected": -0.6805306077003479, "step": 8800 }, { - "epoch": 1.52, - "eval_logits/chosen": -1.4813001155853271, - "eval_logits/rejected": -1.4554513692855835, - "eval_logps/chosen": -232.9863739013672, - "eval_logps/rejected": -280.9822082519531, - "eval_loss": 0.629108190536499, - "eval_rewards/accuracies": 0.669377326965332, - "eval_rewards/chosen": -1.7428252696990967, - "eval_rewards/margins": 0.43542277812957764, - "eval_rewards/rejected": -2.178248167037964, - "eval_runtime": 357.8908, - "eval_samples_per_second": 12.026, - "eval_steps_per_second": 1.503, + "epoch": 1.516195727084769, + "eval_logits/chosen": -2.5883116722106934, + "eval_logits/rejected": -2.5816092491149902, + "eval_logps/chosen": -104.7545394897461, + "eval_logps/rejected": -118.84388732910156, + "eval_loss": 0.6569488644599915, + "eval_rewards/accuracies": 0.6194238066673279, + "eval_rewards/chosen": -0.4604264795780182, + "eval_rewards/margins": 0.09621115028858185, + "eval_rewards/rejected": -0.556637704372406, + "eval_runtime": 359.5269, + "eval_samples_per_second": 11.971, + "eval_steps_per_second": 1.496, "step": 8800 }, { - "epoch": 1.52, - "grad_norm": 25.157636621740963, - "learning_rate": 8.33963389459528e-08, - "logits/chosen": -1.4286869764328003, - "logits/rejected": -1.3668185472488403, - "logps/chosen": -215.29214477539062, - "logps/rejected": -350.935546875, - "loss": 0.3864, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.6436516046524048, - "rewards/margins": 1.3397597074508667, - "rewards/rejected": -2.9834113121032715, + "epoch": 1.5179186767746382, + "grad_norm": 11.274436950683594, + "learning_rate": 1.667926778919056e-08, + "logits/chosen": -2.546412706375122, + "logits/rejected": -2.524261951446533, + "logps/chosen": -108.3786849975586, + "logps/rejected": -128.4949188232422, + "loss": 0.6263, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.57418292760849, + "rewards/margins": 0.18470506370067596, + "rewards/rejected": -0.7588880658149719, "step": 8810 }, { - "epoch": 1.52, - "grad_norm": 30.0040629228507, - "learning_rate": 8.283657025799872e-08, - "logits/chosen": -1.40675950050354, - "logits/rejected": -1.3426892757415771, - "logps/chosen": -213.4414825439453, - "logps/rejected": -352.6174621582031, - "loss": 0.3977, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -1.630210518836975, - "rewards/margins": 1.3984111547470093, - "rewards/rejected": -3.0286216735839844, + "epoch": 1.5196416264645074, + "grad_norm": 9.02099895477295, + "learning_rate": 1.6567314051599745e-08, + "logits/chosen": -2.5184104442596436, + "logits/rejected": -2.498175859451294, + "logps/chosen": -101.60962677001953, + "logps/rejected": -120.52022552490234, + "loss": 0.616, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.511854350566864, + "rewards/margins": 0.19552382826805115, + "rewards/rejected": -0.7073782682418823, "step": 8820 }, { - "epoch": 1.52, - "grad_norm": 44.391862230544554, - "learning_rate": 8.227831324181109e-08, - "logits/chosen": -1.2691117525100708, - "logits/rejected": -1.2064220905303955, - "logps/chosen": -220.8523406982422, - "logps/rejected": -337.88763427734375, - "loss": 0.5033, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.6626207828521729, - "rewards/margins": 1.188899278640747, - "rewards/rejected": -2.85152006149292, + "epoch": 1.5213645761543764, + "grad_norm": 9.249839782714844, + "learning_rate": 1.6455662648362217e-08, + "logits/chosen": -2.3751707077026367, + "logits/rejected": -2.353773832321167, + "logps/chosen": -110.77812194824219, + "logps/rejected": -122.40968322753906, + "loss": 0.6505, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5618213415145874, + "rewards/margins": 0.1348516345024109, + "rewards/rejected": -0.6966729164123535, "step": 8830 }, { - "epoch": 1.52, - "grad_norm": 35.6035223426018, - "learning_rate": 8.172157294575108e-08, - "logits/chosen": -1.3077764511108398, - "logits/rejected": -1.2591360807418823, - "logps/chosen": -206.6147003173828, - "logps/rejected": -329.2171936035156, - "loss": 0.4314, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.5428450107574463, - "rewards/margins": 1.211372971534729, - "rewards/rejected": -2.7542176246643066, + "epoch": 1.5230875258442453, + "grad_norm": 11.191672325134277, + "learning_rate": 1.6344314589150214e-08, + "logits/chosen": -2.403146982192993, + "logits/rejected": -2.3920133113861084, + "logps/chosen": -103.38592529296875, + "logps/rejected": -124.9670639038086, + "loss": 0.6167, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5103756189346313, + "rewards/margins": 0.2013304978609085, + "rewards/rejected": -0.7117059826850891, "step": 8840 }, { - "epoch": 1.52, - "grad_norm": 24.29335767133043, - "learning_rate": 8.116635440446402e-08, - "logits/chosen": -1.461669921875, - "logits/rejected": -1.3966032266616821, - "logps/chosen": -200.99661254882812, - "logps/rejected": -347.1763000488281, - "loss": 0.3793, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -1.4875738620758057, - "rewards/margins": 1.4415907859802246, - "rewards/rejected": -2.9291648864746094, + "epoch": 1.5248104755341143, + "grad_norm": 9.034074783325195, + "learning_rate": 1.6233270880892802e-08, + "logits/chosen": -2.566189765930176, + "logits/rejected": -2.548550844192505, + "logps/chosen": -104.69144439697266, + "logps/rejected": -126.1294937133789, + "loss": 0.6212, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5244103670120239, + "rewards/margins": 0.19395431876182556, + "rewards/rejected": -0.7183647155761719, "step": 8850 }, { - "epoch": 1.53, - "grad_norm": 28.563826235603777, - "learning_rate": 8.061266263883404e-08, - "logits/chosen": -1.3678812980651855, - "logits/rejected": -1.3074369430541992, - "logps/chosen": -219.76559448242188, - "logps/rejected": -344.3088684082031, - "loss": 0.4045, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -1.665743112564087, - "rewards/margins": 1.2486860752105713, - "rewards/rejected": -2.914429187774658, + "epoch": 1.5265334252239835, + "grad_norm": 10.647709846496582, + "learning_rate": 1.612253252776681e-08, + "logits/chosen": -2.4782826900482178, + "logits/rejected": -2.4569811820983887, + "logps/chosen": -109.85008239746094, + "logps/rejected": -128.80184936523438, + "loss": 0.6183, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5666898488998413, + "rewards/margins": 0.1927156150341034, + "rewards/rejected": -0.7594054341316223, "step": 8860 }, { - "epoch": 1.53, - "grad_norm": 35.12276578609268, - "learning_rate": 8.006050265593814e-08, - "logits/chosen": -1.510338544845581, - "logits/rejected": -1.4287976026535034, - "logps/chosen": -221.04647827148438, - "logps/rejected": -359.68280029296875, - "loss": 0.3904, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.6428911685943604, - "rewards/margins": 1.4038238525390625, - "rewards/rejected": -3.046715021133423, + "epoch": 1.5282563749138525, + "grad_norm": 11.228182792663574, + "learning_rate": 1.601210053118763e-08, + "logits/chosen": -2.6204450130462646, + "logits/rejected": -2.5886178016662598, + "logps/chosen": -112.09134674072266, + "logps/rejected": -129.68994140625, + "loss": 0.6196, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5532141327857971, + "rewards/margins": 0.19342902302742004, + "rewards/rejected": -0.7466431856155396, "step": 8870 }, { - "epoch": 1.53, - "grad_norm": 44.16416214094416, - "learning_rate": 7.950987944900192e-08, - "logits/chosen": -1.3029206991195679, - "logits/rejected": -1.2382522821426392, - "logps/chosen": -218.285888671875, - "logps/rejected": -348.65289306640625, - "loss": 0.4284, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.64852774143219, - "rewards/margins": 1.3227118253707886, - "rewards/rejected": -2.9712395668029785, + "epoch": 1.5299793246037217, + "grad_norm": 10.30317211151123, + "learning_rate": 1.5901975889800383e-08, + "logits/chosen": -2.4214556217193604, + "logits/rejected": -2.3968639373779297, + "logps/chosen": -112.53263092041016, + "logps/rejected": -126.5382080078125, + "loss": 0.6389, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5908144116401672, + "rewards/margins": 0.15907804667949677, + "rewards/rejected": -0.7498924136161804, "step": 8880 }, { - "epoch": 1.53, - "grad_norm": 37.57144300162699, - "learning_rate": 7.896079799735308e-08, - "logits/chosen": -1.3296968936920166, - "logits/rejected": -1.262048602104187, - "logps/chosen": -228.5214080810547, - "logps/rejected": -362.320556640625, - "loss": 0.3695, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.7403507232666016, - "rewards/margins": 1.3753221035003662, - "rewards/rejected": -3.115673065185547, + "epoch": 1.5317022742935906, + "grad_norm": 11.484233856201172, + "learning_rate": 1.5792159599470616e-08, + "logits/chosen": -2.458347797393799, + "logits/rejected": -2.4344797134399414, + "logps/chosen": -112.7353286743164, + "logps/rejected": -126.00569915771484, + "loss": 0.6296, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.582409679889679, + "rewards/margins": 0.16995909810066223, + "rewards/rejected": -0.7523688077926636, "step": 8890 }, { - "epoch": 1.53, - "grad_norm": 26.182342901511827, - "learning_rate": 7.841326326637781e-08, - "logits/chosen": -1.3689161539077759, - "logits/rejected": -1.2965118885040283, - "logps/chosen": -223.8673858642578, - "logps/rejected": -368.6548156738281, - "loss": 0.3594, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.6899131536483765, - "rewards/margins": 1.467007040977478, - "rewards/rejected": -3.1569199562072754, + "epoch": 1.5334252239834596, + "grad_norm": 8.930644035339355, + "learning_rate": 1.5682652653275564e-08, + "logits/chosen": -2.5040814876556396, + "logits/rejected": -2.475045680999756, + "logps/chosen": -112.71919250488281, + "logps/rejected": -127.18507385253906, + "loss": 0.6336, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5783306360244751, + "rewards/margins": 0.1635587066411972, + "rewards/rejected": -0.7418893575668335, "step": 8900 }, { - "epoch": 1.53, - "eval_logits/chosen": -1.4271249771118164, - "eval_logits/rejected": -1.4001696109771729, - "eval_logps/chosen": -251.50247192382812, - "eval_logps/rejected": -303.11505126953125, - "eval_loss": 0.6299463510513306, - "eval_rewards/accuracies": 0.6675186157226562, - "eval_rewards/chosen": -1.9279862642288208, - "eval_rewards/margins": 0.47159045934677124, - "eval_rewards/rejected": -2.3995769023895264, - "eval_runtime": 357.9404, - "eval_samples_per_second": 12.024, - "eval_steps_per_second": 1.503, + "epoch": 1.5334252239834596, + "eval_logits/chosen": -2.5838677883148193, + "eval_logits/rejected": -2.5771963596343994, + "eval_logps/chosen": -105.63292694091797, + "eval_logps/rejected": -119.80806732177734, + "eval_loss": 0.656794548034668, + "eval_rewards/accuracies": 0.6189591288566589, + "eval_rewards/chosen": -0.4692104160785675, + "eval_rewards/margins": 0.09706905484199524, + "eval_rewards/rejected": -0.5662794709205627, + "eval_runtime": 359.4324, + "eval_samples_per_second": 11.974, + "eval_steps_per_second": 1.497, "step": 8900 }, { - "epoch": 1.54, - "grad_norm": 33.45762067690765, - "learning_rate": 7.786728020747463e-08, - "logits/chosen": -1.3184845447540283, - "logits/rejected": -1.262537956237793, - "logps/chosen": -243.18051147460938, - "logps/rejected": -373.2486572265625, - "loss": 0.4348, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.8748201131820679, - "rewards/margins": 1.3269211053848267, - "rewards/rejected": -3.2017414569854736, + "epoch": 1.5351481736733288, + "grad_norm": 11.859126091003418, + "learning_rate": 1.5573456041494926e-08, + "logits/chosen": -2.4731361865997314, + "logits/rejected": -2.4600205421447754, + "logps/chosen": -116.18925476074219, + "logps/rejected": -133.89511108398438, + "loss": 0.6203, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6051359176635742, + "rewards/margins": 0.20322290062904358, + "rewards/rejected": -0.8083587884902954, "step": 8910 }, { - "epoch": 1.54, - "grad_norm": 17.15569346859002, - "learning_rate": 7.73228537580104e-08, - "logits/chosen": -1.4410854578018188, - "logits/rejected": -1.3533989191055298, - "logps/chosen": -239.29391479492188, - "logps/rejected": -403.9765625, - "loss": 0.3353, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -1.8225845098495483, - "rewards/margins": 1.6963199377059937, - "rewards/rejected": -3.518904447555542, + "epoch": 1.5368711233631978, + "grad_norm": 8.969971656799316, + "learning_rate": 1.5464570751602078e-08, + "logits/chosen": -2.5627894401550293, + "logits/rejected": -2.5319936275482178, + "logps/chosen": -115.80232238769531, + "logps/rejected": -133.4971160888672, + "loss": 0.6122, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5875670313835144, + "rewards/margins": 0.22629371285438538, + "rewards/rejected": -0.8138607740402222, "step": 8920 }, { - "epoch": 1.54, - "grad_norm": 23.134592614889822, - "learning_rate": 7.677998884127543e-08, - "logits/chosen": -1.3612538576126099, - "logits/rejected": -1.2906397581100464, - "logps/chosen": -248.0911102294922, - "logps/rejected": -387.23736572265625, - "loss": 0.4191, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.9340145587921143, - "rewards/margins": 1.4329915046691895, - "rewards/rejected": -3.367006301879883, + "epoch": 1.538594073053067, + "grad_norm": 9.40312385559082, + "learning_rate": 1.5355997768255086e-08, + "logits/chosen": -2.506324291229248, + "logits/rejected": -2.476980447769165, + "logps/chosen": -113.90055847167969, + "logps/rejected": -125.7453842163086, + "loss": 0.6358, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5921135544776917, + "rewards/margins": 0.15963508188724518, + "rewards/rejected": -0.7517485618591309, "step": 8930 }, { - "epoch": 1.54, - "grad_norm": 31.45965420355019, - "learning_rate": 7.623869036643901e-08, - "logits/chosen": -1.3538182973861694, - "logits/rejected": -1.290165662765503, - "logps/chosen": -234.25064086914062, - "logps/rejected": -377.9019470214844, - "loss": 0.3828, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.7864036560058594, - "rewards/margins": 1.434646725654602, - "rewards/rejected": -3.22105073928833, + "epoch": 1.540317022742936, + "grad_norm": 9.786426544189453, + "learning_rate": 1.5247738073287803e-08, + "logits/chosen": -2.488471508026123, + "logits/rejected": -2.470641851425171, + "logps/chosen": -111.13584899902344, + "logps/rejected": -128.58689880371094, + "loss": 0.6293, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5550108551979065, + "rewards/margins": 0.17295490205287933, + "rewards/rejected": -0.7279657125473022, "step": 8940 }, { - "epoch": 1.54, - "grad_norm": 52.90203534371852, - "learning_rate": 7.569896322850489e-08, - "logits/chosen": -1.214430570602417, - "logits/rejected": -1.1799967288970947, - "logps/chosen": -237.46340942382812, - "logps/rejected": -357.76739501953125, - "loss": 0.4502, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.8337962627410889, - "rewards/margins": 1.172048807144165, - "rewards/rejected": -3.005844831466675, + "epoch": 1.5420399724328049, + "grad_norm": 11.296224594116211, + "learning_rate": 1.5139792645700976e-08, + "logits/chosen": -2.3937854766845703, + "logits/rejected": -2.3945984840393066, + "logps/chosen": -107.13525390625, + "logps/rejected": -126.40647888183594, + "loss": 0.6402, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5303353071212769, + "rewards/margins": 0.16198968887329102, + "rewards/rejected": -0.6923248767852783, "step": 8950 }, { - "epoch": 1.54, - "grad_norm": 44.77979199965103, - "learning_rate": 7.516081230826715e-08, - "logits/chosen": -1.329178810119629, - "logits/rejected": -1.259275197982788, - "logps/chosen": -257.3124694824219, - "logps/rejected": -397.51519775390625, - "loss": 0.4137, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -2.0291686058044434, - "rewards/margins": 1.4289162158966064, - "rewards/rejected": -3.4580845832824707, + "epoch": 1.5437629221226739, + "grad_norm": 10.965607643127441, + "learning_rate": 1.503216246165343e-08, + "logits/chosen": -2.5009124279022217, + "logits/rejected": -2.4750828742980957, + "logps/chosen": -118.30435943603516, + "logps/rejected": -136.55319213867188, + "loss": 0.6181, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6390171051025391, + "rewards/margins": 0.20923519134521484, + "rewards/rejected": -0.8482524156570435, "step": 8960 }, { - "epoch": 1.55, - "grad_norm": 30.44657329134242, - "learning_rate": 7.462424247226606e-08, - "logits/chosen": -1.3297767639160156, - "logits/rejected": -1.2500559091567993, - "logps/chosen": -235.6514434814453, - "logps/rejected": -384.52142333984375, - "loss": 0.3625, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.8238303661346436, - "rewards/margins": 1.510573148727417, - "rewards/rejected": -3.3344035148620605, + "epoch": 1.545485871812543, + "grad_norm": 11.08918285369873, + "learning_rate": 1.4924848494453214e-08, + "logits/chosen": -2.4640207290649414, + "logits/rejected": -2.4379353523254395, + "logps/chosen": -112.51423645019531, + "logps/rejected": -126.09162902832031, + "loss": 0.633, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5923029184341431, + "rewards/margins": 0.15762947499752045, + "rewards/rejected": -0.7499323487281799, "step": 8970 }, { - "epoch": 1.55, - "grad_norm": 27.607114771049687, - "learning_rate": 7.408925857274373e-08, - "logits/chosen": -1.4012901782989502, - "logits/rejected": -1.3390867710113525, - "logps/chosen": -257.0660705566406, - "logps/rejected": -369.2894592285156, - "loss": 0.5045, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.0260767936706543, - "rewards/margins": 1.1399564743041992, - "rewards/rejected": -3.1660332679748535, + "epoch": 1.5472088215024122, + "grad_norm": 12.172833442687988, + "learning_rate": 1.4817851714548745e-08, + "logits/chosen": -2.5477683544158936, + "logits/rejected": -2.520660400390625, + "logps/chosen": -118.75528717041016, + "logps/rejected": -125.38212585449219, + "loss": 0.6745, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6427457332611084, + "rewards/margins": 0.08407661318778992, + "rewards/rejected": -0.7268223166465759, "step": 8980 }, { - "epoch": 1.55, - "grad_norm": 24.870179348536638, - "learning_rate": 7.355586544760109e-08, - "logits/chosen": -1.2825881242752075, - "logits/rejected": -1.2176916599273682, - "logps/chosen": -230.96267700195312, - "logps/rejected": -376.7090148925781, - "loss": 0.3662, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -1.7600934505462646, - "rewards/margins": 1.4670263528823853, - "rewards/rejected": -3.2271199226379395, + "epoch": 1.5489317711922812, + "grad_norm": 9.766542434692383, + "learning_rate": 1.4711173089520218e-08, + "logits/chosen": -2.451903820037842, + "logits/rejected": -2.434842586517334, + "logps/chosen": -109.77362060546875, + "logps/rejected": -125.96296691894531, + "loss": 0.6268, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5482394099235535, + "rewards/margins": 0.1711864173412323, + "rewards/rejected": -0.7194257974624634, "step": 8990 }, { - "epoch": 1.55, - "grad_norm": 36.79345395085888, - "learning_rate": 7.302406792035298e-08, - "logits/chosen": -1.369960069656372, - "logits/rejected": -1.2962627410888672, - "logps/chosen": -246.2954559326172, - "logps/rejected": -387.07904052734375, - "loss": 0.4428, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.8888282775878906, - "rewards/margins": 1.4562675952911377, - "rewards/rejected": -3.3450961112976074, + "epoch": 1.5506547208821502, + "grad_norm": 11.284889221191406, + "learning_rate": 1.4604813584070597e-08, + "logits/chosen": -2.4958648681640625, + "logits/rejected": -2.4693679809570312, + "logps/chosen": -116.52508544921875, + "logps/rejected": -130.39151000976562, + "loss": 0.6282, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5911062955856323, + "rewards/margins": 0.18699118494987488, + "rewards/rejected": -0.7780975103378296, "step": 9000 }, { - "epoch": 1.55, - "eval_logits/chosen": -1.436100721359253, - "eval_logits/rejected": -1.4093130826950073, - "eval_logps/chosen": -247.8895263671875, - "eval_logps/rejected": -298.9695739746094, - "eval_loss": 0.631913423538208, - "eval_rewards/accuracies": 0.6642658114433289, - "eval_rewards/chosen": -1.8918566703796387, - "eval_rewards/margins": 0.46626490354537964, - "eval_rewards/rejected": -2.358121633529663, - "eval_runtime": 357.5482, - "eval_samples_per_second": 12.038, - "eval_steps_per_second": 1.505, + "epoch": 1.5506547208821502, + "eval_logits/chosen": -2.5820982456207275, + "eval_logits/rejected": -2.57542085647583, + "eval_logps/chosen": -105.7962417602539, + "eval_logps/rejected": -120.07608032226562, + "eval_loss": 0.6563650369644165, + "eval_rewards/accuracies": 0.6187267899513245, + "eval_rewards/chosen": -0.4708433747291565, + "eval_rewards/margins": 0.09811615198850632, + "eval_rewards/rejected": -0.5689595341682434, + "eval_runtime": 359.9722, + "eval_samples_per_second": 11.956, + "eval_steps_per_second": 1.495, "step": 9000 }, { - "epoch": 1.55, - "grad_norm": 23.705026745341147, - "learning_rate": 7.249387080008552e-08, - "logits/chosen": -1.3333415985107422, - "logits/rejected": -1.2726539373397827, - "logps/chosen": -234.00778198242188, - "logps/rejected": -349.1484069824219, - "loss": 0.4458, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.7839086055755615, - "rewards/margins": 1.1792595386505127, - "rewards/rejected": -2.963167905807495, + "epoch": 1.5523776705720191, + "grad_norm": 9.287176132202148, + "learning_rate": 1.4498774160017102e-08, + "logits/chosen": -2.4706435203552246, + "logits/rejected": -2.447434902191162, + "logps/chosen": -112.8412094116211, + "logps/rejected": -124.1606216430664, + "loss": 0.6432, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5720191597938538, + "rewards/margins": 0.14104202389717102, + "rewards/rejected": -0.7130612134933472, "step": 9010 }, { - "epoch": 1.55, - "grad_norm": 38.89598953145415, - "learning_rate": 7.196527888141199e-08, - "logits/chosen": -1.2887022495269775, - "logits/rejected": -1.2170953750610352, - "logps/chosen": -209.22061157226562, - "logps/rejected": -373.1058044433594, - "loss": 0.3497, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -1.5529781579971313, - "rewards/margins": 1.6689532995224, - "rewards/rejected": -3.2219314575195312, + "epoch": 1.5541006202618883, + "grad_norm": 11.95606803894043, + "learning_rate": 1.4393055776282397e-08, + "logits/chosen": -2.4061226844787598, + "logits/rejected": -2.3893802165985107, + "logps/chosen": -110.66056823730469, + "logps/rejected": -129.71044921875, + "loss": 0.6103, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.5673651695251465, + "rewards/margins": 0.22053882479667664, + "rewards/rejected": -0.7879040837287903, "step": 9020 }, { - "epoch": 1.56, - "grad_norm": 28.117977627167093, - "learning_rate": 7.14382969444299e-08, - "logits/chosen": -1.3042352199554443, - "logits/rejected": -1.266124963760376, - "logps/chosen": -225.1204833984375, - "logps/rejected": -359.4920959472656, - "loss": 0.4126, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.7398335933685303, - "rewards/margins": 1.3171305656433105, - "rewards/rejected": -3.0569639205932617, + "epoch": 1.5558235699517575, + "grad_norm": 11.436843872070312, + "learning_rate": 1.428765938888598e-08, + "logits/chosen": -2.422454357147217, + "logits/rejected": -2.4265215396881104, + "logps/chosen": -107.36305236816406, + "logps/rejected": -126.97566223144531, + "loss": 0.6313, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5621106028556824, + "rewards/margins": 0.1694938689470291, + "rewards/rejected": -0.7316044569015503, "step": 9030 }, { - "epoch": 1.56, - "grad_norm": 33.729240485438446, - "learning_rate": 7.091292975467744e-08, - "logits/chosen": -1.2989321947097778, - "logits/rejected": -1.2422449588775635, - "logps/chosen": -219.76953125, - "logps/rejected": -347.19561767578125, - "loss": 0.4361, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.6861400604248047, - "rewards/margins": 1.2657876014709473, - "rewards/rejected": -2.951927661895752, + "epoch": 1.5575465196416265, + "grad_norm": 12.741418838500977, + "learning_rate": 1.4182585950935488e-08, + "logits/chosen": -2.4318161010742188, + "logits/rejected": -2.412322521209717, + "logps/chosen": -113.08719635009766, + "logps/rejected": -127.19560241699219, + "loss": 0.6486, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.6191827654838562, + "rewards/margins": 0.13276712596416473, + "rewards/rejected": -0.7519499063491821, "step": 9040 }, { - "epoch": 1.56, - "grad_norm": 43.84683137314539, - "learning_rate": 7.038918206309061e-08, - "logits/chosen": -1.363384485244751, - "logits/rejected": -1.299889087677002, - "logps/chosen": -237.06698608398438, - "logps/rejected": -374.26708984375, - "loss": 0.4154, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.8334167003631592, - "rewards/margins": 1.3851557970046997, - "rewards/rejected": -3.2185721397399902, + "epoch": 1.5592694693314955, + "grad_norm": 10.499228477478027, + "learning_rate": 1.4077836412618122e-08, + "logits/chosen": -2.487452983856201, + "logits/rejected": -2.470038890838623, + "logps/chosen": -114.92085266113281, + "logps/rejected": -127.76414489746094, + "loss": 0.6475, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6116332411766052, + "rewards/margins": 0.14158475399017334, + "rewards/rejected": -0.7532179355621338, "step": 9050 }, { - "epoch": 1.56, - "grad_norm": 36.66512791686662, - "learning_rate": 6.986705860596004e-08, - "logits/chosen": -1.3877991437911987, - "logits/rejected": -1.3261516094207764, - "logps/chosen": -228.3108367919922, - "logps/rejected": -349.1913146972656, - "loss": 0.4466, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.7361310720443726, - "rewards/margins": 1.224022626876831, - "rewards/rejected": -2.960153579711914, + "epoch": 1.5609924190213644, + "grad_norm": 10.750195503234863, + "learning_rate": 1.3973411721192008e-08, + "logits/chosen": -2.482776165008545, + "logits/rejected": -2.4582302570343018, + "logps/chosen": -115.44197082519531, + "logps/rejected": -129.9430694580078, + "loss": 0.6374, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6072896122932434, + "rewards/margins": 0.16011087596416473, + "rewards/rejected": -0.7674003839492798, "step": 9060 }, { - "epoch": 1.56, - "grad_norm": 29.90532654515578, - "learning_rate": 6.934656410488849e-08, - "logits/chosen": -1.3249752521514893, - "logits/rejected": -1.256667971611023, - "logps/chosen": -210.0726776123047, - "logps/rejected": -357.83990478515625, - "loss": 0.3591, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -1.5548808574676514, - "rewards/margins": 1.4754278659820557, - "rewards/rejected": -3.030308246612549, + "epoch": 1.5627153687112336, + "grad_norm": 10.500717163085938, + "learning_rate": 1.3869312820977696e-08, + "logits/chosen": -2.44934344291687, + "logits/rejected": -2.431421995162964, + "logps/chosen": -110.8654556274414, + "logps/rejected": -133.5511016845703, + "loss": 0.6098, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5629990100860596, + "rewards/margins": 0.22451594471931458, + "rewards/rejected": -0.7875149250030518, "step": 9070 }, { - "epoch": 1.56, - "grad_norm": 33.369257953836836, - "learning_rate": 6.882770326674753e-08, - "logits/chosen": -1.3675148487091064, - "logits/rejected": -1.3241257667541504, - "logps/chosen": -205.6577606201172, - "logps/rejected": -325.0508117675781, - "loss": 0.4562, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.536978840827942, - "rewards/margins": 1.1593728065490723, - "rewards/rejected": -2.6963515281677246, + "epoch": 1.5644383184011028, + "grad_norm": 10.117134094238281, + "learning_rate": 1.3765540653349505e-08, + "logits/chosen": -2.453556776046753, + "logits/rejected": -2.447422742843628, + "logps/chosen": -106.34598541259766, + "logps/rejected": -127.54154968261719, + "loss": 0.6224, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5436528325080872, + "rewards/margins": 0.17753125727176666, + "rewards/rejected": -0.7211841344833374, "step": 9080 }, { - "epoch": 1.57, - "grad_norm": 27.478791555939875, - "learning_rate": 6.831048078363603e-08, - "logits/chosen": -1.340841293334961, - "logits/rejected": -1.2636873722076416, - "logps/chosen": -221.9270477294922, - "logps/rejected": -348.5683898925781, - "loss": 0.3936, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.666637659072876, - "rewards/margins": 1.3149703741073608, - "rewards/rejected": -2.9816081523895264, + "epoch": 1.5661612680909718, + "grad_norm": 11.284063339233398, + "learning_rate": 1.3662096156727204e-08, + "logits/chosen": -2.428724765777588, + "logits/rejected": -2.3964526653289795, + "logps/chosen": -113.4808349609375, + "logps/rejected": -122.68611907958984, + "loss": 0.641, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5821224451065063, + "rewards/margins": 0.1405847817659378, + "rewards/rejected": -0.722707211971283, "step": 9090 }, { - "epoch": 1.57, - "grad_norm": 39.68968478397206, - "learning_rate": 6.779490133283639e-08, - "logits/chosen": -1.3765848875045776, - "logits/rejected": -1.3157910108566284, - "logps/chosen": -231.72561645507812, - "logps/rejected": -339.90594482421875, - "loss": 0.4441, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.7491241693496704, - "rewards/margins": 1.115006685256958, - "rewards/rejected": -2.864131212234497, + "epoch": 1.5678842177808407, + "grad_norm": 10.751830101013184, + "learning_rate": 1.3558980266567277e-08, + "logits/chosen": -2.490169048309326, + "logits/rejected": -2.4657349586486816, + "logps/chosen": -116.18363952636719, + "logps/rejected": -126.50498962402344, + "loss": 0.646, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5938640236854553, + "rewards/margins": 0.13608434796333313, + "rewards/rejected": -0.7299484014511108, "step": 9100 }, { - "epoch": 1.57, - "eval_logits/chosen": -1.4595789909362793, - "eval_logits/rejected": -1.433526635169983, - "eval_logps/chosen": -236.91993713378906, - "eval_logps/rejected": -285.5493469238281, - "eval_loss": 0.6315240859985352, - "eval_rewards/accuracies": 0.6670538783073425, - "eval_rewards/chosen": -1.7821608781814575, - "eval_rewards/margins": 0.4417589604854584, - "eval_rewards/rejected": -2.2239201068878174, - "eval_runtime": 357.8152, - "eval_samples_per_second": 12.029, - "eval_steps_per_second": 1.504, + "epoch": 1.5678842177808407, + "eval_logits/chosen": -2.579942226409912, + "eval_logits/rejected": -2.5732128620147705, + "eval_logps/chosen": -105.95286560058594, + "eval_logps/rejected": -120.22126770019531, + "eval_loss": 0.6564629077911377, + "eval_rewards/accuracies": 0.6187267899513245, + "eval_rewards/chosen": -0.4724096655845642, + "eval_rewards/margins": 0.09800174087285995, + "eval_rewards/rejected": -0.5704114437103271, + "eval_runtime": 359.7651, + "eval_samples_per_second": 11.963, + "eval_steps_per_second": 1.495, "step": 9100 }, { - "epoch": 1.57, - "grad_norm": 26.17971404373787, - "learning_rate": 6.72809695767736e-08, - "logits/chosen": -1.3913519382476807, - "logits/rejected": -1.3288047313690186, - "logps/chosen": -211.9567413330078, - "logps/rejected": -343.7449951171875, - "loss": 0.4005, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.567034125328064, - "rewards/margins": 1.3445512056350708, - "rewards/rejected": -2.9115850925445557, + "epoch": 1.5696071674707097, + "grad_norm": 9.482916831970215, + "learning_rate": 1.345619391535472e-08, + "logits/chosen": -2.490828037261963, + "logits/rejected": -2.4676239490509033, + "logps/chosen": -112.53236389160156, + "logps/rejected": -126.99684143066406, + "loss": 0.6303, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5728387236595154, + "rewards/margins": 0.17117749154567719, + "rewards/rejected": -0.744016170501709, "step": 9110 }, { - "epoch": 1.57, - "grad_norm": 36.967070697201144, - "learning_rate": 6.67686901629718e-08, - "logits/chosen": -1.3988851308822632, - "logits/rejected": -1.3274810314178467, - "logps/chosen": -217.08779907226562, - "logps/rejected": -345.1493225097656, - "loss": 0.4303, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.5681654214859009, - "rewards/margins": 1.332497000694275, - "rewards/rejected": -2.9006621837615967, + "epoch": 1.571330117160579, + "grad_norm": 12.508149147033691, + "learning_rate": 1.3353738032594358e-08, + "logits/chosen": -2.5123047828674316, + "logits/rejected": -2.4805686473846436, + "logps/chosen": -116.52827453613281, + "logps/rejected": -128.24295043945312, + "loss": 0.631, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5625268816947937, + "rewards/margins": 0.16890086233615875, + "rewards/rejected": -0.731427788734436, "step": 9120 }, { - "epoch": 1.57, - "grad_norm": 31.635032954440245, - "learning_rate": 6.625806772401346e-08, - "logits/chosen": -1.323700189590454, - "logits/rejected": -1.2679407596588135, - "logps/chosen": -219.18026733398438, - "logps/rejected": -335.81927490234375, - "loss": 0.4429, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.641919732093811, - "rewards/margins": 1.1755802631378174, - "rewards/rejected": -2.817499876022339, + "epoch": 1.573053066850448, + "grad_norm": 9.84267807006836, + "learning_rate": 1.3251613544802692e-08, + "logits/chosen": -2.431236505508423, + "logits/rejected": -2.408712863922119, + "logps/chosen": -112.70121765136719, + "logps/rejected": -128.7128143310547, + "loss": 0.6296, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.576823353767395, + "rewards/margins": 0.16929545998573303, + "rewards/rejected": -0.7461186647415161, "step": 9130 }, { - "epoch": 1.57, - "grad_norm": 18.580130895157108, - "learning_rate": 6.574910687749641e-08, - "logits/chosen": -1.3721438646316528, - "logits/rejected": -1.284090280532837, - "logps/chosen": -218.16458129882812, - "logps/rejected": -355.3311462402344, - "loss": 0.3806, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -1.608903169631958, - "rewards/margins": 1.4527578353881836, - "rewards/rejected": -3.0616610050201416, + "epoch": 1.574776016540317, + "grad_norm": 9.542338371276855, + "learning_rate": 1.3149821375499282e-08, + "logits/chosen": -2.502328872680664, + "logits/rejected": -2.4545814990997314, + "logps/chosen": -112.37251281738281, + "logps/rejected": -126.47017669677734, + "loss": 0.6136, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5509093999862671, + "rewards/margins": 0.22205904126167297, + "rewards/rejected": -0.7729684710502625, "step": 9140 }, { - "epoch": 1.58, - "grad_norm": 35.67358647499166, - "learning_rate": 6.524181222599281e-08, - "logits/chosen": -1.3545089960098267, - "logits/rejected": -1.2818124294281006, - "logps/chosen": -233.56680297851562, - "logps/rejected": -374.37420654296875, - "loss": 0.4067, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.7525733709335327, - "rewards/margins": 1.4464915990829468, - "rewards/rejected": -3.1990647315979004, + "epoch": 1.576498966230186, + "grad_norm": 10.919657707214355, + "learning_rate": 1.3048362445198563e-08, + "logits/chosen": -2.4875550270080566, + "logits/rejected": -2.4600603580474854, + "logps/chosen": -119.6942367553711, + "logps/rejected": -136.88499450683594, + "loss": 0.6158, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6136825680732727, + "rewards/margins": 0.21039915084838867, + "rewards/rejected": -0.8240815997123718, "step": 9150 }, { - "epoch": 1.58, - "grad_norm": 25.211132645134647, - "learning_rate": 6.473618835700731e-08, - "logits/chosen": -1.3555432558059692, - "logits/rejected": -1.2993581295013428, - "logps/chosen": -213.2099151611328, - "logps/rejected": -361.63824462890625, - "loss": 0.3679, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.6408780813217163, - "rewards/margins": 1.444949984550476, - "rewards/rejected": -3.0858283042907715, + "epoch": 1.578221915920055, + "grad_norm": 8.170670509338379, + "learning_rate": 1.2947237671401463e-08, + "logits/chosen": -2.4972095489501953, + "logits/rejected": -2.4924275875091553, + "logps/chosen": -103.85929107666016, + "logps/rejected": -127.87440490722656, + "loss": 0.6157, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5470938086509705, + "rewards/margins": 0.20080165565013885, + "rewards/rejected": -0.7478954195976257, "step": 9160 }, { - "epoch": 1.58, - "grad_norm": 29.097063689803438, - "learning_rate": 6.423223984293543e-08, - "logits/chosen": -1.4018914699554443, - "logits/rejected": -1.3220356702804565, - "logps/chosen": -224.4427947998047, - "logps/rejected": -370.28741455078125, - "loss": 0.3913, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -1.6960633993148804, - "rewards/margins": 1.5122019052505493, - "rewards/rejected": -3.208265781402588, + "epoch": 1.5799448656099242, + "grad_norm": 9.501344680786133, + "learning_rate": 1.2846447968587087e-08, + "logits/chosen": -2.5076279640197754, + "logits/rejected": -2.477036237716675, + "logps/chosen": -117.05989074707031, + "logps/rejected": -129.93243408203125, + "loss": 0.629, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6221837997436523, + "rewards/margins": 0.18241745233535767, + "rewards/rejected": -0.8046013116836548, "step": 9170 }, { - "epoch": 1.58, - "grad_norm": 26.255457005191428, - "learning_rate": 6.372997124102245e-08, - "logits/chosen": -1.3606441020965576, - "logits/rejected": -1.3003352880477905, - "logps/chosen": -224.84521484375, - "logps/rejected": -351.32806396484375, - "loss": 0.4274, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.7013254165649414, - "rewards/margins": 1.290497899055481, - "rewards/rejected": -2.991823434829712, + "epoch": 1.5816678152997934, + "grad_norm": 10.158044815063477, + "learning_rate": 1.274599424820449e-08, + "logits/chosen": -2.485940933227539, + "logits/rejected": -2.467524766921997, + "logps/chosen": -115.52877044677734, + "logps/rejected": -127.1000747680664, + "loss": 0.6454, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6079785823822021, + "rewards/margins": 0.14137926697731018, + "rewards/rejected": -0.7493578195571899, "step": 9180 }, { - "epoch": 1.58, - "grad_norm": 27.40777133201824, - "learning_rate": 6.322938709332195e-08, - "logits/chosen": -1.4560340642929077, - "logits/rejected": -1.4088115692138672, - "logps/chosen": -246.0127716064453, - "logps/rejected": -384.2671203613281, - "loss": 0.3953, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.918752670288086, - "rewards/margins": 1.3545637130737305, - "rewards/rejected": -3.2733166217803955, + "epoch": 1.5833907649896624, + "grad_norm": 10.683402061462402, + "learning_rate": 1.2645877418664391e-08, + "logits/chosen": -2.587747097015381, + "logits/rejected": -2.582094669342041, + "logps/chosen": -116.20283508300781, + "logps/rejected": -136.19113159179688, + "loss": 0.6294, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6204559206962585, + "rewards/margins": 0.17218129336833954, + "rewards/rejected": -0.7926372289657593, "step": 9190 }, { - "epoch": 1.59, - "grad_norm": 29.72804041222211, - "learning_rate": 6.273049192665502e-08, - "logits/chosen": -1.3812012672424316, - "logits/rejected": -1.3190171718597412, - "logps/chosen": -222.0596466064453, - "logps/rejected": -362.23779296875, - "loss": 0.3898, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.6922333240509033, - "rewards/margins": 1.4142590761184692, - "rewards/rejected": -3.106492280960083, + "epoch": 1.5851137146795313, + "grad_norm": 11.139490127563477, + "learning_rate": 1.2546098385331006e-08, + "logits/chosen": -2.488598108291626, + "logits/rejected": -2.467776298522949, + "logps/chosen": -114.04997253417969, + "logps/rejected": -132.52149963378906, + "loss": 0.6225, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.611980140209198, + "rewards/margins": 0.1971258819103241, + "rewards/rejected": -0.8091060519218445, "step": 9200 }, { - "epoch": 1.59, - "eval_logits/chosen": -1.4436826705932617, - "eval_logits/rejected": -1.41750967502594, - "eval_logps/chosen": -235.59715270996094, - "eval_logps/rejected": -284.1919250488281, - "eval_loss": 0.6316264271736145, - "eval_rewards/accuracies": 0.6656598448753357, - "eval_rewards/chosen": -1.7689329385757446, - "eval_rewards/margins": 0.4414127469062805, - "eval_rewards/rejected": -2.21034574508667, - "eval_runtime": 358.0647, - "eval_samples_per_second": 12.02, - "eval_steps_per_second": 1.503, + "epoch": 1.5851137146795313, + "eval_logits/chosen": -2.5781242847442627, + "eval_logits/rejected": -2.5714316368103027, + "eval_logps/chosen": -106.330322265625, + "eval_logps/rejected": -120.6733169555664, + "eval_loss": 0.6562556624412537, + "eval_rewards/accuracies": 0.6189591288566589, + "eval_rewards/chosen": -0.47618430852890015, + "eval_rewards/margins": 0.09874764084815979, + "eval_rewards/rejected": -0.5749318599700928, + "eval_runtime": 359.8588, + "eval_samples_per_second": 11.96, + "eval_steps_per_second": 1.495, "step": 9200 }, { - "epoch": 1.59, - "grad_norm": 37.6892193633804, - "learning_rate": 6.223329025256896e-08, - "logits/chosen": -1.2596355676651, - "logits/rejected": -1.189841866493225, - "logps/chosen": -223.52413940429688, - "logps/rejected": -360.711669921875, - "loss": 0.4099, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.6768831014633179, - "rewards/margins": 1.412092924118042, - "rewards/rejected": -3.0889759063720703, + "epoch": 1.5868366643694003, + "grad_norm": 11.177249908447266, + "learning_rate": 1.2446658050513792e-08, + "logits/chosen": -2.3836617469787598, + "logits/rejected": -2.359231948852539, + "logps/chosen": -117.04586029052734, + "logps/rejected": -128.28465270996094, + "loss": 0.6418, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6119889616966248, + "rewards/margins": 0.15295186638832092, + "rewards/rejected": -0.7649407982826233, "step": 9210 }, { - "epoch": 1.59, - "grad_norm": 56.42606826064067, - "learning_rate": 6.173778656729678e-08, - "logits/chosen": -1.3393886089324951, - "logits/rejected": -1.2738230228424072, - "logps/chosen": -217.1497344970703, - "logps/rejected": -360.0973205566406, - "loss": 0.4035, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.671485185623169, - "rewards/margins": 1.4341779947280884, - "rewards/rejected": -3.1056630611419678, + "epoch": 1.5885596140592695, + "grad_norm": 14.950928688049316, + "learning_rate": 1.2347557313459355e-08, + "logits/chosen": -2.4695205688476562, + "logits/rejected": -2.447995662689209, + "logps/chosen": -107.27723693847656, + "logps/rejected": -122.84378814697266, + "loss": 0.636, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5726373791694641, + "rewards/margins": 0.1603108048439026, + "rewards/rejected": -0.7329481840133667, "step": 9220 }, { - "epoch": 1.59, - "grad_norm": 41.20515118078039, - "learning_rate": 6.124398535171655e-08, - "logits/chosen": -1.2532026767730713, - "logits/rejected": -1.1974518299102783, - "logps/chosen": -219.7674102783203, - "logps/rejected": -351.6419372558594, - "loss": 0.4204, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.7048327922821045, - "rewards/margins": 1.2831361293792725, - "rewards/rejected": -2.987969160079956, + "epoch": 1.5902825637491387, + "grad_norm": 9.63260269165039, + "learning_rate": 1.2248797070343308e-08, + "logits/chosen": -2.396231174468994, + "logits/rejected": -2.3813791275024414, + "logps/chosen": -105.32853698730469, + "logps/rejected": -126.0599136352539, + "loss": 0.6272, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.560218334197998, + "rewards/margins": 0.171835258603096, + "rewards/rejected": -0.7320536375045776, "step": 9230 }, { - "epoch": 1.59, - "grad_norm": 37.10435338672497, - "learning_rate": 6.07518910713106e-08, - "logits/chosen": -1.3163211345672607, - "logits/rejected": -1.269641399383545, - "logps/chosen": -227.19949340820312, - "logps/rejected": -363.2820739746094, - "loss": 0.3956, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -1.710626244544983, - "rewards/margins": 1.3764336109161377, - "rewards/rejected": -3.087059736251831, + "epoch": 1.5920055134390076, + "grad_norm": 10.700993537902832, + "learning_rate": 1.2150378214262118e-08, + "logits/chosen": -2.4350173473358154, + "logits/rejected": -2.4298243522644043, + "logps/chosen": -114.66731262207031, + "logps/rejected": -133.24984741210938, + "loss": 0.6242, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5851711630821228, + "rewards/margins": 0.20138947665691376, + "rewards/rejected": -0.7865606546401978, "step": 9240 }, { - "epoch": 1.59, - "grad_norm": 38.05805409886092, - "learning_rate": 6.026150817612544e-08, - "logits/chosen": -1.2923637628555298, - "logits/rejected": -1.2226426601409912, - "logps/chosen": -215.64614868164062, - "logps/rejected": -348.65020751953125, - "loss": 0.4343, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.6089054346084595, - "rewards/margins": 1.3319495916366577, - "rewards/rejected": -2.940855026245117, + "epoch": 1.5937284631288766, + "grad_norm": 9.717818260192871, + "learning_rate": 1.2052301635225087e-08, + "logits/chosen": -2.4408679008483887, + "logits/rejected": -2.417024612426758, + "logps/chosen": -109.55340576171875, + "logps/rejected": -125.67057800292969, + "loss": 0.6361, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5476741790771484, + "rewards/margins": 0.16295097768306732, + "rewards/rejected": -0.710625171661377, "step": 9250 }, { - "epoch": 1.6, - "grad_norm": 31.84368181089064, - "learning_rate": 5.977284110073136e-08, - "logits/chosen": -1.3127715587615967, - "logits/rejected": -1.2576462030410767, - "logps/chosen": -220.4276885986328, - "logps/rejected": -353.9443054199219, - "loss": 0.4051, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.7238426208496094, - "rewards/margins": 1.3259329795837402, - "rewards/rejected": -3.0497756004333496, + "epoch": 1.5954514128187456, + "grad_norm": 10.486372947692871, + "learning_rate": 1.1954568220146272e-08, + "logits/chosen": -2.4232518672943115, + "logits/rejected": -2.4059088230133057, + "logps/chosen": -109.2516860961914, + "logps/rejected": -126.35398864746094, + "loss": 0.6336, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6118893623352051, + "rewards/margins": 0.16174429655075073, + "rewards/rejected": -0.773633599281311, "step": 9260 }, { - "epoch": 1.6, - "grad_norm": 23.613823280583112, - "learning_rate": 5.928589426418235e-08, - "logits/chosen": -1.416325330734253, - "logits/rejected": -1.3402214050292969, - "logps/chosen": -227.9755859375, - "logps/rejected": -368.1861572265625, - "loss": 0.391, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.7309129238128662, - "rewards/margins": 1.4332636594772339, - "rewards/rejected": -3.1641767024993896, + "epoch": 1.5971743625086148, + "grad_norm": 8.578853607177734, + "learning_rate": 1.1857178852836468e-08, + "logits/chosen": -2.5483627319335938, + "logits/rejected": -2.517956018447876, + "logps/chosen": -115.4470443725586, + "logps/rejected": -127.4085464477539, + "loss": 0.6402, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6055377721786499, + "rewards/margins": 0.1504029929637909, + "rewards/rejected": -0.7559407353401184, "step": 9270 }, { - "epoch": 1.6, - "grad_norm": 25.039505768647498, - "learning_rate": 5.8800672069976105e-08, - "logits/chosen": -1.3524010181427002, - "logits/rejected": -1.2967437505722046, - "logps/chosen": -217.47488403320312, - "logps/rejected": -346.5975341796875, - "loss": 0.4171, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.639002799987793, - "rewards/margins": 1.2891342639923096, - "rewards/rejected": -2.9281370639801025, + "epoch": 1.598897312198484, + "grad_norm": 10.61180305480957, + "learning_rate": 1.1760134413995222e-08, + "logits/chosen": -2.4837899208068848, + "logits/rejected": -2.4677019119262695, + "logps/chosen": -109.03898620605469, + "logps/rejected": -128.00949096679688, + "loss": 0.626, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5546872019767761, + "rewards/margins": 0.18747368454933167, + "rewards/rejected": -0.742160975933075, "step": 9280 }, { - "epoch": 1.6, - "grad_norm": 29.09884422283532, - "learning_rate": 5.831717890601434e-08, - "logits/chosen": -1.2608332633972168, - "logits/rejected": -1.2108803987503052, - "logps/chosen": -222.2425079345703, - "logps/rejected": -329.8652038574219, - "loss": 0.4687, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.6758387088775635, - "rewards/margins": 1.1141695976257324, - "rewards/rejected": -2.790008068084717, + "epoch": 1.600620261888353, + "grad_norm": 9.662049293518066, + "learning_rate": 1.1663435781202868e-08, + "logits/chosen": -2.380941867828369, + "logits/rejected": -2.360813617706299, + "logps/chosen": -109.13102722167969, + "logps/rejected": -119.53753662109375, + "loss": 0.6433, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5447606444358826, + "rewards/margins": 0.14195260405540466, + "rewards/rejected": -0.6867132186889648, "step": 9290 }, { - "epoch": 1.6, - "grad_norm": 39.97591942100439, - "learning_rate": 5.7835419144563e-08, - "logits/chosen": -1.3308337926864624, - "logits/rejected": -1.2767422199249268, - "logps/chosen": -234.8633270263672, - "logps/rejected": -378.3427429199219, - "loss": 0.3657, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.812017798423767, - "rewards/margins": 1.413147211074829, - "rewards/rejected": -3.2251651287078857, + "epoch": 1.602343211578222, + "grad_norm": 12.733745574951172, + "learning_rate": 1.15670838289126e-08, + "logits/chosen": -2.4522814750671387, + "logits/rejected": -2.4458775520324707, + "logps/chosen": -116.55436706542969, + "logps/rejected": -138.33456420898438, + "loss": 0.6223, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6287103891372681, + "rewards/margins": 0.195957213640213, + "rewards/rejected": -0.8246676325798035, "step": 9300 }, { - "epoch": 1.6, - "eval_logits/chosen": -1.4361413717269897, - "eval_logits/rejected": -1.409943699836731, - "eval_logps/chosen": -239.39939880371094, - "eval_logps/rejected": -288.6492614746094, - "eval_loss": 0.6325801014900208, - "eval_rewards/accuracies": 0.6638011336326599, - "eval_rewards/chosen": -1.806955337524414, - "eval_rewards/margins": 0.4479631185531616, - "eval_rewards/rejected": -2.2549185752868652, - "eval_runtime": 358.0378, - "eval_samples_per_second": 12.021, - "eval_steps_per_second": 1.503, + "epoch": 1.602343211578222, + "eval_logits/chosen": -2.575881004333496, + "eval_logits/rejected": -2.56917405128479, + "eval_logps/chosen": -106.3382797241211, + "eval_logps/rejected": -120.71073150634766, + "eval_loss": 0.6561717391014099, + "eval_rewards/accuracies": 0.6180297136306763, + "eval_rewards/chosen": -0.4762639105319977, + "eval_rewards/margins": 0.09904211759567261, + "eval_rewards/rejected": -0.5753059983253479, + "eval_runtime": 359.8227, + "eval_samples_per_second": 11.961, + "eval_steps_per_second": 1.495, "step": 9300 }, { - "epoch": 1.6, - "grad_norm": 23.678957806069693, - "learning_rate": 5.7355397142212495e-08, - "logits/chosen": -1.3521010875701904, - "logits/rejected": -1.2914403676986694, - "logps/chosen": -218.249267578125, - "logps/rejected": -341.4629211425781, - "loss": 0.4569, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.6451466083526611, - "rewards/margins": 1.2658127546310425, - "rewards/rejected": -2.910959482192993, + "epoch": 1.6040661612680909, + "grad_norm": 13.567085266113281, + "learning_rate": 1.1471079428442499e-08, + "logits/chosen": -2.473120927810669, + "logits/rejected": -2.4475350379943848, + "logps/chosen": -115.34141540527344, + "logps/rejected": -127.76911926269531, + "loss": 0.6353, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6159577965736389, + "rewards/margins": 0.1579708755016327, + "rewards/rejected": -0.7739286422729492, "step": 9310 }, { - "epoch": 1.61, - "grad_norm": 35.87762222953201, - "learning_rate": 5.687711723983907e-08, - "logits/chosen": -1.4106115102767944, - "logits/rejected": -1.3428720235824585, - "logps/chosen": -235.66696166992188, - "logps/rejected": -378.86505126953125, - "loss": 0.4091, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.8487489223480225, - "rewards/margins": 1.3959980010986328, - "rewards/rejected": -3.2447471618652344, + "epoch": 1.60578911095796, + "grad_norm": 9.380661964416504, + "learning_rate": 1.1375423447967814e-08, + "logits/chosen": -2.528146266937256, + "logits/rejected": -2.5089707374572754, + "logps/chosen": -114.348876953125, + "logps/rejected": -136.70558166503906, + "loss": 0.6247, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6351672410964966, + "rewards/margins": 0.187762051820755, + "rewards/rejected": -0.8229292631149292, "step": 9320 }, { - "epoch": 1.61, - "grad_norm": 36.79795244053593, - "learning_rate": 5.640058376256437e-08, - "logits/chosen": -1.3952717781066895, - "logits/rejected": -1.3357038497924805, - "logps/chosen": -222.98147583007812, - "logps/rejected": -338.1329650878906, - "loss": 0.458, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.6642459630966187, - "rewards/margins": 1.166606068611145, - "rewards/rejected": -2.8308520317077637, + "epoch": 1.607512060647829, + "grad_norm": 12.360684394836426, + "learning_rate": 1.1280116752512875e-08, + "logits/chosen": -2.489619016647339, + "logits/rejected": -2.464404582977295, + "logps/chosen": -118.02415466308594, + "logps/rejected": -136.76522827148438, + "loss": 0.6226, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6144608855247498, + "rewards/margins": 0.20259025692939758, + "rewards/rejected": -0.8170512318611145, "step": 9330 }, { - "epoch": 1.61, - "grad_norm": 21.105194172978344, - "learning_rate": 5.5925801019717637e-08, - "logits/chosen": -1.308809518814087, - "logits/rejected": -1.2506635189056396, - "logps/chosen": -229.06600952148438, - "logps/rejected": -373.9830627441406, - "loss": 0.4046, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.7726303339004517, - "rewards/margins": 1.4514251947402954, - "rewards/rejected": -3.224055528640747, + "epoch": 1.6092350103376982, + "grad_norm": 8.799599647521973, + "learning_rate": 1.1185160203943528e-08, + "logits/chosen": -2.4144415855407715, + "logits/rejected": -2.4025747776031494, + "logps/chosen": -114.1600112915039, + "logps/rejected": -132.9754180908203, + "loss": 0.6244, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6235083341598511, + "rewards/margins": 0.1903582364320755, + "rewards/rejected": -0.8138664960861206, "step": 9340 }, { - "epoch": 1.61, - "grad_norm": 41.165260710581435, - "learning_rate": 5.5452773304795585e-08, - "logits/chosen": -1.3799827098846436, - "logits/rejected": -1.31058669090271, - "logps/chosen": -214.2169647216797, - "logps/rejected": -341.6388854980469, - "loss": 0.4018, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.6169312000274658, - "rewards/margins": 1.2802019119262695, - "rewards/rejected": -2.8971328735351562, + "epoch": 1.6109579600275672, + "grad_norm": 34.12635040283203, + "learning_rate": 1.1090554660959117e-08, + "logits/chosen": -2.479396104812622, + "logits/rejected": -2.450967311859131, + "logps/chosen": -109.45731353759766, + "logps/rejected": -124.7352294921875, + "loss": 0.6427, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.569072425365448, + "rewards/margins": 0.15909281373023987, + "rewards/rejected": -0.7281652688980103, "step": 9350 }, { - "epoch": 1.61, - "grad_norm": 30.89004835348826, - "learning_rate": 5.4981504895424273e-08, - "logits/chosen": -1.4255657196044922, - "logits/rejected": -1.349281668663025, - "logps/chosen": -211.1226043701172, - "logps/rejected": -347.12811279296875, - "loss": 0.3748, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.5445703268051147, - "rewards/margins": 1.4008324146270752, - "rewards/rejected": -2.9454026222229004, + "epoch": 1.6126809097174362, + "grad_norm": 10.003735542297363, + "learning_rate": 1.0996300979084855e-08, + "logits/chosen": -2.536195755004883, + "logits/rejected": -2.5000789165496826, + "logps/chosen": -115.14347839355469, + "logps/rejected": -134.09971618652344, + "loss": 0.6073, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5846112966537476, + "rewards/margins": 0.2306203842163086, + "rewards/rejected": -0.8152316808700562, "step": 9360 }, { - "epoch": 1.61, - "grad_norm": 26.504198753445007, - "learning_rate": 5.4512000053320266e-08, - "logits/chosen": -1.4140576124191284, - "logits/rejected": -1.333478569984436, - "logps/chosen": -233.5621795654297, - "logps/rejected": -371.4867248535156, - "loss": 0.3992, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.7974094152450562, - "rewards/margins": 1.403607726097107, - "rewards/rejected": -3.201017379760742, + "epoch": 1.6144038594073054, + "grad_norm": 13.890110969543457, + "learning_rate": 1.0902400010664053e-08, + "logits/chosen": -2.5462453365325928, + "logits/rejected": -2.5133166313171387, + "logps/chosen": -111.9314956665039, + "logps/rejected": -126.63999938964844, + "loss": 0.6309, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5809223055839539, + "rewards/margins": 0.17146947979927063, + "rewards/rejected": -0.7523918151855469, "step": 9370 }, { - "epoch": 1.62, - "grad_norm": 22.785892226259428, - "learning_rate": 5.4044263024251994e-08, - "logits/chosen": -1.3954850435256958, - "logits/rejected": -1.3420671224594116, - "logps/chosen": -226.4377899169922, - "logps/rejected": -347.89373779296875, - "loss": 0.4574, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.7203410863876343, - "rewards/margins": 1.2211310863494873, - "rewards/rejected": -2.941472291946411, + "epoch": 1.6161268090971743, + "grad_norm": 11.433756828308105, + "learning_rate": 1.0808852604850399e-08, + "logits/chosen": -2.5420687198638916, + "logits/rejected": -2.526165723800659, + "logps/chosen": -109.38885498046875, + "logps/rejected": -125.33390045166016, + "loss": 0.6335, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5498009920120239, + "rewards/margins": 0.16597667336463928, + "rewards/rejected": -0.7157777547836304, "step": 9380 }, { - "epoch": 1.62, - "grad_norm": 31.550228794525225, - "learning_rate": 5.357829803800137e-08, - "logits/chosen": -1.2319252490997314, - "logits/rejected": -1.172555923461914, - "logps/chosen": -237.8747100830078, - "logps/rejected": -377.8671875, - "loss": 0.409, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.8478126525878906, - "rewards/margins": 1.3827223777770996, - "rewards/rejected": -3.2305350303649902, + "epoch": 1.6178497587870435, + "grad_norm": 10.514060974121094, + "learning_rate": 1.0715659607600275e-08, + "logits/chosen": -2.346153736114502, + "logits/rejected": -2.331106424331665, + "logps/chosen": -112.1683349609375, + "logps/rejected": -132.49806213378906, + "loss": 0.6226, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.5906833410263062, + "rewards/margins": 0.1861725151538849, + "rewards/rejected": -0.7768558263778687, "step": 9390 }, { - "epoch": 1.62, - "grad_norm": 27.350825817958476, - "learning_rate": 5.3114109308325743e-08, - "logits/chosen": -1.2861920595169067, - "logits/rejected": -1.2316094636917114, - "logps/chosen": -221.30990600585938, - "logps/rejected": -339.2297058105469, - "loss": 0.4666, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.6726821660995483, - "rewards/margins": 1.1769441366195679, - "rewards/rejected": -2.849626302719116, + "epoch": 1.6195727084769125, + "grad_norm": 8.785523414611816, + "learning_rate": 1.0622821861665148e-08, + "logits/chosen": -2.4343514442443848, + "logits/rejected": -2.41630482673645, + "logps/chosen": -111.20463562011719, + "logps/rejected": -128.87466430664062, + "loss": 0.6288, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5713671445846558, + "rewards/margins": 0.17466680705547333, + "rewards/rejected": -0.7460339665412903, "step": 9400 }, { - "epoch": 1.62, - "eval_logits/chosen": -1.437654733657837, - "eval_logits/rejected": -1.4113147258758545, - "eval_logps/chosen": -238.54745483398438, - "eval_logps/rejected": -287.8304138183594, - "eval_loss": 0.6324562430381775, - "eval_rewards/accuracies": 0.6631041169166565, - "eval_rewards/chosen": -1.7984360456466675, - "eval_rewards/margins": 0.4482942521572113, - "eval_rewards/rejected": -2.246730327606201, - "eval_runtime": 358.2323, - "eval_samples_per_second": 12.015, - "eval_steps_per_second": 1.502, + "epoch": 1.6195727084769125, + "eval_logits/chosen": -2.5731117725372314, + "eval_logits/rejected": -2.5664474964141846, + "eval_logps/chosen": -106.89212036132812, + "eval_logps/rejected": -121.37098693847656, + "eval_loss": 0.6558800935745239, + "eval_rewards/accuracies": 0.6201208233833313, + "eval_rewards/chosen": -0.48180222511291504, + "eval_rewards/margins": 0.10010644048452377, + "eval_rewards/rejected": -0.581908643245697, + "eval_runtime": 359.1559, + "eval_samples_per_second": 11.984, + "eval_steps_per_second": 1.498, "step": 9400 }, { - "epoch": 1.62, - "grad_norm": 37.163291490549675, - "learning_rate": 5.265170103291952e-08, - "logits/chosen": -1.3120262622833252, - "logits/rejected": -1.2514145374298096, - "logps/chosen": -221.3708038330078, - "logps/rejected": -354.61456298828125, - "loss": 0.4083, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.6749778985977173, - "rewards/margins": 1.3499835729599, - "rewards/rejected": -3.0249617099761963, + "epoch": 1.6212956581667815, + "grad_norm": 11.417981147766113, + "learning_rate": 1.0530340206583904e-08, + "logits/chosen": -2.4483437538146973, + "logits/rejected": -2.4333364963531494, + "logps/chosen": -113.51603698730469, + "logps/rejected": -127.11344909667969, + "loss": 0.6412, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5964230298995972, + "rewards/margins": 0.15341778099536896, + "rewards/rejected": -0.7498408555984497, "step": 9410 }, { - "epoch": 1.62, - "grad_norm": 38.28562781683107, - "learning_rate": 5.2191077393376165e-08, - "logits/chosen": -1.3560113906860352, - "logits/rejected": -1.298722267150879, - "logps/chosen": -231.69131469726562, - "logps/rejected": -346.3023376464844, - "loss": 0.4541, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.7664096355438232, - "rewards/margins": 1.1748406887054443, - "rewards/rejected": -2.9412503242492676, + "epoch": 1.6230186078566504, + "grad_norm": 14.008613586425781, + "learning_rate": 1.0438215478675232e-08, + "logits/chosen": -2.475764513015747, + "logits/rejected": -2.458162307739258, + "logps/chosen": -119.61146545410156, + "logps/rejected": -127.99200439453125, + "loss": 0.6564, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.6457251310348511, + "rewards/margins": 0.11228646337985992, + "rewards/rejected": -0.7580116391181946, "step": 9420 }, { - "epoch": 1.62, - "grad_norm": 31.533083950987773, - "learning_rate": 5.173224255515099e-08, - "logits/chosen": -1.3096221685409546, - "logits/rejected": -1.2421365976333618, - "logps/chosen": -223.90087890625, - "logps/rejected": -373.5675048828125, - "loss": 0.4027, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.7149451971054077, - "rewards/margins": 1.5314754247665405, - "rewards/rejected": -3.2464206218719482, + "epoch": 1.6247415575465196, + "grad_norm": 9.169174194335938, + "learning_rate": 1.0346448511030198e-08, + "logits/chosen": -2.4436872005462646, + "logits/rejected": -2.430039167404175, + "logps/chosen": -114.7473373413086, + "logps/rejected": -130.70941162109375, + "loss": 0.6372, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6231524348258972, + "rewards/margins": 0.19460538029670715, + "rewards/rejected": -0.8177579045295715, "step": 9430 }, { - "epoch": 1.63, - "grad_norm": 40.702906044337006, - "learning_rate": 5.127520066752256e-08, - "logits/chosen": -1.2992658615112305, - "logits/rejected": -1.2521995306015015, - "logps/chosen": -227.48019409179688, - "logps/rejected": -347.92181396484375, - "loss": 0.4109, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -1.7786552906036377, - "rewards/margins": 1.176113247871399, - "rewards/rejected": -2.954768419265747, + "epoch": 1.6264645072363888, + "grad_norm": 10.711912155151367, + "learning_rate": 1.0255040133504512e-08, + "logits/chosen": -2.432291269302368, + "logits/rejected": -2.425797939300537, + "logps/chosen": -110.25630187988281, + "logps/rejected": -127.0741195678711, + "loss": 0.6458, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6062155961990356, + "rewards/margins": 0.13991376757621765, + "rewards/rejected": -0.7461293339729309, "step": 9440 }, { - "epoch": 1.63, - "grad_norm": 32.090498966901635, - "learning_rate": 5.0819955863555916e-08, - "logits/chosen": -1.4480046033859253, - "logits/rejected": -1.3969746828079224, - "logps/chosen": -240.8600616455078, - "logps/rejected": -356.9917297363281, - "loss": 0.4612, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.874746561050415, - "rewards/margins": 1.189202070236206, - "rewards/rejected": -3.0639488697052, + "epoch": 1.6281874569262578, + "grad_norm": 11.031365394592285, + "learning_rate": 1.0163991172711184e-08, + "logits/chosen": -2.5792200565338135, + "logits/rejected": -2.5714049339294434, + "logps/chosen": -119.13470458984375, + "logps/rejected": -127.2987289428711, + "loss": 0.6619, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6574575304985046, + "rewards/margins": 0.10914991796016693, + "rewards/rejected": -0.7666074633598328, "step": 9450 }, { - "epoch": 1.63, - "grad_norm": 17.790347252816513, - "learning_rate": 5.0366512260064883e-08, - "logits/chosen": -1.310302734375, - "logits/rejected": -1.2530990839004517, - "logps/chosen": -209.5319061279297, - "logps/rejected": -382.5646667480469, - "loss": 0.3101, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -1.5730886459350586, - "rewards/margins": 1.7125349044799805, - "rewards/rejected": -3.285623550415039, + "epoch": 1.6299104066161267, + "grad_norm": 9.260765075683594, + "learning_rate": 1.0073302452012977e-08, + "logits/chosen": -2.437150001525879, + "logits/rejected": -2.4315149784088135, + "logps/chosen": -106.49686431884766, + "logps/rejected": -139.8819580078125, + "loss": 0.5715, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.5424596667289734, + "rewards/margins": 0.31610342860221863, + "rewards/rejected": -0.8585631251335144, "step": 9460 }, { - "epoch": 1.63, - "grad_norm": 38.95147289526329, - "learning_rate": 4.9914873957574906e-08, - "logits/chosen": -1.1751810312271118, - "logits/rejected": -1.1032475233078003, - "logps/chosen": -228.77847290039062, - "logps/rejected": -351.155517578125, - "loss": 0.4307, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.7615420818328857, - "rewards/margins": 1.2454551458358765, - "rewards/rejected": -3.006997585296631, + "epoch": 1.6316333563059957, + "grad_norm": 10.981246948242188, + "learning_rate": 9.98297479151498e-09, + "logits/chosen": -2.312954902648926, + "logits/rejected": -2.284334659576416, + "logps/chosen": -113.22406005859375, + "logps/rejected": -122.59544372558594, + "loss": 0.6581, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6057893633842468, + "rewards/margins": 0.11547912657260895, + "rewards/rejected": -0.7212685346603394, "step": 9470 }, { - "epoch": 1.63, - "grad_norm": 26.885837020564978, - "learning_rate": 4.94650450402859e-08, - "logits/chosen": -1.3068211078643799, - "logits/rejected": -1.22744882106781, - "logps/chosen": -227.61032104492188, - "logps/rejected": -368.6788330078125, - "loss": 0.3906, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -1.7363221645355225, - "rewards/margins": 1.4241564273834229, - "rewards/rejected": -3.1604788303375244, + "epoch": 1.633356305995865, + "grad_norm": 10.705116271972656, + "learning_rate": 9.89300900805718e-09, + "logits/chosen": -2.441366672515869, + "logits/rejected": -2.409208297729492, + "logps/chosen": -115.3991470336914, + "logps/rejected": -131.6602020263672, + "loss": 0.628, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6140663027763367, + "rewards/margins": 0.1759718358516693, + "rewards/rejected": -0.7900381088256836, "step": 9480 }, { - "epoch": 1.64, - "grad_norm": 30.073094764522576, - "learning_rate": 4.9017029576035404e-08, - "logits/chosen": -1.2682311534881592, - "logits/rejected": -1.2089643478393555, - "logps/chosen": -231.9444580078125, - "logps/rejected": -363.248291015625, - "loss": 0.3954, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.7784042358398438, - "rewards/margins": 1.3314237594604492, - "rewards/rejected": -3.109828233718872, + "epoch": 1.635079255685734, + "grad_norm": 12.51076602935791, + "learning_rate": 9.80340591520708e-09, + "logits/chosen": -2.4258036613464355, + "logits/rejected": -2.40817928314209, + "logps/chosen": -114.81951904296875, + "logps/rejected": -129.5521240234375, + "loss": 0.636, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6070705652236938, + "rewards/margins": 0.16568514704704285, + "rewards/rejected": -0.7727557420730591, "step": 9490 }, { - "epoch": 1.64, - "grad_norm": 23.54399691970181, - "learning_rate": 4.857083161626174e-08, - "logits/chosen": -1.3256847858428955, - "logits/rejected": -1.2643065452575684, - "logps/chosen": -226.8311004638672, - "logps/rejected": -383.41436767578125, - "loss": 0.3503, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.7317718267440796, - "rewards/margins": 1.551546335220337, - "rewards/rejected": -3.283318042755127, + "epoch": 1.636802205375603, + "grad_norm": 10.093734741210938, + "learning_rate": 9.714166323252348e-09, + "logits/chosen": -2.4748165607452393, + "logits/rejected": -2.4608230590820312, + "logps/chosen": -110.67747497558594, + "logps/rejected": -130.24691772460938, + "loss": 0.6223, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5703105926513672, + "rewards/margins": 0.18143048882484436, + "rewards/rejected": -0.7517410516738892, "step": 9500 }, { - "epoch": 1.64, - "eval_logits/chosen": -1.402784824371338, - "eval_logits/rejected": -1.3757189512252808, - "eval_logps/chosen": -252.00526428222656, - "eval_logps/rejected": -304.04388427734375, - "eval_loss": 0.6339713931083679, - "eval_rewards/accuracies": 0.6586896181106567, - "eval_rewards/chosen": -1.9330142736434937, - "eval_rewards/margins": 0.4758506715297699, - "eval_rewards/rejected": -2.408864736557007, - "eval_runtime": 358.2372, - "eval_samples_per_second": 12.014, - "eval_steps_per_second": 1.502, + "epoch": 1.636802205375603, + "eval_logits/chosen": -2.571709632873535, + "eval_logits/rejected": -2.564974308013916, + "eval_logps/chosen": -106.93737030029297, + "eval_logps/rejected": -121.4600830078125, + "eval_loss": 0.6557111144065857, + "eval_rewards/accuracies": 0.6175650358200073, + "eval_rewards/chosen": -0.4822547733783722, + "eval_rewards/margins": 0.10054484009742737, + "eval_rewards/rejected": -0.5827996134757996, + "eval_runtime": 359.7295, + "eval_samples_per_second": 11.965, + "eval_steps_per_second": 1.496, "step": 9500 }, { - "epoch": 1.64, - "grad_norm": 33.834138719354485, - "learning_rate": 4.812645519596748e-08, - "logits/chosen": -1.2126704454421997, - "logits/rejected": -1.152756929397583, - "logps/chosen": -234.50845336914062, - "logps/rejected": -376.29010009765625, - "loss": 0.3692, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.8366127014160156, - "rewards/margins": 1.3869558572769165, - "rewards/rejected": -3.2235684394836426, + "epoch": 1.638525155065472, + "grad_norm": 10.927206039428711, + "learning_rate": 9.625291039193495e-09, + "logits/chosen": -2.389148712158203, + "logits/rejected": -2.3725249767303467, + "logps/chosen": -108.08433532714844, + "logps/rejected": -132.2962188720703, + "loss": 0.6147, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5720130205154419, + "rewards/margins": 0.21152958273887634, + "rewards/rejected": -0.7835426330566406, "step": 9510 }, { - "epoch": 1.64, - "grad_norm": 24.273551744010256, - "learning_rate": 4.7683904333682715e-08, - "logits/chosen": -1.439879059791565, - "logits/rejected": -1.3854401111602783, - "logps/chosen": -251.1964874267578, - "logps/rejected": -393.2129821777344, - "loss": 0.426, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.9885809421539307, - "rewards/margins": 1.3651524782180786, - "rewards/rejected": -3.3537330627441406, + "epoch": 1.640248104755341, + "grad_norm": 8.823415756225586, + "learning_rate": 9.536780866736544e-09, + "logits/chosen": -2.5889158248901367, + "logits/rejected": -2.5745842456817627, + "logps/chosen": -111.9592056274414, + "logps/rejected": -139.13119506835938, + "loss": 0.612, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5962141156196594, + "rewards/margins": 0.21659378707408905, + "rewards/rejected": -0.8128078579902649, "step": 9520 }, { - "epoch": 1.64, - "grad_norm": 34.99697034368316, - "learning_rate": 4.72431830314291e-08, - "logits/chosen": -1.3440197706222534, - "logits/rejected": -1.2706291675567627, - "logps/chosen": -233.45089721679688, - "logps/rejected": -384.12225341796875, - "loss": 0.368, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.8057721853256226, - "rewards/margins": 1.5267770290374756, - "rewards/rejected": -3.3325493335723877, + "epoch": 1.6419710544452102, + "grad_norm": 13.857977867126465, + "learning_rate": 9.44863660628582e-09, + "logits/chosen": -2.494685649871826, + "logits/rejected": -2.4717154502868652, + "logps/chosen": -112.91267395019531, + "logps/rejected": -134.03042602539062, + "loss": 0.6056, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6001669764518738, + "rewards/margins": 0.2312859743833542, + "rewards/rejected": -0.8314529657363892, "step": 9530 }, { - "epoch": 1.64, - "grad_norm": 41.925642700921735, - "learning_rate": 4.68042952746831e-08, - "logits/chosen": -1.247374415397644, - "logits/rejected": -1.1915947198867798, - "logps/chosen": -240.2723388671875, - "logps/rejected": -373.23394775390625, - "loss": 0.4017, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.8617527484893799, - "rewards/margins": 1.3366732597351074, - "rewards/rejected": -3.1984262466430664, + "epoch": 1.6436940041350794, + "grad_norm": 9.767173767089844, + "learning_rate": 9.360859054936621e-09, + "logits/chosen": -2.4332919120788574, + "logits/rejected": -2.4117319583892822, + "logps/chosen": -113.34354400634766, + "logps/rejected": -131.25839233398438, + "loss": 0.6263, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5922496318817139, + "rewards/margins": 0.1862720549106598, + "rewards/rejected": -0.7785216569900513, "step": 9540 }, { - "epoch": 1.65, - "grad_norm": 33.59274463489468, - "learning_rate": 4.636724503234074e-08, - "logits/chosen": -1.3158290386199951, - "logits/rejected": -1.2710721492767334, - "logps/chosen": -234.71505737304688, - "logps/rejected": -374.32415771484375, - "loss": 0.4214, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.8152387142181396, - "rewards/margins": 1.3604776859283447, - "rewards/rejected": -3.1757161617279053, + "epoch": 1.6454169538249483, + "grad_norm": 10.432958602905273, + "learning_rate": 9.273449006468148e-09, + "logits/chosen": -2.4807279109954834, + "logits/rejected": -2.4802868366241455, + "logps/chosen": -109.90055847167969, + "logps/rejected": -132.210693359375, + "loss": 0.6246, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5667723417282104, + "rewards/margins": 0.18738147616386414, + "rewards/rejected": -0.754153847694397, "step": 9550 }, { - "epoch": 1.65, - "grad_norm": 34.17989472034478, - "learning_rate": 4.593203625668077e-08, - "logits/chosen": -1.4079006910324097, - "logits/rejected": -1.3559339046478271, - "logps/chosen": -226.49813842773438, - "logps/rejected": -359.3128967285156, - "loss": 0.3967, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.728776216506958, - "rewards/margins": 1.3338409662246704, - "rewards/rejected": -3.0626168251037598, + "epoch": 1.6471399035148173, + "grad_norm": 13.059929847717285, + "learning_rate": 9.186407251336153e-09, + "logits/chosen": -2.539731025695801, + "logits/rejected": -2.5303072929382324, + "logps/chosen": -112.74991607666016, + "logps/rejected": -132.66310119628906, + "loss": 0.628, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5912712812423706, + "rewards/margins": 0.20478937029838562, + "rewards/rejected": -0.7960606813430786, "step": 9560 }, { - "epoch": 1.65, - "grad_norm": 31.348331592601255, - "learning_rate": 4.549867288332987e-08, - "logits/chosen": -1.2812343835830688, - "logits/rejected": -1.2303446531295776, - "logps/chosen": -227.1624298095703, - "logps/rejected": -361.9521789550781, - "loss": 0.4189, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.7691608667373657, - "rewards/margins": 1.3438093662261963, - "rewards/rejected": -3.1129703521728516, + "epoch": 1.6488628532046863, + "grad_norm": 10.162718772888184, + "learning_rate": 9.099734576665975e-09, + "logits/chosen": -2.4429385662078857, + "logits/rejected": -2.4294652938842773, + "logps/chosen": -110.95243072509766, + "logps/rejected": -128.51576232910156, + "loss": 0.6345, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6070041060447693, + "rewards/margins": 0.17155200242996216, + "rewards/rejected": -0.7785560488700867, "step": 9570 }, { - "epoch": 1.65, - "grad_norm": 39.69075540606707, - "learning_rate": 4.5067158831226273e-08, - "logits/chosen": -1.3360213041305542, - "logits/rejected": -1.2739002704620361, - "logps/chosen": -245.7711639404297, - "logps/rejected": -386.2242126464844, - "loss": 0.4138, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.9129165410995483, - "rewards/margins": 1.3799970149993896, - "rewards/rejected": -3.2929134368896484, + "epoch": 1.6505858028945555, + "grad_norm": 10.574372291564941, + "learning_rate": 9.013431766245255e-09, + "logits/chosen": -2.5211551189422607, + "logits/rejected": -2.503304958343506, + "logps/chosen": -115.49881744384766, + "logps/rejected": -136.0047607421875, + "loss": 0.6301, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6101093888282776, + "rewards/margins": 0.18049034476280212, + "rewards/rejected": -0.7905997037887573, "step": 9580 }, { - "epoch": 1.65, - "grad_norm": 33.115164323400506, - "learning_rate": 4.463749800258479e-08, - "logits/chosen": -1.444392442703247, - "logits/rejected": -1.3800554275512695, - "logps/chosen": -227.8526153564453, - "logps/rejected": -368.66693115234375, - "loss": 0.3906, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.7177770137786865, - "rewards/margins": 1.4058539867401123, - "rewards/rejected": -3.123631000518799, + "epoch": 1.6523087525844247, + "grad_norm": 9.696565628051758, + "learning_rate": 8.927499600516958e-09, + "logits/chosen": -2.594769239425659, + "logits/rejected": -2.577392101287842, + "logps/chosen": -116.54072570800781, + "logps/rejected": -130.11834716796875, + "loss": 0.6505, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6044396758079529, + "rewards/margins": 0.13363295793533325, + "rewards/rejected": -0.7380726933479309, "step": 9590 }, { - "epoch": 1.65, - "grad_norm": 41.69230357375958, - "learning_rate": 4.420969428286139e-08, - "logits/chosen": -1.2808119058609009, - "logits/rejected": -1.2005847692489624, - "logps/chosen": -230.84872436523438, - "logps/rejected": -385.7454528808594, - "loss": 0.3729, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.7579460144042969, - "rewards/margins": 1.5761973857879639, - "rewards/rejected": -3.334143877029419, + "epoch": 1.6540317022742936, + "grad_norm": 10.850507736206055, + "learning_rate": 8.841938856572278e-09, + "logits/chosen": -2.429518699645996, + "logits/rejected": -2.399512529373169, + "logps/chosen": -116.4022445678711, + "logps/rejected": -130.88446044921875, + "loss": 0.6363, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6134180426597595, + "rewards/margins": 0.17180520296096802, + "rewards/rejected": -0.7852233052253723, "step": 9600 }, { - "epoch": 1.65, - "eval_logits/chosen": -1.391427993774414, - "eval_logits/rejected": -1.3641211986541748, - "eval_logps/chosen": -252.29434204101562, - "eval_logps/rejected": -304.6582946777344, - "eval_loss": 0.6356525421142578, - "eval_rewards/accuracies": 0.6563661694526672, - "eval_rewards/chosen": -1.935904860496521, - "eval_rewards/margins": 0.4791041910648346, - "eval_rewards/rejected": -2.415009021759033, - "eval_runtime": 357.9055, - "eval_samples_per_second": 12.026, - "eval_steps_per_second": 1.503, + "epoch": 1.6540317022742936, + "eval_logits/chosen": -2.5682740211486816, + "eval_logits/rejected": -2.5615086555480957, + "eval_logps/chosen": -107.62434387207031, + "eval_logps/rejected": -122.20415496826172, + "eval_loss": 0.6556257009506226, + "eval_rewards/accuracies": 0.6196561455726624, + "eval_rewards/chosen": -0.4891245663166046, + "eval_rewards/margins": 0.10111591964960098, + "eval_rewards/rejected": -0.590240478515625, + "eval_runtime": 359.5427, + "eval_samples_per_second": 11.971, + "eval_steps_per_second": 1.496, "step": 9600 }, { - "epoch": 1.66, - "grad_norm": 38.45195266525548, - "learning_rate": 4.378375154071806e-08, - "logits/chosen": -1.2705994844436646, - "logits/rejected": -1.2090730667114258, - "logps/chosen": -229.653076171875, - "logps/rejected": -377.4794006347656, - "loss": 0.3925, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -1.7335882186889648, - "rewards/margins": 1.491674780845642, - "rewards/rejected": -3.2252631187438965, + "epoch": 1.6557546519641626, + "grad_norm": 11.242480278015137, + "learning_rate": 8.756750308143613e-09, + "logits/chosen": -2.4543559551239014, + "logits/rejected": -2.4368574619293213, + "logps/chosen": -111.0262680053711, + "logps/rejected": -129.50485229492188, + "loss": 0.6185, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5471795797348022, + "rewards/margins": 0.19828134775161743, + "rewards/rejected": -0.7454609274864197, "step": 9610 }, { - "epoch": 1.66, - "grad_norm": 34.39068811627909, - "learning_rate": 4.335967362798787e-08, - "logits/chosen": -1.4060554504394531, - "logits/rejected": -1.3566725254058838, - "logps/chosen": -241.97201538085938, - "logps/rejected": -348.53228759765625, - "loss": 0.4833, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.8736858367919922, - "rewards/margins": 1.0578181743621826, - "rewards/rejected": -2.9315037727355957, + "epoch": 1.6574776016540316, + "grad_norm": 10.200096130371094, + "learning_rate": 8.671934725597574e-09, + "logits/chosen": -2.5679984092712402, + "logits/rejected": -2.5582830905914307, + "logps/chosen": -116.09355163574219, + "logps/rejected": -129.18531799316406, + "loss": 0.6516, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.6150714755058289, + "rewards/margins": 0.12277498096227646, + "rewards/rejected": -0.7378464937210083, "step": 9620 }, { - "epoch": 1.66, - "grad_norm": 41.46128098759825, - "learning_rate": 4.293746437963983e-08, - "logits/chosen": -1.3315681219100952, - "logits/rejected": -1.2634027004241943, - "logps/chosen": -258.986328125, - "logps/rejected": -377.8593444824219, - "loss": 0.4636, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.0355629920959473, - "rewards/margins": 1.2129532098770142, - "rewards/rejected": -3.248516082763672, + "epoch": 1.6592005513439008, + "grad_norm": 14.261276245117188, + "learning_rate": 8.587492875927965e-09, + "logits/chosen": -2.480210781097412, + "logits/rejected": -2.454237461090088, + "logps/chosen": -119.82755279541016, + "logps/rejected": -134.42041015625, + "loss": 0.6343, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6436928510665894, + "rewards/margins": 0.1701437532901764, + "rewards/rejected": -0.8138366937637329, "step": 9630 }, { - "epoch": 1.66, - "grad_norm": 43.601466824424364, - "learning_rate": 4.2517127613744986e-08, - "logits/chosen": -1.3833634853363037, - "logits/rejected": -1.3209584951400757, - "logps/chosen": -235.45803833007812, - "logps/rejected": -356.86627197265625, - "loss": 0.4311, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.7948541641235352, - "rewards/margins": 1.2689039707183838, - "rewards/rejected": -3.063758134841919, + "epoch": 1.66092350103377, + "grad_norm": 10.625503540039062, + "learning_rate": 8.503425522748997e-09, + "logits/chosen": -2.5318641662597656, + "logits/rejected": -2.5037484169006348, + "logps/chosen": -117.7755355834961, + "logps/rejected": -127.9942398071289, + "loss": 0.6371, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6177867650985718, + "rewards/margins": 0.1568756103515625, + "rewards/rejected": -0.7746623754501343, "step": 9640 }, { - "epoch": 1.66, - "grad_norm": 29.63300631760297, - "learning_rate": 4.209866713144078e-08, - "logits/chosen": -1.2863609790802002, - "logits/rejected": -1.2273352146148682, - "logps/chosen": -234.44369506835938, - "logps/rejected": -349.785888671875, - "loss": 0.4813, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.7898155450820923, - "rewards/margins": 1.1682324409484863, - "rewards/rejected": -2.958048105239868, + "epoch": 1.662646450723639, + "grad_norm": 10.061976432800293, + "learning_rate": 8.419733426288155e-09, + "logits/chosen": -2.419862747192383, + "logits/rejected": -2.39957332611084, + "logps/chosen": -118.35591888427734, + "logps/rejected": -129.8536834716797, + "loss": 0.652, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.6287807822227478, + "rewards/margins": 0.12983281910419464, + "rewards/rejected": -0.758613646030426, "step": 9650 }, { - "epoch": 1.66, - "grad_norm": 28.816878634667912, - "learning_rate": 4.1682086716897826e-08, - "logits/chosen": -1.3099644184112549, - "logits/rejected": -1.264509916305542, - "logps/chosen": -219.1770477294922, - "logps/rejected": -347.0464172363281, - "loss": 0.4171, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.6862246990203857, - "rewards/margins": 1.2479627132415771, - "rewards/rejected": -2.934187412261963, + "epoch": 1.664369400413508, + "grad_norm": 9.312915802001953, + "learning_rate": 8.336417343379565e-09, + "logits/chosen": -2.436915636062622, + "logits/rejected": -2.430359125137329, + "logps/chosen": -110.80461120605469, + "logps/rejected": -132.95384216308594, + "loss": 0.6305, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6022011041641235, + "rewards/margins": 0.19084128737449646, + "rewards/rejected": -0.7930424809455872, "step": 9660 }, { - "epoch": 1.67, - "grad_norm": 29.767921273426662, - "learning_rate": 4.1267390137284725e-08, - "logits/chosen": -1.3837589025497437, - "logits/rejected": -1.307064414024353, - "logps/chosen": -233.31289672851562, - "logps/rejected": -396.89337158203125, - "loss": 0.3643, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.7717788219451904, - "rewards/margins": 1.6641696691513062, - "rewards/rejected": -3.435948610305786, + "epoch": 1.6660923501033769, + "grad_norm": 11.565176963806152, + "learning_rate": 8.253478027456945e-09, + "logits/chosen": -2.5210142135620117, + "logits/rejected": -2.495251178741455, + "logps/chosen": -113.76652526855469, + "logps/rejected": -131.93850708007812, + "loss": 0.6148, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5764992833137512, + "rewards/margins": 0.2094554603099823, + "rewards/rejected": -0.7859547138214111, "step": 9670 }, { - "epoch": 1.67, - "grad_norm": 35.63166890222917, - "learning_rate": 4.085458114273463e-08, - "logits/chosen": -1.3208513259887695, - "logits/rejected": -1.2681634426116943, - "logps/chosen": -229.1538848876953, - "logps/rejected": -348.280029296875, - "loss": 0.4797, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.7467445135116577, - "rewards/margins": 1.2142714262008667, - "rewards/rejected": -2.9610159397125244, + "epoch": 1.667815299793246, + "grad_norm": 9.607332229614258, + "learning_rate": 8.170916228546925e-09, + "logits/chosen": -2.476175546646118, + "logits/rejected": -2.4628398418426514, + "logps/chosen": -113.82244873046875, + "logps/rejected": -122.46971130371094, + "loss": 0.6612, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5935350060462952, + "rewards/margins": 0.10920468717813492, + "rewards/rejected": -0.7027397155761719, "step": 9680 }, { - "epoch": 1.67, - "grad_norm": 37.72367340978869, - "learning_rate": 4.044366346631106e-08, - "logits/chosen": -1.248106598854065, - "logits/rejected": -1.1902921199798584, - "logps/chosen": -231.75619506835938, - "logps/rejected": -361.8299255371094, - "loss": 0.4176, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.7594753503799438, - "rewards/margins": 1.3130239248275757, - "rewards/rejected": -3.0724992752075195, + "epoch": 1.6695382494831152, + "grad_norm": 12.794886589050293, + "learning_rate": 8.088732693262213e-09, + "logits/chosen": -2.3811025619506836, + "logits/rejected": -2.364696741104126, + "logps/chosen": -117.9845962524414, + "logps/rejected": -130.56321716308594, + "loss": 0.6451, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6215648651123047, + "rewards/margins": 0.13795089721679688, + "rewards/rejected": -0.7595157027244568, "step": 9690 }, { - "epoch": 1.67, - "grad_norm": 29.824443223289826, - "learning_rate": 4.00346408239742e-08, - "logits/chosen": -1.2740222215652466, - "logits/rejected": -1.2087305784225464, - "logps/chosen": -237.3057403564453, - "logps/rejected": -371.45361328125, - "loss": 0.4403, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.8458435535430908, - "rewards/margins": 1.3461719751358032, - "rewards/rejected": -3.1920151710510254, + "epoch": 1.6712611991729842, + "grad_norm": 11.55554485321045, + "learning_rate": 8.006928164794841e-09, + "logits/chosen": -2.440767288208008, + "logits/rejected": -2.4117016792297363, + "logps/chosen": -112.40250396728516, + "logps/rejected": -129.28512573242188, + "loss": 0.6355, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5969595909118652, + "rewards/margins": 0.1731598973274231, + "rewards/rejected": -0.7701194882392883, "step": 9700 }, { - "epoch": 1.67, - "eval_logits/chosen": -1.417214274406433, - "eval_logits/rejected": -1.390296220779419, - "eval_logps/chosen": -244.72193908691406, - "eval_logps/rejected": -295.69439697265625, - "eval_loss": 0.6342071294784546, - "eval_rewards/accuracies": 0.6624070405960083, - "eval_rewards/chosen": -1.8601804971694946, - "eval_rewards/margins": 0.46518951654434204, - "eval_rewards/rejected": -2.3253698348999023, - "eval_runtime": 357.4423, - "eval_samples_per_second": 12.041, - "eval_steps_per_second": 1.505, + "epoch": 1.6712611991729842, + "eval_logits/chosen": -2.567650556564331, + "eval_logits/rejected": -2.560908555984497, + "eval_logps/chosen": -107.51300048828125, + "eval_logps/rejected": -122.10337829589844, + "eval_loss": 0.6555582880973816, + "eval_rewards/accuracies": 0.6210501790046692, + "eval_rewards/chosen": -0.48801106214523315, + "eval_rewards/margins": 0.10122145712375641, + "eval_rewards/rejected": -0.5892325639724731, + "eval_runtime": 359.6285, + "eval_samples_per_second": 11.968, + "eval_steps_per_second": 1.496, "step": 9700 }, { - "epoch": 1.67, - "grad_norm": 37.595203889217025, - "learning_rate": 3.96275169145473e-08, - "logits/chosen": -1.16835618019104, - "logits/rejected": -1.1231775283813477, - "logps/chosen": -235.44985961914062, - "logps/rejected": -346.90167236328125, - "loss": 0.4504, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.8184646368026733, - "rewards/margins": 1.131596326828003, - "rewards/rejected": -2.950061082839966, + "epoch": 1.6729841488628532, + "grad_norm": 12.837274551391602, + "learning_rate": 7.925503382909459e-09, + "logits/chosen": -2.3130369186401367, + "logits/rejected": -2.299406051635742, + "logps/chosen": -115.46980285644531, + "logps/rejected": -129.88174438476562, + "loss": 0.644, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.6184121370315552, + "rewards/margins": 0.16119442880153656, + "rewards/rejected": -0.7796066999435425, "step": 9710 }, { - "epoch": 1.67, - "grad_norm": 45.4457406884452, - "learning_rate": 3.922229541968322e-08, - "logits/chosen": -1.3615391254425049, - "logits/rejected": -1.312709927558899, - "logps/chosen": -238.9713134765625, - "logps/rejected": -351.1409912109375, - "loss": 0.5068, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.8562841415405273, - "rewards/margins": 1.1417663097381592, - "rewards/rejected": -2.9980504512786865, + "epoch": 1.6747070985527222, + "grad_norm": 11.885695457458496, + "learning_rate": 7.844459083936644e-09, + "logits/chosen": -2.4749066829681396, + "logits/rejected": -2.461320400238037, + "logps/chosen": -114.23624420166016, + "logps/rejected": -131.29092407226562, + "loss": 0.6259, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6087769865989685, + "rewards/margins": 0.1906432807445526, + "rewards/rejected": -0.7994202971458435, "step": 9720 }, { - "epoch": 1.68, - "grad_norm": 33.15996891543965, - "learning_rate": 3.881898000383116e-08, - "logits/chosen": -1.3564598560333252, - "logits/rejected": -1.3007166385650635, - "logps/chosen": -208.45571899414062, - "logps/rejected": -348.89300537109375, - "loss": 0.4111, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -1.5474106073379517, - "rewards/margins": 1.3974087238311768, - "rewards/rejected": -2.944819450378418, + "epoch": 1.6764300482425913, + "grad_norm": 9.85313606262207, + "learning_rate": 7.763796000766231e-09, + "logits/chosen": -2.4554290771484375, + "logits/rejected": -2.4421284198760986, + "logps/chosen": -112.19117736816406, + "logps/rejected": -133.25039672851562, + "loss": 0.6204, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5845453143119812, + "rewards/margins": 0.20357480645179749, + "rewards/rejected": -0.7881200909614563, "step": 9730 }, { - "epoch": 1.68, - "grad_norm": 32.7011999016843, - "learning_rate": 3.841757431420351e-08, - "logits/chosen": -1.3424409627914429, - "logits/rejected": -1.2776859998703003, - "logps/chosen": -232.28494262695312, - "logps/rejected": -366.96038818359375, - "loss": 0.4073, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -1.7781145572662354, - "rewards/margins": 1.3590996265411377, - "rewards/rejected": -3.137214183807373, + "epoch": 1.6781529979324605, + "grad_norm": 10.972027778625488, + "learning_rate": 7.683514862840701e-09, + "logits/chosen": -2.4811558723449707, + "logits/rejected": -2.463585138320923, + "logps/chosen": -116.2174072265625, + "logps/rejected": -129.1941680908203, + "loss": 0.6451, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6174328327178955, + "rewards/margins": 0.14185911417007446, + "rewards/rejected": -0.7592920064926147, "step": 9740 }, { - "epoch": 1.68, - "grad_norm": 26.93270991808977, - "learning_rate": 3.801808198074266e-08, - "logits/chosen": -1.367996096611023, - "logits/rejected": -1.290379285812378, - "logps/chosen": -232.1277313232422, - "logps/rejected": -353.48175048828125, - "loss": 0.3957, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.7703111171722412, - "rewards/margins": 1.271135926246643, - "rewards/rejected": -3.0414466857910156, + "epoch": 1.6798759476223295, + "grad_norm": 9.66303539276123, + "learning_rate": 7.603616396148533e-09, + "logits/chosen": -2.4816958904266357, + "logits/rejected": -2.4432339668273926, + "logps/chosen": -117.9178695678711, + "logps/rejected": -127.9693603515625, + "loss": 0.6382, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.627899706363678, + "rewards/margins": 0.15825578570365906, + "rewards/rejected": -0.7861555814743042, "step": 9750 }, { - "epoch": 1.68, - "grad_norm": 19.984914469702172, - "learning_rate": 3.7620506616088817e-08, - "logits/chosen": -1.3691097497940063, - "logits/rejected": -1.306137204170227, - "logps/chosen": -238.61715698242188, - "logps/rejected": -359.2353210449219, - "loss": 0.4193, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.8544162511825562, - "rewards/margins": 1.225228190422058, - "rewards/rejected": -3.079644203186035, + "epoch": 1.6815988973121985, + "grad_norm": 9.837355613708496, + "learning_rate": 7.524101323217763e-09, + "logits/chosen": -2.4979424476623535, + "logits/rejected": -2.4742271900177, + "logps/chosen": -114.5103759765625, + "logps/rejected": -128.76182556152344, + "loss": 0.6449, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6132702231407166, + "rewards/margins": 0.1615351140499115, + "rewards/rejected": -0.7748053073883057, "step": 9760 }, { - "epoch": 1.68, - "grad_norm": 28.256879995074712, - "learning_rate": 3.72248518155463e-08, - "logits/chosen": -1.2638490200042725, - "logits/rejected": -1.198878526687622, - "logps/chosen": -214.2969512939453, - "logps/rejected": -354.8340148925781, - "loss": 0.3748, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -1.6020538806915283, - "rewards/margins": 1.4068208932876587, - "rewards/rejected": -3.0088746547698975, + "epoch": 1.6833218470020674, + "grad_norm": 10.263019561767578, + "learning_rate": 7.4449703631092596e-09, + "logits/chosen": -2.4073855876922607, + "logits/rejected": -2.388444423675537, + "logps/chosen": -114.07319641113281, + "logps/rejected": -130.03860473632812, + "loss": 0.6368, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5996518135070801, + "rewards/margins": 0.16121190786361694, + "rewards/rejected": -0.7608636617660522, "step": 9770 }, { - "epoch": 1.69, - "grad_norm": 34.80859221520256, - "learning_rate": 3.683112115705225e-08, - "logits/chosen": -1.3776103258132935, - "logits/rejected": -1.2958616018295288, - "logps/chosen": -220.2239227294922, - "logps/rejected": -346.7090759277344, - "loss": 0.4194, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.5932377576828003, - "rewards/margins": 1.3576481342315674, - "rewards/rejected": -2.9508860111236572, + "epoch": 1.6850447966919366, + "grad_norm": 8.965226173400879, + "learning_rate": 7.366224231410451e-09, + "logits/chosen": -2.4908275604248047, + "logits/rejected": -2.44807767868042, + "logps/chosen": -120.42997741699219, + "logps/rejected": -132.4361114501953, + "loss": 0.6172, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5954740643501282, + "rewards/margins": 0.21281616389751434, + "rewards/rejected": -0.8082901835441589, "step": 9780 }, { - "epoch": 1.69, - "grad_norm": 49.153516151462675, - "learning_rate": 3.6439318201143096e-08, - "logits/chosen": -1.336469054222107, - "logits/rejected": -1.2982733249664307, - "logps/chosen": -235.4860382080078, - "logps/rejected": -368.72442626953125, - "loss": 0.4339, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.8210595846176147, - "rewards/margins": 1.2906904220581055, - "rewards/rejected": -3.1117501258850098, + "epoch": 1.6867677463818056, + "grad_norm": 9.79748249053955, + "learning_rate": 7.28786364022862e-09, + "logits/chosen": -2.462313175201416, + "logits/rejected": -2.4611024856567383, + "logps/chosen": -112.70414733886719, + "logps/rejected": -141.95468139648438, + "loss": 0.6079, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.592925488948822, + "rewards/margins": 0.25078535079956055, + "rewards/rejected": -0.8437107801437378, "step": 9790 }, { - "epoch": 1.69, - "grad_norm": 45.583738179863275, - "learning_rate": 3.604944649092323e-08, - "logits/chosen": -1.3702198266983032, - "logits/rejected": -1.2832942008972168, - "logps/chosen": -227.76315307617188, - "logps/rejected": -392.9060363769531, - "loss": 0.3633, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -1.7098026275634766, - "rewards/margins": 1.682472586631775, - "rewards/rejected": -3.392275333404541, + "epoch": 1.6884906960716748, + "grad_norm": 11.095004081726074, + "learning_rate": 7.209889298184646e-09, + "logits/chosen": -2.490950107574463, + "logits/rejected": -2.457878351211548, + "logps/chosen": -117.1009750366211, + "logps/rejected": -133.694580078125, + "loss": 0.6247, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6029871702194214, + "rewards/margins": 0.19695623219013214, + "rewards/rejected": -0.7999434471130371, "step": 9800 }, { - "epoch": 1.69, - "eval_logits/chosen": -1.4198648929595947, - "eval_logits/rejected": -1.3927710056304932, - "eval_logps/chosen": -244.33860778808594, - "eval_logps/rejected": -295.2367248535156, - "eval_loss": 0.6346299648284912, - "eval_rewards/accuracies": 0.6589219570159912, - "eval_rewards/chosen": -1.856347680091858, - "eval_rewards/margins": 0.4644457995891571, - "eval_rewards/rejected": -2.320793390274048, - "eval_runtime": 357.9375, - "eval_samples_per_second": 12.024, - "eval_steps_per_second": 1.503, + "epoch": 1.6884906960716748, + "eval_logits/chosen": -2.5670363903045654, + "eval_logits/rejected": -2.5602853298187256, + "eval_logps/chosen": -107.6542739868164, + "eval_logps/rejected": -122.27554321289062, + "eval_loss": 0.6554702520370483, + "eval_rewards/accuracies": 0.6201208233833313, + "eval_rewards/chosen": -0.4894237220287323, + "eval_rewards/margins": 0.10153036564588547, + "eval_rewards/rejected": -0.5909541249275208, + "eval_runtime": 359.471, + "eval_samples_per_second": 11.973, + "eval_steps_per_second": 1.497, "step": 9800 }, { - "epoch": 1.69, - "grad_norm": 27.982644219340326, - "learning_rate": 3.566150955203251e-08, - "logits/chosen": -1.2991117238998413, - "logits/rejected": -1.2290585041046143, - "logps/chosen": -232.1947784423828, - "logps/rejected": -362.37738037109375, - "loss": 0.438, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.767719030380249, - "rewards/margins": 1.3174628019332886, - "rewards/rejected": -3.085181951522827, + "epoch": 1.6902136457615438, + "grad_norm": 10.662399291992188, + "learning_rate": 7.132301910406502e-09, + "logits/chosen": -2.425443649291992, + "logits/rejected": -2.3938422203063965, + "logps/chosen": -115.65562438964844, + "logps/rejected": -134.46087646484375, + "loss": 0.6255, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6023647785186768, + "rewards/margins": 0.2033398449420929, + "rewards/rejected": -0.8057045936584473, "step": 9810 }, { - "epoch": 1.69, - "grad_norm": 62.700137314497454, - "learning_rate": 3.52755108926146e-08, - "logits/chosen": -1.3249415159225464, - "logits/rejected": -1.2635711431503296, - "logps/chosen": -226.71157836914062, - "logps/rejected": -362.6283874511719, - "loss": 0.4126, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.7259670495986938, - "rewards/margins": 1.360840916633606, - "rewards/rejected": -3.0868077278137207, + "epoch": 1.6919365954514127, + "grad_norm": 10.450873374938965, + "learning_rate": 7.05510217852292e-09, + "logits/chosen": -2.4725582599639893, + "logits/rejected": -2.448711395263672, + "logps/chosen": -112.17340087890625, + "logps/rejected": -134.11328125, + "loss": 0.6117, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5805162191390991, + "rewards/margins": 0.220969557762146, + "rewards/rejected": -0.8014858365058899, "step": 9820 }, { - "epoch": 1.69, - "grad_norm": 31.049190619621527, - "learning_rate": 3.489145400328511e-08, - "logits/chosen": -1.3645232915878296, - "logits/rejected": -1.3135316371917725, - "logps/chosen": -240.0381317138672, - "logps/rejected": -366.198486328125, - "loss": 0.4544, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.820874810218811, - "rewards/margins": 1.2675881385803223, - "rewards/rejected": -3.088463068008423, + "epoch": 1.693659545141282, + "grad_norm": 9.498969078063965, + "learning_rate": 6.978290800657022e-09, + "logits/chosen": -2.504911422729492, + "logits/rejected": -2.498277425765991, + "logps/chosen": -119.9706039428711, + "logps/rejected": -136.05508422851562, + "loss": 0.6471, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.6198269128799438, + "rewards/margins": 0.1672651469707489, + "rewards/rejected": -0.7870920896530151, "step": 9830 }, { - "epoch": 1.7, - "grad_norm": 31.393346729361205, - "learning_rate": 3.4509342357099904e-08, - "logits/chosen": -1.3261866569519043, - "logits/rejected": -1.248307704925537, - "logps/chosen": -227.368896484375, - "logps/rejected": -373.2166442871094, - "loss": 0.4319, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.7153198719024658, - "rewards/margins": 1.4764636754989624, - "rewards/rejected": -3.1917834281921387, + "epoch": 1.6953824948311509, + "grad_norm": 8.868417739868164, + "learning_rate": 6.901868471419981e-09, + "logits/chosen": -2.44360613822937, + "logits/rejected": -2.415085792541504, + "logps/chosen": -119.94268798828125, + "logps/rejected": -135.39608764648438, + "loss": 0.6387, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6406481266021729, + "rewards/margins": 0.17294269800186157, + "rewards/rejected": -0.8135908842086792, "step": 9840 }, { - "epoch": 1.7, - "grad_norm": 28.920399059721735, - "learning_rate": 3.4129179409524225e-08, - "logits/chosen": -1.3411327600479126, - "logits/rejected": -1.2954285144805908, - "logps/chosen": -218.49609375, - "logps/rejected": -335.6559753417969, - "loss": 0.4306, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.639061689376831, - "rewards/margins": 1.2008213996887207, - "rewards/rejected": -2.8398830890655518, + "epoch": 1.69710544452102, + "grad_norm": 10.25020694732666, + "learning_rate": 6.825835881904846e-09, + "logits/chosen": -2.458670139312744, + "logits/rejected": -2.4489316940307617, + "logps/chosen": -110.32945251464844, + "logps/rejected": -124.38945007324219, + "loss": 0.6304, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5572781562805176, + "rewards/margins": 0.16997310519218445, + "rewards/rejected": -0.7272512316703796, "step": 9850 }, { - "epoch": 1.7, - "grad_norm": 42.06449748795488, - "learning_rate": 3.375096859840071e-08, - "logits/chosen": -1.4100219011306763, - "logits/rejected": -1.3593881130218506, - "logps/chosen": -245.61587524414062, - "logps/rejected": -357.762939453125, - "loss": 0.4891, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.89302659034729, - "rewards/margins": 1.1459534168243408, - "rewards/rejected": -3.038980007171631, + "epoch": 1.698828394210889, + "grad_norm": 11.723773956298828, + "learning_rate": 6.750193719680142e-09, + "logits/chosen": -2.527556896209717, + "logits/rejected": -2.5174648761749268, + "logps/chosen": -122.32655334472656, + "logps/rejected": -132.7868194580078, + "loss": 0.656, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6600021123886108, + "rewards/margins": 0.12902231514453888, + "rewards/rejected": -0.7890244722366333, "step": 9860 }, { - "epoch": 1.7, - "grad_norm": 39.095333220619935, - "learning_rate": 3.337471334391903e-08, - "logits/chosen": -1.3635808229446411, - "logits/rejected": -1.2952406406402588, - "logps/chosen": -218.5416717529297, - "logps/rejected": -349.49041748046875, - "loss": 0.4054, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.658063530921936, - "rewards/margins": 1.3279998302459717, - "rewards/rejected": -2.9860637187957764, + "epoch": 1.700551343900758, + "grad_norm": 10.979412078857422, + "learning_rate": 6.674942668783806e-09, + "logits/chosen": -2.4933903217315674, + "logits/rejected": -2.469564914703369, + "logps/chosen": -112.51314544677734, + "logps/rejected": -127.08055114746094, + "loss": 0.6332, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5978198051452637, + "rewards/margins": 0.1641312837600708, + "rewards/rejected": -0.7619511485099792, "step": 9870 }, { - "epoch": 1.7, - "grad_norm": 26.222603081146218, - "learning_rate": 3.300041704858425e-08, - "logits/chosen": -1.2638031244277954, - "logits/rejected": -1.2089149951934814, - "logps/chosen": -224.624755859375, - "logps/rejected": -374.73345947265625, - "loss": 0.3794, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.7256309986114502, - "rewards/margins": 1.4671021699905396, - "rewards/rejected": -3.1927330493927, + "epoch": 1.7022742935906272, + "grad_norm": 9.7615385055542, + "learning_rate": 6.60008340971685e-09, + "logits/chosen": -2.404886484146118, + "logits/rejected": -2.3997762203216553, + "logps/chosen": -111.34349060058594, + "logps/rejected": -132.93601989746094, + "loss": 0.6291, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.592779278755188, + "rewards/margins": 0.1820637732744217, + "rewards/rejected": -0.7748430967330933, "step": 9880 }, { - "epoch": 1.7, - "grad_norm": 32.364868235731464, - "learning_rate": 3.262808309718668e-08, - "logits/chosen": -1.2537232637405396, - "logits/rejected": -1.204134225845337, - "logps/chosen": -237.5514373779297, - "logps/rejected": -368.892578125, - "loss": 0.4235, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.8114869594573975, - "rewards/margins": 1.3386226892471313, - "rewards/rejected": -3.1501097679138184, + "epoch": 1.7039972432804962, + "grad_norm": 9.9425630569458, + "learning_rate": 6.525616619437335e-09, + "logits/chosen": -2.3720321655273438, + "logits/rejected": -2.364936590194702, + "logps/chosen": -113.4536361694336, + "logps/rejected": -134.3558349609375, + "loss": 0.6028, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5703598260879517, + "rewards/margins": 0.23440685868263245, + "rewards/rejected": -0.8047667741775513, "step": 9890 }, { - "epoch": 1.71, - "grad_norm": 42.03184187301272, - "learning_rate": 3.2257714856770866e-08, - "logits/chosen": -1.3525625467300415, - "logits/rejected": -1.2814313173294067, - "logps/chosen": -209.0051727294922, - "logps/rejected": -360.9290466308594, - "loss": 0.3727, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.5728161334991455, - "rewards/margins": 1.538694977760315, - "rewards/rejected": -3.111510992050171, + "epoch": 1.7057201929703654, + "grad_norm": 12.73387336730957, + "learning_rate": 6.451542971354173e-09, + "logits/chosen": -2.4607715606689453, + "logits/rejected": -2.4324750900268555, + "logps/chosen": -105.95125579833984, + "logps/rejected": -132.67672729492188, + "loss": 0.5826, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.5424472689628601, + "rewards/margins": 0.286365807056427, + "rewards/rejected": -0.8288131952285767, "step": 9900 }, { - "epoch": 1.71, - "eval_logits/chosen": -1.424879550933838, - "eval_logits/rejected": -1.397822618484497, - "eval_logps/chosen": -246.3584747314453, - "eval_logps/rejected": -297.6012878417969, - "eval_loss": 0.6336408853530884, - "eval_rewards/accuracies": 0.6556691527366638, - "eval_rewards/chosen": -1.876546025276184, - "eval_rewards/margins": 0.46789297461509705, - "eval_rewards/rejected": -2.3444390296936035, - "eval_runtime": 357.1672, - "eval_samples_per_second": 12.05, - "eval_steps_per_second": 1.506, + "epoch": 1.7057201929703654, + "eval_logits/chosen": -2.5658624172210693, + "eval_logits/rejected": -2.559070348739624, + "eval_logps/chosen": -107.81818389892578, + "eval_logps/rejected": -122.47151184082031, + "eval_loss": 0.6553860306739807, + "eval_rewards/accuracies": 0.6205855011940002, + "eval_rewards/chosen": -0.49106287956237793, + "eval_rewards/margins": 0.1018509715795517, + "eval_rewards/rejected": -0.5929138660430908, + "eval_runtime": 359.8195, + "eval_samples_per_second": 11.962, + "eval_steps_per_second": 1.495, "step": 9900 }, { - "epoch": 1.71, - "grad_norm": 26.80328363192874, - "learning_rate": 3.1889315676605325e-08, - "logits/chosen": -1.4107153415679932, - "logits/rejected": -1.322643756866455, - "logps/chosen": -216.0201873779297, - "logps/rejected": -352.2988586425781, - "loss": 0.4129, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.5847244262695312, - "rewards/margins": 1.4318273067474365, - "rewards/rejected": -3.0165517330169678, + "epoch": 1.7074431426602343, + "grad_norm": 8.75168514251709, + "learning_rate": 6.377863135321066e-09, + "logits/chosen": -2.5228099822998047, + "logits/rejected": -2.481369733810425, + "logps/chosen": -117.33937072753906, + "logps/rejected": -129.14263916015625, + "loss": 0.6249, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.597710907459259, + "rewards/margins": 0.1872120201587677, + "rewards/rejected": -0.7849228382110596, "step": 9910 }, { - "epoch": 1.71, - "grad_norm": 31.18899577015784, - "learning_rate": 3.152288888815227e-08, - "logits/chosen": -1.3761640787124634, - "logits/rejected": -1.3048861026763916, - "logps/chosen": -222.0412139892578, - "logps/rejected": -367.63140869140625, - "loss": 0.3517, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -1.6936910152435303, - "rewards/margins": 1.4894893169403076, - "rewards/rejected": -3.183180332183838, + "epoch": 1.7091660923501033, + "grad_norm": 10.73083209991455, + "learning_rate": 6.304577777630454e-09, + "logits/chosen": -2.523171901702881, + "logits/rejected": -2.4971327781677246, + "logps/chosen": -112.56683349609375, + "logps/rejected": -128.8625030517578, + "loss": 0.6182, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5987794399261475, + "rewards/margins": 0.19654551148414612, + "rewards/rejected": -0.7953248620033264, "step": 9920 }, { - "epoch": 1.71, - "grad_norm": 35.38732271471543, - "learning_rate": 3.1158437805037296e-08, - "logits/chosen": -1.3241338729858398, - "logits/rejected": -1.276207685470581, - "logps/chosen": -219.0576171875, - "logps/rejected": -356.7098693847656, - "loss": 0.4241, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.671561598777771, - "rewards/margins": 1.347208023071289, - "rewards/rejected": -3.0187695026397705, + "epoch": 1.7108890420399723, + "grad_norm": 13.223040580749512, + "learning_rate": 6.231687561007459e-09, + "logits/chosen": -2.4568839073181152, + "logits/rejected": -2.452805280685425, + "logps/chosen": -109.905517578125, + "logps/rejected": -133.2655029296875, + "loss": 0.6177, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5799885392189026, + "rewards/margins": 0.2040647566318512, + "rewards/rejected": -0.7840532660484314, "step": 9930 }, { - "epoch": 1.71, - "grad_norm": 24.595744118468183, - "learning_rate": 3.079596572301965e-08, - "logits/chosen": -1.4158904552459717, - "logits/rejected": -1.3684117794036865, - "logps/chosen": -236.92745971679688, - "logps/rejected": -355.1565246582031, - "loss": 0.4652, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.8187148571014404, - "rewards/margins": 1.125727653503418, - "rewards/rejected": -2.9444425106048584, + "epoch": 1.7126119917298415, + "grad_norm": 8.866291046142578, + "learning_rate": 6.1591931446039306e-09, + "logits/chosen": -2.533574104309082, + "logits/rejected": -2.52690052986145, + "logps/chosen": -114.55931091308594, + "logps/rejected": -135.74856567382812, + "loss": 0.6405, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5949760675430298, + "rewards/margins": 0.15510234236717224, + "rewards/rejected": -0.7500783205032349, "step": 9940 }, { - "epoch": 1.71, - "grad_norm": 59.09049440288645, - "learning_rate": 3.043547591996226e-08, - "logits/chosen": -1.3634991645812988, - "logits/rejected": -1.2813036441802979, - "logps/chosen": -228.6792755126953, - "logps/rejected": -375.29547119140625, - "loss": 0.3893, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.7471336126327515, - "rewards/margins": 1.4925024509429932, - "rewards/rejected": -3.239635944366455, + "epoch": 1.7143349414197107, + "grad_norm": 12.860427856445312, + "learning_rate": 6.087095183992452e-09, + "logits/chosen": -2.4990382194519043, + "logits/rejected": -2.46634840965271, + "logps/chosen": -114.99503326416016, + "logps/rejected": -131.1891632080078, + "loss": 0.6247, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6103070974349976, + "rewards/margins": 0.1882888674736023, + "rewards/rejected": -0.7985959649085999, "step": 9950 }, { - "epoch": 1.72, - "grad_norm": 50.2107128049137, - "learning_rate": 3.0076971655802196e-08, - "logits/chosen": -1.3980926275253296, - "logits/rejected": -1.3432905673980713, - "logps/chosen": -247.3916015625, - "logps/rejected": -370.38458251953125, - "loss": 0.4546, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.9397674798965454, - "rewards/margins": 1.2183306217193604, - "rewards/rejected": -3.158097743988037, + "epoch": 1.7160578911095796, + "grad_norm": 11.31727123260498, + "learning_rate": 6.015394331160439e-09, + "logits/chosen": -2.5300450325012207, + "logits/rejected": -2.514608144760132, + "logps/chosen": -118.2430419921875, + "logps/rejected": -130.07740783691406, + "loss": 0.6632, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.648059606552124, + "rewards/margins": 0.10662396997213364, + "rewards/rejected": -0.7546836137771606, "step": 9960 }, { - "epoch": 1.72, - "grad_norm": 31.69304656950342, - "learning_rate": 2.972045617252114e-08, - "logits/chosen": -1.37814462184906, - "logits/rejected": -1.328137755393982, - "logps/chosen": -225.2104949951172, - "logps/rejected": -350.4515075683594, - "loss": 0.4552, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.7387615442276, - "rewards/margins": 1.2483638525009155, - "rewards/rejected": -2.9871251583099365, + "epoch": 1.7177808407994486, + "grad_norm": 11.905298233032227, + "learning_rate": 5.944091234504228e-09, + "logits/chosen": -2.4777989387512207, + "logits/rejected": -2.4606761932373047, + "logps/chosen": -110.2918472290039, + "logps/rejected": -128.54547119140625, + "loss": 0.6299, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5893170237541199, + "rewards/margins": 0.17843811213970184, + "rewards/rejected": -0.7677551507949829, "step": 9970 }, { - "epoch": 1.72, - "grad_norm": 18.89889146936782, - "learning_rate": 2.9365932694115913e-08, - "logits/chosen": -1.2746312618255615, - "logits/rejected": -1.2151672840118408, - "logps/chosen": -238.1916046142578, - "logps/rejected": -384.50164794921875, - "loss": 0.4013, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.8541069030761719, - "rewards/margins": 1.459223747253418, - "rewards/rejected": -3.3133304119110107, + "epoch": 1.7195037904893176, + "grad_norm": 11.689369201660156, + "learning_rate": 5.8731865388231825e-09, + "logits/chosen": -2.425647020339966, + "logits/rejected": -2.413154125213623, + "logps/chosen": -113.85538482666016, + "logps/rejected": -134.4022216796875, + "loss": 0.6234, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6105948090553284, + "rewards/margins": 0.2016739547252655, + "rewards/rejected": -0.8122687339782715, "step": 9980 }, { - "epoch": 1.72, - "grad_norm": 42.47918178591677, - "learning_rate": 2.9013404426569855e-08, - "logits/chosen": -1.3667715787887573, - "logits/rejected": -1.2965288162231445, - "logps/chosen": -238.6469268798828, - "logps/rejected": -354.388427734375, - "loss": 0.4658, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.8494361639022827, - "rewards/margins": 1.195669412612915, - "rewards/rejected": -3.045105457305908, + "epoch": 1.7212267401791868, + "grad_norm": 12.944132804870605, + "learning_rate": 5.802680885313971e-09, + "logits/chosen": -2.508399486541748, + "logits/rejected": -2.4662134647369385, + "logps/chosen": -116.12748718261719, + "logps/rejected": -125.13218688964844, + "loss": 0.6538, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.62394779920578, + "rewards/margins": 0.12821142375469208, + "rewards/rejected": -0.7521592378616333, "step": 9990 }, { - "epoch": 1.72, - "grad_norm": 21.723980366758628, - "learning_rate": 2.8662874557823013e-08, - "logits/chosen": -1.37138831615448, - "logits/rejected": -1.3194690942764282, - "logps/chosen": -232.44027709960938, - "logps/rejected": -357.5736389160156, - "loss": 0.424, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.7671289443969727, - "rewards/margins": 1.239452600479126, - "rewards/rejected": -3.0065817832946777, + "epoch": 1.722949689869056, + "grad_norm": 10.039288520812988, + "learning_rate": 5.732574911564603e-09, + "logits/chosen": -2.509500026702881, + "logits/rejected": -2.491403818130493, + "logps/chosen": -115.86087799072266, + "logps/rejected": -136.30348205566406, + "loss": 0.6181, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6011232137680054, + "rewards/margins": 0.1922348439693451, + "rewards/rejected": -0.7933580279350281, "step": 10000 }, { - "epoch": 1.72, - "eval_logits/chosen": -1.4226433038711548, - "eval_logits/rejected": -1.3957637548446655, - "eval_logps/chosen": -245.6855010986328, - "eval_logps/rejected": -296.6435852050781, - "eval_loss": 0.6344332098960876, - "eval_rewards/accuracies": 0.6514869928359985, - "eval_rewards/chosen": -1.8698163032531738, - "eval_rewards/margins": 0.4650455117225647, - "eval_rewards/rejected": -2.334861993789673, - "eval_runtime": 357.1222, - "eval_samples_per_second": 12.052, - "eval_steps_per_second": 1.506, + "epoch": 1.722949689869056, + "eval_logits/chosen": -2.564741849899292, + "eval_logits/rejected": -2.5579206943511963, + "eval_logps/chosen": -107.93734741210938, + "eval_logps/rejected": -122.6296157836914, + "eval_loss": 0.6552858352661133, + "eval_rewards/accuracies": 0.6203531622886658, + "eval_rewards/chosen": -0.49225449562072754, + "eval_rewards/margins": 0.1022404208779335, + "eval_rewards/rejected": -0.5944948792457581, + "eval_runtime": 359.7482, + "eval_samples_per_second": 11.964, + "eval_steps_per_second": 1.495, "step": 10000 }, { - "epoch": 1.72, - "grad_norm": 19.257035851897566, - "learning_rate": 2.8314346257744177e-08, - "logits/chosen": -1.3713723421096802, - "logits/rejected": -1.3086481094360352, - "logps/chosen": -225.0717010498047, - "logps/rejected": -362.225830078125, - "loss": 0.3966, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.7269824743270874, - "rewards/margins": 1.3675779104232788, - "rewards/rejected": -3.094560146331787, + "epoch": 1.724672639558925, + "grad_norm": 10.848592758178711, + "learning_rate": 5.662869251548835e-09, + "logits/chosen": -2.494889736175537, + "logits/rejected": -2.4740841388702393, + "logps/chosen": -110.259521484375, + "logps/rejected": -134.33322143554688, + "loss": 0.6036, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.578801691532135, + "rewards/margins": 0.23675695061683655, + "rewards/rejected": -0.8155585527420044, "step": 10010 }, { - "epoch": 1.73, - "grad_norm": 33.76415450194936, - "learning_rate": 2.7967822678101466e-08, - "logits/chosen": -1.3136873245239258, - "logits/rejected": -1.243048071861267, - "logps/chosen": -228.7400360107422, - "logps/rejected": -360.3517761230469, - "loss": 0.411, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.75080144405365, - "rewards/margins": 1.3429168462753296, - "rewards/rejected": -3.0937180519104004, + "epoch": 1.7263955892487939, + "grad_norm": 11.91601848602295, + "learning_rate": 5.5935645356202935e-09, + "logits/chosen": -2.436588764190674, + "logits/rejected": -2.4070498943328857, + "logps/chosen": -116.73612976074219, + "logps/rejected": -128.99179077148438, + "loss": 0.6409, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6306353211402893, + "rewards/margins": 0.14948755502700806, + "rewards/rejected": -0.7801228761672974, "step": 10020 }, { - "epoch": 1.73, - "grad_norm": 30.936869629766193, - "learning_rate": 2.7623306952534316e-08, - "logits/chosen": -1.3288711309432983, - "logits/rejected": -1.260801076889038, - "logps/chosen": -239.26663208007812, - "logps/rejected": -358.018798828125, - "loss": 0.4206, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.8187427520751953, - "rewards/margins": 1.2403138875961304, - "rewards/rejected": -3.0590567588806152, + "epoch": 1.7281185389386629, + "grad_norm": 14.203158378601074, + "learning_rate": 5.524661390506863e-09, + "logits/chosen": -2.4671003818511963, + "logits/rejected": -2.4386401176452637, + "logps/chosen": -117.65501403808594, + "logps/rejected": -125.408203125, + "loss": 0.6486, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6026381254196167, + "rewards/margins": 0.13013479113578796, + "rewards/rejected": -0.732772946357727, "step": 10030 }, { - "epoch": 1.73, - "grad_norm": 23.395127833799734, - "learning_rate": 2.728080219652504e-08, - "logits/chosen": -1.509854793548584, - "logits/rejected": -1.4493193626403809, - "logps/chosen": -229.5972900390625, - "logps/rejected": -358.5816345214844, - "loss": 0.4241, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.7475786209106445, - "rewards/margins": 1.2939976453781128, - "rewards/rejected": -3.0415761470794678, + "epoch": 1.729841488628532, + "grad_norm": 8.88199234008789, + "learning_rate": 5.456160439305007e-09, + "logits/chosen": -2.599710464477539, + "logits/rejected": -2.5841920375823975, + "logps/chosen": -116.64871978759766, + "logps/rejected": -134.38421630859375, + "loss": 0.6312, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6180770993232727, + "rewards/margins": 0.18140612542629242, + "rewards/rejected": -0.7994831800460815, "step": 10040 }, { - "epoch": 1.73, - "grad_norm": 36.993355629776445, - "learning_rate": 2.694031150737036e-08, - "logits/chosen": -1.3353779315948486, - "logits/rejected": -1.2893092632293701, - "logps/chosen": -227.55111694335938, - "logps/rejected": -348.4396057128906, - "loss": 0.4256, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.7357978820800781, - "rewards/margins": 1.1867949962615967, - "rewards/rejected": -2.922593355178833, + "epoch": 1.7315644383184012, + "grad_norm": 14.16472053527832, + "learning_rate": 5.388062301474072e-09, + "logits/chosen": -2.4518356323242188, + "logits/rejected": -2.449625015258789, + "logps/chosen": -120.41426086425781, + "logps/rejected": -131.92730712890625, + "loss": 0.6666, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.6644247770309448, + "rewards/margins": 0.09248615801334381, + "rewards/rejected": -0.756911039352417, "step": 10050 }, { - "epoch": 1.73, - "grad_norm": 40.38498271374321, - "learning_rate": 2.6601837964153996e-08, - "logits/chosen": -1.2823264598846436, - "logits/rejected": -1.2305335998535156, - "logps/chosen": -225.500732421875, - "logps/rejected": -361.58197021484375, - "loss": 0.4373, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.7677650451660156, - "rewards/margins": 1.3243902921676636, - "rewards/rejected": -3.0921549797058105, + "epoch": 1.7332873880082702, + "grad_norm": 11.537849426269531, + "learning_rate": 5.320367592830799e-09, + "logits/chosen": -2.4141101837158203, + "logits/rejected": -2.4072766304016113, + "logps/chosen": -108.069091796875, + "logps/rejected": -130.06570434570312, + "loss": 0.6295, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5933195948600769, + "rewards/margins": 0.1834740787744522, + "rewards/rejected": -0.7767936587333679, "step": 10060 }, { - "epoch": 1.74, - "grad_norm": 28.019376221143933, - "learning_rate": 2.6265384627718046e-08, - "logits/chosen": -1.2700541019439697, - "logits/rejected": -1.215319037437439, - "logps/chosen": -226.7916717529297, - "logps/rejected": -368.22149658203125, - "loss": 0.3848, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.7437423467636108, - "rewards/margins": 1.4138736724853516, - "rewards/rejected": -3.157615900039673, + "epoch": 1.7350103376981392, + "grad_norm": 10.482232093811035, + "learning_rate": 5.253076925543609e-09, + "logits/chosen": -2.386352062225342, + "logits/rejected": -2.375824451446533, + "logps/chosen": -112.65206146240234, + "logps/rejected": -134.59097290039062, + "loss": 0.6205, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6019715070724487, + "rewards/margins": 0.2191394567489624, + "rewards/rejected": -0.8211109042167664, "step": 10070 }, { - "epoch": 1.74, - "grad_norm": 37.67411721104956, - "learning_rate": 2.593095454063615e-08, - "logits/chosen": -1.3826894760131836, - "logits/rejected": -1.3214690685272217, - "logps/chosen": -222.34335327148438, - "logps/rejected": -359.19488525390625, - "loss": 0.4212, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.6584384441375732, - "rewards/margins": 1.3830474615097046, - "rewards/rejected": -3.0414860248565674, + "epoch": 1.7367332873880081, + "grad_norm": 13.752614974975586, + "learning_rate": 5.18619090812723e-09, + "logits/chosen": -2.5031204223632812, + "logits/rejected": -2.4887642860412598, + "logps/chosen": -120.53104400634766, + "logps/rejected": -133.3077392578125, + "loss": 0.6471, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6399309039115906, + "rewards/margins": 0.14228789508342743, + "rewards/rejected": -0.7822188138961792, "step": 10080 }, { - "epoch": 1.74, - "grad_norm": 24.463559826579306, - "learning_rate": 2.5598550727185142e-08, - "logits/chosen": -1.3830634355545044, - "logits/rejected": -1.3127849102020264, - "logps/chosen": -223.80313110351562, - "logps/rejected": -375.8487548828125, - "loss": 0.39, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.6944698095321655, - "rewards/margins": 1.4869707822799683, - "rewards/rejected": -3.181440591812134, + "epoch": 1.7384562370778773, + "grad_norm": 9.386380195617676, + "learning_rate": 5.1197101454370285e-09, + "logits/chosen": -2.5299363136291504, + "logits/rejected": -2.5103542804718018, + "logps/chosen": -113.98204040527344, + "logps/rejected": -136.5449981689453, + "loss": 0.6228, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5960878729820251, + "rewards/margins": 0.19220301508903503, + "rewards/rejected": -0.7882908582687378, "step": 10090 }, { - "epoch": 1.74, - "grad_norm": 35.57829724815369, - "learning_rate": 2.5268176193318473e-08, - "logits/chosen": -1.349346399307251, - "logits/rejected": -1.2962137460708618, - "logps/chosen": -229.2544403076172, - "logps/rejected": -371.0001525878906, - "loss": 0.3867, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.742475152015686, - "rewards/margins": 1.4224704504013062, - "rewards/rejected": -3.164945602416992, + "epoch": 1.7401791867677465, + "grad_norm": 12.28760051727295, + "learning_rate": 5.0536352386636945e-09, + "logits/chosen": -2.4863905906677246, + "logits/rejected": -2.477647304534912, + "logps/chosen": -114.24980163574219, + "logps/rejected": -131.3005828857422, + "loss": 0.6365, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5926389098167419, + "rewards/margins": 0.1753116101026535, + "rewards/rejected": -0.7679504752159119, "step": 10100 }, { - "epoch": 1.74, - "eval_logits/chosen": -1.4281973838806152, - "eval_logits/rejected": -1.401401162147522, - "eval_logps/chosen": -242.6608123779297, - "eval_logps/rejected": -292.8902587890625, - "eval_loss": 0.6348100900650024, - "eval_rewards/accuracies": 0.6610130071640015, - "eval_rewards/chosen": -1.8395695686340332, - "eval_rewards/margins": 0.4577590227127075, - "eval_rewards/rejected": -2.297328472137451, - "eval_runtime": 357.2066, - "eval_samples_per_second": 12.049, - "eval_steps_per_second": 1.506, + "epoch": 1.7401791867677465, + "eval_logits/chosen": -2.563511848449707, + "eval_logits/rejected": -2.556731700897217, + "eval_logps/chosen": -107.8778305053711, + "eval_logps/rejected": -122.56346130371094, + "eval_loss": 0.655293345451355, + "eval_rewards/accuracies": 0.6201208233833313, + "eval_rewards/chosen": -0.49165940284729004, + "eval_rewards/margins": 0.10217391699552536, + "eval_rewards/rejected": -0.5938332676887512, + "eval_runtime": 359.4587, + "eval_samples_per_second": 11.974, + "eval_steps_per_second": 1.497, "step": 10100 }, { - "epoch": 1.74, - "grad_norm": 41.259295901796634, - "learning_rate": 2.4939833926638397e-08, - "logits/chosen": -1.379417896270752, - "logits/rejected": -1.333963394165039, - "logps/chosen": -243.1998748779297, - "logps/rejected": -391.07293701171875, - "loss": 0.4012, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.9264335632324219, - "rewards/margins": 1.4382061958312988, - "rewards/rejected": -3.3646399974823, + "epoch": 1.7419021364576155, + "grad_norm": 12.70230484008789, + "learning_rate": 4.9879667853276795e-09, + "logits/chosen": -2.529376268386841, + "logits/rejected": -2.533937454223633, + "logps/chosen": -113.8519287109375, + "logps/rejected": -137.6707000732422, + "loss": 0.6241, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.632787823677063, + "rewards/margins": 0.19746533036231995, + "rewards/rejected": -0.830253005027771, "step": 10110 }, { - "epoch": 1.74, - "grad_norm": 26.28585974209364, - "learning_rate": 2.4613526896369307e-08, - "logits/chosen": -1.3697757720947266, - "logits/rejected": -1.3050405979156494, - "logps/chosen": -231.44277954101562, - "logps/rejected": -369.87445068359375, - "loss": 0.3817, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.777197241783142, - "rewards/margins": 1.4158128499984741, - "rewards/rejected": -3.193009853363037, + "epoch": 1.7436250861474845, + "grad_norm": 12.27637767791748, + "learning_rate": 4.9227053792738615e-09, + "logits/chosen": -2.4879720211029053, + "logits/rejected": -2.4670259952545166, + "logps/chosen": -114.29881286621094, + "logps/rejected": -131.1550750732422, + "loss": 0.6197, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6057515740394592, + "rewards/margins": 0.19973024725914001, + "rewards/rejected": -0.8054817914962769, "step": 10120 }, { - "epoch": 1.75, - "grad_norm": 30.6614965261688, - "learning_rate": 2.428925805333082e-08, - "logits/chosen": -1.3869436979293823, - "logits/rejected": -1.3147681951522827, - "logps/chosen": -221.4283905029297, - "logps/rejected": -376.77764892578125, - "loss": 0.3521, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.6809848546981812, - "rewards/margins": 1.5506279468536377, - "rewards/rejected": -3.2316126823425293, + "epoch": 1.7453480358373534, + "grad_norm": 10.527959823608398, + "learning_rate": 4.857851610666164e-09, + "logits/chosen": -2.513655424118042, + "logits/rejected": -2.4933128356933594, + "logps/chosen": -109.53932189941406, + "logps/rejected": -130.92755126953125, + "loss": 0.6176, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5619297027587891, + "rewards/margins": 0.2108466923236847, + "rewards/rejected": -0.7727764844894409, "step": 10130 }, { - "epoch": 1.75, - "grad_norm": 40.486241150607576, - "learning_rate": 2.396703032991107e-08, - "logits/chosen": -1.3461048603057861, - "logits/rejected": -1.2657787799835205, - "logps/chosen": -243.0239715576172, - "logps/rejected": -370.98052978515625, - "loss": 0.4374, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.8258941173553467, - "rewards/margins": 1.3444820642471313, - "rewards/rejected": -3.1703763008117676, + "epoch": 1.7470709855272226, + "grad_norm": 11.256921768188477, + "learning_rate": 4.793406065982214e-09, + "logits/chosen": -2.4904751777648926, + "logits/rejected": -2.4507718086242676, + "logps/chosen": -120.92155456542969, + "logps/rejected": -133.06362915039062, + "loss": 0.6289, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6049574613571167, + "rewards/margins": 0.18600977957248688, + "rewards/rejected": -0.7909671664237976, "step": 10140 }, { - "epoch": 1.75, - "grad_norm": 31.540425471498317, - "learning_rate": 2.3646846640040158e-08, - "logits/chosen": -1.2644100189208984, - "logits/rejected": -1.1996811628341675, - "logps/chosen": -238.3133087158203, - "logps/rejected": -371.70947265625, - "loss": 0.4298, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.8294227123260498, - "rewards/margins": 1.333860158920288, - "rewards/rejected": -3.163282871246338, + "epoch": 1.7487939352170918, + "grad_norm": 10.61954116821289, + "learning_rate": 4.729369328008032e-09, + "logits/chosen": -2.437351703643799, + "logits/rejected": -2.4185705184936523, + "logps/chosen": -114.20558166503906, + "logps/rejected": -133.32203674316406, + "loss": 0.6249, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5883208513259888, + "rewards/margins": 0.19086460769176483, + "rewards/rejected": -0.77918541431427, "step": 10150 }, { - "epoch": 1.75, - "grad_norm": 29.9318411012178, - "learning_rate": 2.332870987916383e-08, - "logits/chosen": -1.3238608837127686, - "logits/rejected": -1.2526966333389282, - "logps/chosen": -222.04052734375, - "logps/rejected": -380.37548828125, - "loss": 0.3546, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -1.6999727487564087, - "rewards/margins": 1.5818251371383667, - "rewards/rejected": -3.2817981243133545, + "epoch": 1.7505168849069608, + "grad_norm": 9.216764450073242, + "learning_rate": 4.665741975832765e-09, + "logits/chosen": -2.462484836578369, + "logits/rejected": -2.446476697921753, + "logps/chosen": -111.08219146728516, + "logps/rejected": -131.91702270507812, + "loss": 0.615, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.590103268623352, + "rewards/margins": 0.20701715350151062, + "rewards/rejected": -0.7971204519271851, "step": 10160 }, { - "epoch": 1.75, - "grad_norm": 34.151684842456746, - "learning_rate": 2.3012622924217323e-08, - "logits/chosen": -1.3320282697677612, - "logits/rejected": -1.272220253944397, - "logps/chosen": -232.2387237548828, - "logps/rejected": -378.3623046875, - "loss": 0.4196, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.7736480236053467, - "rewards/margins": 1.466052770614624, - "rewards/rejected": -3.2397007942199707, + "epoch": 1.7522398345968297, + "grad_norm": 8.60057544708252, + "learning_rate": 4.602524584843464e-09, + "logits/chosen": -2.4618074893951416, + "logits/rejected": -2.4513816833496094, + "logps/chosen": -113.1440658569336, + "logps/rejected": -131.1116485595703, + "loss": 0.625, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5823419690132141, + "rewards/margins": 0.18472710251808167, + "rewards/rejected": -0.7670690417289734, "step": 10170 }, { - "epoch": 1.75, - "grad_norm": 21.211664057036547, - "learning_rate": 2.2698588633599357e-08, - "logits/chosen": -1.2500728368759155, - "logits/rejected": -1.1695213317871094, - "logps/chosen": -226.1921844482422, - "logps/rejected": -385.45855712890625, - "loss": 0.3777, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.737788438796997, - "rewards/margins": 1.583823800086975, - "rewards/rejected": -3.321612596511841, + "epoch": 1.7539627842866987, + "grad_norm": 10.92617130279541, + "learning_rate": 4.539717726719872e-09, + "logits/chosen": -2.402416944503784, + "logits/rejected": -2.375006914138794, + "logps/chosen": -112.00247955322266, + "logps/rejected": -135.15814208984375, + "loss": 0.6077, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5956050157546997, + "rewards/margins": 0.22287814319133759, + "rewards/rejected": -0.8184831738471985, "step": 10180 }, { - "epoch": 1.76, - "grad_norm": 56.5968336109649, - "learning_rate": 2.2386609847146077e-08, - "logits/chosen": -1.2704510688781738, - "logits/rejected": -1.2027629613876343, - "logps/chosen": -225.5328826904297, - "logps/rejected": -359.3175354003906, - "loss": 0.4261, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.7456719875335693, - "rewards/margins": 1.333605408668518, - "rewards/rejected": -3.079277515411377, + "epoch": 1.755685733976568, + "grad_norm": 13.182476997375488, + "learning_rate": 4.4773219694292155e-09, + "logits/chosen": -2.4350714683532715, + "logits/rejected": -2.4115071296691895, + "logps/chosen": -106.44425201416016, + "logps/rejected": -129.00108337402344, + "loss": 0.6103, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5548610091209412, + "rewards/margins": 0.22107870876789093, + "rewards/rejected": -0.7759397625923157, "step": 10190 }, { - "epoch": 1.76, - "grad_norm": 24.63754465967521, - "learning_rate": 2.2076689386105824e-08, - "logits/chosen": -1.3379052877426147, - "logits/rejected": -1.2737504243850708, - "logps/chosen": -230.8819580078125, - "logps/rejected": -371.2557067871094, - "loss": 0.3851, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.7848135232925415, - "rewards/margins": 1.4187109470367432, - "rewards/rejected": -3.203524351119995, + "epoch": 1.757408683666437, + "grad_norm": 10.107029914855957, + "learning_rate": 4.415337877221164e-09, + "logits/chosen": -2.5051677227020264, + "logits/rejected": -2.4862966537475586, + "logps/chosen": -112.60433197021484, + "logps/rejected": -129.8455047607422, + "loss": 0.6269, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.601836085319519, + "rewards/margins": 0.1873604953289032, + "rewards/rejected": -0.7891966104507446, "step": 10200 }, { - "epoch": 1.76, - "eval_logits/chosen": -1.3974082469940186, - "eval_logits/rejected": -1.369729995727539, - "eval_logps/chosen": -254.59268188476562, - "eval_logps/rejected": -307.6221618652344, - "eval_loss": 0.6357866525650024, - "eval_rewards/accuracies": 0.660780668258667, - "eval_rewards/chosen": -1.9588884115219116, - "eval_rewards/margins": 0.4857591688632965, - "eval_rewards/rejected": -2.444647789001465, - "eval_runtime": 356.793, - "eval_samples_per_second": 12.063, - "eval_steps_per_second": 1.508, + "epoch": 1.757408683666437, + "eval_logits/chosen": -2.5624215602874756, + "eval_logits/rejected": -2.555640935897827, + "eval_logps/chosen": -108.23209381103516, + "eval_logps/rejected": -122.9497299194336, + "eval_loss": 0.6552406549453735, + "eval_rewards/accuracies": 0.6208178400993347, + "eval_rewards/chosen": -0.4952020049095154, + "eval_rewards/margins": 0.10249407589435577, + "eval_rewards/rejected": -0.59769606590271, + "eval_runtime": 359.3223, + "eval_samples_per_second": 11.978, + "eval_steps_per_second": 1.497, "step": 10200 }, { - "epoch": 1.76, - "grad_norm": 54.095086047288085, - "learning_rate": 2.176883005311303e-08, - "logits/chosen": -1.361449122428894, - "logits/rejected": -1.3111120462417603, - "logps/chosen": -245.9110565185547, - "logps/rejected": -408.2243957519531, - "loss": 0.3661, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.9192225933074951, - "rewards/margins": 1.5922331809997559, - "rewards/rejected": -3.511455535888672, + "epoch": 1.759131633356306, + "grad_norm": 11.625716209411621, + "learning_rate": 4.353766010622606e-09, + "logits/chosen": -2.4938254356384277, + "logits/rejected": -2.489194631576538, + "logps/chosen": -119.16197204589844, + "logps/rejected": -144.89639282226562, + "loss": 0.6132, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6514904499053955, + "rewards/margins": 0.22621452808380127, + "rewards/rejected": -0.8777049779891968, "step": 10210 }, { - "epoch": 1.76, - "grad_norm": 47.443592896329925, - "learning_rate": 2.1463034632163535e-08, - "logits/chosen": -1.2670648097991943, - "logits/rejected": -1.2201905250549316, - "logps/chosen": -241.36117553710938, - "logps/rejected": -377.2726135253906, - "loss": 0.4243, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.9298385381698608, - "rewards/margins": 1.3317430019378662, - "rewards/rejected": -3.2615818977355957, + "epoch": 1.760854583046175, + "grad_norm": 11.112654685974121, + "learning_rate": 4.2926069264327066e-09, + "logits/chosen": -2.441725254058838, + "logits/rejected": -2.435220241546631, + "logps/chosen": -106.16336822509766, + "logps/rejected": -126.24835205078125, + "loss": 0.6301, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.577778697013855, + "rewards/margins": 0.17347396910190582, + "rewards/rejected": -0.7512526512145996, "step": 10220 }, { - "epoch": 1.76, - "grad_norm": 27.354616295829352, - "learning_rate": 2.1159305888588664e-08, - "logits/chosen": -1.2255061864852905, - "logits/rejected": -1.15509033203125, - "logps/chosen": -232.21240234375, - "logps/rejected": -373.4509582519531, - "loss": 0.446, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.7877719402313232, - "rewards/margins": 1.4283082485198975, - "rewards/rejected": -3.2160804271698, + "epoch": 1.762577532736044, + "grad_norm": 9.004151344299316, + "learning_rate": 4.231861177717733e-09, + "logits/chosen": -2.3886735439300537, + "logits/rejected": -2.3622939586639404, + "logps/chosen": -112.79914855957031, + "logps/rejected": -125.49736022949219, + "loss": 0.6467, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.5935821533203125, + "rewards/margins": 0.14280185103416443, + "rewards/rejected": -0.7363839745521545, "step": 10230 }, { - "epoch": 1.76, - "grad_norm": 18.82682331923304, - "learning_rate": 2.085764656903105e-08, - "logits/chosen": -1.308882236480713, - "logits/rejected": -1.2245006561279297, - "logps/chosen": -224.0420684814453, - "logps/rejected": -410.17626953125, - "loss": 0.3153, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -1.7259820699691772, - "rewards/margins": 1.8521201610565186, - "rewards/rejected": -3.5781021118164062, + "epoch": 1.7643004824259132, + "grad_norm": 10.80211353302002, + "learning_rate": 4.17152931380621e-09, + "logits/chosen": -2.434027910232544, + "logits/rejected": -2.4118659496307373, + "logps/chosen": -112.94795989990234, + "logps/rejected": -137.31430053710938, + "loss": 0.6096, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6149092316627502, + "rewards/margins": 0.2345750778913498, + "rewards/rejected": -0.8494843244552612, "step": 10240 }, { - "epoch": 1.77, - "grad_norm": 47.39267774633412, - "learning_rate": 2.055805940141897e-08, - "logits/chosen": -1.2967278957366943, - "logits/rejected": -1.2207520008087158, - "logps/chosen": -248.3931427001953, - "logps/rejected": -379.16094970703125, - "loss": 0.3673, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.9161218404769897, - "rewards/margins": 1.378807783126831, - "rewards/rejected": -3.2949295043945312, + "epoch": 1.7660234321157822, + "grad_norm": 11.60608959197998, + "learning_rate": 4.111611880283794e-09, + "logits/chosen": -2.487051010131836, + "logits/rejected": -2.4494423866271973, + "logps/chosen": -118.3154067993164, + "logps/rejected": -131.3991241455078, + "loss": 0.6198, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.615398645401001, + "rewards/margins": 0.20179829001426697, + "rewards/rejected": -0.8171968460083008, "step": 10250 }, { - "epoch": 1.77, - "grad_norm": 29.996102972446575, - "learning_rate": 2.0260547094942348e-08, - "logits/chosen": -1.2790513038635254, - "logits/rejected": -1.2392971515655518, - "logps/chosen": -238.34115600585938, - "logps/rejected": -378.8802185058594, - "loss": 0.4199, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.8511323928833008, - "rewards/margins": 1.3689521551132202, - "rewards/rejected": -3.2200846672058105, + "epoch": 1.7677463818056514, + "grad_norm": 10.059064865112305, + "learning_rate": 4.05210941898847e-09, + "logits/chosen": -2.446291446685791, + "logits/rejected": -2.448866605758667, + "logps/chosen": -110.99190521240234, + "logps/rejected": -132.66427612304688, + "loss": 0.6233, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5772714614868164, + "rewards/margins": 0.18064741790294647, + "rewards/rejected": -0.7579189538955688, "step": 10260 }, { - "epoch": 1.77, - "grad_norm": 40.28205321720298, - "learning_rate": 1.9965112340027874e-08, - "logits/chosen": -1.300189733505249, - "logits/rejected": -1.2424399852752686, - "logps/chosen": -245.79641723632812, - "logps/rejected": -382.1834411621094, - "loss": 0.409, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.9040024280548096, - "rewards/margins": 1.361567497253418, - "rewards/rejected": -3.2655701637268066, + "epoch": 1.7694693314955203, + "grad_norm": 10.58187198638916, + "learning_rate": 3.993022468005575e-09, + "logits/chosen": -2.4398512840270996, + "logits/rejected": -2.4271864891052246, + "logps/chosen": -119.28215026855469, + "logps/rejected": -141.2032012939453, + "loss": 0.6172, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6385139226913452, + "rewards/margins": 0.21705254912376404, + "rewards/rejected": -0.8555665016174316, "step": 10270 }, { - "epoch": 1.77, - "grad_norm": 30.493851403701377, - "learning_rate": 1.9671757808314675e-08, - "logits/chosen": -1.2651712894439697, - "logits/rejected": -1.2117892503738403, - "logps/chosen": -252.64151000976562, - "logps/rejected": -374.02337646484375, - "loss": 0.4544, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.9832038879394531, - "rewards/margins": 1.2268593311309814, - "rewards/rejected": -3.2100627422332764, + "epoch": 1.7711922811853893, + "grad_norm": 12.797431945800781, + "learning_rate": 3.934351561662935e-09, + "logits/chosen": -2.4263205528259277, + "logits/rejected": -2.4106249809265137, + "logps/chosen": -120.3524169921875, + "logps/rejected": -128.5592803955078, + "loss": 0.6645, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.6602329015731812, + "rewards/margins": 0.09498941153287888, + "rewards/rejected": -0.7552222609519958, "step": 10280 }, { - "epoch": 1.77, - "grad_norm": 32.40691710505685, - "learning_rate": 1.9380486152630548e-08, - "logits/chosen": -1.2505433559417725, - "logits/rejected": -1.1942849159240723, - "logps/chosen": -229.8437042236328, - "logps/rejected": -385.4642639160156, - "loss": 0.4206, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.7842499017715454, - "rewards/margins": 1.5160019397735596, - "rewards/rejected": -3.3002521991729736, + "epoch": 1.7729152308752585, + "grad_norm": 9.863990783691406, + "learning_rate": 3.876097230526109e-09, + "logits/chosen": -2.398904800415039, + "logits/rejected": -2.3914239406585693, + "logps/chosen": -106.53401947021484, + "logps/rejected": -135.8355255126953, + "loss": 0.6003, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5509772896766663, + "rewards/margins": 0.2525397539138794, + "rewards/rejected": -0.8035169839859009, "step": 10290 }, { - "epoch": 1.77, - "grad_norm": 39.84324911925736, - "learning_rate": 1.909130000696732e-08, - "logits/chosen": -1.325438380241394, - "logits/rejected": -1.2682257890701294, - "logps/chosen": -235.05142211914062, - "logps/rejected": -361.3345031738281, - "loss": 0.4322, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.8042240142822266, - "rewards/margins": 1.2643927335739136, - "rewards/rejected": -3.068617105484009, + "epoch": 1.7746381805651275, + "grad_norm": 11.00568675994873, + "learning_rate": 3.818260001393464e-09, + "logits/chosen": -2.462790012359619, + "logits/rejected": -2.4483377933502197, + "logps/chosen": -118.48091125488281, + "logps/rejected": -130.38723754882812, + "loss": 0.6573, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6383308172225952, + "rewards/margins": 0.12067344039678574, + "rewards/rejected": -0.7590042948722839, "step": 10300 }, { - "epoch": 1.77, - "eval_logits/chosen": -1.4002227783203125, - "eval_logits/rejected": -1.3728564977645874, - "eval_logps/chosen": -252.0375518798828, - "eval_logps/rejected": -304.3727722167969, - "eval_loss": 0.6351932287216187, - "eval_rewards/accuracies": 0.6584572196006775, - "eval_rewards/chosen": -1.9333373308181763, - "eval_rewards/margins": 0.4788166582584381, - "eval_rewards/rejected": -2.412153959274292, - "eval_runtime": 356.5282, - "eval_samples_per_second": 12.072, - "eval_steps_per_second": 1.509, + "epoch": 1.7746381805651275, + "eval_logits/chosen": -2.5610098838806152, + "eval_logits/rejected": -2.5541930198669434, + "eval_logps/chosen": -108.33472442626953, + "eval_logps/rejected": -123.06450653076172, + "eval_loss": 0.6552524566650391, + "eval_rewards/accuracies": 0.6201208233833313, + "eval_rewards/chosen": -0.49622830748558044, + "eval_rewards/margins": 0.10261543095111847, + "eval_rewards/rejected": -0.5988436937332153, + "eval_runtime": 359.658, + "eval_samples_per_second": 11.967, + "eval_steps_per_second": 1.496, "step": 10300 }, { - "epoch": 1.78, - "grad_norm": 22.524340645441857, - "learning_rate": 1.8804201986457742e-08, - "logits/chosen": -1.261060357093811, - "logits/rejected": -1.1999661922454834, - "logps/chosen": -247.80233764648438, - "logps/rejected": -386.2306213378906, - "loss": 0.4087, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.9156711101531982, - "rewards/margins": 1.4324285984039307, - "rewards/rejected": -3.34809947013855, + "epoch": 1.7763611302549966, + "grad_norm": 10.546584129333496, + "learning_rate": 3.760840397291548e-09, + "logits/chosen": -2.39471697807312, + "logits/rejected": -2.3739752769470215, + "logps/chosen": -119.74263763427734, + "logps/rejected": -134.30592346191406, + "loss": 0.6227, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6347993016242981, + "rewards/margins": 0.19372475147247314, + "rewards/rejected": -0.8285239934921265, "step": 10310 }, { - "epoch": 1.78, - "grad_norm": 46.482526842999754, - "learning_rate": 1.851919468735119e-08, - "logits/chosen": -1.315914511680603, - "logits/rejected": -1.2488597631454468, - "logps/chosen": -235.482666015625, - "logps/rejected": -366.1687316894531, - "loss": 0.4226, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.7874408960342407, - "rewards/margins": 1.3392503261566162, - "rewards/rejected": -3.1266913414001465, + "epoch": 1.7780840799448656, + "grad_norm": 10.581847190856934, + "learning_rate": 3.7038389374702382e-09, + "logits/chosen": -2.456726312637329, + "logits/rejected": -2.427189350128174, + "logps/chosen": -121.62674713134766, + "logps/rejected": -133.1729736328125, + "loss": 0.649, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6486510634422302, + "rewards/margins": 0.14786870777606964, + "rewards/rejected": -0.7965198755264282, "step": 10320 }, { - "epoch": 1.78, - "grad_norm": 55.06877120295062, - "learning_rate": 1.8236280686990653e-08, - "logits/chosen": -1.3398211002349854, - "logits/rejected": -1.2758935689926147, - "logps/chosen": -230.93673706054688, - "logps/rejected": -373.24432373046875, - "loss": 0.3857, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.7879480123519897, - "rewards/margins": 1.3995113372802734, - "rewards/rejected": -3.1874594688415527, + "epoch": 1.7798070296347346, + "grad_norm": 10.441367149353027, + "learning_rate": 3.6472561373981305e-09, + "logits/chosen": -2.4716782569885254, + "logits/rejected": -2.4550414085388184, + "logps/chosen": -115.976806640625, + "logps/rejected": -135.39369201660156, + "loss": 0.6275, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6383245587348938, + "rewards/margins": 0.17045047879219055, + "rewards/rejected": -0.8087749481201172, "step": 10330 }, { - "epoch": 1.78, - "grad_norm": 31.59403091184685, - "learning_rate": 1.795546254378927e-08, - "logits/chosen": -1.3418684005737305, - "logits/rejected": -1.2638609409332275, - "logps/chosen": -231.3434295654297, - "logps/rejected": -384.9586181640625, - "loss": 0.3837, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.7527868747711182, - "rewards/margins": 1.5512384176254272, - "rewards/rejected": -3.304025173187256, + "epoch": 1.7815299793246038, + "grad_norm": 10.83610725402832, + "learning_rate": 3.5910925087578535e-09, + "logits/chosen": -2.4928503036499023, + "logits/rejected": -2.47050142288208, + "logps/chosen": -115.1012191772461, + "logps/rejected": -135.30702209472656, + "loss": 0.6166, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5901371836662292, + "rewards/margins": 0.21725019812583923, + "rewards/rejected": -0.8073874711990356, "step": 10340 }, { - "epoch": 1.78, - "grad_norm": 44.48450974185094, - "learning_rate": 1.7676742797207045e-08, - "logits/chosen": -1.4229285717010498, - "logits/rejected": -1.3561543226242065, - "logps/chosen": -241.94143676757812, - "logps/rejected": -376.6054992675781, - "loss": 0.4181, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.878838300704956, - "rewards/margins": 1.376997470855713, - "rewards/rejected": -3.255835771560669, + "epoch": 1.7832529290144727, + "grad_norm": 10.322022438049316, + "learning_rate": 3.535348559441409e-09, + "logits/chosen": -2.569450855255127, + "logits/rejected": -2.545938730239868, + "logps/chosen": -112.86637878417969, + "logps/rejected": -129.04212951660156, + "loss": 0.6243, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5878443121910095, + "rewards/margins": 0.19202940165996552, + "rewards/rejected": -0.7798737287521362, "step": 10350 }, { - "epoch": 1.78, - "grad_norm": 40.416266152745024, - "learning_rate": 1.740012396772819e-08, - "logits/chosen": -1.2647850513458252, - "logits/rejected": -1.1986753940582275, - "logps/chosen": -245.0532684326172, - "logps/rejected": -359.286865234375, - "loss": 0.4955, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.891425371170044, - "rewards/margins": 1.173485279083252, - "rewards/rejected": -3.064910411834717, + "epoch": 1.784975878704342, + "grad_norm": 10.957462310791016, + "learning_rate": 3.4800247935456383e-09, + "logits/chosen": -2.4213013648986816, + "logits/rejected": -2.390179395675659, + "logps/chosen": -119.5207290649414, + "logps/rejected": -128.385009765625, + "loss": 0.6536, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6357388496398926, + "rewards/margins": 0.12005865573883057, + "rewards/rejected": -0.7557975053787231, "step": 10360 }, { - "epoch": 1.79, - "grad_norm": 29.777426788141348, - "learning_rate": 1.7125608556838035e-08, - "logits/chosen": -1.1616214513778687, - "logits/rejected": -1.0960752964019775, - "logps/chosen": -223.65115356445312, - "logps/rejected": -352.9761047363281, - "loss": 0.4088, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.7168071269989014, - "rewards/margins": 1.2844127416610718, - "rewards/rejected": -3.001220226287842, + "epoch": 1.786698828394211, + "grad_norm": 10.56867790222168, + "learning_rate": 3.425121711367607e-09, + "logits/chosen": -2.305194139480591, + "logits/rejected": -2.276939630508423, + "logps/chosen": -111.28253173828125, + "logps/rejected": -131.2523956298828, + "loss": 0.6216, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5930512547492981, + "rewards/margins": 0.19067077338695526, + "rewards/rejected": -0.7837220430374146, "step": 10370 }, { - "epoch": 1.79, - "grad_norm": 51.70883721503306, - "learning_rate": 1.6853199047000584e-08, - "logits/chosen": -1.3133689165115356, - "logits/rejected": -1.2742842435836792, - "logps/chosen": -249.2805633544922, - "logps/rejected": -336.363037109375, - "loss": 0.5641, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.974237084388733, - "rewards/margins": 0.8665148019790649, - "rewards/rejected": -2.840752124786377, + "epoch": 1.7884217780840799, + "grad_norm": 11.401347160339355, + "learning_rate": 3.3706398094001167e-09, + "logits/chosen": -2.448519229888916, + "logits/rejected": -2.4372894763946533, + "logps/chosen": -118.3085708618164, + "logps/rejected": -128.80023193359375, + "loss": 0.6673, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6642967462539673, + "rewards/margins": 0.10065841674804688, + "rewards/rejected": -0.7649552226066589, "step": 10380 }, { - "epoch": 1.79, - "grad_norm": 21.280760614032122, - "learning_rate": 1.6582897901636027e-08, - "logits/chosen": -1.3474326133728027, - "logits/rejected": -1.2722728252410889, - "logps/chosen": -228.17819213867188, - "logps/rejected": -372.76544189453125, - "loss": 0.391, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.7238601446151733, - "rewards/margins": 1.463079571723938, - "rewards/rejected": -3.1869397163391113, + "epoch": 1.7901447277739488, + "grad_norm": 10.860513687133789, + "learning_rate": 3.3165795803272057e-09, + "logits/chosen": -2.489330768585205, + "logits/rejected": -2.463158130645752, + "logps/chosen": -115.2964096069336, + "logps/rejected": -130.6220703125, + "loss": 0.6361, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5948722958564758, + "rewards/margins": 0.17043809592723846, + "rewards/rejected": -0.7653104066848755, "step": 10390 }, { - "epoch": 1.79, - "grad_norm": 23.04577762515264, - "learning_rate": 1.6314707565098395e-08, - "logits/chosen": -1.2872307300567627, - "logits/rejected": -1.2240254878997803, - "logps/chosen": -246.9097900390625, - "logps/rejected": -409.812255859375, - "loss": 0.3405, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.9479862451553345, - "rewards/margins": 1.5971990823745728, - "rewards/rejected": -3.5451855659484863, + "epoch": 1.791867677463818, + "grad_norm": 12.60634994506836, + "learning_rate": 3.2629415130196793e-09, + "logits/chosen": -2.447645902633667, + "logits/rejected": -2.430975914001465, + "logps/chosen": -113.55403900146484, + "logps/rejected": -140.37448120117188, + "loss": 0.6036, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6141378283500671, + "rewards/margins": 0.2365102469921112, + "rewards/rejected": -0.850648045539856, "step": 10400 }, { - "epoch": 1.79, - "eval_logits/chosen": -1.411521315574646, - "eval_logits/rejected": -1.384405493736267, - "eval_logps/chosen": -247.26951599121094, - "eval_logps/rejected": -298.5337219238281, - "eval_loss": 0.6351563334465027, - "eval_rewards/accuracies": 0.660780668258667, - "eval_rewards/chosen": -1.885656714439392, - "eval_rewards/margins": 0.46810635924339294, - "eval_rewards/rejected": -2.3537631034851074, - "eval_runtime": 356.5783, - "eval_samples_per_second": 12.07, - "eval_steps_per_second": 1.509, + "epoch": 1.791867677463818, + "eval_logits/chosen": -2.560978412628174, + "eval_logits/rejected": -2.554239273071289, + "eval_logps/chosen": -108.23997497558594, + "eval_logps/rejected": -122.97843933105469, + "eval_loss": 0.6551907658576965, + "eval_rewards/accuracies": 0.6196561455726624, + "eval_rewards/chosen": -0.49528077244758606, + "eval_rewards/margins": 0.10270221531391144, + "eval_rewards/rejected": -0.5979831218719482, + "eval_runtime": 359.6358, + "eval_samples_per_second": 11.968, + "eval_steps_per_second": 1.496, "step": 10400 }, { - "epoch": 1.79, - "grad_norm": 29.30147034557462, - "learning_rate": 1.6048630462653616e-08, - "logits/chosen": -1.2767484188079834, - "logits/rejected": -1.2087651491165161, - "logps/chosen": -246.88467407226562, - "logps/rejected": -367.8938293457031, - "loss": 0.4398, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.9034740924835205, - "rewards/margins": 1.2447469234466553, - "rewards/rejected": -3.1482207775115967, + "epoch": 1.7935906271536872, + "grad_norm": 10.753764152526855, + "learning_rate": 3.2097260925307235e-09, + "logits/chosen": -2.4378273487091064, + "logits/rejected": -2.4052460193634033, + "logps/chosen": -120.6829605102539, + "logps/rejected": -132.27468872070312, + "loss": 0.6423, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6416543126106262, + "rewards/margins": 0.15033070743083954, + "rewards/rejected": -0.791985034942627, "step": 10410 }, { - "epoch": 1.8, - "grad_norm": 50.80138734136231, - "learning_rate": 1.578466900045733e-08, - "logits/chosen": -1.3154891729354858, - "logits/rejected": -1.2465871572494507, - "logps/chosen": -237.7150421142578, - "logps/rejected": -368.3978576660156, - "loss": 0.4055, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.8362977504730225, - "rewards/margins": 1.3066072463989258, - "rewards/rejected": -3.142904758453369, + "epoch": 1.7953135768435562, + "grad_norm": 10.544516563415527, + "learning_rate": 3.1569338000914656e-09, + "logits/chosen": -2.458979368209839, + "logits/rejected": -2.4329304695129395, + "logps/chosen": -115.31663513183594, + "logps/rejected": -133.2076416015625, + "loss": 0.6337, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.611960768699646, + "rewards/margins": 0.17886130511760712, + "rewards/rejected": -0.7908221483230591, "step": 10420 }, { - "epoch": 1.8, - "grad_norm": 28.18485959040931, - "learning_rate": 1.5522825565533442e-08, - "logits/chosen": -1.3946081399917603, - "logits/rejected": -1.3362939357757568, - "logps/chosen": -232.037109375, - "logps/rejected": -357.0254211425781, - "loss": 0.4289, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.7716169357299805, - "rewards/margins": 1.2683614492416382, - "rewards/rejected": -3.039978504180908, + "epoch": 1.7970365265334252, + "grad_norm": 9.884859085083008, + "learning_rate": 3.1045651131066886e-09, + "logits/chosen": -2.5237693786621094, + "logits/rejected": -2.5072269439697266, + "logps/chosen": -113.82945251464844, + "logps/rejected": -126.14137268066406, + "loss": 0.6464, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.58958899974823, + "rewards/margins": 0.14127321541309357, + "rewards/rejected": -0.7308622598648071, "step": 10430 }, { - "epoch": 1.8, - "grad_norm": 27.574863749179777, - "learning_rate": 1.526310252575222e-08, - "logits/chosen": -1.4324071407318115, - "logits/rejected": -1.380183219909668, - "logps/chosen": -236.1354217529297, - "logps/rejected": -360.7831115722656, - "loss": 0.4444, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.7827268838882446, - "rewards/margins": 1.2522282600402832, - "rewards/rejected": -3.0349552631378174, + "epoch": 1.7987594762232941, + "grad_norm": 11.38304615020752, + "learning_rate": 3.0526205051504437e-09, + "logits/chosen": -2.54156494140625, + "logits/rejected": -2.532452344894409, + "logps/chosen": -121.62736511230469, + "logps/rejected": -132.5194549560547, + "loss": 0.6554, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6375734210014343, + "rewards/margins": 0.11469124257564545, + "rewards/rejected": -0.752264678478241, "step": 10440 }, { - "epoch": 1.8, - "grad_norm": 34.86665149520393, - "learning_rate": 1.500550222980923e-08, - "logits/chosen": -1.3539096117019653, - "logits/rejected": -1.3092883825302124, - "logps/chosen": -238.3639678955078, - "logps/rejected": -363.4995422363281, - "loss": 0.4144, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.8473567962646484, - "rewards/margins": 1.211411476135254, - "rewards/rejected": -3.0587682723999023, + "epoch": 1.8004824259131633, + "grad_norm": 9.76710033416748, + "learning_rate": 3.001100445961846e-09, + "logits/chosen": -2.485647439956665, + "logits/rejected": -2.4800970554351807, + "logps/chosen": -114.85188293457031, + "logps/rejected": -139.90597534179688, + "loss": 0.6126, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6122986078262329, + "rewards/margins": 0.2103799283504486, + "rewards/rejected": -0.8226785659790039, "step": 10450 }, { - "epoch": 1.8, - "grad_norm": 32.44019457079582, - "learning_rate": 1.4750027007203653e-08, - "logits/chosen": -1.3425335884094238, - "logits/rejected": -1.2749333381652832, - "logps/chosen": -229.01583862304688, - "logps/rejected": -354.47314453125, - "loss": 0.423, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.7443774938583374, - "rewards/margins": 1.2702993154525757, - "rewards/rejected": -3.014676809310913, + "epoch": 1.8022053756030325, + "grad_norm": 10.32025146484375, + "learning_rate": 2.9500054014407307e-09, + "logits/chosen": -2.47493052482605, + "logits/rejected": -2.449911594390869, + "logps/chosen": -115.2611312866211, + "logps/rejected": -131.6233367919922, + "loss": 0.6262, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6068614721298218, + "rewards/margins": 0.17909392714500427, + "rewards/rejected": -0.7859554290771484, "step": 10460 }, { - "epoch": 1.8, - "grad_norm": 57.913238056684506, - "learning_rate": 1.4496679168217646e-08, - "logits/chosen": -1.1968591213226318, - "logits/rejected": -1.1376616954803467, - "logps/chosen": -241.31689453125, - "logps/rejected": -365.15399169921875, - "loss": 0.4734, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.8941717147827148, - "rewards/margins": 1.2496927976608276, - "rewards/rejected": -3.143864393234253, + "epoch": 1.8039283252929015, + "grad_norm": 13.728989601135254, + "learning_rate": 2.899335833643529e-09, + "logits/chosen": -2.3405070304870605, + "logits/rejected": -2.3186278343200684, + "logps/chosen": -110.76844787597656, + "logps/rejected": -126.5798568725586, + "loss": 0.636, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5886200666427612, + "rewards/margins": 0.16937750577926636, + "rewards/rejected": -0.7579976320266724, "step": 10470 }, { - "epoch": 1.81, - "grad_norm": 26.776034814613833, - "learning_rate": 1.4245461003895232e-08, - "logits/chosen": -1.3491319417953491, - "logits/rejected": -1.2737585306167603, - "logps/chosen": -229.96826171875, - "logps/rejected": -374.3705749511719, - "loss": 0.4425, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.740369439125061, - "rewards/margins": 1.4796626567840576, - "rewards/rejected": -3.220032215118408, + "epoch": 1.8056512749827704, + "grad_norm": 9.618743896484375, + "learning_rate": 2.849092200779046e-09, + "logits/chosen": -2.47365140914917, + "logits/rejected": -2.440391778945923, + "logps/chosen": -115.9509048461914, + "logps/rejected": -134.37042236328125, + "loss": 0.6136, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5998003482818604, + "rewards/margins": 0.21988093852996826, + "rewards/rejected": -0.8196811676025391, "step": 10480 }, { - "epoch": 1.81, - "grad_norm": 27.83311511088631, - "learning_rate": 1.3996374786021642e-08, - "logits/chosen": -1.3122376203536987, - "logits/rejected": -1.241257905960083, - "logps/chosen": -222.9138641357422, - "logps/rejected": -362.92169189453125, - "loss": 0.4151, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.691291093826294, - "rewards/margins": 1.4125699996948242, - "rewards/rejected": -3.1038613319396973, + "epoch": 1.8073742246726394, + "grad_norm": 9.683095932006836, + "learning_rate": 2.7992749572043282e-09, + "logits/chosen": -2.468331813812256, + "logits/rejected": -2.441288471221924, + "logps/chosen": -109.4289321899414, + "logps/rejected": -126.5555419921875, + "loss": 0.6245, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5561849474906921, + "rewards/margins": 0.18384456634521484, + "rewards/rejected": -0.7400294542312622, "step": 10490 }, { - "epoch": 1.81, - "grad_norm": 34.16101354006198, - "learning_rate": 1.3749422767102698e-08, - "logits/chosen": -1.3112982511520386, - "logits/rejected": -1.2444483041763306, - "logps/chosen": -234.7496337890625, - "logps/rejected": -384.12420654296875, - "loss": 0.424, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.820643424987793, - "rewards/margins": 1.4697020053863525, - "rewards/rejected": -3.2903454303741455, + "epoch": 1.8090971743625086, + "grad_norm": 11.747886657714844, + "learning_rate": 2.7498845534205393e-09, + "logits/chosen": -2.4546990394592285, + "logits/rejected": -2.4392428398132324, + "logps/chosen": -115.9336929321289, + "logps/rejected": -139.72801208496094, + "loss": 0.6178, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6323421597480774, + "rewards/margins": 0.2138226330280304, + "rewards/rejected": -0.8461647033691406, "step": 10500 }, { - "epoch": 1.81, - "eval_logits/chosen": -1.411332368850708, - "eval_logits/rejected": -1.3842883110046387, - "eval_logps/chosen": -246.45022583007812, - "eval_logps/rejected": -297.5495300292969, - "eval_loss": 0.6351029276847839, - "eval_rewards/accuracies": 0.6598513126373291, - "eval_rewards/chosen": -1.8774638175964355, - "eval_rewards/margins": 0.46645745635032654, - "eval_rewards/rejected": -2.343921184539795, - "eval_runtime": 356.9292, - "eval_samples_per_second": 12.058, - "eval_steps_per_second": 1.507, + "epoch": 1.8090971743625086, + "eval_logits/chosen": -2.5598411560058594, + "eval_logits/rejected": -2.553055763244629, + "eval_logps/chosen": -108.27570343017578, + "eval_logps/rejected": -123.08312225341797, + "eval_loss": 0.6549313068389893, + "eval_rewards/accuracies": 0.6212825179100037, + "eval_rewards/chosen": -0.49563807249069214, + "eval_rewards/margins": 0.10339171439409256, + "eval_rewards/rejected": -0.5990298986434937, + "eval_runtime": 360.0185, + "eval_samples_per_second": 11.955, + "eval_steps_per_second": 1.494, "step": 10500 }, { - "epoch": 1.81, - "grad_norm": 36.69075640202483, - "learning_rate": 1.3504607180344463e-08, - "logits/chosen": -1.3228697776794434, - "logits/rejected": -1.2543773651123047, - "logps/chosen": -228.5086669921875, - "logps/rejected": -364.3096923828125, - "loss": 0.415, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.7290817499160767, - "rewards/margins": 1.3821680545806885, - "rewards/rejected": -3.1112494468688965, + "epoch": 1.8108201240523778, + "grad_norm": 10.245859146118164, + "learning_rate": 2.7009214360688924e-09, + "logits/chosen": -2.453220844268799, + "logits/rejected": -2.4300804138183594, + "logps/chosen": -115.05948638916016, + "logps/rejected": -132.7764434814453, + "loss": 0.6191, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5947138071060181, + "rewards/margins": 0.20130130648612976, + "rewards/rejected": -0.7960150241851807, "step": 10510 }, { - "epoch": 1.81, - "grad_norm": 46.69963744425329, - "learning_rate": 1.3261930239633261e-08, - "logits/chosen": -1.4053773880004883, - "logits/rejected": -1.3577836751937866, - "logps/chosen": -221.27139282226562, - "logps/rejected": -369.21697998046875, - "loss": 0.4035, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.7077957391738892, - "rewards/margins": 1.4389338493347168, - "rewards/rejected": -3.1467297077178955, + "epoch": 1.8125430737422468, + "grad_norm": 10.916816711425781, + "learning_rate": 2.6523860479266525e-09, + "logits/chosen": -2.513354778289795, + "logits/rejected": -2.513002872467041, + "logps/chosen": -110.2159423828125, + "logps/rejected": -138.3251190185547, + "loss": 0.6045, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5973041653633118, + "rewards/margins": 0.24054870009422302, + "rewards/rejected": -0.8378528356552124, "step": 10520 }, { - "epoch": 1.81, - "grad_norm": 35.672432590977046, - "learning_rate": 1.3021394139515197e-08, - "logits/chosen": -1.2971795797348022, - "logits/rejected": -1.2416961193084717, - "logps/chosen": -233.1261444091797, - "logps/rejected": -350.49969482421875, - "loss": 0.4311, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.7978441715240479, - "rewards/margins": 1.185850977897644, - "rewards/rejected": -2.9836955070495605, + "epoch": 1.8142660234321157, + "grad_norm": 11.534632682800293, + "learning_rate": 2.6042788279030392e-09, + "logits/chosen": -2.423563003540039, + "logits/rejected": -2.4029486179351807, + "logps/chosen": -116.3944091796875, + "logps/rejected": -129.7278594970703, + "loss": 0.6443, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6305351853370667, + "rewards/margins": 0.14548929035663605, + "rewards/rejected": -0.7760244011878967, "step": 10530 }, { - "epoch": 1.82, - "grad_norm": 25.051590405247744, - "learning_rate": 1.2783001055176907e-08, - "logits/chosen": -1.2430702447891235, - "logits/rejected": -1.1768423318862915, - "logps/chosen": -227.86788940429688, - "logps/rejected": -368.8069152832031, - "loss": 0.3771, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.7555568218231201, - "rewards/margins": 1.4385837316513062, - "rewards/rejected": -3.194140672683716, + "epoch": 1.8159889731219847, + "grad_norm": 9.460017204284668, + "learning_rate": 2.556600211035381e-09, + "logits/chosen": -2.393198251724243, + "logits/rejected": -2.3695731163024902, + "logps/chosen": -113.17286682128906, + "logps/rejected": -128.18748474121094, + "loss": 0.6299, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6084216833114624, + "rewards/margins": 0.17950792610645294, + "rewards/rejected": -0.7879296541213989, "step": 10540 }, { - "epoch": 1.82, - "grad_norm": 48.523638164139314, - "learning_rate": 1.2546753142425315e-08, - "logits/chosen": -1.4035842418670654, - "logits/rejected": -1.3484973907470703, - "logps/chosen": -241.98580932617188, - "logps/rejected": -391.2903137207031, - "loss": 0.3919, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.8824710845947266, - "rewards/margins": 1.472904920578003, - "rewards/rejected": -3.3553764820098877, + "epoch": 1.817711922811854, + "grad_norm": 12.474471092224121, + "learning_rate": 2.509350628485063e-09, + "logits/chosen": -2.560568332672119, + "logits/rejected": -2.5556082725524902, + "logps/chosen": -116.18548583984375, + "logps/rejected": -137.69532775878906, + "loss": 0.6243, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.624345600605011, + "rewards/margins": 0.19479550421237946, + "rewards/rejected": -0.8191410899162292, "step": 10550 }, { - "epoch": 1.82, - "grad_norm": 31.996639068258908, - "learning_rate": 1.2312652537668499e-08, - "logits/chosen": -1.2917983531951904, - "logits/rejected": -1.2220714092254639, - "logps/chosen": -226.38491821289062, - "logps/rejected": -382.4876403808594, - "loss": 0.4064, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.7078090906143188, - "rewards/margins": 1.5710179805755615, - "rewards/rejected": -3.278826951980591, + "epoch": 1.819434872501723, + "grad_norm": 14.021306037902832, + "learning_rate": 2.4625305075337e-09, + "logits/chosen": -2.423170328140259, + "logits/rejected": -2.4037978649139404, + "logps/chosen": -116.9132080078125, + "logps/rejected": -134.77378845214844, + "loss": 0.6298, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6128100156784058, + "rewards/margins": 0.18871495127677917, + "rewards/rejected": -0.8015249371528625, "step": 10560 }, { - "epoch": 1.82, - "grad_norm": 39.94758110559314, - "learning_rate": 1.2080701357896267e-08, - "logits/chosen": -1.361748456954956, - "logits/rejected": -1.3035575151443481, - "logps/chosen": -242.5690460205078, - "logps/rejected": -395.84844970703125, - "loss": 0.3614, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.866334319114685, - "rewards/margins": 1.5249965190887451, - "rewards/rejected": -3.3913307189941406, + "epoch": 1.821157822191592, + "grad_norm": 10.682099342346191, + "learning_rate": 2.4161402715792533e-09, + "logits/chosen": -2.498974561691284, + "logits/rejected": -2.482527732849121, + "logps/chosen": -117.31404876708984, + "logps/rejected": -138.8473358154297, + "loss": 0.6149, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6136723756790161, + "rewards/margins": 0.2071593701839447, + "rewards/rejected": -0.8208317756652832, "step": 10570 }, { - "epoch": 1.82, - "grad_norm": 34.466730835739696, - "learning_rate": 1.185090170066097e-08, - "logits/chosen": -1.3756376504898071, - "logits/rejected": -1.310418963432312, - "logps/chosen": -224.53564453125, - "logps/rejected": -363.5881652832031, - "loss": 0.4041, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.711679220199585, - "rewards/margins": 1.3960368633270264, - "rewards/rejected": -3.1077163219451904, + "epoch": 1.822880771881461, + "grad_norm": 9.404667854309082, + "learning_rate": 2.370180340132194e-09, + "logits/chosen": -2.5023789405822754, + "logits/rejected": -2.4861600399017334, + "logps/chosen": -112.813232421875, + "logps/rejected": -131.14877319335938, + "loss": 0.6206, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5945123434066772, + "rewards/margins": 0.18883763253688812, + "rewards/rejected": -0.7833499908447266, "step": 10580 }, { - "epoch": 1.82, - "grad_norm": 25.78656181126498, - "learning_rate": 1.1623255644058638e-08, - "logits/chosen": -1.3188053369522095, - "logits/rejected": -1.2425676584243774, - "logps/chosen": -226.51016235351562, - "logps/rejected": -357.7701721191406, - "loss": 0.4113, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.7062441110610962, - "rewards/margins": 1.3722339868545532, - "rewards/rejected": -3.0784783363342285, + "epoch": 1.82460372157133, + "grad_norm": 8.862701416015625, + "learning_rate": 2.3246511288117274e-09, + "logits/chosen": -2.449619770050049, + "logits/rejected": -2.413877010345459, + "logps/chosen": -119.61332702636719, + "logps/rejected": -129.75070190429688, + "loss": 0.6371, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6370880603790283, + "rewards/margins": 0.16107021272182465, + "rewards/rejected": -0.7981582880020142, "step": 10590 }, { - "epoch": 1.83, - "grad_norm": 34.38272700142523, - "learning_rate": 1.1397765246710072e-08, - "logits/chosen": -1.3734514713287354, - "logits/rejected": -1.3211066722869873, - "logps/chosen": -227.74270629882812, - "logps/rejected": -366.80364990234375, - "loss": 0.4396, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.7487207651138306, - "rewards/margins": 1.372809648513794, - "rewards/rejected": -3.121530294418335, + "epoch": 1.8263266712611992, + "grad_norm": 10.304811477661133, + "learning_rate": 2.2795530493420144e-09, + "logits/chosen": -2.528219699859619, + "logits/rejected": -2.5179848670959473, + "logps/chosen": -112.78125, + "logps/rejected": -130.56149291992188, + "loss": 0.6403, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5989866852760315, + "rewards/margins": 0.15974877774715424, + "rewards/rejected": -0.7587353587150574, "step": 10600 }, { - "epoch": 1.83, - "eval_logits/chosen": -1.4146060943603516, - "eval_logits/rejected": -1.3876330852508545, - "eval_logps/chosen": -246.19650268554688, - "eval_logps/rejected": -297.2034912109375, - "eval_loss": 0.6350103616714478, - "eval_rewards/accuracies": 0.6568308472633362, - "eval_rewards/chosen": -1.8749263286590576, - "eval_rewards/margins": 0.4655349552631378, - "eval_rewards/rejected": -2.340461254119873, - "eval_runtime": 357.4305, - "eval_samples_per_second": 12.042, - "eval_steps_per_second": 1.505, + "epoch": 1.8263266712611992, + "eval_logits/chosen": -2.5594441890716553, + "eval_logits/rejected": -2.5526580810546875, + "eval_logps/chosen": -108.38090515136719, + "eval_logps/rejected": -123.14496612548828, + "eval_loss": 0.6551013588905334, + "eval_rewards/accuracies": 0.6203531622886658, + "eval_rewards/chosen": -0.49669015407562256, + "eval_rewards/margins": 0.10295825451612473, + "eval_rewards/rejected": -0.5996482968330383, + "eval_runtime": 359.9344, + "eval_samples_per_second": 11.958, + "eval_steps_per_second": 1.495, "step": 10600 }, { - "epoch": 1.83, - "grad_norm": 36.19363313018661, - "learning_rate": 1.1174432547742308e-08, - "logits/chosen": -1.3016915321350098, - "logits/rejected": -1.2503132820129395, - "logps/chosen": -239.76791381835938, - "logps/rejected": -367.1284484863281, - "loss": 0.4453, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.8869917392730713, - "rewards/margins": 1.2658240795135498, - "rewards/rejected": -3.152815580368042, + "epoch": 1.8280496209510684, + "grad_norm": 11.823186874389648, + "learning_rate": 2.2348865095484614e-09, + "logits/chosen": -2.435786485671997, + "logits/rejected": -2.4266715049743652, + "logps/chosen": -113.290771484375, + "logps/rejected": -131.25927734375, + "loss": 0.6309, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6222672462463379, + "rewards/margins": 0.17166352272033691, + "rewards/rejected": -0.7939307689666748, "step": 10610 }, { - "epoch": 1.83, - "grad_norm": 41.29882501819702, - "learning_rate": 1.095325956677015e-08, - "logits/chosen": -1.2536303997039795, - "logits/rejected": -1.1866223812103271, - "logps/chosen": -238.30874633789062, - "logps/rejected": -374.2566833496094, - "loss": 0.4032, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.8451499938964844, - "rewards/margins": 1.374023675918579, - "rewards/rejected": -3.2191734313964844, + "epoch": 1.8297725706409373, + "grad_norm": 14.5044527053833, + "learning_rate": 2.19065191335403e-09, + "logits/chosen": -2.3871350288391113, + "logits/rejected": -2.3654909133911133, + "logps/chosen": -117.86210632324219, + "logps/rejected": -133.91995239257812, + "loss": 0.6323, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6405826807022095, + "rewards/margins": 0.17532998323440552, + "rewards/rejected": -0.8159125447273254, "step": 10620 }, { - "epoch": 1.83, - "grad_norm": 22.72297730303355, - "learning_rate": 1.0734248303877813e-08, - "logits/chosen": -1.3555238246917725, - "logits/rejected": -1.2904984951019287, - "logps/chosen": -231.20010375976562, - "logps/rejected": -360.0879211425781, - "loss": 0.4694, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.7696985006332397, - "rewards/margins": 1.2896592617034912, - "rewards/rejected": -3.0593578815460205, + "epoch": 1.8314955203308063, + "grad_norm": 8.305620193481445, + "learning_rate": 2.1468496607755625e-09, + "logits/chosen": -2.469578981399536, + "logits/rejected": -2.450049877166748, + "logps/chosen": -114.07830810546875, + "logps/rejected": -135.2241973876953, + "loss": 0.6219, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5981978178024292, + "rewards/margins": 0.2123533934354782, + "rewards/rejected": -0.8105511665344238, "step": 10630 }, { - "epoch": 1.83, - "grad_norm": 34.92246125378767, - "learning_rate": 1.051740073960114e-08, - "logits/chosen": -1.340003252029419, - "logits/rejected": -1.2763203382492065, - "logps/chosen": -236.17941284179688, - "logps/rejected": -368.7391052246094, - "loss": 0.4742, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.8113651275634766, - "rewards/margins": 1.350304365158081, - "rewards/rejected": -3.1616694927215576, + "epoch": 1.8332184700206753, + "grad_norm": 11.741875648498535, + "learning_rate": 2.103480147920228e-09, + "logits/chosen": -2.4531044960021973, + "logits/rejected": -2.431525468826294, + "logps/chosen": -113.56483459472656, + "logps/rejected": -131.6667938232422, + "loss": 0.6196, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5849887728691101, + "rewards/margins": 0.2058919370174408, + "rewards/rejected": -0.7908806204795837, "step": 10640 }, { - "epoch": 1.83, - "grad_norm": 26.61094450928108, - "learning_rate": 1.0302718834909213e-08, - "logits/chosen": -1.3977024555206299, - "logits/rejected": -1.3331856727600098, - "logps/chosen": -238.84042358398438, - "logps/rejected": -391.08477783203125, - "loss": 0.4176, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -1.8521630764007568, - "rewards/margins": 1.520591139793396, - "rewards/rejected": -3.3727545738220215, + "epoch": 1.8349414197105445, + "grad_norm": 12.1553373336792, + "learning_rate": 2.0605437669818426e-09, + "logits/chosen": -2.5188522338867188, + "logits/rejected": -2.498983144760132, + "logps/chosen": -115.35491943359375, + "logps/rejected": -139.0282745361328, + "loss": 0.6116, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.617334246635437, + "rewards/margins": 0.23477241396903992, + "rewards/rejected": -0.8521067500114441, "step": 10650 }, { - "epoch": 1.84, - "grad_norm": 33.27724860143072, - "learning_rate": 1.0090204531187168e-08, - "logits/chosen": -1.2856873273849487, - "logits/rejected": -1.2244932651519775, - "logps/chosen": -238.58139038085938, - "logps/rejected": -374.8933410644531, - "loss": 0.4089, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.839249849319458, - "rewards/margins": 1.358032464981079, - "rewards/rejected": -3.197282314300537, + "epoch": 1.8366643694004137, + "grad_norm": 10.504712104797363, + "learning_rate": 2.0180409062374336e-09, + "logits/chosen": -2.4251739978790283, + "logits/rejected": -2.4132907390594482, + "logps/chosen": -120.6828384399414, + "logps/rejected": -132.45217895507812, + "loss": 0.6581, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.65984708070755, + "rewards/margins": 0.1128741055727005, + "rewards/rejected": -0.7727211713790894, "step": 10660 }, { - "epoch": 1.84, - "grad_norm": 44.68715039350637, - "learning_rate": 9.8798597502181e-09, - "logits/chosen": -1.3030592203140259, - "logits/rejected": -1.2441256046295166, - "logps/chosen": -246.3955841064453, - "logps/rejected": -373.30462646484375, - "loss": 0.4701, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.9426281452178955, - "rewards/margins": 1.2749965190887451, - "rewards/rejected": -3.2176246643066406, + "epoch": 1.8383873190902826, + "grad_norm": 9.784821510314941, + "learning_rate": 1.97597195004362e-09, + "logits/chosen": -2.4410600662231445, + "logits/rejected": -2.4224324226379395, + "logps/chosen": -117.0425033569336, + "logps/rejected": -135.34555053710938, + "loss": 0.6278, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6490435004234314, + "rewards/margins": 0.1887730062007904, + "rewards/rejected": -0.8378164172172546, "step": 10670 }, { - "epoch": 1.84, - "grad_norm": 26.493605739622634, - "learning_rate": 9.671686394166156e-09, - "logits/chosen": -1.3562123775482178, - "logits/rejected": -1.2756215333938599, - "logps/chosen": -221.86074829101562, - "logps/rejected": -360.75762939453125, - "loss": 0.3826, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.6560554504394531, - "rewards/margins": 1.4439318180084229, - "rewards/rejected": -3.099987506866455, + "epoch": 1.8401102687801516, + "grad_norm": 10.796539306640625, + "learning_rate": 1.934337278833231e-09, + "logits/chosen": -2.48850417137146, + "logits/rejected": -2.4493298530578613, + "logps/chosen": -116.40985107421875, + "logps/rejected": -126.93431091308594, + "loss": 0.6371, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6012075543403625, + "rewards/margins": 0.1603269726037979, + "rewards/rejected": -0.7615344524383545, "step": 10680 }, { - "epoch": 1.84, - "grad_norm": 30.331480121505034, - "learning_rate": 9.465686345558944e-09, - "logits/chosen": -1.3282508850097656, - "logits/rejected": -1.2702124118804932, - "logps/chosen": -226.7456817626953, - "logps/rejected": -377.0152282714844, - "loss": 0.4397, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.7300522327423096, - "rewards/margins": 1.4764258861541748, - "rewards/rejected": -3.2064781188964844, + "epoch": 1.8418332184700206, + "grad_norm": 10.557440757751465, + "learning_rate": 1.8931372691117887e-09, + "logits/chosen": -2.460437774658203, + "logits/rejected": -2.4529013633728027, + "logps/chosen": -114.23712158203125, + "logps/rejected": -138.14364624023438, + "loss": 0.6154, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6048712730407715, + "rewards/margins": 0.21279887855052948, + "rewards/rejected": -0.8176702260971069, "step": 10690 }, { - "epoch": 1.84, - "grad_norm": 37.77942293381782, - "learning_rate": 9.261861467270787e-09, - "logits/chosen": -1.3761959075927734, - "logits/rejected": -1.3014271259307861, - "logps/chosen": -222.33877563476562, - "logps/rejected": -349.0968322753906, - "loss": 0.3908, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.6618179082870483, - "rewards/margins": 1.3258897066116333, - "rewards/rejected": -2.9877076148986816, + "epoch": 1.8435561681598898, + "grad_norm": 12.950105667114258, + "learning_rate": 1.8523722934541575e-09, + "logits/chosen": -2.517003059387207, + "logits/rejected": -2.479825496673584, + "logps/chosen": -116.20123291015625, + "logps/rejected": -126.58357238769531, + "loss": 0.6341, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6003366708755493, + "rewards/margins": 0.1621071845293045, + "rewards/rejected": -0.7624439001083374, "step": 10700 }, { - "epoch": 1.84, - "eval_logits/chosen": -1.4212182760238647, - "eval_logits/rejected": -1.3943599462509155, - "eval_logps/chosen": -243.04237365722656, - "eval_logps/rejected": -293.6067810058594, - "eval_loss": 0.6334287524223328, - "eval_rewards/accuracies": 0.6563661694526672, - "eval_rewards/chosen": -1.8433852195739746, - "eval_rewards/margins": 0.46110865473747253, - "eval_rewards/rejected": -2.3044939041137695, - "eval_runtime": 356.8228, - "eval_samples_per_second": 12.062, - "eval_steps_per_second": 1.508, + "epoch": 1.8435561681598898, + "eval_logits/chosen": -2.5590357780456543, + "eval_logits/rejected": -2.5522735118865967, + "eval_logps/chosen": -108.35945892333984, + "eval_logps/rejected": -123.14959716796875, + "eval_loss": 0.6550170183181763, + "eval_rewards/accuracies": 0.6205855011940002, + "eval_rewards/chosen": -0.4964757263660431, + "eval_rewards/margins": 0.10321904718875885, + "eval_rewards/rejected": -0.5996947884559631, + "eval_runtime": 360.261, + "eval_samples_per_second": 11.947, + "eval_steps_per_second": 1.493, "step": 10700 }, { - "epoch": 1.85, - "grad_norm": 54.759408136897285, - "learning_rate": 9.060213602505778e-09, - "logits/chosen": -1.304149866104126, - "logits/rejected": -1.239341139793396, - "logps/chosen": -225.63449096679688, - "logps/rejected": -350.32049560546875, - "loss": 0.4472, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.7219674587249756, - "rewards/margins": 1.2871848344802856, - "rewards/rejected": -3.0091521739959717, + "epoch": 1.8452791178497587, + "grad_norm": 10.944367408752441, + "learning_rate": 1.8120427205011556e-09, + "logits/chosen": -2.4294328689575195, + "logits/rejected": -2.4029085636138916, + "logps/chosen": -113.1238784790039, + "logps/rejected": -125.20323181152344, + "loss": 0.6357, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5967923402786255, + "rewards/margins": 0.16086360812187195, + "rewards/rejected": -0.757655918598175, "step": 10710 }, { - "epoch": 1.85, - "grad_norm": 57.3062622613245, - "learning_rate": 8.860744574781032e-09, - "logits/chosen": -1.338438630104065, - "logits/rejected": -1.269768476486206, - "logps/chosen": -236.2624053955078, - "logps/rejected": -356.3697509765625, - "loss": 0.4911, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.8369505405426025, - "rewards/margins": 1.204301118850708, - "rewards/rejected": -3.0412516593933105, + "epoch": 1.847002067539628, + "grad_norm": 14.83804988861084, + "learning_rate": 1.7721489149562063e-09, + "logits/chosen": -2.453725814819336, + "logits/rejected": -2.4256491661071777, + "logps/chosen": -117.40971374511719, + "logps/rejected": -132.19723510742188, + "loss": 0.6463, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6482272148132324, + "rewards/margins": 0.15093299746513367, + "rewards/rejected": -0.7991601228713989, "step": 10720 }, { - "epoch": 1.85, - "grad_norm": 28.412674272208022, - "learning_rate": 8.663456187910422e-09, - "logits/chosen": -1.4263569116592407, - "logits/rejected": -1.3510792255401611, - "logps/chosen": -229.66720581054688, - "logps/rejected": -364.2427673339844, - "loss": 0.3543, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -1.7573401927947998, - "rewards/margins": 1.3864922523498535, - "rewards/rejected": -3.1438326835632324, + "epoch": 1.848725017229497, + "grad_norm": 10.805709838867188, + "learning_rate": 1.7326912375820846e-09, + "logits/chosen": -2.5418336391448975, + "logits/rejected": -2.5095081329345703, + "logps/chosen": -113.35749816894531, + "logps/rejected": -127.86077880859375, + "loss": 0.6213, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5939838290214539, + "rewards/margins": 0.1858406960964203, + "rewards/rejected": -0.7798245549201965, "step": 10730 }, { - "epoch": 1.85, - "grad_norm": 41.706917137582344, - "learning_rate": 8.468350225987908e-09, - "logits/chosen": -1.2837555408477783, - "logits/rejected": -1.222891926765442, - "logps/chosen": -246.54638671875, - "logps/rejected": -371.4312744140625, - "loss": 0.4706, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.9186769723892212, - "rewards/margins": 1.2438628673553467, - "rewards/rejected": -3.1625399589538574, + "epoch": 1.8504479669193659, + "grad_norm": 13.219216346740723, + "learning_rate": 1.6936700451975817e-09, + "logits/chosen": -2.4094748497009277, + "logits/rejected": -2.397881031036377, + "logps/chosen": -118.57723236083984, + "logps/rejected": -133.8645782470703, + "loss": 0.6393, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6388689279556274, + "rewards/margins": 0.14827385544776917, + "rewards/rejected": -0.7871428728103638, "step": 10740 }, { - "epoch": 1.85, - "grad_norm": 31.262809555566417, - "learning_rate": 8.275428453371813e-09, - "logits/chosen": -1.248228907585144, - "logits/rejected": -1.1745529174804688, - "logps/chosen": -237.2993621826172, - "logps/rejected": -385.1238098144531, - "loss": 0.4173, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.8524090051651, - "rewards/margins": 1.4723732471466064, - "rewards/rejected": -3.324782609939575, + "epoch": 1.852170916609235, + "grad_norm": 9.872903823852539, + "learning_rate": 1.6550856906743627e-09, + "logits/chosen": -2.4142613410949707, + "logits/rejected": -2.386833667755127, + "logps/chosen": -111.84124755859375, + "logps/rejected": -135.484130859375, + "loss": 0.6086, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.5978298187255859, + "rewards/margins": 0.23036417365074158, + "rewards/rejected": -0.8281939625740051, "step": 10750 }, { - "epoch": 1.85, - "grad_norm": 35.16663063428534, - "learning_rate": 8.084692614668542e-09, - "logits/chosen": -1.3016769886016846, - "logits/rejected": -1.248867392539978, - "logps/chosen": -224.76113891601562, - "logps/rejected": -346.5380859375, - "loss": 0.4184, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -1.7105858325958252, - "rewards/margins": 1.22708261013031, - "rewards/rejected": -2.937668561935425, + "epoch": 1.853893866299104, + "grad_norm": 11.279659271240234, + "learning_rate": 1.6169385229337086e-09, + "logits/chosen": -2.4235472679138184, + "logits/rejected": -2.4074952602386475, + "logps/chosen": -113.22697448730469, + "logps/rejected": -127.94795227050781, + "loss": 0.6393, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5954617261886597, + "rewards/margins": 0.15615740418434143, + "rewards/rejected": -0.7516191601753235, "step": 10760 }, { - "epoch": 1.86, - "grad_norm": 31.86195155798675, - "learning_rate": 7.89614443471695e-09, - "logits/chosen": -1.3076452016830444, - "logits/rejected": -1.2525701522827148, - "logps/chosen": -217.1156463623047, - "logps/rejected": -352.6279296875, - "loss": 0.3762, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.6004784107208252, - "rewards/margins": 1.3801997900009155, - "rewards/rejected": -2.9806783199310303, + "epoch": 1.8556168159889732, + "grad_norm": 10.54137134552002, + "learning_rate": 1.5792288869433902e-09, + "logits/chosen": -2.4071712493896484, + "logits/rejected": -2.3937089443206787, + "logps/chosen": -116.57283020019531, + "logps/rejected": -132.6007537841797, + "loss": 0.627, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5950114130973816, + "rewards/margins": 0.1851048767566681, + "rewards/rejected": -0.7801163196563721, "step": 10770 }, { - "epoch": 1.86, - "grad_norm": 25.654760643763883, - "learning_rate": 7.7097856185728e-09, - "logits/chosen": -1.418269395828247, - "logits/rejected": -1.3548392057418823, - "logps/chosen": -222.4542999267578, - "logps/rejected": -363.5215148925781, - "loss": 0.4058, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.6930938959121704, - "rewards/margins": 1.3915579319000244, - "rewards/rejected": -3.084651470184326, + "epoch": 1.8573397656788422, + "grad_norm": 9.987702369689941, + "learning_rate": 1.5419571237145601e-09, + "logits/chosen": -2.5344901084899902, + "logits/rejected": -2.5173232555389404, + "logps/chosen": -114.1861343383789, + "logps/rejected": -132.4864959716797, + "loss": 0.6373, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.610164999961853, + "rewards/margins": 0.16384710371494293, + "rewards/rejected": -0.7740120887756348, "step": 10780 }, { - "epoch": 1.86, - "grad_norm": 40.803942439334094, - "learning_rate": 7.525617851493166e-09, - "logits/chosen": -1.4488575458526611, - "logits/rejected": -1.3801032304763794, - "logps/chosen": -207.01974487304688, - "logps/rejected": -345.1376647949219, - "loss": 0.3895, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.5470707416534424, - "rewards/margins": 1.3702001571655273, - "rewards/rejected": -2.917271137237549, + "epoch": 1.8590627153687111, + "grad_norm": 11.684725761413574, + "learning_rate": 1.5051235702986331e-09, + "logits/chosen": -2.561619758605957, + "logits/rejected": -2.5328845977783203, + "logps/chosen": -107.4722671508789, + "logps/rejected": -129.6492462158203, + "loss": 0.6156, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5513117909431458, + "rewards/margins": 0.21112540364265442, + "rewards/rejected": -0.7624371647834778, "step": 10790 }, { - "epoch": 1.86, - "grad_norm": 22.89870220316856, - "learning_rate": 7.343642798921384e-09, - "logits/chosen": -1.4306409358978271, - "logits/rejected": -1.3741027116775513, - "logps/chosen": -220.6624755859375, - "logps/rejected": -359.7532043457031, - "loss": 0.4273, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.6872755289077759, - "rewards/margins": 1.371055006980896, - "rewards/rejected": -3.058330774307251, + "epoch": 1.8607856650585803, + "grad_norm": 10.52690315246582, + "learning_rate": 1.4687285597842768e-09, + "logits/chosen": -2.5493738651275635, + "logits/rejected": -2.5295369625091553, + "logps/chosen": -111.5053482055664, + "logps/rejected": -131.95693969726562, + "loss": 0.627, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5954598784446716, + "rewards/margins": 0.18490615487098694, + "rewards/rejected": -0.7803661227226257, "step": 10800 }, { - "epoch": 1.86, - "eval_logits/chosen": -1.419447422027588, - "eval_logits/rejected": -1.3925857543945312, - "eval_logps/chosen": -244.09783935546875, - "eval_logps/rejected": -294.6657409667969, - "eval_loss": 0.6341846585273743, - "eval_rewards/accuracies": 0.6624070405960083, - "eval_rewards/chosen": -1.8539396524429321, - "eval_rewards/margins": 0.4611437916755676, - "eval_rewards/rejected": -2.3150837421417236, - "eval_runtime": 357.4896, - "eval_samples_per_second": 12.04, - "eval_steps_per_second": 1.505, + "epoch": 1.8607856650585803, + "eval_logits/chosen": -2.558943510055542, + "eval_logits/rejected": -2.5521273612976074, + "eval_logps/chosen": -108.42156982421875, + "eval_logps/rejected": -123.24092102050781, + "eval_loss": 0.6549394726753235, + "eval_rewards/accuracies": 0.6210501790046692, + "eval_rewards/chosen": -0.4970967471599579, + "eval_rewards/margins": 0.10351123660802841, + "eval_rewards/rejected": -0.6006080508232117, + "eval_runtime": 360.1737, + "eval_samples_per_second": 11.95, + "eval_steps_per_second": 1.494, "step": 10800 }, { - "epoch": 1.86, - "grad_norm": 24.19588838169064, - "learning_rate": 7.1638621064718516e-09, - "logits/chosen": -1.373991847038269, - "logits/rejected": -1.2945966720581055, - "logps/chosen": -227.33596801757812, - "logps/rejected": -364.17059326171875, - "loss": 0.3831, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.699730634689331, - "rewards/margins": 1.4409074783325195, - "rewards/rejected": -3.140638589859009, + "epoch": 1.8625086147484493, + "grad_norm": 9.803145408630371, + "learning_rate": 1.4327724212943704e-09, + "logits/chosen": -2.4846789836883545, + "logits/rejected": -2.4454684257507324, + "logps/chosen": -119.3404769897461, + "logps/rejected": -129.8537139892578, + "loss": 0.63, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.619606614112854, + "rewards/margins": 0.1778271496295929, + "rewards/rejected": -0.7974337339401245, "step": 10810 }, { - "epoch": 1.86, - "grad_norm": 50.27316401128035, - "learning_rate": 6.986277399915197e-09, - "logits/chosen": -1.2879558801651, - "logits/rejected": -1.2268191576004028, - "logps/chosen": -209.937744140625, - "logps/rejected": -343.2181701660156, - "loss": 0.4086, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.5688936710357666, - "rewards/margins": 1.349342703819275, - "rewards/rejected": -2.918236255645752, + "epoch": 1.8642315644383185, + "grad_norm": 9.98331356048584, + "learning_rate": 1.3972554799830394e-09, + "logits/chosen": -2.4303512573242188, + "logits/rejected": -2.408212184906006, + "logps/chosen": -108.08897399902344, + "logps/rejected": -125.14892578125, + "loss": 0.6255, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5499401092529297, + "rewards/margins": 0.18750813603401184, + "rewards/rejected": -0.7374482154846191, "step": 10820 }, { - "epoch": 1.87, - "grad_norm": 50.904093818767734, - "learning_rate": 6.8108902851636285e-09, - "logits/chosen": -1.3199676275253296, - "logits/rejected": -1.2508794069290161, - "logps/chosen": -232.073486328125, - "logps/rejected": -368.0849914550781, - "loss": 0.399, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.8025493621826172, - "rewards/margins": 1.3453489542007446, - "rewards/rejected": -3.147897958755493, + "epoch": 1.8659545141281875, + "grad_norm": 13.596193313598633, + "learning_rate": 1.3621780570327257e-09, + "logits/chosen": -2.451287031173706, + "logits/rejected": -2.426119327545166, + "logps/chosen": -112.04255676269531, + "logps/rejected": -130.3465576171875, + "loss": 0.6321, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6022640466690063, + "rewards/margins": 0.1681433916091919, + "rewards/rejected": -0.7704073190689087, "step": 10830 }, { - "epoch": 1.87, - "grad_norm": 27.336450230863502, - "learning_rate": 6.637702348256308e-09, - "logits/chosen": -1.3644187450408936, - "logits/rejected": -1.307117223739624, - "logps/chosen": -227.4879150390625, - "logps/rejected": -349.18182373046875, - "loss": 0.4494, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.74507737159729, - "rewards/margins": 1.237743854522705, - "rewards/rejected": -2.982821226119995, + "epoch": 1.8676774638180564, + "grad_norm": 10.210439682006836, + "learning_rate": 1.3275404696512615e-09, + "logits/chosen": -2.4836678504943848, + "logits/rejected": -2.4577431678771973, + "logps/chosen": -111.8627700805664, + "logps/rejected": -125.012451171875, + "loss": 0.6429, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5887194275856018, + "rewards/margins": 0.15233774483203888, + "rewards/rejected": -0.7410570979118347, "step": 10840 }, { - "epoch": 1.87, - "grad_norm": 33.93760320273209, - "learning_rate": 6.466715155345109e-09, - "logits/chosen": -1.2493406534194946, - "logits/rejected": -1.199512243270874, - "logps/chosen": -229.41708374023438, - "logps/rejected": -345.4873046875, - "loss": 0.4573, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.7922929525375366, - "rewards/margins": 1.118058443069458, - "rewards/rejected": -2.910351276397705, + "epoch": 1.8694004135079254, + "grad_norm": 10.683871269226074, + "learning_rate": 1.2933430310690218e-09, + "logits/chosen": -2.4083244800567627, + "logits/rejected": -2.3927559852600098, + "logps/chosen": -107.12113952636719, + "logps/rejected": -130.18138122558594, + "loss": 0.6245, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5696035623550415, + "rewards/margins": 0.18765828013420105, + "rewards/rejected": -0.7572618722915649, "step": 10850 }, { - "epoch": 1.87, - "grad_norm": 31.190541875354757, - "learning_rate": 6.2979302526803006e-09, - "logits/chosen": -1.4172183275222778, - "logits/rejected": -1.3366795778274536, - "logps/chosen": -223.34884643554688, - "logps/rejected": -358.8973693847656, - "loss": 0.4234, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.6810195446014404, - "rewards/margins": 1.3840689659118652, - "rewards/rejected": -3.0650882720947266, + "epoch": 1.8711233631977946, + "grad_norm": 12.726272583007812, + "learning_rate": 1.25958605053606e-09, + "logits/chosen": -2.5296437740325928, + "logits/rejected": -2.498725414276123, + "logps/chosen": -116.60414123535156, + "logps/rejected": -130.74502563476562, + "loss": 0.636, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6134532690048218, + "rewards/margins": 0.1698613315820694, + "rewards/rejected": -0.78331458568573, "step": 10860 }, { - "epoch": 1.87, - "grad_norm": 19.263570637063424, - "learning_rate": 6.131349166596883e-09, - "logits/chosen": -1.2681770324707031, - "logits/rejected": -1.21076500415802, - "logps/chosen": -206.9633026123047, - "logps/rejected": -360.4692077636719, - "loss": 0.4066, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.5650874376296997, - "rewards/margins": 1.4790990352630615, - "rewards/rejected": -3.0441863536834717, + "epoch": 1.8728463128876638, + "grad_norm": 8.664299011230469, + "learning_rate": 1.2262698333193766e-09, + "logits/chosen": -2.3817386627197266, + "logits/rejected": -2.3768210411071777, + "logps/chosen": -107.44478607177734, + "logps/rejected": -135.75704956054688, + "loss": 0.6121, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.56959068775177, + "rewards/margins": 0.22736403346061707, + "rewards/rejected": -0.7969546914100647, "step": 10870 }, { - "epoch": 1.87, - "grad_norm": 33.21398609679741, - "learning_rate": 5.966973403500303e-09, - "logits/chosen": -1.3271772861480713, - "logits/rejected": -1.2587413787841797, - "logps/chosen": -232.65988159179688, - "logps/rejected": -375.89111328125, - "loss": 0.3756, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.7844676971435547, - "rewards/margins": 1.4478579759597778, - "rewards/rejected": -3.232325315475464, + "epoch": 1.8745692625775328, + "grad_norm": 11.063324928283691, + "learning_rate": 1.1933946807000606e-09, + "logits/chosen": -2.458096981048584, + "logits/rejected": -2.439481735229492, + "logps/chosen": -112.50048828125, + "logps/rejected": -130.33523559570312, + "loss": 0.6182, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5829136967658997, + "rewards/margins": 0.19349297881126404, + "rewards/rejected": -0.7764066457748413, "step": 10880 }, { - "epoch": 1.88, - "grad_norm": 37.56955046527683, - "learning_rate": 5.804804449853401e-09, - "logits/chosen": -1.3854949474334717, - "logits/rejected": -1.3297450542449951, - "logps/chosen": -223.22830200195312, - "logps/rejected": -355.37030029296875, - "loss": 0.4186, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.7010080814361572, - "rewards/margins": 1.30489182472229, - "rewards/rejected": -3.0058999061584473, + "epoch": 1.8762922122674017, + "grad_norm": 9.031937599182129, + "learning_rate": 1.1609608899706803e-09, + "logits/chosen": -2.5011849403381348, + "logits/rejected": -2.483492374420166, + "logps/chosen": -113.2860107421875, + "logps/rejected": -133.9983367919922, + "loss": 0.6233, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6012560129165649, + "rewards/margins": 0.1904800534248352, + "rewards/rejected": -0.7917360067367554, "step": 10890 }, { - "epoch": 1.88, - "grad_norm": 37.057241152716664, - "learning_rate": 5.644843772162372e-09, - "logits/chosen": -1.434251308441162, - "logits/rejected": -1.3531643152236938, - "logps/chosen": -212.3564453125, - "logps/rejected": -349.568603515625, - "loss": 0.3762, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -1.5736942291259766, - "rewards/margins": 1.4162404537200928, - "rewards/rejected": -2.9899344444274902, + "epoch": 1.8780151619572707, + "grad_norm": 10.209151268005371, + "learning_rate": 1.1289687544324745e-09, + "logits/chosen": -2.5554192066192627, + "logits/rejected": -2.5241541862487793, + "logps/chosen": -112.42668151855469, + "logps/rejected": -124.28495788574219, + "loss": 0.6335, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5743435621261597, + "rewards/margins": 0.16246747970581055, + "rewards/rejected": -0.7368109822273254, "step": 10900 }, { - "epoch": 1.88, - "eval_logits/chosen": -1.4172533750534058, - "eval_logits/rejected": -1.3904321193695068, - "eval_logps/chosen": -244.67039489746094, - "eval_logps/rejected": -295.28729248046875, - "eval_loss": 0.6345546245574951, - "eval_rewards/accuracies": 0.6565985083580017, - "eval_rewards/chosen": -1.8596652746200562, - "eval_rewards/margins": 0.4616338312625885, - "eval_rewards/rejected": -2.3212990760803223, - "eval_runtime": 357.2508, - "eval_samples_per_second": 12.048, - "eval_steps_per_second": 1.506, + "epoch": 1.8780151619572707, + "eval_logits/chosen": -2.559028148651123, + "eval_logits/rejected": -2.5522618293762207, + "eval_logps/chosen": -108.45635986328125, + "eval_logps/rejected": -123.27279663085938, + "eval_loss": 0.654963493347168, + "eval_rewards/accuracies": 0.6201208233833313, + "eval_rewards/chosen": -0.49744468927383423, + "eval_rewards/margins": 0.1034821942448616, + "eval_rewards/rejected": -0.6009268760681152, + "eval_runtime": 360.2674, + "eval_samples_per_second": 11.947, + "eval_steps_per_second": 1.493, "step": 10900 }, { - "epoch": 1.88, - "grad_norm": 37.413849937107756, - "learning_rate": 5.487092816963995e-09, - "logits/chosen": -1.3338580131530762, - "logits/rejected": -1.2663573026657104, - "logps/chosen": -217.49801635742188, - "logps/rejected": -344.0356750488281, - "loss": 0.4173, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.6183340549468994, - "rewards/margins": 1.2802609205245972, - "rewards/rejected": -2.898594856262207, + "epoch": 1.8797381116471399, + "grad_norm": 11.109238624572754, + "learning_rate": 1.097418563392799e-09, + "logits/chosen": -2.4565396308898926, + "logits/rejected": -2.427241086959839, + "logps/chosen": -112.29621887207031, + "logps/rejected": -128.05030822753906, + "loss": 0.63, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5662978887557983, + "rewards/margins": 0.1724037230014801, + "rewards/rejected": -0.7387016415596008, "step": 10910 }, { - "epoch": 1.88, - "grad_norm": 42.748889303543294, - "learning_rate": 5.331553010812312e-09, - "logits/chosen": -1.3081706762313843, - "logits/rejected": -1.2376407384872437, - "logps/chosen": -229.73422241210938, - "logps/rejected": -366.51495361328125, - "loss": 0.3763, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.7389205694198608, - "rewards/margins": 1.3969751596450806, - "rewards/rejected": -3.1358957290649414, + "epoch": 1.881461061337009, + "grad_norm": 12.130841255187988, + "learning_rate": 1.0663106021624623e-09, + "logits/chosen": -2.428583860397339, + "logits/rejected": -2.4066367149353027, + "logps/chosen": -117.6747817993164, + "logps/rejected": -132.8242950439453, + "loss": 0.6256, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6183713674545288, + "rewards/margins": 0.18066565692424774, + "rewards/rejected": -0.799036979675293, "step": 10920 }, { - "epoch": 1.88, - "grad_norm": 29.30790533708006, - "learning_rate": 5.1782257602657756e-09, - "logits/chosen": -1.257922887802124, - "logits/rejected": -1.1985923051834106, - "logps/chosen": -233.80294799804688, - "logps/rejected": -352.77984619140625, - "loss": 0.4355, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.795925498008728, - "rewards/margins": 1.217332363128662, - "rewards/rejected": -3.0132579803466797, + "epoch": 1.883184011026878, + "grad_norm": 12.480398178100586, + "learning_rate": 1.035645152053155e-09, + "logits/chosen": -2.393200635910034, + "logits/rejected": -2.3660690784454346, + "logps/chosen": -116.7625503540039, + "logps/rejected": -127.45570373535156, + "loss": 0.648, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.6255054473876953, + "rewards/margins": 0.1341763436794281, + "rewards/rejected": -0.759681761264801, "step": 10930 }, { - "epoch": 1.88, - "grad_norm": 54.99655407488525, - "learning_rate": 5.027112451874483e-09, - "logits/chosen": -1.2420815229415894, - "logits/rejected": -1.1881957054138184, - "logps/chosen": -241.045166015625, - "logps/rejected": -369.63568115234375, - "loss": 0.4248, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.8460403680801392, - "rewards/margins": 1.2986847162246704, - "rewards/rejected": -3.1447253227233887, + "epoch": 1.884906960716747, + "grad_norm": 12.46471118927002, + "learning_rate": 1.0054224903748964e-09, + "logits/chosen": -2.362339735031128, + "logits/rejected": -2.3466224670410156, + "logps/chosen": -119.80097961425781, + "logps/rejected": -132.3865509033203, + "loss": 0.6437, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6334177851676941, + "rewards/margins": 0.13872984051704407, + "rewards/rejected": -0.772147536277771, "step": 10940 }, { - "epoch": 1.89, - "grad_norm": 45.54073464601158, - "learning_rate": 4.878214452167739e-09, - "logits/chosen": -1.3072357177734375, - "logits/rejected": -1.2408037185668945, - "logps/chosen": -238.0092315673828, - "logps/rejected": -381.5384826660156, - "loss": 0.3866, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.834251046180725, - "rewards/margins": 1.4345420598983765, - "rewards/rejected": -3.2687935829162598, + "epoch": 1.886629910406616, + "grad_norm": 14.05789852142334, + "learning_rate": 9.75642890433548e-10, + "logits/chosen": -2.4196391105651855, + "logits/rejected": -2.4031875133514404, + "logps/chosen": -118.99332427978516, + "logps/rejected": -133.8544464111328, + "loss": 0.6424, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6438688039779663, + "rewards/margins": 0.1477302461862564, + "rewards/rejected": -0.7915989756584167, "step": 10950 }, { - "epoch": 1.89, - "grad_norm": 30.095545129155003, - "learning_rate": 4.7315331076416275e-09, - "logits/chosen": -1.3551833629608154, - "logits/rejected": -1.293670654296875, - "logps/chosen": -236.3451385498047, - "logps/rejected": -368.11126708984375, - "loss": 0.4367, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.8255088329315186, - "rewards/margins": 1.3299205303192139, - "rewards/rejected": -3.1554293632507324, + "epoch": 1.8883528600964852, + "grad_norm": 13.794063568115234, + "learning_rate": 9.463066215283254e-10, + "logits/chosen": -2.477527618408203, + "logits/rejected": -2.456815719604492, + "logps/chosen": -116.08634948730469, + "logps/rejected": -136.8908233642578, + "loss": 0.6227, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6227591037750244, + "rewards/margins": 0.22041161358356476, + "rewards/rejected": -0.8431707620620728, "step": 10960 }, { - "epoch": 1.89, - "grad_norm": 45.41850107802761, - "learning_rate": 4.587069744746791e-09, - "logits/chosen": -1.3423527479171753, - "logits/rejected": -1.2766934633255005, - "logps/chosen": -233.58682250976562, - "logps/rejected": -358.17694091796875, - "loss": 0.4907, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.7673285007476807, - "rewards/margins": 1.2734668254852295, - "rewards/rejected": -3.04079532623291, + "epoch": 1.8900758097863544, + "grad_norm": 11.207356452941895, + "learning_rate": 9.174139489493582e-10, + "logits/chosen": -2.4887819290161133, + "logits/rejected": -2.4592578411102295, + "logps/chosen": -117.4792709350586, + "logps/rejected": -134.54624938964844, + "loss": 0.6263, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.606163501739502, + "rewards/margins": 0.19818837940692902, + "rewards/rejected": -0.8043519258499146, "step": 10970 }, { - "epoch": 1.89, - "grad_norm": 24.319102255891416, - "learning_rate": 4.44482566987664e-09, - "logits/chosen": -1.3505980968475342, - "logits/rejected": -1.2955100536346436, - "logps/chosen": -245.52920532226562, - "logps/rejected": -382.1690368652344, - "loss": 0.4328, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.9095827341079712, - "rewards/margins": 1.3507945537567139, - "rewards/rejected": -3.2603771686553955, + "epoch": 1.8917987594762233, + "grad_norm": 12.16650676727295, + "learning_rate": 8.889651339753279e-10, + "logits/chosen": -2.4826903343200684, + "logits/rejected": -2.46773099899292, + "logps/chosen": -119.41780853271484, + "logps/rejected": -141.7852325439453, + "loss": 0.6275, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6484467387199402, + "rewards/margins": 0.20761927962303162, + "rewards/rejected": -0.8560660481452942, "step": 10980 }, { - "epoch": 1.89, - "grad_norm": 28.168547715893087, - "learning_rate": 4.304802169355221e-09, - "logits/chosen": -1.2861645221710205, - "logits/rejected": -1.22446608543396, - "logps/chosen": -218.29025268554688, - "logps/rejected": -352.78509521484375, - "loss": 0.4101, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.6582218408584595, - "rewards/margins": 1.3449079990386963, - "rewards/rejected": -3.0031299591064453, + "epoch": 1.8935217091660923, + "grad_norm": 14.031679153442383, + "learning_rate": 8.609604338710441e-10, + "logits/chosen": -2.4016706943511963, + "logits/rejected": -2.389274835586548, + "logps/chosen": -112.11262512207031, + "logps/rejected": -130.65145874023438, + "loss": 0.6284, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5962507128715515, + "rewards/margins": 0.18530747294425964, + "rewards/rejected": -0.7815582752227783, "step": 10990 }, { - "epoch": 1.9, - "grad_norm": 39.270052414600016, - "learning_rate": 4.167000509425811e-09, - "logits/chosen": -1.4578297138214111, - "logits/rejected": -1.4120782613754272, - "logps/chosen": -241.8314208984375, - "logps/rejected": -367.2518005371094, - "loss": 0.4734, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.8894517421722412, - "rewards/margins": 1.2133595943450928, - "rewards/rejected": -3.102811336517334, + "epoch": 1.8952446588559613, + "grad_norm": 11.361660957336426, + "learning_rate": 8.334001018851622e-10, + "logits/chosen": -2.5866708755493164, + "logits/rejected": -2.5802624225616455, + "logps/chosen": -116.65419006347656, + "logps/rejected": -140.32632446289062, + "loss": 0.6262, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6376508474349976, + "rewards/margins": 0.1957683116197586, + "rewards/rejected": -0.833419144153595, "step": 11000 }, { - "epoch": 1.9, - "eval_logits/chosen": -1.4190027713775635, - "eval_logits/rejected": -1.392012357711792, - "eval_logps/chosen": -243.87950134277344, - "eval_logps/rejected": -294.5247802734375, - "eval_loss": 0.6338525414466858, - "eval_rewards/accuracies": 0.6628717184066772, - "eval_rewards/chosen": -1.8517564535140991, - "eval_rewards/margins": 0.4619174599647522, - "eval_rewards/rejected": -2.313674211502075, - "eval_runtime": 357.5201, - "eval_samples_per_second": 12.038, - "eval_steps_per_second": 1.505, + "epoch": 1.8952446588559613, + "eval_logits/chosen": -2.558842897415161, + "eval_logits/rejected": -2.5520331859588623, + "eval_logps/chosen": -108.41845703125, + "eval_logps/rejected": -123.21259307861328, + "eval_loss": 0.6550112366676331, + "eval_rewards/accuracies": 0.6201208233833313, + "eval_rewards/chosen": -0.49706554412841797, + "eval_rewards/margins": 0.10325910896062851, + "eval_rewards/rejected": -0.6003247499465942, + "eval_runtime": 359.7213, + "eval_samples_per_second": 11.965, + "eval_steps_per_second": 1.496, "step": 11000 }, { - "epoch": 1.9, - "grad_norm": 33.16784261064953, - "learning_rate": 4.03142193623951e-09, - "logits/chosen": -1.3858684301376343, - "logits/rejected": -1.305426001548767, - "logps/chosen": -235.13864135742188, - "logps/rejected": -388.290771484375, - "loss": 0.3688, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.8195550441741943, - "rewards/margins": 1.5488157272338867, - "rewards/rejected": -3.368370771408081, + "epoch": 1.8969676085458305, + "grad_norm": 9.476881980895996, + "learning_rate": 8.062843872479019e-10, + "logits/chosen": -2.513068675994873, + "logits/rejected": -2.4836723804473877, + "logps/chosen": -110.36959075927734, + "logps/rejected": -131.13397216796875, + "loss": 0.6105, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5716327428817749, + "rewards/margins": 0.2250044345855713, + "rewards/rejected": -0.7966371774673462, "step": 11010 }, { - "epoch": 1.9, - "grad_norm": 19.279870183301984, - "learning_rate": 3.898067675843747e-09, - "logits/chosen": -1.435046911239624, - "logits/rejected": -1.3683885335922241, - "logps/chosen": -224.517578125, - "logps/rejected": -371.97283935546875, - "loss": 0.3641, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.6953538656234741, - "rewards/margins": 1.4940444231033325, - "rewards/rejected": -3.1893982887268066, + "epoch": 1.8986905582356997, + "grad_norm": 11.814251899719238, + "learning_rate": 7.796135351687494e-10, + "logits/chosen": -2.524017572402954, + "logits/rejected": -2.506251573562622, + "logps/chosen": -115.6934814453125, + "logps/rejected": -130.05941772460938, + "loss": 0.6333, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6068946123123169, + "rewards/margins": 0.16329768300056458, + "rewards/rejected": -0.7701922655105591, "step": 11020 }, { - "epoch": 1.9, - "grad_norm": 26.617574306250333, - "learning_rate": 3.766938934171348e-09, - "logits/chosen": -1.3704140186309814, - "logits/rejected": -1.3155790567398071, - "logps/chosen": -236.8312225341797, - "logps/rejected": -383.66754150390625, - "loss": 0.4191, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.8204883337020874, - "rewards/margins": 1.4713939428329468, - "rewards/rejected": -3.2918827533721924, + "epoch": 1.9004135079255686, + "grad_norm": 11.158601760864258, + "learning_rate": 7.533877868342698e-10, + "logits/chosen": -2.496063709259033, + "logits/rejected": -2.4887490272521973, + "logps/chosen": -118.58612060546875, + "logps/rejected": -133.88473510742188, + "loss": 0.6387, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6379790306091309, + "rewards/margins": 0.1559731513261795, + "rewards/rejected": -0.7939521670341492, "step": 11030 }, { - "epoch": 1.9, - "grad_norm": 32.25461324792829, - "learning_rate": 3.6380368970296836e-09, - "logits/chosen": -1.4112730026245117, - "logits/rejected": -1.351285696029663, - "logps/chosen": -239.5612030029297, - "logps/rejected": -364.81146240234375, - "loss": 0.4416, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.831099271774292, - "rewards/margins": 1.2617604732513428, - "rewards/rejected": -3.0928597450256348, + "epoch": 1.9021364576154376, + "grad_norm": 8.867852210998535, + "learning_rate": 7.276073794059367e-10, + "logits/chosen": -2.5248653888702393, + "logits/rejected": -2.5092899799346924, + "logps/chosen": -120.84211730957031, + "logps/rejected": -136.56988525390625, + "loss": 0.6361, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6437533497810364, + "rewards/margins": 0.166532963514328, + "rewards/rejected": -0.8102862238883972, "step": 11040 }, { - "epoch": 1.9, - "grad_norm": 35.4050705396756, - "learning_rate": 3.5113627300897285e-09, - "logits/chosen": -1.310435175895691, - "logits/rejected": -1.2423975467681885, - "logps/chosen": -222.96923828125, - "logps/rejected": -379.7068786621094, - "loss": 0.3806, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.7100273370742798, - "rewards/margins": 1.534369707107544, - "rewards/rejected": -3.244396924972534, + "epoch": 1.9038594073053066, + "grad_norm": 10.516443252563477, + "learning_rate": 7.022725460179457e-10, + "logits/chosen": -2.4529166221618652, + "logits/rejected": -2.4327664375305176, + "logps/chosen": -107.2201919555664, + "logps/rejected": -132.77243041992188, + "loss": 0.6088, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5525605082511902, + "rewards/margins": 0.2225067913532257, + "rewards/rejected": -0.7750672101974487, "step": 11050 }, { - "epoch": 1.91, - "grad_norm": 29.633120494533653, - "learning_rate": 3.38691757887577e-09, - "logits/chosen": -1.3639460802078247, - "logits/rejected": -1.271759271621704, - "logps/chosen": -234.57968139648438, - "logps/rejected": -376.5106506347656, - "loss": 0.4119, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.8031505346298218, - "rewards/margins": 1.444676399230957, - "rewards/rejected": -3.2478268146514893, + "epoch": 1.9055823569951758, + "grad_norm": 12.075571060180664, + "learning_rate": 6.77383515775154e-10, + "logits/chosen": -2.488499879837036, + "logits/rejected": -2.447962999343872, + "logps/chosen": -115.3367919921875, + "logps/rejected": -134.97335815429688, + "loss": 0.6115, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6106191873550415, + "rewards/margins": 0.22174124419689178, + "rewards/rejected": -0.8323603868484497, "step": 11060 }, { - "epoch": 1.91, - "grad_norm": 27.809450621921286, - "learning_rate": 3.2647025687549122e-09, - "logits/chosen": -1.3753821849822998, - "logits/rejected": -1.2898151874542236, - "logps/chosen": -224.57437133789062, - "logps/rejected": -369.9324645996094, - "loss": 0.4233, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.714037537574768, - "rewards/margins": 1.4897834062576294, - "rewards/rejected": -3.2038207054138184, + "epoch": 1.907305306685045, + "grad_norm": 11.85994815826416, + "learning_rate": 6.529405137509824e-10, + "logits/chosen": -2.5081701278686523, + "logits/rejected": -2.4635863304138184, + "logps/chosen": -109.13908386230469, + "logps/rejected": -131.86538696289062, + "loss": 0.5922, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5596362352371216, + "rewards/margins": 0.26336470246315, + "rewards/rejected": -0.8230009078979492, "step": 11070 }, { - "epoch": 1.91, - "grad_norm": 25.21379163567548, - "learning_rate": 3.144718804926866e-09, - "logits/chosen": -1.3679758310317993, - "logits/rejected": -1.3044774532318115, - "logps/chosen": -238.9508819580078, - "logps/rejected": -373.56060791015625, - "loss": 0.417, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.8344926834106445, - "rewards/margins": 1.3762904405593872, - "rewards/rejected": -3.210782527923584, + "epoch": 1.909028256374914, + "grad_norm": 13.558547973632812, + "learning_rate": 6.289437609853731e-10, + "logits/chosen": -2.4891202449798584, + "logits/rejected": -2.4640064239501953, + "logps/chosen": -118.3816909790039, + "logps/rejected": -136.5568084716797, + "loss": 0.6141, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.628782331943512, + "rewards/margins": 0.21184420585632324, + "rewards/rejected": -0.84062659740448, "step": 11080 }, { - "epoch": 1.91, - "grad_norm": 38.50752589918087, - "learning_rate": 3.0269673724140356e-09, - "logits/chosen": -1.3562889099121094, - "logits/rejected": -1.297836184501648, - "logps/chosen": -233.75924682617188, - "logps/rejected": -358.478271484375, - "loss": 0.4123, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.7592127323150635, - "rewards/margins": 1.2987269163131714, - "rewards/rejected": -3.0579395294189453, + "epoch": 1.9107512060647829, + "grad_norm": 12.248273849487305, + "learning_rate": 6.053934744828071e-10, + "logits/chosen": -2.465141773223877, + "logits/rejected": -2.4455647468566895, + "logps/chosen": -122.22418212890625, + "logps/rejected": -132.9927215576172, + "loss": 0.6372, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6440066695213318, + "rewards/margins": 0.15865349769592285, + "rewards/rejected": -0.8026601672172546, "step": 11090 }, { - "epoch": 1.91, - "grad_norm": 34.59360161848991, - "learning_rate": 2.9114493360517245e-09, - "logits/chosen": -1.2469245195388794, - "logits/rejected": -1.1905173063278198, - "logps/chosen": -209.7041015625, - "logps/rejected": -341.9212646484375, - "loss": 0.4333, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.5883010625839233, - "rewards/margins": 1.2910573482513428, - "rewards/rejected": -2.8793585300445557, + "epoch": 1.9124741557546519, + "grad_norm": 10.838395118713379, + "learning_rate": 5.822898672103449e-10, + "logits/chosen": -2.3649260997772217, + "logits/rejected": -2.3505358695983887, + "logps/chosen": -107.90660095214844, + "logps/rejected": -127.75433349609375, + "loss": 0.6311, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5702854990959167, + "rewards/margins": 0.16702958941459656, + "rewards/rejected": -0.7373150587081909, "step": 11100 }, { - "epoch": 1.91, - "eval_logits/chosen": -1.4189980030059814, - "eval_logits/rejected": -1.3920680284500122, - "eval_logps/chosen": -244.1648712158203, - "eval_logps/rejected": -294.9982604980469, - "eval_loss": 0.6333078145980835, - "eval_rewards/accuracies": 0.6598513126373291, - "eval_rewards/chosen": -1.8546103239059448, - "eval_rewards/margins": 0.46379825472831726, - "eval_rewards/rejected": -2.318408489227295, - "eval_runtime": 357.2276, - "eval_samples_per_second": 12.048, - "eval_steps_per_second": 1.506, + "epoch": 1.9124741557546519, + "eval_logits/chosen": -2.558917760848999, + "eval_logits/rejected": -2.552147150039673, + "eval_logps/chosen": -108.42533874511719, + "eval_logps/rejected": -123.26876068115234, + "eval_loss": 0.6548120975494385, + "eval_rewards/accuracies": 0.6210501790046692, + "eval_rewards/chosen": -0.49713435769081116, + "eval_rewards/margins": 0.10375203937292099, + "eval_rewards/rejected": -0.6008864045143127, + "eval_runtime": 359.348, + "eval_samples_per_second": 11.977, + "eval_steps_per_second": 1.497, "step": 11100 }, { - "epoch": 1.91, - "grad_norm": 15.157481934801936, - "learning_rate": 2.79816574047842e-09, - "logits/chosen": -1.3375742435455322, - "logits/rejected": -1.2590216398239136, - "logps/chosen": -242.087890625, - "logps/rejected": -421.6622619628906, - "loss": 0.3844, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.8821719884872437, - "rewards/margins": 1.799425721168518, - "rewards/rejected": -3.681597948074341, + "epoch": 1.914197105444521, + "grad_norm": 11.10058879852295, + "learning_rate": 5.59633148095684e-10, + "logits/chosen": -2.4862568378448486, + "logits/rejected": -2.4628608226776123, + "logps/chosen": -119.6026382446289, + "logps/rejected": -141.5771942138672, + "loss": 0.6134, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6570100784301758, + "rewards/margins": 0.22355251014232635, + "rewards/rejected": -0.8805624842643738, "step": 11110 }, { - "epoch": 1.92, - "grad_norm": 41.14442211032838, - "learning_rate": 2.6871176101263825e-09, - "logits/chosen": -1.4798122644424438, - "logits/rejected": -1.4134316444396973, - "logps/chosen": -232.623046875, - "logps/rejected": -359.533935546875, - "loss": 0.4216, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.730859398841858, - "rewards/margins": 1.3277714252471924, - "rewards/rejected": -3.05863094329834, + "epoch": 1.9159200551343902, + "grad_norm": 10.916470527648926, + "learning_rate": 5.374235220252765e-10, + "logits/chosen": -2.577754497528076, + "logits/rejected": -2.5522544384002686, + "logps/chosen": -123.24705505371094, + "logps/rejected": -133.11061096191406, + "loss": 0.6389, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6373213529586792, + "rewards/margins": 0.15703292191028595, + "rewards/rejected": -0.7943544387817383, "step": 11120 }, { - "epoch": 1.92, - "grad_norm": 41.710468519729744, - "learning_rate": 2.578305949212434e-09, - "logits/chosen": -1.273736834526062, - "logits/rejected": -1.2053143978118896, - "logps/chosen": -243.1206817626953, - "logps/rejected": -371.94866943359375, - "loss": 0.4025, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.8726370334625244, - "rewards/margins": 1.3150079250335693, - "rewards/rejected": -3.1876449584960938, + "epoch": 1.9176430048242592, + "grad_norm": 11.398584365844727, + "learning_rate": 5.156611898424867e-10, + "logits/chosen": -2.4134035110473633, + "logits/rejected": -2.391727924346924, + "logps/chosen": -122.10980224609375, + "logps/rejected": -132.87570190429688, + "loss": 0.6458, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6623373031616211, + "rewards/margins": 0.13456371426582336, + "rewards/rejected": -0.7969009876251221, "step": 11130 }, { - "epoch": 1.92, - "grad_norm": 41.66910869223657, - "learning_rate": 2.4717317417287942e-09, - "logits/chosen": -1.2594302892684937, - "logits/rejected": -1.1979601383209229, - "logps/chosen": -219.0839080810547, - "logps/rejected": -353.35504150390625, - "loss": 0.3744, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -1.6528394222259521, - "rewards/margins": 1.3649063110351562, - "rewards/rejected": -3.0177457332611084, + "epoch": 1.9193659545141282, + "grad_norm": 11.0700044631958, + "learning_rate": 4.943463483457588e-10, + "logits/chosen": -2.3942623138427734, + "logits/rejected": -2.3760738372802734, + "logps/chosen": -112.96749114990234, + "logps/rejected": -128.13754272460938, + "loss": 0.6285, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5915412306785583, + "rewards/margins": 0.17406094074249268, + "rewards/rejected": -0.765602171421051, "step": 11140 }, { - "epoch": 1.92, - "grad_norm": 38.68187084741804, - "learning_rate": 2.3673959514342314e-09, - "logits/chosen": -1.3535066843032837, - "logits/rejected": -1.3013697862625122, - "logps/chosen": -242.1443328857422, - "logps/rejected": -369.8888244628906, - "loss": 0.43, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.8408015966415405, - "rewards/margins": 1.2930688858032227, - "rewards/rejected": -3.1338706016540527, + "epoch": 1.9210889042039971, + "grad_norm": 12.193937301635742, + "learning_rate": 4.734791902868462e-10, + "logits/chosen": -2.4600093364715576, + "logits/rejected": -2.4479706287384033, + "logps/chosen": -122.27796936035156, + "logps/rejected": -134.85580444335938, + "loss": 0.6445, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.6418440937995911, + "rewards/margins": 0.14160865545272827, + "rewards/rejected": -0.7834526300430298, "step": 11150 }, { - "epoch": 1.92, - "grad_norm": 34.31089036359821, - "learning_rate": 2.2652995218452877e-09, - "logits/chosen": -1.4165607690811157, - "logits/rejected": -1.3578795194625854, - "logps/chosen": -216.8806610107422, - "logps/rejected": -338.1597900390625, - "loss": 0.4338, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.647953748703003, - "rewards/margins": 1.2238695621490479, - "rewards/rejected": -2.871823310852051, + "epoch": 1.9228118538938663, + "grad_norm": 10.338449478149414, + "learning_rate": 4.530599043690575e-10, + "logits/chosen": -2.5353057384490967, + "logits/rejected": -2.5182223320007324, + "logps/chosen": -111.48570251464844, + "logps/rejected": -126.96525573730469, + "loss": 0.6341, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5936704874038696, + "rewards/margins": 0.16582946479320526, + "rewards/rejected": -0.7594999074935913, "step": 11160 }, { - "epoch": 1.92, - "grad_norm": 32.24674833849174, - "learning_rate": 2.165443376227871e-09, - "logits/chosen": -1.2586653232574463, - "logits/rejected": -1.196004867553711, - "logps/chosen": -232.2088165283203, - "logps/rejected": -331.0061340332031, - "loss": 0.4821, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.750544786453247, - "rewards/margins": 1.0693135261535645, - "rewards/rejected": -2.8198580741882324, + "epoch": 1.9245348035837355, + "grad_norm": 9.943690299987793, + "learning_rate": 4.3308867524557425e-10, + "logits/chosen": -2.3883252143859863, + "logits/rejected": -2.3627078533172607, + "logps/chosen": -115.91829681396484, + "logps/rejected": -121.95379638671875, + "loss": 0.6436, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5875613689422607, + "rewards/margins": 0.14175362884998322, + "rewards/rejected": -0.7293149828910828, "step": 11170 }, { - "epoch": 1.93, - "grad_norm": 57.55683167173714, - "learning_rate": 2.0678284175887907e-09, - "logits/chosen": -1.4004487991333008, - "logits/rejected": -1.3373186588287354, - "logps/chosen": -235.8096466064453, - "logps/rejected": -374.24713134765625, - "loss": 0.3973, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.8187503814697266, - "rewards/margins": 1.4154729843139648, - "rewards/rejected": -3.2342236042022705, + "epoch": 1.9262577532736045, + "grad_norm": 9.826698303222656, + "learning_rate": 4.135656835177581e-10, + "logits/chosen": -2.5109176635742188, + "logits/rejected": -2.4892756938934326, + "logps/chosen": -118.7507095336914, + "logps/rejected": -133.6431121826172, + "loss": 0.6297, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6478487253189087, + "rewards/margins": 0.18034231662750244, + "rewards/rejected": -0.8281909227371216, "step": 11180 }, { - "epoch": 1.93, - "grad_norm": 25.781820157206223, - "learning_rate": 1.972455528667677e-09, - "logits/chosen": -1.3892544507980347, - "logits/rejected": -1.3081748485565186, - "logps/chosen": -220.63912963867188, - "logps/rejected": -372.5516052246094, - "loss": 0.3301, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -1.6577503681182861, - "rewards/margins": 1.551475167274475, - "rewards/rejected": -3.20922589302063, + "epoch": 1.9279807029634735, + "grad_norm": 11.667350769042969, + "learning_rate": 3.944911057335354e-10, + "logits/chosen": -2.505736827850342, + "logits/rejected": -2.477687120437622, + "logps/chosen": -114.22786712646484, + "logps/rejected": -133.70339965820312, + "loss": 0.6072, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5934600830078125, + "rewards/margins": 0.22718998789787292, + "rewards/rejected": -0.8206501007080078, "step": 11190 }, { - "epoch": 1.93, - "grad_norm": 33.77308243819322, - "learning_rate": 1.8793255719288246e-09, - "logits/chosen": -1.3923676013946533, - "logits/rejected": -1.3273457288742065, - "logps/chosen": -210.1555633544922, - "logps/rejected": -346.9501647949219, - "loss": 0.4305, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.5684833526611328, - "rewards/margins": 1.3637385368347168, - "rewards/rejected": -2.9322218894958496, + "epoch": 1.9297036526533424, + "grad_norm": 10.847187995910645, + "learning_rate": 3.7586511438576496e-10, + "logits/chosen": -2.5051229000091553, + "logits/rejected": -2.4858336448669434, + "logps/chosen": -112.1025161743164, + "logps/rejected": -130.5367431640625, + "loss": 0.6239, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5878943204879761, + "rewards/margins": 0.18000969290733337, + "rewards/rejected": -0.7679039835929871, "step": 11200 }, { - "epoch": 1.93, - "eval_logits/chosen": -1.42206609249115, - "eval_logits/rejected": -1.3953404426574707, - "eval_logps/chosen": -243.3866424560547, - "eval_logps/rejected": -293.89874267578125, - "eval_loss": 0.6334691643714905, - "eval_rewards/accuracies": 0.6563661694526672, - "eval_rewards/chosen": -1.8468278646469116, - "eval_rewards/margins": 0.46058568358421326, - "eval_rewards/rejected": -2.307413339614868, - "eval_runtime": 357.503, - "eval_samples_per_second": 12.039, - "eval_steps_per_second": 1.505, + "epoch": 1.9297036526533424, + "eval_logits/chosen": -2.5583388805389404, + "eval_logits/rejected": -2.5515522956848145, + "eval_logps/chosen": -108.4262924194336, + "eval_logps/rejected": -123.20613098144531, + "eval_loss": 0.6550823450088501, + "eval_rewards/accuracies": 0.6201208233833313, + "eval_rewards/chosen": -0.4971439838409424, + "eval_rewards/margins": 0.10311610996723175, + "eval_rewards/rejected": -0.6002600789070129, + "eval_runtime": 359.6901, + "eval_samples_per_second": 11.966, + "eval_steps_per_second": 1.496, "step": 11200 }, { - "epoch": 1.93, - "grad_norm": 46.567374288537444, - "learning_rate": 1.7884393895536697e-09, - "logits/chosen": -1.2399379014968872, - "logits/rejected": -1.1777968406677246, - "logps/chosen": -229.732666015625, - "logps/rejected": -374.9830017089844, - "loss": 0.4382, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.793482780456543, - "rewards/margins": 1.417189121246338, - "rewards/rejected": -3.2106716632843018, + "epoch": 1.9314266023432116, + "grad_norm": 9.750901222229004, + "learning_rate": 3.5768787791073394e-10, + "logits/chosen": -2.3721094131469727, + "logits/rejected": -2.3552279472351074, + "logps/chosen": -108.65545654296875, + "logps/rejected": -134.5958251953125, + "loss": 0.6131, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.5824468731880188, + "rewards/margins": 0.2240564525127411, + "rewards/rejected": -0.806503176689148, "step": 11210 }, { - "epoch": 1.93, - "grad_norm": 42.21368814153827, - "learning_rate": 1.6997978034329342e-09, - "logits/chosen": -1.3409579992294312, - "logits/rejected": -1.2815383672714233, - "logps/chosen": -217.97921752929688, - "logps/rejected": -357.2254943847656, - "loss": 0.4403, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.6595194339752197, - "rewards/margins": 1.3560142517089844, - "rewards/rejected": -3.015533447265625, + "epoch": 1.9331495520330806, + "grad_norm": 11.33436107635498, + "learning_rate": 3.3995956068658683e-10, + "logits/chosen": -2.465871572494507, + "logits/rejected": -2.4521288871765137, + "logps/chosen": -111.4020004272461, + "logps/rejected": -133.30909729003906, + "loss": 0.6324, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.5935233235359192, + "rewards/margins": 0.1827230155467987, + "rewards/rejected": -0.7762463688850403, "step": 11220 }, { - "epoch": 1.93, - "grad_norm": 39.048264387033385, - "learning_rate": 1.613401615159299e-09, - "logits/chosen": -1.3046488761901855, - "logits/rejected": -1.2500449419021606, - "logps/chosen": -243.40029907226562, - "logps/rejected": -376.0064697265625, - "loss": 0.4023, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.8764787912368774, - "rewards/margins": 1.3685901165008545, - "rewards/rejected": -3.2450687885284424, + "epoch": 1.9348725017229498, + "grad_norm": 10.559438705444336, + "learning_rate": 3.2268032303185977e-10, + "logits/chosen": -2.43168306350708, + "logits/rejected": -2.4194796085357666, + "logps/chosen": -124.14222717285156, + "logps/rejected": -130.67507934570312, + "loss": 0.6634, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6837325096130371, + "rewards/margins": 0.1077658161520958, + "rewards/rejected": -0.7914983630180359, "step": 11230 }, { - "epoch": 1.94, - "grad_norm": 23.205283737644205, - "learning_rate": 1.5292516060201599e-09, - "logits/chosen": -1.3030173778533936, - "logits/rejected": -1.2475535869598389, - "logps/chosen": -230.6564483642578, - "logps/rejected": -360.25225830078125, - "loss": 0.4305, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.7603286504745483, - "rewards/margins": 1.3079051971435547, - "rewards/rejected": -3.0682339668273926, + "epoch": 1.9365954514128187, + "grad_norm": 8.981484413146973, + "learning_rate": 3.0585032120403196e-10, + "logits/chosen": -2.4348931312561035, + "logits/rejected": -2.419318437576294, + "logps/chosen": -115.5736312866211, + "logps/rejected": -130.85311889648438, + "loss": 0.6338, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6092356443405151, + "rewards/margins": 0.16486208140850067, + "rewards/rejected": -0.774097740650177, "step": 11240 }, { - "epoch": 1.94, - "grad_norm": 27.435468950638114, - "learning_rate": 1.4473485369905224e-09, - "logits/chosen": -1.3240660429000854, - "logits/rejected": -1.261580228805542, - "logps/chosen": -228.3889923095703, - "logps/rejected": -360.775634765625, - "loss": 0.4024, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.7416374683380127, - "rewards/margins": 1.3298765420913696, - "rewards/rejected": -3.071514129638672, + "epoch": 1.9383184011026877, + "grad_norm": 11.513097763061523, + "learning_rate": 2.894697073981045e-10, + "logits/chosen": -2.4622929096221924, + "logits/rejected": -2.4382715225219727, + "logps/chosen": -112.7807846069336, + "logps/rejected": -133.18942260742188, + "loss": 0.6138, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5849259495735168, + "rewards/margins": 0.2107054740190506, + "rewards/rejected": -0.7956314086914062, "step": 11250 }, { - "epoch": 1.94, - "grad_norm": 19.467311006382324, - "learning_rate": 1.3676931487261456e-09, - "logits/chosen": -1.2779386043548584, - "logits/rejected": -1.209826111793518, - "logps/chosen": -221.6068115234375, - "logps/rejected": -340.8597717285156, - "loss": 0.4412, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.637843370437622, - "rewards/margins": 1.2554326057434082, - "rewards/rejected": -2.8932759761810303, + "epoch": 1.940041350792557, + "grad_norm": 10.974672317504883, + "learning_rate": 2.735386297452291e-10, + "logits/chosen": -2.4070940017700195, + "logits/rejected": -2.3761181831359863, + "logps/chosen": -117.33487701416016, + "logps/rejected": -126.53621673583984, + "loss": 0.6421, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.5950134992599487, + "rewards/margins": 0.1547488272190094, + "rewards/rejected": -0.7497623562812805, "step": 11260 }, { - "epoch": 1.94, - "grad_norm": 41.60730906780407, - "learning_rate": 1.2902861615568527e-09, - "logits/chosen": -1.3289337158203125, - "logits/rejected": -1.2553983926773071, - "logps/chosen": -224.650390625, - "logps/rejected": -358.2517395019531, - "loss": 0.391, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.6910244226455688, - "rewards/margins": 1.3813101053237915, - "rewards/rejected": -3.0723345279693604, + "epoch": 1.9417643004824259, + "grad_norm": 10.253097534179688, + "learning_rate": 2.5805723231137057e-10, + "logits/chosen": -2.4497439861297607, + "logits/rejected": -2.4220428466796875, + "logps/chosen": -114.47210693359375, + "logps/rejected": -130.89596557617188, + "loss": 0.6141, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5890557169914246, + "rewards/margins": 0.2095029354095459, + "rewards/rejected": -0.7985587120056152, "step": 11270 }, { - "epoch": 1.94, - "grad_norm": 32.1598075986716, - "learning_rate": 1.2151282754799542e-09, - "logits/chosen": -1.3617111444473267, - "logits/rejected": -1.293905258178711, - "logps/chosen": -228.1493377685547, - "logps/rejected": -348.77618408203125, - "loss": 0.4462, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.7183462381362915, - "rewards/margins": 1.2455885410308838, - "rewards/rejected": -2.9639346599578857, + "epoch": 1.943487250172295, + "grad_norm": 9.800058364868164, + "learning_rate": 2.430256550959908e-10, + "logits/chosen": -2.5011367797851562, + "logits/rejected": -2.473191738128662, + "logps/chosen": -116.41319274902344, + "logps/rejected": -127.36231994628906, + "loss": 0.6393, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6010106205940247, + "rewards/margins": 0.14850196242332458, + "rewards/rejected": -0.7495125532150269, "step": 11280 }, { - "epoch": 1.95, - "grad_norm": 36.803056421177146, - "learning_rate": 1.1422201701540567e-09, - "logits/chosen": -1.3835200071334839, - "logits/rejected": -1.3248523473739624, - "logps/chosen": -217.21005249023438, - "logps/rejected": -349.54583740234375, - "loss": 0.4033, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.5726218223571777, - "rewards/margins": 1.3615262508392334, - "rewards/rejected": -2.9341483116149902, + "epoch": 1.945210199862164, + "grad_norm": 10.676453590393066, + "learning_rate": 2.2844403403081137e-10, + "logits/chosen": -2.4754865169525146, + "logits/rejected": -2.4605138301849365, + "logps/chosen": -117.28421783447266, + "logps/rejected": -134.24063110351562, + "loss": 0.6134, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5732543468475342, + "rewards/margins": 0.20773577690124512, + "rewards/rejected": -0.7809900045394897, "step": 11290 }, { - "epoch": 1.95, - "grad_norm": 27.85129396845735, - "learning_rate": 1.0715625048927092e-09, - "logits/chosen": -1.309777021408081, - "logits/rejected": -1.2495863437652588, - "logps/chosen": -242.6230926513672, - "logps/rejected": -351.98077392578125, - "loss": 0.4817, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.8370163440704346, - "rewards/margins": 1.1458604335784912, - "rewards/rejected": -2.9828765392303467, + "epoch": 1.946933149552033, + "grad_norm": 10.65670108795166, + "learning_rate": 2.1431250097854182e-10, + "logits/chosen": -2.436732292175293, + "logits/rejected": -2.413374185562134, + "logps/chosen": -123.99664306640625, + "logps/rejected": -128.61732482910156, + "loss": 0.6629, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6507250070571899, + "rewards/margins": 0.09833811223506927, + "rewards/rejected": -0.7490631341934204, "step": 11300 }, { - "epoch": 1.95, - "eval_logits/chosen": -1.4203462600708008, - "eval_logits/rejected": -1.393379807472229, - "eval_logps/chosen": -244.32652282714844, - "eval_logps/rejected": -295.0477294921875, - "eval_loss": 0.6342768669128418, - "eval_rewards/accuracies": 0.6572955250740051, - "eval_rewards/chosen": -1.8562268018722534, - "eval_rewards/margins": 0.46267637610435486, - "eval_rewards/rejected": -2.3189032077789307, - "eval_runtime": 356.9343, - "eval_samples_per_second": 12.058, - "eval_steps_per_second": 1.507, + "epoch": 1.946933149552033, + "eval_logits/chosen": -2.558652400970459, + "eval_logits/rejected": -2.5518338680267334, + "eval_logps/chosen": -108.41070556640625, + "eval_logps/rejected": -123.20658111572266, + "eval_loss": 0.6549978256225586, + "eval_rewards/accuracies": 0.6205855011940002, + "eval_rewards/chosen": -0.49698811769485474, + "eval_rewards/margins": 0.10327637940645218, + "eval_rewards/rejected": -0.6002644896507263, + "eval_runtime": 359.9247, + "eval_samples_per_second": 11.958, + "eval_steps_per_second": 1.495, "step": 11300 }, { - "epoch": 1.95, - "grad_norm": 33.21894208609608, - "learning_rate": 1.0031559186586825e-09, - "logits/chosen": -1.4185220003128052, - "logits/rejected": -1.3642728328704834, - "logps/chosen": -219.1095428466797, - "logps/rejected": -368.1912841796875, - "loss": 0.3543, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.686640739440918, - "rewards/margins": 1.4802013635635376, - "rewards/rejected": -3.166841983795166, + "epoch": 1.948656099241902, + "grad_norm": 11.778402328491211, + "learning_rate": 2.0063118373173648e-10, + "logits/chosen": -2.5401158332824707, + "logits/rejected": -2.5334606170654297, + "logps/chosen": -109.58613586425781, + "logps/rejected": -133.18478393554688, + "loss": 0.6093, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5912298560142517, + "rewards/margins": 0.22539111971855164, + "rewards/rejected": -0.8166210055351257, "step": 11310 }, { - "epoch": 1.95, - "grad_norm": 24.242052165940123, - "learning_rate": 9.370010300579213e-10, - "logits/chosen": -1.350401520729065, - "logits/rejected": -1.2788327932357788, - "logps/chosen": -226.5559539794922, - "logps/rejected": -364.3349914550781, - "loss": 0.4343, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.7163565158843994, - "rewards/margins": 1.3881375789642334, - "rewards/rejected": -3.104494094848633, + "epoch": 1.9503790489317712, + "grad_norm": 9.3758544921875, + "learning_rate": 1.8740020601158425e-10, + "logits/chosen": -2.471465587615967, + "logits/rejected": -2.4445977210998535, + "logps/chosen": -111.59794616699219, + "logps/rejected": -132.0358428955078, + "loss": 0.6153, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5667641162872314, + "rewards/margins": 0.2145020067691803, + "rewards/rejected": -0.7812660932540894, "step": 11320 }, { - "epoch": 1.95, - "grad_norm": 49.892604398278046, - "learning_rate": 8.730984373342409e-10, - "logits/chosen": -1.3533201217651367, - "logits/rejected": -1.2792202234268188, - "logps/chosen": -225.33120727539062, - "logps/rejected": -375.6051330566406, - "loss": 0.3547, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.7277292013168335, - "rewards/margins": 1.547004222869873, - "rewards/rejected": -3.274733304977417, + "epoch": 1.9521019986216404, + "grad_norm": 16.428855895996094, + "learning_rate": 1.746196874668482e-10, + "logits/chosen": -2.4795517921447754, + "logits/rejected": -2.4475464820861816, + "logps/chosen": -118.28900146484375, + "logps/rejected": -131.59017944335938, + "loss": 0.6344, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6573032736778259, + "rewards/margins": 0.17725330591201782, + "rewards/rejected": -0.8345565795898438, "step": 11330 }, { - "epoch": 1.95, - "grad_norm": 44.808939079357344, - "learning_rate": 8.114487183636942e-10, - "logits/chosen": -1.2439250946044922, - "logits/rejected": -1.1751753091812134, - "logps/chosen": -238.3853759765625, - "logps/rejected": -387.01287841796875, - "loss": 0.3996, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.8302602767944336, - "rewards/margins": 1.4923627376556396, - "rewards/rejected": -3.3226234912872314, + "epoch": 1.9538249483115093, + "grad_norm": 12.055771827697754, + "learning_rate": 1.6228974367273883e-10, + "logits/chosen": -2.3732800483703613, + "logits/rejected": -2.3519484996795654, + "logps/chosen": -118.50630187988281, + "logps/rejected": -138.6522979736328, + "loss": 0.6227, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6312685608863831, + "rewards/margins": 0.20766310393810272, + "rewards/rejected": -0.838931679725647, "step": 11340 }, { - "epoch": 1.96, - "grad_norm": 24.43792912942958, - "learning_rate": 7.520524306494358e-10, - "logits/chosen": -1.3848811388015747, - "logits/rejected": -1.3261343240737915, - "logps/chosen": -250.03085327148438, - "logps/rejected": -366.9549255371094, - "loss": 0.4724, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.9313633441925049, - "rewards/margins": 1.177704095840454, - "rewards/rejected": -3.109067440032959, + "epoch": 1.9555478980013783, + "grad_norm": 11.385031700134277, + "learning_rate": 1.5041048612988717e-10, + "logits/chosen": -2.5143306255340576, + "logits/rejected": -2.5021491050720215, + "logps/chosen": -121.97713470458984, + "logps/rejected": -136.8004608154297, + "loss": 0.6402, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6507047414779663, + "rewards/margins": 0.15681439638137817, + "rewards/rejected": -0.8075190782546997, "step": 11350 }, { - "epoch": 1.96, - "grad_norm": 31.83383405122785, - "learning_rate": 6.949101113166711e-10, - "logits/chosen": -1.330773949623108, - "logits/rejected": -1.2642290592193604, - "logps/chosen": -235.5715789794922, - "logps/rejected": -365.17340087890625, - "loss": 0.4185, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.8331444263458252, - "rewards/margins": 1.3331224918365479, - "rewards/rejected": -3.166267156600952, + "epoch": 1.9572708476912473, + "grad_norm": 9.432991981506348, + "learning_rate": 1.3898202226333423e-10, + "logits/chosen": -2.47906756401062, + "logits/rejected": -2.451880931854248, + "logps/chosen": -114.74503326416016, + "logps/rejected": -129.50633239746094, + "loss": 0.6267, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6249033808708191, + "rewards/margins": 0.18457625806331635, + "rewards/rejected": -0.8094797134399414, "step": 11360 }, { - "epoch": 1.96, - "grad_norm": 29.61007825180616, - "learning_rate": 6.40022277107799e-10, - "logits/chosen": -1.3195604085922241, - "logits/rejected": -1.2653484344482422, - "logps/chosen": -229.4175567626953, - "logps/rejected": -352.00555419921875, - "loss": 0.4398, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.7311922311782837, - "rewards/margins": 1.2529064416885376, - "rewards/rejected": -2.9840986728668213, + "epoch": 1.9589937973811165, + "grad_norm": 10.39565658569336, + "learning_rate": 1.280044554215598e-10, + "logits/chosen": -2.442214012145996, + "logits/rejected": -2.4250168800354004, + "logps/chosen": -119.72346496582031, + "logps/rejected": -133.8603515625, + "loss": 0.6336, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6344811320304871, + "rewards/margins": 0.16784489154815674, + "rewards/rejected": -0.8023262023925781, "step": 11370 }, { - "epoch": 1.96, - "grad_norm": 44.883310537971305, - "learning_rate": 5.873894243776933e-10, - "logits/chosen": -1.2741248607635498, - "logits/rejected": -1.2058513164520264, - "logps/chosen": -221.5206298828125, - "logps/rejected": -360.199951171875, - "loss": 0.4065, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.6969740390777588, - "rewards/margins": 1.4046361446380615, - "rewards/rejected": -3.1016104221343994, + "epoch": 1.9607167470709856, + "grad_norm": 14.005722045898438, + "learning_rate": 1.1747788487553866e-10, + "logits/chosen": -2.415670871734619, + "logits/rejected": -2.394944667816162, + "logps/chosen": -114.1079330444336, + "logps/rejected": -130.89517211914062, + "loss": 0.6307, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6227467656135559, + "rewards/margins": 0.18552573025226593, + "rewards/rejected": -0.8082724809646606, "step": 11380 }, { - "epoch": 1.96, - "grad_norm": 36.423896869469424, - "learning_rate": 5.370120290893176e-10, - "logits/chosen": -1.4164044857025146, - "logits/rejected": -1.340454339981079, - "logps/chosen": -214.89956665039062, - "logps/rejected": -366.628662109375, - "loss": 0.4007, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.625978708267212, - "rewards/margins": 1.4872604608535767, - "rewards/rejected": -3.113239288330078, + "epoch": 1.9624396967608546, + "grad_norm": 10.821507453918457, + "learning_rate": 1.0740240581786353e-10, + "logits/chosen": -2.5239498615264893, + "logits/rejected": -2.502776622772217, + "logps/chosen": -108.56727600097656, + "logps/rejected": -135.5474395751953, + "loss": 0.6051, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5624256730079651, + "rewards/margins": 0.239888995885849, + "rewards/rejected": -0.8023146390914917, "step": 11390 }, { - "epoch": 1.96, - "grad_norm": 28.749205056855722, - "learning_rate": 4.888905468093673e-10, - "logits/chosen": -1.3569167852401733, - "logits/rejected": -1.291182041168213, - "logps/chosen": -212.83273315429688, - "logps/rejected": -341.9942626953125, - "loss": 0.4146, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.5691052675247192, - "rewards/margins": 1.31778883934021, - "rewards/rejected": -2.8868937492370605, + "epoch": 1.9641626464507236, + "grad_norm": 11.778271675109863, + "learning_rate": 9.777810936187347e-11, + "logits/chosen": -2.4854180812835693, + "logits/rejected": -2.455705404281616, + "logps/chosen": -113.80528259277344, + "logps/rejected": -129.2537384033203, + "loss": 0.6308, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5789285898208618, + "rewards/margins": 0.1805000901222229, + "rewards/rejected": -0.7594286799430847, "step": 11400 }, { - "epoch": 1.96, - "eval_logits/chosen": -1.41786789894104, - "eval_logits/rejected": -1.3909337520599365, - "eval_logps/chosen": -244.43157958984375, - "eval_logps/rejected": -295.2254638671875, - "eval_loss": 0.63393235206604, - "eval_rewards/accuracies": 0.6559014916419983, - "eval_rewards/chosen": -1.857277512550354, - "eval_rewards/margins": 0.4634034037590027, - "eval_rewards/rejected": -2.320681095123291, - "eval_runtime": 357.3673, - "eval_samples_per_second": 12.044, - "eval_steps_per_second": 1.505, + "epoch": 1.9641626464507236, + "eval_logits/chosen": -2.5585732460021973, + "eval_logits/rejected": -2.5517868995666504, + "eval_logps/chosen": -108.43598175048828, + "eval_logps/rejected": -123.23049926757812, + "eval_loss": 0.655043363571167, + "eval_rewards/accuracies": 0.6196561455726624, + "eval_rewards/chosen": -0.4972408711910248, + "eval_rewards/margins": 0.10326271504163742, + "eval_rewards/rejected": -0.6005036234855652, + "eval_runtime": 361.0916, + "eval_samples_per_second": 11.919, + "eval_steps_per_second": 1.49, "step": 11400 }, { - "epoch": 1.97, - "grad_norm": 39.086102079615756, - "learning_rate": 4.430254127040789e-10, - "logits/chosen": -1.3270251750946045, - "logits/rejected": -1.2652333974838257, - "logps/chosen": -229.4177703857422, - "logps/rejected": -348.8304748535156, - "loss": 0.4342, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.7375816106796265, - "rewards/margins": 1.199951171875, - "rewards/rejected": -2.937532901763916, + "epoch": 1.9658855961405926, + "grad_norm": 11.219842910766602, + "learning_rate": 8.860508254081577e-11, + "logits/chosen": -2.48612642288208, + "logits/rejected": -2.4600718021392822, + "logps/chosen": -112.62696838378906, + "logps/rejected": -127.6418228149414, + "loss": 0.6378, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.569412350654602, + "rewards/margins": 0.15631404519081116, + "rewards/rejected": -0.7257263660430908, "step": 11410 }, { - "epoch": 1.97, - "grad_norm": 30.366456037929765, - "learning_rate": 3.994170415353715e-10, - "logits/chosen": -1.3328666687011719, - "logits/rejected": -1.2708826065063477, - "logps/chosen": -233.0135498046875, - "logps/rejected": -347.16339111328125, - "loss": 0.4453, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.793405532836914, - "rewards/margins": 1.175714135169983, - "rewards/rejected": -2.9691193103790283, + "epoch": 1.9676085458304617, + "grad_norm": 11.249455451965332, + "learning_rate": 7.98834083070743e-11, + "logits/chosen": -2.4693493843078613, + "logits/rejected": -2.441267728805542, + "logps/chosen": -112.8091812133789, + "logps/rejected": -126.4327621459961, + "loss": 0.6304, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.591242253780365, + "rewards/margins": 0.17049895226955414, + "rewards/rejected": -0.7617412209510803, "step": 11420 }, { - "epoch": 1.97, - "grad_norm": 23.516266855640406, - "learning_rate": 3.5806582765715574e-10, - "logits/chosen": -1.2737079858779907, - "logits/rejected": -1.215456247329712, - "logps/chosen": -230.99423217773438, - "logps/rejected": -346.54241943359375, - "loss": 0.4635, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.7601213455200195, - "rewards/margins": 1.1789329051971436, - "rewards/rejected": -2.939054012298584, + "epoch": 1.969331495520331, + "grad_norm": 10.52036190032959, + "learning_rate": 7.161316553143115e-11, + "logits/chosen": -2.4036307334899902, + "logits/rejected": -2.3796024322509766, + "logps/chosen": -116.31755065917969, + "logps/rejected": -130.1694793701172, + "loss": 0.6355, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.6134723424911499, + "rewards/margins": 0.1615884006023407, + "rewards/rejected": -0.775060772895813, "step": 11430 }, { - "epoch": 1.97, - "grad_norm": 32.39137150045276, - "learning_rate": 3.189721450116145e-10, - "logits/chosen": -1.340698003768921, - "logits/rejected": -1.2886393070220947, - "logps/chosen": -225.9718780517578, - "logps/rejected": -343.7590637207031, - "loss": 0.4333, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.7345173358917236, - "rewards/margins": 1.1770877838134766, - "rewards/rejected": -2.9116053581237793, + "epoch": 1.9710544452102, + "grad_norm": 10.630167961120605, + "learning_rate": 6.37944290023229e-11, + "logits/chosen": -2.472803831100464, + "logits/rejected": -2.461055278778076, + "logps/chosen": -116.23243713378906, + "logps/rejected": -127.49246978759766, + "loss": 0.6552, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6368228793144226, + "rewards/margins": 0.1119503378868103, + "rewards/rejected": -0.7487732172012329, "step": 11440 }, { - "epoch": 1.97, - "grad_norm": 47.231248686402424, - "learning_rate": 2.821363471259275e-10, - "logits/chosen": -1.2820355892181396, - "logits/rejected": -1.2136328220367432, - "logps/chosen": -230.76296997070312, - "logps/rejected": -371.7513122558594, - "loss": 0.4059, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.780504822731018, - "rewards/margins": 1.4201017618179321, - "rewards/rejected": -3.20060658454895, + "epoch": 1.9727773949000689, + "grad_norm": 11.976935386657715, + "learning_rate": 5.64272694251855e-11, + "logits/chosen": -2.4077811241149902, + "logits/rejected": -2.3849339485168457, + "logps/chosen": -113.08724212646484, + "logps/rejected": -134.1420135498047, + "loss": 0.6083, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.603790819644928, + "rewards/margins": 0.22062405943870544, + "rewards/rejected": -0.8244149088859558, "step": 11450 }, { - "epoch": 1.97, - "grad_norm": 28.594067560080052, - "learning_rate": 2.4755876710905176e-10, - "logits/chosen": -1.3040361404418945, - "logits/rejected": -1.2424486875534058, - "logps/chosen": -228.9510498046875, - "logps/rejected": -367.95770263671875, - "loss": 0.3752, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.7342042922973633, - "rewards/margins": 1.4023338556289673, - "rewards/rejected": -3.136538028717041, + "epoch": 1.9745003445899378, + "grad_norm": 10.31814956665039, + "learning_rate": 4.951175342181035e-11, + "logits/chosen": -2.4443137645721436, + "logits/rejected": -2.4257240295410156, + "logps/chosen": -112.2625732421875, + "logps/rejected": -131.751953125, + "loss": 0.617, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5672473907470703, + "rewards/margins": 0.20691117644309998, + "rewards/rejected": -0.7741585969924927, "step": 11460 }, { - "epoch": 1.98, - "grad_norm": 46.590396282617746, - "learning_rate": 2.1523971764869642e-10, - "logits/chosen": -1.3816394805908203, - "logits/rejected": -1.3001785278320312, - "logps/chosen": -231.14248657226562, - "logps/rejected": -376.31292724609375, - "loss": 0.3637, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -1.7429434061050415, - "rewards/margins": 1.5104016065597534, - "rewards/rejected": -3.253345012664795, + "epoch": 1.976223294279807, + "grad_norm": 10.4486722946167, + "learning_rate": 4.3047943529739283e-11, + "logits/chosen": -2.481876850128174, + "logits/rejected": -2.452486038208008, + "logps/chosen": -121.09492492675781, + "logps/rejected": -136.9915313720703, + "loss": 0.6171, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6425241231918335, + "rewards/margins": 0.21760614216327667, + "rewards/rejected": -0.8601303100585938, "step": 11470 }, { - "epoch": 1.98, - "grad_norm": 43.745231317437764, - "learning_rate": 1.8517949100854692e-10, - "logits/chosen": -1.3776280879974365, - "logits/rejected": -1.3017531633377075, - "logps/chosen": -220.01980590820312, - "logps/rejected": -345.5537414550781, - "loss": 0.4132, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.6392385959625244, - "rewards/margins": 1.3121652603149414, - "rewards/rejected": -2.9514036178588867, + "epoch": 1.9779462439696762, + "grad_norm": 10.591318130493164, + "learning_rate": 3.703589820170938e-11, + "logits/chosen": -2.5215744972229004, + "logits/rejected": -2.4884209632873535, + "logps/chosen": -112.20654296875, + "logps/rejected": -126.6222915649414, + "loss": 0.6253, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5611206293106079, + "rewards/margins": 0.20080527663230896, + "rewards/rejected": -0.7619259357452393, "step": 11480 }, { - "epoch": 1.98, - "grad_norm": 23.908719760101306, - "learning_rate": 1.5737835902551733e-10, - "logits/chosen": -1.3258212804794312, - "logits/rejected": -1.2626917362213135, - "logps/chosen": -230.5511474609375, - "logps/rejected": -344.8309631347656, - "loss": 0.4828, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.758782982826233, - "rewards/margins": 1.1783698797225952, - "rewards/rejected": -2.937152862548828, + "epoch": 1.9796691936595452, + "grad_norm": 11.51418685913086, + "learning_rate": 3.1475671805103465e-11, + "logits/chosen": -2.4421164989471436, + "logits/rejected": -2.4178805351257324, + "logps/chosen": -120.77961730957031, + "logps/rejected": -128.45925903320312, + "loss": 0.6591, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6608833074569702, + "rewards/margins": 0.11253005266189575, + "rewards/rejected": -0.773413360118866, "step": 11490 }, { - "epoch": 1.98, - "grad_norm": 31.08219846166836, - "learning_rate": 1.318365731074189e-10, - "logits/chosen": -1.3679813146591187, - "logits/rejected": -1.315306305885315, - "logps/chosen": -218.0261688232422, - "logps/rejected": -330.38092041015625, - "loss": 0.432, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.6775197982788086, - "rewards/margins": 1.118550419807434, - "rewards/rejected": -2.796069860458374, + "epoch": 1.9813921433494142, + "grad_norm": 10.343287467956543, + "learning_rate": 2.6367314621483783e-11, + "logits/chosen": -2.4889087677001953, + "logits/rejected": -2.4696598052978516, + "logps/chosen": -109.34969329833984, + "logps/rejected": -122.16093444824219, + "loss": 0.6532, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5906139612197876, + "rewards/margins": 0.12310566753149033, + "rewards/rejected": -0.7137196063995361, "step": 11500 }, { - "epoch": 1.98, - "eval_logits/chosen": -1.4183286428451538, - "eval_logits/rejected": -1.391395092010498, - "eval_logps/chosen": -244.1782684326172, - "eval_logps/rejected": -295.0009765625, - "eval_loss": 0.6337403059005737, - "eval_rewards/accuracies": 0.6535780429840088, - "eval_rewards/chosen": -1.8547443151474, - "eval_rewards/margins": 0.4636920690536499, - "eval_rewards/rejected": -2.31843638420105, - "eval_runtime": 364.0419, - "eval_samples_per_second": 11.823, - "eval_steps_per_second": 1.478, + "epoch": 1.9813921433494142, + "eval_logits/chosen": -2.5584716796875, + "eval_logits/rejected": -2.551682710647583, + "eval_logps/chosen": -108.43128967285156, + "eval_logps/rejected": -123.231689453125, + "eval_loss": 0.6550213694572449, + "eval_rewards/accuracies": 0.6196561455726624, + "eval_rewards/chosen": -0.49719396233558655, + "eval_rewards/margins": 0.10332164913415909, + "eval_rewards/rejected": -0.600515604019165, + "eval_runtime": 360.843, + "eval_samples_per_second": 11.928, + "eval_steps_per_second": 1.491, "step": 11500 }, { - "epoch": 1.98, - "grad_norm": 25.52111148377799, - "learning_rate": 1.0855436423054532e-10, - "logits/chosen": -1.2700302600860596, - "logits/rejected": -1.2187522649765015, - "logps/chosen": -233.6306610107422, - "logps/rejected": -368.69073486328125, - "loss": 0.4327, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.7816520929336548, - "rewards/margins": 1.3728293180465698, - "rewards/rejected": -3.1544814109802246, + "epoch": 1.9831150930392831, + "grad_norm": 13.34192180633545, + "learning_rate": 2.1710872846109062e-11, + "logits/chosen": -2.4032585620880127, + "logits/rejected": -2.3933894634246826, + "logps/chosen": -115.02870178222656, + "logps/rejected": -132.9548797607422, + "loss": 0.6298, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5954822301864624, + "rewards/margins": 0.20137843489646912, + "rewards/rejected": -0.7968606948852539, "step": 11510 }, { - "epoch": 1.98, - "grad_norm": 32.012807334453846, - "learning_rate": 8.753194293770194e-11, - "logits/chosen": -1.3286519050598145, - "logits/rejected": -1.2325983047485352, - "logps/chosen": -229.2296905517578, - "logps/rejected": -372.2969970703125, - "loss": 0.3641, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.694801688194275, - "rewards/margins": 1.5285245180130005, - "rewards/rejected": -3.2233262062072754, + "epoch": 1.9848380427291523, + "grad_norm": 10.762691497802734, + "learning_rate": 1.7506388587540387e-11, + "logits/chosen": -2.4918160438537598, + "logits/rejected": -2.4419684410095215, + "logps/chosen": -118.88272857666016, + "logps/rejected": -127.25289154052734, + "loss": 0.6276, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5908408761024475, + "rewards/margins": 0.1819257289171219, + "rewards/rejected": -0.7727667093276978, "step": 11520 }, { - "epoch": 1.99, - "grad_norm": 30.754768134489776, - "learning_rate": 6.87694993363186e-11, - "logits/chosen": -1.3455301523208618, - "logits/rejected": -1.2808736562728882, - "logps/chosen": -218.40139770507812, - "logps/rejected": -335.3419494628906, - "loss": 0.4166, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.6494470834732056, - "rewards/margins": 1.2140170335769653, - "rewards/rejected": -2.863463878631592, + "epoch": 1.9865609924190215, + "grad_norm": 8.538677215576172, + "learning_rate": 1.3753899867263718e-11, + "logits/chosen": -2.462484836578369, + "logits/rejected": -2.431194305419922, + "logps/chosen": -113.8895492553711, + "logps/rejected": -125.02347564697266, + "loss": 0.637, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6042777895927429, + "rewards/margins": 0.15585239231586456, + "rewards/rejected": -0.7601302862167358, "step": 11530 }, { - "epoch": 1.99, - "grad_norm": 52.90046362094887, - "learning_rate": 5.226720309656207e-11, - "logits/chosen": -1.3888723850250244, - "logits/rejected": -1.3319811820983887, - "logps/chosen": -217.9016876220703, - "logps/rejected": -374.586669921875, - "loss": 0.3692, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.6716915369033813, - "rewards/margins": 1.5221760272979736, - "rewards/rejected": -3.1938672065734863, + "epoch": 1.9882839421088905, + "grad_norm": 11.066951751708984, + "learning_rate": 1.0453440619312414e-11, + "logits/chosen": -2.500387668609619, + "logits/rejected": -2.489388942718506, + "logps/chosen": -109.51570892333984, + "logps/rejected": -137.24916076660156, + "loss": 0.6022, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.587682843208313, + "rewards/margins": 0.23254191875457764, + "rewards/rejected": -0.8202247619628906, "step": 11540 }, { - "epoch": 1.99, - "grad_norm": 32.18141803466796, - "learning_rate": 3.802520345000393e-11, - "logits/chosen": -1.3312755823135376, - "logits/rejected": -1.2780405282974243, - "logps/chosen": -224.06698608398438, - "logps/rejected": -344.835205078125, - "loss": 0.46, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.7274624109268188, - "rewards/margins": 1.1927831172943115, - "rewards/rejected": -2.92024564743042, + "epoch": 1.9900068917987594, + "grad_norm": 9.847150802612305, + "learning_rate": 7.605040690000786e-12, + "logits/chosen": -2.4396908283233643, + "logits/rejected": -2.4210143089294434, + "logps/chosen": -111.09547424316406, + "logps/rejected": -130.10366821289062, + "loss": 0.6317, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5976601839065552, + "rewards/margins": 0.17508384585380554, + "rewards/rejected": -0.7727439999580383, "step": 11550 }, { - "epoch": 1.99, - "grad_norm": 43.51210987286007, - "learning_rate": 2.604362918812164e-11, - "logits/chosen": -1.3421592712402344, - "logits/rejected": -1.2694909572601318, - "logps/chosen": -227.5470733642578, - "logps/rejected": -358.80706787109375, - "loss": 0.4156, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.7258684635162354, - "rewards/margins": 1.3443853855133057, - "rewards/rejected": -3.070253849029541, + "epoch": 1.9917298414886284, + "grad_norm": 9.202311515808105, + "learning_rate": 5.208725837624328e-12, + "logits/chosen": -2.466907024383545, + "logits/rejected": -2.4393181800842285, + "logps/chosen": -118.4568862915039, + "logps/rejected": -131.09475708007812, + "loss": 0.6389, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.634739100933075, + "rewards/margins": 0.15840479731559753, + "rewards/rejected": -0.7931438684463501, "step": 11560 }, { - "epoch": 1.99, - "grad_norm": 38.872585179691995, - "learning_rate": 1.6322588661216163e-11, - "logits/chosen": -1.3375303745269775, - "logits/rejected": -1.271527647972107, - "logps/chosen": -235.52359008789062, - "logps/rejected": -379.27459716796875, - "loss": 0.3802, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.8141777515411377, - "rewards/margins": 1.421951413154602, - "rewards/rejected": -3.23612904548645, + "epoch": 1.9934527911784976, + "grad_norm": 10.943995475769043, + "learning_rate": 3.2645177322432327e-12, + "logits/chosen": -2.4700872898101807, + "logits/rejected": -2.4510324001312256, + "logps/chosen": -119.45172119140625, + "logps/rejected": -139.00967407226562, + "loss": 0.6356, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.653407096862793, + "rewards/margins": 0.17969268560409546, + "rewards/rejected": -0.8330997228622437, "step": 11570 }, { - "epoch": 2.0, - "grad_norm": 42.37337871627236, - "learning_rate": 8.862169777440475e-12, - "logits/chosen": -1.3970595598220825, - "logits/rejected": -1.3340885639190674, - "logps/chosen": -227.24227905273438, - "logps/rejected": -373.0314636230469, - "loss": 0.3938, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.7100292444229126, - "rewards/margins": 1.4638845920562744, - "rewards/rejected": -3.1739137172698975, + "epoch": 1.9951757408683668, + "grad_norm": 11.605945587158203, + "learning_rate": 1.7724339554880952e-12, + "logits/chosen": -2.508540630340576, + "logits/rejected": -2.490703582763672, + "logps/chosen": -118.93894958496094, + "logps/rejected": -133.46774291992188, + "loss": 0.6431, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6268073320388794, + "rewards/margins": 0.15112532675266266, + "rewards/rejected": -0.7779326438903809, "step": 11580 }, { - "epoch": 2.0, - "grad_norm": 37.01271407525554, - "learning_rate": 3.6624400018836485e-12, - "logits/chosen": -1.2879482507705688, - "logits/rejected": -1.215693712234497, - "logps/chosen": -228.08352661132812, - "logps/rejected": -351.55242919921875, - "loss": 0.4093, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.6859127283096313, - "rewards/margins": 1.3195241689682007, - "rewards/rejected": -3.005436420440674, + "epoch": 1.9968986905582358, + "grad_norm": 13.840242385864258, + "learning_rate": 7.324880003767298e-13, + "logits/chosen": -2.4034335613250732, + "logits/rejected": -2.369175434112549, + "logps/chosen": -120.6702880859375, + "logps/rejected": -123.08309173583984, + "loss": 0.6578, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.611688494682312, + "rewards/margins": 0.10892312228679657, + "rewards/rejected": -0.7206116318702698, "step": 11590 }, { - "epoch": 2.0, - "grad_norm": 33.774484526273135, - "learning_rate": 7.234463561267557e-13, - "logits/chosen": -1.341506004333496, - "logits/rejected": -1.2840282917022705, - "logps/chosen": -213.89993286132812, - "logps/rejected": -343.8310852050781, - "loss": 0.429, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.634570837020874, - "rewards/margins": 1.2662864923477173, - "rewards/rejected": -2.900857448577881, + "epoch": 1.9986216402481047, + "grad_norm": 9.633621215820312, + "learning_rate": 1.4468927122535113e-13, + "logits/chosen": -2.453922748565674, + "logits/rejected": -2.4403982162475586, + "logps/chosen": -106.54515075683594, + "logps/rejected": -127.30931091308594, + "loss": 0.6257, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5609582662582397, + "rewards/margins": 0.17430973052978516, + "rewards/rejected": -0.7352679967880249, "step": 11600 }, { - "epoch": 2.0, - "eval_logits/chosen": -1.4189956188201904, - "eval_logits/rejected": -1.3920025825500488, - "eval_logps/chosen": -244.38746643066406, - "eval_logps/rejected": -295.19293212890625, - "eval_loss": 0.6342188119888306, - "eval_rewards/accuracies": 0.6579925417900085, - "eval_rewards/chosen": -1.8568360805511475, - "eval_rewards/margins": 0.4635196328163147, - "eval_rewards/rejected": -2.3203558921813965, - "eval_runtime": 356.6898, - "eval_samples_per_second": 12.067, - "eval_steps_per_second": 1.508, + "epoch": 1.9986216402481047, + "eval_logits/chosen": -2.5584018230438232, + "eval_logits/rejected": -2.5515997409820557, + "eval_logps/chosen": -108.46733856201172, + "eval_logps/rejected": -123.28102111816406, + "eval_loss": 0.6549462676048279, + "eval_rewards/accuracies": 0.6194238066673279, + "eval_rewards/chosen": -0.49755439162254333, + "eval_rewards/margins": 0.10345453023910522, + "eval_rewards/rejected": -0.6010088920593262, + "eval_runtime": 360.3867, + "eval_samples_per_second": 11.943, + "eval_steps_per_second": 1.493, "step": 11600 }, { "epoch": 2.0, "step": 11608, "total_flos": 0.0, - "train_loss": 0.5042109644922366, - "train_runtime": 89019.0317, - "train_samples_per_second": 2.086, - "train_steps_per_second": 0.13 + "train_loss": 0.6539983197297005, + "train_runtime": 91180.3592, + "train_samples_per_second": 2.037, + "train_steps_per_second": 0.127 } ], "logging_steps": 10, @@ -19294,6 +19294,18 @@ "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null,