{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997382884061764, "eval_steps": 500, "global_step": 1910, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005234231876472127, "grad_norm": 7271.4244777785725, "learning_rate": 2.617801047120419e-09, "logits/chosen": 5870.685546875, "logits/rejected": 4942.87255859375, "logps/chosen": -300.06866455078125, "logps/rejected": -172.3806915283203, "loss": 502.7921, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.005234231876472127, "grad_norm": 8304.038811792496, "learning_rate": 2.6178010471204188e-08, "logits/chosen": 4513.291015625, "logits/rejected": 4184.9970703125, "logps/chosen": -237.91387939453125, "logps/rejected": -218.99322509765625, "loss": 517.5651, "rewards/accuracies": 0.4305555522441864, "rewards/chosen": 0.00014268612721934915, "rewards/margins": 7.825787179172039e-06, "rewards/rejected": 0.00013486042735166848, "step": 10 }, { "epoch": 0.010468463752944255, "grad_norm": 7349.301686542035, "learning_rate": 5.2356020942408376e-08, "logits/chosen": 6489.048828125, "logits/rejected": 5857.85986328125, "logps/chosen": -313.1256408691406, "logps/rejected": -286.7991027832031, "loss": 576.4106, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.005085395649075508, "rewards/margins": 0.001057310844771564, "rewards/rejected": 0.004028084687888622, "step": 20 }, { "epoch": 0.015702695629416383, "grad_norm": 6070.326194091692, "learning_rate": 7.853403141361257e-08, "logits/chosen": 6126.0244140625, "logits/rejected": 4615.61572265625, "logps/chosen": -283.33941650390625, "logps/rejected": -226.2187957763672, "loss": 536.6951, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.03882580250501633, "rewards/margins": 0.0007929968414828181, "rewards/rejected": 0.038032807409763336, "step": 30 }, { "epoch": 0.02093692750588851, "grad_norm": 4027.252620020275, "learning_rate": 1.0471204188481675e-07, "logits/chosen": 6232.24609375, "logits/rejected": 5138.41943359375, "logps/chosen": -303.6441345214844, "logps/rejected": -273.7585144042969, "loss": 521.8768, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.10832098871469498, "rewards/margins": 0.001040009781718254, "rewards/rejected": 0.10728099197149277, "step": 40 }, { "epoch": 0.02617115938236064, "grad_norm": 3273.4166142677173, "learning_rate": 1.3089005235602092e-07, "logits/chosen": 5814.5888671875, "logits/rejected": 4966.0185546875, "logps/chosen": -264.1505126953125, "logps/rejected": -246.9972381591797, "loss": 504.7053, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.1410144865512848, "rewards/margins": 0.008737658150494099, "rewards/rejected": 0.13227683305740356, "step": 50 }, { "epoch": 0.031405391258832765, "grad_norm": 3405.5858728041726, "learning_rate": 1.5706806282722514e-07, "logits/chosen": 5904.3173828125, "logits/rejected": 4385.4423828125, "logps/chosen": -305.66455078125, "logps/rejected": -220.8424072265625, "loss": 504.0667, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.162828266620636, "rewards/margins": 0.004079371690750122, "rewards/rejected": 0.15874889492988586, "step": 60 }, { "epoch": 0.036639623135304895, "grad_norm": 3202.908216557707, "learning_rate": 1.8324607329842932e-07, "logits/chosen": 5761.95166015625, "logits/rejected": 5009.62744140625, "logps/chosen": -268.76776123046875, "logps/rejected": -242.82901000976562, "loss": 483.6057, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.1770472377538681, "rewards/margins": 0.009785796515643597, "rewards/rejected": 0.16726145148277283, "step": 70 }, { "epoch": 0.04187385501177702, "grad_norm": 3087.5358270471534, "learning_rate": 2.094240837696335e-07, "logits/chosen": 5641.09423828125, "logits/rejected": 4720.921875, "logps/chosen": -255.1355438232422, "logps/rejected": -223.3970947265625, "loss": 520.935, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.20566542446613312, "rewards/margins": 0.015274032950401306, "rewards/rejected": 0.1903913915157318, "step": 80 }, { "epoch": 0.04710808688824915, "grad_norm": 3166.52121678684, "learning_rate": 2.356020942408377e-07, "logits/chosen": 5928.62060546875, "logits/rejected": 5229.50830078125, "logps/chosen": -273.92181396484375, "logps/rejected": -257.62664794921875, "loss": 504.3293, "rewards/accuracies": 0.5, "rewards/chosen": 0.2296813279390335, "rewards/margins": 0.005136436782777309, "rewards/rejected": 0.22454488277435303, "step": 90 }, { "epoch": 0.05234231876472128, "grad_norm": 2820.6105507964335, "learning_rate": 2.6178010471204185e-07, "logits/chosen": 5289.11279296875, "logits/rejected": 4693.4892578125, "logps/chosen": -223.42025756835938, "logps/rejected": -191.41561889648438, "loss": 462.7118, "rewards/accuracies": 0.625, "rewards/chosen": 0.2503862977027893, "rewards/margins": 0.040689971297979355, "rewards/rejected": 0.20969633758068085, "step": 100 }, { "epoch": 0.05757655064119341, "grad_norm": 2957.007752432132, "learning_rate": 2.879581151832461e-07, "logits/chosen": 4805.5546875, "logits/rejected": 3832.24560546875, "logps/chosen": -226.375732421875, "logps/rejected": -162.94354248046875, "loss": 468.0938, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.2314574271440506, "rewards/margins": 0.005531529895961285, "rewards/rejected": 0.22592587769031525, "step": 110 }, { "epoch": 0.06281078251766553, "grad_norm": 2665.4354032186566, "learning_rate": 3.1413612565445027e-07, "logits/chosen": 5966.80615234375, "logits/rejected": 5375.4677734375, "logps/chosen": -266.8603210449219, "logps/rejected": -251.39871215820312, "loss": 478.3712, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.27219146490097046, "rewards/margins": 0.014115167781710625, "rewards/rejected": 0.2580762803554535, "step": 120 }, { "epoch": 0.06804501439413765, "grad_norm": 2859.728706765586, "learning_rate": 3.4031413612565446e-07, "logits/chosen": 5990.4404296875, "logits/rejected": 4250.15087890625, "logps/chosen": -246.6409149169922, "logps/rejected": -190.3795928955078, "loss": 466.5569, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.3171687722206116, "rewards/margins": 0.03780997544527054, "rewards/rejected": 0.27935880422592163, "step": 130 }, { "epoch": 0.07327924627060979, "grad_norm": 2930.1339861107, "learning_rate": 3.6649214659685864e-07, "logits/chosen": 5721.59375, "logits/rejected": 5571.9296875, "logps/chosen": -255.01687622070312, "logps/rejected": -260.22662353515625, "loss": 485.1082, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.27502936124801636, "rewards/margins": -0.03979866951704025, "rewards/rejected": 0.3148280382156372, "step": 140 }, { "epoch": 0.07851347814708191, "grad_norm": 3059.9679230109905, "learning_rate": 3.926701570680628e-07, "logits/chosen": 5372.61669921875, "logits/rejected": 4874.53759765625, "logps/chosen": -255.142333984375, "logps/rejected": -235.846435546875, "loss": 507.7281, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.29862216114997864, "rewards/margins": 0.015394523739814758, "rewards/rejected": 0.2832276523113251, "step": 150 }, { "epoch": 0.08374771002355404, "grad_norm": 2926.3725797738743, "learning_rate": 4.18848167539267e-07, "logits/chosen": 5306.0595703125, "logits/rejected": 4798.4033203125, "logps/chosen": -218.11911010742188, "logps/rejected": -226.7350616455078, "loss": 462.329, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.31827059388160706, "rewards/margins": -0.01059560663998127, "rewards/rejected": 0.32886621356010437, "step": 160 }, { "epoch": 0.08898194190002617, "grad_norm": 2736.45715870512, "learning_rate": 4.450261780104712e-07, "logits/chosen": 6529.0009765625, "logits/rejected": 5290.5166015625, "logps/chosen": -264.6250915527344, "logps/rejected": -233.53598022460938, "loss": 479.4374, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.2924504578113556, "rewards/margins": -0.042323701083660126, "rewards/rejected": 0.3347741663455963, "step": 170 }, { "epoch": 0.0942161737764983, "grad_norm": 2984.3085207454747, "learning_rate": 4.712041884816754e-07, "logits/chosen": 5866.4013671875, "logits/rejected": 4263.2841796875, "logps/chosen": -253.30673217773438, "logps/rejected": -205.41256713867188, "loss": 500.1966, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.3619328439235687, "rewards/margins": 0.07482115924358368, "rewards/rejected": 0.28711163997650146, "step": 180 }, { "epoch": 0.09945040565297043, "grad_norm": 2413.8772161958623, "learning_rate": 4.973821989528796e-07, "logits/chosen": 5497.056640625, "logits/rejected": 5329.40380859375, "logps/chosen": -232.9961700439453, "logps/rejected": -253.7998809814453, "loss": 460.8556, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.3027707040309906, "rewards/margins": -0.005348903127014637, "rewards/rejected": 0.30811959505081177, "step": 190 }, { "epoch": 0.10468463752944256, "grad_norm": 3189.2111842360887, "learning_rate": 4.999661831436498e-07, "logits/chosen": 5501.1474609375, "logits/rejected": 5489.7998046875, "logps/chosen": -236.9458465576172, "logps/rejected": -251.3883819580078, "loss": 475.5444, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.30881690979003906, "rewards/margins": -0.020580019801855087, "rewards/rejected": 0.32939693331718445, "step": 200 }, { "epoch": 0.10991886940591468, "grad_norm": 2699.749049929128, "learning_rate": 4.998492971140339e-07, "logits/chosen": 5460.18603515625, "logits/rejected": 5445.2724609375, "logps/chosen": -231.32339477539062, "logps/rejected": -259.4305114746094, "loss": 472.6169, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.32528847455978394, "rewards/margins": -0.03840009123086929, "rewards/rejected": 0.363688588142395, "step": 210 }, { "epoch": 0.11515310128238682, "grad_norm": 2726.868782872802, "learning_rate": 4.996489634487865e-07, "logits/chosen": 5533.435546875, "logits/rejected": 4800.142578125, "logps/chosen": -251.1014404296875, "logps/rejected": -216.500244140625, "loss": 442.7733, "rewards/accuracies": 0.625, "rewards/chosen": 0.3984920084476471, "rewards/margins": 0.058864910155534744, "rewards/rejected": 0.33962708711624146, "step": 220 }, { "epoch": 0.12038733315885894, "grad_norm": 2717.1705033857043, "learning_rate": 4.993652490577246e-07, "logits/chosen": 5993.1884765625, "logits/rejected": 4839.08203125, "logps/chosen": -249.40988159179688, "logps/rejected": -219.46878051757812, "loss": 462.8714, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.32329297065734863, "rewards/margins": -0.012330831959843636, "rewards/rejected": 0.3356238007545471, "step": 230 }, { "epoch": 0.12562156503533106, "grad_norm": 2891.5856762486524, "learning_rate": 4.9899824869915e-07, "logits/chosen": 5396.11376953125, "logits/rejected": 4066.21875, "logps/chosen": -242.0372314453125, "logps/rejected": -174.00357055664062, "loss": 448.6322, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.3511057496070862, "rewards/margins": 0.024918105453252792, "rewards/rejected": 0.3261876404285431, "step": 240 }, { "epoch": 0.13085579691180318, "grad_norm": 2977.7608325122183, "learning_rate": 4.985480849482012e-07, "logits/chosen": 5365.56591796875, "logits/rejected": 5459.0966796875, "logps/chosen": -223.53158569335938, "logps/rejected": -243.6824951171875, "loss": 470.1929, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.3795216679573059, "rewards/margins": 0.03881485387682915, "rewards/rejected": 0.34070685505867004, "step": 250 }, { "epoch": 0.1360900287882753, "grad_norm": 2844.886121573562, "learning_rate": 4.980149081559142e-07, "logits/chosen": 5969.5849609375, "logits/rejected": 5703.658203125, "logps/chosen": -279.28240966796875, "logps/rejected": -260.6078186035156, "loss": 453.171, "rewards/accuracies": 0.4375, "rewards/chosen": 0.32865092158317566, "rewards/margins": -0.03916555643081665, "rewards/rejected": 0.3678165078163147, "step": 260 }, { "epoch": 0.14132426066474746, "grad_norm": 2829.945413913836, "learning_rate": 4.973988963990065e-07, "logits/chosen": 4869.52001953125, "logits/rejected": 4211.67724609375, "logps/chosen": -224.86849975585938, "logps/rejected": -217.9200439453125, "loss": 457.8975, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.32791829109191895, "rewards/margins": 0.021049553528428078, "rewards/rejected": 0.3068687319755554, "step": 270 }, { "epoch": 0.14655849254121958, "grad_norm": 2907.0916066466657, "learning_rate": 4.967002554204008e-07, "logits/chosen": 5243.541015625, "logits/rejected": 4437.04345703125, "logps/chosen": -247.62939453125, "logps/rejected": -211.838134765625, "loss": 450.3028, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.3431067168712616, "rewards/margins": 0.02685241959989071, "rewards/rejected": 0.31625431776046753, "step": 280 }, { "epoch": 0.1517927244176917, "grad_norm": 2882.190469859016, "learning_rate": 4.959192185605087e-07, "logits/chosen": 5457.43994140625, "logits/rejected": 4840.53564453125, "logps/chosen": -238.77737426757812, "logps/rejected": -243.97128295898438, "loss": 468.4004, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.36954444646835327, "rewards/margins": 0.02929796651005745, "rewards/rejected": 0.3402464985847473, "step": 290 }, { "epoch": 0.15702695629416383, "grad_norm": 3009.2833395331872, "learning_rate": 4.950560466792969e-07, "logits/chosen": 6089.1845703125, "logits/rejected": 4922.9873046875, "logps/chosen": -288.011962890625, "logps/rejected": -253.79428100585938, "loss": 496.6538, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.41366416215896606, "rewards/margins": 0.09003490954637527, "rewards/rejected": 0.323629230260849, "step": 300 }, { "epoch": 0.16226118817063595, "grad_norm": 2429.430639056721, "learning_rate": 4.941110280691619e-07, "logits/chosen": 5420.87890625, "logits/rejected": 4362.85302734375, "logps/chosen": -246.7043914794922, "logps/rejected": -184.77566528320312, "loss": 450.8828, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.3455493152141571, "rewards/margins": -0.0095530916005373, "rewards/rejected": 0.3551023602485657, "step": 310 }, { "epoch": 0.16749542004710807, "grad_norm": 2753.359736527446, "learning_rate": 4.930844783586424e-07, "logits/chosen": 4784.5322265625, "logits/rejected": 4547.6220703125, "logps/chosen": -184.01431274414062, "logps/rejected": -193.05662536621094, "loss": 415.4406, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.3396381139755249, "rewards/margins": -0.008304371498525143, "rewards/rejected": 0.3479425311088562, "step": 320 }, { "epoch": 0.17272965192358022, "grad_norm": 2852.3716536653224, "learning_rate": 4.919767404070033e-07, "logits/chosen": 5849.3818359375, "logits/rejected": 4828.42822265625, "logps/chosen": -246.0577850341797, "logps/rejected": -209.41921997070312, "loss": 449.1766, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.4010588228702545, "rewards/margins": 0.042839743196964264, "rewards/rejected": 0.35821908712387085, "step": 330 }, { "epoch": 0.17796388380005235, "grad_norm": 2456.2913708681403, "learning_rate": 4.907881841897216e-07, "logits/chosen": 5143.18994140625, "logits/rejected": 5371.2216796875, "logps/chosen": -232.52963256835938, "logps/rejected": -257.6844787597656, "loss": 454.2092, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": 0.35724347829818726, "rewards/margins": -0.11005325615406036, "rewards/rejected": 0.4672967493534088, "step": 340 }, { "epoch": 0.18319811567652447, "grad_norm": 2678.401269841126, "learning_rate": 4.895192066749189e-07, "logits/chosen": 5613.0537109375, "logits/rejected": 4274.7685546875, "logps/chosen": -235.586669921875, "logps/rejected": -212.4261474609375, "loss": 472.6165, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.4322528839111328, "rewards/margins": 0.11002279818058014, "rewards/rejected": 0.32223010063171387, "step": 350 }, { "epoch": 0.1884323475529966, "grad_norm": 2532.990455889227, "learning_rate": 4.881702316907768e-07, "logits/chosen": 5681.9755859375, "logits/rejected": 4378.25390625, "logps/chosen": -240.4472198486328, "logps/rejected": -192.0093536376953, "loss": 458.877, "rewards/accuracies": 0.625, "rewards/chosen": 0.371535062789917, "rewards/margins": 0.03572763875126839, "rewards/rejected": 0.3358073830604553, "step": 360 }, { "epoch": 0.19366657942946872, "grad_norm": 2561.1977986473953, "learning_rate": 4.86741709783982e-07, "logits/chosen": 5142.1611328125, "logits/rejected": 4391.021484375, "logps/chosen": -221.49853515625, "logps/rejected": -208.3010711669922, "loss": 418.6793, "rewards/accuracies": 0.5625, "rewards/chosen": 0.36740997433662415, "rewards/margins": 0.024009237065911293, "rewards/rejected": 0.343400776386261, "step": 370 }, { "epoch": 0.19890081130594087, "grad_norm": 3249.3946848792525, "learning_rate": 4.85234118069247e-07, "logits/chosen": 5851.6572265625, "logits/rejected": 5273.86572265625, "logps/chosen": -258.82733154296875, "logps/rejected": -225.1135711669922, "loss": 476.0678, "rewards/accuracies": 0.5, "rewards/chosen": 0.3698350489139557, "rewards/margins": -0.05597928166389465, "rewards/rejected": 0.42581433057785034, "step": 380 }, { "epoch": 0.204135043182413, "grad_norm": 6104.987197861906, "learning_rate": 4.836479600699578e-07, "logits/chosen": 5440.23828125, "logits/rejected": 5176.56787109375, "logps/chosen": -227.13626098632812, "logps/rejected": -237.4613800048828, "loss": 457.8458, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": 0.4158262312412262, "rewards/margins": -0.06199340894818306, "rewards/rejected": 0.47781962156295776, "step": 390 }, { "epoch": 0.2093692750588851, "grad_norm": 2913.855746159282, "learning_rate": 4.819837655500013e-07, "logits/chosen": 6080.09521484375, "logits/rejected": 6008.8671875, "logps/chosen": -255.96240234375, "logps/rejected": -260.66082763671875, "loss": 463.2553, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.4074326157569885, "rewards/margins": -0.010780897922813892, "rewards/rejected": 0.41821345686912537, "step": 400 }, { "epoch": 0.21460350693535724, "grad_norm": 2757.6002263120413, "learning_rate": 4.802420903368285e-07, "logits/chosen": 5520.74951171875, "logits/rejected": 4620.50830078125, "logps/chosen": -209.3979949951172, "logps/rejected": -202.67066955566406, "loss": 401.3251, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.3946303129196167, "rewards/margins": 0.021963870152831078, "rewards/rejected": 0.37266644835472107, "step": 410 }, { "epoch": 0.21983773881182936, "grad_norm": 2886.017303383734, "learning_rate": 4.784235161358123e-07, "logits/chosen": 6241.6337890625, "logits/rejected": 4859.7138671875, "logps/chosen": -258.38421630859375, "logps/rejected": -226.57174682617188, "loss": 474.339, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.3764682412147522, "rewards/margins": -0.0009525719797238708, "rewards/rejected": 0.3774208426475525, "step": 420 }, { "epoch": 0.22507197068830148, "grad_norm": 2694.1239844205243, "learning_rate": 4.7652865033596314e-07, "logits/chosen": 5915.04052734375, "logits/rejected": 4868.78125, "logps/chosen": -237.8837890625, "logps/rejected": -239.0323944091797, "loss": 479.7707, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.39550644159317017, "rewards/margins": 0.05114778131246567, "rewards/rejected": 0.3443586528301239, "step": 430 }, { "epoch": 0.23030620256477363, "grad_norm": 2638.7505881805687, "learning_rate": 4.7455812580706534e-07, "logits/chosen": 5430.63525390625, "logits/rejected": 4377.5166015625, "logps/chosen": -228.3394012451172, "logps/rejected": -218.4453125, "loss": 477.3207, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.4782446026802063, "rewards/margins": 0.08502718061208725, "rewards/rejected": 0.39321738481521606, "step": 440 }, { "epoch": 0.23554043444124576, "grad_norm": 2560.4922150003467, "learning_rate": 4.725126006883046e-07, "logits/chosen": 5066.28369140625, "logits/rejected": 4950.4150390625, "logps/chosen": -223.5402069091797, "logps/rejected": -229.3683624267578, "loss": 475.7734, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.37004831433296204, "rewards/margins": -0.06600765883922577, "rewards/rejected": 0.4360559582710266, "step": 450 }, { "epoch": 0.24077466631771788, "grad_norm": 2834.5464453746413, "learning_rate": 4.703927581684539e-07, "logits/chosen": 5400.9306640625, "logits/rejected": 5344.81640625, "logps/chosen": -234.24026489257812, "logps/rejected": -206.53564453125, "loss": 451.783, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.3872218132019043, "rewards/margins": -0.023321902379393578, "rewards/rejected": 0.4105437397956848, "step": 460 }, { "epoch": 0.24600889819419, "grad_norm": 2984.6307853878493, "learning_rate": 4.68199306257695e-07, "logits/chosen": 5166.0556640625, "logits/rejected": 4168.626953125, "logps/chosen": -228.40029907226562, "logps/rejected": -217.7026824951172, "loss": 436.8623, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.3921186327934265, "rewards/margins": 0.03906525298953056, "rewards/rejected": 0.35305342078208923, "step": 470 }, { "epoch": 0.2512431300706621, "grad_norm": 2709.863344047286, "learning_rate": 4.6593297755114776e-07, "logits/chosen": 5889.123046875, "logits/rejected": 5564.8466796875, "logps/chosen": -224.7196807861328, "logps/rejected": -254.2318572998047, "loss": 500.472, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.4287680983543396, "rewards/margins": 0.03177551180124283, "rewards/rejected": 0.39699262380599976, "step": 480 }, { "epoch": 0.2564773619471343, "grad_norm": 2812.365883993442, "learning_rate": 4.635945289841902e-07, "logits/chosen": 4608.798828125, "logits/rejected": 4657.5869140625, "logps/chosen": -183.73062133789062, "logps/rejected": -228.1973876953125, "loss": 436.1543, "rewards/accuracies": 0.5, "rewards/chosen": 0.3860538601875305, "rewards/margins": 0.007812491152435541, "rewards/rejected": 0.3782413601875305, "step": 490 }, { "epoch": 0.26171159382360637, "grad_norm": 2964.3249647637385, "learning_rate": 4.611847415796476e-07, "logits/chosen": 5814.58544921875, "logits/rejected": 4944.06103515625, "logps/chosen": -253.06246948242188, "logps/rejected": -216.42373657226562, "loss": 449.1805, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.37933146953582764, "rewards/margins": -0.004121728241443634, "rewards/rejected": 0.38345322012901306, "step": 500 }, { "epoch": 0.2669458257000785, "grad_norm": 2468.7689710136347, "learning_rate": 4.5870442018693773e-07, "logits/chosen": 5567.94140625, "logits/rejected": 5069.24072265625, "logps/chosen": -238.51309204101562, "logps/rejected": -237.9658203125, "loss": 468.6761, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.39677414298057556, "rewards/margins": -0.018477313220500946, "rewards/rejected": 0.4152514338493347, "step": 510 }, { "epoch": 0.2721800575765506, "grad_norm": 3135.9866296472474, "learning_rate": 4.5615439321325735e-07, "logits/chosen": 5763.81103515625, "logits/rejected": 4699.95361328125, "logps/chosen": -245.3778076171875, "logps/rejected": -236.6832733154297, "loss": 485.4475, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.3716925084590912, "rewards/margins": -0.019004570320248604, "rewards/rejected": 0.39069709181785583, "step": 520 }, { "epoch": 0.27741428945302277, "grad_norm": 2697.1963242488955, "learning_rate": 4.535355123469008e-07, "logits/chosen": 5287.62890625, "logits/rejected": 4875.68505859375, "logps/chosen": -229.9176788330078, "logps/rejected": -223.1514892578125, "loss": 472.4326, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3853301405906677, "rewards/margins": 0.00580341462045908, "rewards/rejected": 0.37952667474746704, "step": 530 }, { "epoch": 0.2826485213294949, "grad_norm": 3065.9865354495937, "learning_rate": 4.5084865227280366e-07, "logits/chosen": 5233.0517578125, "logits/rejected": 4762.06787109375, "logps/chosen": -248.7694549560547, "logps/rejected": -221.34976196289062, "loss": 494.6735, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.3382965922355652, "rewards/margins": -0.02283622696995735, "rewards/rejected": 0.36113283038139343, "step": 540 }, { "epoch": 0.287882753205967, "grad_norm": 2581.098101318781, "learning_rate": 4.4809471038040437e-07, "logits/chosen": 5172.52978515625, "logits/rejected": 4110.0751953125, "logps/chosen": -249.93325805664062, "logps/rejected": -195.62350463867188, "loss": 445.9224, "rewards/accuracies": 0.5, "rewards/chosen": 0.42613086104393005, "rewards/margins": 0.030070941895246506, "rewards/rejected": 0.39605993032455444, "step": 550 }, { "epoch": 0.29311698508243916, "grad_norm": 2935.758261662807, "learning_rate": 4.4527460646392386e-07, "logits/chosen": 5275.1181640625, "logits/rejected": 4908.9013671875, "logps/chosen": -204.8629150390625, "logps/rejected": -213.5486297607422, "loss": 464.7498, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.3916018009185791, "rewards/margins": 0.01974121853709221, "rewards/rejected": 0.3718605637550354, "step": 560 }, { "epoch": 0.29835121695891126, "grad_norm": 2317.2183869330697, "learning_rate": 4.4238928241516163e-07, "logits/chosen": 6298.4970703125, "logits/rejected": 4828.0947265625, "logps/chosen": -271.87530517578125, "logps/rejected": -210.5924835205078, "loss": 444.7044, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.4335819184780121, "rewards/margins": 0.005383139010518789, "rewards/rejected": 0.42819881439208984, "step": 570 }, { "epoch": 0.3035854488353834, "grad_norm": 2476.766944172286, "learning_rate": 4.394397019089116e-07, "logits/chosen": 5619.259765625, "logits/rejected": 4530.4365234375, "logps/chosen": -253.644287109375, "logps/rejected": -210.2486572265625, "loss": 460.2462, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.3618123531341553, "rewards/margins": 0.0577847845852375, "rewards/rejected": 0.3040275275707245, "step": 580 }, { "epoch": 0.30881968071185556, "grad_norm": 2592.791022013939, "learning_rate": 4.3642685008110246e-07, "logits/chosen": 5317.41064453125, "logits/rejected": 4190.921875, "logps/chosen": -215.50991821289062, "logps/rejected": -187.17164611816406, "loss": 413.2921, "rewards/accuracies": 0.5625, "rewards/chosen": 0.38392287492752075, "rewards/margins": -0.009419135749340057, "rewards/rejected": 0.3933420181274414, "step": 590 }, { "epoch": 0.31405391258832765, "grad_norm": 2611.3201020461893, "learning_rate": 4.333517331997704e-07, "logits/chosen": 5873.51220703125, "logits/rejected": 5519.97119140625, "logps/chosen": -261.2032470703125, "logps/rejected": -246.95849609375, "loss": 476.4223, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.3629549741744995, "rewards/margins": -0.021394124254584312, "rewards/rejected": 0.3843490779399872, "step": 600 }, { "epoch": 0.3192881444647998, "grad_norm": 2757.807768530615, "learning_rate": 4.302153783289736e-07, "logits/chosen": 5596.8837890625, "logits/rejected": 4788.099609375, "logps/chosen": -225.3079071044922, "logps/rejected": -241.4241180419922, "loss": 469.7494, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.4231595993041992, "rewards/margins": -0.00441837077960372, "rewards/rejected": 0.42757803201675415, "step": 610 }, { "epoch": 0.3245223763412719, "grad_norm": 3105.8012818905454, "learning_rate": 4.2701883298576124e-07, "logits/chosen": 5359.2724609375, "logits/rejected": 5001.88427734375, "logps/chosen": -240.8025360107422, "logps/rejected": -207.4171142578125, "loss": 475.3348, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.40704458951950073, "rewards/margins": 0.016931820660829544, "rewards/rejected": 0.3901127278804779, "step": 620 }, { "epoch": 0.32975660821774405, "grad_norm": 2907.7644335294735, "learning_rate": 4.237631647903115e-07, "logits/chosen": 5390.2080078125, "logits/rejected": 4478.12353515625, "logps/chosen": -223.9457244873047, "logps/rejected": -204.24703979492188, "loss": 459.8224, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.5224254727363586, "rewards/margins": 0.09427186101675034, "rewards/rejected": 0.4281536042690277, "step": 630 }, { "epoch": 0.33499084009421615, "grad_norm": 2838.8774079471536, "learning_rate": 4.204494611093548e-07, "logits/chosen": 5652.3779296875, "logits/rejected": 4024.271484375, "logps/chosen": -262.69659423828125, "logps/rejected": -210.9382781982422, "loss": 457.168, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5036084651947021, "rewards/margins": 0.1345875859260559, "rewards/rejected": 0.36902087926864624, "step": 640 }, { "epoch": 0.3402250719706883, "grad_norm": 2841.283206358593, "learning_rate": 4.1707882869300235e-07, "logits/chosen": 5601.1298828125, "logits/rejected": 4618.71484375, "logps/chosen": -243.36392211914062, "logps/rejected": -188.28256225585938, "loss": 473.2414, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.37009182572364807, "rewards/margins": 0.020084170624613762, "rewards/rejected": 0.3500076234340668, "step": 650 }, { "epoch": 0.34545930384716045, "grad_norm": 2990.557256988464, "learning_rate": 4.136523933051005e-07, "logits/chosen": 5809.75, "logits/rejected": 5187.5576171875, "logps/chosen": -242.94424438476562, "logps/rejected": -218.1668701171875, "loss": 430.7427, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.3632182478904724, "rewards/margins": -0.019726304337382317, "rewards/rejected": 0.3829445242881775, "step": 660 }, { "epoch": 0.35069353572363254, "grad_norm": 3030.6632095062805, "learning_rate": 4.101712993472348e-07, "logits/chosen": 5965.10986328125, "logits/rejected": 5161.24755859375, "logps/chosen": -239.2626190185547, "logps/rejected": -198.73562622070312, "loss": 445.511, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.375115305185318, "rewards/margins": 0.006631316151469946, "rewards/rejected": 0.36848393082618713, "step": 670 }, { "epoch": 0.3559277676001047, "grad_norm": 2767.786914589451, "learning_rate": 4.066367094765091e-07, "logits/chosen": 5560.0693359375, "logits/rejected": 4629.181640625, "logps/chosen": -257.6699523925781, "logps/rejected": -211.1696319580078, "loss": 458.9728, "rewards/accuracies": 0.5625, "rewards/chosen": 0.35179638862609863, "rewards/margins": -0.009023250080645084, "rewards/rejected": 0.36081960797309875, "step": 680 }, { "epoch": 0.3611619994765768, "grad_norm": 3018.8316167165076, "learning_rate": 4.0304980421722766e-07, "logits/chosen": 5494.3037109375, "logits/rejected": 5067.5087890625, "logps/chosen": -251.9346923828125, "logps/rejected": -231.23477172851562, "loss": 483.0792, "rewards/accuracies": 0.5625, "rewards/chosen": 0.39491140842437744, "rewards/margins": -0.004837697837501764, "rewards/rejected": 0.39974913001060486, "step": 690 }, { "epoch": 0.36639623135304894, "grad_norm": 2804.3368058052815, "learning_rate": 3.994117815666095e-07, "logits/chosen": 5581.04638671875, "logits/rejected": 4282.5869140625, "logps/chosen": -290.048095703125, "logps/rejected": -219.49282836914062, "loss": 487.5479, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.4267527461051941, "rewards/margins": 0.01361342053860426, "rewards/rejected": 0.41313934326171875, "step": 700 }, { "epoch": 0.3716304632295211, "grad_norm": 2463.6680284285494, "learning_rate": 3.957238565946671e-07, "logits/chosen": 5351.44384765625, "logits/rejected": 4493.7109375, "logps/chosen": -217.0452880859375, "logps/rejected": -190.09336853027344, "loss": 434.9337, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.3848896622657776, "rewards/margins": -0.02217109315097332, "rewards/rejected": 0.4070607125759125, "step": 710 }, { "epoch": 0.3768646951059932, "grad_norm": 2789.400152363563, "learning_rate": 3.9198726103838306e-07, "logits/chosen": 5270.38037109375, "logits/rejected": 4837.1796875, "logps/chosen": -230.885986328125, "logps/rejected": -195.58602905273438, "loss": 426.7396, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.3861168324947357, "rewards/margins": -0.029095202684402466, "rewards/rejected": 0.41521206498146057, "step": 720 }, { "epoch": 0.38209892698246534, "grad_norm": 2579.476417492773, "learning_rate": 3.8820324289031946e-07, "logits/chosen": 5369.2978515625, "logits/rejected": 4761.36328125, "logps/chosen": -204.5257568359375, "logps/rejected": -204.974853515625, "loss": 453.2719, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.3813869059085846, "rewards/margins": 0.00393150607123971, "rewards/rejected": 0.37745538353919983, "step": 730 }, { "epoch": 0.38733315885893743, "grad_norm": 2589.569755576345, "learning_rate": 3.84373065981799e-07, "logits/chosen": 6128.9091796875, "logits/rejected": 4719.29931640625, "logps/chosen": -250.7108154296875, "logps/rejected": -220.9887237548828, "loss": 459.4358, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.42599329352378845, "rewards/margins": -0.052743494510650635, "rewards/rejected": 0.4787367880344391, "step": 740 }, { "epoch": 0.3925673907354096, "grad_norm": 2833.9492734543496, "learning_rate": 3.8049800956079545e-07, "logits/chosen": 5706.271484375, "logits/rejected": 5021.3134765625, "logps/chosen": -249.65283203125, "logps/rejected": -209.9662322998047, "loss": 474.0779, "rewards/accuracies": 0.5, "rewards/chosen": 0.4112626016139984, "rewards/margins": 0.01550484262406826, "rewards/rejected": 0.3957577347755432, "step": 750 }, { "epoch": 0.39780162261188173, "grad_norm": 2832.6107588966483, "learning_rate": 3.7657936786467525e-07, "logits/chosen": 4986.99462890625, "logits/rejected": 4256.84765625, "logps/chosen": -212.7075958251953, "logps/rejected": -192.9168701171875, "loss": 433.4753, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.39544767141342163, "rewards/margins": 0.020534038543701172, "rewards/rejected": 0.37491363286972046, "step": 760 }, { "epoch": 0.40303585448835383, "grad_norm": 2774.458368416743, "learning_rate": 3.7261844968793226e-07, "logits/chosen": 4237.71337890625, "logits/rejected": 4375.86767578125, "logps/chosen": -183.19639587402344, "logps/rejected": -205.65249633789062, "loss": 429.399, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.3700638711452484, "rewards/margins": -0.0648937076330185, "rewards/rejected": 0.4349575936794281, "step": 770 }, { "epoch": 0.408270086364826, "grad_norm": 2924.863786054313, "learning_rate": 3.6861657794506187e-07, "logits/chosen": 4800.78125, "logits/rejected": 4496.0048828125, "logps/chosen": -206.91018676757812, "logps/rejected": -204.9410858154297, "loss": 467.7143, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.3601909279823303, "rewards/margins": -0.02283461019396782, "rewards/rejected": 0.38302555680274963, "step": 780 }, { "epoch": 0.4135043182412981, "grad_norm": 2883.394905438846, "learning_rate": 3.6457508922871777e-07, "logits/chosen": 5870.728515625, "logits/rejected": 4462.73876953125, "logps/chosen": -227.4943084716797, "logps/rejected": -195.0250701904297, "loss": 461.7544, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.41137704253196716, "rewards/margins": 0.02530388906598091, "rewards/rejected": 0.38607317209243774, "step": 790 }, { "epoch": 0.4187385501177702, "grad_norm": 2905.6186537657873, "learning_rate": 3.6049533336330084e-07, "logits/chosen": 5861.5732421875, "logits/rejected": 4758.0703125, "logps/chosen": -244.23165893554688, "logps/rejected": -210.7204132080078, "loss": 469.5478, "rewards/accuracies": 0.5625, "rewards/chosen": 0.41407591104507446, "rewards/margins": 0.06709511578083038, "rewards/rejected": 0.3469807505607605, "step": 800 }, { "epoch": 0.4239727819942423, "grad_norm": 2670.309278191828, "learning_rate": 3.56378672954129e-07, "logits/chosen": 6055.84130859375, "logits/rejected": 4388.7734375, "logps/chosen": -263.1472473144531, "logps/rejected": -195.3523712158203, "loss": 441.372, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.44812217354774475, "rewards/margins": 0.04222990199923515, "rewards/rejected": 0.4058922231197357, "step": 810 }, { "epoch": 0.42920701387071447, "grad_norm": 3124.9754658062902, "learning_rate": 3.5222648293233803e-07, "logits/chosen": 6021.9658203125, "logits/rejected": 5607.9169921875, "logps/chosen": -244.96554565429688, "logps/rejected": -242.5836639404297, "loss": 472.9549, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.38347047567367554, "rewards/margins": -0.00437899911776185, "rewards/rejected": 0.38784947991371155, "step": 820 }, { "epoch": 0.4344412457471866, "grad_norm": 2576.472835509745, "learning_rate": 3.480401500956657e-07, "logits/chosen": 5213.6259765625, "logits/rejected": 4482.5400390625, "logps/chosen": -206.4659881591797, "logps/rejected": -217.31362915039062, "loss": 449.0575, "rewards/accuracies": 0.5625, "rewards/chosen": 0.38270142674446106, "rewards/margins": 0.010285294614732265, "rewards/rejected": 0.3724161386489868, "step": 830 }, { "epoch": 0.4396754776236587, "grad_norm": 3109.274717902587, "learning_rate": 3.438210726452724e-07, "logits/chosen": 5968.18017578125, "logits/rejected": 5356.08447265625, "logps/chosen": -269.45263671875, "logps/rejected": -230.8134307861328, "loss": 482.8063, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.4160873293876648, "rewards/margins": -0.02526630461215973, "rewards/rejected": 0.44135361909866333, "step": 840 }, { "epoch": 0.44490970950013087, "grad_norm": 2983.3753201080026, "learning_rate": 3.395706597187538e-07, "logits/chosen": 4604.427734375, "logits/rejected": 4566.62109375, "logps/chosen": -193.61697387695312, "logps/rejected": -188.32859802246094, "loss": 448.0851, "rewards/accuracies": 0.5625, "rewards/chosen": 0.36395174264907837, "rewards/margins": -0.0260360948741436, "rewards/rejected": 0.3899878263473511, "step": 850 }, { "epoch": 0.45014394137660296, "grad_norm": 2768.9950966421284, "learning_rate": 3.3529033091949986e-07, "logits/chosen": 5541.9208984375, "logits/rejected": 5208.42041015625, "logps/chosen": -259.69183349609375, "logps/rejected": -261.2183837890625, "loss": 463.3188, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.4210972189903259, "rewards/margins": -0.004580638371407986, "rewards/rejected": 0.4256778359413147, "step": 860 }, { "epoch": 0.4553781732530751, "grad_norm": 2588.2368838304837, "learning_rate": 3.309815158425591e-07, "logits/chosen": 5310.1044921875, "logits/rejected": 5185.2841796875, "logps/chosen": -249.4232177734375, "logps/rejected": -236.345947265625, "loss": 492.8625, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.38777777552604675, "rewards/margins": -0.04184495285153389, "rewards/rejected": 0.42962273955345154, "step": 870 }, { "epoch": 0.46061240512954726, "grad_norm": 2541.1798920816664, "learning_rate": 3.2664565359716536e-07, "logits/chosen": 5426.6923828125, "logits/rejected": 4499.73095703125, "logps/chosen": -225.3328399658203, "logps/rejected": -189.65623474121094, "loss": 472.1843, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.391153484582901, "rewards/margins": -0.021805385127663612, "rewards/rejected": 0.4129588007926941, "step": 880 }, { "epoch": 0.46584663700601936, "grad_norm": 2708.826125946467, "learning_rate": 3.222841923260869e-07, "logits/chosen": 5195.87939453125, "logits/rejected": 4546.89111328125, "logps/chosen": -222.4955596923828, "logps/rejected": -202.45382690429688, "loss": 436.797, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4222163259983063, "rewards/margins": -0.005721543915569782, "rewards/rejected": 0.42793789505958557, "step": 890 }, { "epoch": 0.4710808688824915, "grad_norm": 3327.9010011428068, "learning_rate": 3.1789858872195887e-07, "logits/chosen": 6197.4189453125, "logits/rejected": 5115.60205078125, "logps/chosen": -256.226318359375, "logps/rejected": -237.42318725585938, "loss": 463.1399, "rewards/accuracies": 0.625, "rewards/chosen": 0.47410669922828674, "rewards/margins": 0.11127021163702011, "rewards/rejected": 0.36283645033836365, "step": 900 }, { "epoch": 0.4763151007589636, "grad_norm": 3030.8407607963145, "learning_rate": 3.1349030754075937e-07, "logits/chosen": 5142.9736328125, "logits/rejected": 4237.6552734375, "logps/chosen": -213.5383758544922, "logps/rejected": -190.01211547851562, "loss": 441.7674, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.39735934138298035, "rewards/margins": 0.007636462338268757, "rewards/rejected": 0.38972288370132446, "step": 910 }, { "epoch": 0.48154933263543576, "grad_norm": 2438.3420207071586, "learning_rate": 3.090608211125931e-07, "logits/chosen": 5203.4697265625, "logits/rejected": 4523.6591796875, "logps/chosen": -209.59799194335938, "logps/rejected": -197.68603515625, "loss": 421.52, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.4184856414794922, "rewards/margins": 0.05746692419052124, "rewards/rejected": 0.36101871728897095, "step": 920 }, { "epoch": 0.48678356451190785, "grad_norm": 2547.16792381473, "learning_rate": 3.0461160884994487e-07, "logits/chosen": 5516.6259765625, "logits/rejected": 4994.55078125, "logps/chosen": -227.75247192382812, "logps/rejected": -213.952392578125, "loss": 469.6991, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.4269241690635681, "rewards/margins": 0.04791535809636116, "rewards/rejected": 0.37900882959365845, "step": 930 }, { "epoch": 0.49201779638838, "grad_norm": 2854.5481374547926, "learning_rate": 3.001441567535681e-07, "logits/chosen": 6194.80322265625, "logits/rejected": 5217.76171875, "logps/chosen": -246.4164581298828, "logps/rejected": -225.7456817626953, "loss": 476.4677, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.45431065559387207, "rewards/margins": -0.05539635568857193, "rewards/rejected": 0.509706974029541, "step": 940 }, { "epoch": 0.49725202826485215, "grad_norm": 2818.5725741299366, "learning_rate": 2.956599569161724e-07, "logits/chosen": 5136.03759765625, "logits/rejected": 4069.05126953125, "logps/chosen": -197.53506469726562, "logps/rejected": -187.42420959472656, "loss": 445.4503, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.3796132206916809, "rewards/margins": -0.05161357671022415, "rewards/rejected": 0.43122678995132446, "step": 950 }, { "epoch": 0.5024862601413242, "grad_norm": 2889.166347908498, "learning_rate": 2.91160507024077e-07, "logits/chosen": 5450.57666015625, "logits/rejected": 4700.4873046875, "logps/chosen": -224.8113555908203, "logps/rejected": -205.53060913085938, "loss": 454.4613, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.3970748782157898, "rewards/margins": 0.05230867862701416, "rewards/rejected": 0.34476613998413086, "step": 960 }, { "epoch": 0.5077204920177963, "grad_norm": 3284.552726470652, "learning_rate": 2.866473098569953e-07, "logits/chosen": 5638.99609375, "logits/rejected": 4810.56494140625, "logps/chosen": -246.9467010498047, "logps/rejected": -218.79812622070312, "loss": 466.0353, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.40407705307006836, "rewards/margins": -0.02928379736840725, "rewards/rejected": 0.43336087465286255, "step": 970 }, { "epoch": 0.5129547238942685, "grad_norm": 2623.610981766376, "learning_rate": 2.8212187278611905e-07, "logits/chosen": 5360.29296875, "logits/rejected": 4762.080078125, "logps/chosen": -241.3561553955078, "logps/rejected": -224.0301055908203, "loss": 451.5908, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.4626719355583191, "rewards/margins": 0.05470385402441025, "rewards/rejected": 0.40796810388565063, "step": 980 }, { "epoch": 0.5181889557707406, "grad_norm": 2798.9691821826846, "learning_rate": 2.775857072706684e-07, "logits/chosen": 5845.1748046875, "logits/rejected": 4390.95556640625, "logps/chosen": -243.54818725585938, "logps/rejected": -186.8525390625, "loss": 451.5862, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.4639175534248352, "rewards/margins": 0.07632891833782196, "rewards/rejected": 0.38758859038352966, "step": 990 }, { "epoch": 0.5234231876472127, "grad_norm": 2635.202394275582, "learning_rate": 2.7304032835307667e-07, "logits/chosen": 6009.6083984375, "logits/rejected": 5336.86083984375, "logps/chosen": -233.775146484375, "logps/rejected": -256.08892822265625, "loss": 449.5508, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.45101770758628845, "rewards/margins": 0.05462346225976944, "rewards/rejected": 0.396394282579422, "step": 1000 }, { "epoch": 0.528657419523685, "grad_norm": 3046.3512976746365, "learning_rate": 2.6848725415297884e-07, "logits/chosen": 5911.275390625, "logits/rejected": 5214.2880859375, "logps/chosen": -261.0115661621094, "logps/rejected": -205.8301239013672, "loss": 464.0305, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.4340108335018158, "rewards/margins": 0.017716465517878532, "rewards/rejected": 0.4162944257259369, "step": 1010 }, { "epoch": 0.533891651400157, "grad_norm": 2864.526004336298, "learning_rate": 2.6392800536017183e-07, "logits/chosen": 5202.76416015625, "logits/rejected": 4944.0068359375, "logps/chosen": -247.30435180664062, "logps/rejected": -236.41311645507812, "loss": 435.8117, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.40671029686927795, "rewards/margins": 0.028489042073488235, "rewards/rejected": 0.37822121381759644, "step": 1020 }, { "epoch": 0.5391258832766291, "grad_norm": 6187.448379451401, "learning_rate": 2.59364104726716e-07, "logits/chosen": 5676.6806640625, "logits/rejected": 5011.1064453125, "logps/chosen": -246.1743927001953, "logps/rejected": -241.49044799804688, "loss": 454.4778, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.4495580792427063, "rewards/margins": 0.018314603716135025, "rewards/rejected": 0.4312434792518616, "step": 1030 }, { "epoch": 0.5443601151531012, "grad_norm": 2691.628839857216, "learning_rate": 2.547970765583491e-07, "logits/chosen": 5299.375, "logits/rejected": 4855.00146484375, "logps/chosen": -215.9862060546875, "logps/rejected": -192.43870544433594, "loss": 461.7123, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.4477197229862213, "rewards/margins": -0.008086870424449444, "rewards/rejected": 0.45580655336380005, "step": 1040 }, { "epoch": 0.5495943470295734, "grad_norm": 2495.233536183816, "learning_rate": 2.502284462053799e-07, "logits/chosen": 5878.501953125, "logits/rejected": 5847.0, "logps/chosen": -239.09317016601562, "logps/rejected": -223.63198852539062, "loss": 434.0518, "rewards/accuracies": 0.5, "rewards/chosen": 0.4332825243473053, "rewards/margins": -0.021379027515649796, "rewards/rejected": 0.4546615183353424, "step": 1050 }, { "epoch": 0.5548285789060455, "grad_norm": 2637.28084366533, "learning_rate": 2.4565973955323374e-07, "logits/chosen": 5609.7353515625, "logits/rejected": 4916.1220703125, "logps/chosen": -250.21542358398438, "logps/rejected": -205.2766876220703, "loss": 434.1141, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.415800005197525, "rewards/margins": 0.021374240517616272, "rewards/rejected": 0.39442577958106995, "step": 1060 }, { "epoch": 0.5600628107825176, "grad_norm": 3027.24933722122, "learning_rate": 2.410924825128195e-07, "logits/chosen": 5264.02099609375, "logits/rejected": 5056.28564453125, "logps/chosen": -225.8623504638672, "logps/rejected": -233.9796905517578, "loss": 455.3113, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.424748033285141, "rewards/margins": -0.012741155922412872, "rewards/rejected": 0.43748918175697327, "step": 1070 }, { "epoch": 0.5652970426589898, "grad_norm": 3316.2490439100284, "learning_rate": 2.365282005108875e-07, "logits/chosen": 5667.7626953125, "logits/rejected": 4730.36328125, "logps/chosen": -207.9048614501953, "logps/rejected": -211.5628204345703, "loss": 447.5346, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.47996577620506287, "rewards/margins": 0.09519468992948532, "rewards/rejected": 0.38477107882499695, "step": 1080 }, { "epoch": 0.5705312745354619, "grad_norm": 2826.3995184578384, "learning_rate": 2.319684179805491e-07, "logits/chosen": 5481.96240234375, "logits/rejected": 4411.9677734375, "logps/chosen": -243.62890625, "logps/rejected": -192.74151611328125, "loss": 455.4142, "rewards/accuracies": 0.5625, "rewards/chosen": 0.46998849511146545, "rewards/margins": 0.031203698366880417, "rewards/rejected": 0.43878477811813354, "step": 1090 }, { "epoch": 0.575765506411934, "grad_norm": 2747.847648919441, "learning_rate": 2.2741465785212902e-07, "logits/chosen": 5132.85595703125, "logits/rejected": 4005.84423828125, "logps/chosen": -217.897216796875, "logps/rejected": -181.64852905273438, "loss": 430.0103, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.38213029503822327, "rewards/margins": -0.016666922718286514, "rewards/rejected": 0.39879724383354187, "step": 1100 }, { "epoch": 0.5809997382884062, "grad_norm": 2323.3465928198475, "learning_rate": 2.2286844104451843e-07, "logits/chosen": 5565.04443359375, "logits/rejected": 4898.17822265625, "logps/chosen": -242.07308959960938, "logps/rejected": -231.4122314453125, "loss": 449.538, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.4311472773551941, "rewards/margins": 0.0534161813557148, "rewards/rejected": 0.3777311444282532, "step": 1110 }, { "epoch": 0.5862339701648783, "grad_norm": 2731.5099834648036, "learning_rate": 2.183312859572008e-07, "logits/chosen": 6313.4072265625, "logits/rejected": 5429.6669921875, "logps/chosen": -251.57821655273438, "logps/rejected": -215.6925048828125, "loss": 439.9762, "rewards/accuracies": 0.5, "rewards/chosen": 0.39398592710494995, "rewards/margins": -0.047116655856370926, "rewards/rejected": 0.4411025941371918, "step": 1120 }, { "epoch": 0.5914682020413504, "grad_norm": 2679.5180675952847, "learning_rate": 2.138047079631184e-07, "logits/chosen": 5296.7177734375, "logits/rejected": 5422.4072265625, "logps/chosen": -213.47006225585938, "logps/rejected": -225.93197631835938, "loss": 469.3134, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": 0.4071172773838043, "rewards/margins": -0.12094493210315704, "rewards/rejected": 0.5280622243881226, "step": 1130 }, { "epoch": 0.5967024339178225, "grad_norm": 2582.940559778803, "learning_rate": 2.0929021890255068e-07, "logits/chosen": 6125.99462890625, "logits/rejected": 5399.9873046875, "logps/chosen": -257.4599609375, "logps/rejected": -255.30001831054688, "loss": 448.1145, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.4138859212398529, "rewards/margins": -0.1462526023387909, "rewards/rejected": 0.5601385831832886, "step": 1140 }, { "epoch": 0.6019366657942947, "grad_norm": 2839.4929770409203, "learning_rate": 2.0478932657817102e-07, "logits/chosen": 5059.7060546875, "logits/rejected": 4862.5732421875, "logps/chosen": -209.98721313476562, "logps/rejected": -213.7787628173828, "loss": 421.8651, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3850982189178467, "rewards/margins": 0.013092848472297192, "rewards/rejected": 0.3720053732395172, "step": 1150 }, { "epoch": 0.6071708976707668, "grad_norm": 3064.4069962248286, "learning_rate": 2.0030353425145374e-07, "logits/chosen": 7047.43115234375, "logits/rejected": 6413.43994140625, "logps/chosen": -304.1640319824219, "logps/rejected": -276.45574951171875, "loss": 514.2635, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.6004487872123718, "rewards/margins": -0.14154250919818878, "rewards/rejected": 0.7419912815093994, "step": 1160 }, { "epoch": 0.6124051295472389, "grad_norm": 2585.4669520145776, "learning_rate": 1.9583434014059635e-07, "logits/chosen": 5760.01904296875, "logits/rejected": 5066.72900390625, "logps/chosen": -234.2002410888672, "logps/rejected": -218.1713409423828, "loss": 446.8139, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3829045593738556, "rewards/margins": -0.20982496440410614, "rewards/rejected": 0.5927294492721558, "step": 1170 }, { "epoch": 0.6176393614237111, "grad_norm": 2895.216663375429, "learning_rate": 1.9138323692012733e-07, "logits/chosen": 5071.248046875, "logits/rejected": 5016.58203125, "logps/chosen": -232.4548797607422, "logps/rejected": -214.78939819335938, "loss": 427.5768, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.4009971022605896, "rewards/margins": -0.017197366803884506, "rewards/rejected": 0.4181944727897644, "step": 1180 }, { "epoch": 0.6228735933001832, "grad_norm": 3172.1217220248027, "learning_rate": 1.8695171122236442e-07, "logits/chosen": 5186.33447265625, "logits/rejected": 5195.64892578125, "logps/chosen": -219.36703491210938, "logps/rejected": -242.1404571533203, "loss": 492.8513, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.3699229657649994, "rewards/margins": -0.017510617151856422, "rewards/rejected": 0.38743358850479126, "step": 1190 }, { "epoch": 0.6281078251766553, "grad_norm": 2914.8049221104293, "learning_rate": 1.8254124314089223e-07, "logits/chosen": 5557.251953125, "logits/rejected": 5019.79541015625, "logps/chosen": -235.5565185546875, "logps/rejected": -225.5511016845703, "loss": 466.1685, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.48438769578933716, "rewards/margins": 0.052512697875499725, "rewards/rejected": 0.4318750500679016, "step": 1200 }, { "epoch": 0.6333420570531274, "grad_norm": 2637.0032356490447, "learning_rate": 1.7815330573622205e-07, "logits/chosen": 5739.81884765625, "logits/rejected": 5681.2822265625, "logps/chosen": -233.9698028564453, "logps/rejected": -262.9213562011719, "loss": 428.7625, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.4719967842102051, "rewards/margins": 0.026335466653108597, "rewards/rejected": 0.44566136598587036, "step": 1210 }, { "epoch": 0.6385762889295996, "grad_norm": 3181.9042887438923, "learning_rate": 1.7378936454380274e-07, "logits/chosen": 5621.00830078125, "logits/rejected": 4796.08740234375, "logps/chosen": -220.7882080078125, "logps/rejected": -207.47921752929688, "loss": 446.4105, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.43960586190223694, "rewards/margins": 0.08442094922065735, "rewards/rejected": 0.3551848828792572, "step": 1220 }, { "epoch": 0.6438105208060717, "grad_norm": 2986.626560530847, "learning_rate": 1.694508770845427e-07, "logits/chosen": 6612.59521484375, "logits/rejected": 5661.8095703125, "logps/chosen": -281.0216064453125, "logps/rejected": -243.0041961669922, "loss": 471.7892, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.5507138967514038, "rewards/margins": 0.09217164665460587, "rewards/rejected": 0.4585422873497009, "step": 1230 }, { "epoch": 0.6490447526825438, "grad_norm": 2491.6715549344503, "learning_rate": 1.651392923780105e-07, "logits/chosen": 6138.0517578125, "logits/rejected": 4998.80126953125, "logps/chosen": -237.0027618408203, "logps/rejected": -196.52346801757812, "loss": 439.6602, "rewards/accuracies": 0.5625, "rewards/chosen": 0.44304361939430237, "rewards/margins": 0.019446546211838722, "rewards/rejected": 0.4235970377922058, "step": 1240 }, { "epoch": 0.654278984559016, "grad_norm": 2930.0912987575693, "learning_rate": 1.6085605045847367e-07, "logits/chosen": 5613.2529296875, "logits/rejected": 4578.0126953125, "logps/chosen": -231.54763793945312, "logps/rejected": -231.14364624023438, "loss": 444.372, "rewards/accuracies": 0.4375, "rewards/chosen": 0.41010889410972595, "rewards/margins": -0.021687136963009834, "rewards/rejected": 0.43179601430892944, "step": 1250 }, { "epoch": 0.6595132164354881, "grad_norm": 2685.020119553926, "learning_rate": 1.5660258189393944e-07, "logits/chosen": 5781.34375, "logits/rejected": 4609.04443359375, "logps/chosen": -247.92501831054688, "logps/rejected": -209.21481323242188, "loss": 470.8875, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.5003296732902527, "rewards/margins": -0.01339314877986908, "rewards/rejected": 0.5137227773666382, "step": 1260 }, { "epoch": 0.6647474483119602, "grad_norm": 3478.5401870274527, "learning_rate": 1.5238030730835577e-07, "logits/chosen": 5069.4560546875, "logits/rejected": 5332.6708984375, "logps/chosen": -198.8926544189453, "logps/rejected": -207.35888671875, "loss": 431.9828, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.39420709013938904, "rewards/margins": -0.018875379115343094, "rewards/rejected": 0.41308245062828064, "step": 1270 }, { "epoch": 0.6699816801884323, "grad_norm": 2495.084991283364, "learning_rate": 1.4819063690713564e-07, "logits/chosen": 5791.67138671875, "logits/rejected": 4803.8017578125, "logps/chosen": -239.6625213623047, "logps/rejected": -215.32882690429688, "loss": 463.378, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.38961488008499146, "rewards/margins": -0.013818919658660889, "rewards/rejected": 0.40343379974365234, "step": 1280 }, { "epoch": 0.6752159120649045, "grad_norm": 2929.168635003601, "learning_rate": 1.4403497000615883e-07, "logits/chosen": 5547.22119140625, "logits/rejected": 4953.2685546875, "logps/chosen": -271.31817626953125, "logps/rejected": -208.81826782226562, "loss": 444.6075, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.40208378434181213, "rewards/margins": -0.03740546852350235, "rewards/rejected": 0.4394892156124115, "step": 1290 }, { "epoch": 0.6804501439413766, "grad_norm": 2581.5019746906396, "learning_rate": 1.3991469456441272e-07, "logits/chosen": 5376.77197265625, "logits/rejected": 5233.3271484375, "logps/chosen": -235.42138671875, "logps/rejected": -236.4017333984375, "loss": 420.9892, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.4205804467201233, "rewards/margins": -0.030592668801546097, "rewards/rejected": 0.4511730670928955, "step": 1300 }, { "epoch": 0.6856843758178487, "grad_norm": 3160.8658484895473, "learning_rate": 1.358311867204244e-07, "logits/chosen": 4565.5341796875, "logits/rejected": 4611.97998046875, "logps/chosen": -186.4778289794922, "logps/rejected": -192.1836700439453, "loss": 444.8648, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.37066149711608887, "rewards/margins": -0.06482173502445221, "rewards/rejected": 0.43548327684402466, "step": 1310 }, { "epoch": 0.6909186076943209, "grad_norm": 2840.1793768299094, "learning_rate": 1.3178581033264216e-07, "logits/chosen": 6083.375, "logits/rejected": 5281.1484375, "logps/chosen": -248.0015106201172, "logps/rejected": -244.42868041992188, "loss": 492.9659, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.4658554196357727, "rewards/margins": 0.03714742138981819, "rewards/rejected": 0.42870792746543884, "step": 1320 }, { "epoch": 0.696152839570793, "grad_norm": 2999.3305501535006, "learning_rate": 1.2777991652391757e-07, "logits/chosen": 5277.3232421875, "logits/rejected": 4048.818359375, "logps/chosen": -233.933349609375, "logps/rejected": -182.76797485351562, "loss": 437.7016, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.4525213837623596, "rewards/margins": 0.050732992589473724, "rewards/rejected": 0.4017884135246277, "step": 1330 }, { "epoch": 0.7013870714472651, "grad_norm": 2758.037544701704, "learning_rate": 1.2381484323024178e-07, "logits/chosen": 5822.4111328125, "logits/rejected": 5114.529296875, "logps/chosen": -239.6138916015625, "logps/rejected": -212.1165771484375, "loss": 445.3217, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.416238397359848, "rewards/margins": -0.008182978257536888, "rewards/rejected": 0.42442137002944946, "step": 1340 }, { "epoch": 0.7066213033237373, "grad_norm": 2322.8858217356333, "learning_rate": 1.1989191475388516e-07, "logits/chosen": 4871.5693359375, "logits/rejected": 4576.48779296875, "logps/chosen": -184.75506591796875, "logps/rejected": -198.5352020263672, "loss": 395.6979, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.42691653966903687, "rewards/margins": -0.034974582493305206, "rewards/rejected": 0.4618911147117615, "step": 1350 }, { "epoch": 0.7118555352002094, "grad_norm": 2504.7651931239393, "learning_rate": 1.1601244132109179e-07, "logits/chosen": 4925.0966796875, "logits/rejected": 4446.34521484375, "logps/chosen": -194.90817260742188, "logps/rejected": -204.82412719726562, "loss": 465.5208, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.4748953878879547, "rewards/margins": 0.097330242395401, "rewards/rejected": 0.3775652050971985, "step": 1360 }, { "epoch": 0.7170897670766815, "grad_norm": 3082.926945072268, "learning_rate": 1.1217771864447395e-07, "logits/chosen": 5598.09228515625, "logits/rejected": 4815.2041015625, "logps/chosen": -246.0078125, "logps/rejected": -249.9725341796875, "loss": 500.4969, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.39293307065963745, "rewards/margins": -0.0007484182715415955, "rewards/rejected": 0.39368146657943726, "step": 1370 }, { "epoch": 0.7223239989531536, "grad_norm": 2722.3074395253057, "learning_rate": 1.0838902749025499e-07, "logits/chosen": 6814.609375, "logits/rejected": 5488.0498046875, "logps/chosen": -268.84417724609375, "logps/rejected": -234.26416015625, "loss": 458.2768, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5655155777931213, "rewards/margins": 0.1583983451128006, "rewards/rejected": 0.4071172773838043, "step": 1380 }, { "epoch": 0.7275582308296258, "grad_norm": 3089.651688801927, "learning_rate": 1.0464763325050358e-07, "logits/chosen": 5166.2724609375, "logits/rejected": 4742.515625, "logps/chosen": -231.9933624267578, "logps/rejected": -210.8185577392578, "loss": 428.4296, "rewards/accuracies": 0.5, "rewards/chosen": 0.49672383069992065, "rewards/margins": 0.027382072061300278, "rewards/rejected": 0.4693417549133301, "step": 1390 }, { "epoch": 0.7327924627060979, "grad_norm": 3265.9261458601773, "learning_rate": 1.0095478552050346e-07, "logits/chosen": 6062.771484375, "logits/rejected": 4171.9345703125, "logps/chosen": -269.94989013671875, "logps/rejected": -207.66567993164062, "loss": 476.396, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.4134543538093567, "rewards/margins": -0.02273373305797577, "rewards/rejected": 0.43618807196617126, "step": 1400 }, { "epoch": 0.73802669458257, "grad_norm": 2494.1100717249315, "learning_rate": 9.731171768139806e-08, "logits/chosen": 5687.302734375, "logits/rejected": 4657.4443359375, "logps/chosen": -224.1031036376953, "logps/rejected": -198.58413696289062, "loss": 443.6654, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.4446600377559662, "rewards/margins": 0.015666846185922623, "rewards/rejected": 0.42899322509765625, "step": 1410 }, { "epoch": 0.7432609264590422, "grad_norm": 2932.950028085596, "learning_rate": 9.37196464882522e-08, "logits/chosen": 5420.83837890625, "logits/rejected": 4967.8408203125, "logps/chosen": -209.77053833007812, "logps/rejected": -207.4347686767578, "loss": 450.4227, "rewards/accuracies": 0.5, "rewards/chosen": 0.43635910749435425, "rewards/margins": 0.013584541156888008, "rewards/rejected": 0.42277461290359497, "step": 1420 }, { "epoch": 0.7484951583355143, "grad_norm": 2755.115765364893, "learning_rate": 9.017977166366444e-08, "logits/chosen": 5591.40771484375, "logits/rejected": 5046.1015625, "logps/chosen": -239.6355438232422, "logps/rejected": -232.3363800048828, "loss": 399.9244, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.44893360137939453, "rewards/margins": -0.023927757516503334, "rewards/rejected": 0.47286128997802734, "step": 1430 }, { "epoch": 0.7537293902119864, "grad_norm": 2758.9501579598164, "learning_rate": 8.669327549707095e-08, "logits/chosen": 5718.99560546875, "logits/rejected": 4905.77197265625, "logps/chosen": -254.92733764648438, "logps/rejected": -217.9339141845703, "loss": 459.4898, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.49462470412254333, "rewards/margins": 0.05205591768026352, "rewards/rejected": 0.442568838596344, "step": 1440 }, { "epoch": 0.7589636220884585, "grad_norm": 2916.421053999076, "learning_rate": 8.326132244986931e-08, "logits/chosen": 5092.185546875, "logits/rejected": 4410.7255859375, "logps/chosen": -227.4456329345703, "logps/rejected": -200.5583953857422, "loss": 443.7745, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.35959136486053467, "rewards/margins": 0.013029132969677448, "rewards/rejected": 0.3465622365474701, "step": 1450 }, { "epoch": 0.7641978539649307, "grad_norm": 2500.95110656443, "learning_rate": 7.988505876649862e-08, "logits/chosen": 5265.857421875, "logits/rejected": 4087.15283203125, "logps/chosen": -229.035888671875, "logps/rejected": -220.652099609375, "loss": 447.4743, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.39383992552757263, "rewards/margins": 0.018630001693964005, "rewards/rejected": 0.37520989775657654, "step": 1460 }, { "epoch": 0.7694320858414028, "grad_norm": 3061.8379378461154, "learning_rate": 7.656561209160248e-08, "logits/chosen": 5706.8154296875, "logits/rejected": 4945.07177734375, "logps/chosen": -261.44781494140625, "logps/rejected": -214.4395294189453, "loss": 466.3214, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.42837047576904297, "rewards/margins": 0.06265218555927277, "rewards/rejected": 0.3657183051109314, "step": 1470 }, { "epoch": 0.7746663177178749, "grad_norm": 2191.9761613126184, "learning_rate": 7.330409109340562e-08, "logits/chosen": 5812.962890625, "logits/rejected": 5259.6865234375, "logps/chosen": -275.3894958496094, "logps/rejected": -240.2283935546875, "loss": 436.1938, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.4367316663265228, "rewards/margins": 0.004428995307534933, "rewards/rejected": 0.43230265378952026, "step": 1480 }, { "epoch": 0.7799005495943471, "grad_norm": 3020.3364090207338, "learning_rate": 7.010158509342681e-08, "logits/chosen": 6394.3779296875, "logits/rejected": 4694.5419921875, "logps/chosen": -254.2501983642578, "logps/rejected": -196.20724487304688, "loss": 469.8941, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.4546736776828766, "rewards/margins": -0.030942197889089584, "rewards/rejected": 0.4856158196926117, "step": 1490 }, { "epoch": 0.7851347814708192, "grad_norm": 2862.609703782646, "learning_rate": 6.695916370265527e-08, "logits/chosen": 5147.36279296875, "logits/rejected": 4634.7470703125, "logps/chosen": -220.52487182617188, "logps/rejected": -173.90330505371094, "loss": 483.0644, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.3716416358947754, "rewards/margins": -0.025338003411889076, "rewards/rejected": 0.3969796299934387, "step": 1500 }, { "epoch": 0.7903690133472913, "grad_norm": 2682.017468500409, "learning_rate": 6.387787646430853e-08, "logits/chosen": 6349.74951171875, "logits/rejected": 5803.51025390625, "logps/chosen": -253.1613311767578, "logps/rejected": -243.31069946289062, "loss": 465.5553, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.4580962657928467, "rewards/margins": 0.023849302902817726, "rewards/rejected": 0.4342469573020935, "step": 1510 }, { "epoch": 0.7956032452237635, "grad_norm": 2929.4259148387664, "learning_rate": 6.0858752503294e-08, "logits/chosen": 4991.2568359375, "logits/rejected": 4856.7724609375, "logps/chosen": -243.1151885986328, "logps/rejected": -218.1384735107422, "loss": 448.5571, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.39313045144081116, "rewards/margins": -0.008735431358218193, "rewards/rejected": 0.4018658697605133, "step": 1520 }, { "epoch": 0.8008374771002356, "grad_norm": 2334.8289467051422, "learning_rate": 5.7902800182489385e-08, "logits/chosen": 5283.90966796875, "logits/rejected": 5065.044921875, "logps/chosen": -209.5236053466797, "logps/rejected": -185.2886505126953, "loss": 438.2237, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": 0.4488741457462311, "rewards/margins": -0.02802448347210884, "rewards/rejected": 0.47689858078956604, "step": 1530 }, { "epoch": 0.8060717089767077, "grad_norm": 2835.446662936965, "learning_rate": 5.5011006765957604e-08, "logits/chosen": 6475.173828125, "logits/rejected": 5851.21044921875, "logps/chosen": -254.42550659179688, "logps/rejected": -276.20184326171875, "loss": 459.7479, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.5142590999603271, "rewards/margins": 0.06982637941837311, "rewards/rejected": 0.44443267583847046, "step": 1540 }, { "epoch": 0.8113059408531798, "grad_norm": 2645.7937944070086, "learning_rate": 5.218433808920883e-08, "logits/chosen": 5612.806640625, "logits/rejected": 5204.6416015625, "logps/chosen": -241.086669921875, "logps/rejected": -235.9331817626953, "loss": 477.5177, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.46159011125564575, "rewards/margins": 0.03268017619848251, "rewards/rejected": 0.42890992760658264, "step": 1550 }, { "epoch": 0.816540172729652, "grad_norm": 2799.218073861503, "learning_rate": 4.942373823661927e-08, "logits/chosen": 6671.35693359375, "logits/rejected": 5100.93115234375, "logps/chosen": -269.9798583984375, "logps/rejected": -219.4925079345703, "loss": 440.4742, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.49509739875793457, "rewards/margins": 0.023434173315763474, "rewards/rejected": 0.471663236618042, "step": 1560 }, { "epoch": 0.821774404606124, "grad_norm": 2778.404170104335, "learning_rate": 4.6730129226114354e-08, "logits/chosen": 5087.5205078125, "logits/rejected": 4740.5205078125, "logps/chosen": -213.2566375732422, "logps/rejected": -172.8238525390625, "loss": 456.5575, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.4261597990989685, "rewards/margins": -0.00964472908526659, "rewards/rejected": 0.4358045160770416, "step": 1570 }, { "epoch": 0.8270086364825961, "grad_norm": 2634.177798429675, "learning_rate": 4.41044107012227e-08, "logits/chosen": 6359.1689453125, "logits/rejected": 5187.23828125, "logps/chosen": -286.50201416015625, "logps/rejected": -236.1680145263672, "loss": 464.8844, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.5070956945419312, "rewards/margins": 0.03383489325642586, "rewards/rejected": 0.4732607901096344, "step": 1580 }, { "epoch": 0.8322428683590684, "grad_norm": 2799.6223555020733, "learning_rate": 4.1547459630601966e-08, "logits/chosen": 5616.50146484375, "logits/rejected": 5148.76025390625, "logps/chosen": -245.4374542236328, "logps/rejected": -224.58297729492188, "loss": 468.5303, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.42994633316993713, "rewards/margins": -0.04849852994084358, "rewards/rejected": 0.47844481468200684, "step": 1590 }, { "epoch": 0.8374771002355405, "grad_norm": 2603.023786297724, "learning_rate": 3.9060130015138857e-08, "logits/chosen": 5224.2919921875, "logits/rejected": 4763.0263671875, "logps/chosen": -228.858642578125, "logps/rejected": -206.1272735595703, "loss": 441.8306, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.532415509223938, "rewards/margins": 0.082635298371315, "rewards/rejected": 0.4497801661491394, "step": 1600 }, { "epoch": 0.8427113321120125, "grad_norm": 2952.0546467904624, "learning_rate": 3.664325260271953e-08, "logits/chosen": 5981.43505859375, "logits/rejected": 5149.3701171875, "logps/chosen": -264.82513427734375, "logps/rejected": -231.50088500976562, "loss": 476.4973, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.5294232368469238, "rewards/margins": 0.05757498741149902, "rewards/rejected": 0.4718483090400696, "step": 1610 }, { "epoch": 0.8479455639884846, "grad_norm": 2685.7294831210556, "learning_rate": 3.429763461076676e-08, "logits/chosen": 5783.91162109375, "logits/rejected": 5101.3291015625, "logps/chosen": -229.64187622070312, "logps/rejected": -208.2964324951172, "loss": 452.1323, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.657304584980011, "rewards/margins": -0.1763857752084732, "rewards/rejected": 0.8336902856826782, "step": 1620 }, { "epoch": 0.8531797958649568, "grad_norm": 2730.708924158352, "learning_rate": 3.202405945663555e-08, "logits/chosen": 5812.5908203125, "logits/rejected": 4056.528076171875, "logps/chosen": -227.04598999023438, "logps/rejected": -165.66348266601562, "loss": 433.1286, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.44985729455947876, "rewards/margins": 0.0024114579427987337, "rewards/rejected": 0.4474458694458008, "step": 1630 }, { "epoch": 0.8584140277414289, "grad_norm": 2716.197853796788, "learning_rate": 2.9823286495958556e-08, "logits/chosen": 4791.669921875, "logits/rejected": 5499.1123046875, "logps/chosen": -198.6993865966797, "logps/rejected": -247.2404327392578, "loss": 452.7315, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.4359843134880066, "rewards/margins": -0.5284264087677002, "rewards/rejected": 0.9644107818603516, "step": 1640 }, { "epoch": 0.863648259617901, "grad_norm": 2791.281847581253, "learning_rate": 2.769605076902695e-08, "logits/chosen": 6074.97802734375, "logits/rejected": 5591.9599609375, "logps/chosen": -239.0797882080078, "logps/rejected": -250.9207763671875, "loss": 429.5277, "rewards/accuracies": 0.5, "rewards/chosen": 0.5212064981460571, "rewards/margins": 0.0025348193012177944, "rewards/rejected": 0.5186716318130493, "step": 1650 }, { "epoch": 0.8688824914943732, "grad_norm": 3093.882963200942, "learning_rate": 2.5643062755293403e-08, "logits/chosen": 5405.78759765625, "logits/rejected": 4668.48828125, "logps/chosen": -237.1522674560547, "logps/rejected": -197.37451171875, "loss": 469.6285, "rewards/accuracies": 0.5, "rewards/chosen": 0.42427119612693787, "rewards/margins": 0.02963954210281372, "rewards/rejected": 0.39463168382644653, "step": 1660 }, { "epoch": 0.8741167233708453, "grad_norm": 2911.6598438916217, "learning_rate": 2.366500813607733e-08, "logits/chosen": 5930.2880859375, "logits/rejected": 4750.8203125, "logps/chosen": -233.2056427001953, "logps/rejected": -216.0753173828125, "loss": 461.4281, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.453493595123291, "rewards/margins": 0.01780077815055847, "rewards/rejected": 0.43569284677505493, "step": 1670 }, { "epoch": 0.8793509552473174, "grad_norm": 2804.7017487072826, "learning_rate": 2.176254756555329e-08, "logits/chosen": 6307.5615234375, "logits/rejected": 5713.3017578125, "logps/chosen": -275.5252685546875, "logps/rejected": -247.7924346923828, "loss": 446.325, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.4305667281150818, "rewards/margins": 0.0016602992545813322, "rewards/rejected": 0.42890650033950806, "step": 1680 }, { "epoch": 0.8845851871237895, "grad_norm": 2756.2303664573706, "learning_rate": 1.9936316450097468e-08, "logits/chosen": 5031.6337890625, "logits/rejected": 4576.9560546875, "logps/chosen": -213.4817657470703, "logps/rejected": -188.27459716796875, "loss": 429.4412, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.42264145612716675, "rewards/margins": 0.02685725688934326, "rewards/rejected": 0.3957841992378235, "step": 1690 }, { "epoch": 0.8898194190002617, "grad_norm": 2590.5879445875785, "learning_rate": 1.8186924736067477e-08, "logits/chosen": 5710.92578125, "logits/rejected": 4436.71435546875, "logps/chosen": -245.94833374023438, "logps/rejected": -221.7904815673828, "loss": 449.8979, "rewards/accuracies": 0.625, "rewards/chosen": 0.45010289549827576, "rewards/margins": 0.06335899978876114, "rewards/rejected": 0.3867438733577728, "step": 1700 }, { "epoch": 0.8950536508767338, "grad_norm": 2757.0934444677064, "learning_rate": 1.651495670608488e-08, "logits/chosen": 6424.537109375, "logits/rejected": 5109.24169921875, "logps/chosen": -250.62875366210938, "logps/rejected": -216.7704620361328, "loss": 430.7702, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.5290960073471069, "rewards/margins": 0.08976821601390839, "rewards/rejected": 0.43932777643203735, "step": 1710 }, { "epoch": 0.9002878827532059, "grad_norm": 8507.84049039918, "learning_rate": 1.4920970783889737e-08, "logits/chosen": 6169.34375, "logits/rejected": 4740.4990234375, "logps/chosen": -254.0730438232422, "logps/rejected": -234.4978485107422, "loss": 463.4699, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.4609464704990387, "rewards/margins": -0.0022157118655741215, "rewards/rejected": 0.4631621837615967, "step": 1720 }, { "epoch": 0.9055221146296781, "grad_norm": 2899.8879312514678, "learning_rate": 1.340549934783164e-08, "logits/chosen": 5876.3447265625, "logits/rejected": 5633.6728515625, "logps/chosen": -254.2863311767578, "logps/rejected": -255.55599975585938, "loss": 446.1415, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.43738800287246704, "rewards/margins": 0.03623257577419281, "rewards/rejected": 0.40115541219711304, "step": 1730 }, { "epoch": 0.9107563465061502, "grad_norm": 2759.8073515628384, "learning_rate": 1.1969048553059608e-08, "logits/chosen": 5478.740234375, "logits/rejected": 4825.607421875, "logps/chosen": -207.3783416748047, "logps/rejected": -196.50003051757812, "loss": 417.9901, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.41129404306411743, "rewards/margins": -0.001239946810528636, "rewards/rejected": 0.4125339984893799, "step": 1740 }, { "epoch": 0.9159905783826223, "grad_norm": 2928.870279384255, "learning_rate": 1.06120981624703e-08, "logits/chosen": 5248.7646484375, "logits/rejected": 5659.0908203125, "logps/chosen": -234.52590942382812, "logps/rejected": -253.9241943359375, "loss": 473.0938, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.3923454284667969, "rewards/margins": -0.08389392495155334, "rewards/rejected": 0.4762393534183502, "step": 1750 }, { "epoch": 0.9212248102590945, "grad_norm": 2633.7390537249466, "learning_rate": 9.335101386471284e-09, "logits/chosen": 6008.14599609375, "logits/rejected": 5465.85791015625, "logps/chosen": -254.5990447998047, "logps/rejected": -225.7978973388672, "loss": 429.5775, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.3928176462650299, "rewards/margins": -0.041309647262096405, "rewards/rejected": 0.4341272711753845, "step": 1760 }, { "epoch": 0.9264590421355666, "grad_norm": 2751.6746625086507, "learning_rate": 8.138484731612273e-09, "logits/chosen": 5772.14208984375, "logits/rejected": 4916.0166015625, "logps/chosen": -237.14111328125, "logps/rejected": -235.15902709960938, "loss": 449.8257, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.4163140654563904, "rewards/margins": -0.003167784307152033, "rewards/rejected": 0.4194818437099457, "step": 1770 }, { "epoch": 0.9316932740120387, "grad_norm": 2986.1836483001102, "learning_rate": 7.0226478581355e-09, "logits/chosen": 5796.9658203125, "logits/rejected": 5225.16015625, "logps/chosen": -233.0400390625, "logps/rejected": -211.41305541992188, "loss": 442.5229, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.5166589021682739, "rewards/margins": -0.05622934550046921, "rewards/rejected": 0.5728882551193237, "step": 1780 }, { "epoch": 0.9369275058885108, "grad_norm": 2588.104995999543, "learning_rate": 5.987963446492383e-09, "logits/chosen": 5863.2470703125, "logits/rejected": 5275.2890625, "logps/chosen": -230.316162109375, "logps/rejected": -208.0933074951172, "loss": 434.7167, "rewards/accuracies": 0.5, "rewards/chosen": 0.4742770791053772, "rewards/margins": 0.013745969161391258, "rewards/rejected": 0.4605311453342438, "step": 1790 }, { "epoch": 0.942161737764983, "grad_norm": 2706.6843129659187, "learning_rate": 5.0347770728713935e-09, "logits/chosen": 5820.26416015625, "logits/rejected": 4646.1689453125, "logps/chosen": -280.79437255859375, "logps/rejected": -203.54367065429688, "loss": 451.1318, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.39324313402175903, "rewards/margins": -0.029782313853502274, "rewards/rejected": 0.4230254590511322, "step": 1800 }, { "epoch": 0.9473959696414551, "grad_norm": 2907.594423226152, "learning_rate": 4.1634070937782424e-09, "logits/chosen": 5797.3984375, "logits/rejected": 5349.8115234375, "logps/chosen": -250.1112518310547, "logps/rejected": -250.5971221923828, "loss": 469.334, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.465669721364975, "rewards/margins": -0.11242341995239258, "rewards/rejected": 0.5780931711196899, "step": 1810 }, { "epoch": 0.9526302015179272, "grad_norm": 2594.3361342889198, "learning_rate": 3.3741445397075797e-09, "logits/chosen": 6125.84912109375, "logits/rejected": 5266.125, "logps/chosen": -265.88262939453125, "logps/rejected": -253.229736328125, "loss": 498.9627, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.4644504487514496, "rewards/margins": 0.023336850106716156, "rewards/rejected": 0.44111356139183044, "step": 1820 }, { "epoch": 0.9578644333943994, "grad_norm": 2694.3106547281477, "learning_rate": 2.667253017941018e-09, "logits/chosen": 6051.47802734375, "logits/rejected": 4897.32177734375, "logps/chosen": -258.3802795410156, "logps/rejected": -225.14633178710938, "loss": 475.6425, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.4497244358062744, "rewards/margins": -0.013121997937560081, "rewards/rejected": 0.46284645795822144, "step": 1830 }, { "epoch": 0.9630986652708715, "grad_norm": 2494.09151231596, "learning_rate": 2.0429686245045097e-09, "logits/chosen": 6027.98046875, "logits/rejected": 4803.41650390625, "logps/chosen": -286.62261962890625, "logps/rejected": -219.09426879882812, "loss": 477.4155, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.5301491022109985, "rewards/margins": 0.06436850130558014, "rewards/rejected": 0.4657805562019348, "step": 1840 }, { "epoch": 0.9683328971473436, "grad_norm": 2754.330659901363, "learning_rate": 1.5014998653141708e-09, "logits/chosen": 5659.52490234375, "logits/rejected": 4881.109375, "logps/chosen": -252.7493896484375, "logps/rejected": -205.0902862548828, "loss": 449.6573, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.5084835290908813, "rewards/margins": 0.027627814561128616, "rewards/rejected": 0.4808557629585266, "step": 1850 }, { "epoch": 0.9735671290238157, "grad_norm": 2974.5475606563223, "learning_rate": 1.0430275865371263e-09, "logits/chosen": 5825.0693359375, "logits/rejected": 4934.61962890625, "logps/chosen": -208.9281768798828, "logps/rejected": -208.70657348632812, "loss": 450.0896, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.5317475199699402, "rewards/margins": 0.12720224261283875, "rewards/rejected": 0.40454530715942383, "step": 1860 }, { "epoch": 0.9788013609002879, "grad_norm": 2999.4619162797794, "learning_rate": 6.677049141901314e-10, "logits/chosen": 4800.06103515625, "logits/rejected": 4752.68701171875, "logps/chosen": -204.03689575195312, "logps/rejected": -210.6210174560547, "loss": 434.911, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.3801441788673401, "rewards/margins": -0.11329132318496704, "rewards/rejected": 0.49343547224998474, "step": 1870 }, { "epoch": 0.98403559277676, "grad_norm": 2752.7633972308495, "learning_rate": 3.7565720299687077e-10, "logits/chosen": 6091.1953125, "logits/rejected": 5330.6806640625, "logps/chosen": -270.46978759765625, "logps/rejected": -219.2836456298828, "loss": 473.733, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.4513503611087799, "rewards/margins": -0.0449112243950367, "rewards/rejected": 0.4962615966796875, "step": 1880 }, { "epoch": 0.9892698246532321, "grad_norm": 2528.626552808556, "learning_rate": 1.6698199452053197e-10, "logits/chosen": 4495.45458984375, "logits/rejected": 4576.0517578125, "logps/chosen": -214.46987915039062, "logps/rejected": -207.4097137451172, "loss": 442.3405, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.38978248834609985, "rewards/margins": -0.048971425741910934, "rewards/rejected": 0.4387539029121399, "step": 1890 }, { "epoch": 0.9945040565297043, "grad_norm": 2585.4214547887354, "learning_rate": 4.174898458556009e-11, "logits/chosen": 5962.24658203125, "logits/rejected": 4326.06396484375, "logps/chosen": -230.4096221923828, "logps/rejected": -197.4384765625, "loss": 445.7363, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.42028751969337463, "rewards/margins": 0.060069866478443146, "rewards/rejected": 0.3602176308631897, "step": 1900 }, { "epoch": 0.9997382884061764, "grad_norm": 2873.788980614392, "learning_rate": 0.0, "logits/chosen": 6063.86083984375, "logits/rejected": 5068.90869140625, "logps/chosen": -257.5758361816406, "logps/rejected": -246.18759155273438, "loss": 479.5838, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.4219503402709961, "rewards/margins": 0.048766739666461945, "rewards/rejected": 0.37318360805511475, "step": 1910 }, { "epoch": 0.9997382884061764, "step": 1910, "total_flos": 0.0, "train_loss": 459.2928345845008, "train_runtime": 17326.5441, "train_samples_per_second": 3.528, "train_steps_per_second": 0.11 } ], "logging_steps": 10, "max_steps": 1910, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }