{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 100, "global_step": 312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.5625e-08, "logits/chosen": -2.6282742023468018, "logits/rejected": -2.6322691440582275, "logps/chosen": -400.73358154296875, "logps/rejected": -385.47198486328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -2.60478138923645, "logits/rejected": -2.5875940322875977, "logps/chosen": -383.8570556640625, "logps/rejected": -403.23974609375, "loss": 0.693, "rewards/accuracies": 0.4305555522441864, "rewards/chosen": 0.0001632043713470921, "rewards/margins": -6.321006367215887e-05, "rewards/rejected": 0.0002264144568471238, "step": 10 }, { "epoch": 0.06, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.618523359298706, "logits/rejected": -2.6062684059143066, "logps/chosen": -400.2876892089844, "logps/rejected": -425.55078125, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.000641118735074997, "rewards/margins": -7.16630820534192e-05, "rewards/rejected": 0.0007127817953005433, "step": 20 }, { "epoch": 0.1, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -2.552297830581665, "logits/rejected": -2.536328077316284, "logps/chosen": -371.4658508300781, "logps/rejected": -386.770751953125, "loss": 0.6926, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.004528197459876537, "rewards/margins": 0.001459120074287057, "rewards/rejected": 0.0030690771527588367, "step": 30 }, { "epoch": 0.13, "learning_rate": 4.989935734988097e-07, "logits/chosen": -2.591301441192627, "logits/rejected": -2.6019530296325684, "logps/chosen": -371.7442321777344, "logps/rejected": -381.6260986328125, "loss": 0.6914, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0025075082667171955, "rewards/margins": 0.0020256205461919308, "rewards/rejected": 0.00048188763321377337, "step": 40 }, { "epoch": 0.16, "learning_rate": 4.949188496058089e-07, "logits/chosen": -2.6773521900177, "logits/rejected": -2.6721460819244385, "logps/chosen": -387.83477783203125, "logps/rejected": -402.6687927246094, "loss": 0.688, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.011017683893442154, "rewards/margins": 0.013887738808989525, "rewards/rejected": -0.002870055614039302, "step": 50 }, { "epoch": 0.19, "learning_rate": 4.877641290737883e-07, "logits/chosen": -2.7053120136260986, "logits/rejected": -2.7033491134643555, "logps/chosen": -391.2000427246094, "logps/rejected": -417.63751220703125, "loss": 0.6902, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0010398384183645248, "rewards/margins": 0.009658296592533588, "rewards/rejected": -0.008618457242846489, "step": 60 }, { "epoch": 0.22, "learning_rate": 4.776193866647039e-07, "logits/chosen": -2.7314000129699707, "logits/rejected": -2.742297649383545, "logps/chosen": -418.1868591308594, "logps/rejected": -433.50653076171875, "loss": 0.6843, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.015087150037288666, "rewards/margins": 0.02047603204846382, "rewards/rejected": -0.03556318208575249, "step": 70 }, { "epoch": 0.26, "learning_rate": 4.646121984004665e-07, "logits/chosen": -2.7759556770324707, "logits/rejected": -2.7733349800109863, "logps/chosen": -398.1673583984375, "logps/rejected": -409.0989685058594, "loss": 0.6819, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.03768505901098251, "rewards/margins": 0.025433484464883804, "rewards/rejected": -0.06311853975057602, "step": 80 }, { "epoch": 0.29, "learning_rate": 4.489061372204452e-07, "logits/chosen": -2.828214645385742, "logits/rejected": -2.831789255142212, "logps/chosen": -423.423828125, "logps/rejected": -433.41729736328125, "loss": 0.6797, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.053007822483778, "rewards/margins": 0.03302832692861557, "rewards/rejected": -0.08603614568710327, "step": 90 }, { "epoch": 0.32, "learning_rate": 4.3069871595684787e-07, "logits/chosen": -2.896029233932495, "logits/rejected": -2.893444538116455, "logps/chosen": -397.0558166503906, "logps/rejected": -403.83636474609375, "loss": 0.6786, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.04668121784925461, "rewards/margins": 0.03920525312423706, "rewards/rejected": -0.08588646352291107, "step": 100 }, { "epoch": 0.32, "eval_logits/chosen": -2.885509967803955, "eval_logits/rejected": -2.878624677658081, "eval_logps/chosen": -386.8027038574219, "eval_logps/rejected": -400.90277099609375, "eval_loss": 0.6868489980697632, "eval_rewards/accuracies": 0.5546875, "eval_rewards/chosen": -0.0868735983967781, "eval_rewards/margins": 0.014578516595065594, "eval_rewards/rejected": -0.10145211219787598, "eval_runtime": 137.4322, "eval_samples_per_second": 7.276, "eval_steps_per_second": 0.233, "step": 100 }, { "epoch": 0.35, "learning_rate": 4.10218903496256e-07, "logits/chosen": -2.863513469696045, "logits/rejected": -2.8547675609588623, "logps/chosen": -425.29986572265625, "logps/rejected": -426.5914611816406, "loss": 0.6811, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09437432140111923, "rewards/margins": 0.027984386309981346, "rewards/rejected": -0.12235872447490692, "step": 110 }, { "epoch": 0.38, "learning_rate": 3.877242453630256e-07, "logits/chosen": -2.824404001235962, "logits/rejected": -2.8097667694091797, "logps/chosen": -416.0079650878906, "logps/rejected": -433.6048889160156, "loss": 0.6758, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.09664727002382278, "rewards/margins": 0.040948256850242615, "rewards/rejected": -0.137595534324646, "step": 120 }, { "epoch": 0.42, "learning_rate": 3.634976249348867e-07, "logits/chosen": -2.8867764472961426, "logits/rejected": -2.891484498977661, "logps/chosen": -395.841552734375, "logps/rejected": -422.6394958496094, "loss": 0.6765, "rewards/accuracies": 0.59375, "rewards/chosen": -0.11936166137456894, "rewards/margins": 0.03277095407247543, "rewards/rejected": -0.15213260054588318, "step": 130 }, { "epoch": 0.45, "learning_rate": 3.378437060203357e-07, "logits/chosen": -2.8144431114196777, "logits/rejected": -2.7991340160369873, "logps/chosen": -436.4762268066406, "logps/rejected": -477.21624755859375, "loss": 0.6759, "rewards/accuracies": 0.5625, "rewards/chosen": -0.15145839750766754, "rewards/margins": 0.03745008260011673, "rewards/rejected": -0.18890848755836487, "step": 140 }, { "epoch": 0.48, "learning_rate": 3.110851015344735e-07, "logits/chosen": -2.962568759918213, "logits/rejected": -2.957019090652466, "logps/chosen": -405.751708984375, "logps/rejected": -443.404296875, "loss": 0.6712, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.12555965781211853, "rewards/margins": 0.05261915922164917, "rewards/rejected": -0.1781788021326065, "step": 150 }, { "epoch": 0.51, "learning_rate": 2.8355831645441387e-07, "logits/chosen": -2.9129347801208496, "logits/rejected": -2.9059901237487793, "logps/chosen": -415.2442321777344, "logps/rejected": -437.210693359375, "loss": 0.6699, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.17596140503883362, "rewards/margins": 0.05255354568362236, "rewards/rejected": -0.22851495444774628, "step": 160 }, { "epoch": 0.54, "learning_rate": 2.5560951607395126e-07, "logits/chosen": -2.997249126434326, "logits/rejected": -3.0022037029266357, "logps/chosen": -408.9958801269531, "logps/rejected": -433.72076416015625, "loss": 0.6712, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.16367743909358978, "rewards/margins": 0.07268272340297699, "rewards/rejected": -0.23636016249656677, "step": 170 }, { "epoch": 0.58, "learning_rate": 2.2759017277414164e-07, "logits/chosen": -3.01900315284729, "logits/rejected": -3.011901378631592, "logps/chosen": -409.57940673828125, "logps/rejected": -429.5187072753906, "loss": 0.6709, "rewards/accuracies": 0.625, "rewards/chosen": -0.11230169236660004, "rewards/margins": 0.061531912535429, "rewards/rejected": -0.17383362352848053, "step": 180 }, { "epoch": 0.61, "learning_rate": 1.998526460541818e-07, "logits/chosen": -3.0185275077819824, "logits/rejected": -2.9947714805603027, "logps/chosen": -423.85589599609375, "logps/rejected": -451.7948303222656, "loss": 0.6638, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.16302022337913513, "rewards/margins": 0.05766066908836365, "rewards/rejected": -0.22068090736865997, "step": 190 }, { "epoch": 0.64, "learning_rate": 1.7274575140626315e-07, "logits/chosen": -3.038890838623047, "logits/rejected": -3.028480052947998, "logps/chosen": -393.0187072753906, "logps/rejected": -410.3246154785156, "loss": 0.6615, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.19790220260620117, "rewards/margins": 0.04545364901423454, "rewards/rejected": -0.24335582554340363, "step": 200 }, { "epoch": 0.64, "eval_logits/chosen": -3.067202568054199, "eval_logits/rejected": -3.0606539249420166, "eval_logps/chosen": -396.62066650390625, "eval_logps/rejected": -412.20208740234375, "eval_loss": 0.6828470826148987, "eval_rewards/accuracies": 0.59375, "eval_rewards/chosen": -0.1850530207157135, "eval_rewards/margins": 0.029392333701252937, "eval_rewards/rejected": -0.2144453525543213, "eval_runtime": 136.2085, "eval_samples_per_second": 7.342, "eval_steps_per_second": 0.235, "step": 200 }, { "epoch": 0.67, "learning_rate": 1.4661037375836987e-07, "logits/chosen": -3.0557608604431152, "logits/rejected": -3.044783353805542, "logps/chosen": -411.9776916503906, "logps/rejected": -446.8324279785156, "loss": 0.6619, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.1823655068874359, "rewards/margins": 0.09753072261810303, "rewards/rejected": -0.27989625930786133, "step": 210 }, { "epoch": 0.7, "learning_rate": 1.2177518064852348e-07, "logits/chosen": -3.064290761947632, "logits/rejected": -3.0677173137664795, "logps/chosen": -410.56231689453125, "logps/rejected": -445.7730407714844, "loss": 0.6571, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.161437526345253, "rewards/margins": 0.10264303535223007, "rewards/rejected": -0.26408058404922485, "step": 220 }, { "epoch": 0.74, "learning_rate": 9.855248903979505e-08, "logits/chosen": -3.098358392715454, "logits/rejected": -3.089136838912964, "logps/chosen": -410.40423583984375, "logps/rejected": -428.3938903808594, "loss": 0.6646, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.13886065781116486, "rewards/margins": 0.08097346127033234, "rewards/rejected": -0.2198341190814972, "step": 230 }, { "epoch": 0.77, "learning_rate": 7.723433775328384e-08, "logits/chosen": -3.098928928375244, "logits/rejected": -3.0982117652893066, "logps/chosen": -399.87091064453125, "logps/rejected": -431.13653564453125, "loss": 0.6548, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.18533360958099365, "rewards/margins": 0.10331207513809204, "rewards/rejected": -0.2886456847190857, "step": 240 }, { "epoch": 0.8, "learning_rate": 5.808881491049722e-08, "logits/chosen": -3.0935721397399902, "logits/rejected": -3.0877461433410645, "logps/chosen": -394.9942321777344, "logps/rejected": -434.58526611328125, "loss": 0.6504, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.19627241790294647, "rewards/margins": 0.14824633300304413, "rewards/rejected": -0.3445187509059906, "step": 250 }, { "epoch": 0.83, "learning_rate": 4.1356686569674335e-08, "logits/chosen": -3.061063051223755, "logits/rejected": -3.0640881061553955, "logps/chosen": -450.4789123535156, "logps/rejected": -484.6214904785156, "loss": 0.6588, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.2441820651292801, "rewards/margins": 0.12522391974925995, "rewards/rejected": -0.36940592527389526, "step": 260 }, { "epoch": 0.86, "learning_rate": 2.724836895290805e-08, "logits/chosen": -3.136080265045166, "logits/rejected": -3.135943651199341, "logps/chosen": -392.79010009765625, "logps/rejected": -411.1092224121094, "loss": 0.6535, "rewards/accuracies": 0.625, "rewards/chosen": -0.17221376299858093, "rewards/margins": 0.06714353710412979, "rewards/rejected": -0.23935727775096893, "step": 270 }, { "epoch": 0.9, "learning_rate": 1.5941282340065697e-08, "logits/chosen": -3.0997986793518066, "logits/rejected": -3.0961849689483643, "logps/chosen": -427.6429138183594, "logps/rejected": -467.4734802246094, "loss": 0.6493, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2281384915113449, "rewards/margins": 0.09002666175365448, "rewards/rejected": -0.318165123462677, "step": 280 }, { "epoch": 0.93, "learning_rate": 7.577619905828281e-09, "logits/chosen": -3.1325037479400635, "logits/rejected": -3.138279438018799, "logps/chosen": -394.94561767578125, "logps/rejected": -414.731201171875, "loss": 0.6651, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.2411809265613556, "rewards/margins": 0.07669158279895782, "rewards/rejected": -0.31787246465682983, "step": 290 }, { "epoch": 0.96, "learning_rate": 2.2625595580163247e-09, "logits/chosen": -3.1289730072021484, "logits/rejected": -3.099738597869873, "logps/chosen": -440.1055603027344, "logps/rejected": -485.18463134765625, "loss": 0.6539, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1983877271413803, "rewards/margins": 0.13716521859169006, "rewards/rejected": -0.3355529308319092, "step": 300 }, { "epoch": 0.96, "eval_logits/chosen": -3.170891523361206, "eval_logits/rejected": -3.164473295211792, "eval_logps/chosen": -401.33953857421875, "eval_logps/rejected": -417.6891784667969, "eval_loss": 0.6821330785751343, "eval_rewards/accuracies": 0.60546875, "eval_rewards/chosen": -0.23224163055419922, "eval_rewards/margins": 0.03707445412874222, "eval_rewards/rejected": -0.2693161070346832, "eval_runtime": 136.3294, "eval_samples_per_second": 7.335, "eval_steps_per_second": 0.235, "step": 300 }, { "epoch": 0.99, "learning_rate": 6.294126437336733e-11, "logits/chosen": -3.1333985328674316, "logits/rejected": -3.1103413105010986, "logps/chosen": -421.52532958984375, "logps/rejected": -455.3125, "loss": 0.6618, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.2057611495256424, "rewards/margins": 0.10267385095357895, "rewards/rejected": -0.30843502283096313, "step": 310 }, { "epoch": 1.0, "step": 312, "total_flos": 0.0, "train_loss": 0.6714850996549313, "train_runtime": 5456.9379, "train_samples_per_second": 3.664, "train_steps_per_second": 0.057 } ], "logging_steps": 10, "max_steps": 312, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }