{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9969834087481146, "eval_steps": 100, "global_step": 662, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0030165912518853697, "grad_norm": 237.0, "learning_rate": 7.462686567164179e-09, "logits/chosen": 373.7113952636719, "logits/rejected": 314.0401306152344, "logps/chosen": -3.7142391204833984, "logps/rejected": -3.6281590461730957, "loss": 100.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.030165912518853696, "grad_norm": 113.5, "learning_rate": 7.462686567164178e-08, "logits/chosen": 357.4179992675781, "logits/rejected": 339.513916015625, "logps/chosen": -3.1980221271514893, "logps/rejected": -3.2019872665405273, "loss": 99.9521, "rewards/accuracies": 0.5069444179534912, "rewards/chosen": 0.0002816318301483989, "rewards/margins": 0.000606835528742522, "rewards/rejected": -0.00032520375680178404, "step": 10 }, { "epoch": 0.06033182503770739, "grad_norm": 142.0, "learning_rate": 1.4925373134328355e-07, "logits/chosen": 375.2931823730469, "logits/rejected": 341.9668273925781, "logps/chosen": -3.4338252544403076, "logps/rejected": -3.202441453933716, "loss": 100.1094, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0006929364753887057, "rewards/margins": -0.0008935144869610667, "rewards/rejected": 0.00020057809888385236, "step": 20 }, { "epoch": 0.09049773755656108, "grad_norm": 105.5, "learning_rate": 2.2388059701492537e-07, "logits/chosen": 367.0047302246094, "logits/rejected": 345.61358642578125, "logps/chosen": -3.1884350776672363, "logps/rejected": -3.089536666870117, "loss": 99.8614, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0009225490503013134, "rewards/margins": 0.0005482577835209668, "rewards/rejected": 0.0003742911503650248, "step": 30 }, { "epoch": 0.12066365007541478, "grad_norm": 212.0, "learning_rate": 2.985074626865671e-07, "logits/chosen": 375.55523681640625, "logits/rejected": 343.5054931640625, "logps/chosen": -3.3171133995056152, "logps/rejected": -3.01216459274292, "loss": 99.8979, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0019296016544103622, "rewards/margins": 0.00038303257315419614, "rewards/rejected": 0.0015465689357370138, "step": 40 }, { "epoch": 0.15082956259426847, "grad_norm": 171.0, "learning_rate": 3.7313432835820895e-07, "logits/chosen": 362.37359619140625, "logits/rejected": 326.4159240722656, "logps/chosen": -3.03434681892395, "logps/rejected": -3.188572645187378, "loss": 99.8304, "rewards/accuracies": 0.5, "rewards/chosen": 0.004757150541990995, "rewards/margins": -0.00031133147422224283, "rewards/rejected": 0.005068481899797916, "step": 50 }, { "epoch": 0.18099547511312217, "grad_norm": 92.0, "learning_rate": 4.4776119402985074e-07, "logits/chosen": 352.8628845214844, "logits/rejected": 327.94586181640625, "logps/chosen": -2.9519481658935547, "logps/rejected": -3.0041961669921875, "loss": 99.6399, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.009530991315841675, "rewards/margins": 0.001031916355714202, "rewards/rejected": 0.008499075658619404, "step": 60 }, { "epoch": 0.21116138763197587, "grad_norm": 103.0, "learning_rate": 4.999686376024363e-07, "logits/chosen": 357.00048828125, "logits/rejected": 333.3026123046875, "logps/chosen": -2.858609676361084, "logps/rejected": -2.8480465412139893, "loss": 99.3579, "rewards/accuracies": 0.53125, "rewards/chosen": 0.01719777286052704, "rewards/margins": 0.0022748487535864115, "rewards/rejected": 0.01492292433977127, "step": 70 }, { "epoch": 0.24132730015082957, "grad_norm": 106.5, "learning_rate": 4.994113027425108e-07, "logits/chosen": 379.38946533203125, "logits/rejected": 339.73944091796875, "logps/chosen": -2.7288854122161865, "logps/rejected": -2.6808323860168457, "loss": 98.8914, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.025090601295232773, "rewards/margins": 0.005764222703874111, "rewards/rejected": 0.019326379522681236, "step": 80 }, { "epoch": 0.27149321266968324, "grad_norm": 112.0, "learning_rate": 4.981588138481958e-07, "logits/chosen": 376.42950439453125, "logits/rejected": 345.72589111328125, "logps/chosen": -2.639547824859619, "logps/rejected": -2.540241241455078, "loss": 98.8622, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.03617015480995178, "rewards/margins": 0.011906561441719532, "rewards/rejected": 0.024263592436909676, "step": 90 }, { "epoch": 0.30165912518853694, "grad_norm": 109.0, "learning_rate": 4.962146618306821e-07, "logits/chosen": 381.8599853515625, "logits/rejected": 347.7571716308594, "logps/chosen": -2.4746594429016113, "logps/rejected": -2.393070697784424, "loss": 99.2543, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.04532874748110771, "rewards/margins": 0.010249687358736992, "rewards/rejected": 0.03507906198501587, "step": 100 }, { "epoch": 0.30165912518853694, "eval_logits/chosen": 418.0560302734375, "eval_logits/rejected": 390.85260009765625, "eval_logps/chosen": -2.36246657371521, "eval_logps/rejected": -2.362391233444214, "eval_loss": 98.51091003417969, "eval_rewards/accuracies": 0.5821917653083801, "eval_rewards/chosen": 0.040737785398960114, "eval_rewards/margins": 0.005313507281243801, "eval_rewards/rejected": 0.03542427718639374, "eval_runtime": 99.1497, "eval_samples_per_second": 46.959, "eval_steps_per_second": 1.473, "step": 100 }, { "epoch": 0.33182503770739064, "grad_norm": 94.5, "learning_rate": 4.935842653903397e-07, "logits/chosen": 363.9966735839844, "logits/rejected": 347.78033447265625, "logps/chosen": -2.4134836196899414, "logps/rejected": -2.3732988834381104, "loss": 98.8696, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.04237458482384682, "rewards/margins": 0.005867284722626209, "rewards/rejected": 0.03650730103254318, "step": 110 }, { "epoch": 0.36199095022624433, "grad_norm": 109.0, "learning_rate": 4.902749559138277e-07, "logits/chosen": 381.50604248046875, "logits/rejected": 344.4244384765625, "logps/chosen": -2.260192394256592, "logps/rejected": -2.3423144817352295, "loss": 99.2348, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.042268361896276474, "rewards/margins": 0.004213915206491947, "rewards/rejected": 0.0380544476211071, "step": 120 }, { "epoch": 0.39215686274509803, "grad_norm": 111.5, "learning_rate": 4.862959570402049e-07, "logits/chosen": 355.4747314453125, "logits/rejected": 347.9737854003906, "logps/chosen": -2.242130756378174, "logps/rejected": -2.236783981323242, "loss": 99.0442, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.03943895921111107, "rewards/margins": 0.00034976023016497493, "rewards/rejected": 0.039089202880859375, "step": 130 }, { "epoch": 0.42232277526395173, "grad_norm": 100.5, "learning_rate": 4.816583589529929e-07, "logits/chosen": 377.9671630859375, "logits/rejected": 330.95989990234375, "logps/chosen": -2.2830498218536377, "logps/rejected": -2.2647957801818848, "loss": 98.1403, "rewards/accuracies": 0.625, "rewards/chosen": 0.055894386023283005, "rewards/margins": 0.010847574099898338, "rewards/rejected": 0.045046813786029816, "step": 140 }, { "epoch": 0.45248868778280543, "grad_norm": 93.0, "learning_rate": 4.7637508746984616e-07, "logits/chosen": 348.13671875, "logits/rejected": 340.65460205078125, "logps/chosen": -2.119765281677246, "logps/rejected": -2.2865583896636963, "loss": 98.2056, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.05023769661784172, "rewards/margins": 0.0025197002105414867, "rewards/rejected": 0.04771799594163895, "step": 150 }, { "epoch": 0.48265460030165913, "grad_norm": 94.5, "learning_rate": 4.7046086801597966e-07, "logits/chosen": 354.6838684082031, "logits/rejected": 344.14971923828125, "logps/chosen": -2.1116466522216797, "logps/rejected": -2.1644375324249268, "loss": 98.8372, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.047597456723451614, "rewards/margins": -0.0014352608704939485, "rewards/rejected": 0.04903271794319153, "step": 160 }, { "epoch": 0.5128205128205128, "grad_norm": 84.0, "learning_rate": 4.639321845817675e-07, "logits/chosen": 367.41046142578125, "logits/rejected": 350.5602111816406, "logps/chosen": -2.1424355506896973, "logps/rejected": -2.1355576515197754, "loss": 99.3001, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.048554591834545135, "rewards/margins": 0.00010763984028017148, "rewards/rejected": 0.048446957021951675, "step": 170 }, { "epoch": 0.5429864253393665, "grad_norm": 103.5, "learning_rate": 4.568072337789056e-07, "logits/chosen": 355.62255859375, "logits/rejected": 353.52374267578125, "logps/chosen": -2.1198911666870117, "logps/rejected": -2.168549060821533, "loss": 99.3479, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.05148975923657417, "rewards/margins": 0.001858557341620326, "rewards/rejected": 0.04963120073080063, "step": 180 }, { "epoch": 0.5731523378582202, "grad_norm": 120.0, "learning_rate": 4.4910587412319077e-07, "logits/chosen": 360.10418701171875, "logits/rejected": 338.6171875, "logps/chosen": -2.1945066452026367, "logps/rejected": -2.278280735015869, "loss": 98.6843, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.050669699907302856, "rewards/margins": 0.004689137917011976, "rewards/rejected": 0.04598056524991989, "step": 190 }, { "epoch": 0.6033182503770739, "grad_norm": 150.0, "learning_rate": 4.408495706852757e-07, "logits/chosen": 367.1837158203125, "logits/rejected": 335.16845703125, "logps/chosen": -2.285163402557373, "logps/rejected": -2.4447309970855713, "loss": 98.8709, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.04992702230811119, "rewards/margins": 0.005152065306901932, "rewards/rejected": 0.04477496072649956, "step": 200 }, { "epoch": 0.6033182503770739, "eval_logits/chosen": 415.9554748535156, "eval_logits/rejected": 388.3780822753906, "eval_logps/chosen": -2.315298318862915, "eval_logps/rejected": -2.3358802795410156, "eval_loss": 98.0234603881836, "eval_rewards/accuracies": 0.5787671208381653, "eval_rewards/chosen": 0.04309620335698128, "eval_rewards/margins": 0.006346376612782478, "eval_rewards/rejected": 0.03674982488155365, "eval_runtime": 96.1345, "eval_samples_per_second": 48.432, "eval_steps_per_second": 1.519, "step": 200 }, { "epoch": 0.6334841628959276, "grad_norm": 162.0, "learning_rate": 4.3206133526366716e-07, "logits/chosen": 381.5917053222656, "logits/rejected": 350.63568115234375, "logps/chosen": -2.362278699874878, "logps/rejected": -2.354440927505493, "loss": 97.5361, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.04696507379412651, "rewards/margins": 0.01031709648668766, "rewards/rejected": 0.0366479754447937, "step": 210 }, { "epoch": 0.6636500754147813, "grad_norm": 88.0, "learning_rate": 4.227656622467162e-07, "logits/chosen": 363.15777587890625, "logits/rejected": 331.46624755859375, "logps/chosen": -2.2479934692382812, "logps/rejected": -2.419908046722412, "loss": 98.4741, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.04176827520132065, "rewards/margins": 0.0058479527942836285, "rewards/rejected": 0.03592032194137573, "step": 220 }, { "epoch": 0.693815987933635, "grad_norm": 114.0, "learning_rate": 4.129884603423642e-07, "logits/chosen": 359.25927734375, "logits/rejected": 348.3480529785156, "logps/chosen": -2.296457529067993, "logps/rejected": -2.342721462249756, "loss": 98.0148, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.04244069382548332, "rewards/margins": 0.0055845663882792, "rewards/rejected": 0.03685613349080086, "step": 230 }, { "epoch": 0.7239819004524887, "grad_norm": 166.0, "learning_rate": 4.0275698036592786e-07, "logits/chosen": 349.7359619140625, "logits/rejected": 340.6862487792969, "logps/chosen": -2.2498726844787598, "logps/rejected": -2.308311939239502, "loss": 98.0968, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.0436752513051033, "rewards/margins": 0.0003684187831822783, "rewards/rejected": 0.04330682009458542, "step": 240 }, { "epoch": 0.7541478129713424, "grad_norm": 115.5, "learning_rate": 3.920997392871882e-07, "logits/chosen": 366.28228759765625, "logits/rejected": 329.5648193359375, "logps/chosen": -2.187605619430542, "logps/rejected": -2.240809679031372, "loss": 98.3289, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.05280689522624016, "rewards/margins": 0.007444704882800579, "rewards/rejected": 0.045362185686826706, "step": 250 }, { "epoch": 0.7843137254901961, "grad_norm": 161.0, "learning_rate": 3.8104644074848177e-07, "logits/chosen": 367.5448303222656, "logits/rejected": 359.0502014160156, "logps/chosen": -2.094212293624878, "logps/rejected": -2.2239794731140137, "loss": 97.8614, "rewards/accuracies": 0.53125, "rewards/chosen": 0.058206796646118164, "rewards/margins": 0.00784910749644041, "rewards/rejected": 0.05035768076777458, "step": 260 }, { "epoch": 0.8144796380090498, "grad_norm": 132.0, "learning_rate": 3.696278922753216e-07, "logits/chosen": 354.0914611816406, "logits/rejected": 350.6158447265625, "logps/chosen": -2.169219493865967, "logps/rejected": -2.2195897102355957, "loss": 98.4471, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.046092383563518524, "rewards/margins": -0.0004284932219889015, "rewards/rejected": 0.04652087762951851, "step": 270 }, { "epoch": 0.8446455505279035, "grad_norm": 113.5, "learning_rate": 3.5787591941029836e-07, "logits/chosen": 353.6844482421875, "logits/rejected": 332.7743835449219, "logps/chosen": -2.208927869796753, "logps/rejected": -2.241152286529541, "loss": 98.0145, "rewards/accuracies": 0.5625, "rewards/chosen": 0.04912891611456871, "rewards/margins": 0.007012406829744577, "rewards/rejected": 0.04211651161313057, "step": 280 }, { "epoch": 0.8748114630467572, "grad_norm": 99.0, "learning_rate": 3.4582327700958453e-07, "logits/chosen": 358.00921630859375, "logits/rejected": 338.9405822753906, "logps/chosen": -2.2120420932769775, "logps/rejected": -2.275679111480713, "loss": 98.1666, "rewards/accuracies": 0.5625, "rewards/chosen": 0.048364873975515366, "rewards/margins": 0.007225348148494959, "rewards/rejected": 0.041139524430036545, "step": 290 }, { "epoch": 0.9049773755656109, "grad_norm": 177.0, "learning_rate": 3.3350355794927597e-07, "logits/chosen": 355.8531494140625, "logits/rejected": 348.1767578125, "logps/chosen": -2.248812198638916, "logps/rejected": -2.373812198638916, "loss": 97.9389, "rewards/accuracies": 0.5625, "rewards/chosen": 0.04782190918922424, "rewards/margins": 0.0014587871264666319, "rewards/rejected": 0.04636312276124954, "step": 300 }, { "epoch": 0.9049773755656109, "eval_logits/chosen": 414.2632751464844, "eval_logits/rejected": 386.40850830078125, "eval_logps/chosen": -2.2581100463867188, "eval_logps/rejected": -2.3082242012023926, "eval_loss": 97.61585998535156, "eval_rewards/accuracies": 0.5958904027938843, "eval_rewards/chosen": 0.0459556020796299, "eval_rewards/margins": 0.007822984829545021, "eval_rewards/rejected": 0.038132619112730026, "eval_runtime": 96.7019, "eval_samples_per_second": 48.148, "eval_steps_per_second": 1.51, "step": 300 }, { "epoch": 0.9351432880844646, "grad_norm": 175.0, "learning_rate": 3.209510994960208e-07, "logits/chosen": 356.83721923828125, "logits/rejected": 340.94195556640625, "logps/chosen": -2.3349146842956543, "logps/rejected": -2.351058006286621, "loss": 97.2842, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.04642525687813759, "rewards/margins": 0.008910289034247398, "rewards/rejected": 0.03751496225595474, "step": 310 }, { "epoch": 0.9653092006033183, "grad_norm": 159.0, "learning_rate": 3.082008876028986e-07, "logits/chosen": 363.7893981933594, "logits/rejected": 325.7989501953125, "logps/chosen": -2.269832134246826, "logps/rejected": -2.3510982990264893, "loss": 97.3576, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.04809769243001938, "rewards/margins": 0.013297428376972675, "rewards/rejected": 0.03480026498436928, "step": 320 }, { "epoch": 0.995475113122172, "grad_norm": 191.0, "learning_rate": 2.952884593972944e-07, "logits/chosen": 365.5209045410156, "logits/rejected": 338.1622009277344, "logps/chosen": -2.251650094985962, "logps/rejected": -2.3790125846862793, "loss": 97.8277, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.053557444363832474, "rewards/margins": 0.00977501180022955, "rewards/rejected": 0.04378242418169975, "step": 330 }, { "epoch": 1.0256410256410255, "grad_norm": 228.0, "learning_rate": 2.822498041325508e-07, "logits/chosen": 361.9426574707031, "logits/rejected": 331.2440490722656, "logps/chosen": -2.251028537750244, "logps/rejected": -2.407641887664795, "loss": 97.2834, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.05633380264043808, "rewards/margins": 0.014148990623652935, "rewards/rejected": 0.04218481108546257, "step": 340 }, { "epoch": 1.0558069381598794, "grad_norm": 113.0, "learning_rate": 2.6912126287946387e-07, "logits/chosen": 360.66326904296875, "logits/rejected": 353.2472839355469, "logps/chosen": -2.278568744659424, "logps/rejected": -2.3496108055114746, "loss": 97.7777, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.04154600575566292, "rewards/margins": 0.00018900888971984386, "rewards/rejected": 0.04135699197649956, "step": 350 }, { "epoch": 1.085972850678733, "grad_norm": 87.0, "learning_rate": 2.5593942723720076e-07, "logits/chosen": 353.33563232421875, "logits/rejected": 335.6075134277344, "logps/chosen": -2.326427936553955, "logps/rejected": -2.2644736766815186, "loss": 98.5909, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.050872981548309326, "rewards/margins": 0.008244507014751434, "rewards/rejected": 0.04262847453355789, "step": 360 }, { "epoch": 1.1161387631975868, "grad_norm": 150.0, "learning_rate": 2.427410373459502e-07, "logits/chosen": 360.3174133300781, "logits/rejected": 337.7137451171875, "logps/chosen": -2.3095028400421143, "logps/rejected": -2.3610920906066895, "loss": 97.0909, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.045982178300619125, "rewards/margins": 0.01000190433114767, "rewards/rejected": 0.03598027303814888, "step": 370 }, { "epoch": 1.1463046757164403, "grad_norm": 174.0, "learning_rate": 2.2956287948556162e-07, "logits/chosen": 354.63238525390625, "logits/rejected": 343.02880859375, "logps/chosen": -2.2987804412841797, "logps/rejected": -2.3496735095977783, "loss": 97.7205, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.041914135217666626, "rewards/margins": 0.004762929864227772, "rewards/rejected": 0.03715119883418083, "step": 380 }, { "epoch": 1.1764705882352942, "grad_norm": 158.0, "learning_rate": 2.164416835455862e-07, "logits/chosen": 356.2020568847656, "logits/rejected": 331.1781921386719, "logps/chosen": -2.3598244190216064, "logps/rejected": -2.406748056411743, "loss": 97.8897, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.05264422297477722, "rewards/margins": 0.006141997873783112, "rewards/rejected": 0.04650222510099411, "step": 390 }, { "epoch": 1.2066365007541477, "grad_norm": 122.5, "learning_rate": 2.0341402065248575e-07, "logits/chosen": 365.3482360839844, "logits/rejected": 355.1469421386719, "logps/chosen": -2.242918014526367, "logps/rejected": -2.4804000854492188, "loss": 96.4776, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.04187587648630142, "rewards/margins": 0.00307876686565578, "rewards/rejected": 0.03879711031913757, "step": 400 }, { "epoch": 1.2066365007541477, "eval_logits/chosen": 413.0242004394531, "eval_logits/rejected": 385.0536804199219, "eval_logps/chosen": -2.3157808780670166, "eval_logps/rejected": -2.376335859298706, "eval_loss": 97.31377410888672, "eval_rewards/accuracies": 0.590753436088562, "eval_rewards/chosen": 0.04307207837700844, "eval_rewards/margins": 0.008345033042132854, "eval_rewards/rejected": 0.03472704440355301, "eval_runtime": 96.7791, "eval_samples_per_second": 48.11, "eval_steps_per_second": 1.509, "step": 400 }, { "epoch": 1.2368024132730016, "grad_norm": 121.0, "learning_rate": 1.9051620123934536e-07, "logits/chosen": 372.9312438964844, "logits/rejected": 332.6072692871094, "logps/chosen": -2.3628134727478027, "logps/rejected": -2.4130213260650635, "loss": 97.2639, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.04606416076421738, "rewards/margins": 0.011483157984912395, "rewards/rejected": 0.034581005573272705, "step": 410 }, { "epoch": 1.2669683257918551, "grad_norm": 141.0, "learning_rate": 1.7778417384218248e-07, "logits/chosen": 365.7309875488281, "logits/rejected": 330.0118408203125, "logps/chosen": -2.3192696571350098, "logps/rejected": -2.434166431427002, "loss": 97.2612, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.04589037224650383, "rewards/margins": 0.010156502947211266, "rewards/rejected": 0.03573386371135712, "step": 420 }, { "epoch": 1.297134238310709, "grad_norm": 175.0, "learning_rate": 1.652534249049305e-07, "logits/chosen": 364.436279296875, "logits/rejected": 337.96990966796875, "logps/chosen": -2.2914252281188965, "logps/rejected": -2.368446111679077, "loss": 96.7924, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.05218750238418579, "rewards/margins": 0.009751560166478157, "rewards/rejected": 0.042435940355062485, "step": 430 }, { "epoch": 1.3273001508295625, "grad_norm": 119.0, "learning_rate": 1.5295887987235433e-07, "logits/chosen": 360.49114990234375, "logits/rejected": 341.19036865234375, "logps/chosen": -2.2844796180725098, "logps/rejected": -2.5002331733703613, "loss": 97.8876, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.04997866973280907, "rewards/margins": 0.00823974795639515, "rewards/rejected": 0.04173891991376877, "step": 440 }, { "epoch": 1.3574660633484164, "grad_norm": 169.0, "learning_rate": 1.4093480584657152e-07, "logits/chosen": 338.6693115234375, "logits/rejected": 338.9823303222656, "logps/chosen": -2.340331792831421, "logps/rejected": -2.3622262477874756, "loss": 97.1683, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.045739155262708664, "rewards/margins": 0.00735636567696929, "rewards/rejected": 0.038382794708013535, "step": 450 }, { "epoch": 1.38763197586727, "grad_norm": 149.0, "learning_rate": 1.292147160784931e-07, "logits/chosen": 356.4623107910156, "logits/rejected": 323.6207580566406, "logps/chosen": -2.2748360633850098, "logps/rejected": -2.365034580230713, "loss": 98.4686, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.04644192382693291, "rewards/margins": 0.010739983059465885, "rewards/rejected": 0.0357019416987896, "step": 460 }, { "epoch": 1.4177978883861238, "grad_norm": 125.5, "learning_rate": 1.1783127656038188e-07, "logits/chosen": 349.2054443359375, "logits/rejected": 347.08544921875, "logps/chosen": -2.277270793914795, "logps/rejected": -2.3475308418273926, "loss": 97.2242, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.04547736421227455, "rewards/margins": 0.006285218056291342, "rewards/rejected": 0.03919214755296707, "step": 470 }, { "epoch": 1.4479638009049773, "grad_norm": 169.0, "learning_rate": 1.068162149798737e-07, "logits/chosen": 349.33013916015625, "logits/rejected": 320.25604248046875, "logps/chosen": -2.3431038856506348, "logps/rejected": -2.404095411300659, "loss": 97.0285, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.04836113005876541, "rewards/margins": 0.011659981682896614, "rewards/rejected": 0.036701153963804245, "step": 480 }, { "epoch": 1.4781297134238311, "grad_norm": 131.0, "learning_rate": 9.620023228922112e-08, "logits/chosen": 344.032958984375, "logits/rejected": 335.2795715332031, "logps/chosen": -2.3322200775146484, "logps/rejected": -2.4139535427093506, "loss": 97.409, "rewards/accuracies": 0.5625, "rewards/chosen": 0.04861091822385788, "rewards/margins": 0.00725546944886446, "rewards/rejected": 0.041355449706315994, "step": 490 }, { "epoch": 1.5082956259426847, "grad_norm": 135.0, "learning_rate": 8.601291713623316e-08, "logits/chosen": 352.27392578125, "logits/rejected": 346.0445861816406, "logps/chosen": -2.2179293632507324, "logps/rejected": -2.3674466609954834, "loss": 97.3613, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.04390779882669449, "rewards/margins": 0.003659948008134961, "rewards/rejected": 0.0402478463947773, "step": 500 }, { "epoch": 1.5082956259426847, "eval_logits/chosen": 412.6116638183594, "eval_logits/rejected": 384.59588623046875, "eval_logps/chosen": -2.3179564476013184, "eval_logps/rejected": -2.378140687942505, "eval_loss": 97.25182342529297, "eval_rewards/accuracies": 0.590753436088562, "eval_rewards/chosen": 0.04296330735087395, "eval_rewards/margins": 0.008326511830091476, "eval_rewards/rejected": 0.03463679552078247, "eval_runtime": 96.7441, "eval_samples_per_second": 48.127, "eval_steps_per_second": 1.509, "step": 500 }, { "epoch": 1.5384615384615383, "grad_norm": 118.5, "learning_rate": 7.628266339540659e-08, "logits/chosen": 348.74603271484375, "logits/rejected": 347.08380126953125, "logps/chosen": -2.342663288116455, "logps/rejected": -2.448847532272339, "loss": 97.8001, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.04741663485765457, "rewards/margins": 0.0025564138777554035, "rewards/rejected": 0.04486021772027016, "step": 510 }, { "epoch": 1.5686274509803921, "grad_norm": 215.0, "learning_rate": 6.70365910291057e-08, "logits/chosen": 357.7579040527344, "logits/rejected": 342.58221435546875, "logps/chosen": -2.3019728660583496, "logps/rejected": -2.442549228668213, "loss": 96.9522, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.04647397994995117, "rewards/margins": 0.005314859561622143, "rewards/rejected": 0.041159119457006454, "step": 520 }, { "epoch": 1.598793363499246, "grad_norm": 121.0, "learning_rate": 5.830047049936254e-08, "logits/chosen": 358.22900390625, "logits/rejected": 328.1488342285156, "logps/chosen": -2.2621231079101562, "logps/rejected": -2.453052282333374, "loss": 96.6704, "rewards/accuracies": 0.65625, "rewards/chosen": 0.045031510293483734, "rewards/margins": 0.008471885696053505, "rewards/rejected": 0.03655962646007538, "step": 530 }, { "epoch": 1.6289592760180995, "grad_norm": 168.0, "learning_rate": 5.009865094097732e-08, "logits/chosen": 343.6116638183594, "logits/rejected": 343.15179443359375, "logps/chosen": -2.414348602294922, "logps/rejected": -2.4327187538146973, "loss": 97.8265, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.044759172946214676, "rewards/margins": 0.0038251220248639584, "rewards/rejected": 0.04093404486775398, "step": 540 }, { "epoch": 1.6591251885369531, "grad_norm": 378.0, "learning_rate": 4.245399229611238e-08, "logits/chosen": 353.45245361328125, "logits/rejected": 337.67340087890625, "logps/chosen": -2.2361724376678467, "logps/rejected": -2.310511589050293, "loss": 97.9185, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.04042404517531395, "rewards/margins": 0.003903269302099943, "rewards/rejected": 0.03652077168226242, "step": 550 }, { "epoch": 1.689291101055807, "grad_norm": 147.0, "learning_rate": 3.538780159953347e-08, "logits/chosen": 348.5847473144531, "logits/rejected": 323.38995361328125, "logps/chosen": -2.261584997177124, "logps/rejected": -2.404850482940674, "loss": 97.2381, "rewards/accuracies": 0.59375, "rewards/chosen": 0.04695659875869751, "rewards/margins": 0.00860162265598774, "rewards/rejected": 0.03835497424006462, "step": 560 }, { "epoch": 1.7194570135746607, "grad_norm": 213.0, "learning_rate": 2.8919773592082337e-08, "logits/chosen": 356.572021484375, "logits/rejected": 327.9063720703125, "logps/chosen": -2.3302974700927734, "logps/rejected": -2.465226173400879, "loss": 97.1971, "rewards/accuracies": 0.625, "rewards/chosen": 0.046825435012578964, "rewards/margins": 0.011095492169260979, "rewards/rejected": 0.035729944705963135, "step": 570 }, { "epoch": 1.7496229260935143, "grad_norm": 155.0, "learning_rate": 2.3067935827901343e-08, "logits/chosen": 363.5878601074219, "logits/rejected": 354.7928161621094, "logps/chosen": -2.347442150115967, "logps/rejected": -2.424699068069458, "loss": 97.9048, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.042837705463171005, "rewards/margins": 0.0025561857037246227, "rewards/rejected": 0.04028152674436569, "step": 580 }, { "epoch": 1.779788838612368, "grad_norm": 152.0, "learning_rate": 1.7848598428407024e-08, "logits/chosen": 358.1939697265625, "logits/rejected": 329.384765625, "logps/chosen": -2.302311897277832, "logps/rejected": -2.4380781650543213, "loss": 97.3207, "rewards/accuracies": 0.59375, "rewards/chosen": 0.04442282021045685, "rewards/margins": 0.011918460950255394, "rewards/rejected": 0.032504357397556305, "step": 590 }, { "epoch": 1.8099547511312217, "grad_norm": 215.0, "learning_rate": 1.3276308623055982e-08, "logits/chosen": 346.2601623535156, "logits/rejected": 337.3775329589844, "logps/chosen": -2.2827913761138916, "logps/rejected": -2.39813494682312, "loss": 97.5077, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.040391139686107635, "rewards/margins": 0.005558500997722149, "rewards/rejected": 0.03483264148235321, "step": 600 }, { "epoch": 1.8099547511312217, "eval_logits/chosen": 412.5386962890625, "eval_logits/rejected": 384.52740478515625, "eval_logps/chosen": -2.330043315887451, "eval_logps/rejected": -2.3888137340545654, "eval_loss": 97.25433349609375, "eval_rewards/accuracies": 0.5976027250289917, "eval_rewards/chosen": 0.042358946055173874, "eval_rewards/margins": 0.0082557899877429, "eval_rewards/rejected": 0.0341031588613987, "eval_runtime": 96.6091, "eval_samples_per_second": 48.194, "eval_steps_per_second": 1.511, "step": 600 }, { "epoch": 1.8401206636500755, "grad_norm": 199.0, "learning_rate": 9.363810203607686e-09, "logits/chosen": 346.93450927734375, "logits/rejected": 334.490966796875, "logps/chosen": -2.3141541481018066, "logps/rejected": -2.3323051929473877, "loss": 97.5854, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0445205494761467, "rewards/margins": 0.007130137179046869, "rewards/rejected": 0.03739041090011597, "step": 610 }, { "epoch": 1.8702865761689291, "grad_norm": 212.0, "learning_rate": 6.1220080048908495e-09, "logits/chosen": 351.2411193847656, "logits/rejected": 340.6080017089844, "logps/chosen": -2.309572696685791, "logps/rejected": -2.4471845626831055, "loss": 97.3323, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.04287445545196533, "rewards/margins": 0.006148182787001133, "rewards/rejected": 0.036726273596286774, "step": 620 }, { "epoch": 1.9004524886877827, "grad_norm": 173.0, "learning_rate": 3.5599375110729747e-09, "logits/chosen": 373.3907165527344, "logits/rejected": 343.541259765625, "logps/chosen": -2.3878846168518066, "logps/rejected": -2.472883939743042, "loss": 97.3339, "rewards/accuracies": 0.59375, "rewards/chosen": 0.05199785903096199, "rewards/margins": 0.012289008125662804, "rewards/rejected": 0.039708852767944336, "step": 630 }, { "epoch": 1.9306184012066365, "grad_norm": 144.0, "learning_rate": 1.6847396721454688e-09, "logits/chosen": 359.8321228027344, "logits/rejected": 329.95562744140625, "logps/chosen": -2.36277437210083, "logps/rejected": -2.395962953567505, "loss": 96.6934, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.04414898157119751, "rewards/margins": 0.011453721672296524, "rewards/rejected": 0.03269525617361069, "step": 640 }, { "epoch": 1.9607843137254903, "grad_norm": 211.0, "learning_rate": 5.016410008151228e-10, "logits/chosen": 355.52593994140625, "logits/rejected": 320.21697998046875, "logps/chosen": -2.4310154914855957, "logps/rejected": -2.3878276348114014, "loss": 97.3498, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.043542712926864624, "rewards/margins": 0.00873391143977642, "rewards/rejected": 0.034808795899152756, "step": 650 }, { "epoch": 1.990950226244344, "grad_norm": 177.0, "learning_rate": 1.3939005275909189e-11, "logits/chosen": 360.26409912109375, "logits/rejected": 332.65447998046875, "logps/chosen": -2.311514377593994, "logps/rejected": -2.424405574798584, "loss": 97.3456, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.04267590120434761, "rewards/margins": 0.012712600640952587, "rewards/rejected": 0.02996329963207245, "step": 660 }, { "epoch": 1.9969834087481146, "step": 662, "total_flos": 0.0, "train_loss": 98.05829228519313, "train_runtime": 4353.733, "train_samples_per_second": 19.487, "train_steps_per_second": 0.152 } ], "logging_steps": 10, "max_steps": 662, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }