{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.99562408835174, "eval_steps": 200, "global_step": 1797, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 26.02830696105957, "learning_rate": 9.999523086940423e-06, "logits/chosen": -1.1374095678329468, "logits/rejected": -1.1327173709869385, "logps/chosen": -142.34921264648438, "logps/rejected": -155.80406188964844, "loss": 1.5859, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 13.082440376281738, "rewards/margins": 0.9329651594161987, "rewards/rejected": 12.14947509765625, "step": 10 }, { "epoch": 0.03, "grad_norm": 21.4633846282959, "learning_rate": 9.997817603030276e-06, "logits/chosen": -1.1412668228149414, "logits/rejected": -1.1413795948028564, "logps/chosen": -150.35586547851562, "logps/rejected": -164.76641845703125, "loss": 1.786, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 13.009861946105957, "rewards/margins": 0.8635275959968567, "rewards/rejected": 12.146333694458008, "step": 20 }, { "epoch": 0.05, "grad_norm": 23.037744522094727, "learning_rate": 9.994471383754724e-06, "logits/chosen": -1.1316417455673218, "logits/rejected": -1.130651831626892, "logps/chosen": -151.8024444580078, "logps/rejected": -168.8718719482422, "loss": 1.4256, "rewards/accuracies": 0.75, "rewards/chosen": 13.176553726196289, "rewards/margins": 1.613856554031372, "rewards/rejected": 11.562695503234863, "step": 30 }, { "epoch": 0.07, "grad_norm": 14.40039348602295, "learning_rate": 9.990154489175436e-06, "logits/chosen": -1.1126848459243774, "logits/rejected": -1.1085669994354248, "logps/chosen": -149.564697265625, "logps/rejected": -167.00857543945312, "loss": 1.5826, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 12.614360809326172, "rewards/margins": 1.426288366317749, "rewards/rejected": 11.18807315826416, "step": 40 }, { "epoch": 0.08, "grad_norm": 30.87997817993164, "learning_rate": 9.983908955774398e-06, "logits/chosen": -1.121843934059143, "logits/rejected": -1.124288558959961, "logps/chosen": -148.73220825195312, "logps/rejected": -175.08212280273438, "loss": 1.4624, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 12.753301620483398, "rewards/margins": 1.8802299499511719, "rewards/rejected": 10.873071670532227, "step": 50 }, { "epoch": 0.1, "grad_norm": 23.71029281616211, "learning_rate": 9.976140032846158e-06, "logits/chosen": -1.154342532157898, "logits/rejected": -1.1551573276519775, "logps/chosen": -148.70213317871094, "logps/rejected": -169.8200225830078, "loss": 1.5662, "rewards/accuracies": 0.75, "rewards/chosen": 12.446992874145508, "rewards/margins": 1.6254791021347046, "rewards/rejected": 10.821515083312988, "step": 60 }, { "epoch": 0.12, "grad_norm": 27.385414123535156, "learning_rate": 9.966850095052043e-06, "logits/chosen": -1.1726210117340088, "logits/rejected": -1.1690549850463867, "logps/chosen": -150.37200927734375, "logps/rejected": -183.83358764648438, "loss": 1.3566, "rewards/accuracies": 0.8125, "rewards/chosen": 12.22251033782959, "rewards/margins": 2.0067410469055176, "rewards/rejected": 10.21576976776123, "step": 70 }, { "epoch": 0.13, "grad_norm": 23.488914489746094, "learning_rate": 9.956041981969192e-06, "logits/chosen": -1.2129347324371338, "logits/rejected": -1.2055721282958984, "logps/chosen": -139.1160125732422, "logps/rejected": -180.86448669433594, "loss": 1.3657, "rewards/accuracies": 0.8125, "rewards/chosen": 12.533506393432617, "rewards/margins": 2.3238863945007324, "rewards/rejected": 10.209619522094727, "step": 80 }, { "epoch": 0.15, "grad_norm": 27.516918182373047, "learning_rate": 9.943718997222616e-06, "logits/chosen": -1.2162883281707764, "logits/rejected": -1.2150709629058838, "logps/chosen": -153.258544921875, "logps/rejected": -169.685302734375, "loss": 1.3773, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 12.494550704956055, "rewards/margins": 2.459848403930664, "rewards/rejected": 10.034701347351074, "step": 90 }, { "epoch": 0.17, "grad_norm": 30.299776077270508, "learning_rate": 9.929884907475405e-06, "logits/chosen": -1.2348403930664062, "logits/rejected": -1.2337309122085571, "logps/chosen": -141.0696563720703, "logps/rejected": -172.27981567382812, "loss": 1.3082, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 12.868219375610352, "rewards/margins": 2.447009325027466, "rewards/rejected": 10.421208381652832, "step": 100 }, { "epoch": 0.18, "grad_norm": 10.773354530334473, "learning_rate": 9.914543941277401e-06, "logits/chosen": -1.2390023469924927, "logits/rejected": -1.2259633541107178, "logps/chosen": -151.1343536376953, "logps/rejected": -166.46087646484375, "loss": 1.2816, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 12.653604507446289, "rewards/margins": 2.4828662872314453, "rewards/rejected": 10.170738220214844, "step": 110 }, { "epoch": 0.2, "grad_norm": 18.021488189697266, "learning_rate": 9.897700787772703e-06, "logits/chosen": -1.2141873836517334, "logits/rejected": -1.212436556816101, "logps/chosen": -145.79354858398438, "logps/rejected": -174.75421142578125, "loss": 1.4305, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 12.540283203125, "rewards/margins": 2.2728052139282227, "rewards/rejected": 10.26747989654541, "step": 120 }, { "epoch": 0.22, "grad_norm": 14.722694396972656, "learning_rate": 9.879360595266359e-06, "logits/chosen": -1.2301462888717651, "logits/rejected": -1.2223972082138062, "logps/chosen": -141.97836303710938, "logps/rejected": -190.95040893554688, "loss": 1.1746, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 12.985305786132812, "rewards/margins": 3.339768886566162, "rewards/rejected": 9.645538330078125, "step": 130 }, { "epoch": 0.23, "grad_norm": 19.521358489990234, "learning_rate": 9.861579077506591e-06, "logits/chosen": -1.2298915386199951, "logits/rejected": -1.221884846687317, "logps/chosen": -148.5527801513672, "logps/rejected": -180.9561004638672, "loss": 1.3081, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 12.90058422088623, "rewards/margins": 2.527609348297119, "rewards/rejected": 10.372976303100586, "step": 140 }, { "epoch": 0.25, "grad_norm": 21.75788688659668, "learning_rate": 9.84041033194796e-06, "logits/chosen": -1.1978212594985962, "logits/rejected": -1.2021763324737549, "logps/chosen": -149.8433837890625, "logps/rejected": -183.22714233398438, "loss": 1.3711, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 12.765899658203125, "rewards/margins": 2.4878742694854736, "rewards/rejected": 10.27802562713623, "step": 150 }, { "epoch": 0.27, "grad_norm": 31.972171783447266, "learning_rate": 9.817762058879405e-06, "logits/chosen": -1.1968965530395508, "logits/rejected": -1.2041929960250854, "logps/chosen": -141.44479370117188, "logps/rejected": -172.67782592773438, "loss": 1.4901, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 12.295498847961426, "rewards/margins": 1.9750818014144897, "rewards/rejected": 10.320415496826172, "step": 160 }, { "epoch": 0.28, "grad_norm": 6.441612243652344, "learning_rate": 9.793641181008042e-06, "logits/chosen": -1.1921595335006714, "logits/rejected": -1.1772682666778564, "logps/chosen": -158.01004028320312, "logps/rejected": -174.64419555664062, "loss": 1.3182, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 11.994952201843262, "rewards/margins": 3.0716311931610107, "rewards/rejected": 8.923321723937988, "step": 170 }, { "epoch": 0.3, "grad_norm": 23.175006866455078, "learning_rate": 9.76805507115971e-06, "logits/chosen": -1.1862366199493408, "logits/rejected": -1.1827598810195923, "logps/chosen": -150.92054748535156, "logps/rejected": -191.04061889648438, "loss": 1.3317, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 12.52110767364502, "rewards/margins": 2.7398293018341064, "rewards/rejected": 9.781278610229492, "step": 180 }, { "epoch": 0.32, "grad_norm": 20.24986457824707, "learning_rate": 9.741011550025385e-06, "logits/chosen": -1.1844813823699951, "logits/rejected": -1.1813442707061768, "logps/chosen": -146.86256408691406, "logps/rejected": -182.008544921875, "loss": 1.3589, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 12.942377090454102, "rewards/margins": 2.665095329284668, "rewards/rejected": 10.27728271484375, "step": 190 }, { "epoch": 0.33, "grad_norm": 31.18330955505371, "learning_rate": 9.71251888377069e-06, "logits/chosen": -1.198872685432434, "logits/rejected": -1.195305347442627, "logps/chosen": -140.82937622070312, "logps/rejected": -173.56301879882812, "loss": 1.3287, "rewards/accuracies": 0.75, "rewards/chosen": 12.571944236755371, "rewards/margins": 2.418260335922241, "rewards/rejected": 10.153684616088867, "step": 200 }, { "epoch": 0.33, "eval_logits/chosen": -1.186182975769043, "eval_logits/rejected": -1.1862589120864868, "eval_logps/chosen": -142.31016540527344, "eval_logps/rejected": -167.46771240234375, "eval_loss": 1.8699389696121216, "eval_rewards/accuracies": 0.631205677986145, "eval_rewards/chosen": 9.93428897857666, "eval_rewards/margins": 0.8060780763626099, "eval_rewards/rejected": 9.12821102142334, "eval_runtime": 280.3161, "eval_samples_per_second": 2.515, "eval_steps_per_second": 2.515, "step": 200 }, { "epoch": 0.35, "grad_norm": 23.425613403320312, "learning_rate": 9.682585781509243e-06, "logits/chosen": -1.2091820240020752, "logits/rejected": -1.2033774852752686, "logps/chosen": -147.17410278320312, "logps/rejected": -185.01951599121094, "loss": 1.1424, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 13.107488632202148, "rewards/margins": 3.5036494731903076, "rewards/rejected": 9.603840827941895, "step": 210 }, { "epoch": 0.37, "grad_norm": 24.199848175048828, "learning_rate": 9.651221392640626e-06, "logits/chosen": -1.2453477382659912, "logits/rejected": -1.2410730123519897, "logps/chosen": -142.30599975585938, "logps/rejected": -187.09365844726562, "loss": 1.3393, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 12.409756660461426, "rewards/margins": 2.767655849456787, "rewards/rejected": 9.642101287841797, "step": 220 }, { "epoch": 0.38, "grad_norm": 5.732828140258789, "learning_rate": 9.618435304053756e-06, "logits/chosen": -1.2778840065002441, "logits/rejected": -1.2746034860610962, "logps/chosen": -135.27261352539062, "logps/rejected": -201.36862182617188, "loss": 1.0207, "rewards/accuracies": 0.9375, "rewards/chosen": 13.215780258178711, "rewards/margins": 4.094855308532715, "rewards/rejected": 9.120925903320312, "step": 230 }, { "epoch": 0.4, "grad_norm": 33.938697814941406, "learning_rate": 9.584237537196539e-06, "logits/chosen": -1.264520287513733, "logits/rejected": -1.2698160409927368, "logps/chosen": -139.2551727294922, "logps/rejected": -185.9132537841797, "loss": 1.3149, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 12.885518074035645, "rewards/margins": 3.4518959522247314, "rewards/rejected": 9.433621406555176, "step": 240 }, { "epoch": 0.42, "grad_norm": 19.174827575683594, "learning_rate": 9.548638545012714e-06, "logits/chosen": -1.2648355960845947, "logits/rejected": -1.2563896179199219, "logps/chosen": -143.2262725830078, "logps/rejected": -177.64321899414062, "loss": 1.2689, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 12.918134689331055, "rewards/margins": 3.194645881652832, "rewards/rejected": 9.723487854003906, "step": 250 }, { "epoch": 0.43, "grad_norm": 12.325043678283691, "learning_rate": 9.511649208746768e-06, "logits/chosen": -1.2492735385894775, "logits/rejected": -1.2509915828704834, "logps/chosen": -140.66622924804688, "logps/rejected": -186.30052185058594, "loss": 1.2776, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 13.131103515625, "rewards/margins": 2.9642741680145264, "rewards/rejected": 10.166828155517578, "step": 260 }, { "epoch": 0.45, "grad_norm": 29.295560836791992, "learning_rate": 9.473280834617975e-06, "logits/chosen": -1.2667304277420044, "logits/rejected": -1.2650549411773682, "logps/chosen": -143.45948791503906, "logps/rejected": -180.34130859375, "loss": 1.3095, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 12.880398750305176, "rewards/margins": 2.869231939315796, "rewards/rejected": 10.011167526245117, "step": 270 }, { "epoch": 0.47, "grad_norm": 24.3317928314209, "learning_rate": 9.43354515036451e-06, "logits/chosen": -1.2491796016693115, "logits/rejected": -1.2445125579833984, "logps/chosen": -140.8796844482422, "logps/rejected": -178.88314819335938, "loss": 1.3453, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 12.978363037109375, "rewards/margins": 3.1743595600128174, "rewards/rejected": 9.80400276184082, "step": 280 }, { "epoch": 0.48, "grad_norm": 31.15982437133789, "learning_rate": 9.392454301658734e-06, "logits/chosen": -1.2521940469741821, "logits/rejected": -1.2475926876068115, "logps/chosen": -150.509765625, "logps/rejected": -201.6109161376953, "loss": 1.3024, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 13.243515014648438, "rewards/margins": 3.826166868209839, "rewards/rejected": 9.417348861694336, "step": 290 }, { "epoch": 0.5, "grad_norm": 21.953458786010742, "learning_rate": 9.350020848394722e-06, "logits/chosen": -1.233689308166504, "logits/rejected": -1.2306249141693115, "logps/chosen": -142.88519287109375, "logps/rejected": -176.23739624023438, "loss": 1.3095, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 12.999292373657227, "rewards/margins": 3.4122116565704346, "rewards/rejected": 9.587080955505371, "step": 300 }, { "epoch": 0.52, "grad_norm": 5.4461750984191895, "learning_rate": 9.306257760849198e-06, "logits/chosen": -1.243290662765503, "logits/rejected": -1.238360047340393, "logps/chosen": -135.43121337890625, "logps/rejected": -184.3162078857422, "loss": 1.1376, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 13.632291793823242, "rewards/margins": 3.4248099327087402, "rewards/rejected": 10.207484245300293, "step": 310 }, { "epoch": 0.53, "grad_norm": 5.681567668914795, "learning_rate": 9.261178415717006e-06, "logits/chosen": -1.2789077758789062, "logits/rejected": -1.2725191116333008, "logps/chosen": -145.27374267578125, "logps/rejected": -171.4091339111328, "loss": 1.1728, "rewards/accuracies": 0.875, "rewards/chosen": 13.61553955078125, "rewards/margins": 3.5283050537109375, "rewards/rejected": 10.087234497070312, "step": 320 }, { "epoch": 0.55, "grad_norm": 10.140364646911621, "learning_rate": 9.214796592022378e-06, "logits/chosen": -1.322906732559204, "logits/rejected": -1.308996558189392, "logps/chosen": -141.2882537841797, "logps/rejected": -186.32119750976562, "loss": 1.1413, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 13.753789901733398, "rewards/margins": 3.8648266792297363, "rewards/rejected": 9.88896369934082, "step": 330 }, { "epoch": 0.57, "grad_norm": 29.466781616210938, "learning_rate": 9.167126466907215e-06, "logits/chosen": -1.333762288093567, "logits/rejected": -1.33591628074646, "logps/chosen": -135.62503051757812, "logps/rejected": -185.78758239746094, "loss": 1.2402, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 13.778650283813477, "rewards/margins": 3.8923568725585938, "rewards/rejected": 9.886293411254883, "step": 340 }, { "epoch": 0.58, "grad_norm": 24.269454956054688, "learning_rate": 9.118182611297665e-06, "logits/chosen": -1.309342622756958, "logits/rejected": -1.3070811033248901, "logps/chosen": -136.98138427734375, "logps/rejected": -191.55300903320312, "loss": 1.327, "rewards/accuracies": 0.8125, "rewards/chosen": 13.252286911010742, "rewards/margins": 3.4166321754455566, "rewards/rejected": 9.835655212402344, "step": 350 }, { "epoch": 0.6, "grad_norm": 45.82386779785156, "learning_rate": 9.067979985450377e-06, "logits/chosen": -1.3228009939193726, "logits/rejected": -1.3171498775482178, "logps/chosen": -146.18157958984375, "logps/rejected": -188.90127563476562, "loss": 1.1041, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 13.495672225952148, "rewards/margins": 3.9597671031951904, "rewards/rejected": 9.535904884338379, "step": 360 }, { "epoch": 0.62, "grad_norm": 30.16645622253418, "learning_rate": 9.016533934379697e-06, "logits/chosen": -1.33194100856781, "logits/rejected": -1.3219845294952393, "logps/chosen": -138.00164794921875, "logps/rejected": -186.89862060546875, "loss": 1.0528, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 13.405718803405762, "rewards/margins": 4.230135440826416, "rewards/rejected": 9.17558479309082, "step": 370 }, { "epoch": 0.63, "grad_norm": 23.7780704498291, "learning_rate": 8.96386018316731e-06, "logits/chosen": -1.332335114479065, "logits/rejected": -1.3253917694091797, "logps/chosen": -135.97877502441406, "logps/rejected": -190.83480834960938, "loss": 1.0856, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 14.198583602905273, "rewards/margins": 4.640327453613281, "rewards/rejected": 9.558258056640625, "step": 380 }, { "epoch": 0.65, "grad_norm": 37.86054229736328, "learning_rate": 8.909974832155667e-06, "logits/chosen": -1.3102099895477295, "logits/rejected": -1.3026459217071533, "logps/chosen": -145.63626098632812, "logps/rejected": -176.83547973632812, "loss": 1.2607, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 13.206341743469238, "rewards/margins": 3.159071683883667, "rewards/rejected": 10.047269821166992, "step": 390 }, { "epoch": 0.67, "grad_norm": 21.949071884155273, "learning_rate": 8.854894352026746e-06, "logits/chosen": -1.2982990741729736, "logits/rejected": -1.294737696647644, "logps/chosen": -135.0550537109375, "logps/rejected": -179.46371459960938, "loss": 1.1821, "rewards/accuracies": 0.875, "rewards/chosen": 13.508148193359375, "rewards/margins": 3.7253730297088623, "rewards/rejected": 9.78277587890625, "step": 400 }, { "epoch": 0.67, "eval_logits/chosen": -1.2718416452407837, "eval_logits/rejected": -1.2731894254684448, "eval_logps/chosen": -142.27450561523438, "eval_logps/rejected": -166.72564697265625, "eval_loss": 1.9728976488113403, "eval_rewards/accuracies": 0.611347496509552, "eval_rewards/chosen": 9.937856674194336, "eval_rewards/margins": 0.7354397177696228, "eval_rewards/rejected": 9.20241641998291, "eval_runtime": 280.6001, "eval_samples_per_second": 2.512, "eval_steps_per_second": 2.512, "step": 400 }, { "epoch": 0.68, "grad_norm": 16.105052947998047, "learning_rate": 8.798635578767584e-06, "logits/chosen": -1.2835286855697632, "logits/rejected": -1.2852187156677246, "logps/chosen": -128.66891479492188, "logps/rejected": -180.98855590820312, "loss": 1.2384, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 13.414710998535156, "rewards/margins": 3.5980567932128906, "rewards/rejected": 9.81665325164795, "step": 410 }, { "epoch": 0.7, "grad_norm": 18.655576705932617, "learning_rate": 8.74121570852417e-06, "logits/chosen": -1.304517388343811, "logits/rejected": -1.295888900756836, "logps/chosen": -129.7939910888672, "logps/rejected": -182.28158569335938, "loss": 1.1389, "rewards/accuracies": 0.875, "rewards/chosen": 13.784006118774414, "rewards/margins": 4.219157695770264, "rewards/rejected": 9.564847946166992, "step": 420 }, { "epoch": 0.72, "grad_norm": 21.71282386779785, "learning_rate": 8.682652292345239e-06, "logits/chosen": -1.2803471088409424, "logits/rejected": -1.281240701675415, "logps/chosen": -126.82594299316406, "logps/rejected": -187.58602905273438, "loss": 1.0862, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 13.763422966003418, "rewards/margins": 4.294766426086426, "rewards/rejected": 9.468656539916992, "step": 430 }, { "epoch": 0.73, "grad_norm": 23.261680603027344, "learning_rate": 8.622963230817599e-06, "logits/chosen": -1.3054393529891968, "logits/rejected": -1.294926404953003, "logps/chosen": -137.8651580810547, "logps/rejected": -187.91754150390625, "loss": 1.0189, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 13.93773078918457, "rewards/margins": 4.54481840133667, "rewards/rejected": 9.392911911010742, "step": 440 }, { "epoch": 0.75, "grad_norm": 22.61610221862793, "learning_rate": 8.562166768594592e-06, "logits/chosen": -1.3260384798049927, "logits/rejected": -1.313039779663086, "logps/chosen": -136.37155151367188, "logps/rejected": -189.9808807373047, "loss": 1.0986, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 14.760931015014648, "rewards/margins": 5.300175666809082, "rewards/rejected": 9.460756301879883, "step": 450 }, { "epoch": 0.77, "grad_norm": 44.37038803100586, "learning_rate": 8.500281488819426e-06, "logits/chosen": -1.335376501083374, "logits/rejected": -1.32558012008667, "logps/chosen": -137.15939331054688, "logps/rejected": -178.85699462890625, "loss": 1.25, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 14.302162170410156, "rewards/margins": 4.735361099243164, "rewards/rejected": 9.566801071166992, "step": 460 }, { "epoch": 0.78, "grad_norm": 15.156902313232422, "learning_rate": 8.43732630744501e-06, "logits/chosen": -1.3467152118682861, "logits/rejected": -1.3411352634429932, "logps/chosen": -131.86680603027344, "logps/rejected": -184.54978942871094, "loss": 1.067, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 14.075284004211426, "rewards/margins": 4.149069309234619, "rewards/rejected": 9.926214218139648, "step": 470 }, { "epoch": 0.8, "grad_norm": 8.137179374694824, "learning_rate": 8.373320467452069e-06, "logits/chosen": -1.3710681200027466, "logits/rejected": -1.360769271850586, "logps/chosen": -129.4867706298828, "logps/rejected": -187.3372802734375, "loss": 1.0214, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 14.554537773132324, "rewards/margins": 5.148342132568359, "rewards/rejected": 9.406194686889648, "step": 480 }, { "epoch": 0.82, "grad_norm": 5.059852600097656, "learning_rate": 8.308283532967311e-06, "logits/chosen": -1.3810697793960571, "logits/rejected": -1.372183918952942, "logps/chosen": -128.07522583007812, "logps/rejected": -187.84347534179688, "loss": 1.0341, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 14.386204719543457, "rewards/margins": 4.947279930114746, "rewards/rejected": 9.438923835754395, "step": 490 }, { "epoch": 0.83, "grad_norm": 15.475478172302246, "learning_rate": 8.242235383283433e-06, "logits/chosen": -1.3918366432189941, "logits/rejected": -1.379677176475525, "logps/chosen": -133.6950225830078, "logps/rejected": -194.3822479248047, "loss": 0.9832, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 15.278470039367676, "rewards/margins": 5.975428104400635, "rewards/rejected": 9.303044319152832, "step": 500 }, { "epoch": 0.85, "grad_norm": 17.410987854003906, "learning_rate": 8.175196206782765e-06, "logits/chosen": -1.4215304851531982, "logits/rejected": -1.4245882034301758, "logps/chosen": -135.43157958984375, "logps/rejected": -195.17837524414062, "loss": 1.1563, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 13.64989948272705, "rewards/margins": 4.778893947601318, "rewards/rejected": 8.87100601196289, "step": 510 }, { "epoch": 0.87, "grad_norm": 10.288193702697754, "learning_rate": 8.107186494766475e-06, "logits/chosen": -1.3871448040008545, "logits/rejected": -1.3849140405654907, "logps/chosen": -138.03524780273438, "logps/rejected": -192.06105041503906, "loss": 1.0753, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 14.112817764282227, "rewards/margins": 5.05327844619751, "rewards/rejected": 9.059538841247559, "step": 520 }, { "epoch": 0.88, "grad_norm": 9.09673023223877, "learning_rate": 8.038227035191152e-06, "logits/chosen": -1.369144320487976, "logits/rejected": -1.3611090183258057, "logps/chosen": -129.99301147460938, "logps/rejected": -184.04013061523438, "loss": 0.9557, "rewards/accuracies": 0.9375, "rewards/chosen": 14.619100570678711, "rewards/margins": 5.349046230316162, "rewards/rejected": 9.270054817199707, "step": 530 }, { "epoch": 0.9, "grad_norm": 10.032170295715332, "learning_rate": 7.968338906314739e-06, "logits/chosen": -1.384235143661499, "logits/rejected": -1.3696062564849854, "logps/chosen": -135.14321899414062, "logps/rejected": -188.68896484375, "loss": 0.9715, "rewards/accuracies": 0.9375, "rewards/chosen": 15.478785514831543, "rewards/margins": 6.076346397399902, "rewards/rejected": 9.40243911743164, "step": 540 }, { "epoch": 0.92, "grad_norm": 19.764158248901367, "learning_rate": 7.897543470253708e-06, "logits/chosen": -1.4087002277374268, "logits/rejected": -1.4047653675079346, "logps/chosen": -123.32807922363281, "logps/rejected": -196.0301971435547, "loss": 0.987, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 14.471944808959961, "rewards/margins": 5.295290470123291, "rewards/rejected": 9.176654815673828, "step": 550 }, { "epoch": 0.93, "grad_norm": 40.5106315612793, "learning_rate": 7.825862366453487e-06, "logits/chosen": -1.4345886707305908, "logits/rejected": -1.4347158670425415, "logps/chosen": -123.09619140625, "logps/rejected": -191.9667205810547, "loss": 1.0165, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 14.416943550109863, "rewards/margins": 5.778631210327148, "rewards/rejected": 8.638311386108398, "step": 560 }, { "epoch": 0.95, "grad_norm": 11.272564888000488, "learning_rate": 7.753317505074114e-06, "logits/chosen": -1.3969998359680176, "logits/rejected": -1.3935317993164062, "logps/chosen": -136.36032104492188, "logps/rejected": -193.97418212890625, "loss": 1.088, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 14.054471969604492, "rewards/margins": 5.512935161590576, "rewards/rejected": 8.541539192199707, "step": 570 }, { "epoch": 0.97, "grad_norm": 32.77339172363281, "learning_rate": 7.679931060293137e-06, "logits/chosen": -1.3968368768692017, "logits/rejected": -1.3951635360717773, "logps/chosen": -127.2842025756836, "logps/rejected": -192.80587768554688, "loss": 1.0683, "rewards/accuracies": 0.875, "rewards/chosen": 14.630516052246094, "rewards/margins": 5.419614315032959, "rewards/rejected": 9.210902214050293, "step": 580 }, { "epoch": 0.98, "grad_norm": 9.357986450195312, "learning_rate": 7.605725463527825e-06, "logits/chosen": -1.3885825872421265, "logits/rejected": -1.3792840242385864, "logps/chosen": -129.8568115234375, "logps/rejected": -175.72964477539062, "loss": 1.0435, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 14.813947677612305, "rewards/margins": 5.173638343811035, "rewards/rejected": 9.640308380126953, "step": 590 }, { "epoch": 1.0, "grad_norm": 14.639739036560059, "learning_rate": 7.530723396578745e-06, "logits/chosen": -1.3973591327667236, "logits/rejected": -1.3902390003204346, "logps/chosen": -123.96485900878906, "logps/rejected": -186.21829223632812, "loss": 0.9116, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 15.26634693145752, "rewards/margins": 6.136044025421143, "rewards/rejected": 9.130304336547852, "step": 600 }, { "epoch": 1.0, "eval_logits/chosen": -1.351010799407959, "eval_logits/rejected": -1.3527151346206665, "eval_logps/chosen": -143.65618896484375, "eval_logps/rejected": -169.28347778320312, "eval_loss": 1.9454625844955444, "eval_rewards/accuracies": 0.6482269763946533, "eval_rewards/chosen": 9.799687385559082, "eval_rewards/margins": 0.8530532121658325, "eval_rewards/rejected": 8.946634292602539, "eval_runtime": 280.7129, "eval_samples_per_second": 2.511, "eval_steps_per_second": 2.511, "step": 600 }, { "epoch": 1.02, "grad_norm": 4.632735729217529, "learning_rate": 7.454947784696804e-06, "logits/chosen": -1.4173977375030518, "logits/rejected": -1.4061188697814941, "logps/chosen": -127.29417419433594, "logps/rejected": -196.90383911132812, "loss": 0.8744, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 14.873614311218262, "rewards/margins": 6.611442565917969, "rewards/rejected": 8.262171745300293, "step": 610 }, { "epoch": 1.03, "grad_norm": 9.438977241516113, "learning_rate": 7.3784217895758804e-06, "logits/chosen": -1.4528166055679321, "logits/rejected": -1.4417811632156372, "logps/chosen": -134.85516357421875, "logps/rejected": -205.706298828125, "loss": 0.8906, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 15.327418327331543, "rewards/margins": 6.7848310470581055, "rewards/rejected": 8.542585372924805, "step": 620 }, { "epoch": 1.05, "grad_norm": 8.289262771606445, "learning_rate": 7.3011688022731865e-06, "logits/chosen": -1.4324336051940918, "logits/rejected": -1.420256495475769, "logps/chosen": -129.654052734375, "logps/rejected": -184.8654327392578, "loss": 0.8893, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 14.612588882446289, "rewards/margins": 5.854708671569824, "rewards/rejected": 8.757880210876465, "step": 630 }, { "epoch": 1.07, "grad_norm": 8.663153648376465, "learning_rate": 7.2232124360595205e-06, "logits/chosen": -1.455397605895996, "logits/rejected": -1.4463832378387451, "logps/chosen": -127.48155212402344, "logps/rejected": -205.6735382080078, "loss": 0.7873, "rewards/accuracies": 1.0, "rewards/chosen": 15.274075508117676, "rewards/margins": 6.838622093200684, "rewards/rejected": 8.435453414916992, "step": 640 }, { "epoch": 1.08, "grad_norm": 6.434426307678223, "learning_rate": 7.144576519201595e-06, "logits/chosen": -1.4524507522583008, "logits/rejected": -1.437585711479187, "logps/chosen": -122.5464859008789, "logps/rejected": -193.6874542236328, "loss": 0.8148, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 15.566122055053711, "rewards/margins": 7.372010231018066, "rewards/rejected": 8.194112777709961, "step": 650 }, { "epoch": 1.1, "grad_norm": 4.037100791931152, "learning_rate": 7.0652850876786485e-06, "logits/chosen": -1.4568301439285278, "logits/rejected": -1.4512048959732056, "logps/chosen": -109.03157043457031, "logps/rejected": -203.418212890625, "loss": 0.7832, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 15.715448379516602, "rewards/margins": 7.003268241882324, "rewards/rejected": 8.712181091308594, "step": 660 }, { "epoch": 1.12, "grad_norm": 19.28900909423828, "learning_rate": 6.9853623778355805e-06, "logits/chosen": -1.4528229236602783, "logits/rejected": -1.441767692565918, "logps/chosen": -124.41324615478516, "logps/rejected": -183.64224243164062, "loss": 0.8512, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 15.778039932250977, "rewards/margins": 6.944033622741699, "rewards/rejected": 8.834006309509277, "step": 670 }, { "epoch": 1.13, "grad_norm": 5.148800373077393, "learning_rate": 6.904832818974818e-06, "logits/chosen": -1.461948037147522, "logits/rejected": -1.4578710794448853, "logps/chosen": -112.7221450805664, "logps/rejected": -195.2073211669922, "loss": 0.8077, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 15.100164413452148, "rewards/margins": 6.574339866638184, "rewards/rejected": 8.525824546813965, "step": 680 }, { "epoch": 1.15, "grad_norm": 8.881467819213867, "learning_rate": 6.823721025889227e-06, "logits/chosen": -1.4419883489608765, "logits/rejected": -1.4434632062911987, "logps/chosen": -120.42142486572266, "logps/rejected": -187.1098175048828, "loss": 0.8814, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 15.091572761535645, "rewards/margins": 6.394684791564941, "rewards/rejected": 8.696887016296387, "step": 690 }, { "epoch": 1.17, "grad_norm": 4.702162265777588, "learning_rate": 6.742051791338305e-06, "logits/chosen": -1.4430066347122192, "logits/rejected": -1.4336270093917847, "logps/chosen": -124.91837310791016, "logps/rejected": -190.82229614257812, "loss": 0.8541, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 16.073802947998047, "rewards/margins": 7.100152015686035, "rewards/rejected": 8.973649978637695, "step": 700 }, { "epoch": 1.18, "grad_norm": 16.544490814208984, "learning_rate": 6.6598500784700016e-06, "logits/chosen": -1.437310814857483, "logits/rejected": -1.439613938331604, "logps/chosen": -116.21219635009766, "logps/rejected": -190.1880340576172, "loss": 0.8385, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 14.853405952453613, "rewards/margins": 6.005455493927002, "rewards/rejected": 8.84795093536377, "step": 710 }, { "epoch": 1.2, "grad_norm": 4.642127513885498, "learning_rate": 6.577141013190428e-06, "logits/chosen": -1.474867582321167, "logits/rejected": -1.4651817083358765, "logps/chosen": -114.55826568603516, "logps/rejected": -191.57504272460938, "loss": 0.7852, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 15.868762016296387, "rewards/margins": 7.344477653503418, "rewards/rejected": 8.524284362792969, "step": 720 }, { "epoch": 1.22, "grad_norm": 5.04547119140625, "learning_rate": 6.493949876483841e-06, "logits/chosen": -1.4716556072235107, "logits/rejected": -1.4789526462554932, "logps/chosen": -114.3637466430664, "logps/rejected": -195.8820037841797, "loss": 0.8683, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 15.210566520690918, "rewards/margins": 7.015016078948975, "rewards/rejected": 8.195551872253418, "step": 730 }, { "epoch": 1.23, "grad_norm": 8.906220436096191, "learning_rate": 6.410302096685219e-06, "logits/chosen": -1.467878818511963, "logits/rejected": -1.4584242105484009, "logps/chosen": -113.1087875366211, "logps/rejected": -196.9852752685547, "loss": 0.7898, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 15.73388671875, "rewards/margins": 7.293121337890625, "rewards/rejected": 8.440766334533691, "step": 740 }, { "epoch": 1.25, "grad_norm": 4.600486755371094, "learning_rate": 6.326223241707787e-06, "logits/chosen": -1.4887049198150635, "logits/rejected": -1.4763177633285522, "logps/chosen": -114.50920104980469, "logps/rejected": -191.11123657226562, "loss": 0.7898, "rewards/accuracies": 1.0, "rewards/chosen": 16.316104888916016, "rewards/margins": 7.5335493087768555, "rewards/rejected": 8.782556533813477, "step": 750 }, { "epoch": 1.27, "grad_norm": 28.16501235961914, "learning_rate": 6.241739011227899e-06, "logits/chosen": -1.4948675632476807, "logits/rejected": -1.4884848594665527, "logps/chosen": -110.34342956542969, "logps/rejected": -183.22744750976562, "loss": 0.8029, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 15.722465515136719, "rewards/margins": 7.071497917175293, "rewards/rejected": 8.650967597961426, "step": 760 }, { "epoch": 1.28, "grad_norm": 8.12157917022705, "learning_rate": 6.156875228829627e-06, "logits/chosen": -1.4990284442901611, "logits/rejected": -1.502752661705017, "logps/chosen": -124.48286437988281, "logps/rejected": -205.2565155029297, "loss": 0.9354, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 15.559919357299805, "rewards/margins": 7.290531158447266, "rewards/rejected": 8.269388198852539, "step": 770 }, { "epoch": 1.3, "grad_norm": 5.052426338195801, "learning_rate": 6.071657834111483e-06, "logits/chosen": -1.4951364994049072, "logits/rejected": -1.4810194969177246, "logps/chosen": -115.41800689697266, "logps/rejected": -189.0443878173828, "loss": 0.8551, "rewards/accuracies": 0.9375, "rewards/chosen": 16.50579833984375, "rewards/margins": 7.6818413734436035, "rewards/rejected": 8.823959350585938, "step": 780 }, { "epoch": 1.32, "grad_norm": 20.577198028564453, "learning_rate": 5.986112874757688e-06, "logits/chosen": -1.4854376316070557, "logits/rejected": -1.4871946573257446, "logps/chosen": -112.67398834228516, "logps/rejected": -196.98049926757812, "loss": 0.8269, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 16.0164852142334, "rewards/margins": 7.493149757385254, "rewards/rejected": 8.523335456848145, "step": 790 }, { "epoch": 1.33, "grad_norm": 14.56539249420166, "learning_rate": 5.900266498576383e-06, "logits/chosen": -1.5002418756484985, "logits/rejected": -1.4961646795272827, "logps/chosen": -119.12144470214844, "logps/rejected": -195.09054565429688, "loss": 0.8412, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 15.817484855651855, "rewards/margins": 7.278079986572266, "rewards/rejected": 8.539405822753906, "step": 800 }, { "epoch": 1.33, "eval_logits/chosen": -1.417891502380371, "eval_logits/rejected": -1.4206050634384155, "eval_logps/chosen": -146.20431518554688, "eval_logps/rejected": -173.58306884765625, "eval_loss": 2.0040743350982666, "eval_rewards/accuracies": 0.6397163271903992, "eval_rewards/chosen": 9.544875144958496, "eval_rewards/margins": 1.0281997919082642, "eval_rewards/rejected": 8.516674995422363, "eval_runtime": 280.445, "eval_samples_per_second": 2.514, "eval_steps_per_second": 2.514, "step": 800 }, { "epoch": 1.35, "grad_norm": 8.800375938415527, "learning_rate": 5.81414494550726e-06, "logits/chosen": -1.4927313327789307, "logits/rejected": -1.486893653869629, "logps/chosen": -108.54096984863281, "logps/rejected": -194.3487548828125, "loss": 0.7899, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 16.199222564697266, "rewards/margins": 7.905210018157959, "rewards/rejected": 8.294013977050781, "step": 810 }, { "epoch": 1.37, "grad_norm": 12.154976844787598, "learning_rate": 5.727774539601015e-06, "logits/chosen": -1.4953235387802124, "logits/rejected": -1.49058198928833, "logps/chosen": -105.4122543334961, "logps/rejected": -205.03518676757812, "loss": 0.7704, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 15.790075302124023, "rewards/margins": 7.537489414215088, "rewards/rejected": 8.252584457397461, "step": 820 }, { "epoch": 1.38, "grad_norm": 18.195478439331055, "learning_rate": 5.641181680973094e-06, "logits/chosen": -1.5069096088409424, "logits/rejected": -1.5045548677444458, "logps/chosen": -107.76765441894531, "logps/rejected": -185.20993041992188, "loss": 0.7649, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 15.898587226867676, "rewards/margins": 7.165400505065918, "rewards/rejected": 8.733189582824707, "step": 830 }, { "epoch": 1.4, "grad_norm": 6.522106647491455, "learning_rate": 5.554392837734201e-06, "logits/chosen": -1.4592971801757812, "logits/rejected": -1.4653236865997314, "logps/chosen": -126.52349853515625, "logps/rejected": -200.0361328125, "loss": 0.9745, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 14.701069831848145, "rewards/margins": 6.266867160797119, "rewards/rejected": 8.434202194213867, "step": 840 }, { "epoch": 1.42, "grad_norm": 4.361470699310303, "learning_rate": 5.467434537900002e-06, "logits/chosen": -1.4889204502105713, "logits/rejected": -1.4831851720809937, "logps/chosen": -117.97469329833984, "logps/rejected": -199.29603576660156, "loss": 0.8372, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 15.988435745239258, "rewards/margins": 7.939582824707031, "rewards/rejected": 8.048852920532227, "step": 850 }, { "epoch": 1.43, "grad_norm": 30.419536590576172, "learning_rate": 5.380333361282537e-06, "logits/chosen": -1.4820563793182373, "logits/rejected": -1.4779971837997437, "logps/chosen": -115.8757553100586, "logps/rejected": -182.51400756835938, "loss": 0.9258, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 15.708460807800293, "rewards/margins": 6.669085502624512, "rewards/rejected": 9.039376258850098, "step": 860 }, { "epoch": 1.45, "grad_norm": 9.19847583770752, "learning_rate": 5.293115931365793e-06, "logits/chosen": -1.5117051601409912, "logits/rejected": -1.5152397155761719, "logps/chosen": -111.7296371459961, "logps/rejected": -208.90414428710938, "loss": 0.8764, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 16.2409725189209, "rewards/margins": 8.053875923156738, "rewards/rejected": 8.187097549438477, "step": 870 }, { "epoch": 1.47, "grad_norm": 5.0174384117126465, "learning_rate": 5.20580890716792e-06, "logits/chosen": -1.5034881830215454, "logits/rejected": -1.500460147857666, "logps/chosen": -108.12443542480469, "logps/rejected": -194.9343719482422, "loss": 0.7908, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 16.14035415649414, "rewards/margins": 7.5548906326293945, "rewards/rejected": 8.585463523864746, "step": 880 }, { "epoch": 1.48, "grad_norm": 51.05670928955078, "learning_rate": 5.118438975092605e-06, "logits/chosen": -1.503549575805664, "logits/rejected": -1.5023475885391235, "logps/chosen": -113.84260559082031, "logps/rejected": -193.15135192871094, "loss": 0.8541, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 15.795028686523438, "rewards/margins": 7.298943519592285, "rewards/rejected": 8.496085166931152, "step": 890 }, { "epoch": 1.5, "grad_norm": 10.384552955627441, "learning_rate": 5.031032840772048e-06, "logits/chosen": -1.49759840965271, "logits/rejected": -1.491120457649231, "logps/chosen": -110.45916748046875, "logps/rejected": -189.4463653564453, "loss": 0.8419, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 16.17422866821289, "rewards/margins": 7.4651031494140625, "rewards/rejected": 8.709126472473145, "step": 900 }, { "epoch": 1.52, "grad_norm": 21.21319007873535, "learning_rate": 4.943617220904091e-06, "logits/chosen": -1.5416353940963745, "logits/rejected": -1.5274879932403564, "logps/chosen": -113.6543197631836, "logps/rejected": -192.6654815673828, "loss": 0.8271, "rewards/accuracies": 0.9375, "rewards/chosen": 17.103158950805664, "rewards/margins": 9.091134071350098, "rewards/rejected": 8.01202392578125, "step": 910 }, { "epoch": 1.53, "grad_norm": 4.474107265472412, "learning_rate": 4.856218835085946e-06, "logits/chosen": -1.5196397304534912, "logits/rejected": -1.5235029458999634, "logps/chosen": -102.72175598144531, "logps/rejected": -212.8190155029297, "loss": 0.7193, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 16.488685607910156, "rewards/margins": 8.548524856567383, "rewards/rejected": 7.940161228179932, "step": 920 }, { "epoch": 1.55, "grad_norm": 4.788906097412109, "learning_rate": 4.768864397647031e-06, "logits/chosen": -1.5152068138122559, "logits/rejected": -1.5065648555755615, "logps/chosen": -101.84674072265625, "logps/rejected": -219.8702392578125, "loss": 0.6819, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 17.328767776489258, "rewards/margins": 10.332796096801758, "rewards/rejected": 6.995970726013184, "step": 930 }, { "epoch": 1.57, "grad_norm": 5.230409622192383, "learning_rate": 4.681580609483436e-06, "logits/chosen": -1.5237815380096436, "logits/rejected": -1.5128507614135742, "logps/chosen": -107.20805358886719, "logps/rejected": -202.7063751220703, "loss": 0.7475, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 17.05699920654297, "rewards/margins": 9.087102890014648, "rewards/rejected": 7.969895362854004, "step": 940 }, { "epoch": 1.58, "grad_norm": 4.067636013031006, "learning_rate": 4.594394149896481e-06, "logits/chosen": -1.5245317220687866, "logits/rejected": -1.5309604406356812, "logps/chosen": -104.8866958618164, "logps/rejected": -214.6997528076172, "loss": 0.7442, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 17.146936416625977, "rewards/margins": 9.150843620300293, "rewards/rejected": 7.996092319488525, "step": 950 }, { "epoch": 1.6, "grad_norm": 5.677704811096191, "learning_rate": 4.507331668437878e-06, "logits/chosen": -1.524597406387329, "logits/rejected": -1.5158151388168335, "logps/chosen": -99.23400115966797, "logps/rejected": -204.5542755126953, "loss": 0.7509, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 17.156509399414062, "rewards/margins": 9.190254211425781, "rewards/rejected": 7.966255187988281, "step": 960 }, { "epoch": 1.62, "grad_norm": 18.698612213134766, "learning_rate": 4.42041977676399e-06, "logits/chosen": -1.5082147121429443, "logits/rejected": -1.51360285282135, "logps/chosen": -101.81925964355469, "logps/rejected": -208.90847778320312, "loss": 0.7403, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 17.100299835205078, "rewards/margins": 9.655840873718262, "rewards/rejected": 7.444457054138184, "step": 970 }, { "epoch": 1.63, "grad_norm": 8.17164421081543, "learning_rate": 4.333685040501664e-06, "logits/chosen": -1.5298702716827393, "logits/rejected": -1.5242555141448975, "logps/chosen": -103.73741149902344, "logps/rejected": -200.9259796142578, "loss": 0.7967, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 16.915372848510742, "rewards/margins": 8.870743751525879, "rewards/rejected": 8.04463005065918, "step": 980 }, { "epoch": 1.65, "grad_norm": 11.650801658630371, "learning_rate": 4.247153971128145e-06, "logits/chosen": -1.518059492111206, "logits/rejected": -1.512229323387146, "logps/chosen": -100.54881286621094, "logps/rejected": -197.3603515625, "loss": 0.7513, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 17.221500396728516, "rewards/margins": 9.602907180786133, "rewards/rejected": 7.618594169616699, "step": 990 }, { "epoch": 1.67, "grad_norm": 43.56822967529297, "learning_rate": 4.160853017867531e-06, "logits/chosen": -1.5179004669189453, "logits/rejected": -1.5160772800445557, "logps/chosen": -103.8607177734375, "logps/rejected": -198.41555786132812, "loss": 0.7345, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 16.935503005981445, "rewards/margins": 8.681713104248047, "rewards/rejected": 8.253790855407715, "step": 1000 }, { "epoch": 1.67, "eval_logits/chosen": -1.4289880990982056, "eval_logits/rejected": -1.4324686527252197, "eval_logps/chosen": -150.1593017578125, "eval_logps/rejected": -177.23570251464844, "eval_loss": 2.0659372806549072, "eval_rewards/accuracies": 0.6425532102584839, "eval_rewards/chosen": 9.149377822875977, "eval_rewards/margins": 0.9979680180549622, "eval_rewards/rejected": 8.151410102844238, "eval_runtime": 280.4165, "eval_samples_per_second": 2.514, "eval_steps_per_second": 2.514, "step": 1000 }, { "epoch": 1.68, "grad_norm": 18.621665954589844, "learning_rate": 4.074808559606264e-06, "logits/chosen": -1.5129244327545166, "logits/rejected": -1.5049433708190918, "logps/chosen": -100.34537506103516, "logps/rejected": -216.16506958007812, "loss": 0.6972, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 17.9000301361084, "rewards/margins": 10.329971313476562, "rewards/rejected": 7.570061683654785, "step": 1010 }, { "epoch": 1.7, "grad_norm": 38.73393630981445, "learning_rate": 3.989046896830119e-06, "logits/chosen": -1.5301315784454346, "logits/rejected": -1.5370006561279297, "logps/chosen": -108.65140533447266, "logps/rejected": -213.85879516601562, "loss": 0.8484, "rewards/accuracies": 0.9375, "rewards/chosen": 16.508888244628906, "rewards/margins": 9.253557205200195, "rewards/rejected": 7.255330562591553, "step": 1020 }, { "epoch": 1.72, "grad_norm": 5.319835662841797, "learning_rate": 3.9035942435851504e-06, "logits/chosen": -1.5252224206924438, "logits/rejected": -1.5232148170471191, "logps/chosen": -102.89615631103516, "logps/rejected": -206.59750366210938, "loss": 0.7447, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 17.126842498779297, "rewards/margins": 9.563276290893555, "rewards/rejected": 7.563568115234375, "step": 1030 }, { "epoch": 1.73, "grad_norm": 19.18516731262207, "learning_rate": 3.818476719465073e-06, "logits/chosen": -1.5133918523788452, "logits/rejected": -1.503049612045288, "logps/chosen": -111.7210693359375, "logps/rejected": -180.65403747558594, "loss": 0.9205, "rewards/accuracies": 0.9375, "rewards/chosen": 15.973672866821289, "rewards/margins": 7.1840667724609375, "rewards/rejected": 8.789606094360352, "step": 1040 }, { "epoch": 1.75, "grad_norm": 20.20930290222168, "learning_rate": 3.7337203416274993e-06, "logits/chosen": -1.5373504161834717, "logits/rejected": -1.5284373760223389, "logps/chosen": -98.73445129394531, "logps/rejected": -208.073486328125, "loss": 0.7114, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 17.71356773376465, "rewards/margins": 10.003168106079102, "rewards/rejected": 7.710400581359863, "step": 1050 }, { "epoch": 1.77, "grad_norm": 26.072620391845703, "learning_rate": 3.6493510168414924e-06, "logits/chosen": -1.5504237413406372, "logits/rejected": -1.5454633235931396, "logps/chosen": -104.91600036621094, "logps/rejected": -212.7382354736328, "loss": 0.8006, "rewards/accuracies": 0.9375, "rewards/chosen": 16.929195404052734, "rewards/margins": 10.16772174835205, "rewards/rejected": 6.761473178863525, "step": 1060 }, { "epoch": 1.78, "grad_norm": 4.6664628982543945, "learning_rate": 3.5653945335688688e-06, "logits/chosen": -1.520021677017212, "logits/rejected": -1.5187304019927979, "logps/chosen": -99.18524932861328, "logps/rejected": -208.15414428710938, "loss": 0.7484, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 17.673595428466797, "rewards/margins": 9.892778396606445, "rewards/rejected": 7.780816555023193, "step": 1070 }, { "epoch": 1.8, "grad_norm": 17.390003204345703, "learning_rate": 3.4818765540816505e-06, "logits/chosen": -1.5037453174591064, "logits/rejected": -1.4973242282867432, "logps/chosen": -120.0849609375, "logps/rejected": -196.7596893310547, "loss": 0.8644, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 15.961163520812988, "rewards/margins": 8.226602554321289, "rewards/rejected": 7.734560489654541, "step": 1080 }, { "epoch": 1.82, "grad_norm": 29.988948822021484, "learning_rate": 3.398822606618095e-06, "logits/chosen": -1.507930040359497, "logits/rejected": -1.5118629932403564, "logps/chosen": -95.96809387207031, "logps/rejected": -216.37130737304688, "loss": 0.6905, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 17.203866958618164, "rewards/margins": 9.483266830444336, "rewards/rejected": 7.7205986976623535, "step": 1090 }, { "epoch": 1.83, "grad_norm": 23.51713752746582, "learning_rate": 3.3162580775796994e-06, "logits/chosen": -1.4951313734054565, "logits/rejected": -1.48732590675354, "logps/chosen": -102.76708984375, "logps/rejected": -198.36593627929688, "loss": 0.8403, "rewards/accuracies": 0.9375, "rewards/chosen": 15.965538024902344, "rewards/margins": 7.880660057067871, "rewards/rejected": 8.084877967834473, "step": 1100 }, { "epoch": 1.85, "grad_norm": 5.080748558044434, "learning_rate": 3.2342082037715404e-06, "logits/chosen": -1.4765777587890625, "logits/rejected": -1.4732264280319214, "logps/chosen": -101.1002426147461, "logps/rejected": -198.5289306640625, "loss": 0.7783, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 16.523101806640625, "rewards/margins": 8.514945983886719, "rewards/rejected": 8.008153915405273, "step": 1110 }, { "epoch": 1.87, "grad_norm": 15.195828437805176, "learning_rate": 3.1526980646883664e-06, "logits/chosen": -1.49948251247406, "logits/rejected": -1.49336838722229, "logps/chosen": -113.28621673583984, "logps/rejected": -198.71463012695312, "loss": 0.8398, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 16.502878189086914, "rewards/margins": 8.838693618774414, "rewards/rejected": 7.664183139801025, "step": 1120 }, { "epoch": 1.88, "grad_norm": 8.84524154663086, "learning_rate": 3.071752574848747e-06, "logits/chosen": -1.5089815855026245, "logits/rejected": -1.5023810863494873, "logps/chosen": -109.5863265991211, "logps/rejected": -200.3076934814453, "loss": 0.7723, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 16.556867599487305, "rewards/margins": 8.802556037902832, "rewards/rejected": 7.754312992095947, "step": 1130 }, { "epoch": 1.9, "grad_norm": 5.309729099273682, "learning_rate": 2.991396476179671e-06, "logits/chosen": -1.5207172632217407, "logits/rejected": -1.5075544118881226, "logps/chosen": -104.1460189819336, "logps/rejected": -200.79122924804688, "loss": 0.7185, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 18.383495330810547, "rewards/margins": 9.844137191772461, "rewards/rejected": 8.539360046386719, "step": 1140 }, { "epoch": 1.92, "grad_norm": 17.16521644592285, "learning_rate": 2.911654330453882e-06, "logits/chosen": -1.5120269060134888, "logits/rejected": -1.5061429738998413, "logps/chosen": -104.67869567871094, "logps/rejected": -189.43106079101562, "loss": 0.8524, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 17.262327194213867, "rewards/margins": 8.857732772827148, "rewards/rejected": 8.404593467712402, "step": 1150 }, { "epoch": 1.93, "grad_norm": 6.592312812805176, "learning_rate": 2.8325505117822984e-06, "logits/chosen": -1.51890230178833, "logits/rejected": -1.5155048370361328, "logps/chosen": -101.92094421386719, "logps/rejected": -207.8151092529297, "loss": 0.6951, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 16.95406723022461, "rewards/margins": 9.166045188903809, "rewards/rejected": 7.788022518157959, "step": 1160 }, { "epoch": 1.95, "grad_norm": 14.916762351989746, "learning_rate": 2.754109199163771e-06, "logits/chosen": -1.5025979280471802, "logits/rejected": -1.4987188577651978, "logps/chosen": -101.9366226196289, "logps/rejected": -201.5316162109375, "loss": 0.7587, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 17.186330795288086, "rewards/margins": 8.913457870483398, "rewards/rejected": 8.272873878479004, "step": 1170 }, { "epoch": 1.97, "grad_norm": 39.37679672241211, "learning_rate": 2.6763543690945004e-06, "logits/chosen": -1.5020328760147095, "logits/rejected": -1.4887430667877197, "logps/chosen": -109.6100845336914, "logps/rejected": -188.98495483398438, "loss": 0.913, "rewards/accuracies": 0.9375, "rewards/chosen": 17.317249298095703, "rewards/margins": 8.822264671325684, "rewards/rejected": 8.49498462677002, "step": 1180 }, { "epoch": 1.98, "grad_norm": 3.733832597732544, "learning_rate": 2.599309788239339e-06, "logits/chosen": -1.5142863988876343, "logits/rejected": -1.5111477375030518, "logps/chosen": -106.55147552490234, "logps/rejected": -215.3072967529297, "loss": 0.7224, "rewards/accuracies": 1.0, "rewards/chosen": 17.323034286499023, "rewards/margins": 9.6803560256958, "rewards/rejected": 7.642678260803223, "step": 1190 }, { "epoch": 2.0, "grad_norm": 15.505064010620117, "learning_rate": 2.5229990061672414e-06, "logits/chosen": -1.5279539823532104, "logits/rejected": -1.5259180068969727, "logps/chosen": -97.4522476196289, "logps/rejected": -206.6697235107422, "loss": 0.6609, "rewards/accuracies": 1.0, "rewards/chosen": 18.010801315307617, "rewards/margins": 10.6015043258667, "rewards/rejected": 7.409295558929443, "step": 1200 }, { "epoch": 2.0, "eval_logits/chosen": -1.4321023225784302, "eval_logits/rejected": -1.4358972311019897, "eval_logps/chosen": -151.3264923095703, "eval_logps/rejected": -180.62367248535156, "eval_loss": 2.032116174697876, "eval_rewards/accuracies": 0.6680850982666016, "eval_rewards/chosen": 9.032657623291016, "eval_rewards/margins": 1.22004234790802, "eval_rewards/rejected": 7.812614917755127, "eval_runtime": 280.3663, "eval_samples_per_second": 2.515, "eval_steps_per_second": 2.515, "step": 1200 }, { "epoch": 2.02, "grad_norm": 7.859879493713379, "learning_rate": 2.4474453481530587e-06, "logits/chosen": -1.5223504304885864, "logits/rejected": -1.5135400295257568, "logps/chosen": -93.24683380126953, "logps/rejected": -219.54849243164062, "loss": 0.6373, "rewards/accuracies": 1.0, "rewards/chosen": 18.513376235961914, "rewards/margins": 11.950658798217773, "rewards/rejected": 6.562716484069824, "step": 1210 }, { "epoch": 2.03, "grad_norm": 27.701181411743164, "learning_rate": 2.3726719080478962e-06, "logits/chosen": -1.509887933731079, "logits/rejected": -1.5058742761611938, "logps/chosen": -103.5932388305664, "logps/rejected": -203.78501892089844, "loss": 0.7128, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 17.814315795898438, "rewards/margins": 9.831649780273438, "rewards/rejected": 7.982666969299316, "step": 1220 }, { "epoch": 2.05, "grad_norm": 8.064666748046875, "learning_rate": 2.298701541220218e-06, "logits/chosen": -1.5236984491348267, "logits/rejected": -1.5181363821029663, "logps/chosen": -100.51972961425781, "logps/rejected": -202.49575805664062, "loss": 0.6432, "rewards/accuracies": 1.0, "rewards/chosen": 18.57242202758789, "rewards/margins": 11.497468948364258, "rewards/rejected": 7.074953556060791, "step": 1230 }, { "epoch": 2.07, "grad_norm": 5.775174140930176, "learning_rate": 2.22555685756983e-06, "logits/chosen": -1.5132160186767578, "logits/rejected": -1.503983974456787, "logps/chosen": -96.10552978515625, "logps/rejected": -201.612060546875, "loss": 0.6538, "rewards/accuracies": 1.0, "rewards/chosen": 18.1705322265625, "rewards/margins": 10.621967315673828, "rewards/rejected": 7.548564910888672, "step": 1240 }, { "epoch": 2.08, "grad_norm": 7.816349983215332, "learning_rate": 2.153260214616915e-06, "logits/chosen": -1.5382534265518188, "logits/rejected": -1.5415149927139282, "logps/chosen": -95.2313461303711, "logps/rejected": -223.0685577392578, "loss": 0.5986, "rewards/accuracies": 1.0, "rewards/chosen": 18.566387176513672, "rewards/margins": 11.582368850708008, "rewards/rejected": 6.9840192794799805, "step": 1250 }, { "epoch": 2.1, "grad_norm": 5.086969375610352, "learning_rate": 2.081833710668181e-06, "logits/chosen": -1.494619607925415, "logits/rejected": -1.4864647388458252, "logps/chosen": -93.64717864990234, "logps/rejected": -195.05477905273438, "loss": 0.6944, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 17.304344177246094, "rewards/margins": 10.235904693603516, "rewards/rejected": 7.068438529968262, "step": 1260 }, { "epoch": 2.12, "grad_norm": 3.984151840209961, "learning_rate": 2.0112991780622725e-06, "logits/chosen": -1.4988569021224976, "logits/rejected": -1.4943821430206299, "logps/chosen": -102.37433624267578, "logps/rejected": -204.2311553955078, "loss": 0.6708, "rewards/accuracies": 1.0, "rewards/chosen": 18.241689682006836, "rewards/margins": 10.953558921813965, "rewards/rejected": 7.288130283355713, "step": 1270 }, { "epoch": 2.13, "grad_norm": 39.707828521728516, "learning_rate": 1.9416781764964486e-06, "logits/chosen": -1.5009758472442627, "logits/rejected": -1.5007375478744507, "logps/chosen": -95.72146606445312, "logps/rejected": -212.9442901611328, "loss": 0.654, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 17.809816360473633, "rewards/margins": 10.415254592895508, "rewards/rejected": 7.394561767578125, "step": 1280 }, { "epoch": 2.15, "grad_norm": 6.5094475746154785, "learning_rate": 1.8729919864366292e-06, "logits/chosen": -1.5389680862426758, "logits/rejected": -1.5245224237442017, "logps/chosen": -93.90787506103516, "logps/rejected": -200.7926788330078, "loss": 0.6446, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 18.495624542236328, "rewards/margins": 10.695076942443848, "rewards/rejected": 7.800548553466797, "step": 1290 }, { "epoch": 2.17, "grad_norm": 11.99113655090332, "learning_rate": 1.8052616026127563e-06, "logits/chosen": -1.5282859802246094, "logits/rejected": -1.5253300666809082, "logps/chosen": -98.3044204711914, "logps/rejected": -214.7481689453125, "loss": 0.6627, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 17.510263442993164, "rewards/margins": 10.662416458129883, "rewards/rejected": 6.847846984863281, "step": 1300 }, { "epoch": 2.18, "grad_norm": 14.8443021774292, "learning_rate": 1.7385077276015267e-06, "logits/chosen": -1.5107916593551636, "logits/rejected": -1.513163685798645, "logps/chosen": -92.03627014160156, "logps/rejected": -213.4009552001953, "loss": 0.6345, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 18.635957717895508, "rewards/margins": 11.570890426635742, "rewards/rejected": 7.065066337585449, "step": 1310 }, { "epoch": 2.2, "grad_norm": 12.072715759277344, "learning_rate": 1.6727507654983977e-06, "logits/chosen": -1.5300309658050537, "logits/rejected": -1.523341417312622, "logps/chosen": -107.01918029785156, "logps/rejected": -197.11643981933594, "loss": 0.7373, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 17.78557014465332, "rewards/margins": 10.176875114440918, "rewards/rejected": 7.608694553375244, "step": 1320 }, { "epoch": 2.22, "grad_norm": 8.86849594116211, "learning_rate": 1.6080108156808439e-06, "logits/chosen": -1.5328854322433472, "logits/rejected": -1.5194957256317139, "logps/chosen": -100.20330810546875, "logps/rejected": -195.58236694335938, "loss": 0.6678, "rewards/accuracies": 1.0, "rewards/chosen": 18.137895584106445, "rewards/margins": 10.396581649780273, "rewards/rejected": 7.7413129806518555, "step": 1330 }, { "epoch": 2.23, "grad_norm": 4.92030143737793, "learning_rate": 1.5443076666647545e-06, "logits/chosen": -1.5401328802108765, "logits/rejected": -1.5302283763885498, "logps/chosen": -87.4715805053711, "logps/rejected": -214.174560546875, "loss": 0.5832, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 19.330814361572266, "rewards/margins": 11.920788764953613, "rewards/rejected": 7.410025596618652, "step": 1340 }, { "epoch": 2.25, "grad_norm": 5.7385783195495605, "learning_rate": 1.4816607900558311e-06, "logits/chosen": -1.5227100849151611, "logits/rejected": -1.5200421810150146, "logps/chosen": -87.50262451171875, "logps/rejected": -214.90176391601562, "loss": 0.6009, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 18.722919464111328, "rewards/margins": 11.610313415527344, "rewards/rejected": 7.112607002258301, "step": 1350 }, { "epoch": 2.27, "grad_norm": 4.156861305236816, "learning_rate": 1.4200893345978816e-06, "logits/chosen": -1.5246403217315674, "logits/rejected": -1.5162007808685303, "logps/chosen": -95.39898681640625, "logps/rejected": -208.87509155273438, "loss": 0.6347, "rewards/accuracies": 1.0, "rewards/chosen": 18.26802635192871, "rewards/margins": 11.145674705505371, "rewards/rejected": 7.122353553771973, "step": 1360 }, { "epoch": 2.28, "grad_norm": 3.7285964488983154, "learning_rate": 1.3596121203197715e-06, "logits/chosen": -1.5043359994888306, "logits/rejected": -1.5017638206481934, "logps/chosen": -101.5743637084961, "logps/rejected": -205.45144653320312, "loss": 0.7345, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 17.542848587036133, "rewards/margins": 10.09937858581543, "rewards/rejected": 7.443469047546387, "step": 1370 }, { "epoch": 2.3, "grad_norm": 5.035600662231445, "learning_rate": 1.3002476327828717e-06, "logits/chosen": -1.5371487140655518, "logits/rejected": -1.5391342639923096, "logps/chosen": -102.25202941894531, "logps/rejected": -217.117919921875, "loss": 0.6705, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 18.24382972717285, "rewards/margins": 10.82688045501709, "rewards/rejected": 7.416950225830078, "step": 1380 }, { "epoch": 2.32, "grad_norm": 10.119084358215332, "learning_rate": 1.2420140174307267e-06, "logits/chosen": -1.5054762363433838, "logits/rejected": -1.5047041177749634, "logps/chosen": -90.96085357666016, "logps/rejected": -203.27731323242188, "loss": 0.6701, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 16.905033111572266, "rewards/margins": 9.766073226928711, "rewards/rejected": 7.1389594078063965, "step": 1390 }, { "epoch": 2.33, "grad_norm": 4.378362655639648, "learning_rate": 1.1849290740426994e-06, "logits/chosen": -1.52398681640625, "logits/rejected": -1.5253530740737915, "logps/chosen": -100.94436645507812, "logps/rejected": -208.01913452148438, "loss": 0.6768, "rewards/accuracies": 1.0, "rewards/chosen": 17.257160186767578, "rewards/margins": 10.379103660583496, "rewards/rejected": 6.878057956695557, "step": 1400 }, { "epoch": 2.33, "eval_logits/chosen": -1.4431970119476318, "eval_logits/rejected": -1.447227120399475, "eval_logps/chosen": -150.6456756591797, "eval_logps/rejected": -179.82106018066406, "eval_loss": 2.0312814712524414, "eval_rewards/accuracies": 0.6709219813346863, "eval_rewards/chosen": 9.100737571716309, "eval_rewards/margins": 1.207862377166748, "eval_rewards/rejected": 7.892876148223877, "eval_runtime": 280.087, "eval_samples_per_second": 2.517, "eval_steps_per_second": 2.517, "step": 1400 }, { "epoch": 2.35, "grad_norm": 4.4983320236206055, "learning_rate": 1.1290102512932482e-06, "logits/chosen": -1.5456353425979614, "logits/rejected": -1.5437848567962646, "logps/chosen": -90.5999984741211, "logps/rejected": -215.61239624023438, "loss": 0.6084, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 18.774730682373047, "rewards/margins": 11.802639961242676, "rewards/rejected": 6.972087860107422, "step": 1410 }, { "epoch": 2.37, "grad_norm": 8.901042938232422, "learning_rate": 1.074274641418554e-06, "logits/chosen": -1.5062071084976196, "logits/rejected": -1.502605676651001, "logps/chosen": -93.99595642089844, "logps/rejected": -201.00819396972656, "loss": 0.6596, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 17.505390167236328, "rewards/margins": 10.57873249053955, "rewards/rejected": 6.926657199859619, "step": 1420 }, { "epoch": 2.38, "grad_norm": 5.139017581939697, "learning_rate": 1.0207389749920593e-06, "logits/chosen": -1.509242057800293, "logits/rejected": -1.511796474456787, "logps/chosen": -96.8970718383789, "logps/rejected": -215.5059051513672, "loss": 0.6782, "rewards/accuracies": 1.0, "rewards/chosen": 17.298181533813477, "rewards/margins": 10.531153678894043, "rewards/rejected": 6.767026424407959, "step": 1430 }, { "epoch": 2.4, "grad_norm": 4.371284484863281, "learning_rate": 9.68419615810598e-07, "logits/chosen": -1.5192222595214844, "logits/rejected": -1.5184710025787354, "logps/chosen": -92.4468002319336, "logps/rejected": -208.1323699951172, "loss": 0.6454, "rewards/accuracies": 1.0, "rewards/chosen": 17.749366760253906, "rewards/margins": 10.604511260986328, "rewards/rejected": 7.144855499267578, "step": 1440 }, { "epoch": 2.42, "grad_norm": 3.970381259918213, "learning_rate": 9.173325558925905e-07, "logits/chosen": -1.5145829916000366, "logits/rejected": -1.51982843875885, "logps/chosen": -90.59550476074219, "logps/rejected": -224.2189483642578, "loss": 0.6051, "rewards/accuracies": 1.0, "rewards/chosen": 17.675228118896484, "rewards/margins": 11.378229141235352, "rewards/rejected": 6.296999931335449, "step": 1450 }, { "epoch": 2.43, "grad_norm": 4.847020626068115, "learning_rate": 8.674934105899152e-07, "logits/chosen": -1.4975147247314453, "logits/rejected": -1.4984054565429688, "logps/chosen": -95.9659423828125, "logps/rejected": -205.8800048828125, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 17.054203033447266, "rewards/margins": 9.49101448059082, "rewards/rejected": 7.563189506530762, "step": 1460 }, { "epoch": 2.45, "grad_norm": 4.591710090637207, "learning_rate": 8.189174138148814e-07, "logits/chosen": -1.5397193431854248, "logits/rejected": -1.5424444675445557, "logps/chosen": -83.99203491210938, "logps/rejected": -215.84378051757812, "loss": 0.5823, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 18.956871032714844, "rewards/margins": 11.750204086303711, "rewards/rejected": 7.206667900085449, "step": 1470 }, { "epoch": 2.47, "grad_norm": 5.379372596740723, "learning_rate": 7.716194133838135e-07, "logits/chosen": -1.5043582916259766, "logits/rejected": -1.507912278175354, "logps/chosen": -94.89381408691406, "logps/rejected": -209.4828338623047, "loss": 0.654, "rewards/accuracies": 1.0, "rewards/chosen": 17.687734603881836, "rewards/margins": 10.847681045532227, "rewards/rejected": 6.840054988861084, "step": 1480 }, { "epoch": 2.48, "grad_norm": 7.855030536651611, "learning_rate": 7.256138664786477e-07, "logits/chosen": -1.5224730968475342, "logits/rejected": -1.5275871753692627, "logps/chosen": -83.72583770751953, "logps/rejected": -211.76937866210938, "loss": 0.5732, "rewards/accuracies": 1.0, "rewards/chosen": 19.119752883911133, "rewards/margins": 12.638054847717285, "rewards/rejected": 6.481698513031006, "step": 1490 }, { "epoch": 2.5, "grad_norm": 3.8725578784942627, "learning_rate": 6.809148352279182e-07, "logits/chosen": -1.5395710468292236, "logits/rejected": -1.5471025705337524, "logps/chosen": -90.97738647460938, "logps/rejected": -220.6120147705078, "loss": 0.6148, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 18.08513641357422, "rewards/margins": 11.457734107971191, "rewards/rejected": 6.627403259277344, "step": 1500 }, { "epoch": 2.52, "grad_norm": 8.233463287353516, "learning_rate": 6.375359824085126e-07, "logits/chosen": -1.5111545324325562, "logits/rejected": -1.5080124139785767, "logps/chosen": -100.68155670166016, "logps/rejected": -211.43820190429688, "loss": 0.7931, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 17.69210433959961, "rewards/margins": 10.682944297790527, "rewards/rejected": 7.009159088134766, "step": 1510 }, { "epoch": 2.53, "grad_norm": 5.737654209136963, "learning_rate": 5.954905672694805e-07, "logits/chosen": -1.5383660793304443, "logits/rejected": -1.5310518741607666, "logps/chosen": -91.32364654541016, "logps/rejected": -210.65103149414062, "loss": 0.5817, "rewards/accuracies": 1.0, "rewards/chosen": 19.20272445678711, "rewards/margins": 11.968230247497559, "rewards/rejected": 7.23449182510376, "step": 1520 }, { "epoch": 2.55, "grad_norm": 5.602839469909668, "learning_rate": 5.547914414791922e-07, "logits/chosen": -1.523057222366333, "logits/rejected": -1.526564121246338, "logps/chosen": -92.25067901611328, "logps/rejected": -210.3281707763672, "loss": 0.6644, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 17.999813079833984, "rewards/margins": 10.866498947143555, "rewards/rejected": 7.133312225341797, "step": 1530 }, { "epoch": 2.57, "grad_norm": 80.35784912109375, "learning_rate": 5.154510451970762e-07, "logits/chosen": -1.5100964307785034, "logits/rejected": -1.5057765245437622, "logps/chosen": -91.3646469116211, "logps/rejected": -208.1036834716797, "loss": 0.698, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 17.777923583984375, "rewards/margins": 10.551263809204102, "rewards/rejected": 7.22666072845459, "step": 1540 }, { "epoch": 2.58, "grad_norm": 11.723702430725098, "learning_rate": 4.774814032711461e-07, "logits/chosen": -1.5164598226547241, "logits/rejected": -1.5183497667312622, "logps/chosen": -99.03367614746094, "logps/rejected": -213.4344024658203, "loss": 0.6773, "rewards/accuracies": 1.0, "rewards/chosen": 17.00816535949707, "rewards/margins": 10.20258903503418, "rewards/rejected": 6.805574893951416, "step": 1550 }, { "epoch": 2.6, "grad_norm": 10.103087425231934, "learning_rate": 4.4089412156245793e-07, "logits/chosen": -1.5356425046920776, "logits/rejected": -1.5423994064331055, "logps/chosen": -83.90791320800781, "logps/rejected": -210.4249267578125, "loss": 0.5745, "rewards/accuracies": 1.0, "rewards/chosen": 18.552326202392578, "rewards/margins": 11.089494705200195, "rewards/rejected": 7.462831020355225, "step": 1560 }, { "epoch": 2.62, "grad_norm": 8.58123779296875, "learning_rate": 4.0570038339764803e-07, "logits/chosen": -1.5257699489593506, "logits/rejected": -1.521463394165039, "logps/chosen": -99.25550842285156, "logps/rejected": -212.4494171142578, "loss": 0.6679, "rewards/accuracies": 1.0, "rewards/chosen": 18.084367752075195, "rewards/margins": 11.122998237609863, "rewards/rejected": 6.961369514465332, "step": 1570 }, { "epoch": 2.63, "grad_norm": 5.0036845207214355, "learning_rate": 3.719109461506215e-07, "logits/chosen": -1.4980933666229248, "logits/rejected": -1.5047228336334229, "logps/chosen": -89.56025695800781, "logps/rejected": -215.61279296875, "loss": 0.6389, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 17.403789520263672, "rewards/margins": 10.387609481811523, "rewards/rejected": 7.016180515289307, "step": 1580 }, { "epoch": 2.65, "grad_norm": 5.2077412605285645, "learning_rate": 3.3953613795443376e-07, "logits/chosen": -1.5230729579925537, "logits/rejected": -1.5227991342544556, "logps/chosen": -90.34446716308594, "logps/rejected": -208.3804931640625, "loss": 0.7039, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 18.70303726196289, "rewards/margins": 11.128946304321289, "rewards/rejected": 7.574089050292969, "step": 1590 }, { "epoch": 2.67, "grad_norm": 5.617701530456543, "learning_rate": 3.0858585454437927e-07, "logits/chosen": -1.508111596107483, "logits/rejected": -1.5131912231445312, "logps/chosen": -89.39806365966797, "logps/rejected": -204.76675415039062, "loss": 0.615, "rewards/accuracies": 1.0, "rewards/chosen": 17.604631423950195, "rewards/margins": 10.715816497802734, "rewards/rejected": 6.8888139724731445, "step": 1600 }, { "epoch": 2.67, "eval_logits/chosen": -1.4370256662368774, "eval_logits/rejected": -1.4413024187088013, "eval_logps/chosen": -150.68116760253906, "eval_logps/rejected": -179.16799926757812, "eval_loss": 2.051481008529663, "eval_rewards/accuracies": 0.6624113321304321, "eval_rewards/chosen": 9.097188949584961, "eval_rewards/margins": 1.1390060186386108, "eval_rewards/rejected": 7.958182334899902, "eval_runtime": 280.5593, "eval_samples_per_second": 2.513, "eval_steps_per_second": 2.513, "step": 1600 }, { "epoch": 2.68, "grad_norm": 18.668004989624023, "learning_rate": 2.7906955623324074e-07, "logits/chosen": -1.519481897354126, "logits/rejected": -1.5189173221588135, "logps/chosen": -93.53435516357422, "logps/rejected": -198.51348876953125, "loss": 0.6712, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 17.499284744262695, "rewards/margins": 10.426948547363281, "rewards/rejected": 7.072335720062256, "step": 1610 }, { "epoch": 2.7, "grad_norm": 4.54674768447876, "learning_rate": 2.509962650196407e-07, "logits/chosen": -1.5232311487197876, "logits/rejected": -1.5235909223556519, "logps/chosen": -89.19126892089844, "logps/rejected": -211.9619140625, "loss": 0.6203, "rewards/accuracies": 1.0, "rewards/chosen": 18.46454620361328, "rewards/margins": 11.837278366088867, "rewards/rejected": 6.627265930175781, "step": 1620 }, { "epoch": 2.72, "grad_norm": 13.877330780029297, "learning_rate": 2.2437456183035833e-07, "logits/chosen": -1.5302555561065674, "logits/rejected": -1.5268919467926025, "logps/chosen": -90.18998718261719, "logps/rejected": -205.0181121826172, "loss": 0.6213, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 18.42779541015625, "rewards/margins": 11.21821403503418, "rewards/rejected": 7.209580898284912, "step": 1630 }, { "epoch": 2.73, "grad_norm": 7.050842761993408, "learning_rate": 1.99212583897474e-07, "logits/chosen": -1.5237982273101807, "logits/rejected": -1.5170161724090576, "logps/chosen": -96.49137115478516, "logps/rejected": -208.6808319091797, "loss": 0.6311, "rewards/accuracies": 1.0, "rewards/chosen": 17.960681915283203, "rewards/margins": 11.038247108459473, "rewards/rejected": 6.9224348068237305, "step": 1640 }, { "epoch": 2.75, "grad_norm": 9.275837898254395, "learning_rate": 1.7551802227112558e-07, "logits/chosen": -1.5040963888168335, "logits/rejected": -1.5078271627426147, "logps/chosen": -91.1369857788086, "logps/rejected": -209.1532745361328, "loss": 0.622, "rewards/accuracies": 1.0, "rewards/chosen": 17.7973690032959, "rewards/margins": 10.730772972106934, "rewards/rejected": 7.066598415374756, "step": 1650 }, { "epoch": 2.77, "grad_norm": 5.387999534606934, "learning_rate": 1.5329811946865392e-07, "logits/chosen": -1.5108160972595215, "logits/rejected": -1.5107080936431885, "logps/chosen": -98.21044921875, "logps/rejected": -212.44580078125, "loss": 0.6516, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 17.72305679321289, "rewards/margins": 10.394246101379395, "rewards/rejected": 7.328810214996338, "step": 1660 }, { "epoch": 2.78, "grad_norm": 5.4950714111328125, "learning_rate": 1.3255966726084036e-07, "logits/chosen": -1.504311442375183, "logits/rejected": -1.5167784690856934, "logps/chosen": -87.20539093017578, "logps/rejected": -200.42111206054688, "loss": 0.6145, "rewards/accuracies": 1.0, "rewards/chosen": 17.13315200805664, "rewards/margins": 10.226241111755371, "rewards/rejected": 6.9069085121154785, "step": 1670 }, { "epoch": 2.8, "grad_norm": 6.581058979034424, "learning_rate": 1.1330900459592564e-07, "logits/chosen": -1.5215296745300293, "logits/rejected": -1.5147764682769775, "logps/chosen": -92.18501281738281, "logps/rejected": -193.42245483398438, "loss": 0.6203, "rewards/accuracies": 1.0, "rewards/chosen": 17.840869903564453, "rewards/margins": 10.311830520629883, "rewards/rejected": 7.529041290283203, "step": 1680 }, { "epoch": 2.82, "grad_norm": 29.634891510009766, "learning_rate": 9.55520156620332e-08, "logits/chosen": -1.5149075984954834, "logits/rejected": -1.5113269090652466, "logps/chosen": -92.65069580078125, "logps/rejected": -197.09310913085938, "loss": 0.6481, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 18.486942291259766, "rewards/margins": 11.153399467468262, "rewards/rejected": 7.333543300628662, "step": 1690 }, { "epoch": 2.83, "grad_norm": 7.716902256011963, "learning_rate": 7.92941280886056e-08, "logits/chosen": -1.5236696004867554, "logits/rejected": -1.5350602865219116, "logps/chosen": -83.11317443847656, "logps/rejected": -225.55429077148438, "loss": 0.619, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 18.241207122802734, "rewards/margins": 11.599997520446777, "rewards/rejected": 6.641209602355957, "step": 1700 }, { "epoch": 2.85, "grad_norm": 9.752326011657715, "learning_rate": 6.454031128737881e-08, "logits/chosen": -1.5312079191207886, "logits/rejected": -1.5227811336517334, "logps/chosen": -93.35438537597656, "logps/rejected": -215.9623260498047, "loss": 0.6241, "rewards/accuracies": 1.0, "rewards/chosen": 19.262916564941406, "rewards/margins": 12.349664688110352, "rewards/rejected": 6.9132513999938965, "step": 1710 }, { "epoch": 2.87, "grad_norm": 9.797955513000488, "learning_rate": 5.129507493343011e-08, "logits/chosen": -1.511307716369629, "logits/rejected": -1.5151628255844116, "logps/chosen": -94.07261657714844, "logps/rejected": -221.2146453857422, "loss": 0.6836, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 18.428720474243164, "rewards/margins": 11.63404369354248, "rewards/rejected": 6.794674873352051, "step": 1720 }, { "epoch": 2.88, "grad_norm": 4.7570719718933105, "learning_rate": 3.956246758674065e-08, "logits/chosen": -1.5266796350479126, "logits/rejected": -1.5276660919189453, "logps/chosen": -96.55259704589844, "logps/rejected": -218.0394744873047, "loss": 0.6813, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 18.319250106811523, "rewards/margins": 11.030166625976562, "rewards/rejected": 7.289083003997803, "step": 1730 }, { "epoch": 2.9, "grad_norm": 5.806319713592529, "learning_rate": 2.934607545470536e-08, "logits/chosen": -1.5195215940475464, "logits/rejected": -1.5297635793685913, "logps/chosen": -87.46583557128906, "logps/rejected": -229.15591430664062, "loss": 0.6653, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 18.350963592529297, "rewards/margins": 11.915371894836426, "rewards/rejected": 6.4355902671813965, "step": 1740 }, { "epoch": 2.92, "grad_norm": 4.364658832550049, "learning_rate": 2.0649021295970906e-08, "logits/chosen": -1.5325102806091309, "logits/rejected": -1.5392825603485107, "logps/chosen": -89.66578674316406, "logps/rejected": -212.051513671875, "loss": 0.6212, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 18.12158966064453, "rewards/margins": 11.411016464233398, "rewards/rejected": 6.710572719573975, "step": 1750 }, { "epoch": 2.93, "grad_norm": 7.178764343261719, "learning_rate": 1.3473963465924222e-08, "logits/chosen": -1.5103847980499268, "logits/rejected": -1.510608434677124, "logps/chosen": -90.99683380126953, "logps/rejected": -202.84072875976562, "loss": 0.6385, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 17.887941360473633, "rewards/margins": 11.204258918762207, "rewards/rejected": 6.683682441711426, "step": 1760 }, { "epoch": 2.95, "grad_norm": 9.613419532775879, "learning_rate": 7.823095104137479e-09, "logits/chosen": -1.5148423910140991, "logits/rejected": -1.5170824527740479, "logps/chosen": -101.44172668457031, "logps/rejected": -203.09060668945312, "loss": 0.7437, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 17.126873016357422, "rewards/margins": 9.506840705871582, "rewards/rejected": 7.620034694671631, "step": 1770 }, { "epoch": 2.97, "grad_norm": 3.819849729537964, "learning_rate": 3.6981434640093183e-09, "logits/chosen": -1.5141820907592773, "logits/rejected": -1.5163359642028809, "logps/chosen": -91.76555633544922, "logps/rejected": -208.36703491210938, "loss": 0.6192, "rewards/accuracies": 1.0, "rewards/chosen": 18.568470001220703, "rewards/margins": 11.578906059265137, "rewards/rejected": 6.989563941955566, "step": 1780 }, { "epoch": 2.98, "grad_norm": 25.042579650878906, "learning_rate": 1.1003693848093965e-09, "logits/chosen": -1.5274379253387451, "logits/rejected": -1.5291308164596558, "logps/chosen": -102.40494537353516, "logps/rejected": -220.06063842773438, "loss": 0.7154, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 17.37801742553711, "rewards/margins": 10.208986282348633, "rewards/rejected": 7.169030666351318, "step": 1790 }, { "epoch": 3.0, "step": 1797, "total_flos": 8.333096122255933e+17, "train_loss": 0.8950637202828078, "train_runtime": 14615.0375, "train_samples_per_second": 0.985, "train_steps_per_second": 0.123 } ], "logging_steps": 10, "max_steps": 1797, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "total_flos": 8.333096122255933e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }