Jiazheng Li
init push
a57f764
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.99562408835174,
"eval_steps": 200,
"global_step": 1797,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"grad_norm": 26.02830696105957,
"learning_rate": 9.999523086940423e-06,
"logits/chosen": -1.1374095678329468,
"logits/rejected": -1.1327173709869385,
"logps/chosen": -142.34921264648438,
"logps/rejected": -155.80406188964844,
"loss": 1.5859,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 13.082440376281738,
"rewards/margins": 0.9329651594161987,
"rewards/rejected": 12.14947509765625,
"step": 10
},
{
"epoch": 0.03,
"grad_norm": 21.4633846282959,
"learning_rate": 9.997817603030276e-06,
"logits/chosen": -1.1412668228149414,
"logits/rejected": -1.1413795948028564,
"logps/chosen": -150.35586547851562,
"logps/rejected": -164.76641845703125,
"loss": 1.786,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 13.009861946105957,
"rewards/margins": 0.8635275959968567,
"rewards/rejected": 12.146333694458008,
"step": 20
},
{
"epoch": 0.05,
"grad_norm": 23.037744522094727,
"learning_rate": 9.994471383754724e-06,
"logits/chosen": -1.1316417455673218,
"logits/rejected": -1.130651831626892,
"logps/chosen": -151.8024444580078,
"logps/rejected": -168.8718719482422,
"loss": 1.4256,
"rewards/accuracies": 0.75,
"rewards/chosen": 13.176553726196289,
"rewards/margins": 1.613856554031372,
"rewards/rejected": 11.562695503234863,
"step": 30
},
{
"epoch": 0.07,
"grad_norm": 14.40039348602295,
"learning_rate": 9.990154489175436e-06,
"logits/chosen": -1.1126848459243774,
"logits/rejected": -1.1085669994354248,
"logps/chosen": -149.564697265625,
"logps/rejected": -167.00857543945312,
"loss": 1.5826,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 12.614360809326172,
"rewards/margins": 1.426288366317749,
"rewards/rejected": 11.18807315826416,
"step": 40
},
{
"epoch": 0.08,
"grad_norm": 30.87997817993164,
"learning_rate": 9.983908955774398e-06,
"logits/chosen": -1.121843934059143,
"logits/rejected": -1.124288558959961,
"logps/chosen": -148.73220825195312,
"logps/rejected": -175.08212280273438,
"loss": 1.4624,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 12.753301620483398,
"rewards/margins": 1.8802299499511719,
"rewards/rejected": 10.873071670532227,
"step": 50
},
{
"epoch": 0.1,
"grad_norm": 23.71029281616211,
"learning_rate": 9.976140032846158e-06,
"logits/chosen": -1.154342532157898,
"logits/rejected": -1.1551573276519775,
"logps/chosen": -148.70213317871094,
"logps/rejected": -169.8200225830078,
"loss": 1.5662,
"rewards/accuracies": 0.75,
"rewards/chosen": 12.446992874145508,
"rewards/margins": 1.6254791021347046,
"rewards/rejected": 10.821515083312988,
"step": 60
},
{
"epoch": 0.12,
"grad_norm": 27.385414123535156,
"learning_rate": 9.966850095052043e-06,
"logits/chosen": -1.1726210117340088,
"logits/rejected": -1.1690549850463867,
"logps/chosen": -150.37200927734375,
"logps/rejected": -183.83358764648438,
"loss": 1.3566,
"rewards/accuracies": 0.8125,
"rewards/chosen": 12.22251033782959,
"rewards/margins": 2.0067410469055176,
"rewards/rejected": 10.21576976776123,
"step": 70
},
{
"epoch": 0.13,
"grad_norm": 23.488914489746094,
"learning_rate": 9.956041981969192e-06,
"logits/chosen": -1.2129347324371338,
"logits/rejected": -1.2055721282958984,
"logps/chosen": -139.1160125732422,
"logps/rejected": -180.86448669433594,
"loss": 1.3657,
"rewards/accuracies": 0.8125,
"rewards/chosen": 12.533506393432617,
"rewards/margins": 2.3238863945007324,
"rewards/rejected": 10.209619522094727,
"step": 80
},
{
"epoch": 0.15,
"grad_norm": 27.516918182373047,
"learning_rate": 9.943718997222616e-06,
"logits/chosen": -1.2162883281707764,
"logits/rejected": -1.2150709629058838,
"logps/chosen": -153.258544921875,
"logps/rejected": -169.685302734375,
"loss": 1.3773,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 12.494550704956055,
"rewards/margins": 2.459848403930664,
"rewards/rejected": 10.034701347351074,
"step": 90
},
{
"epoch": 0.17,
"grad_norm": 30.299776077270508,
"learning_rate": 9.929884907475405e-06,
"logits/chosen": -1.2348403930664062,
"logits/rejected": -1.2337309122085571,
"logps/chosen": -141.0696563720703,
"logps/rejected": -172.27981567382812,
"loss": 1.3082,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 12.868219375610352,
"rewards/margins": 2.447009325027466,
"rewards/rejected": 10.421208381652832,
"step": 100
},
{
"epoch": 0.18,
"grad_norm": 10.773354530334473,
"learning_rate": 9.914543941277401e-06,
"logits/chosen": -1.2390023469924927,
"logits/rejected": -1.2259633541107178,
"logps/chosen": -151.1343536376953,
"logps/rejected": -166.46087646484375,
"loss": 1.2816,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 12.653604507446289,
"rewards/margins": 2.4828662872314453,
"rewards/rejected": 10.170738220214844,
"step": 110
},
{
"epoch": 0.2,
"grad_norm": 18.021488189697266,
"learning_rate": 9.897700787772703e-06,
"logits/chosen": -1.2141873836517334,
"logits/rejected": -1.212436556816101,
"logps/chosen": -145.79354858398438,
"logps/rejected": -174.75421142578125,
"loss": 1.4305,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 12.540283203125,
"rewards/margins": 2.2728052139282227,
"rewards/rejected": 10.26747989654541,
"step": 120
},
{
"epoch": 0.22,
"grad_norm": 14.722694396972656,
"learning_rate": 9.879360595266359e-06,
"logits/chosen": -1.2301462888717651,
"logits/rejected": -1.2223972082138062,
"logps/chosen": -141.97836303710938,
"logps/rejected": -190.95040893554688,
"loss": 1.1746,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 12.985305786132812,
"rewards/margins": 3.339768886566162,
"rewards/rejected": 9.645538330078125,
"step": 130
},
{
"epoch": 0.23,
"grad_norm": 19.521358489990234,
"learning_rate": 9.861579077506591e-06,
"logits/chosen": -1.2298915386199951,
"logits/rejected": -1.221884846687317,
"logps/chosen": -148.5527801513672,
"logps/rejected": -180.9561004638672,
"loss": 1.3081,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 12.90058422088623,
"rewards/margins": 2.527609348297119,
"rewards/rejected": 10.372976303100586,
"step": 140
},
{
"epoch": 0.25,
"grad_norm": 21.75788688659668,
"learning_rate": 9.84041033194796e-06,
"logits/chosen": -1.1978212594985962,
"logits/rejected": -1.2021763324737549,
"logps/chosen": -149.8433837890625,
"logps/rejected": -183.22714233398438,
"loss": 1.3711,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 12.765899658203125,
"rewards/margins": 2.4878742694854736,
"rewards/rejected": 10.27802562713623,
"step": 150
},
{
"epoch": 0.27,
"grad_norm": 31.972171783447266,
"learning_rate": 9.817762058879405e-06,
"logits/chosen": -1.1968965530395508,
"logits/rejected": -1.2041929960250854,
"logps/chosen": -141.44479370117188,
"logps/rejected": -172.67782592773438,
"loss": 1.4901,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 12.295498847961426,
"rewards/margins": 1.9750818014144897,
"rewards/rejected": 10.320415496826172,
"step": 160
},
{
"epoch": 0.28,
"grad_norm": 6.441612243652344,
"learning_rate": 9.793641181008042e-06,
"logits/chosen": -1.1921595335006714,
"logits/rejected": -1.1772682666778564,
"logps/chosen": -158.01004028320312,
"logps/rejected": -174.64419555664062,
"loss": 1.3182,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 11.994952201843262,
"rewards/margins": 3.0716311931610107,
"rewards/rejected": 8.923321723937988,
"step": 170
},
{
"epoch": 0.3,
"grad_norm": 23.175006866455078,
"learning_rate": 9.76805507115971e-06,
"logits/chosen": -1.1862366199493408,
"logits/rejected": -1.1827598810195923,
"logps/chosen": -150.92054748535156,
"logps/rejected": -191.04061889648438,
"loss": 1.3317,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 12.52110767364502,
"rewards/margins": 2.7398293018341064,
"rewards/rejected": 9.781278610229492,
"step": 180
},
{
"epoch": 0.32,
"grad_norm": 20.24986457824707,
"learning_rate": 9.741011550025385e-06,
"logits/chosen": -1.1844813823699951,
"logits/rejected": -1.1813442707061768,
"logps/chosen": -146.86256408691406,
"logps/rejected": -182.008544921875,
"loss": 1.3589,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 12.942377090454102,
"rewards/margins": 2.665095329284668,
"rewards/rejected": 10.27728271484375,
"step": 190
},
{
"epoch": 0.33,
"grad_norm": 31.18330955505371,
"learning_rate": 9.71251888377069e-06,
"logits/chosen": -1.198872685432434,
"logits/rejected": -1.195305347442627,
"logps/chosen": -140.82937622070312,
"logps/rejected": -173.56301879882812,
"loss": 1.3287,
"rewards/accuracies": 0.75,
"rewards/chosen": 12.571944236755371,
"rewards/margins": 2.418260335922241,
"rewards/rejected": 10.153684616088867,
"step": 200
},
{
"epoch": 0.33,
"eval_logits/chosen": -1.186182975769043,
"eval_logits/rejected": -1.1862589120864868,
"eval_logps/chosen": -142.31016540527344,
"eval_logps/rejected": -167.46771240234375,
"eval_loss": 1.8699389696121216,
"eval_rewards/accuracies": 0.631205677986145,
"eval_rewards/chosen": 9.93428897857666,
"eval_rewards/margins": 0.8060780763626099,
"eval_rewards/rejected": 9.12821102142334,
"eval_runtime": 280.3161,
"eval_samples_per_second": 2.515,
"eval_steps_per_second": 2.515,
"step": 200
},
{
"epoch": 0.35,
"grad_norm": 23.425613403320312,
"learning_rate": 9.682585781509243e-06,
"logits/chosen": -1.2091820240020752,
"logits/rejected": -1.2033774852752686,
"logps/chosen": -147.17410278320312,
"logps/rejected": -185.01951599121094,
"loss": 1.1424,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 13.107488632202148,
"rewards/margins": 3.5036494731903076,
"rewards/rejected": 9.603840827941895,
"step": 210
},
{
"epoch": 0.37,
"grad_norm": 24.199848175048828,
"learning_rate": 9.651221392640626e-06,
"logits/chosen": -1.2453477382659912,
"logits/rejected": -1.2410730123519897,
"logps/chosen": -142.30599975585938,
"logps/rejected": -187.09365844726562,
"loss": 1.3393,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 12.409756660461426,
"rewards/margins": 2.767655849456787,
"rewards/rejected": 9.642101287841797,
"step": 220
},
{
"epoch": 0.38,
"grad_norm": 5.732828140258789,
"learning_rate": 9.618435304053756e-06,
"logits/chosen": -1.2778840065002441,
"logits/rejected": -1.2746034860610962,
"logps/chosen": -135.27261352539062,
"logps/rejected": -201.36862182617188,
"loss": 1.0207,
"rewards/accuracies": 0.9375,
"rewards/chosen": 13.215780258178711,
"rewards/margins": 4.094855308532715,
"rewards/rejected": 9.120925903320312,
"step": 230
},
{
"epoch": 0.4,
"grad_norm": 33.938697814941406,
"learning_rate": 9.584237537196539e-06,
"logits/chosen": -1.264520287513733,
"logits/rejected": -1.2698160409927368,
"logps/chosen": -139.2551727294922,
"logps/rejected": -185.9132537841797,
"loss": 1.3149,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 12.885518074035645,
"rewards/margins": 3.4518959522247314,
"rewards/rejected": 9.433621406555176,
"step": 240
},
{
"epoch": 0.42,
"grad_norm": 19.174827575683594,
"learning_rate": 9.548638545012714e-06,
"logits/chosen": -1.2648355960845947,
"logits/rejected": -1.2563896179199219,
"logps/chosen": -143.2262725830078,
"logps/rejected": -177.64321899414062,
"loss": 1.2689,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 12.918134689331055,
"rewards/margins": 3.194645881652832,
"rewards/rejected": 9.723487854003906,
"step": 250
},
{
"epoch": 0.43,
"grad_norm": 12.325043678283691,
"learning_rate": 9.511649208746768e-06,
"logits/chosen": -1.2492735385894775,
"logits/rejected": -1.2509915828704834,
"logps/chosen": -140.66622924804688,
"logps/rejected": -186.30052185058594,
"loss": 1.2776,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 13.131103515625,
"rewards/margins": 2.9642741680145264,
"rewards/rejected": 10.166828155517578,
"step": 260
},
{
"epoch": 0.45,
"grad_norm": 29.295560836791992,
"learning_rate": 9.473280834617975e-06,
"logits/chosen": -1.2667304277420044,
"logits/rejected": -1.2650549411773682,
"logps/chosen": -143.45948791503906,
"logps/rejected": -180.34130859375,
"loss": 1.3095,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 12.880398750305176,
"rewards/margins": 2.869231939315796,
"rewards/rejected": 10.011167526245117,
"step": 270
},
{
"epoch": 0.47,
"grad_norm": 24.3317928314209,
"learning_rate": 9.43354515036451e-06,
"logits/chosen": -1.2491796016693115,
"logits/rejected": -1.2445125579833984,
"logps/chosen": -140.8796844482422,
"logps/rejected": -178.88314819335938,
"loss": 1.3453,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 12.978363037109375,
"rewards/margins": 3.1743595600128174,
"rewards/rejected": 9.80400276184082,
"step": 280
},
{
"epoch": 0.48,
"grad_norm": 31.15982437133789,
"learning_rate": 9.392454301658734e-06,
"logits/chosen": -1.2521940469741821,
"logits/rejected": -1.2475926876068115,
"logps/chosen": -150.509765625,
"logps/rejected": -201.6109161376953,
"loss": 1.3024,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 13.243515014648438,
"rewards/margins": 3.826166868209839,
"rewards/rejected": 9.417348861694336,
"step": 290
},
{
"epoch": 0.5,
"grad_norm": 21.953458786010742,
"learning_rate": 9.350020848394722e-06,
"logits/chosen": -1.233689308166504,
"logits/rejected": -1.2306249141693115,
"logps/chosen": -142.88519287109375,
"logps/rejected": -176.23739624023438,
"loss": 1.3095,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 12.999292373657227,
"rewards/margins": 3.4122116565704346,
"rewards/rejected": 9.587080955505371,
"step": 300
},
{
"epoch": 0.52,
"grad_norm": 5.4461750984191895,
"learning_rate": 9.306257760849198e-06,
"logits/chosen": -1.243290662765503,
"logits/rejected": -1.238360047340393,
"logps/chosen": -135.43121337890625,
"logps/rejected": -184.3162078857422,
"loss": 1.1376,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 13.632291793823242,
"rewards/margins": 3.4248099327087402,
"rewards/rejected": 10.207484245300293,
"step": 310
},
{
"epoch": 0.53,
"grad_norm": 5.681567668914795,
"learning_rate": 9.261178415717006e-06,
"logits/chosen": -1.2789077758789062,
"logits/rejected": -1.2725191116333008,
"logps/chosen": -145.27374267578125,
"logps/rejected": -171.4091339111328,
"loss": 1.1728,
"rewards/accuracies": 0.875,
"rewards/chosen": 13.61553955078125,
"rewards/margins": 3.5283050537109375,
"rewards/rejected": 10.087234497070312,
"step": 320
},
{
"epoch": 0.55,
"grad_norm": 10.140364646911621,
"learning_rate": 9.214796592022378e-06,
"logits/chosen": -1.322906732559204,
"logits/rejected": -1.308996558189392,
"logps/chosen": -141.2882537841797,
"logps/rejected": -186.32119750976562,
"loss": 1.1413,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 13.753789901733398,
"rewards/margins": 3.8648266792297363,
"rewards/rejected": 9.88896369934082,
"step": 330
},
{
"epoch": 0.57,
"grad_norm": 29.466781616210938,
"learning_rate": 9.167126466907215e-06,
"logits/chosen": -1.333762288093567,
"logits/rejected": -1.33591628074646,
"logps/chosen": -135.62503051757812,
"logps/rejected": -185.78758239746094,
"loss": 1.2402,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 13.778650283813477,
"rewards/margins": 3.8923568725585938,
"rewards/rejected": 9.886293411254883,
"step": 340
},
{
"epoch": 0.58,
"grad_norm": 24.269454956054688,
"learning_rate": 9.118182611297665e-06,
"logits/chosen": -1.309342622756958,
"logits/rejected": -1.3070811033248901,
"logps/chosen": -136.98138427734375,
"logps/rejected": -191.55300903320312,
"loss": 1.327,
"rewards/accuracies": 0.8125,
"rewards/chosen": 13.252286911010742,
"rewards/margins": 3.4166321754455566,
"rewards/rejected": 9.835655212402344,
"step": 350
},
{
"epoch": 0.6,
"grad_norm": 45.82386779785156,
"learning_rate": 9.067979985450377e-06,
"logits/chosen": -1.3228009939193726,
"logits/rejected": -1.3171498775482178,
"logps/chosen": -146.18157958984375,
"logps/rejected": -188.90127563476562,
"loss": 1.1041,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 13.495672225952148,
"rewards/margins": 3.9597671031951904,
"rewards/rejected": 9.535904884338379,
"step": 360
},
{
"epoch": 0.62,
"grad_norm": 30.16645622253418,
"learning_rate": 9.016533934379697e-06,
"logits/chosen": -1.33194100856781,
"logits/rejected": -1.3219845294952393,
"logps/chosen": -138.00164794921875,
"logps/rejected": -186.89862060546875,
"loss": 1.0528,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 13.405718803405762,
"rewards/margins": 4.230135440826416,
"rewards/rejected": 9.17558479309082,
"step": 370
},
{
"epoch": 0.63,
"grad_norm": 23.7780704498291,
"learning_rate": 8.96386018316731e-06,
"logits/chosen": -1.332335114479065,
"logits/rejected": -1.3253917694091797,
"logps/chosen": -135.97877502441406,
"logps/rejected": -190.83480834960938,
"loss": 1.0856,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 14.198583602905273,
"rewards/margins": 4.640327453613281,
"rewards/rejected": 9.558258056640625,
"step": 380
},
{
"epoch": 0.65,
"grad_norm": 37.86054229736328,
"learning_rate": 8.909974832155667e-06,
"logits/chosen": -1.3102099895477295,
"logits/rejected": -1.3026459217071533,
"logps/chosen": -145.63626098632812,
"logps/rejected": -176.83547973632812,
"loss": 1.2607,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 13.206341743469238,
"rewards/margins": 3.159071683883667,
"rewards/rejected": 10.047269821166992,
"step": 390
},
{
"epoch": 0.67,
"grad_norm": 21.949071884155273,
"learning_rate": 8.854894352026746e-06,
"logits/chosen": -1.2982990741729736,
"logits/rejected": -1.294737696647644,
"logps/chosen": -135.0550537109375,
"logps/rejected": -179.46371459960938,
"loss": 1.1821,
"rewards/accuracies": 0.875,
"rewards/chosen": 13.508148193359375,
"rewards/margins": 3.7253730297088623,
"rewards/rejected": 9.78277587890625,
"step": 400
},
{
"epoch": 0.67,
"eval_logits/chosen": -1.2718416452407837,
"eval_logits/rejected": -1.2731894254684448,
"eval_logps/chosen": -142.27450561523438,
"eval_logps/rejected": -166.72564697265625,
"eval_loss": 1.9728976488113403,
"eval_rewards/accuracies": 0.611347496509552,
"eval_rewards/chosen": 9.937856674194336,
"eval_rewards/margins": 0.7354397177696228,
"eval_rewards/rejected": 9.20241641998291,
"eval_runtime": 280.6001,
"eval_samples_per_second": 2.512,
"eval_steps_per_second": 2.512,
"step": 400
},
{
"epoch": 0.68,
"grad_norm": 16.105052947998047,
"learning_rate": 8.798635578767584e-06,
"logits/chosen": -1.2835286855697632,
"logits/rejected": -1.2852187156677246,
"logps/chosen": -128.66891479492188,
"logps/rejected": -180.98855590820312,
"loss": 1.2384,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 13.414710998535156,
"rewards/margins": 3.5980567932128906,
"rewards/rejected": 9.81665325164795,
"step": 410
},
{
"epoch": 0.7,
"grad_norm": 18.655576705932617,
"learning_rate": 8.74121570852417e-06,
"logits/chosen": -1.304517388343811,
"logits/rejected": -1.295888900756836,
"logps/chosen": -129.7939910888672,
"logps/rejected": -182.28158569335938,
"loss": 1.1389,
"rewards/accuracies": 0.875,
"rewards/chosen": 13.784006118774414,
"rewards/margins": 4.219157695770264,
"rewards/rejected": 9.564847946166992,
"step": 420
},
{
"epoch": 0.72,
"grad_norm": 21.71282386779785,
"learning_rate": 8.682652292345239e-06,
"logits/chosen": -1.2803471088409424,
"logits/rejected": -1.281240701675415,
"logps/chosen": -126.82594299316406,
"logps/rejected": -187.58602905273438,
"loss": 1.0862,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 13.763422966003418,
"rewards/margins": 4.294766426086426,
"rewards/rejected": 9.468656539916992,
"step": 430
},
{
"epoch": 0.73,
"grad_norm": 23.261680603027344,
"learning_rate": 8.622963230817599e-06,
"logits/chosen": -1.3054393529891968,
"logits/rejected": -1.294926404953003,
"logps/chosen": -137.8651580810547,
"logps/rejected": -187.91754150390625,
"loss": 1.0189,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 13.93773078918457,
"rewards/margins": 4.54481840133667,
"rewards/rejected": 9.392911911010742,
"step": 440
},
{
"epoch": 0.75,
"grad_norm": 22.61610221862793,
"learning_rate": 8.562166768594592e-06,
"logits/chosen": -1.3260384798049927,
"logits/rejected": -1.313039779663086,
"logps/chosen": -136.37155151367188,
"logps/rejected": -189.9808807373047,
"loss": 1.0986,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 14.760931015014648,
"rewards/margins": 5.300175666809082,
"rewards/rejected": 9.460756301879883,
"step": 450
},
{
"epoch": 0.77,
"grad_norm": 44.37038803100586,
"learning_rate": 8.500281488819426e-06,
"logits/chosen": -1.335376501083374,
"logits/rejected": -1.32558012008667,
"logps/chosen": -137.15939331054688,
"logps/rejected": -178.85699462890625,
"loss": 1.25,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 14.302162170410156,
"rewards/margins": 4.735361099243164,
"rewards/rejected": 9.566801071166992,
"step": 460
},
{
"epoch": 0.78,
"grad_norm": 15.156902313232422,
"learning_rate": 8.43732630744501e-06,
"logits/chosen": -1.3467152118682861,
"logits/rejected": -1.3411352634429932,
"logps/chosen": -131.86680603027344,
"logps/rejected": -184.54978942871094,
"loss": 1.067,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 14.075284004211426,
"rewards/margins": 4.149069309234619,
"rewards/rejected": 9.926214218139648,
"step": 470
},
{
"epoch": 0.8,
"grad_norm": 8.137179374694824,
"learning_rate": 8.373320467452069e-06,
"logits/chosen": -1.3710681200027466,
"logits/rejected": -1.360769271850586,
"logps/chosen": -129.4867706298828,
"logps/rejected": -187.3372802734375,
"loss": 1.0214,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 14.554537773132324,
"rewards/margins": 5.148342132568359,
"rewards/rejected": 9.406194686889648,
"step": 480
},
{
"epoch": 0.82,
"grad_norm": 5.059852600097656,
"learning_rate": 8.308283532967311e-06,
"logits/chosen": -1.3810697793960571,
"logits/rejected": -1.372183918952942,
"logps/chosen": -128.07522583007812,
"logps/rejected": -187.84347534179688,
"loss": 1.0341,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 14.386204719543457,
"rewards/margins": 4.947279930114746,
"rewards/rejected": 9.438923835754395,
"step": 490
},
{
"epoch": 0.83,
"grad_norm": 15.475478172302246,
"learning_rate": 8.242235383283433e-06,
"logits/chosen": -1.3918366432189941,
"logits/rejected": -1.379677176475525,
"logps/chosen": -133.6950225830078,
"logps/rejected": -194.3822479248047,
"loss": 0.9832,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 15.278470039367676,
"rewards/margins": 5.975428104400635,
"rewards/rejected": 9.303044319152832,
"step": 500
},
{
"epoch": 0.85,
"grad_norm": 17.410987854003906,
"learning_rate": 8.175196206782765e-06,
"logits/chosen": -1.4215304851531982,
"logits/rejected": -1.4245882034301758,
"logps/chosen": -135.43157958984375,
"logps/rejected": -195.17837524414062,
"loss": 1.1563,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 13.64989948272705,
"rewards/margins": 4.778893947601318,
"rewards/rejected": 8.87100601196289,
"step": 510
},
{
"epoch": 0.87,
"grad_norm": 10.288193702697754,
"learning_rate": 8.107186494766475e-06,
"logits/chosen": -1.3871448040008545,
"logits/rejected": -1.3849140405654907,
"logps/chosen": -138.03524780273438,
"logps/rejected": -192.06105041503906,
"loss": 1.0753,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 14.112817764282227,
"rewards/margins": 5.05327844619751,
"rewards/rejected": 9.059538841247559,
"step": 520
},
{
"epoch": 0.88,
"grad_norm": 9.09673023223877,
"learning_rate": 8.038227035191152e-06,
"logits/chosen": -1.369144320487976,
"logits/rejected": -1.3611090183258057,
"logps/chosen": -129.99301147460938,
"logps/rejected": -184.04013061523438,
"loss": 0.9557,
"rewards/accuracies": 0.9375,
"rewards/chosen": 14.619100570678711,
"rewards/margins": 5.349046230316162,
"rewards/rejected": 9.270054817199707,
"step": 530
},
{
"epoch": 0.9,
"grad_norm": 10.032170295715332,
"learning_rate": 7.968338906314739e-06,
"logits/chosen": -1.384235143661499,
"logits/rejected": -1.3696062564849854,
"logps/chosen": -135.14321899414062,
"logps/rejected": -188.68896484375,
"loss": 0.9715,
"rewards/accuracies": 0.9375,
"rewards/chosen": 15.478785514831543,
"rewards/margins": 6.076346397399902,
"rewards/rejected": 9.40243911743164,
"step": 540
},
{
"epoch": 0.92,
"grad_norm": 19.764158248901367,
"learning_rate": 7.897543470253708e-06,
"logits/chosen": -1.4087002277374268,
"logits/rejected": -1.4047653675079346,
"logps/chosen": -123.32807922363281,
"logps/rejected": -196.0301971435547,
"loss": 0.987,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 14.471944808959961,
"rewards/margins": 5.295290470123291,
"rewards/rejected": 9.176654815673828,
"step": 550
},
{
"epoch": 0.93,
"grad_norm": 40.5106315612793,
"learning_rate": 7.825862366453487e-06,
"logits/chosen": -1.4345886707305908,
"logits/rejected": -1.4347158670425415,
"logps/chosen": -123.09619140625,
"logps/rejected": -191.9667205810547,
"loss": 1.0165,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 14.416943550109863,
"rewards/margins": 5.778631210327148,
"rewards/rejected": 8.638311386108398,
"step": 560
},
{
"epoch": 0.95,
"grad_norm": 11.272564888000488,
"learning_rate": 7.753317505074114e-06,
"logits/chosen": -1.3969998359680176,
"logits/rejected": -1.3935317993164062,
"logps/chosen": -136.36032104492188,
"logps/rejected": -193.97418212890625,
"loss": 1.088,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 14.054471969604492,
"rewards/margins": 5.512935161590576,
"rewards/rejected": 8.541539192199707,
"step": 570
},
{
"epoch": 0.97,
"grad_norm": 32.77339172363281,
"learning_rate": 7.679931060293137e-06,
"logits/chosen": -1.3968368768692017,
"logits/rejected": -1.3951635360717773,
"logps/chosen": -127.2842025756836,
"logps/rejected": -192.80587768554688,
"loss": 1.0683,
"rewards/accuracies": 0.875,
"rewards/chosen": 14.630516052246094,
"rewards/margins": 5.419614315032959,
"rewards/rejected": 9.210902214050293,
"step": 580
},
{
"epoch": 0.98,
"grad_norm": 9.357986450195312,
"learning_rate": 7.605725463527825e-06,
"logits/chosen": -1.3885825872421265,
"logits/rejected": -1.3792840242385864,
"logps/chosen": -129.8568115234375,
"logps/rejected": -175.72964477539062,
"loss": 1.0435,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 14.813947677612305,
"rewards/margins": 5.173638343811035,
"rewards/rejected": 9.640308380126953,
"step": 590
},
{
"epoch": 1.0,
"grad_norm": 14.639739036560059,
"learning_rate": 7.530723396578745e-06,
"logits/chosen": -1.3973591327667236,
"logits/rejected": -1.3902390003204346,
"logps/chosen": -123.96485900878906,
"logps/rejected": -186.21829223632812,
"loss": 0.9116,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 15.26634693145752,
"rewards/margins": 6.136044025421143,
"rewards/rejected": 9.130304336547852,
"step": 600
},
{
"epoch": 1.0,
"eval_logits/chosen": -1.351010799407959,
"eval_logits/rejected": -1.3527151346206665,
"eval_logps/chosen": -143.65618896484375,
"eval_logps/rejected": -169.28347778320312,
"eval_loss": 1.9454625844955444,
"eval_rewards/accuracies": 0.6482269763946533,
"eval_rewards/chosen": 9.799687385559082,
"eval_rewards/margins": 0.8530532121658325,
"eval_rewards/rejected": 8.946634292602539,
"eval_runtime": 280.7129,
"eval_samples_per_second": 2.511,
"eval_steps_per_second": 2.511,
"step": 600
},
{
"epoch": 1.02,
"grad_norm": 4.632735729217529,
"learning_rate": 7.454947784696804e-06,
"logits/chosen": -1.4173977375030518,
"logits/rejected": -1.4061188697814941,
"logps/chosen": -127.29417419433594,
"logps/rejected": -196.90383911132812,
"loss": 0.8744,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 14.873614311218262,
"rewards/margins": 6.611442565917969,
"rewards/rejected": 8.262171745300293,
"step": 610
},
{
"epoch": 1.03,
"grad_norm": 9.438977241516113,
"learning_rate": 7.3784217895758804e-06,
"logits/chosen": -1.4528166055679321,
"logits/rejected": -1.4417811632156372,
"logps/chosen": -134.85516357421875,
"logps/rejected": -205.706298828125,
"loss": 0.8906,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 15.327418327331543,
"rewards/margins": 6.7848310470581055,
"rewards/rejected": 8.542585372924805,
"step": 620
},
{
"epoch": 1.05,
"grad_norm": 8.289262771606445,
"learning_rate": 7.3011688022731865e-06,
"logits/chosen": -1.4324336051940918,
"logits/rejected": -1.420256495475769,
"logps/chosen": -129.654052734375,
"logps/rejected": -184.8654327392578,
"loss": 0.8893,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 14.612588882446289,
"rewards/margins": 5.854708671569824,
"rewards/rejected": 8.757880210876465,
"step": 630
},
{
"epoch": 1.07,
"grad_norm": 8.663153648376465,
"learning_rate": 7.2232124360595205e-06,
"logits/chosen": -1.455397605895996,
"logits/rejected": -1.4463832378387451,
"logps/chosen": -127.48155212402344,
"logps/rejected": -205.6735382080078,
"loss": 0.7873,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.274075508117676,
"rewards/margins": 6.838622093200684,
"rewards/rejected": 8.435453414916992,
"step": 640
},
{
"epoch": 1.08,
"grad_norm": 6.434426307678223,
"learning_rate": 7.144576519201595e-06,
"logits/chosen": -1.4524507522583008,
"logits/rejected": -1.437585711479187,
"logps/chosen": -122.5464859008789,
"logps/rejected": -193.6874542236328,
"loss": 0.8148,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 15.566122055053711,
"rewards/margins": 7.372010231018066,
"rewards/rejected": 8.194112777709961,
"step": 650
},
{
"epoch": 1.1,
"grad_norm": 4.037100791931152,
"learning_rate": 7.0652850876786485e-06,
"logits/chosen": -1.4568301439285278,
"logits/rejected": -1.4512048959732056,
"logps/chosen": -109.03157043457031,
"logps/rejected": -203.418212890625,
"loss": 0.7832,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 15.715448379516602,
"rewards/margins": 7.003268241882324,
"rewards/rejected": 8.712181091308594,
"step": 660
},
{
"epoch": 1.12,
"grad_norm": 19.28900909423828,
"learning_rate": 6.9853623778355805e-06,
"logits/chosen": -1.4528229236602783,
"logits/rejected": -1.441767692565918,
"logps/chosen": -124.41324615478516,
"logps/rejected": -183.64224243164062,
"loss": 0.8512,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 15.778039932250977,
"rewards/margins": 6.944033622741699,
"rewards/rejected": 8.834006309509277,
"step": 670
},
{
"epoch": 1.13,
"grad_norm": 5.148800373077393,
"learning_rate": 6.904832818974818e-06,
"logits/chosen": -1.461948037147522,
"logits/rejected": -1.4578710794448853,
"logps/chosen": -112.7221450805664,
"logps/rejected": -195.2073211669922,
"loss": 0.8077,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 15.100164413452148,
"rewards/margins": 6.574339866638184,
"rewards/rejected": 8.525824546813965,
"step": 680
},
{
"epoch": 1.15,
"grad_norm": 8.881467819213867,
"learning_rate": 6.823721025889227e-06,
"logits/chosen": -1.4419883489608765,
"logits/rejected": -1.4434632062911987,
"logps/chosen": -120.42142486572266,
"logps/rejected": -187.1098175048828,
"loss": 0.8814,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 15.091572761535645,
"rewards/margins": 6.394684791564941,
"rewards/rejected": 8.696887016296387,
"step": 690
},
{
"epoch": 1.17,
"grad_norm": 4.702162265777588,
"learning_rate": 6.742051791338305e-06,
"logits/chosen": -1.4430066347122192,
"logits/rejected": -1.4336270093917847,
"logps/chosen": -124.91837310791016,
"logps/rejected": -190.82229614257812,
"loss": 0.8541,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 16.073802947998047,
"rewards/margins": 7.100152015686035,
"rewards/rejected": 8.973649978637695,
"step": 700
},
{
"epoch": 1.18,
"grad_norm": 16.544490814208984,
"learning_rate": 6.6598500784700016e-06,
"logits/chosen": -1.437310814857483,
"logits/rejected": -1.439613938331604,
"logps/chosen": -116.21219635009766,
"logps/rejected": -190.1880340576172,
"loss": 0.8385,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 14.853405952453613,
"rewards/margins": 6.005455493927002,
"rewards/rejected": 8.84795093536377,
"step": 710
},
{
"epoch": 1.2,
"grad_norm": 4.642127513885498,
"learning_rate": 6.577141013190428e-06,
"logits/chosen": -1.474867582321167,
"logits/rejected": -1.4651817083358765,
"logps/chosen": -114.55826568603516,
"logps/rejected": -191.57504272460938,
"loss": 0.7852,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 15.868762016296387,
"rewards/margins": 7.344477653503418,
"rewards/rejected": 8.524284362792969,
"step": 720
},
{
"epoch": 1.22,
"grad_norm": 5.04547119140625,
"learning_rate": 6.493949876483841e-06,
"logits/chosen": -1.4716556072235107,
"logits/rejected": -1.4789526462554932,
"logps/chosen": -114.3637466430664,
"logps/rejected": -195.8820037841797,
"loss": 0.8683,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 15.210566520690918,
"rewards/margins": 7.015016078948975,
"rewards/rejected": 8.195551872253418,
"step": 730
},
{
"epoch": 1.23,
"grad_norm": 8.906220436096191,
"learning_rate": 6.410302096685219e-06,
"logits/chosen": -1.467878818511963,
"logits/rejected": -1.4584242105484009,
"logps/chosen": -113.1087875366211,
"logps/rejected": -196.9852752685547,
"loss": 0.7898,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 15.73388671875,
"rewards/margins": 7.293121337890625,
"rewards/rejected": 8.440766334533691,
"step": 740
},
{
"epoch": 1.25,
"grad_norm": 4.600486755371094,
"learning_rate": 6.326223241707787e-06,
"logits/chosen": -1.4887049198150635,
"logits/rejected": -1.4763177633285522,
"logps/chosen": -114.50920104980469,
"logps/rejected": -191.11123657226562,
"loss": 0.7898,
"rewards/accuracies": 1.0,
"rewards/chosen": 16.316104888916016,
"rewards/margins": 7.5335493087768555,
"rewards/rejected": 8.782556533813477,
"step": 750
},
{
"epoch": 1.27,
"grad_norm": 28.16501235961914,
"learning_rate": 6.241739011227899e-06,
"logits/chosen": -1.4948675632476807,
"logits/rejected": -1.4884848594665527,
"logps/chosen": -110.34342956542969,
"logps/rejected": -183.22744750976562,
"loss": 0.8029,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 15.722465515136719,
"rewards/margins": 7.071497917175293,
"rewards/rejected": 8.650967597961426,
"step": 760
},
{
"epoch": 1.28,
"grad_norm": 8.12157917022705,
"learning_rate": 6.156875228829627e-06,
"logits/chosen": -1.4990284442901611,
"logits/rejected": -1.502752661705017,
"logps/chosen": -124.48286437988281,
"logps/rejected": -205.2565155029297,
"loss": 0.9354,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 15.559919357299805,
"rewards/margins": 7.290531158447266,
"rewards/rejected": 8.269388198852539,
"step": 770
},
{
"epoch": 1.3,
"grad_norm": 5.052426338195801,
"learning_rate": 6.071657834111483e-06,
"logits/chosen": -1.4951364994049072,
"logits/rejected": -1.4810194969177246,
"logps/chosen": -115.41800689697266,
"logps/rejected": -189.0443878173828,
"loss": 0.8551,
"rewards/accuracies": 0.9375,
"rewards/chosen": 16.50579833984375,
"rewards/margins": 7.6818413734436035,
"rewards/rejected": 8.823959350585938,
"step": 780
},
{
"epoch": 1.32,
"grad_norm": 20.577198028564453,
"learning_rate": 5.986112874757688e-06,
"logits/chosen": -1.4854376316070557,
"logits/rejected": -1.4871946573257446,
"logps/chosen": -112.67398834228516,
"logps/rejected": -196.98049926757812,
"loss": 0.8269,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 16.0164852142334,
"rewards/margins": 7.493149757385254,
"rewards/rejected": 8.523335456848145,
"step": 790
},
{
"epoch": 1.33,
"grad_norm": 14.56539249420166,
"learning_rate": 5.900266498576383e-06,
"logits/chosen": -1.5002418756484985,
"logits/rejected": -1.4961646795272827,
"logps/chosen": -119.12144470214844,
"logps/rejected": -195.09054565429688,
"loss": 0.8412,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 15.817484855651855,
"rewards/margins": 7.278079986572266,
"rewards/rejected": 8.539405822753906,
"step": 800
},
{
"epoch": 1.33,
"eval_logits/chosen": -1.417891502380371,
"eval_logits/rejected": -1.4206050634384155,
"eval_logps/chosen": -146.20431518554688,
"eval_logps/rejected": -173.58306884765625,
"eval_loss": 2.0040743350982666,
"eval_rewards/accuracies": 0.6397163271903992,
"eval_rewards/chosen": 9.544875144958496,
"eval_rewards/margins": 1.0281997919082642,
"eval_rewards/rejected": 8.516674995422363,
"eval_runtime": 280.445,
"eval_samples_per_second": 2.514,
"eval_steps_per_second": 2.514,
"step": 800
},
{
"epoch": 1.35,
"grad_norm": 8.800375938415527,
"learning_rate": 5.81414494550726e-06,
"logits/chosen": -1.4927313327789307,
"logits/rejected": -1.486893653869629,
"logps/chosen": -108.54096984863281,
"logps/rejected": -194.3487548828125,
"loss": 0.7899,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 16.199222564697266,
"rewards/margins": 7.905210018157959,
"rewards/rejected": 8.294013977050781,
"step": 810
},
{
"epoch": 1.37,
"grad_norm": 12.154976844787598,
"learning_rate": 5.727774539601015e-06,
"logits/chosen": -1.4953235387802124,
"logits/rejected": -1.49058198928833,
"logps/chosen": -105.4122543334961,
"logps/rejected": -205.03518676757812,
"loss": 0.7704,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 15.790075302124023,
"rewards/margins": 7.537489414215088,
"rewards/rejected": 8.252584457397461,
"step": 820
},
{
"epoch": 1.38,
"grad_norm": 18.195478439331055,
"learning_rate": 5.641181680973094e-06,
"logits/chosen": -1.5069096088409424,
"logits/rejected": -1.5045548677444458,
"logps/chosen": -107.76765441894531,
"logps/rejected": -185.20993041992188,
"loss": 0.7649,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 15.898587226867676,
"rewards/margins": 7.165400505065918,
"rewards/rejected": 8.733189582824707,
"step": 830
},
{
"epoch": 1.4,
"grad_norm": 6.522106647491455,
"learning_rate": 5.554392837734201e-06,
"logits/chosen": -1.4592971801757812,
"logits/rejected": -1.4653236865997314,
"logps/chosen": -126.52349853515625,
"logps/rejected": -200.0361328125,
"loss": 0.9745,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 14.701069831848145,
"rewards/margins": 6.266867160797119,
"rewards/rejected": 8.434202194213867,
"step": 840
},
{
"epoch": 1.42,
"grad_norm": 4.361470699310303,
"learning_rate": 5.467434537900002e-06,
"logits/chosen": -1.4889204502105713,
"logits/rejected": -1.4831851720809937,
"logps/chosen": -117.97469329833984,
"logps/rejected": -199.29603576660156,
"loss": 0.8372,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 15.988435745239258,
"rewards/margins": 7.939582824707031,
"rewards/rejected": 8.048852920532227,
"step": 850
},
{
"epoch": 1.43,
"grad_norm": 30.419536590576172,
"learning_rate": 5.380333361282537e-06,
"logits/chosen": -1.4820563793182373,
"logits/rejected": -1.4779971837997437,
"logps/chosen": -115.8757553100586,
"logps/rejected": -182.51400756835938,
"loss": 0.9258,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 15.708460807800293,
"rewards/margins": 6.669085502624512,
"rewards/rejected": 9.039376258850098,
"step": 860
},
{
"epoch": 1.45,
"grad_norm": 9.19847583770752,
"learning_rate": 5.293115931365793e-06,
"logits/chosen": -1.5117051601409912,
"logits/rejected": -1.5152397155761719,
"logps/chosen": -111.7296371459961,
"logps/rejected": -208.90414428710938,
"loss": 0.8764,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 16.2409725189209,
"rewards/margins": 8.053875923156738,
"rewards/rejected": 8.187097549438477,
"step": 870
},
{
"epoch": 1.47,
"grad_norm": 5.0174384117126465,
"learning_rate": 5.20580890716792e-06,
"logits/chosen": -1.5034881830215454,
"logits/rejected": -1.500460147857666,
"logps/chosen": -108.12443542480469,
"logps/rejected": -194.9343719482422,
"loss": 0.7908,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 16.14035415649414,
"rewards/margins": 7.5548906326293945,
"rewards/rejected": 8.585463523864746,
"step": 880
},
{
"epoch": 1.48,
"grad_norm": 51.05670928955078,
"learning_rate": 5.118438975092605e-06,
"logits/chosen": -1.503549575805664,
"logits/rejected": -1.5023475885391235,
"logps/chosen": -113.84260559082031,
"logps/rejected": -193.15135192871094,
"loss": 0.8541,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 15.795028686523438,
"rewards/margins": 7.298943519592285,
"rewards/rejected": 8.496085166931152,
"step": 890
},
{
"epoch": 1.5,
"grad_norm": 10.384552955627441,
"learning_rate": 5.031032840772048e-06,
"logits/chosen": -1.49759840965271,
"logits/rejected": -1.491120457649231,
"logps/chosen": -110.45916748046875,
"logps/rejected": -189.4463653564453,
"loss": 0.8419,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 16.17422866821289,
"rewards/margins": 7.4651031494140625,
"rewards/rejected": 8.709126472473145,
"step": 900
},
{
"epoch": 1.52,
"grad_norm": 21.21319007873535,
"learning_rate": 4.943617220904091e-06,
"logits/chosen": -1.5416353940963745,
"logits/rejected": -1.5274879932403564,
"logps/chosen": -113.6543197631836,
"logps/rejected": -192.6654815673828,
"loss": 0.8271,
"rewards/accuracies": 0.9375,
"rewards/chosen": 17.103158950805664,
"rewards/margins": 9.091134071350098,
"rewards/rejected": 8.01202392578125,
"step": 910
},
{
"epoch": 1.53,
"grad_norm": 4.474107265472412,
"learning_rate": 4.856218835085946e-06,
"logits/chosen": -1.5196397304534912,
"logits/rejected": -1.5235029458999634,
"logps/chosen": -102.72175598144531,
"logps/rejected": -212.8190155029297,
"loss": 0.7193,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 16.488685607910156,
"rewards/margins": 8.548524856567383,
"rewards/rejected": 7.940161228179932,
"step": 920
},
{
"epoch": 1.55,
"grad_norm": 4.788906097412109,
"learning_rate": 4.768864397647031e-06,
"logits/chosen": -1.5152068138122559,
"logits/rejected": -1.5065648555755615,
"logps/chosen": -101.84674072265625,
"logps/rejected": -219.8702392578125,
"loss": 0.6819,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 17.328767776489258,
"rewards/margins": 10.332796096801758,
"rewards/rejected": 6.995970726013184,
"step": 930
},
{
"epoch": 1.57,
"grad_norm": 5.230409622192383,
"learning_rate": 4.681580609483436e-06,
"logits/chosen": -1.5237815380096436,
"logits/rejected": -1.5128507614135742,
"logps/chosen": -107.20805358886719,
"logps/rejected": -202.7063751220703,
"loss": 0.7475,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 17.05699920654297,
"rewards/margins": 9.087102890014648,
"rewards/rejected": 7.969895362854004,
"step": 940
},
{
"epoch": 1.58,
"grad_norm": 4.067636013031006,
"learning_rate": 4.594394149896481e-06,
"logits/chosen": -1.5245317220687866,
"logits/rejected": -1.5309604406356812,
"logps/chosen": -104.8866958618164,
"logps/rejected": -214.6997528076172,
"loss": 0.7442,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 17.146936416625977,
"rewards/margins": 9.150843620300293,
"rewards/rejected": 7.996092319488525,
"step": 950
},
{
"epoch": 1.6,
"grad_norm": 5.677704811096191,
"learning_rate": 4.507331668437878e-06,
"logits/chosen": -1.524597406387329,
"logits/rejected": -1.5158151388168335,
"logps/chosen": -99.23400115966797,
"logps/rejected": -204.5542755126953,
"loss": 0.7509,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 17.156509399414062,
"rewards/margins": 9.190254211425781,
"rewards/rejected": 7.966255187988281,
"step": 960
},
{
"epoch": 1.62,
"grad_norm": 18.698612213134766,
"learning_rate": 4.42041977676399e-06,
"logits/chosen": -1.5082147121429443,
"logits/rejected": -1.51360285282135,
"logps/chosen": -101.81925964355469,
"logps/rejected": -208.90847778320312,
"loss": 0.7403,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 17.100299835205078,
"rewards/margins": 9.655840873718262,
"rewards/rejected": 7.444457054138184,
"step": 970
},
{
"epoch": 1.63,
"grad_norm": 8.17164421081543,
"learning_rate": 4.333685040501664e-06,
"logits/chosen": -1.5298702716827393,
"logits/rejected": -1.5242555141448975,
"logps/chosen": -103.73741149902344,
"logps/rejected": -200.9259796142578,
"loss": 0.7967,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 16.915372848510742,
"rewards/margins": 8.870743751525879,
"rewards/rejected": 8.04463005065918,
"step": 980
},
{
"epoch": 1.65,
"grad_norm": 11.650801658630371,
"learning_rate": 4.247153971128145e-06,
"logits/chosen": -1.518059492111206,
"logits/rejected": -1.512229323387146,
"logps/chosen": -100.54881286621094,
"logps/rejected": -197.3603515625,
"loss": 0.7513,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 17.221500396728516,
"rewards/margins": 9.602907180786133,
"rewards/rejected": 7.618594169616699,
"step": 990
},
{
"epoch": 1.67,
"grad_norm": 43.56822967529297,
"learning_rate": 4.160853017867531e-06,
"logits/chosen": -1.5179004669189453,
"logits/rejected": -1.5160772800445557,
"logps/chosen": -103.8607177734375,
"logps/rejected": -198.41555786132812,
"loss": 0.7345,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 16.935503005981445,
"rewards/margins": 8.681713104248047,
"rewards/rejected": 8.253790855407715,
"step": 1000
},
{
"epoch": 1.67,
"eval_logits/chosen": -1.4289880990982056,
"eval_logits/rejected": -1.4324686527252197,
"eval_logps/chosen": -150.1593017578125,
"eval_logps/rejected": -177.23570251464844,
"eval_loss": 2.0659372806549072,
"eval_rewards/accuracies": 0.6425532102584839,
"eval_rewards/chosen": 9.149377822875977,
"eval_rewards/margins": 0.9979680180549622,
"eval_rewards/rejected": 8.151410102844238,
"eval_runtime": 280.4165,
"eval_samples_per_second": 2.514,
"eval_steps_per_second": 2.514,
"step": 1000
},
{
"epoch": 1.68,
"grad_norm": 18.621665954589844,
"learning_rate": 4.074808559606264e-06,
"logits/chosen": -1.5129244327545166,
"logits/rejected": -1.5049433708190918,
"logps/chosen": -100.34537506103516,
"logps/rejected": -216.16506958007812,
"loss": 0.6972,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 17.9000301361084,
"rewards/margins": 10.329971313476562,
"rewards/rejected": 7.570061683654785,
"step": 1010
},
{
"epoch": 1.7,
"grad_norm": 38.73393630981445,
"learning_rate": 3.989046896830119e-06,
"logits/chosen": -1.5301315784454346,
"logits/rejected": -1.5370006561279297,
"logps/chosen": -108.65140533447266,
"logps/rejected": -213.85879516601562,
"loss": 0.8484,
"rewards/accuracies": 0.9375,
"rewards/chosen": 16.508888244628906,
"rewards/margins": 9.253557205200195,
"rewards/rejected": 7.255330562591553,
"step": 1020
},
{
"epoch": 1.72,
"grad_norm": 5.319835662841797,
"learning_rate": 3.9035942435851504e-06,
"logits/chosen": -1.5252224206924438,
"logits/rejected": -1.5232148170471191,
"logps/chosen": -102.89615631103516,
"logps/rejected": -206.59750366210938,
"loss": 0.7447,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 17.126842498779297,
"rewards/margins": 9.563276290893555,
"rewards/rejected": 7.563568115234375,
"step": 1030
},
{
"epoch": 1.73,
"grad_norm": 19.18516731262207,
"learning_rate": 3.818476719465073e-06,
"logits/chosen": -1.5133918523788452,
"logits/rejected": -1.503049612045288,
"logps/chosen": -111.7210693359375,
"logps/rejected": -180.65403747558594,
"loss": 0.9205,
"rewards/accuracies": 0.9375,
"rewards/chosen": 15.973672866821289,
"rewards/margins": 7.1840667724609375,
"rewards/rejected": 8.789606094360352,
"step": 1040
},
{
"epoch": 1.75,
"grad_norm": 20.20930290222168,
"learning_rate": 3.7337203416274993e-06,
"logits/chosen": -1.5373504161834717,
"logits/rejected": -1.5284373760223389,
"logps/chosen": -98.73445129394531,
"logps/rejected": -208.073486328125,
"loss": 0.7114,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 17.71356773376465,
"rewards/margins": 10.003168106079102,
"rewards/rejected": 7.710400581359863,
"step": 1050
},
{
"epoch": 1.77,
"grad_norm": 26.072620391845703,
"learning_rate": 3.6493510168414924e-06,
"logits/chosen": -1.5504237413406372,
"logits/rejected": -1.5454633235931396,
"logps/chosen": -104.91600036621094,
"logps/rejected": -212.7382354736328,
"loss": 0.8006,
"rewards/accuracies": 0.9375,
"rewards/chosen": 16.929195404052734,
"rewards/margins": 10.16772174835205,
"rewards/rejected": 6.761473178863525,
"step": 1060
},
{
"epoch": 1.78,
"grad_norm": 4.6664628982543945,
"learning_rate": 3.5653945335688688e-06,
"logits/chosen": -1.520021677017212,
"logits/rejected": -1.5187304019927979,
"logps/chosen": -99.18524932861328,
"logps/rejected": -208.15414428710938,
"loss": 0.7484,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 17.673595428466797,
"rewards/margins": 9.892778396606445,
"rewards/rejected": 7.780816555023193,
"step": 1070
},
{
"epoch": 1.8,
"grad_norm": 17.390003204345703,
"learning_rate": 3.4818765540816505e-06,
"logits/chosen": -1.5037453174591064,
"logits/rejected": -1.4973242282867432,
"logps/chosen": -120.0849609375,
"logps/rejected": -196.7596893310547,
"loss": 0.8644,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 15.961163520812988,
"rewards/margins": 8.226602554321289,
"rewards/rejected": 7.734560489654541,
"step": 1080
},
{
"epoch": 1.82,
"grad_norm": 29.988948822021484,
"learning_rate": 3.398822606618095e-06,
"logits/chosen": -1.507930040359497,
"logits/rejected": -1.5118629932403564,
"logps/chosen": -95.96809387207031,
"logps/rejected": -216.37130737304688,
"loss": 0.6905,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 17.203866958618164,
"rewards/margins": 9.483266830444336,
"rewards/rejected": 7.7205986976623535,
"step": 1090
},
{
"epoch": 1.83,
"grad_norm": 23.51713752746582,
"learning_rate": 3.3162580775796994e-06,
"logits/chosen": -1.4951313734054565,
"logits/rejected": -1.48732590675354,
"logps/chosen": -102.76708984375,
"logps/rejected": -198.36593627929688,
"loss": 0.8403,
"rewards/accuracies": 0.9375,
"rewards/chosen": 15.965538024902344,
"rewards/margins": 7.880660057067871,
"rewards/rejected": 8.084877967834473,
"step": 1100
},
{
"epoch": 1.85,
"grad_norm": 5.080748558044434,
"learning_rate": 3.2342082037715404e-06,
"logits/chosen": -1.4765777587890625,
"logits/rejected": -1.4732264280319214,
"logps/chosen": -101.1002426147461,
"logps/rejected": -198.5289306640625,
"loss": 0.7783,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 16.523101806640625,
"rewards/margins": 8.514945983886719,
"rewards/rejected": 8.008153915405273,
"step": 1110
},
{
"epoch": 1.87,
"grad_norm": 15.195828437805176,
"learning_rate": 3.1526980646883664e-06,
"logits/chosen": -1.49948251247406,
"logits/rejected": -1.49336838722229,
"logps/chosen": -113.28621673583984,
"logps/rejected": -198.71463012695312,
"loss": 0.8398,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 16.502878189086914,
"rewards/margins": 8.838693618774414,
"rewards/rejected": 7.664183139801025,
"step": 1120
},
{
"epoch": 1.88,
"grad_norm": 8.84524154663086,
"learning_rate": 3.071752574848747e-06,
"logits/chosen": -1.5089815855026245,
"logits/rejected": -1.5023810863494873,
"logps/chosen": -109.5863265991211,
"logps/rejected": -200.3076934814453,
"loss": 0.7723,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 16.556867599487305,
"rewards/margins": 8.802556037902832,
"rewards/rejected": 7.754312992095947,
"step": 1130
},
{
"epoch": 1.9,
"grad_norm": 5.309729099273682,
"learning_rate": 2.991396476179671e-06,
"logits/chosen": -1.5207172632217407,
"logits/rejected": -1.5075544118881226,
"logps/chosen": -104.1460189819336,
"logps/rejected": -200.79122924804688,
"loss": 0.7185,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 18.383495330810547,
"rewards/margins": 9.844137191772461,
"rewards/rejected": 8.539360046386719,
"step": 1140
},
{
"epoch": 1.92,
"grad_norm": 17.16521644592285,
"learning_rate": 2.911654330453882e-06,
"logits/chosen": -1.5120269060134888,
"logits/rejected": -1.5061429738998413,
"logps/chosen": -104.67869567871094,
"logps/rejected": -189.43106079101562,
"loss": 0.8524,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 17.262327194213867,
"rewards/margins": 8.857732772827148,
"rewards/rejected": 8.404593467712402,
"step": 1150
},
{
"epoch": 1.93,
"grad_norm": 6.592312812805176,
"learning_rate": 2.8325505117822984e-06,
"logits/chosen": -1.51890230178833,
"logits/rejected": -1.5155048370361328,
"logps/chosen": -101.92094421386719,
"logps/rejected": -207.8151092529297,
"loss": 0.6951,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 16.95406723022461,
"rewards/margins": 9.166045188903809,
"rewards/rejected": 7.788022518157959,
"step": 1160
},
{
"epoch": 1.95,
"grad_norm": 14.916762351989746,
"learning_rate": 2.754109199163771e-06,
"logits/chosen": -1.5025979280471802,
"logits/rejected": -1.4987188577651978,
"logps/chosen": -101.9366226196289,
"logps/rejected": -201.5316162109375,
"loss": 0.7587,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 17.186330795288086,
"rewards/margins": 8.913457870483398,
"rewards/rejected": 8.272873878479004,
"step": 1170
},
{
"epoch": 1.97,
"grad_norm": 39.37679672241211,
"learning_rate": 2.6763543690945004e-06,
"logits/chosen": -1.5020328760147095,
"logits/rejected": -1.4887430667877197,
"logps/chosen": -109.6100845336914,
"logps/rejected": -188.98495483398438,
"loss": 0.913,
"rewards/accuracies": 0.9375,
"rewards/chosen": 17.317249298095703,
"rewards/margins": 8.822264671325684,
"rewards/rejected": 8.49498462677002,
"step": 1180
},
{
"epoch": 1.98,
"grad_norm": 3.733832597732544,
"learning_rate": 2.599309788239339e-06,
"logits/chosen": -1.5142863988876343,
"logits/rejected": -1.5111477375030518,
"logps/chosen": -106.55147552490234,
"logps/rejected": -215.3072967529297,
"loss": 0.7224,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.323034286499023,
"rewards/margins": 9.6803560256958,
"rewards/rejected": 7.642678260803223,
"step": 1190
},
{
"epoch": 2.0,
"grad_norm": 15.505064010620117,
"learning_rate": 2.5229990061672414e-06,
"logits/chosen": -1.5279539823532104,
"logits/rejected": -1.5259180068969727,
"logps/chosen": -97.4522476196289,
"logps/rejected": -206.6697235107422,
"loss": 0.6609,
"rewards/accuracies": 1.0,
"rewards/chosen": 18.010801315307617,
"rewards/margins": 10.6015043258667,
"rewards/rejected": 7.409295558929443,
"step": 1200
},
{
"epoch": 2.0,
"eval_logits/chosen": -1.4321023225784302,
"eval_logits/rejected": -1.4358972311019897,
"eval_logps/chosen": -151.3264923095703,
"eval_logps/rejected": -180.62367248535156,
"eval_loss": 2.032116174697876,
"eval_rewards/accuracies": 0.6680850982666016,
"eval_rewards/chosen": 9.032657623291016,
"eval_rewards/margins": 1.22004234790802,
"eval_rewards/rejected": 7.812614917755127,
"eval_runtime": 280.3663,
"eval_samples_per_second": 2.515,
"eval_steps_per_second": 2.515,
"step": 1200
},
{
"epoch": 2.02,
"grad_norm": 7.859879493713379,
"learning_rate": 2.4474453481530587e-06,
"logits/chosen": -1.5223504304885864,
"logits/rejected": -1.5135400295257568,
"logps/chosen": -93.24683380126953,
"logps/rejected": -219.54849243164062,
"loss": 0.6373,
"rewards/accuracies": 1.0,
"rewards/chosen": 18.513376235961914,
"rewards/margins": 11.950658798217773,
"rewards/rejected": 6.562716484069824,
"step": 1210
},
{
"epoch": 2.03,
"grad_norm": 27.701181411743164,
"learning_rate": 2.3726719080478962e-06,
"logits/chosen": -1.509887933731079,
"logits/rejected": -1.5058742761611938,
"logps/chosen": -103.5932388305664,
"logps/rejected": -203.78501892089844,
"loss": 0.7128,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 17.814315795898438,
"rewards/margins": 9.831649780273438,
"rewards/rejected": 7.982666969299316,
"step": 1220
},
{
"epoch": 2.05,
"grad_norm": 8.064666748046875,
"learning_rate": 2.298701541220218e-06,
"logits/chosen": -1.5236984491348267,
"logits/rejected": -1.5181363821029663,
"logps/chosen": -100.51972961425781,
"logps/rejected": -202.49575805664062,
"loss": 0.6432,
"rewards/accuracies": 1.0,
"rewards/chosen": 18.57242202758789,
"rewards/margins": 11.497468948364258,
"rewards/rejected": 7.074953556060791,
"step": 1230
},
{
"epoch": 2.07,
"grad_norm": 5.775174140930176,
"learning_rate": 2.22555685756983e-06,
"logits/chosen": -1.5132160186767578,
"logits/rejected": -1.503983974456787,
"logps/chosen": -96.10552978515625,
"logps/rejected": -201.612060546875,
"loss": 0.6538,
"rewards/accuracies": 1.0,
"rewards/chosen": 18.1705322265625,
"rewards/margins": 10.621967315673828,
"rewards/rejected": 7.548564910888672,
"step": 1240
},
{
"epoch": 2.08,
"grad_norm": 7.816349983215332,
"learning_rate": 2.153260214616915e-06,
"logits/chosen": -1.5382534265518188,
"logits/rejected": -1.5415149927139282,
"logps/chosen": -95.2313461303711,
"logps/rejected": -223.0685577392578,
"loss": 0.5986,
"rewards/accuracies": 1.0,
"rewards/chosen": 18.566387176513672,
"rewards/margins": 11.582368850708008,
"rewards/rejected": 6.9840192794799805,
"step": 1250
},
{
"epoch": 2.1,
"grad_norm": 5.086969375610352,
"learning_rate": 2.081833710668181e-06,
"logits/chosen": -1.494619607925415,
"logits/rejected": -1.4864647388458252,
"logps/chosen": -93.64717864990234,
"logps/rejected": -195.05477905273438,
"loss": 0.6944,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 17.304344177246094,
"rewards/margins": 10.235904693603516,
"rewards/rejected": 7.068438529968262,
"step": 1260
},
{
"epoch": 2.12,
"grad_norm": 3.984151840209961,
"learning_rate": 2.0112991780622725e-06,
"logits/chosen": -1.4988569021224976,
"logits/rejected": -1.4943821430206299,
"logps/chosen": -102.37433624267578,
"logps/rejected": -204.2311553955078,
"loss": 0.6708,
"rewards/accuracies": 1.0,
"rewards/chosen": 18.241689682006836,
"rewards/margins": 10.953558921813965,
"rewards/rejected": 7.288130283355713,
"step": 1270
},
{
"epoch": 2.13,
"grad_norm": 39.707828521728516,
"learning_rate": 1.9416781764964486e-06,
"logits/chosen": -1.5009758472442627,
"logits/rejected": -1.5007375478744507,
"logps/chosen": -95.72146606445312,
"logps/rejected": -212.9442901611328,
"loss": 0.654,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 17.809816360473633,
"rewards/margins": 10.415254592895508,
"rewards/rejected": 7.394561767578125,
"step": 1280
},
{
"epoch": 2.15,
"grad_norm": 6.5094475746154785,
"learning_rate": 1.8729919864366292e-06,
"logits/chosen": -1.5389680862426758,
"logits/rejected": -1.5245224237442017,
"logps/chosen": -93.90787506103516,
"logps/rejected": -200.7926788330078,
"loss": 0.6446,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 18.495624542236328,
"rewards/margins": 10.695076942443848,
"rewards/rejected": 7.800548553466797,
"step": 1290
},
{
"epoch": 2.17,
"grad_norm": 11.99113655090332,
"learning_rate": 1.8052616026127563e-06,
"logits/chosen": -1.5282859802246094,
"logits/rejected": -1.5253300666809082,
"logps/chosen": -98.3044204711914,
"logps/rejected": -214.7481689453125,
"loss": 0.6627,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 17.510263442993164,
"rewards/margins": 10.662416458129883,
"rewards/rejected": 6.847846984863281,
"step": 1300
},
{
"epoch": 2.18,
"grad_norm": 14.8443021774292,
"learning_rate": 1.7385077276015267e-06,
"logits/chosen": -1.5107916593551636,
"logits/rejected": -1.513163685798645,
"logps/chosen": -92.03627014160156,
"logps/rejected": -213.4009552001953,
"loss": 0.6345,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 18.635957717895508,
"rewards/margins": 11.570890426635742,
"rewards/rejected": 7.065066337585449,
"step": 1310
},
{
"epoch": 2.2,
"grad_norm": 12.072715759277344,
"learning_rate": 1.6727507654983977e-06,
"logits/chosen": -1.5300309658050537,
"logits/rejected": -1.523341417312622,
"logps/chosen": -107.01918029785156,
"logps/rejected": -197.11643981933594,
"loss": 0.7373,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 17.78557014465332,
"rewards/margins": 10.176875114440918,
"rewards/rejected": 7.608694553375244,
"step": 1320
},
{
"epoch": 2.22,
"grad_norm": 8.86849594116211,
"learning_rate": 1.6080108156808439e-06,
"logits/chosen": -1.5328854322433472,
"logits/rejected": -1.5194957256317139,
"logps/chosen": -100.20330810546875,
"logps/rejected": -195.58236694335938,
"loss": 0.6678,
"rewards/accuracies": 1.0,
"rewards/chosen": 18.137895584106445,
"rewards/margins": 10.396581649780273,
"rewards/rejected": 7.7413129806518555,
"step": 1330
},
{
"epoch": 2.23,
"grad_norm": 4.92030143737793,
"learning_rate": 1.5443076666647545e-06,
"logits/chosen": -1.5401328802108765,
"logits/rejected": -1.5302283763885498,
"logps/chosen": -87.4715805053711,
"logps/rejected": -214.174560546875,
"loss": 0.5832,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 19.330814361572266,
"rewards/margins": 11.920788764953613,
"rewards/rejected": 7.410025596618652,
"step": 1340
},
{
"epoch": 2.25,
"grad_norm": 5.7385783195495605,
"learning_rate": 1.4816607900558311e-06,
"logits/chosen": -1.5227100849151611,
"logits/rejected": -1.5200421810150146,
"logps/chosen": -87.50262451171875,
"logps/rejected": -214.90176391601562,
"loss": 0.6009,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 18.722919464111328,
"rewards/margins": 11.610313415527344,
"rewards/rejected": 7.112607002258301,
"step": 1350
},
{
"epoch": 2.27,
"grad_norm": 4.156861305236816,
"learning_rate": 1.4200893345978816e-06,
"logits/chosen": -1.5246403217315674,
"logits/rejected": -1.5162007808685303,
"logps/chosen": -95.39898681640625,
"logps/rejected": -208.87509155273438,
"loss": 0.6347,
"rewards/accuracies": 1.0,
"rewards/chosen": 18.26802635192871,
"rewards/margins": 11.145674705505371,
"rewards/rejected": 7.122353553771973,
"step": 1360
},
{
"epoch": 2.28,
"grad_norm": 3.7285964488983154,
"learning_rate": 1.3596121203197715e-06,
"logits/chosen": -1.5043359994888306,
"logits/rejected": -1.5017638206481934,
"logps/chosen": -101.5743637084961,
"logps/rejected": -205.45144653320312,
"loss": 0.7345,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 17.542848587036133,
"rewards/margins": 10.09937858581543,
"rewards/rejected": 7.443469047546387,
"step": 1370
},
{
"epoch": 2.3,
"grad_norm": 5.035600662231445,
"learning_rate": 1.3002476327828717e-06,
"logits/chosen": -1.5371487140655518,
"logits/rejected": -1.5391342639923096,
"logps/chosen": -102.25202941894531,
"logps/rejected": -217.117919921875,
"loss": 0.6705,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 18.24382972717285,
"rewards/margins": 10.82688045501709,
"rewards/rejected": 7.416950225830078,
"step": 1380
},
{
"epoch": 2.32,
"grad_norm": 10.119084358215332,
"learning_rate": 1.2420140174307267e-06,
"logits/chosen": -1.5054762363433838,
"logits/rejected": -1.5047041177749634,
"logps/chosen": -90.96085357666016,
"logps/rejected": -203.27731323242188,
"loss": 0.6701,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 16.905033111572266,
"rewards/margins": 9.766073226928711,
"rewards/rejected": 7.1389594078063965,
"step": 1390
},
{
"epoch": 2.33,
"grad_norm": 4.378362655639648,
"learning_rate": 1.1849290740426994e-06,
"logits/chosen": -1.52398681640625,
"logits/rejected": -1.5253530740737915,
"logps/chosen": -100.94436645507812,
"logps/rejected": -208.01913452148438,
"loss": 0.6768,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.257160186767578,
"rewards/margins": 10.379103660583496,
"rewards/rejected": 6.878057956695557,
"step": 1400
},
{
"epoch": 2.33,
"eval_logits/chosen": -1.4431970119476318,
"eval_logits/rejected": -1.447227120399475,
"eval_logps/chosen": -150.6456756591797,
"eval_logps/rejected": -179.82106018066406,
"eval_loss": 2.0312814712524414,
"eval_rewards/accuracies": 0.6709219813346863,
"eval_rewards/chosen": 9.100737571716309,
"eval_rewards/margins": 1.207862377166748,
"eval_rewards/rejected": 7.892876148223877,
"eval_runtime": 280.087,
"eval_samples_per_second": 2.517,
"eval_steps_per_second": 2.517,
"step": 1400
},
{
"epoch": 2.35,
"grad_norm": 4.4983320236206055,
"learning_rate": 1.1290102512932482e-06,
"logits/chosen": -1.5456353425979614,
"logits/rejected": -1.5437848567962646,
"logps/chosen": -90.5999984741211,
"logps/rejected": -215.61239624023438,
"loss": 0.6084,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 18.774730682373047,
"rewards/margins": 11.802639961242676,
"rewards/rejected": 6.972087860107422,
"step": 1410
},
{
"epoch": 2.37,
"grad_norm": 8.901042938232422,
"learning_rate": 1.074274641418554e-06,
"logits/chosen": -1.5062071084976196,
"logits/rejected": -1.502605676651001,
"logps/chosen": -93.99595642089844,
"logps/rejected": -201.00819396972656,
"loss": 0.6596,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 17.505390167236328,
"rewards/margins": 10.57873249053955,
"rewards/rejected": 6.926657199859619,
"step": 1420
},
{
"epoch": 2.38,
"grad_norm": 5.139017581939697,
"learning_rate": 1.0207389749920593e-06,
"logits/chosen": -1.509242057800293,
"logits/rejected": -1.511796474456787,
"logps/chosen": -96.8970718383789,
"logps/rejected": -215.5059051513672,
"loss": 0.6782,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.298181533813477,
"rewards/margins": 10.531153678894043,
"rewards/rejected": 6.767026424407959,
"step": 1430
},
{
"epoch": 2.4,
"grad_norm": 4.371284484863281,
"learning_rate": 9.68419615810598e-07,
"logits/chosen": -1.5192222595214844,
"logits/rejected": -1.5184710025787354,
"logps/chosen": -92.4468002319336,
"logps/rejected": -208.1323699951172,
"loss": 0.6454,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.749366760253906,
"rewards/margins": 10.604511260986328,
"rewards/rejected": 7.144855499267578,
"step": 1440
},
{
"epoch": 2.42,
"grad_norm": 3.970381259918213,
"learning_rate": 9.173325558925905e-07,
"logits/chosen": -1.5145829916000366,
"logits/rejected": -1.51982843875885,
"logps/chosen": -90.59550476074219,
"logps/rejected": -224.2189483642578,
"loss": 0.6051,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.675228118896484,
"rewards/margins": 11.378229141235352,
"rewards/rejected": 6.296999931335449,
"step": 1450
},
{
"epoch": 2.43,
"grad_norm": 4.847020626068115,
"learning_rate": 8.674934105899152e-07,
"logits/chosen": -1.4975147247314453,
"logits/rejected": -1.4984054565429688,
"logps/chosen": -95.9659423828125,
"logps/rejected": -205.8800048828125,
"loss": 0.6823,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.054203033447266,
"rewards/margins": 9.49101448059082,
"rewards/rejected": 7.563189506530762,
"step": 1460
},
{
"epoch": 2.45,
"grad_norm": 4.591710090637207,
"learning_rate": 8.189174138148814e-07,
"logits/chosen": -1.5397193431854248,
"logits/rejected": -1.5424444675445557,
"logps/chosen": -83.99203491210938,
"logps/rejected": -215.84378051757812,
"loss": 0.5823,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 18.956871032714844,
"rewards/margins": 11.750204086303711,
"rewards/rejected": 7.206667900085449,
"step": 1470
},
{
"epoch": 2.47,
"grad_norm": 5.379372596740723,
"learning_rate": 7.716194133838135e-07,
"logits/chosen": -1.5043582916259766,
"logits/rejected": -1.507912278175354,
"logps/chosen": -94.89381408691406,
"logps/rejected": -209.4828338623047,
"loss": 0.654,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.687734603881836,
"rewards/margins": 10.847681045532227,
"rewards/rejected": 6.840054988861084,
"step": 1480
},
{
"epoch": 2.48,
"grad_norm": 7.855030536651611,
"learning_rate": 7.256138664786477e-07,
"logits/chosen": -1.5224730968475342,
"logits/rejected": -1.5275871753692627,
"logps/chosen": -83.72583770751953,
"logps/rejected": -211.76937866210938,
"loss": 0.5732,
"rewards/accuracies": 1.0,
"rewards/chosen": 19.119752883911133,
"rewards/margins": 12.638054847717285,
"rewards/rejected": 6.481698513031006,
"step": 1490
},
{
"epoch": 2.5,
"grad_norm": 3.8725578784942627,
"learning_rate": 6.809148352279182e-07,
"logits/chosen": -1.5395710468292236,
"logits/rejected": -1.5471025705337524,
"logps/chosen": -90.97738647460938,
"logps/rejected": -220.6120147705078,
"loss": 0.6148,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 18.08513641357422,
"rewards/margins": 11.457734107971191,
"rewards/rejected": 6.627403259277344,
"step": 1500
},
{
"epoch": 2.52,
"grad_norm": 8.233463287353516,
"learning_rate": 6.375359824085126e-07,
"logits/chosen": -1.5111545324325562,
"logits/rejected": -1.5080124139785767,
"logps/chosen": -100.68155670166016,
"logps/rejected": -211.43820190429688,
"loss": 0.7931,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 17.69210433959961,
"rewards/margins": 10.682944297790527,
"rewards/rejected": 7.009159088134766,
"step": 1510
},
{
"epoch": 2.53,
"grad_norm": 5.737654209136963,
"learning_rate": 5.954905672694805e-07,
"logits/chosen": -1.5383660793304443,
"logits/rejected": -1.5310518741607666,
"logps/chosen": -91.32364654541016,
"logps/rejected": -210.65103149414062,
"loss": 0.5817,
"rewards/accuracies": 1.0,
"rewards/chosen": 19.20272445678711,
"rewards/margins": 11.968230247497559,
"rewards/rejected": 7.23449182510376,
"step": 1520
},
{
"epoch": 2.55,
"grad_norm": 5.602839469909668,
"learning_rate": 5.547914414791922e-07,
"logits/chosen": -1.523057222366333,
"logits/rejected": -1.526564121246338,
"logps/chosen": -92.25067901611328,
"logps/rejected": -210.3281707763672,
"loss": 0.6644,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 17.999813079833984,
"rewards/margins": 10.866498947143555,
"rewards/rejected": 7.133312225341797,
"step": 1530
},
{
"epoch": 2.57,
"grad_norm": 80.35784912109375,
"learning_rate": 5.154510451970762e-07,
"logits/chosen": -1.5100964307785034,
"logits/rejected": -1.5057765245437622,
"logps/chosen": -91.3646469116211,
"logps/rejected": -208.1036834716797,
"loss": 0.698,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 17.777923583984375,
"rewards/margins": 10.551263809204102,
"rewards/rejected": 7.22666072845459,
"step": 1540
},
{
"epoch": 2.58,
"grad_norm": 11.723702430725098,
"learning_rate": 4.774814032711461e-07,
"logits/chosen": -1.5164598226547241,
"logits/rejected": -1.5183497667312622,
"logps/chosen": -99.03367614746094,
"logps/rejected": -213.4344024658203,
"loss": 0.6773,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.00816535949707,
"rewards/margins": 10.20258903503418,
"rewards/rejected": 6.805574893951416,
"step": 1550
},
{
"epoch": 2.6,
"grad_norm": 10.103087425231934,
"learning_rate": 4.4089412156245793e-07,
"logits/chosen": -1.5356425046920776,
"logits/rejected": -1.5423994064331055,
"logps/chosen": -83.90791320800781,
"logps/rejected": -210.4249267578125,
"loss": 0.5745,
"rewards/accuracies": 1.0,
"rewards/chosen": 18.552326202392578,
"rewards/margins": 11.089494705200195,
"rewards/rejected": 7.462831020355225,
"step": 1560
},
{
"epoch": 2.62,
"grad_norm": 8.58123779296875,
"learning_rate": 4.0570038339764803e-07,
"logits/chosen": -1.5257699489593506,
"logits/rejected": -1.521463394165039,
"logps/chosen": -99.25550842285156,
"logps/rejected": -212.4494171142578,
"loss": 0.6679,
"rewards/accuracies": 1.0,
"rewards/chosen": 18.084367752075195,
"rewards/margins": 11.122998237609863,
"rewards/rejected": 6.961369514465332,
"step": 1570
},
{
"epoch": 2.63,
"grad_norm": 5.0036845207214355,
"learning_rate": 3.719109461506215e-07,
"logits/chosen": -1.4980933666229248,
"logits/rejected": -1.5047228336334229,
"logps/chosen": -89.56025695800781,
"logps/rejected": -215.61279296875,
"loss": 0.6389,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 17.403789520263672,
"rewards/margins": 10.387609481811523,
"rewards/rejected": 7.016180515289307,
"step": 1580
},
{
"epoch": 2.65,
"grad_norm": 5.2077412605285645,
"learning_rate": 3.3953613795443376e-07,
"logits/chosen": -1.5230729579925537,
"logits/rejected": -1.5227991342544556,
"logps/chosen": -90.34446716308594,
"logps/rejected": -208.3804931640625,
"loss": 0.7039,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 18.70303726196289,
"rewards/margins": 11.128946304321289,
"rewards/rejected": 7.574089050292969,
"step": 1590
},
{
"epoch": 2.67,
"grad_norm": 5.617701530456543,
"learning_rate": 3.0858585454437927e-07,
"logits/chosen": -1.508111596107483,
"logits/rejected": -1.5131912231445312,
"logps/chosen": -89.39806365966797,
"logps/rejected": -204.76675415039062,
"loss": 0.615,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.604631423950195,
"rewards/margins": 10.715816497802734,
"rewards/rejected": 6.8888139724731445,
"step": 1600
},
{
"epoch": 2.67,
"eval_logits/chosen": -1.4370256662368774,
"eval_logits/rejected": -1.4413024187088013,
"eval_logps/chosen": -150.68116760253906,
"eval_logps/rejected": -179.16799926757812,
"eval_loss": 2.051481008529663,
"eval_rewards/accuracies": 0.6624113321304321,
"eval_rewards/chosen": 9.097188949584961,
"eval_rewards/margins": 1.1390060186386108,
"eval_rewards/rejected": 7.958182334899902,
"eval_runtime": 280.5593,
"eval_samples_per_second": 2.513,
"eval_steps_per_second": 2.513,
"step": 1600
},
{
"epoch": 2.68,
"grad_norm": 18.668004989624023,
"learning_rate": 2.7906955623324074e-07,
"logits/chosen": -1.519481897354126,
"logits/rejected": -1.5189173221588135,
"logps/chosen": -93.53435516357422,
"logps/rejected": -198.51348876953125,
"loss": 0.6712,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 17.499284744262695,
"rewards/margins": 10.426948547363281,
"rewards/rejected": 7.072335720062256,
"step": 1610
},
{
"epoch": 2.7,
"grad_norm": 4.54674768447876,
"learning_rate": 2.509962650196407e-07,
"logits/chosen": -1.5232311487197876,
"logits/rejected": -1.5235909223556519,
"logps/chosen": -89.19126892089844,
"logps/rejected": -211.9619140625,
"loss": 0.6203,
"rewards/accuracies": 1.0,
"rewards/chosen": 18.46454620361328,
"rewards/margins": 11.837278366088867,
"rewards/rejected": 6.627265930175781,
"step": 1620
},
{
"epoch": 2.72,
"grad_norm": 13.877330780029297,
"learning_rate": 2.2437456183035833e-07,
"logits/chosen": -1.5302555561065674,
"logits/rejected": -1.5268919467926025,
"logps/chosen": -90.18998718261719,
"logps/rejected": -205.0181121826172,
"loss": 0.6213,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 18.42779541015625,
"rewards/margins": 11.21821403503418,
"rewards/rejected": 7.209580898284912,
"step": 1630
},
{
"epoch": 2.73,
"grad_norm": 7.050842761993408,
"learning_rate": 1.99212583897474e-07,
"logits/chosen": -1.5237982273101807,
"logits/rejected": -1.5170161724090576,
"logps/chosen": -96.49137115478516,
"logps/rejected": -208.6808319091797,
"loss": 0.6311,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.960681915283203,
"rewards/margins": 11.038247108459473,
"rewards/rejected": 6.9224348068237305,
"step": 1640
},
{
"epoch": 2.75,
"grad_norm": 9.275837898254395,
"learning_rate": 1.7551802227112558e-07,
"logits/chosen": -1.5040963888168335,
"logits/rejected": -1.5078271627426147,
"logps/chosen": -91.1369857788086,
"logps/rejected": -209.1532745361328,
"loss": 0.622,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.7973690032959,
"rewards/margins": 10.730772972106934,
"rewards/rejected": 7.066598415374756,
"step": 1650
},
{
"epoch": 2.77,
"grad_norm": 5.387999534606934,
"learning_rate": 1.5329811946865392e-07,
"logits/chosen": -1.5108160972595215,
"logits/rejected": -1.5107080936431885,
"logps/chosen": -98.21044921875,
"logps/rejected": -212.44580078125,
"loss": 0.6516,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 17.72305679321289,
"rewards/margins": 10.394246101379395,
"rewards/rejected": 7.328810214996338,
"step": 1660
},
{
"epoch": 2.78,
"grad_norm": 5.4950714111328125,
"learning_rate": 1.3255966726084036e-07,
"logits/chosen": -1.504311442375183,
"logits/rejected": -1.5167784690856934,
"logps/chosen": -87.20539093017578,
"logps/rejected": -200.42111206054688,
"loss": 0.6145,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.13315200805664,
"rewards/margins": 10.226241111755371,
"rewards/rejected": 6.9069085121154785,
"step": 1670
},
{
"epoch": 2.8,
"grad_norm": 6.581058979034424,
"learning_rate": 1.1330900459592564e-07,
"logits/chosen": -1.5215296745300293,
"logits/rejected": -1.5147764682769775,
"logps/chosen": -92.18501281738281,
"logps/rejected": -193.42245483398438,
"loss": 0.6203,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.840869903564453,
"rewards/margins": 10.311830520629883,
"rewards/rejected": 7.529041290283203,
"step": 1680
},
{
"epoch": 2.82,
"grad_norm": 29.634891510009766,
"learning_rate": 9.55520156620332e-08,
"logits/chosen": -1.5149075984954834,
"logits/rejected": -1.5113269090652466,
"logps/chosen": -92.65069580078125,
"logps/rejected": -197.09310913085938,
"loss": 0.6481,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 18.486942291259766,
"rewards/margins": 11.153399467468262,
"rewards/rejected": 7.333543300628662,
"step": 1690
},
{
"epoch": 2.83,
"grad_norm": 7.716902256011963,
"learning_rate": 7.92941280886056e-08,
"logits/chosen": -1.5236696004867554,
"logits/rejected": -1.5350602865219116,
"logps/chosen": -83.11317443847656,
"logps/rejected": -225.55429077148438,
"loss": 0.619,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 18.241207122802734,
"rewards/margins": 11.599997520446777,
"rewards/rejected": 6.641209602355957,
"step": 1700
},
{
"epoch": 2.85,
"grad_norm": 9.752326011657715,
"learning_rate": 6.454031128737881e-08,
"logits/chosen": -1.5312079191207886,
"logits/rejected": -1.5227811336517334,
"logps/chosen": -93.35438537597656,
"logps/rejected": -215.9623260498047,
"loss": 0.6241,
"rewards/accuracies": 1.0,
"rewards/chosen": 19.262916564941406,
"rewards/margins": 12.349664688110352,
"rewards/rejected": 6.9132513999938965,
"step": 1710
},
{
"epoch": 2.87,
"grad_norm": 9.797955513000488,
"learning_rate": 5.129507493343011e-08,
"logits/chosen": -1.511307716369629,
"logits/rejected": -1.5151628255844116,
"logps/chosen": -94.07261657714844,
"logps/rejected": -221.2146453857422,
"loss": 0.6836,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 18.428720474243164,
"rewards/margins": 11.63404369354248,
"rewards/rejected": 6.794674873352051,
"step": 1720
},
{
"epoch": 2.88,
"grad_norm": 4.7570719718933105,
"learning_rate": 3.956246758674065e-08,
"logits/chosen": -1.5266796350479126,
"logits/rejected": -1.5276660919189453,
"logps/chosen": -96.55259704589844,
"logps/rejected": -218.0394744873047,
"loss": 0.6813,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 18.319250106811523,
"rewards/margins": 11.030166625976562,
"rewards/rejected": 7.289083003997803,
"step": 1730
},
{
"epoch": 2.9,
"grad_norm": 5.806319713592529,
"learning_rate": 2.934607545470536e-08,
"logits/chosen": -1.5195215940475464,
"logits/rejected": -1.5297635793685913,
"logps/chosen": -87.46583557128906,
"logps/rejected": -229.15591430664062,
"loss": 0.6653,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 18.350963592529297,
"rewards/margins": 11.915371894836426,
"rewards/rejected": 6.4355902671813965,
"step": 1740
},
{
"epoch": 2.92,
"grad_norm": 4.364658832550049,
"learning_rate": 2.0649021295970906e-08,
"logits/chosen": -1.5325102806091309,
"logits/rejected": -1.5392825603485107,
"logps/chosen": -89.66578674316406,
"logps/rejected": -212.051513671875,
"loss": 0.6212,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 18.12158966064453,
"rewards/margins": 11.411016464233398,
"rewards/rejected": 6.710572719573975,
"step": 1750
},
{
"epoch": 2.93,
"grad_norm": 7.178764343261719,
"learning_rate": 1.3473963465924222e-08,
"logits/chosen": -1.5103847980499268,
"logits/rejected": -1.510608434677124,
"logps/chosen": -90.99683380126953,
"logps/rejected": -202.84072875976562,
"loss": 0.6385,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 17.887941360473633,
"rewards/margins": 11.204258918762207,
"rewards/rejected": 6.683682441711426,
"step": 1760
},
{
"epoch": 2.95,
"grad_norm": 9.613419532775879,
"learning_rate": 7.823095104137479e-09,
"logits/chosen": -1.5148423910140991,
"logits/rejected": -1.5170824527740479,
"logps/chosen": -101.44172668457031,
"logps/rejected": -203.09060668945312,
"loss": 0.7437,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 17.126873016357422,
"rewards/margins": 9.506840705871582,
"rewards/rejected": 7.620034694671631,
"step": 1770
},
{
"epoch": 2.97,
"grad_norm": 3.819849729537964,
"learning_rate": 3.6981434640093183e-09,
"logits/chosen": -1.5141820907592773,
"logits/rejected": -1.5163359642028809,
"logps/chosen": -91.76555633544922,
"logps/rejected": -208.36703491210938,
"loss": 0.6192,
"rewards/accuracies": 1.0,
"rewards/chosen": 18.568470001220703,
"rewards/margins": 11.578906059265137,
"rewards/rejected": 6.989563941955566,
"step": 1780
},
{
"epoch": 2.98,
"grad_norm": 25.042579650878906,
"learning_rate": 1.1003693848093965e-09,
"logits/chosen": -1.5274379253387451,
"logits/rejected": -1.5291308164596558,
"logps/chosen": -102.40494537353516,
"logps/rejected": -220.06063842773438,
"loss": 0.7154,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 17.37801742553711,
"rewards/margins": 10.208986282348633,
"rewards/rejected": 7.169030666351318,
"step": 1790
},
{
"epoch": 3.0,
"step": 1797,
"total_flos": 8.333096122255933e+17,
"train_loss": 0.8950637202828078,
"train_runtime": 14615.0375,
"train_samples_per_second": 0.985,
"train_steps_per_second": 0.123
}
],
"logging_steps": 10,
"max_steps": 1797,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200,
"total_flos": 8.333096122255933e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}