{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997056226081837,
"eval_steps": 500,
"global_step": 1698,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005887547836326171,
"grad_norm": 5408.07341365347,
"learning_rate": 2.941176470588235e-09,
"logits/chosen": 6646.15966796875,
"logits/rejected": 3119.63818359375,
"logps/chosen": -368.2507019042969,
"logps/rejected": -168.0050048828125,
"loss": 515.6205,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.0058875478363261706,
"grad_norm": 4902.986980984481,
"learning_rate": 2.941176470588235e-08,
"logits/chosen": 4868.25439453125,
"logits/rejected": 4348.16943359375,
"logps/chosen": -285.6579895019531,
"logps/rejected": -243.44537353515625,
"loss": 530.529,
"rewards/accuracies": 0.4722222089767456,
"rewards/chosen": 0.015141813084483147,
"rewards/margins": -0.0011655373964458704,
"rewards/rejected": 0.016307353973388672,
"step": 10
},
{
"epoch": 0.011775095672652341,
"grad_norm": 4763.6399184264565,
"learning_rate": 5.88235294117647e-08,
"logits/chosen": 5758.640625,
"logits/rejected": 5330.70556640625,
"logps/chosen": -287.11346435546875,
"logps/rejected": -279.5744323730469,
"loss": 576.8336,
"rewards/accuracies": 0.5083333253860474,
"rewards/chosen": 0.2108406275510788,
"rewards/margins": 0.08644243329763412,
"rewards/rejected": 0.12439820915460587,
"step": 20
},
{
"epoch": 0.01766264350897851,
"grad_norm": 4716.064892413704,
"learning_rate": 8.823529411764706e-08,
"logits/chosen": 6295.79150390625,
"logits/rejected": 5043.77001953125,
"logps/chosen": -334.138916015625,
"logps/rejected": -277.32574462890625,
"loss": 562.3636,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 1.4467318058013916,
"rewards/margins": 0.04832734167575836,
"rewards/rejected": 1.3984044790267944,
"step": 30
},
{
"epoch": 0.023550191345304682,
"grad_norm": 4755.394333837509,
"learning_rate": 1.176470588235294e-07,
"logits/chosen": 5310.7216796875,
"logits/rejected": 4463.6806640625,
"logps/chosen": -257.0677185058594,
"logps/rejected": -237.11923217773438,
"loss": 526.118,
"rewards/accuracies": 0.5416666865348816,
"rewards/chosen": 4.58669376373291,
"rewards/margins": 0.1042458787560463,
"rewards/rejected": 4.482447624206543,
"step": 40
},
{
"epoch": 0.02943773918163085,
"grad_norm": 4087.670323308074,
"learning_rate": 1.4705882352941175e-07,
"logits/chosen": 5372.1640625,
"logits/rejected": 4366.4892578125,
"logps/chosen": -257.9158935546875,
"logps/rejected": -243.09597778320312,
"loss": 525.4981,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 9.666664123535156,
"rewards/margins": 0.8064844012260437,
"rewards/rejected": 8.860177993774414,
"step": 50
},
{
"epoch": 0.03532528701795702,
"grad_norm": 3523.9425676434603,
"learning_rate": 1.764705882352941e-07,
"logits/chosen": 5514.87255859375,
"logits/rejected": 4266.23291015625,
"logps/chosen": -282.6498107910156,
"logps/rejected": -233.466552734375,
"loss": 506.6934,
"rewards/accuracies": 0.5166667103767395,
"rewards/chosen": 14.391616821289062,
"rewards/margins": 0.46752357482910156,
"rewards/rejected": 13.924093246459961,
"step": 60
},
{
"epoch": 0.04121283485428319,
"grad_norm": 2850.4471378562707,
"learning_rate": 2.0588235294117645e-07,
"logits/chosen": 5830.0673828125,
"logits/rejected": 5022.267578125,
"logps/chosen": -273.00885009765625,
"logps/rejected": -273.3937072753906,
"loss": 522.7681,
"rewards/accuracies": 0.5333333611488342,
"rewards/chosen": 19.875988006591797,
"rewards/margins": 2.0218400955200195,
"rewards/rejected": 17.85414695739746,
"step": 70
},
{
"epoch": 0.047100382690609364,
"grad_norm": 3096.4559467407053,
"learning_rate": 2.352941176470588e-07,
"logits/chosen": 5268.9208984375,
"logits/rejected": 4395.24169921875,
"logps/chosen": -250.491455078125,
"logps/rejected": -238.2079620361328,
"loss": 512.6993,
"rewards/accuracies": 0.5916666388511658,
"rewards/chosen": 22.815217971801758,
"rewards/margins": 1.410942554473877,
"rewards/rejected": 21.404273986816406,
"step": 80
},
{
"epoch": 0.05298793052693553,
"grad_norm": 2723.779795541685,
"learning_rate": 2.6470588235294114e-07,
"logits/chosen": 4833.4150390625,
"logits/rejected": 4357.6396484375,
"logps/chosen": -252.0074005126953,
"logps/rejected": -247.14529418945312,
"loss": 462.3297,
"rewards/accuracies": 0.533333420753479,
"rewards/chosen": 25.25003433227539,
"rewards/margins": 0.849066436290741,
"rewards/rejected": 24.400968551635742,
"step": 90
},
{
"epoch": 0.0588754783632617,
"grad_norm": 2708.243095974589,
"learning_rate": 2.941176470588235e-07,
"logits/chosen": 5773.37646484375,
"logits/rejected": 5027.79248046875,
"logps/chosen": -262.8667907714844,
"logps/rejected": -230.8015594482422,
"loss": 479.7382,
"rewards/accuracies": 0.491666704416275,
"rewards/chosen": 27.713184356689453,
"rewards/margins": 0.3247580826282501,
"rewards/rejected": 27.388423919677734,
"step": 100
},
{
"epoch": 0.06476302619958788,
"grad_norm": 2732.3097350474873,
"learning_rate": 3.2352941176470586e-07,
"logits/chosen": 5362.26171875,
"logits/rejected": 4178.3798828125,
"logps/chosen": -232.09896850585938,
"logps/rejected": -222.7689208984375,
"loss": 461.3743,
"rewards/accuracies": 0.5166667103767395,
"rewards/chosen": 28.291431427001953,
"rewards/margins": -0.7825387120246887,
"rewards/rejected": 29.073970794677734,
"step": 110
},
{
"epoch": 0.07065057403591404,
"grad_norm": 3009.873907616259,
"learning_rate": 3.529411764705882e-07,
"logits/chosen": 5400.1884765625,
"logits/rejected": 5186.6875,
"logps/chosen": -260.42864990234375,
"logps/rejected": -224.1237030029297,
"loss": 507.8262,
"rewards/accuracies": 0.5083333253860474,
"rewards/chosen": 29.294321060180664,
"rewards/margins": -0.6264778971672058,
"rewards/rejected": 29.920801162719727,
"step": 120
},
{
"epoch": 0.07653812187224021,
"grad_norm": 3086.941224435285,
"learning_rate": 3.8235294117647053e-07,
"logits/chosen": 5547.7021484375,
"logits/rejected": 4927.2275390625,
"logps/chosen": -256.7834167480469,
"logps/rejected": -220.9555206298828,
"loss": 503.0002,
"rewards/accuracies": 0.4583333432674408,
"rewards/chosen": 33.36304473876953,
"rewards/margins": -3.1545910835266113,
"rewards/rejected": 36.51763153076172,
"step": 130
},
{
"epoch": 0.08242566970856638,
"grad_norm": 2711.0260281027327,
"learning_rate": 4.117647058823529e-07,
"logits/chosen": 5286.3466796875,
"logits/rejected": 4290.73681640625,
"logps/chosen": -239.8748779296875,
"logps/rejected": -204.95193481445312,
"loss": 494.5457,
"rewards/accuracies": 0.5583333373069763,
"rewards/chosen": 34.8304443359375,
"rewards/margins": 0.9059675335884094,
"rewards/rejected": 33.92447280883789,
"step": 140
},
{
"epoch": 0.08831321754489255,
"grad_norm": 3265.015457879219,
"learning_rate": 4.4117647058823526e-07,
"logits/chosen": 5884.11962890625,
"logits/rejected": 5200.5625,
"logps/chosen": -247.31649780273438,
"logps/rejected": -240.7677001953125,
"loss": 469.0736,
"rewards/accuracies": 0.5166666507720947,
"rewards/chosen": 33.2108154296875,
"rewards/margins": -0.9206498861312866,
"rewards/rejected": 34.1314697265625,
"step": 150
},
{
"epoch": 0.09420076538121873,
"grad_norm": 2961.354597919322,
"learning_rate": 4.705882352941176e-07,
"logits/chosen": 5862.7861328125,
"logits/rejected": 4168.9775390625,
"logps/chosen": -273.9336853027344,
"logps/rejected": -230.8885498046875,
"loss": 503.9816,
"rewards/accuracies": 0.5166666507720947,
"rewards/chosen": 36.73265075683594,
"rewards/margins": -0.693622887134552,
"rewards/rejected": 37.42627716064453,
"step": 160
},
{
"epoch": 0.1000883132175449,
"grad_norm": 8765.049216620555,
"learning_rate": 5e-07,
"logits/chosen": 5769.8330078125,
"logits/rejected": 4647.8681640625,
"logps/chosen": -261.44683837890625,
"logps/rejected": -218.4137725830078,
"loss": 464.5215,
"rewards/accuracies": 0.5083333253860474,
"rewards/chosen": 34.85304641723633,
"rewards/margins": 1.9801647663116455,
"rewards/rejected": 32.87287521362305,
"step": 170
},
{
"epoch": 0.10597586105387106,
"grad_norm": 3053.638703416719,
"learning_rate": 4.999471618320338e-07,
"logits/chosen": 5154.2353515625,
"logits/rejected": 5318.90771484375,
"logps/chosen": -236.9143829345703,
"logps/rejected": -244.16336059570312,
"loss": 477.8162,
"rewards/accuracies": 0.5083333253860474,
"rewards/chosen": 36.37485885620117,
"rewards/margins": 2.0390517711639404,
"rewards/rejected": 34.33580780029297,
"step": 180
},
{
"epoch": 0.11186340889019723,
"grad_norm": 2672.8458181578076,
"learning_rate": 4.997886696631114e-07,
"logits/chosen": 5255.52392578125,
"logits/rejected": 5058.0771484375,
"logps/chosen": -255.951171875,
"logps/rejected": -250.9419403076172,
"loss": 495.5926,
"rewards/accuracies": 0.5583333969116211,
"rewards/chosen": 37.5173454284668,
"rewards/margins": 3.499251127243042,
"rewards/rejected": 34.018096923828125,
"step": 190
},
{
"epoch": 0.1177509567265234,
"grad_norm": 3420.636866295782,
"learning_rate": 4.995245904887195e-07,
"logits/chosen": 5120.73779296875,
"logits/rejected": 4294.0244140625,
"logps/chosen": -234.7885284423828,
"logps/rejected": -197.4206085205078,
"loss": 437.2042,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 35.10551071166992,
"rewards/margins": -1.353625774383545,
"rewards/rejected": 36.459136962890625,
"step": 200
},
{
"epoch": 0.12363850456284957,
"grad_norm": 5077.60876689098,
"learning_rate": 4.991550359365359e-07,
"logits/chosen": 5475.88623046875,
"logits/rejected": 4547.7431640625,
"logps/chosen": -241.307373046875,
"logps/rejected": -231.92361450195312,
"loss": 467.975,
"rewards/accuracies": 0.5833333730697632,
"rewards/chosen": 40.85314178466797,
"rewards/margins": 5.875087261199951,
"rewards/rejected": 34.978065490722656,
"step": 210
},
{
"epoch": 0.12952605239917575,
"grad_norm": 3020.4929051704753,
"learning_rate": 4.986801622192453e-07,
"logits/chosen": 5030.41748046875,
"logits/rejected": 4329.310546875,
"logps/chosen": -230.5275421142578,
"logps/rejected": -212.1551513671875,
"loss": 443.3945,
"rewards/accuracies": 0.5666666626930237,
"rewards/chosen": 36.29066848754883,
"rewards/margins": -0.13148090243339539,
"rewards/rejected": 36.422149658203125,
"step": 220
},
{
"epoch": 0.13541360023550192,
"grad_norm": 2899.0447768694994,
"learning_rate": 4.98100170068505e-07,
"logits/chosen": 5328.3818359375,
"logits/rejected": 4376.3935546875,
"logps/chosen": -238.4833984375,
"logps/rejected": -229.43783569335938,
"loss": 473.2981,
"rewards/accuracies": 0.625,
"rewards/chosen": 37.569854736328125,
"rewards/margins": 4.89124059677124,
"rewards/rejected": 32.678611755371094,
"step": 230
},
{
"epoch": 0.1413011480718281,
"grad_norm": 2663.071444573461,
"learning_rate": 4.974153046500967e-07,
"logits/chosen": 4790.46044921875,
"logits/rejected": 3993.68017578125,
"logps/chosen": -219.05197143554688,
"logps/rejected": -215.5940399169922,
"loss": 459.7012,
"rewards/accuracies": 0.5916666984558105,
"rewards/chosen": 38.187828063964844,
"rewards/margins": -2.219409227371216,
"rewards/rejected": 40.4072380065918,
"step": 240
},
{
"epoch": 0.14718869590815425,
"grad_norm": 2983.4293165585104,
"learning_rate": 4.966258554602924e-07,
"logits/chosen": 5798.11181640625,
"logits/rejected": 4796.2119140625,
"logps/chosen": -287.9047546386719,
"logps/rejected": -235.2962188720703,
"loss": 463.4111,
"rewards/accuracies": 0.625,
"rewards/chosen": 48.266197204589844,
"rewards/margins": 11.696329116821289,
"rewards/rejected": 36.56986618041992,
"step": 250
},
{
"epoch": 0.15307624374448042,
"grad_norm": 3316.6681927395334,
"learning_rate": 4.957321562034833e-07,
"logits/chosen": 5329.43408203125,
"logits/rejected": 4054.041015625,
"logps/chosen": -220.0008087158203,
"logps/rejected": -181.2195587158203,
"loss": 469.6311,
"rewards/accuracies": 0.625,
"rewards/chosen": 38.309364318847656,
"rewards/margins": 4.661072254180908,
"rewards/rejected": 33.648292541503906,
"step": 260
},
{
"epoch": 0.1589637915808066,
"grad_norm": 3010.7792525814666,
"learning_rate": 4.94734584651121e-07,
"logits/chosen": 5138.2724609375,
"logits/rejected": 4814.1396484375,
"logps/chosen": -246.031005859375,
"logps/rejected": -220.73782348632812,
"loss": 495.8528,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 46.25344467163086,
"rewards/margins": -3.6372578144073486,
"rewards/rejected": 49.89070510864258,
"step": 270
},
{
"epoch": 0.16485133941713276,
"grad_norm": 2682.596823641572,
"learning_rate": 4.936335624820313e-07,
"logits/chosen": 5137.6005859375,
"logits/rejected": 4301.2041015625,
"logps/chosen": -215.6089630126953,
"logps/rejected": -189.99356079101562,
"loss": 427.9188,
"rewards/accuracies": 0.5250000357627869,
"rewards/chosen": 34.9045295715332,
"rewards/margins": -3.837693452835083,
"rewards/rejected": 38.742218017578125,
"step": 280
},
{
"epoch": 0.17073888725345893,
"grad_norm": 2706.0873254037065,
"learning_rate": 4.924295551041687e-07,
"logits/chosen": 5884.3798828125,
"logits/rejected": 4964.26611328125,
"logps/chosen": -255.06588745117188,
"logps/rejected": -220.08206176757812,
"loss": 452.4157,
"rewards/accuracies": 0.5750000476837158,
"rewards/chosen": 42.134403228759766,
"rewards/margins": 0.6829560995101929,
"rewards/rejected": 41.45145034790039,
"step": 290
},
{
"epoch": 0.1766264350897851,
"grad_norm": 2703.151210592447,
"learning_rate": 4.911230714578858e-07,
"logits/chosen": 5192.1044921875,
"logits/rejected": 4181.39599609375,
"logps/chosen": -228.7882080078125,
"logps/rejected": -204.47088623046875,
"loss": 458.6937,
"rewards/accuracies": 0.6166666150093079,
"rewards/chosen": 39.62172317504883,
"rewards/margins": 2.2139618396759033,
"rewards/rejected": 37.40776062011719,
"step": 300
},
{
"epoch": 0.18251398292611126,
"grad_norm": 2902.6849576623777,
"learning_rate": 4.897146638008011e-07,
"logits/chosen": 4957.9501953125,
"logits/rejected": 4260.34423828125,
"logps/chosen": -221.3501434326172,
"logps/rejected": -205.3225860595703,
"loss": 466.1617,
"rewards/accuracies": 0.5833333730697632,
"rewards/chosen": 42.29875564575195,
"rewards/margins": 6.2964630126953125,
"rewards/rejected": 36.00229263305664,
"step": 310
},
{
"epoch": 0.18840153076243746,
"grad_norm": 2631.6984767702515,
"learning_rate": 4.882049274743577e-07,
"logits/chosen": 5455.85986328125,
"logits/rejected": 4963.1669921875,
"logps/chosen": -277.1952819824219,
"logps/rejected": -246.1964874267578,
"loss": 467.9413,
"rewards/accuracies": 0.5583333969116211,
"rewards/chosen": 41.07880783081055,
"rewards/margins": 2.369145154953003,
"rewards/rejected": 38.70966339111328,
"step": 320
},
{
"epoch": 0.19428907859876363,
"grad_norm": 2594.7782737752327,
"learning_rate": 4.865945006521683e-07,
"logits/chosen": 4765.1201171875,
"logits/rejected": 4544.0693359375,
"logps/chosen": -196.0832977294922,
"logps/rejected": -211.2700958251953,
"loss": 422.8422,
"rewards/accuracies": 0.5,
"rewards/chosen": 37.63249588012695,
"rewards/margins": -2.908146381378174,
"rewards/rejected": 40.5406379699707,
"step": 330
},
{
"epoch": 0.2001766264350898,
"grad_norm": 2317.1385525955225,
"learning_rate": 4.848840640702564e-07,
"logits/chosen": 4887.91748046875,
"logits/rejected": 4849.06982421875,
"logps/chosen": -214.12826538085938,
"logps/rejected": -232.1658172607422,
"loss": 473.8225,
"rewards/accuracies": 0.6083333492279053,
"rewards/chosen": 41.86516571044922,
"rewards/margins": 2.8288767337799072,
"rewards/rejected": 39.03628158569336,
"step": 340
},
{
"epoch": 0.20606417427141596,
"grad_norm": 2802.6550557671103,
"learning_rate": 4.83074340739305e-07,
"logits/chosen": 5358.7685546875,
"logits/rejected": 4593.4169921875,
"logps/chosen": -254.9344940185547,
"logps/rejected": -211.921875,
"loss": 473.8788,
"rewards/accuracies": 0.5833333730697632,
"rewards/chosen": 44.27037811279297,
"rewards/margins": 5.671798229217529,
"rewards/rejected": 38.59857940673828,
"step": 350
},
{
"epoch": 0.21195172210774213,
"grad_norm": 2184.313945380869,
"learning_rate": 4.811660956390372e-07,
"logits/chosen": 4931.3828125,
"logits/rejected": 4783.0810546875,
"logps/chosen": -215.4976348876953,
"logps/rejected": -207.6850128173828,
"loss": 439.7563,
"rewards/accuracies": 0.5666667222976685,
"rewards/chosen": 38.79738235473633,
"rewards/margins": -7.209702491760254,
"rewards/rejected": 46.007080078125,
"step": 360
},
{
"epoch": 0.2178392699440683,
"grad_norm": 2653.3204513683368,
"learning_rate": 4.791601353948536e-07,
"logits/chosen": 6008.3359375,
"logits/rejected": 5451.3759765625,
"logps/chosen": -249.03750610351562,
"logps/rejected": -227.90884399414062,
"loss": 427.3288,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 40.34531021118164,
"rewards/margins": 4.3100738525390625,
"rewards/rejected": 36.03523635864258,
"step": 370
},
{
"epoch": 0.22372681778039447,
"grad_norm": 3077.404166225713,
"learning_rate": 4.77057307936869e-07,
"logits/chosen": 6118.6044921875,
"logits/rejected": 5277.0595703125,
"logps/chosen": -251.12112426757812,
"logps/rejected": -247.95870971679688,
"loss": 502.4088,
"rewards/accuracies": 0.5083333253860474,
"rewards/chosen": 44.55669403076172,
"rewards/margins": 4.328840255737305,
"rewards/rejected": 40.22785186767578,
"step": 380
},
{
"epoch": 0.22961436561672063,
"grad_norm": 2533.1551132660197,
"learning_rate": 4.748585021414868e-07,
"logits/chosen": 5564.0263671875,
"logits/rejected": 4396.427734375,
"logps/chosen": -245.1484375,
"logps/rejected": -219.17623901367188,
"loss": 478.5578,
"rewards/accuracies": 0.5583333969116211,
"rewards/chosen": 38.58305358886719,
"rewards/margins": 3.148397445678711,
"rewards/rejected": 35.434654235839844,
"step": 390
},
{
"epoch": 0.2355019134530468,
"grad_norm": 2512.4648549451554,
"learning_rate": 4.7256464745566647e-07,
"logits/chosen": 4976.19189453125,
"logits/rejected": 4289.302734375,
"logps/chosen": -223.78713989257812,
"logps/rejected": -202.24887084960938,
"loss": 473.5983,
"rewards/accuracies": 0.5583333373069763,
"rewards/chosen": 39.46695327758789,
"rewards/margins": -2.2979044914245605,
"rewards/rejected": 41.764854431152344,
"step": 400
},
{
"epoch": 0.24138946128937297,
"grad_norm": 2733.908890696179,
"learning_rate": 4.7017671350404144e-07,
"logits/chosen": 5178.62890625,
"logits/rejected": 4491.7177734375,
"logps/chosen": -238.9555206298828,
"logps/rejected": -209.7996063232422,
"loss": 460.1955,
"rewards/accuracies": 0.5833333730697632,
"rewards/chosen": 44.30680847167969,
"rewards/margins": 7.162436485290527,
"rewards/rejected": 37.14437484741211,
"step": 410
},
{
"epoch": 0.24727700912569914,
"grad_norm": 2844.9689261331014,
"learning_rate": 4.676957096790536e-07,
"logits/chosen": 4867.9912109375,
"logits/rejected": 4110.99658203125,
"logps/chosen": -222.1897430419922,
"logps/rejected": -214.99496459960938,
"loss": 447.1411,
"rewards/accuracies": 0.6333333253860474,
"rewards/chosen": 43.50336456298828,
"rewards/margins": 8.540742874145508,
"rewards/rejected": 34.96261978149414,
"step": 420
},
{
"epoch": 0.25316455696202533,
"grad_norm": 2563.6479429015913,
"learning_rate": 4.651226847142774e-07,
"logits/chosen": 5018.8154296875,
"logits/rejected": 3969.06298828125,
"logps/chosen": -221.9290771484375,
"logps/rejected": -184.1993865966797,
"loss": 464.5543,
"rewards/accuracies": 0.5916666388511658,
"rewards/chosen": 48.811912536621094,
"rewards/margins": 7.742800712585449,
"rewards/rejected": 41.06911087036133,
"step": 430
},
{
"epoch": 0.2590521047983515,
"grad_norm": 2122.0109615432093,
"learning_rate": 4.6245872624111524e-07,
"logits/chosen": 5818.7001953125,
"logits/rejected": 4908.8974609375,
"logps/chosen": -259.19195556640625,
"logps/rejected": -225.0427703857422,
"loss": 474.2507,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 41.42036056518555,
"rewards/margins": -0.13602538406848907,
"rewards/rejected": 41.55638885498047,
"step": 440
},
{
"epoch": 0.26493965263467767,
"grad_norm": 2624.9452504951946,
"learning_rate": 4.59704960329049e-07,
"logits/chosen": 4902.2177734375,
"logits/rejected": 4126.1279296875,
"logps/chosen": -221.35159301757812,
"logps/rejected": -205.9199676513672,
"loss": 447.7888,
"rewards/accuracies": 0.5166667103767395,
"rewards/chosen": 38.81290817260742,
"rewards/margins": 0.12556418776512146,
"rewards/rejected": 38.687339782714844,
"step": 450
},
{
"epoch": 0.27082720047100384,
"grad_norm": 3346.54251454803,
"learning_rate": 4.5686255100964534e-07,
"logits/chosen": 5134.34228515625,
"logits/rejected": 4823.8193359375,
"logps/chosen": -239.41934204101562,
"logps/rejected": -235.3642578125,
"loss": 485.9119,
"rewards/accuracies": 0.5666667222976685,
"rewards/chosen": 43.27761459350586,
"rewards/margins": 1.6956627368927002,
"rewards/rejected": 41.58195114135742,
"step": 460
},
{
"epoch": 0.27671474830733,
"grad_norm": 2976.593415018288,
"learning_rate": 4.539326997845123e-07,
"logits/chosen": 6470.69140625,
"logits/rejected": 5451.03955078125,
"logps/chosen": -285.16925048828125,
"logps/rejected": -258.68975830078125,
"loss": 488.3334,
"rewards/accuracies": 0.5750000476837158,
"rewards/chosen": 42.999271392822266,
"rewards/margins": 1.3085330724716187,
"rewards/rejected": 41.690738677978516,
"step": 470
},
{
"epoch": 0.2826022961436562,
"grad_norm": 2939.989789108279,
"learning_rate": 4.509166451174194e-07,
"logits/chosen": 5950.25048828125,
"logits/rejected": 5111.1640625,
"logps/chosen": -259.44683837890625,
"logps/rejected": -235.40414428710938,
"loss": 487.8371,
"rewards/accuracies": 0.5083333253860474,
"rewards/chosen": 37.737037658691406,
"rewards/margins": -5.529684543609619,
"rewards/rejected": 43.266719818115234,
"step": 480
},
{
"epoch": 0.28848984397998234,
"grad_norm": 2953.1970852829686,
"learning_rate": 4.4781566191079116e-07,
"logits/chosen": 4588.8037109375,
"logits/rejected": 4577.138671875,
"logps/chosen": -239.47250366210938,
"logps/rejected": -204.98898315429688,
"loss": 456.105,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 37.66023635864258,
"rewards/margins": -3.8221187591552734,
"rewards/rejected": 41.48235321044922,
"step": 490
},
{
"epoch": 0.2943773918163085,
"grad_norm": 2736.224570719795,
"learning_rate": 4.446310609668e-07,
"logits/chosen": 5530.6298828125,
"logits/rejected": 4615.0556640625,
"logps/chosen": -249.34030151367188,
"logps/rejected": -225.1228790283203,
"loss": 459.8451,
"rewards/accuracies": 0.5583333969116211,
"rewards/chosen": 43.233238220214844,
"rewards/margins": -1.8535034656524658,
"rewards/rejected": 45.08674240112305,
"step": 500
},
{
"epoch": 0.3002649396526347,
"grad_norm": 2615.8589721258495,
"learning_rate": 4.413641884332824e-07,
"logits/chosen": 5124.1259765625,
"logits/rejected": 4400.78515625,
"logps/chosen": -232.1068115234375,
"logps/rejected": -212.48538208007812,
"loss": 463.0996,
"rewards/accuracies": 0.5833333730697632,
"rewards/chosen": 45.848487854003906,
"rewards/margins": 1.3726880550384521,
"rewards/rejected": 44.475799560546875,
"step": 510
},
{
"epoch": 0.30615248748896084,
"grad_norm": 2756.553889805137,
"learning_rate": 4.3801642523471585e-07,
"logits/chosen": 5130.49609375,
"logits/rejected": 4525.7822265625,
"logps/chosen": -227.8628692626953,
"logps/rejected": -220.9073028564453,
"loss": 443.5771,
"rewards/accuracies": 0.5833333730697632,
"rewards/chosen": 41.145599365234375,
"rewards/margins": 3.313695192337036,
"rewards/rejected": 37.83190155029297,
"step": 520
},
{
"epoch": 0.312040035325287,
"grad_norm": 2864.6376168471875,
"learning_rate": 4.3458918648849363e-07,
"logits/chosen": 5254.4072265625,
"logits/rejected": 4547.3408203125,
"logps/chosen": -233.91171264648438,
"logps/rejected": -220.1038055419922,
"loss": 437.4953,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 43.519020080566406,
"rewards/margins": -5.650818824768066,
"rewards/rejected": 49.169837951660156,
"step": 530
},
{
"epoch": 0.3179275831616132,
"grad_norm": 2599.9218788148514,
"learning_rate": 4.3108392090674813e-07,
"logits/chosen": 6104.6904296875,
"logits/rejected": 5016.74658203125,
"logps/chosen": -281.6202087402344,
"logps/rejected": -253.33285522460938,
"loss": 482.9291,
"rewards/accuracies": 0.5666666626930237,
"rewards/chosen": 47.8477897644043,
"rewards/margins": 8.502013206481934,
"rewards/rejected": 39.34577941894531,
"step": 540
},
{
"epoch": 0.32381513099793935,
"grad_norm": 3199.922596928601,
"learning_rate": 4.2750211018397197e-07,
"logits/chosen": 5082.48046875,
"logits/rejected": 4866.55859375,
"logps/chosen": -250.6102752685547,
"logps/rejected": -231.0320281982422,
"loss": 472.6031,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 42.02797317504883,
"rewards/margins": -2.5889058113098145,
"rewards/rejected": 44.61688232421875,
"step": 550
},
{
"epoch": 0.3297026788342655,
"grad_norm": 2817.3733503855756,
"learning_rate": 4.2384526837069784e-07,
"logits/chosen": 4878.5634765625,
"logits/rejected": 3915.231201171875,
"logps/chosen": -200.32920837402344,
"logps/rejected": -187.05667114257812,
"loss": 467.9761,
"rewards/accuracies": 0.5833333730697632,
"rewards/chosen": 44.21455383300781,
"rewards/margins": 1.7118313312530518,
"rewards/rejected": 42.50272750854492,
"step": 560
},
{
"epoch": 0.3355902266705917,
"grad_norm": 2821.591086826825,
"learning_rate": 4.2011494123350146e-07,
"logits/chosen": 5307.7412109375,
"logits/rejected": 4781.70361328125,
"logps/chosen": -228.97842407226562,
"logps/rejected": -248.53994750976562,
"loss": 465.2356,
"rewards/accuracies": 0.5833333134651184,
"rewards/chosen": 46.05214309692383,
"rewards/margins": 3.274325132369995,
"rewards/rejected": 42.77781295776367,
"step": 570
},
{
"epoch": 0.34147777450691785,
"grad_norm": 2609.2990914642696,
"learning_rate": 4.1631270560159744e-07,
"logits/chosen": 5694.88623046875,
"logits/rejected": 4307.6611328125,
"logps/chosen": -249.42398071289062,
"logps/rejected": -207.21530151367188,
"loss": 466.227,
"rewards/accuracies": 0.5833333730697632,
"rewards/chosen": 41.66138458251953,
"rewards/margins": 3.035097122192383,
"rewards/rejected": 38.626285552978516,
"step": 580
},
{
"epoch": 0.347365322343244,
"grad_norm": 2794.6706694766654,
"learning_rate": 4.1244016870030565e-07,
"logits/chosen": 4977.28955078125,
"logits/rejected": 4125.67822265625,
"logps/chosen": -245.0275421142578,
"logps/rejected": -179.28305053710938,
"loss": 438.7925,
"rewards/accuracies": 0.5583333969116211,
"rewards/chosen": 39.37556076049805,
"rewards/margins": -1.698758840560913,
"rewards/rejected": 41.074317932128906,
"step": 590
},
{
"epoch": 0.3532528701795702,
"grad_norm": 2607.2196348404323,
"learning_rate": 4.084989674716679e-07,
"logits/chosen": 5479.78369140625,
"logits/rejected": 4455.615234375,
"logps/chosen": -242.08688354492188,
"logps/rejected": -193.136474609375,
"loss": 458.9058,
"rewards/accuracies": 0.6416667103767395,
"rewards/chosen": 42.43415069580078,
"rewards/margins": 2.274672031402588,
"rewards/rejected": 40.159481048583984,
"step": 600
},
{
"epoch": 0.35914041801589636,
"grad_norm": 2718.3257214827568,
"learning_rate": 4.0449076788250443e-07,
"logits/chosen": 5217.408203125,
"logits/rejected": 4447.7919921875,
"logps/chosen": -236.44717407226562,
"logps/rejected": -184.24591064453125,
"loss": 464.5568,
"rewards/accuracies": 0.5000000596046448,
"rewards/chosen": 39.97568893432617,
"rewards/margins": 0.8229917287826538,
"rewards/rejected": 39.1526985168457,
"step": 610
},
{
"epoch": 0.3650279658522225,
"grad_norm": 2786.272576235552,
"learning_rate": 4.0041726422020015e-07,
"logits/chosen": 5721.35107421875,
"logits/rejected": 4428.142578125,
"logps/chosen": -246.13412475585938,
"logps/rejected": -206.4453887939453,
"loss": 501.951,
"rewards/accuracies": 0.6333333253860474,
"rewards/chosen": 44.817237854003906,
"rewards/margins": 7.289865970611572,
"rewards/rejected": 37.527374267578125,
"step": 620
},
{
"epoch": 0.3709155136885487,
"grad_norm": 2768.798117142065,
"learning_rate": 3.962801783765209e-07,
"logits/chosen": 6276.0185546875,
"logits/rejected": 5337.60986328125,
"logps/chosen": -271.36322021484375,
"logps/rejected": -255.06948852539062,
"loss": 456.5855,
"rewards/accuracies": 0.491666704416275,
"rewards/chosen": 46.203147888183594,
"rewards/margins": 0.8436892628669739,
"rewards/rejected": 45.359458923339844,
"step": 630
},
{
"epoch": 0.3768030615248749,
"grad_norm": 2697.761608609681,
"learning_rate": 3.920812591197603e-07,
"logits/chosen": 5017.4580078125,
"logits/rejected": 4359.14208984375,
"logps/chosen": -211.0869598388672,
"logps/rejected": -190.4265594482422,
"loss": 423.3793,
"rewards/accuracies": 0.5166666507720947,
"rewards/chosen": 38.125648498535156,
"rewards/margins": -3.213513135910034,
"rewards/rejected": 41.33915710449219,
"step": 640
},
{
"epoch": 0.3826906093612011,
"grad_norm": 2574.762524301188,
"learning_rate": 3.878222813555261e-07,
"logits/chosen": 5633.08447265625,
"logits/rejected": 5116.6396484375,
"logps/chosen": -233.45870971679688,
"logps/rejected": -225.9169921875,
"loss": 455.4143,
"rewards/accuracies": 0.5,
"rewards/chosen": 44.39647674560547,
"rewards/margins": -3.767230987548828,
"rewards/rejected": 48.1637077331543,
"step": 650
},
{
"epoch": 0.38857815719752725,
"grad_norm": 2594.9637687624113,
"learning_rate": 3.8350504537647787e-07,
"logits/chosen": 4977.634765625,
"logits/rejected": 4156.2880859375,
"logps/chosen": -228.33230590820312,
"logps/rejected": -204.38168334960938,
"loss": 465.9269,
"rewards/accuracies": 0.5666667222976685,
"rewards/chosen": 42.252296447753906,
"rewards/margins": -1.8904374837875366,
"rewards/rejected": 44.142738342285156,
"step": 660
},
{
"epoch": 0.3944657050338534,
"grad_norm": 2712.9964855523226,
"learning_rate": 3.7913137610133425e-07,
"logits/chosen": 5011.603515625,
"logits/rejected": 4176.4658203125,
"logps/chosen": -205.63040161132812,
"logps/rejected": -193.8191680908203,
"loss": 461.3146,
"rewards/accuracies": 0.5250000357627869,
"rewards/chosen": 43.163169860839844,
"rewards/margins": 1.5009454488754272,
"rewards/rejected": 41.66222381591797,
"step": 670
},
{
"epoch": 0.4003532528701796,
"grad_norm": 2700.0766282434847,
"learning_rate": 3.747031223034695e-07,
"logits/chosen": 5478.8486328125,
"logits/rejected": 4809.7490234375,
"logps/chosen": -226.5070037841797,
"logps/rejected": -205.4853515625,
"loss": 433.533,
"rewards/accuracies": 0.5,
"rewards/chosen": 41.19525146484375,
"rewards/margins": -1.973081350326538,
"rewards/rejected": 43.1683349609375,
"step": 680
},
{
"epoch": 0.40624080070650576,
"grad_norm": 2734.175956036057,
"learning_rate": 3.7022215582942734e-07,
"logits/chosen": 4257.45654296875,
"logits/rejected": 3948.198486328125,
"logps/chosen": -223.28622436523438,
"logps/rejected": -202.68695068359375,
"loss": 455.9671,
"rewards/accuracies": 0.47499996423721313,
"rewards/chosen": 41.13948059082031,
"rewards/margins": -1.048543930053711,
"rewards/rejected": 42.188026428222656,
"step": 690
},
{
"epoch": 0.4121283485428319,
"grad_norm": 2667.6389753883586,
"learning_rate": 3.656903708076815e-07,
"logits/chosen": 5091.544921875,
"logits/rejected": 4615.90576171875,
"logps/chosen": -216.5720672607422,
"logps/rejected": -212.180908203125,
"loss": 457.6879,
"rewards/accuracies": 0.5416666865348816,
"rewards/chosen": 41.81556701660156,
"rewards/margins": -1.126306176185608,
"rewards/rejected": 42.94187545776367,
"step": 700
},
{
"epoch": 0.4180158963791581,
"grad_norm": 3140.337616775453,
"learning_rate": 3.611096828479773e-07,
"logits/chosen": 5586.228515625,
"logits/rejected": 4623.2958984375,
"logps/chosen": -257.47991943359375,
"logps/rejected": -214.2526092529297,
"loss": 487.5771,
"rewards/accuracies": 0.5416666865348816,
"rewards/chosen": 42.64170455932617,
"rewards/margins": 0.10084114223718643,
"rewards/rejected": 42.540863037109375,
"step": 710
},
{
"epoch": 0.42390344421548426,
"grad_norm": 2603.7761578281047,
"learning_rate": 3.564820282315931e-07,
"logits/chosen": 5537.7119140625,
"logits/rejected": 4168.7373046875,
"logps/chosen": -262.38360595703125,
"logps/rejected": -204.13397216796875,
"loss": 449.7505,
"rewards/accuracies": 0.6250000596046448,
"rewards/chosen": 45.0696907043457,
"rewards/margins": 5.273316383361816,
"rewards/rejected": 39.79636764526367,
"step": 720
},
{
"epoch": 0.4297909920518104,
"grad_norm": 2866.6876808933243,
"learning_rate": 3.518093630928644e-07,
"logits/chosen": 5515.57275390625,
"logits/rejected": 4701.93994140625,
"logps/chosen": -248.15963745117188,
"logps/rejected": -229.8058624267578,
"loss": 476.8922,
"rewards/accuracies": 0.5833333134651184,
"rewards/chosen": 46.25033187866211,
"rewards/margins": 8.252941131591797,
"rewards/rejected": 37.99739456176758,
"step": 730
},
{
"epoch": 0.4356785398881366,
"grad_norm": 3058.8100866920245,
"learning_rate": 3.4709366259231464e-07,
"logits/chosen": 5491.2119140625,
"logits/rejected": 5104.46337890625,
"logps/chosen": -250.91342163085938,
"logps/rejected": -231.6546173095703,
"loss": 469.3656,
"rewards/accuracies": 0.5666667222976685,
"rewards/chosen": 44.03254699707031,
"rewards/margins": -2.906583309173584,
"rewards/rejected": 46.93913269042969,
"step": 740
},
{
"epoch": 0.44156608772446276,
"grad_norm": 3017.47651137017,
"learning_rate": 3.423369200817449e-07,
"logits/chosen": 5202.6728515625,
"logits/rejected": 4578.47265625,
"logps/chosen": -249.1278839111328,
"logps/rejected": -213.2310333251953,
"loss": 462.8963,
"rewards/accuracies": 0.5666667222976685,
"rewards/chosen": 44.32584762573242,
"rewards/margins": -3.8503451347351074,
"rewards/rejected": 48.17619705200195,
"step": 750
},
{
"epoch": 0.44745363556078893,
"grad_norm": 2656.5967747745717,
"learning_rate": 3.3754114626163314e-07,
"logits/chosen": 5392.62646484375,
"logits/rejected": 5032.01953125,
"logps/chosen": -269.23895263671875,
"logps/rejected": -242.6970672607422,
"loss": 461.1163,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 46.39586639404297,
"rewards/margins": 0.5340154767036438,
"rewards/rejected": 45.861846923828125,
"step": 760
},
{
"epoch": 0.4533411833971151,
"grad_norm": 2855.313253763874,
"learning_rate": 3.327083683312004e-07,
"logits/chosen": 5523.498046875,
"logits/rejected": 5086.7783203125,
"logps/chosen": -261.9441223144531,
"logps/rejected": -227.2518768310547,
"loss": 481.2854,
"rewards/accuracies": 0.5083333849906921,
"rewards/chosen": 45.44671630859375,
"rewards/margins": 0.4767987132072449,
"rewards/rejected": 44.96991729736328,
"step": 770
},
{
"epoch": 0.45922873123344127,
"grad_norm": 2722.639941986274,
"learning_rate": 3.2784062913150293e-07,
"logits/chosen": 5799.98828125,
"logits/rejected": 4591.291015625,
"logps/chosen": -287.0743408203125,
"logps/rejected": -208.4379425048828,
"loss": 483.4277,
"rewards/accuracies": 0.5916666388511658,
"rewards/chosen": 42.68096923828125,
"rewards/margins": -0.6198533773422241,
"rewards/rejected": 43.30082321166992,
"step": 780
},
{
"epoch": 0.46511627906976744,
"grad_norm": 2318.299759399404,
"learning_rate": 3.229399862819124e-07,
"logits/chosen": 5658.1552734375,
"logits/rejected": 4616.27490234375,
"logps/chosen": -259.0816955566406,
"logps/rejected": -202.31143188476562,
"loss": 453.9431,
"rewards/accuracies": 0.5583332777023315,
"rewards/chosen": 45.26428985595703,
"rewards/margins": 2.1579227447509766,
"rewards/rejected": 43.106361389160156,
"step": 790
},
{
"epoch": 0.4710038269060936,
"grad_norm": 3232.511560323512,
"learning_rate": 3.18008511310349e-07,
"logits/chosen": 5601.29150390625,
"logits/rejected": 4995.228515625,
"logps/chosen": -233.19369506835938,
"logps/rejected": -235.1713409423828,
"loss": 457.5126,
"rewards/accuracies": 0.5333333611488342,
"rewards/chosen": 48.8902702331543,
"rewards/margins": 3.462005615234375,
"rewards/rejected": 45.42826461791992,
"step": 800
},
{
"epoch": 0.47689137474241977,
"grad_norm": 2903.24125926808,
"learning_rate": 3.1304828877763564e-07,
"logits/chosen": 5097.60693359375,
"logits/rejected": 4684.20068359375,
"logps/chosen": -222.4737091064453,
"logps/rejected": -205.9313201904297,
"loss": 449.5271,
"rewards/accuracies": 0.5083333253860474,
"rewards/chosen": 43.99319839477539,
"rewards/margins": -6.208525657653809,
"rewards/rejected": 50.20172882080078,
"step": 810
},
{
"epoch": 0.48277892257874594,
"grad_norm": 2895.193360278281,
"learning_rate": 3.080614153963429e-07,
"logits/chosen": 5215.91552734375,
"logits/rejected": 4266.2978515625,
"logps/chosen": -210.92129516601562,
"logps/rejected": -204.11961364746094,
"loss": 443.6631,
"rewards/accuracies": 0.5750000476837158,
"rewards/chosen": 45.588417053222656,
"rewards/margins": 4.789834022521973,
"rewards/rejected": 40.798583984375,
"step": 820
},
{
"epoch": 0.4886664704150721,
"grad_norm": 2695.9508775371423,
"learning_rate": 3.030499991444977e-07,
"logits/chosen": 5659.1708984375,
"logits/rejected": 4685.5791015625,
"logps/chosen": -240.7261962890625,
"logps/rejected": -228.7866973876953,
"loss": 464.7709,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 45.628719329833984,
"rewards/margins": 2.6973090171813965,
"rewards/rejected": 42.931419372558594,
"step": 830
},
{
"epoch": 0.4945540182513983,
"grad_norm": 2836.335274661169,
"learning_rate": 2.980161583745294e-07,
"logits/chosen": 5090.1796875,
"logits/rejected": 4844.7646484375,
"logps/chosen": -241.19583129882812,
"logps/rejected": -240.20263671875,
"loss": 456.9758,
"rewards/accuracies": 0.5250000357627869,
"rewards/chosen": 43.398475646972656,
"rewards/margins": -6.495508670806885,
"rewards/rejected": 49.893985748291016,
"step": 840
},
{
"epoch": 0.5004415660877245,
"grad_norm": 2558.4226398723663,
"learning_rate": 2.929620209178307e-07,
"logits/chosen": 5289.33203125,
"logits/rejected": 4530.0595703125,
"logps/chosen": -245.92941284179688,
"logps/rejected": -206.4706573486328,
"loss": 453.3887,
"rewards/accuracies": 0.6916667222976685,
"rewards/chosen": 45.08998489379883,
"rewards/margins": 5.154721260070801,
"rewards/rejected": 39.935264587402344,
"step": 850
},
{
"epoch": 0.5063291139240507,
"grad_norm": 2717.829404938614,
"learning_rate": 2.8788972318531267e-07,
"logits/chosen": 5749.87890625,
"logits/rejected": 4572.9990234375,
"logps/chosen": -237.6320037841797,
"logps/rejected": -211.4043426513672,
"loss": 450.152,
"rewards/accuracies": 0.5166667103767395,
"rewards/chosen": 46.66907501220703,
"rewards/margins": 4.650607109069824,
"rewards/rejected": 42.01846694946289,
"step": 860
},
{
"epoch": 0.5122166617603768,
"grad_norm": 2861.2377752356074,
"learning_rate": 2.8280140926433187e-07,
"logits/chosen": 5393.77392578125,
"logits/rejected": 5325.40380859375,
"logps/chosen": -242.5797882080078,
"logps/rejected": -222.5009765625,
"loss": 481.2984,
"rewards/accuracies": 0.5333333611488342,
"rewards/chosen": 43.943992614746094,
"rewards/margins": -2.422114133834839,
"rewards/rejected": 46.36610412597656,
"step": 870
},
{
"epoch": 0.518104209596703,
"grad_norm": 2659.2035494791103,
"learning_rate": 2.7769923001237316e-07,
"logits/chosen": 5558.83935546875,
"logits/rejected": 4822.5126953125,
"logps/chosen": -220.2300262451172,
"logps/rejected": -225.2122802734375,
"loss": 449.2192,
"rewards/accuracies": 0.5833333730697632,
"rewards/chosen": 44.88111114501953,
"rewards/margins": 4.092176914215088,
"rewards/rejected": 40.78893280029297,
"step": 880
},
{
"epoch": 0.5239917574330292,
"grad_norm": 2617.742422402943,
"learning_rate": 2.7258534214787107e-07,
"logits/chosen": 5243.423828125,
"logits/rejected": 4836.48046875,
"logps/chosen": -228.9081268310547,
"logps/rejected": -223.1492156982422,
"loss": 452.9875,
"rewards/accuracies": 0.60833340883255,
"rewards/chosen": 44.30738067626953,
"rewards/margins": 4.894416809082031,
"rewards/rejected": 39.412967681884766,
"step": 890
},
{
"epoch": 0.5298793052693553,
"grad_norm": 2262.030014981174,
"learning_rate": 2.6746190733855306e-07,
"logits/chosen": 5679.38623046875,
"logits/rejected": 4615.3876953125,
"logps/chosen": -269.0588073730469,
"logps/rejected": -209.2182159423828,
"loss": 445.8765,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 45.74579620361328,
"rewards/margins": 4.458695411682129,
"rewards/rejected": 41.2870979309082,
"step": 900
},
{
"epoch": 0.5357668531056815,
"grad_norm": 2839.0904187282845,
"learning_rate": 2.6233109128769133e-07,
"logits/chosen": 5576.9326171875,
"logits/rejected": 4600.5908203125,
"logps/chosen": -252.0655975341797,
"logps/rejected": -201.52325439453125,
"loss": 472.7852,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 43.91907501220703,
"rewards/margins": 0.6527482867240906,
"rewards/rejected": 43.266319274902344,
"step": 910
},
{
"epoch": 0.5416544009420077,
"grad_norm": 2527.8677929651662,
"learning_rate": 2.571950628186483e-07,
"logits/chosen": 5142.1513671875,
"logits/rejected": 4847.85888671875,
"logps/chosen": -233.6755828857422,
"logps/rejected": -223.87100219726562,
"loss": 457.4038,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 49.908050537109375,
"rewards/margins": 8.85273265838623,
"rewards/rejected": 41.0553092956543,
"step": 920
},
{
"epoch": 0.5475419487783338,
"grad_norm": 2434.5446784038063,
"learning_rate": 2.520559929581034e-07,
"logits/chosen": 5560.0634765625,
"logits/rejected": 5184.27685546875,
"logps/chosen": -219.471923828125,
"logps/rejected": -230.39956665039062,
"loss": 445.1723,
"rewards/accuracies": 0.5166666507720947,
"rewards/chosen": 45.55453872680664,
"rewards/margins": 3.9521491527557373,
"rewards/rejected": 41.602394104003906,
"step": 930
},
{
"epoch": 0.55342949661466,
"grad_norm": 2623.9403337631743,
"learning_rate": 2.469160540183484e-07,
"logits/chosen": 4994.5146484375,
"logits/rejected": 4167.3115234375,
"logps/chosen": -216.19406127929688,
"logps/rejected": -202.72320556640625,
"loss": 437.8886,
"rewards/accuracies": 0.5333333015441895,
"rewards/chosen": 45.913734436035156,
"rewards/margins": 5.58524227142334,
"rewards/rejected": 40.328495025634766,
"step": 940
},
{
"epoch": 0.5593170444509862,
"grad_norm": 2507.348851307183,
"learning_rate": 2.417774186790396e-07,
"logits/chosen": 5547.2744140625,
"logits/rejected": 4807.9375,
"logps/chosen": -230.7436981201172,
"logps/rejected": -207.02481079101562,
"loss": 452.8866,
"rewards/accuracies": 0.5500000715255737,
"rewards/chosen": 47.687828063964844,
"rewards/margins": 4.640194416046143,
"rewards/rejected": 43.047645568847656,
"step": 950
},
{
"epoch": 0.5652045922873123,
"grad_norm": 3119.519288688434,
"learning_rate": 2.366422590687945e-07,
"logits/chosen": 5354.3486328125,
"logits/rejected": 4346.8291015625,
"logps/chosen": -213.7137908935547,
"logps/rejected": -186.5005645751953,
"loss": 458.5086,
"rewards/accuracies": 0.5416666269302368,
"rewards/chosen": 45.78757858276367,
"rewards/margins": -1.3455828428268433,
"rewards/rejected": 47.13316345214844,
"step": 960
},
{
"epoch": 0.5710921401236385,
"grad_norm": 2711.9281079875923,
"learning_rate": 2.3151274584702116e-07,
"logits/chosen": 5499.513671875,
"logits/rejected": 4687.3408203125,
"logps/chosen": -237.8612823486328,
"logps/rejected": -205.35922241210938,
"loss": 463.1298,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 48.72194290161133,
"rewards/margins": 4.278817176818848,
"rewards/rejected": 44.4431266784668,
"step": 970
},
{
"epoch": 0.5769796879599647,
"grad_norm": 2630.306237104176,
"learning_rate": 2.2639104728636912e-07,
"logits/chosen": 5086.7822265625,
"logits/rejected": 4054.61962890625,
"logps/chosen": -220.7304229736328,
"logps/rejected": -206.5825958251953,
"loss": 440.4308,
"rewards/accuracies": 0.5583333969116211,
"rewards/chosen": 43.90907669067383,
"rewards/margins": 2.613150119781494,
"rewards/rejected": 41.29592514038086,
"step": 980
},
{
"epoch": 0.5828672357962909,
"grad_norm": 2710.6636823106182,
"learning_rate": 2.2127932835618895e-07,
"logits/chosen": 5302.921875,
"logits/rejected": 4638.28955078125,
"logps/chosen": -234.52285766601562,
"logps/rejected": -225.45364379882812,
"loss": 427.9638,
"rewards/accuracies": 0.5583332777023315,
"rewards/chosen": 42.980873107910156,
"rewards/margins": 4.057049751281738,
"rewards/rejected": 38.923824310302734,
"step": 990
},
{
"epoch": 0.588754783632617,
"grad_norm": 2870.711490574312,
"learning_rate": 2.1617974980738814e-07,
"logits/chosen": 5824.0576171875,
"logits/rejected": 5266.1982421875,
"logps/chosen": -258.0989685058594,
"logps/rejected": -251.48828125,
"loss": 482.1707,
"rewards/accuracies": 0.5583333373069763,
"rewards/chosen": 45.20808792114258,
"rewards/margins": -5.770603656768799,
"rewards/rejected": 50.97869110107422,
"step": 1000
},
{
"epoch": 0.5946423314689432,
"grad_norm": 2447.752518434989,
"learning_rate": 2.1109446725907e-07,
"logits/chosen": 5894.18603515625,
"logits/rejected": 4373.1640625,
"logps/chosen": -254.66024780273438,
"logps/rejected": -196.70751953125,
"loss": 450.6618,
"rewards/accuracies": 0.6500000357627869,
"rewards/chosen": 45.06776428222656,
"rewards/margins": 5.682547569274902,
"rewards/rejected": 39.385215759277344,
"step": 1010
},
{
"epoch": 0.6005298793052694,
"grad_norm": 3026.8393568831943,
"learning_rate": 2.060256302873421e-07,
"logits/chosen": 4870.3251953125,
"logits/rejected": 4591.98876953125,
"logps/chosen": -234.90676879882812,
"logps/rejected": -216.5286865234375,
"loss": 443.9214,
"rewards/accuracies": 0.6333333849906921,
"rewards/chosen": 50.04943084716797,
"rewards/margins": 7.063135623931885,
"rewards/rejected": 42.98630142211914,
"step": 1020
},
{
"epoch": 0.6064174271415955,
"grad_norm": 2795.814283569102,
"learning_rate": 2.0097538151667884e-07,
"logits/chosen": 6684.375,
"logits/rejected": 5485.0751953125,
"logps/chosen": -276.08843994140625,
"logps/rejected": -246.08056640625,
"loss": 496.8952,
"rewards/accuracies": 0.60833340883255,
"rewards/chosen": 54.97471237182617,
"rewards/margins": 7.3708295822143555,
"rewards/rejected": 47.6038818359375,
"step": 1030
},
{
"epoch": 0.6123049749779217,
"grad_norm": 2455.354784189058,
"learning_rate": 1.9594585571422276e-07,
"logits/chosen": 4663.0888671875,
"logits/rejected": 4298.4326171875,
"logps/chosen": -186.22332763671875,
"logps/rejected": -195.13868713378906,
"loss": 447.868,
"rewards/accuracies": 0.5166666507720947,
"rewards/chosen": 37.82415008544922,
"rewards/margins": -4.475025177001953,
"rewards/rejected": 42.29917907714844,
"step": 1040
},
{
"epoch": 0.6181925228142479,
"grad_norm": 3037.059899209514,
"learning_rate": 1.9093917888740688e-07,
"logits/chosen": 5092.6142578125,
"logits/rejected": 4403.24267578125,
"logps/chosen": -212.946533203125,
"logps/rejected": -192.6261749267578,
"loss": 438.1319,
"rewards/accuracies": 0.4583333432674408,
"rewards/chosen": 40.9649772644043,
"rewards/margins": -2.0067009925842285,
"rewards/rejected": 42.9716796875,
"step": 1050
},
{
"epoch": 0.624080070650574,
"grad_norm": 2583.707994006676,
"learning_rate": 1.8595746738528043e-07,
"logits/chosen": 5215.65625,
"logits/rejected": 4705.4794921875,
"logps/chosen": -240.82656860351562,
"logps/rejected": -210.69119262695312,
"loss": 481.238,
"rewards/accuracies": 0.5666667222976685,
"rewards/chosen": 42.98539352416992,
"rewards/margins": 1.0407254695892334,
"rewards/rejected": 41.94466781616211,
"step": 1060
},
{
"epoch": 0.6299676184869002,
"grad_norm": 2888.281846249585,
"learning_rate": 1.8100282700391615e-07,
"logits/chosen": 5474.8203125,
"logits/rejected": 5022.8955078125,
"logps/chosen": -238.95706176757812,
"logps/rejected": -246.790283203125,
"loss": 473.3039,
"rewards/accuracies": 0.5333333611488342,
"rewards/chosen": 45.00455093383789,
"rewards/margins": -1.4118093252182007,
"rewards/rejected": 46.416358947753906,
"step": 1070
},
{
"epoch": 0.6358551663232264,
"grad_norm": 2919.6730132023235,
"learning_rate": 1.7607735209627948e-07,
"logits/chosen": 6066.033203125,
"logits/rejected": 5340.1376953125,
"logps/chosen": -255.93374633789062,
"logps/rejected": -219.3169708251953,
"loss": 442.0081,
"rewards/accuracies": 0.5666666626930237,
"rewards/chosen": 59.42621994018555,
"rewards/margins": 10.818987846374512,
"rewards/rejected": 48.60723114013672,
"step": 1080
},
{
"epoch": 0.6417427141595525,
"grad_norm": 3061.31389303788,
"learning_rate": 1.7118312468693435e-07,
"logits/chosen": 5364.8857421875,
"logits/rejected": 4527.67236328125,
"logps/chosen": -265.9931945800781,
"logps/rejected": -206.59375,
"loss": 456.8328,
"rewards/accuracies": 0.5583333373069763,
"rewards/chosen": 45.197242736816406,
"rewards/margins": 4.963152885437012,
"rewards/rejected": 40.23408889770508,
"step": 1090
},
{
"epoch": 0.6476302619958787,
"grad_norm": 2675.5740514632685,
"learning_rate": 1.6632221359196007e-07,
"logits/chosen": 5273.853515625,
"logits/rejected": 4304.39306640625,
"logps/chosen": -215.75680541992188,
"logps/rejected": -205.96499633789062,
"loss": 450.398,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 47.502655029296875,
"rewards/margins": -0.780138373374939,
"rewards/rejected": 48.28279113769531,
"step": 1100
},
{
"epoch": 0.6535178098322049,
"grad_norm": 2656.567276511366,
"learning_rate": 1.614966735444519e-07,
"logits/chosen": 5670.87890625,
"logits/rejected": 4776.35400390625,
"logps/chosen": -263.34454345703125,
"logps/rejected": -211.91018676757812,
"loss": 443.2492,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 47.74077224731445,
"rewards/margins": 4.260870933532715,
"rewards/rejected": 43.47990036010742,
"step": 1110
},
{
"epoch": 0.659405357668531,
"grad_norm": 2727.588533971273,
"learning_rate": 1.567085443259743e-07,
"logits/chosen": 5641.5869140625,
"logits/rejected": 4489.1552734375,
"logps/chosen": -251.7860870361328,
"logps/rejected": -210.0601348876953,
"loss": 473.6551,
"rewards/accuracies": 0.6083333492279053,
"rewards/chosen": 48.41345977783203,
"rewards/margins": 0.43558159470558167,
"rewards/rejected": 47.977882385253906,
"step": 1120
},
{
"epoch": 0.6652929055048572,
"grad_norm": 2815.3853893624587,
"learning_rate": 1.5195984990433436e-07,
"logits/chosen": 5388.6455078125,
"logits/rejected": 5199.6357421875,
"logps/chosen": -247.1171875,
"logps/rejected": -233.29345703125,
"loss": 436.6213,
"rewards/accuracies": 0.5583333373069763,
"rewards/chosen": 45.90008544921875,
"rewards/margins": 3.9734444618225098,
"rewards/rejected": 41.926639556884766,
"step": 1130
},
{
"epoch": 0.6711804533411834,
"grad_norm": 2726.3125711774583,
"learning_rate": 1.4725259757803982e-07,
"logits/chosen": 6462.54296875,
"logits/rejected": 5156.8828125,
"logps/chosen": -273.95050048828125,
"logps/rejected": -233.63723754882812,
"loss": 469.2894,
"rewards/accuracies": 0.5666666626930237,
"rewards/chosen": 49.0769157409668,
"rewards/margins": 1.6615930795669556,
"rewards/rejected": 47.41532516479492,
"step": 1140
},
{
"epoch": 0.6770680011775095,
"grad_norm": 2654.672736980306,
"learning_rate": 1.4258877712780331e-07,
"logits/chosen": 5129.0439453125,
"logits/rejected": 3892.001953125,
"logps/chosen": -231.34765625,
"logps/rejected": -182.238037109375,
"loss": 432.7688,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 45.699745178222656,
"rewards/margins": 3.295746326446533,
"rewards/rejected": 42.40399932861328,
"step": 1150
},
{
"epoch": 0.6829555490138357,
"grad_norm": 2482.74159412,
"learning_rate": 1.3797035997545142e-07,
"logits/chosen": 4968.6845703125,
"logits/rejected": 4853.14501953125,
"logps/chosen": -205.07754516601562,
"logps/rejected": -228.11337280273438,
"loss": 420.7995,
"rewards/accuracies": 0.5166667103767395,
"rewards/chosen": 44.83478546142578,
"rewards/margins": -1.369265079498291,
"rewards/rejected": 46.20404815673828,
"step": 1160
},
{
"epoch": 0.6888430968501619,
"grad_norm": 2542.0640885944986,
"learning_rate": 1.333992983505939e-07,
"logits/chosen": 5638.7607421875,
"logits/rejected": 5253.87451171875,
"logps/chosen": -258.0821228027344,
"logps/rejected": -242.6870880126953,
"loss": 496.0406,
"rewards/accuracies": 0.5333333611488342,
"rewards/chosen": 49.918190002441406,
"rewards/margins": 5.044798851013184,
"rewards/rejected": 44.873390197753906,
"step": 1170
},
{
"epoch": 0.694730644686488,
"grad_norm": 2840.4883093419694,
"learning_rate": 1.2887752446540616e-07,
"logits/chosen": 5160.63232421875,
"logits/rejected": 4702.83837890625,
"logps/chosen": -234.97921752929688,
"logps/rejected": -211.44931030273438,
"loss": 454.1117,
"rewards/accuracies": 0.42500004172325134,
"rewards/chosen": 39.74894332885742,
"rewards/margins": -7.0438995361328125,
"rewards/rejected": 46.792850494384766,
"step": 1180
},
{
"epoch": 0.7006181925228142,
"grad_norm": 2534.3268190191707,
"learning_rate": 1.244069496978726e-07,
"logits/chosen": 5515.9306640625,
"logits/rejected": 5091.92578125,
"logps/chosen": -253.8005828857422,
"logps/rejected": -215.34494018554688,
"loss": 455.0434,
"rewards/accuracies": 0.5416666269302368,
"rewards/chosen": 41.3018684387207,
"rewards/margins": -2.4822511672973633,
"rewards/rejected": 43.78411865234375,
"step": 1190
},
{
"epoch": 0.7065057403591404,
"grad_norm": 2216.18831688337,
"learning_rate": 1.1998946378383697e-07,
"logits/chosen": 4838.0625,
"logits/rejected": 4621.58837890625,
"logps/chosen": -195.82528686523438,
"logps/rejected": -200.3052520751953,
"loss": 406.3944,
"rewards/accuracies": 0.5333333611488342,
"rewards/chosen": 49.4088134765625,
"rewards/margins": 6.762750148773193,
"rewards/rejected": 42.64606857299805,
"step": 1200
},
{
"epoch": 0.7123932881954665,
"grad_norm": 2925.190203960196,
"learning_rate": 1.1562693401820092e-07,
"logits/chosen": 5704.892578125,
"logits/rejected": 4735.81982421875,
"logps/chosen": -250.4612274169922,
"logps/rejected": -221.1834716796875,
"loss": 475.8371,
"rewards/accuracies": 0.5416666865348816,
"rewards/chosen": 49.47187423706055,
"rewards/margins": 1.9925483465194702,
"rewards/rejected": 47.47932815551758,
"step": 1210
},
{
"epoch": 0.7182808360317927,
"grad_norm": 2740.5903742398273,
"learning_rate": 1.113212044656087e-07,
"logits/chosen": 5618.9365234375,
"logits/rejected": 4587.16064453125,
"logps/chosen": -237.82760620117188,
"logps/rejected": -221.23837280273438,
"loss": 487.2423,
"rewards/accuracies": 0.6333333253860474,
"rewards/chosen": 57.44464874267578,
"rewards/margins": 8.748071670532227,
"rewards/rejected": 48.696571350097656,
"step": 1220
},
{
"epoch": 0.7241683838681189,
"grad_norm": 2501.9000780368424,
"learning_rate": 1.0707409518095079e-07,
"logits/chosen": 5306.55322265625,
"logits/rejected": 4526.52197265625,
"logps/chosen": -240.0721893310547,
"logps/rejected": -203.85665893554688,
"loss": 449.9376,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 51.15095901489258,
"rewards/margins": 12.58459758758545,
"rewards/rejected": 38.56635665893555,
"step": 1230
},
{
"epoch": 0.730055931704445,
"grad_norm": 2940.399273853818,
"learning_rate": 1.028874014400172e-07,
"logits/chosen": 5354.55908203125,
"logits/rejected": 4197.5888671875,
"logps/chosen": -245.00277709960938,
"logps/rejected": -194.13980102539062,
"loss": 468.1962,
"rewards/accuracies": 0.5250000357627869,
"rewards/chosen": 46.04726791381836,
"rewards/margins": -0.03993086889386177,
"rewards/rejected": 46.08720016479492,
"step": 1240
},
{
"epoch": 0.7359434795407712,
"grad_norm": 3023.1812955166797,
"learning_rate": 9.876289298062476e-08,
"logits/chosen": 5727.68603515625,
"logits/rejected": 4590.14453125,
"logps/chosen": -241.8575897216797,
"logps/rejected": -218.59616088867188,
"loss": 456.2886,
"rewards/accuracies": 0.5166667103767395,
"rewards/chosen": 47.04845428466797,
"rewards/margins": 0.41756492853164673,
"rewards/rejected": 46.630889892578125,
"step": 1250
},
{
"epoch": 0.7418310273770974,
"grad_norm": 2636.2027748224828,
"learning_rate": 9.470231325453956e-08,
"logits/chosen": 5583.41552734375,
"logits/rejected": 4478.337890625,
"logps/chosen": -221.7116241455078,
"logps/rejected": -211.3160858154297,
"loss": 433.6968,
"rewards/accuracies": 0.5333333015441895,
"rewards/chosen": 45.1245002746582,
"rewards/margins": -2.6019275188446045,
"rewards/rejected": 47.7264289855957,
"step": 1260
},
{
"epoch": 0.7477185752134237,
"grad_norm": 2662.2777753772584,
"learning_rate": 9.070737869051043e-08,
"logits/chosen": 5027.3095703125,
"logits/rejected": 4796.9765625,
"logps/chosen": -228.14697265625,
"logps/rejected": -206.49560546875,
"loss": 417.4956,
"rewards/accuracies": 0.5083333849906921,
"rewards/chosen": 46.9410285949707,
"rewards/margins": 2.322603464126587,
"rewards/rejected": 44.61842727661133,
"step": 1270
},
{
"epoch": 0.7536061230497498,
"grad_norm": 2745.160316396736,
"learning_rate": 8.67797779687254e-08,
"logits/chosen": 5756.96875,
"logits/rejected": 4928.97900390625,
"logps/chosen": -264.42279052734375,
"logps/rejected": -216.61502075195312,
"loss": 462.2225,
"rewards/accuracies": 0.6416667103767395,
"rewards/chosen": 49.881675720214844,
"rewards/margins": 8.946965217590332,
"rewards/rejected": 40.93471145629883,
"step": 1280
},
{
"epoch": 0.759493670886076,
"grad_norm": 2914.979019476111,
"learning_rate": 8.292117130699766e-08,
"logits/chosen": 5330.68212890625,
"logits/rejected": 4864.1455078125,
"logps/chosen": -260.3185119628906,
"logps/rejected": -222.41830444335938,
"loss": 459.8443,
"rewards/accuracies": 0.6250000596046448,
"rewards/chosen": 46.853370666503906,
"rewards/margins": 5.266936302185059,
"rewards/rejected": 41.58643341064453,
"step": 1290
},
{
"epoch": 0.7653812187224022,
"grad_norm": 2681.0728179420385,
"learning_rate": 7.913318975898237e-08,
"logits/chosen": 5588.1552734375,
"logits/rejected": 3932.55126953125,
"logps/chosen": -226.26220703125,
"logps/rejected": -192.8434600830078,
"loss": 444.3821,
"rewards/accuracies": 0.6416667103767395,
"rewards/chosen": 50.132713317871094,
"rewards/margins": 10.740438461303711,
"rewards/rejected": 39.392276763916016,
"step": 1300
},
{
"epoch": 0.7712687665587283,
"grad_norm": 2644.708915509074,
"learning_rate": 7.541743452472193e-08,
"logits/chosen": 5802.92822265625,
"logits/rejected": 5005.15478515625,
"logps/chosen": -258.23095703125,
"logps/rejected": -227.35104370117188,
"loss": 470.9071,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 44.41248321533203,
"rewards/margins": 1.734694242477417,
"rewards/rejected": 42.67778778076172,
"step": 1310
},
{
"epoch": 0.7771563143950545,
"grad_norm": 2760.1370388445303,
"learning_rate": 7.177547627380987e-08,
"logits/chosen": 5510.3876953125,
"logits/rejected": 4903.322265625,
"logps/chosen": -229.8451385498047,
"logps/rejected": -202.1870574951172,
"loss": 435.5435,
"rewards/accuracies": 0.5833333730697632,
"rewards/chosen": 44.152503967285156,
"rewards/margins": 1.354308843612671,
"rewards/rejected": 42.79819869995117,
"step": 1320
},
{
"epoch": 0.7830438622313807,
"grad_norm": 2614.2018106928704,
"learning_rate": 6.820885448146041e-08,
"logits/chosen": 6023.060546875,
"logits/rejected": 5090.50537109375,
"logps/chosen": -282.25433349609375,
"logps/rejected": -261.2765197753906,
"loss": 499.2624,
"rewards/accuracies": 0.5583333373069763,
"rewards/chosen": 52.391563415527344,
"rewards/margins": 6.237324237823486,
"rewards/rejected": 46.15424346923828,
"step": 1330
},
{
"epoch": 0.7889314100677068,
"grad_norm": 2597.7263605767535,
"learning_rate": 6.471907677776426e-08,
"logits/chosen": 5775.0126953125,
"logits/rejected": 4832.48876953125,
"logps/chosen": -239.5498809814453,
"logps/rejected": -222.05859375,
"loss": 468.5632,
"rewards/accuracies": 0.5083333253860474,
"rewards/chosen": 46.0155143737793,
"rewards/margins": -3.5307693481445312,
"rewards/rejected": 49.54628372192383,
"step": 1340
},
{
"epoch": 0.794818957904033,
"grad_norm": 2640.8038388162845,
"learning_rate": 6.13076183104052e-08,
"logits/chosen": 4657.62841796875,
"logits/rejected": 4134.35107421875,
"logps/chosen": -207.19058227539062,
"logps/rejected": -190.02688598632812,
"loss": 449.3236,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 42.933380126953125,
"rewards/margins": -1.6155602931976318,
"rewards/rejected": 44.54894256591797,
"step": 1350
},
{
"epoch": 0.8007065057403592,
"grad_norm": 2360.3636823333177,
"learning_rate": 5.797592112110733e-08,
"logits/chosen": 5882.80078125,
"logits/rejected": 4898.4326171875,
"logps/chosen": -250.59024047851562,
"logps/rejected": -225.48971557617188,
"loss": 450.8052,
"rewards/accuracies": 0.5083333253860474,
"rewards/chosen": 56.6795768737793,
"rewards/margins": 4.181834697723389,
"rewards/rejected": 52.49774169921875,
"step": 1360
},
{
"epoch": 0.8065940535766853,
"grad_norm": 2871.53542970714,
"learning_rate": 5.4725393536076106e-08,
"logits/chosen": 5734.0810546875,
"logits/rejected": 5196.62255859375,
"logps/chosen": -246.6741485595703,
"logps/rejected": -223.33871459960938,
"loss": 467.8894,
"rewards/accuracies": 0.5833333730697632,
"rewards/chosen": 46.92644119262695,
"rewards/margins": 2.722069501876831,
"rewards/rejected": 44.204368591308594,
"step": 1370
},
{
"epoch": 0.8124816014130115,
"grad_norm": 2506.469471261304,
"learning_rate": 5.1557409570691854e-08,
"logits/chosen": 5755.55615234375,
"logits/rejected": 4602.2119140625,
"logps/chosen": -258.3332214355469,
"logps/rejected": -208.54736328125,
"loss": 459.4171,
"rewards/accuracies": 0.6000000834465027,
"rewards/chosen": 61.886474609375,
"rewards/margins": 9.517420768737793,
"rewards/rejected": 52.369056701660156,
"step": 1380
},
{
"epoch": 0.8183691492493377,
"grad_norm": 2795.549014141379,
"learning_rate": 4.84733083487055e-08,
"logits/chosen": 5162.1298828125,
"logits/rejected": 4927.0302734375,
"logps/chosen": -224.22134399414062,
"logps/rejected": -214.3109130859375,
"loss": 451.7745,
"rewards/accuracies": 0.5333333611488342,
"rewards/chosen": 50.72800064086914,
"rewards/margins": 5.281952857971191,
"rewards/rejected": 45.446044921875,
"step": 1390
},
{
"epoch": 0.8242566970856638,
"grad_norm": 3099.872127662192,
"learning_rate": 4.547439353618421e-08,
"logits/chosen": 5943.78369140625,
"logits/rejected": 4675.40234375,
"logps/chosen": -254.889892578125,
"logps/rejected": -221.0095977783203,
"loss": 476.1811,
"rewards/accuracies": 0.5416666269302368,
"rewards/chosen": 47.358482360839844,
"rewards/margins": 1.6082299947738647,
"rewards/rejected": 45.75025177001953,
"step": 1400
},
{
"epoch": 0.83014424492199,
"grad_norm": 2768.108043645052,
"learning_rate": 4.2561932790444594e-08,
"logits/chosen": 5309.0048828125,
"logits/rejected": 4402.3564453125,
"logps/chosen": -220.2199249267578,
"logps/rejected": -178.3611602783203,
"loss": 446.4285,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 43.55487823486328,
"rewards/margins": -0.8950740694999695,
"rewards/rejected": 44.449951171875,
"step": 1410
},
{
"epoch": 0.8360317927583162,
"grad_norm": 2416.6350844671583,
"learning_rate": 3.973715722420726e-08,
"logits/chosen": 5374.509765625,
"logits/rejected": 5026.10693359375,
"logps/chosen": -238.96817016601562,
"logps/rejected": -227.57754516601562,
"loss": 473.0174,
"rewards/accuracies": 0.5333333611488342,
"rewards/chosen": 46.53357696533203,
"rewards/margins": -1.398461103439331,
"rewards/rejected": 47.932037353515625,
"step": 1420
},
{
"epoch": 0.8419193405946424,
"grad_norm": 2571.15473257983,
"learning_rate": 3.700126088519892e-08,
"logits/chosen": 5649.8388671875,
"logits/rejected": 5096.2265625,
"logps/chosen": -219.5747833251953,
"logps/rejected": -230.01559448242188,
"loss": 464.0558,
"rewards/accuracies": 0.5916666984558105,
"rewards/chosen": 55.169036865234375,
"rewards/margins": 8.111885070800781,
"rewards/rejected": 47.05714416503906,
"step": 1430
},
{
"epoch": 0.8478068884309685,
"grad_norm": 2500.922610681479,
"learning_rate": 3.435540025142197e-08,
"logits/chosen": 5119.20068359375,
"logits/rejected": 4506.8935546875,
"logps/chosen": -232.45101928710938,
"logps/rejected": -196.38507080078125,
"loss": 460.6399,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 42.9712028503418,
"rewards/margins": -2.353074312210083,
"rewards/rejected": 45.324283599853516,
"step": 1440
},
{
"epoch": 0.8536944362672947,
"grad_norm": 3072.586081400429,
"learning_rate": 3.1800693742305065e-08,
"logits/chosen": 5674.4140625,
"logits/rejected": 4546.13818359375,
"logps/chosen": -232.2156524658203,
"logps/rejected": -209.8252716064453,
"loss": 439.2195,
"rewards/accuracies": 0.5916666388511658,
"rewards/chosen": 45.56594467163086,
"rewards/margins": 1.6826508045196533,
"rewards/rejected": 43.88329315185547,
"step": 1450
},
{
"epoch": 0.8595819841036209,
"grad_norm": 2555.6929882018603,
"learning_rate": 2.9338221245941236e-08,
"logits/chosen": 6251.90380859375,
"logits/rejected": 5486.14990234375,
"logps/chosen": -257.6957092285156,
"logps/rejected": -247.49734497070312,
"loss": 445.6683,
"rewards/accuracies": 0.5833333134651184,
"rewards/chosen": 53.01552200317383,
"rewards/margins": 5.817216396331787,
"rewards/rejected": 47.198307037353516,
"step": 1460
},
{
"epoch": 0.865469531939947,
"grad_norm": 2526.9193833585073,
"learning_rate": 2.6969023662613472e-08,
"logits/chosen": 5537.1455078125,
"logits/rejected": 5597.6640625,
"logps/chosen": -249.64730834960938,
"logps/rejected": -238.49862670898438,
"loss": 453.4951,
"rewards/accuracies": 0.5416666269302368,
"rewards/chosen": 49.131866455078125,
"rewards/margins": -3.0847103595733643,
"rewards/rejected": 52.216575622558594,
"step": 1470
},
{
"epoch": 0.8713570797762732,
"grad_norm": 2814.258565256423,
"learning_rate": 2.4694102464800663e-08,
"logits/chosen": 5417.5751953125,
"logits/rejected": 4861.5048828125,
"logps/chosen": -246.10733032226562,
"logps/rejected": -221.135986328125,
"loss": 476.8938,
"rewards/accuracies": 0.49166664481163025,
"rewards/chosen": 44.802181243896484,
"rewards/margins": -0.2635299563407898,
"rewards/rejected": 45.065711975097656,
"step": 1480
},
{
"epoch": 0.8772446276125994,
"grad_norm": 2719.1332660328358,
"learning_rate": 2.2514419273849673e-08,
"logits/chosen": 5584.01953125,
"logits/rejected": 4642.46337890625,
"logps/chosen": -224.6255340576172,
"logps/rejected": -216.149658203125,
"loss": 454.43,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 45.871826171875,
"rewards/margins": 5.6382737159729,
"rewards/rejected": 40.23354721069336,
"step": 1490
},
{
"epoch": 0.8831321754489255,
"grad_norm": 2562.1914029050754,
"learning_rate": 2.0430895453492942e-08,
"logits/chosen": 5416.74951171875,
"logits/rejected": 4298.4677734375,
"logps/chosen": -233.50082397460938,
"logps/rejected": -192.23582458496094,
"loss": 446.0934,
"rewards/accuracies": 0.5916666984558105,
"rewards/chosen": 47.021453857421875,
"rewards/margins": 1.942139983177185,
"rewards/rejected": 45.07931900024414,
"step": 1500
},
{
"epoch": 0.8890197232852517,
"grad_norm": 2849.2417749033407,
"learning_rate": 1.8444411720383107e-08,
"logits/chosen": 5647.05517578125,
"logits/rejected": 4601.66015625,
"logps/chosen": -243.07870483398438,
"logps/rejected": -205.91433715820312,
"loss": 444.2926,
"rewards/accuracies": 0.5916666388511658,
"rewards/chosen": 53.23212432861328,
"rewards/margins": 7.5799431800842285,
"rewards/rejected": 45.652183532714844,
"step": 1510
},
{
"epoch": 0.8949072711215779,
"grad_norm": 2609.783375316123,
"learning_rate": 1.655580777180937e-08,
"logits/chosen": 4866.36865234375,
"logits/rejected": 4648.73046875,
"logps/chosen": -211.3268280029297,
"logps/rejected": -216.62088012695312,
"loss": 431.3077,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 45.19532012939453,
"rewards/margins": 3.8858094215393066,
"rewards/rejected": 41.309513092041016,
"step": 1520
},
{
"epoch": 0.900794818957904,
"grad_norm": 2858.325700718927,
"learning_rate": 1.4765881930752982e-08,
"logits/chosen": 5512.6796875,
"logits/rejected": 5062.978515625,
"logps/chosen": -251.55654907226562,
"logps/rejected": -247.4755401611328,
"loss": 475.6445,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 47.72459030151367,
"rewards/margins": -0.3327750265598297,
"rewards/rejected": 48.05736541748047,
"step": 1530
},
{
"epoch": 0.9066823667942302,
"grad_norm": 2696.9143449356766,
"learning_rate": 1.3075390808431897e-08,
"logits/chosen": 4964.9306640625,
"logits/rejected": 4029.17822265625,
"logps/chosen": -210.2575225830078,
"logps/rejected": -181.75967407226562,
"loss": 427.3603,
"rewards/accuracies": 0.5916666984558105,
"rewards/chosen": 46.60421371459961,
"rewards/margins": 6.900620460510254,
"rewards/rejected": 39.70359420776367,
"step": 1540
},
{
"epoch": 0.9125699146305564,
"grad_norm": 2909.043218826057,
"learning_rate": 1.1485048984476997e-08,
"logits/chosen": 6264.9814453125,
"logits/rejected": 4926.421875,
"logps/chosen": -247.24398803710938,
"logps/rejected": -222.6241455078125,
"loss": 457.0212,
"rewards/accuracies": 0.491666704416275,
"rewards/chosen": 49.02656936645508,
"rewards/margins": 6.186914920806885,
"rewards/rejected": 42.83965301513672,
"step": 1550
},
{
"epoch": 0.9184574624668825,
"grad_norm": 2774.8159405504452,
"learning_rate": 9.995528704875633e-09,
"logits/chosen": 6105.7099609375,
"logits/rejected": 4611.01708984375,
"logps/chosen": -251.1758270263672,
"logps/rejected": -221.3173065185547,
"loss": 449.7213,
"rewards/accuracies": 0.6250000596046448,
"rewards/chosen": 50.6319465637207,
"rewards/margins": -3.3480277061462402,
"rewards/rejected": 53.9799690246582,
"step": 1560
},
{
"epoch": 0.9243450103032087,
"grad_norm": 2694.402825577869,
"learning_rate": 8.607459597809563e-09,
"logits/chosen": 5613.7509765625,
"logits/rejected": 4740.57275390625,
"logps/chosen": -254.7351837158203,
"logps/rejected": -211.7694549560547,
"loss": 451.4933,
"rewards/accuracies": 0.5666667222976685,
"rewards/chosen": 47.686885833740234,
"rewards/margins": 2.6942806243896484,
"rewards/rejected": 44.992610931396484,
"step": 1570
},
{
"epoch": 0.9302325581395349,
"grad_norm": 2701.2806646124445,
"learning_rate": 7.321428407507879e-09,
"logits/chosen": 5281.224609375,
"logits/rejected": 4749.42236328125,
"logps/chosen": -232.33102416992188,
"logps/rejected": -226.73635864257812,
"loss": 433.525,
"rewards/accuracies": 0.46666663885116577,
"rewards/chosen": 43.77975082397461,
"rewards/margins": -7.4785027503967285,
"rewards/rejected": 51.25825881958008,
"step": 1580
},
{
"epoch": 0.936120105975861,
"grad_norm": 2723.54992738993,
"learning_rate": 6.137978746226846e-09,
"logits/chosen": 5657.05322265625,
"logits/rejected": 4991.169921875,
"logps/chosen": -242.9114227294922,
"logps/rejected": -208.00692749023438,
"loss": 448.1441,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 47.34257888793945,
"rewards/margins": 0.23659400641918182,
"rewards/rejected": 47.105987548828125,
"step": 1590
},
{
"epoch": 0.9420076538121872,
"grad_norm": 2634.7142241290358,
"learning_rate": 5.057610864462353e-09,
"logits/chosen": 5467.34619140625,
"logits/rejected": 4431.1748046875,
"logps/chosen": -226.5965576171875,
"logps/rejected": -206.7377166748047,
"loss": 449.6918,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 53.428863525390625,
"rewards/margins": 8.42198371887207,
"rewards/rejected": 45.00688552856445,
"step": 1600
},
{
"epoch": 0.9478952016485134,
"grad_norm": 3525.9973883937264,
"learning_rate": 4.080781439491199e-09,
"logits/chosen": 5627.1240234375,
"logits/rejected": 5125.11328125,
"logps/chosen": -238.8662109375,
"logps/rejected": -252.44711303710938,
"loss": 494.8671,
"rewards/accuracies": 0.4833333492279053,
"rewards/chosen": 46.18857955932617,
"rewards/margins": -9.739079475402832,
"rewards/rejected": 55.92766189575195,
"step": 1610
},
{
"epoch": 0.9537827494848395,
"grad_norm": 3049.8501688183246,
"learning_rate": 3.207903382331262e-09,
"logits/chosen": 5722.05078125,
"logits/rejected": 4703.1142578125,
"logps/chosen": -247.2072296142578,
"logps/rejected": -230.0430450439453,
"loss": 493.1144,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 53.449073791503906,
"rewards/margins": 6.262479305267334,
"rewards/rejected": 47.186588287353516,
"step": 1620
},
{
"epoch": 0.9596702973211657,
"grad_norm": 2622.2573449101515,
"learning_rate": 2.4393456632016972e-09,
"logits/chosen": 5326.9599609375,
"logits/rejected": 4888.16845703125,
"logps/chosen": -240.6510009765625,
"logps/rejected": -242.0840301513672,
"loss": 461.7448,
"rewards/accuracies": 0.5666667222976685,
"rewards/chosen": 45.62113571166992,
"rewards/margins": -0.24436044692993164,
"rewards/rejected": 45.86549758911133,
"step": 1630
},
{
"epoch": 0.9655578451574919,
"grad_norm": 2788.090741451844,
"learning_rate": 1.7754331555573653e-09,
"logits/chosen": 5645.78515625,
"logits/rejected": 4626.64208984375,
"logps/chosen": -252.6411895751953,
"logps/rejected": -214.1852264404297,
"loss": 458.1211,
"rewards/accuracies": 0.5500000715255737,
"rewards/chosen": 48.367576599121094,
"rewards/margins": 1.717292070388794,
"rewards/rejected": 46.65028762817383,
"step": 1640
},
{
"epoch": 0.971445392993818,
"grad_norm": 2566.0122939348357,
"learning_rate": 1.216446498763013e-09,
"logits/chosen": 4668.9580078125,
"logits/rejected": 3859.90478515625,
"logps/chosen": -201.18142700195312,
"logps/rejected": -183.87576293945312,
"loss": 451.7181,
"rewards/accuracies": 0.6083333492279053,
"rewards/chosen": 44.933570861816406,
"rewards/margins": 4.844240665435791,
"rewards/rejected": 40.089332580566406,
"step": 1650
},
{
"epoch": 0.9773329408301442,
"grad_norm": 2484.1333346476968,
"learning_rate": 7.626219794655553e-10,
"logits/chosen": 5885.1611328125,
"logits/rejected": 4983.9033203125,
"logps/chosen": -217.813720703125,
"logps/rejected": -211.51522827148438,
"loss": 455.2822,
"rewards/accuracies": 0.5583333373069763,
"rewards/chosen": 47.337215423583984,
"rewards/margins": 4.239817142486572,
"rewards/rejected": 43.09739303588867,
"step": 1660
},
{
"epoch": 0.9832204886664704,
"grad_norm": 2771.044324224387,
"learning_rate": 4.1415143171436017e-10,
"logits/chosen": 5878.18359375,
"logits/rejected": 5294.08154296875,
"logps/chosen": -278.19500732421875,
"logps/rejected": -239.20901489257812,
"loss": 478.5493,
"rewards/accuracies": 0.5500000715255737,
"rewards/chosen": 50.47035598754883,
"rewards/margins": 2.3384971618652344,
"rewards/rejected": 48.13185501098633,
"step": 1670
},
{
"epoch": 0.9891080365027966,
"grad_norm": 2385.4573040406094,
"learning_rate": 1.7118215587214047e-10,
"logits/chosen": 5687.02392578125,
"logits/rejected": 4925.9482421875,
"logps/chosen": -242.67739868164062,
"logps/rejected": -220.5124969482422,
"loss": 452.0058,
"rewards/accuracies": 0.533333420753479,
"rewards/chosen": 46.42241668701172,
"rewards/margins": -4.088390350341797,
"rewards/rejected": 50.51081085205078,
"step": 1680
},
{
"epoch": 0.9949955843391227,
"grad_norm": 3470.232158570286,
"learning_rate": 3.3816856350177284e-11,
"logits/chosen": 5894.1396484375,
"logits/rejected": 4498.638671875,
"logps/chosen": -261.60467529296875,
"logps/rejected": -218.1473388671875,
"loss": 473.7954,
"rewards/accuracies": 0.5750000476837158,
"rewards/chosen": 50.67768478393555,
"rewards/margins": 9.646495819091797,
"rewards/rejected": 41.03118896484375,
"step": 1690
},
{
"epoch": 0.9997056226081837,
"step": 1698,
"total_flos": 0.0,
"train_loss": 463.5738731716772,
"train_runtime": 22506.5567,
"train_samples_per_second": 2.716,
"train_steps_per_second": 0.075
}
],
"logging_steps": 10,
"max_steps": 1698,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}