llama-3.1-8b-instruct-agg-judge / trainer_state.json
simonycl's picture
Upload folder using huggingface_hub
89b832f verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9982851866508377,
"eval_steps": 400,
"global_step": 473,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00211053950666139,
"grad_norm": 5.643460436957748,
"learning_rate": 1.0416666666666666e-08,
"logits/chosen": -1.5622574090957642,
"logits/rejected": -2.016603946685791,
"logps/chosen": -279.929443359375,
"logps/rejected": -249.6509552001953,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.010552697533306952,
"grad_norm": 4.760670706167096,
"learning_rate": 5.208333333333333e-08,
"logits/chosen": -1.6410560607910156,
"logits/rejected": -1.8854162693023682,
"logps/chosen": -306.70123291015625,
"logps/rejected": -286.2392883300781,
"loss": 0.6934,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.0017719048773869872,
"rewards/margins": -0.0009851222857832909,
"rewards/rejected": -0.0007867825916036963,
"step": 5
},
{
"epoch": 0.021105395066613904,
"grad_norm": 4.237628799563217,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -1.4761555194854736,
"logits/rejected": -1.7796385288238525,
"logps/chosen": -290.88739013671875,
"logps/rejected": -265.3614196777344,
"loss": 0.6931,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.001178879290819168,
"rewards/margins": 0.0011062298435717821,
"rewards/rejected": -0.00228510913439095,
"step": 10
},
{
"epoch": 0.031658092599920855,
"grad_norm": 5.575235759782868,
"learning_rate": 1.5624999999999999e-07,
"logits/chosen": -1.5923292636871338,
"logits/rejected": -1.9355911016464233,
"logps/chosen": -293.08807373046875,
"logps/rejected": -261.4955139160156,
"loss": 0.6929,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -8.010577585082501e-05,
"rewards/margins": -0.0003216963086742908,
"rewards/rejected": 0.00024159046006388962,
"step": 15
},
{
"epoch": 0.04221079013322781,
"grad_norm": 5.446047742742675,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -1.5667310953140259,
"logits/rejected": -2.014115810394287,
"logps/chosen": -273.8595275878906,
"logps/rejected": -235.01364135742188,
"loss": 0.6923,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0028162619564682245,
"rewards/margins": 0.0022979697678238153,
"rewards/rejected": 0.0005182920140214264,
"step": 20
},
{
"epoch": 0.052763487666534756,
"grad_norm": 5.373521803666822,
"learning_rate": 2.604166666666667e-07,
"logits/chosen": -1.6274404525756836,
"logits/rejected": -1.875451683998108,
"logps/chosen": -279.4980163574219,
"logps/rejected": -255.500244140625,
"loss": 0.691,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.011563817039132118,
"rewards/margins": 0.003255133982747793,
"rewards/rejected": 0.008308682590723038,
"step": 25
},
{
"epoch": 0.06331618519984171,
"grad_norm": 6.225478720213553,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -1.3949791193008423,
"logits/rejected": -1.7053276300430298,
"logps/chosen": -295.1358337402344,
"logps/rejected": -266.3870849609375,
"loss": 0.6882,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.02798023819923401,
"rewards/margins": 0.00888301245868206,
"rewards/rejected": 0.0190972238779068,
"step": 30
},
{
"epoch": 0.07386888273314866,
"grad_norm": 5.237638647535282,
"learning_rate": 3.645833333333333e-07,
"logits/chosen": -1.6196448802947998,
"logits/rejected": -1.9479618072509766,
"logps/chosen": -296.2655029296875,
"logps/rejected": -268.84454345703125,
"loss": 0.684,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.05069383978843689,
"rewards/margins": 0.019313272088766098,
"rewards/rejected": 0.03138056769967079,
"step": 35
},
{
"epoch": 0.08442158026645562,
"grad_norm": 4.876669460762773,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -1.5888957977294922,
"logits/rejected": -1.862489938735962,
"logps/chosen": -298.8040466308594,
"logps/rejected": -281.5601501464844,
"loss": 0.681,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.07323700189590454,
"rewards/margins": 0.02310621738433838,
"rewards/rejected": 0.05013079196214676,
"step": 40
},
{
"epoch": 0.09497427779976256,
"grad_norm": 4.155679871417378,
"learning_rate": 4.6874999999999996e-07,
"logits/chosen": -1.6918067932128906,
"logits/rejected": -2.00124454498291,
"logps/chosen": -278.1552734375,
"logps/rejected": -257.6329345703125,
"loss": 0.6733,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.06014139577746391,
"rewards/margins": 0.035790883004665375,
"rewards/rejected": 0.024350514635443687,
"step": 45
},
{
"epoch": 0.10552697533306951,
"grad_norm": 4.856245709560566,
"learning_rate": 4.999726797933858e-07,
"logits/chosen": -1.7646106481552124,
"logits/rejected": -1.9858261346817017,
"logps/chosen": -278.0591125488281,
"logps/rejected": -259.8578186035156,
"loss": 0.6681,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.03362422436475754,
"rewards/margins": 0.0627993568778038,
"rewards/rejected": -0.029175132513046265,
"step": 50
},
{
"epoch": 0.11607967286637647,
"grad_norm": 5.091156338266151,
"learning_rate": 4.99665396039775e-07,
"logits/chosen": -1.7584812641143799,
"logits/rejected": -2.0758414268493652,
"logps/chosen": -275.55548095703125,
"logps/rejected": -267.1745300292969,
"loss": 0.6557,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.016241051256656647,
"rewards/margins": 0.09382958710193634,
"rewards/rejected": -0.11007064580917358,
"step": 55
},
{
"epoch": 0.12663237039968342,
"grad_norm": 7.261447388148616,
"learning_rate": 4.99017099386437e-07,
"logits/chosen": -1.8382816314697266,
"logits/rejected": -2.1405653953552246,
"logps/chosen": -281.54827880859375,
"logps/rejected": -263.27294921875,
"loss": 0.6545,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.11135590076446533,
"rewards/margins": 0.09165789932012558,
"rewards/rejected": -0.20301377773284912,
"step": 60
},
{
"epoch": 0.13718506793299037,
"grad_norm": 5.4212641641464705,
"learning_rate": 4.980286753286194e-07,
"logits/chosen": -1.8312028646469116,
"logits/rejected": -2.171030282974243,
"logps/chosen": -287.714599609375,
"logps/rejected": -269.2692565917969,
"loss": 0.6518,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.1507767140865326,
"rewards/margins": 0.07915514707565308,
"rewards/rejected": -0.22993186116218567,
"step": 65
},
{
"epoch": 0.14773776546629733,
"grad_norm": 5.914411454935479,
"learning_rate": 4.967014739346915e-07,
"logits/chosen": -1.7997725009918213,
"logits/rejected": -2.1724910736083984,
"logps/chosen": -314.2709045410156,
"logps/rejected": -288.6246337890625,
"loss": 0.6402,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.12115994840860367,
"rewards/margins": 0.11401550471782684,
"rewards/rejected": -0.23517544567584991,
"step": 70
},
{
"epoch": 0.15829046299960428,
"grad_norm": 6.712529318578845,
"learning_rate": 4.950373080021136e-07,
"logits/chosen": -1.7687238454818726,
"logits/rejected": -2.1885459423065186,
"logps/chosen": -325.1200256347656,
"logps/rejected": -298.8721008300781,
"loss": 0.6297,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.15646745264530182,
"rewards/margins": 0.1352117955684662,
"rewards/rejected": -0.2916792631149292,
"step": 75
},
{
"epoch": 0.16884316053291124,
"grad_norm": 11.136062496010506,
"learning_rate": 4.930384505813737e-07,
"logits/chosen": -1.8805389404296875,
"logits/rejected": -2.243736982345581,
"logps/chosen": -311.088134765625,
"logps/rejected": -294.6890563964844,
"loss": 0.6359,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.28947150707244873,
"rewards/margins": 0.14183922111988068,
"rewards/rejected": -0.4313107430934906,
"step": 80
},
{
"epoch": 0.1793958580662182,
"grad_norm": 9.114819712911686,
"learning_rate": 4.907076318712738e-07,
"logits/chosen": -1.823948621749878,
"logits/rejected": -2.289140462875366,
"logps/chosen": -310.9908752441406,
"logps/rejected": -296.40277099609375,
"loss": 0.6303,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.3205372095108032,
"rewards/margins": 0.17761529982089996,
"rewards/rejected": -0.49815255403518677,
"step": 85
},
{
"epoch": 0.18994855559952512,
"grad_norm": 6.643890919509507,
"learning_rate": 4.88048035489807e-07,
"logits/chosen": -1.9663282632827759,
"logits/rejected": -2.3473124504089355,
"logps/chosen": -305.6136779785156,
"logps/rejected": -289.89471435546875,
"loss": 0.6202,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.31997618079185486,
"rewards/margins": 0.13845226168632507,
"rewards/rejected": -0.4584284722805023,
"step": 90
},
{
"epoch": 0.20050125313283207,
"grad_norm": 8.184472249085363,
"learning_rate": 4.85063294125718e-07,
"logits/chosen": -1.9098714590072632,
"logits/rejected": -2.2084691524505615,
"logps/chosen": -316.2163391113281,
"logps/rejected": -309.3311462402344,
"loss": 0.6176,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.3992615342140198,
"rewards/margins": 0.18614216148853302,
"rewards/rejected": -0.5854036211967468,
"step": 95
},
{
"epoch": 0.21105395066613902,
"grad_norm": 8.142734346752789,
"learning_rate": 4.817574845766874e-07,
"logits/chosen": -2.0933427810668945,
"logits/rejected": -2.4379730224609375,
"logps/chosen": -331.0671691894531,
"logps/rejected": -329.8666076660156,
"loss": 0.6142,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.5073726773262024,
"rewards/margins": 0.2520057260990143,
"rewards/rejected": -0.7593784332275391,
"step": 100
},
{
"epoch": 0.22160664819944598,
"grad_norm": 7.833427916953237,
"learning_rate": 4.781351221809166e-07,
"logits/chosen": -2.1106457710266113,
"logits/rejected": -2.3915274143218994,
"logps/chosen": -346.46337890625,
"logps/rejected": -337.7541198730469,
"loss": 0.6124,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.5495078563690186,
"rewards/margins": 0.17171132564544678,
"rewards/rejected": -0.7212191820144653,
"step": 105
},
{
"epoch": 0.23215934573275293,
"grad_norm": 7.733643763374119,
"learning_rate": 4.742011546497182e-07,
"logits/chosen": -1.9331356287002563,
"logits/rejected": -2.2653117179870605,
"logps/chosen": -344.3125,
"logps/rejected": -331.16632080078125,
"loss": 0.6061,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.4401538372039795,
"rewards/margins": 0.23562327027320862,
"rewards/rejected": -0.6757770776748657,
"step": 110
},
{
"epoch": 0.24271204326605988,
"grad_norm": 9.878124133877995,
"learning_rate": 4.6996095530953875e-07,
"logits/chosen": -2.1782004833221436,
"logits/rejected": -2.4763283729553223,
"logps/chosen": -324.13983154296875,
"logps/rejected": -319.3317565917969,
"loss": 0.5985,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5510643720626831,
"rewards/margins": 0.20593421161174774,
"rewards/rejected": -0.7569986581802368,
"step": 115
},
{
"epoch": 0.25326474079936684,
"grad_norm": 10.19310620269605,
"learning_rate": 4.654203157626399e-07,
"logits/chosen": -1.9924976825714111,
"logits/rejected": -2.3102524280548096,
"logps/chosen": -377.19183349609375,
"logps/rejected": -375.12823486328125,
"loss": 0.5964,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.7560365796089172,
"rewards/margins": 0.3029988408088684,
"rewards/rejected": -1.0590355396270752,
"step": 120
},
{
"epoch": 0.2638174383326738,
"grad_norm": 11.001033426114137,
"learning_rate": 4.605854379764673e-07,
"logits/chosen": -2.199047565460205,
"logits/rejected": -2.5438296794891357,
"logps/chosen": -374.6500549316406,
"logps/rejected": -363.5611572265625,
"loss": 0.5867,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.9937089681625366,
"rewards/margins": 0.28307586908340454,
"rewards/rejected": -1.2767850160598755,
"step": 125
},
{
"epoch": 0.27437013586598075,
"grad_norm": 12.37698546731958,
"learning_rate": 4.5546292581250857e-07,
"logits/chosen": -2.1308746337890625,
"logits/rejected": -2.4864110946655273,
"logps/chosen": -400.5669860839844,
"logps/rejected": -397.34454345703125,
"loss": 0.5933,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.2083604335784912,
"rewards/margins": 0.26188138127326965,
"rewards/rejected": -1.4702417850494385,
"step": 130
},
{
"epoch": 0.2849228333992877,
"grad_norm": 11.055505495866356,
"learning_rate": 4.5005977600621275e-07,
"logits/chosen": -2.0843119621276855,
"logits/rejected": -2.539513111114502,
"logps/chosen": -385.6966857910156,
"logps/rejected": -379.7543029785156,
"loss": 0.5772,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9928609728813171,
"rewards/margins": 0.3268323540687561,
"rewards/rejected": -1.3196933269500732,
"step": 135
},
{
"epoch": 0.29547553093259465,
"grad_norm": 15.017423788636298,
"learning_rate": 4.443833686102919e-07,
"logits/chosen": -2.218951940536499,
"logits/rejected": -2.4617791175842285,
"logps/chosen": -422.04803466796875,
"logps/rejected": -423.21685791015625,
"loss": 0.5756,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.2504339218139648,
"rewards/margins": 0.3247791528701782,
"rewards/rejected": -1.575213074684143,
"step": 140
},
{
"epoch": 0.3060282284659016,
"grad_norm": 13.104101042453381,
"learning_rate": 4.384414569144561e-07,
"logits/chosen": -2.239192485809326,
"logits/rejected": -2.4994874000549316,
"logps/chosen": -423.623046875,
"logps/rejected": -425.60546875,
"loss": 0.5866,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.5029816627502441,
"rewards/margins": 0.3298465609550476,
"rewards/rejected": -1.832828164100647,
"step": 145
},
{
"epoch": 0.31658092599920856,
"grad_norm": 9.51654888826691,
"learning_rate": 4.3224215685535287e-07,
"logits/chosen": -2.0407261848449707,
"logits/rejected": -2.337188720703125,
"logps/chosen": -426.2940979003906,
"logps/rejected": -424.5220642089844,
"loss": 0.5817,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.3242921829223633,
"rewards/margins": 0.31421297788619995,
"rewards/rejected": -1.638505220413208,
"step": 150
},
{
"epoch": 0.3271336235325155,
"grad_norm": 11.744189119101899,
"learning_rate": 4.2579393593117364e-07,
"logits/chosen": -2.0881667137145996,
"logits/rejected": -2.4598240852355957,
"logps/chosen": -373.4230041503906,
"logps/rejected": -372.4430847167969,
"loss": 0.5648,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0188493728637695,
"rewards/margins": 0.29292336106300354,
"rewards/rejected": -1.3117727041244507,
"step": 155
},
{
"epoch": 0.33768632106582247,
"grad_norm": 11.411495536339306,
"learning_rate": 4.191056016360699e-07,
"logits/chosen": -2.1164355278015137,
"logits/rejected": -2.3749523162841797,
"logps/chosen": -452.0877380371094,
"logps/rejected": -475.936767578125,
"loss": 0.5657,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.5110199451446533,
"rewards/margins": 0.45130714774131775,
"rewards/rejected": -1.962327241897583,
"step": 160
},
{
"epoch": 0.3482390185991294,
"grad_norm": 13.015188201271227,
"learning_rate": 4.121862894301754e-07,
"logits/chosen": -2.0862815380096436,
"logits/rejected": -2.4722859859466553,
"logps/chosen": -415.59368896484375,
"logps/rejected": -414.7337951660156,
"loss": 0.5574,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.2593889236450195,
"rewards/margins": 0.3909255266189575,
"rewards/rejected": -1.6503145694732666,
"step": 165
},
{
"epoch": 0.3587917161324364,
"grad_norm": 15.106508196254897,
"learning_rate": 4.050454502616667e-07,
"logits/chosen": -2.120917797088623,
"logits/rejected": -2.3543829917907715,
"logps/chosen": -464.19622802734375,
"logps/rejected": -488.60675048828125,
"loss": 0.5484,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.7536499500274658,
"rewards/margins": 0.46063175797462463,
"rewards/rejected": -2.2142815589904785,
"step": 170
},
{
"epoch": 0.36934441366574333,
"grad_norm": 17.660956835556952,
"learning_rate": 3.976928376579047e-07,
"logits/chosen": -2.117267608642578,
"logits/rejected": -2.336695432662964,
"logps/chosen": -491.78216552734375,
"logps/rejected": -518.7801513671875,
"loss": 0.5229,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.9079921245574951,
"rewards/margins": 0.5516217350959778,
"rewards/rejected": -2.459613800048828,
"step": 175
},
{
"epoch": 0.37989711119905023,
"grad_norm": 14.241710823955074,
"learning_rate": 3.9013849440328945e-07,
"logits/chosen": -2.169321060180664,
"logits/rejected": -2.405425786972046,
"logps/chosen": -436.4549865722656,
"logps/rejected": -458.5728454589844,
"loss": 0.5505,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.5903228521347046,
"rewards/margins": 0.4581179618835449,
"rewards/rejected": -2.048440933227539,
"step": 180
},
{
"epoch": 0.3904498087323572,
"grad_norm": 14.232999562557966,
"learning_rate": 3.8239273882202473e-07,
"logits/chosen": -2.1749088764190674,
"logits/rejected": -2.4840614795684814,
"logps/chosen": -479.7809143066406,
"logps/rejected": -491.2457580566406,
"loss": 0.5578,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.8379102945327759,
"rewards/margins": 0.42885318398475647,
"rewards/rejected": -2.266763210296631,
"step": 185
},
{
"epoch": 0.40100250626566414,
"grad_norm": 16.05324813627352,
"learning_rate": 3.7446615068452804e-07,
"logits/chosen": -2.2167088985443115,
"logits/rejected": -2.5488333702087402,
"logps/chosen": -488.9418029785156,
"logps/rejected": -518.4141845703125,
"loss": 0.5337,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.9092267751693726,
"rewards/margins": 0.5299785137176514,
"rewards/rejected": -2.4392056465148926,
"step": 190
},
{
"epoch": 0.4115552037989711,
"grad_norm": 14.147768615324013,
"learning_rate": 3.6636955675673743e-07,
"logits/chosen": -2.1767070293426514,
"logits/rejected": -2.61075496673584,
"logps/chosen": -479.6851501464844,
"logps/rejected": -483.32501220703125,
"loss": 0.5389,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.7908236980438232,
"rewards/margins": 0.46493005752563477,
"rewards/rejected": -2.255753517150879,
"step": 195
},
{
"epoch": 0.42210790133227805,
"grad_norm": 13.882861374739983,
"learning_rate": 3.5811401601205093e-07,
"logits/chosen": -2.219057321548462,
"logits/rejected": -2.5431442260742188,
"logps/chosen": -500.2259826660156,
"logps/rejected": -521.2952270507812,
"loss": 0.5154,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.941454529762268,
"rewards/margins": 0.5581260919570923,
"rewards/rejected": -2.4995803833007812,
"step": 200
},
{
"epoch": 0.432660598865585,
"grad_norm": 13.10437188597977,
"learning_rate": 3.497108045260995e-07,
"logits/chosen": -2.3422179222106934,
"logits/rejected": -2.6200077533721924,
"logps/chosen": -444.88433837890625,
"logps/rejected": -463.40582275390625,
"loss": 0.5479,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.721801996231079,
"rewards/margins": 0.46254196763038635,
"rewards/rejected": -2.1843440532684326,
"step": 205
},
{
"epoch": 0.44321329639889195,
"grad_norm": 17.090042659489537,
"learning_rate": 3.411714000749838e-07,
"logits/chosen": -2.2252583503723145,
"logits/rejected": -2.598954916000366,
"logps/chosen": -467.2124938964844,
"logps/rejected": -482.96136474609375,
"loss": 0.5295,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7041940689086914,
"rewards/margins": 0.48477378487586975,
"rewards/rejected": -2.1889679431915283,
"step": 210
},
{
"epoch": 0.4537659939321989,
"grad_norm": 27.344254292887783,
"learning_rate": 3.3250746645801287e-07,
"logits/chosen": -2.346909523010254,
"logits/rejected": -2.5004947185516357,
"logps/chosen": -492.0323791503906,
"logps/rejected": -510.87847900390625,
"loss": 0.5491,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.0776052474975586,
"rewards/margins": 0.3693740963935852,
"rewards/rejected": -2.446979284286499,
"step": 215
},
{
"epoch": 0.46431869146550586,
"grad_norm": 19.808222569935815,
"learning_rate": 3.237308375663571e-07,
"logits/chosen": -2.291229486465454,
"logits/rejected": -2.6437947750091553,
"logps/chosen": -470.88909912109375,
"logps/rejected": -506.59393310546875,
"loss": 0.5365,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.9697215557098389,
"rewards/margins": 0.6469660997390747,
"rewards/rejected": -2.616687774658203,
"step": 220
},
{
"epoch": 0.4748713889988128,
"grad_norm": 18.32709921824934,
"learning_rate": 3.148535012193767e-07,
"logits/chosen": -2.1904757022857666,
"logits/rejected": -2.5518805980682373,
"logps/chosen": -459.55987548828125,
"logps/rejected": -499.54840087890625,
"loss": 0.5194,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.6650127172470093,
"rewards/margins": 0.6420146226882935,
"rewards/rejected": -2.3070271015167236,
"step": 225
},
{
"epoch": 0.48542408653211977,
"grad_norm": 14.548363920921867,
"learning_rate": 3.0588758279070183e-07,
"logits/chosen": -2.1270744800567627,
"logits/rejected": -2.476382255554199,
"logps/chosen": -442.04400634765625,
"logps/rejected": -464.97735595703125,
"loss": 0.5326,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.7559592723846436,
"rewards/margins": 0.4754490852355957,
"rewards/rejected": -2.2314083576202393,
"step": 230
},
{
"epoch": 0.4959767840654267,
"grad_norm": 16.709600354788574,
"learning_rate": 2.968453286464312e-07,
"logits/chosen": -2.354429244995117,
"logits/rejected": -2.5410735607147217,
"logps/chosen": -514.9906005859375,
"logps/rejected": -569.000732421875,
"loss": 0.5457,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.1175270080566406,
"rewards/margins": 0.6777707934379578,
"rewards/rejected": -2.795297861099243,
"step": 235
},
{
"epoch": 0.5065294815987337,
"grad_norm": 14.595717161808087,
"learning_rate": 2.8773908941806877e-07,
"logits/chosen": -2.191709280014038,
"logits/rejected": -2.4795994758605957,
"logps/chosen": -513.7542724609375,
"logps/rejected": -539.9058837890625,
"loss": 0.5283,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.063544273376465,
"rewards/margins": 0.5612505674362183,
"rewards/rejected": -2.6247947216033936,
"step": 240
},
{
"epoch": 0.5170821791320406,
"grad_norm": 27.782416609672772,
"learning_rate": 2.785813031330473e-07,
"logits/chosen": -2.316455602645874,
"logits/rejected": -2.5953054428100586,
"logps/chosen": -480.8763122558594,
"logps/rejected": -517.4465942382812,
"loss": 0.5183,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.9154212474822998,
"rewards/margins": 0.5959927439689636,
"rewards/rejected": -2.5114142894744873,
"step": 245
},
{
"epoch": 0.5276348766653476,
"grad_norm": 19.504383146510033,
"learning_rate": 2.693844782258779e-07,
"logits/chosen": -2.288198947906494,
"logits/rejected": -2.663243293762207,
"logps/chosen": -504.8866271972656,
"logps/rejected": -565.0271606445312,
"loss": 0.4927,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.188603639602661,
"rewards/margins": 0.93329256772995,
"rewards/rejected": -3.121896266937256,
"step": 250
},
{
"epoch": 0.5381875741986545,
"grad_norm": 16.174904226348485,
"learning_rate": 2.601611764531342e-07,
"logits/chosen": -2.3196043968200684,
"logits/rejected": -2.615384578704834,
"logps/chosen": -520.6978759765625,
"logps/rejected": -568.9720458984375,
"loss": 0.5158,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -2.302415370941162,
"rewards/margins": 0.6759995222091675,
"rewards/rejected": -2.978415012359619,
"step": 255
},
{
"epoch": 0.5487402717319615,
"grad_norm": 13.877722437423808,
"learning_rate": 2.5092399573560323e-07,
"logits/chosen": -2.2904419898986816,
"logits/rejected": -2.6239161491394043,
"logps/chosen": -469.6702575683594,
"logps/rejected": -495.83917236328125,
"loss": 0.5271,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.8616193532943726,
"rewards/margins": 0.5212319493293762,
"rewards/rejected": -2.3828511238098145,
"step": 260
},
{
"epoch": 0.5592929692652684,
"grad_norm": 25.387018559215,
"learning_rate": 2.4168555295104124e-07,
"logits/chosen": -2.3710436820983887,
"logits/rejected": -2.6667098999023438,
"logps/chosen": -551.4783325195312,
"logps/rejected": -584.6627197265625,
"loss": 0.5207,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.6731348037719727,
"rewards/margins": 0.6126972436904907,
"rewards/rejected": -3.285832166671753,
"step": 265
},
{
"epoch": 0.5698456667985754,
"grad_norm": 21.194954670641433,
"learning_rate": 2.3245846670103626e-07,
"logits/chosen": -2.339695692062378,
"logits/rejected": -2.728651285171509,
"logps/chosen": -566.4747314453125,
"logps/rejected": -614.0745849609375,
"loss": 0.4911,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.8559041023254395,
"rewards/margins": 0.8008524179458618,
"rewards/rejected": -3.656756639480591,
"step": 270
},
{
"epoch": 0.5803983643318823,
"grad_norm": 17.02245510623681,
"learning_rate": 2.232553400755159e-07,
"logits/chosen": -2.462646007537842,
"logits/rejected": -2.7295315265655518,
"logps/chosen": -520.3306884765625,
"logps/rejected": -550.9982299804688,
"loss": 0.515,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.492687940597534,
"rewards/margins": 0.6085286140441895,
"rewards/rejected": -3.1012163162231445,
"step": 275
},
{
"epoch": 0.5909510618651893,
"grad_norm": 18.57311563682423,
"learning_rate": 2.1408874343844294e-07,
"logits/chosen": -2.46991229057312,
"logits/rejected": -2.839108943939209,
"logps/chosen": -564.0499877929688,
"logps/rejected": -614.0209350585938,
"loss": 0.5184,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.8387348651885986,
"rewards/margins": 0.748325765132904,
"rewards/rejected": -3.5870604515075684,
"step": 280
},
{
"epoch": 0.6015037593984962,
"grad_norm": 18.112658192267443,
"learning_rate": 2.049711972582101e-07,
"logits/chosen": -2.4610495567321777,
"logits/rejected": -2.7717814445495605,
"logps/chosen": -595.5016479492188,
"logps/rejected": -650.4605712890625,
"loss": 0.4974,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.076230764389038,
"rewards/margins": 0.7762446999549866,
"rewards/rejected": -3.852475643157959,
"step": 285
},
{
"epoch": 0.6120564569318032,
"grad_norm": 25.977212495196564,
"learning_rate": 1.9591515500618588e-07,
"logits/chosen": -2.5490634441375732,
"logits/rejected": -2.773324728012085,
"logps/chosen": -515.6527099609375,
"logps/rejected": -566.9156494140625,
"loss": 0.5189,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.3954033851623535,
"rewards/margins": 0.6492033004760742,
"rewards/rejected": -3.0446066856384277,
"step": 290
},
{
"epoch": 0.6226091544651101,
"grad_norm": 16.798443756074835,
"learning_rate": 1.8693298614677112e-07,
"logits/chosen": -2.400968074798584,
"logits/rejected": -2.6868553161621094,
"logps/chosen": -556.1025390625,
"logps/rejected": -596.8348388671875,
"loss": 0.4854,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.474238872528076,
"rewards/margins": 0.6959460377693176,
"rewards/rejected": -3.17018461227417,
"step": 295
},
{
"epoch": 0.6331618519984171,
"grad_norm": 23.81678431486218,
"learning_rate": 1.7803695924219814e-07,
"logits/chosen": -2.479430675506592,
"logits/rejected": -2.7782349586486816,
"logps/chosen": -578.2171020507812,
"logps/rejected": -651.0504150390625,
"loss": 0.4662,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.9857161045074463,
"rewards/margins": 0.9989351034164429,
"rewards/rejected": -3.9846510887145996,
"step": 300
},
{
"epoch": 0.643714549531724,
"grad_norm": 17.09054147088968,
"learning_rate": 1.6923922519515067e-07,
"logits/chosen": -2.4572558403015137,
"logits/rejected": -2.866284132003784,
"logps/chosen": -598.88037109375,
"logps/rejected": -641.3062744140625,
"loss": 0.5055,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.1678709983825684,
"rewards/margins": 0.8001095056533813,
"rewards/rejected": -3.9679806232452393,
"step": 305
},
{
"epoch": 0.654267247065031,
"grad_norm": 14.492251564068122,
"learning_rate": 1.605518006520924e-07,
"logits/chosen": -2.3932666778564453,
"logits/rejected": -2.6719508171081543,
"logps/chosen": -501.3484802246094,
"logps/rejected": -544.0150146484375,
"loss": 0.5221,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.2441365718841553,
"rewards/margins": 0.6640299558639526,
"rewards/rejected": -2.9081664085388184,
"step": 310
},
{
"epoch": 0.6648199445983379,
"grad_norm": 19.243991902749503,
"learning_rate": 1.519865515899731e-07,
"logits/chosen": -2.444279432296753,
"logits/rejected": -2.6949431896209717,
"logps/chosen": -506.01519775390625,
"logps/rejected": -542.7237548828125,
"loss": 0.5115,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.281829357147217,
"rewards/margins": 0.6259506940841675,
"rewards/rejected": -2.907780170440674,
"step": 315
},
{
"epoch": 0.6753726421316449,
"grad_norm": 27.99539224141589,
"learning_rate": 1.4355517710873182e-07,
"logits/chosen": -2.5953707695007324,
"logits/rejected": -2.877714157104492,
"logps/chosen": -571.291015625,
"logps/rejected": -615.3301391601562,
"loss": 0.5011,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.8195748329162598,
"rewards/margins": 0.6808874607086182,
"rewards/rejected": -3.500462293624878,
"step": 320
},
{
"epoch": 0.6859253396649518,
"grad_norm": 23.88018530349238,
"learning_rate": 1.3526919345173318e-07,
"logits/chosen": -2.5718350410461426,
"logits/rejected": -2.88576078414917,
"logps/chosen": -595.6961669921875,
"logps/rejected": -665.6595458984375,
"loss": 0.4992,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -3.05544376373291,
"rewards/margins": 0.9698305130004883,
"rewards/rejected": -4.025274753570557,
"step": 325
},
{
"epoch": 0.6964780371982588,
"grad_norm": 23.60330153062859,
"learning_rate": 1.2713991827596443e-07,
"logits/chosen": -2.614315986633301,
"logits/rejected": -2.894726276397705,
"logps/chosen": -562.1041259765625,
"logps/rejected": -629.8328857421875,
"loss": 0.4933,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.73535418510437,
"rewards/margins": 0.9043378829956055,
"rewards/rejected": -3.6396923065185547,
"step": 330
},
{
"epoch": 0.7070307347315657,
"grad_norm": 18.43994758901315,
"learning_rate": 1.191784551934773e-07,
"logits/chosen": -2.494032144546509,
"logits/rejected": -2.8370561599731445,
"logps/chosen": -512.5650024414062,
"logps/rejected": -558.193115234375,
"loss": 0.4919,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.298001766204834,
"rewards/margins": 0.7504197955131531,
"rewards/rejected": -3.048421859741211,
"step": 335
},
{
"epoch": 0.7175834322648728,
"grad_norm": 19.769138328586244,
"learning_rate": 1.1139567860518953e-07,
"logits/chosen": -2.399077892303467,
"logits/rejected": -2.8016879558563232,
"logps/chosen": -532.8413696289062,
"logps/rejected": -595.0988159179688,
"loss": 0.4698,
"rewards/accuracies": 0.84375,
"rewards/chosen": -2.4740264415740967,
"rewards/margins": 0.9756819009780884,
"rewards/rejected": -3.4497084617614746,
"step": 340
},
{
"epoch": 0.7281361297981797,
"grad_norm": 29.121484778204135,
"learning_rate": 1.0380221884776128e-07,
"logits/chosen": -2.504153251647949,
"logits/rejected": -2.826664447784424,
"logps/chosen": -588.2379150390625,
"logps/rejected": -649.8221435546875,
"loss": 0.4541,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.9968106746673584,
"rewards/margins": 0.9327837824821472,
"rewards/rejected": -3.9295945167541504,
"step": 345
},
{
"epoch": 0.7386888273314867,
"grad_norm": 20.917558525767543,
"learning_rate": 9.640844767383405e-08,
"logits/chosen": -2.4767587184906006,
"logits/rejected": -2.815369129180908,
"logps/chosen": -636.3276977539062,
"logps/rejected": -681.0283203125,
"loss": 0.5234,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.6004860401153564,
"rewards/margins": 0.719234824180603,
"rewards/rejected": -4.31972074508667,
"step": 350
},
{
"epoch": 0.7492415248647936,
"grad_norm": 20.761748981239933,
"learning_rate": 8.922446408546378e-08,
"logits/chosen": -2.4393577575683594,
"logits/rejected": -2.7462494373321533,
"logps/chosen": -593.6701049804688,
"logps/rejected": -662.8970947265625,
"loss": 0.4559,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -3.010227918624878,
"rewards/margins": 0.9989708662033081,
"rewards/rejected": -4.0091986656188965,
"step": 355
},
{
"epoch": 0.7597942223981005,
"grad_norm": 18.501565997520643,
"learning_rate": 8.22600805400994e-08,
"logits/chosen": -2.382094144821167,
"logits/rejected": -2.714757204055786,
"logps/chosen": -528.34033203125,
"logps/rejected": -588.7109985351562,
"loss": 0.4723,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.4561707973480225,
"rewards/margins": 0.8588768243789673,
"rewards/rejected": -3.3150477409362793,
"step": 360
},
{
"epoch": 0.7703469199314075,
"grad_norm": 22.596420829881406,
"learning_rate": 7.552480954794558e-08,
"logits/chosen": -2.496333599090576,
"logits/rejected": -2.8438127040863037,
"logps/chosen": -587.7208862304688,
"logps/rejected": -652.3656005859375,
"loss": 0.4838,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.0826942920684814,
"rewards/margins": 0.9047689437866211,
"rewards/rejected": -3.9874634742736816,
"step": 365
},
{
"epoch": 0.7808996174647144,
"grad_norm": 17.83641444640056,
"learning_rate": 6.902785067901854e-08,
"logits/chosen": -2.5392613410949707,
"logits/rejected": -2.8968329429626465,
"logps/chosen": -596.1561889648438,
"logps/rejected": -664.4248046875,
"loss": 0.4774,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.162071943283081,
"rewards/margins": 0.8737271428108215,
"rewards/rejected": -4.035799026489258,
"step": 370
},
{
"epoch": 0.7914523149980214,
"grad_norm": 22.105262436574318,
"learning_rate": 6.277807799763973e-08,
"logits/chosen": -2.464101552963257,
"logits/rejected": -2.823216199874878,
"logps/chosen": -605.5325317382812,
"logps/rejected": -688.7382202148438,
"loss": 0.4821,
"rewards/accuracies": 0.8125,
"rewards/chosen": -3.242568254470825,
"rewards/margins": 1.0074737071990967,
"rewards/rejected": -4.250041961669922,
"step": 375
},
{
"epoch": 0.8020050125313283,
"grad_norm": 25.73803952489229,
"learning_rate": 5.678402794153145e-08,
"logits/chosen": -2.5645461082458496,
"logits/rejected": -2.8685081005096436,
"logps/chosen": -624.9561767578125,
"logps/rejected": -682.5247802734375,
"loss": 0.4853,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.3231148719787598,
"rewards/margins": 0.8197879791259766,
"rewards/rejected": -4.142902374267578,
"step": 380
},
{
"epoch": 0.8125577100646353,
"grad_norm": 22.783957458664876,
"learning_rate": 5.105388766206969e-08,
"logits/chosen": -2.611253261566162,
"logits/rejected": -2.8708913326263428,
"logps/chosen": -601.1683349609375,
"logps/rejected": -657.6758422851562,
"loss": 0.4961,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.23264741897583,
"rewards/margins": 0.7812051773071289,
"rewards/rejected": -4.013852119445801,
"step": 385
},
{
"epoch": 0.8231104075979422,
"grad_norm": 17.937740756320142,
"learning_rate": 4.5595483841620484e-08,
"logits/chosen": -2.585615396499634,
"logits/rejected": -2.860517978668213,
"logps/chosen": -610.9042358398438,
"logps/rejected": -673.6950073242188,
"loss": 0.5005,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.3800010681152344,
"rewards/margins": 0.7753348350524902,
"rewards/rejected": -4.155335426330566,
"step": 390
},
{
"epoch": 0.8336631051312492,
"grad_norm": 25.048225079070804,
"learning_rate": 4.0416272003232526e-08,
"logits/chosen": -2.5495500564575195,
"logits/rejected": -2.783395290374756,
"logps/chosen": -589.7579956054688,
"logps/rejected": -651.603515625,
"loss": 0.4634,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.1502811908721924,
"rewards/margins": 0.8453443646430969,
"rewards/rejected": -3.9956252574920654,
"step": 395
},
{
"epoch": 0.8442158026645561,
"grad_norm": 27.936816284901383,
"learning_rate": 3.552332632729041e-08,
"logits/chosen": -2.5146939754486084,
"logits/rejected": -2.804884195327759,
"logps/chosen": -594.3411865234375,
"logps/rejected": -653.3115844726562,
"loss": 0.4978,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -3.1166281700134277,
"rewards/margins": 0.8442124128341675,
"rewards/rejected": -3.9608407020568848,
"step": 400
},
{
"epoch": 0.8442158026645561,
"eval_logits/chosen": -3.206465244293213,
"eval_logits/rejected": -3.0895018577575684,
"eval_logps/chosen": -606.6761474609375,
"eval_logps/rejected": -664.0686645507812,
"eval_loss": 0.6230235695838928,
"eval_rewards/accuracies": 0.6370967626571655,
"eval_rewards/chosen": -3.440429449081421,
"eval_rewards/margins": 0.46332982182502747,
"eval_rewards/rejected": -3.903759717941284,
"eval_runtime": 145.9837,
"eval_samples_per_second": 13.536,
"eval_steps_per_second": 0.849,
"step": 400
},
{
"epoch": 0.8547685001978631,
"grad_norm": 19.235680562568444,
"learning_rate": 3.092332998903416e-08,
"logits/chosen": -2.4855546951293945,
"logits/rejected": -2.8127689361572266,
"logps/chosen": -608.1304931640625,
"logps/rejected": -668.380859375,
"loss": 0.44,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.3122177124023438,
"rewards/margins": 0.8978776931762695,
"rewards/rejected": -4.210095405578613,
"step": 405
},
{
"epoch": 0.86532119773117,
"grad_norm": 20.802575196574956,
"learning_rate": 2.6622566030146455e-08,
"logits/chosen": -2.571362018585205,
"logits/rejected": -2.8206756114959717,
"logps/chosen": -649.1343994140625,
"logps/rejected": -712.3201293945312,
"loss": 0.4668,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.6075432300567627,
"rewards/margins": 0.9334003329277039,
"rewards/rejected": -4.5409440994262695,
"step": 410
},
{
"epoch": 0.875873895264477,
"grad_norm": 20.49229366313954,
"learning_rate": 2.26269087768734e-08,
"logits/chosen": -2.5383522510528564,
"logits/rejected": -2.862185478210449,
"logps/chosen": -622.9601440429688,
"logps/rejected": -697.26904296875,
"loss": 0.4639,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.579232692718506,
"rewards/margins": 0.9947258234024048,
"rewards/rejected": -4.573958396911621,
"step": 415
},
{
"epoch": 0.8864265927977839,
"grad_norm": 18.435121377291708,
"learning_rate": 1.894181581640106e-08,
"logits/chosen": -2.4851062297821045,
"logits/rejected": -2.818612813949585,
"logps/chosen": -691.8073120117188,
"logps/rejected": -776.2567138671875,
"loss": 0.428,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -3.87129545211792,
"rewards/margins": 1.1842930316925049,
"rewards/rejected": -5.055588722229004,
"step": 420
},
{
"epoch": 0.8969792903310909,
"grad_norm": 29.475224627532654,
"learning_rate": 1.5572320542448143e-08,
"logits/chosen": -2.510409355163574,
"logits/rejected": -2.80336594581604,
"logps/chosen": -651.34326171875,
"logps/rejected": -712.4142456054688,
"loss": 0.494,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.6103858947753906,
"rewards/margins": 0.8933914303779602,
"rewards/rejected": -4.503777027130127,
"step": 425
},
{
"epoch": 0.9075319878643978,
"grad_norm": 25.052350185477973,
"learning_rate": 1.2523025280255729e-08,
"logits/chosen": -2.5651907920837402,
"logits/rejected": -2.870457172393799,
"logps/chosen": -678.6126708984375,
"logps/rejected": -742.5474853515625,
"loss": 0.4623,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.7764217853546143,
"rewards/margins": 1.0084255933761597,
"rewards/rejected": -4.784847259521484,
"step": 430
},
{
"epoch": 0.9180846853977048,
"grad_norm": 20.51875255327418,
"learning_rate": 9.798095000364214e-09,
"logits/chosen": -2.5898213386535645,
"logits/rejected": -2.949827194213867,
"logps/chosen": -640.5285034179688,
"logps/rejected": -698.768798828125,
"loss": 0.5011,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.686187744140625,
"rewards/margins": 0.8774013519287109,
"rewards/rejected": -4.563588619232178,
"step": 435
},
{
"epoch": 0.9286373829310117,
"grad_norm": 20.399922495391955,
"learning_rate": 7.401251629764876e-09,
"logits/chosen": -2.594036817550659,
"logits/rejected": -2.882014274597168,
"logps/chosen": -671.9681396484375,
"logps/rejected": -730.8321533203125,
"loss": 0.4861,
"rewards/accuracies": 0.8125,
"rewards/chosen": -3.8397529125213623,
"rewards/margins": 0.9077512621879578,
"rewards/rejected": -4.747504234313965,
"step": 440
},
{
"epoch": 0.9391900804643187,
"grad_norm": 22.999851244619595,
"learning_rate": 5.335768968195098e-09,
"logits/chosen": -2.593620538711548,
"logits/rejected": -2.895954132080078,
"logps/chosen": -661.6905517578125,
"logps/rejected": -731.1881713867188,
"loss": 0.4489,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.763947010040283,
"rewards/margins": 0.9344717264175415,
"rewards/rejected": -4.698418617248535,
"step": 445
},
{
"epoch": 0.9497427779976256,
"grad_norm": 25.929731545060648,
"learning_rate": 3.604468216521883e-09,
"logits/chosen": -2.6423192024230957,
"logits/rejected": -2.9191346168518066,
"logps/chosen": -610.2369995117188,
"logps/rejected": -673.9677124023438,
"loss": 0.4538,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.2649269104003906,
"rewards/margins": 0.8473002314567566,
"rewards/rejected": -4.112226963043213,
"step": 450
},
{
"epoch": 0.9602954755309326,
"grad_norm": 30.30149224906545,
"learning_rate": 2.2097141233206884e-09,
"logits/chosen": -2.479203462600708,
"logits/rejected": -2.7860312461853027,
"logps/chosen": -680.89453125,
"logps/rejected": -742.8502807617188,
"loss": 0.4953,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.7775790691375732,
"rewards/margins": 0.8955792188644409,
"rewards/rejected": -4.673158645629883,
"step": 455
},
{
"epoch": 0.9708481730642395,
"grad_norm": 17.27399919597619,
"learning_rate": 1.1534117549133472e-09,
"logits/chosen": -2.5051302909851074,
"logits/rejected": -2.7565226554870605,
"logps/chosen": -642.7161254882812,
"logps/rejected": -727.14599609375,
"loss": 0.4686,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -3.4852137565612793,
"rewards/margins": 1.0831372737884521,
"rewards/rejected": -4.5683512687683105,
"step": 460
},
{
"epoch": 0.9814008705975465,
"grad_norm": 22.26013302983421,
"learning_rate": 4.3700389327672173e-10,
"logits/chosen": -2.3914403915405273,
"logits/rejected": -2.711667537689209,
"logps/chosen": -663.2670288085938,
"logps/rejected": -728.1150512695312,
"loss": 0.4779,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -3.666525363922119,
"rewards/margins": 0.9401981234550476,
"rewards/rejected": -4.606723308563232,
"step": 465
},
{
"epoch": 0.9919535681308534,
"grad_norm": 39.07814619267544,
"learning_rate": 6.146906537587982e-11,
"logits/chosen": -2.5620739459991455,
"logits/rejected": -2.8582262992858887,
"logps/chosen": -628.55322265625,
"logps/rejected": -687.5040283203125,
"loss": 0.4882,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -3.4171409606933594,
"rewards/margins": 0.8897517919540405,
"rewards/rejected": -4.3068928718566895,
"step": 470
},
{
"epoch": 0.9982851866508377,
"step": 473,
"total_flos": 0.0,
"train_loss": 0.545083115015171,
"train_runtime": 9073.2474,
"train_samples_per_second": 6.684,
"train_steps_per_second": 0.052
}
],
"logging_steps": 5,
"max_steps": 473,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}