Llama-8b-MI1-6e-7 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.998691442030882,
"eval_steps": 500,
"global_step": 477,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010468463752944255,
"grad_norm": 11.782889401902718,
"learning_rate": 6.25e-08,
"logits/chosen": -1.444485068321228,
"logits/rejected": -1.4456722736358643,
"logps/chosen": -7.9825921058654785,
"logps/rejected": -8.156225204467773,
"loss": 8.9796,
"rewards/accuracies": 0.5,
"rewards/chosen": -7.9825921058654785,
"rewards/margins": 0.17363198101520538,
"rewards/rejected": -8.156225204467773,
"step": 5
},
{
"epoch": 0.02093692750588851,
"grad_norm": 9.749361718413306,
"learning_rate": 1.25e-07,
"logits/chosen": -1.447454810142517,
"logits/rejected": -1.4387584924697876,
"logps/chosen": -8.047009468078613,
"logps/rejected": -7.960066795349121,
"loss": 8.9813,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -8.047009468078613,
"rewards/margins": -0.08694207668304443,
"rewards/rejected": -7.960066795349121,
"step": 10
},
{
"epoch": 0.031405391258832765,
"grad_norm": 16.533988717004068,
"learning_rate": 1.875e-07,
"logits/chosen": -1.4474663734436035,
"logits/rejected": -1.4442191123962402,
"logps/chosen": -7.851595401763916,
"logps/rejected": -7.866987705230713,
"loss": 8.8899,
"rewards/accuracies": 0.5,
"rewards/chosen": -7.851595401763916,
"rewards/margins": 0.01539215724915266,
"rewards/rejected": -7.866987705230713,
"step": 15
},
{
"epoch": 0.04187385501177702,
"grad_norm": 13.917496227050558,
"learning_rate": 2.5e-07,
"logits/chosen": -1.440216064453125,
"logits/rejected": -1.4452197551727295,
"logps/chosen": -8.178640365600586,
"logps/rejected": -8.201952934265137,
"loss": 9.0475,
"rewards/accuracies": 0.5,
"rewards/chosen": -8.178640365600586,
"rewards/margins": 0.023312047123908997,
"rewards/rejected": -8.201952934265137,
"step": 20
},
{
"epoch": 0.05234231876472128,
"grad_norm": 13.358220692601913,
"learning_rate": 3.125e-07,
"logits/chosen": -1.474110722541809,
"logits/rejected": -1.463666558265686,
"logps/chosen": -8.079231262207031,
"logps/rejected": -7.98193883895874,
"loss": 9.124,
"rewards/accuracies": 0.4375,
"rewards/chosen": -8.079231262207031,
"rewards/margins": -0.09729210287332535,
"rewards/rejected": -7.98193883895874,
"step": 25
},
{
"epoch": 0.06281078251766553,
"grad_norm": 11.375823739582524,
"learning_rate": 3.75e-07,
"logits/chosen": -1.4473092555999756,
"logits/rejected": -1.4344959259033203,
"logps/chosen": -7.780773162841797,
"logps/rejected": -7.703455448150635,
"loss": 9.0197,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -7.780773162841797,
"rewards/margins": -0.07731723040342331,
"rewards/rejected": -7.703455448150635,
"step": 30
},
{
"epoch": 0.07327924627060979,
"grad_norm": 10.213017154182484,
"learning_rate": 4.3749999999999994e-07,
"logits/chosen": -1.4583995342254639,
"logits/rejected": -1.431770920753479,
"logps/chosen": -8.027624130249023,
"logps/rejected": -7.8937225341796875,
"loss": 8.9843,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -8.027624130249023,
"rewards/margins": -0.13390299677848816,
"rewards/rejected": -7.8937225341796875,
"step": 35
},
{
"epoch": 0.08374771002355404,
"grad_norm": 10.12652288345569,
"learning_rate": 5e-07,
"logits/chosen": -1.4447615146636963,
"logits/rejected": -1.458698034286499,
"logps/chosen": -7.983005523681641,
"logps/rejected": -8.174285888671875,
"loss": 9.0094,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -7.983005523681641,
"rewards/margins": 0.19128072261810303,
"rewards/rejected": -8.174285888671875,
"step": 40
},
{
"epoch": 0.0942161737764983,
"grad_norm": 10.9885005835532,
"learning_rate": 5.625e-07,
"logits/chosen": -1.4630662202835083,
"logits/rejected": -1.4628698825836182,
"logps/chosen": -8.03730583190918,
"logps/rejected": -7.831875801086426,
"loss": 8.9878,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -8.03730583190918,
"rewards/margins": -0.20542971789836884,
"rewards/rejected": -7.831875801086426,
"step": 45
},
{
"epoch": 0.10468463752944256,
"grad_norm": 13.872196323961617,
"learning_rate": 5.999678242522831e-07,
"logits/chosen": -1.4442825317382812,
"logits/rejected": -1.4613512754440308,
"logps/chosen": -8.217935562133789,
"logps/rejected": -8.252190589904785,
"loss": 9.0757,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -8.217935562133789,
"rewards/margins": 0.03425510972738266,
"rewards/rejected": -8.252190589904785,
"step": 50
},
{
"epoch": 0.11515310128238682,
"grad_norm": 10.905494395813982,
"learning_rate": 5.996059263493219e-07,
"logits/chosen": -1.4492484331130981,
"logits/rejected": -1.4467532634735107,
"logps/chosen": -8.046092987060547,
"logps/rejected": -8.062843322753906,
"loss": 9.1036,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -8.046092987060547,
"rewards/margins": 0.01675090566277504,
"rewards/rejected": -8.062843322753906,
"step": 55
},
{
"epoch": 0.12562156503533106,
"grad_norm": 15.995330684554988,
"learning_rate": 5.988423976115163e-07,
"logits/chosen": -1.443290114402771,
"logits/rejected": -1.4562170505523682,
"logps/chosen": -8.026491165161133,
"logps/rejected": -8.317246437072754,
"loss": 8.9008,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -8.026491165161133,
"rewards/margins": 0.29075488448143005,
"rewards/rejected": -8.317246437072754,
"step": 60
},
{
"epoch": 0.1360900287882753,
"grad_norm": 24.861886587620123,
"learning_rate": 5.976782615723061e-07,
"logits/chosen": -1.392534613609314,
"logits/rejected": -1.4108682870864868,
"logps/chosen": -7.828791618347168,
"logps/rejected": -8.337072372436523,
"loss": 8.934,
"rewards/accuracies": 0.59375,
"rewards/chosen": -7.828791618347168,
"rewards/margins": 0.5082817673683167,
"rewards/rejected": -8.337072372436523,
"step": 65
},
{
"epoch": 0.14655849254121958,
"grad_norm": 35.209412870115344,
"learning_rate": 5.961150787913738e-07,
"logits/chosen": -1.39071524143219,
"logits/rejected": -1.3853540420532227,
"logps/chosen": -7.945198059082031,
"logps/rejected": -8.038311004638672,
"loss": 8.9653,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -7.945198059082031,
"rewards/margins": 0.0931134819984436,
"rewards/rejected": -8.038311004638672,
"step": 70
},
{
"epoch": 0.15702695629416383,
"grad_norm": 12.413941901156766,
"learning_rate": 5.941549447626671e-07,
"logits/chosen": -1.3913167715072632,
"logits/rejected": -1.3984179496765137,
"logps/chosen": -7.823273658752441,
"logps/rejected": -7.864768981933594,
"loss": 8.9142,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -7.823273658752441,
"rewards/margins": 0.04149458184838295,
"rewards/rejected": -7.864768981933594,
"step": 75
},
{
"epoch": 0.16749542004710807,
"grad_norm": 21.221667512587725,
"learning_rate": 5.918004871053251e-07,
"logits/chosen": -1.3923091888427734,
"logits/rejected": -1.4085341691970825,
"logps/chosen": -7.852835178375244,
"logps/rejected": -7.9230217933654785,
"loss": 8.9088,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -7.852835178375244,
"rewards/margins": 0.07018764317035675,
"rewards/rejected": -7.9230217933654785,
"step": 80
},
{
"epoch": 0.17796388380005235,
"grad_norm": 12.603711372215182,
"learning_rate": 5.890548620412763e-07,
"logits/chosen": -1.4011937379837036,
"logits/rejected": -1.39864182472229,
"logps/chosen": -7.970945835113525,
"logps/rejected": -8.160429000854492,
"loss": 9.0488,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -7.970945835113525,
"rewards/margins": 0.18948234617710114,
"rewards/rejected": -8.160429000854492,
"step": 85
},
{
"epoch": 0.1884323475529966,
"grad_norm": 13.164098047063113,
"learning_rate": 5.859217501642258e-07,
"logits/chosen": -1.375800371170044,
"logits/rejected": -1.389070749282837,
"logps/chosen": -7.946028232574463,
"logps/rejected": -8.130967140197754,
"loss": 9.0141,
"rewards/accuracies": 0.53125,
"rewards/chosen": -7.946028232574463,
"rewards/margins": 0.18493881821632385,
"rewards/rejected": -8.130967140197754,
"step": 90
},
{
"epoch": 0.19890081130594087,
"grad_norm": 11.129043830781203,
"learning_rate": 5.824053515057091e-07,
"logits/chosen": -1.384723424911499,
"logits/rejected": -1.3767420053482056,
"logps/chosen": -8.055198669433594,
"logps/rejected": -7.921385288238525,
"loss": 9.0835,
"rewards/accuracies": 0.4124999940395355,
"rewards/chosen": -8.055198669433594,
"rewards/margins": -0.13381320238113403,
"rewards/rejected": -7.921385288238525,
"step": 95
},
{
"epoch": 0.2093692750588851,
"grad_norm": 19.959846628166616,
"learning_rate": 5.785103799048218e-07,
"logits/chosen": -1.4132357835769653,
"logits/rejected": -1.418881893157959,
"logps/chosen": -8.033044815063477,
"logps/rejected": -8.07997989654541,
"loss": 9.0153,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -8.033044815063477,
"rewards/margins": 0.04693456366658211,
"rewards/rejected": -8.07997989654541,
"step": 100
},
{
"epoch": 0.21983773881182936,
"grad_norm": 12.843923135972,
"learning_rate": 5.742420566891749e-07,
"logits/chosen": -1.413010835647583,
"logits/rejected": -1.4074172973632812,
"logps/chosen": -7.718166351318359,
"logps/rejected": -7.9243879318237305,
"loss": 8.9445,
"rewards/accuracies": 0.5625,
"rewards/chosen": -7.718166351318359,
"rewards/margins": 0.206221342086792,
"rewards/rejected": -7.9243879318237305,
"step": 105
},
{
"epoch": 0.23030620256477363,
"grad_norm": 13.825481188162163,
"learning_rate": 5.696061036755478e-07,
"logits/chosen": -1.4453760385513306,
"logits/rejected": -1.4452683925628662,
"logps/chosen": -7.982637882232666,
"logps/rejected": -8.220747947692871,
"loss": 9.0144,
"rewards/accuracies": 0.5625,
"rewards/chosen": -7.982637882232666,
"rewards/margins": 0.23810970783233643,
"rewards/rejected": -8.220747947692871,
"step": 110
},
{
"epoch": 0.24077466631771788,
"grad_norm": 1525.1356967991103,
"learning_rate": 5.64608735499618e-07,
"logits/chosen": -1.3860673904418945,
"logits/rejected": -1.3894257545471191,
"logps/chosen": -7.8776044845581055,
"logps/rejected": -8.189804077148438,
"loss": 8.9598,
"rewards/accuracies": 0.59375,
"rewards/chosen": -7.8776044845581055,
"rewards/margins": 0.31219929456710815,
"rewards/rejected": -8.189804077148438,
"step": 115
},
{
"epoch": 0.2512431300706621,
"grad_norm": 13.913132246096096,
"learning_rate": 5.592566512850545e-07,
"logits/chosen": -1.3590507507324219,
"logits/rejected": -1.3622348308563232,
"logps/chosen": -8.100934982299805,
"logps/rejected": -8.155590057373047,
"loss": 8.9501,
"rewards/accuracies": 0.5,
"rewards/chosen": -8.100934982299805,
"rewards/margins": 0.054654598236083984,
"rewards/rejected": -8.155590057373047,
"step": 120
},
{
"epoch": 0.26171159382360637,
"grad_norm": 14.57715484351377,
"learning_rate": 5.535570256631384e-07,
"logits/chosen": -1.4173157215118408,
"logits/rejected": -1.411921739578247,
"logps/chosen": -8.191034317016602,
"logps/rejected": -8.077339172363281,
"loss": 9.0651,
"rewards/accuracies": 0.46875,
"rewards/chosen": -8.191034317016602,
"rewards/margins": -0.11369502544403076,
"rewards/rejected": -8.077339172363281,
"step": 125
},
{
"epoch": 0.2721800575765506,
"grad_norm": 14.54742842440625,
"learning_rate": 5.475174991549528e-07,
"logits/chosen": -1.37632417678833,
"logits/rejected": -1.3858749866485596,
"logps/chosen": -8.046875953674316,
"logps/rejected": -8.172870635986328,
"loss": 8.9777,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -8.046875953674316,
"rewards/margins": 0.12599456310272217,
"rewards/rejected": -8.172870635986328,
"step": 130
},
{
"epoch": 0.2826485213294949,
"grad_norm": 18.256408848890032,
"learning_rate": 5.411461679290317e-07,
"logits/chosen": -1.3864247798919678,
"logits/rejected": -1.4004995822906494,
"logps/chosen": -7.979268550872803,
"logps/rejected": -8.406595230102539,
"loss": 8.9672,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -7.979268550872803,
"rewards/margins": 0.4273262023925781,
"rewards/rejected": -8.406595230102539,
"step": 135
},
{
"epoch": 0.29311698508243916,
"grad_norm": 14.27521931097187,
"learning_rate": 5.34451572948201e-07,
"logits/chosen": -1.4093233346939087,
"logits/rejected": -1.4172067642211914,
"logps/chosen": -7.903810977935791,
"logps/rejected": -7.975949287414551,
"loss": 8.9533,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -7.903810977935791,
"rewards/margins": 0.07213909924030304,
"rewards/rejected": -7.975949287414551,
"step": 140
},
{
"epoch": 0.3035854488353834,
"grad_norm": 10.523105376926537,
"learning_rate": 5.274426885201582e-07,
"logits/chosen": -1.4147297143936157,
"logits/rejected": -1.4396823644638062,
"logps/chosen": -7.8977460861206055,
"logps/rejected": -8.05931568145752,
"loss": 8.915,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -7.8977460861206055,
"rewards/margins": 0.16156847774982452,
"rewards/rejected": -8.05931568145752,
"step": 145
},
{
"epoch": 0.31405391258832765,
"grad_norm": 14.122907500033074,
"learning_rate": 5.201289102671411e-07,
"logits/chosen": -1.4332246780395508,
"logits/rejected": -1.436842679977417,
"logps/chosen": -7.895875453948975,
"logps/rejected": -8.0299072265625,
"loss": 8.9785,
"rewards/accuracies": 0.5,
"rewards/chosen": -7.895875453948975,
"rewards/margins": 0.13403132557868958,
"rewards/rejected": -8.0299072265625,
"step": 150
},
{
"epoch": 0.3245223763412719,
"grad_norm": 12.92310774863363,
"learning_rate": 5.12520042530811e-07,
"logits/chosen": -1.402719259262085,
"logits/rejected": -1.3787992000579834,
"logps/chosen": -7.979246616363525,
"logps/rejected": -7.966032981872559,
"loss": 9.0256,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -7.979246616363525,
"rewards/margins": -0.013212683610618114,
"rewards/rejected": -7.966032981872559,
"step": 155
},
{
"epoch": 0.33499084009421615,
"grad_norm": 15.237628673130487,
"learning_rate": 5.046262852292346e-07,
"logits/chosen": -1.3872135877609253,
"logits/rejected": -1.395935297012329,
"logps/chosen": -8.034635543823242,
"logps/rejected": -8.069303512573242,
"loss": 9.0268,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -8.034635543823242,
"rewards/margins": 0.03466759994626045,
"rewards/rejected": -8.069303512573242,
"step": 160
},
{
"epoch": 0.34545930384716045,
"grad_norm": 11.298592435998462,
"learning_rate": 4.964582201835856e-07,
"logits/chosen": -1.396750569343567,
"logits/rejected": -1.3891570568084717,
"logps/chosen": -7.99398946762085,
"logps/rejected": -8.040716171264648,
"loss": 9.0073,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -7.99398946762085,
"rewards/margins": 0.04672648385167122,
"rewards/rejected": -8.040716171264648,
"step": 165
},
{
"epoch": 0.3559277676001047,
"grad_norm": 12.492415372530475,
"learning_rate": 4.880267969328908e-07,
"logits/chosen": -1.3683674335479736,
"logits/rejected": -1.3726252317428589,
"logps/chosen": -8.114925384521484,
"logps/rejected": -8.097586631774902,
"loss": 9.0856,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -8.114925384521484,
"rewards/margins": -0.01733933761715889,
"rewards/rejected": -8.097586631774902,
"step": 170
},
{
"epoch": 0.36639623135304894,
"grad_norm": 12.610496367889976,
"learning_rate": 4.793433180558423e-07,
"logits/chosen": -1.3843915462493896,
"logits/rejected": -1.3853034973144531,
"logps/chosen": -7.956766605377197,
"logps/rejected": -7.944356441497803,
"loss": 9.0054,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -7.956766605377197,
"rewards/margins": -0.012410154566168785,
"rewards/rejected": -7.944356441497803,
"step": 175
},
{
"epoch": 0.3768646951059932,
"grad_norm": 15.598692092405715,
"learning_rate": 4.704194240193467e-07,
"logits/chosen": -1.3554438352584839,
"logits/rejected": -1.372804880142212,
"logps/chosen": -8.031749725341797,
"logps/rejected": -8.155205726623535,
"loss": 8.9878,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -8.031749725341797,
"rewards/margins": 0.12345610558986664,
"rewards/rejected": -8.155205726623535,
"step": 180
},
{
"epoch": 0.38733315885893743,
"grad_norm": 13.474501957199323,
"learning_rate": 4.6126707757412686e-07,
"logits/chosen": -1.3345744609832764,
"logits/rejected": -1.3397581577301025,
"logps/chosen": -7.977494716644287,
"logps/rejected": -8.02932357788086,
"loss": 8.9482,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -7.977494716644287,
"rewards/margins": 0.051828037947416306,
"rewards/rejected": -8.02932357788086,
"step": 185
},
{
"epoch": 0.39780162261188173,
"grad_norm": 11.929724403265839,
"learning_rate": 4.5189854771829086e-07,
"logits/chosen": -1.3528499603271484,
"logits/rejected": -1.3492704629898071,
"logps/chosen": -7.803788661956787,
"logps/rejected": -7.93734073638916,
"loss": 8.9516,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -7.803788661956787,
"rewards/margins": 0.1335521936416626,
"rewards/rejected": -7.93734073638916,
"step": 190
},
{
"epoch": 0.408270086364826,
"grad_norm": 14.327437395286285,
"learning_rate": 4.4232639325036807e-07,
"logits/chosen": -1.3263393640518188,
"logits/rejected": -1.3331449031829834,
"logps/chosen": -8.183530807495117,
"logps/rejected": -8.074382781982422,
"loss": 9.054,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -8.183530807495117,
"rewards/margins": -0.10914800316095352,
"rewards/rejected": -8.074382781982422,
"step": 195
},
{
"epoch": 0.4187385501177702,
"grad_norm": 12.623357323327125,
"learning_rate": 4.32563445933859e-07,
"logits/chosen": -1.3866218328475952,
"logits/rejected": -1.376103401184082,
"logps/chosen": -7.869284152984619,
"logps/rejected": -7.980343818664551,
"loss": 9.0216,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -7.869284152984619,
"rewards/margins": 0.11105932295322418,
"rewards/rejected": -7.980343818664551,
"step": 200
},
{
"epoch": 0.42920701387071447,
"grad_norm": 15.673764218634288,
"learning_rate": 4.226227932958664e-07,
"logits/chosen": -1.3467977046966553,
"logits/rejected": -1.3465808629989624,
"logps/chosen": -7.946604251861572,
"logps/rejected": -8.12873363494873,
"loss": 8.9418,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -7.946604251861572,
"rewards/margins": 0.18213000893592834,
"rewards/rejected": -8.12873363494873,
"step": 205
},
{
"epoch": 0.4396754776236587,
"grad_norm": 20.82547017360473,
"learning_rate": 4.1251776108286854e-07,
"logits/chosen": -1.3276244401931763,
"logits/rejected": -1.3366806507110596,
"logps/chosen": -7.942746639251709,
"logps/rejected": -8.075704574584961,
"loss": 8.992,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -7.942746639251709,
"rewards/margins": 0.13295890390872955,
"rewards/rejected": -8.075704574584961,
"step": 210
},
{
"epoch": 0.45014394137660296,
"grad_norm": 11.77567830972404,
"learning_rate": 4.022618953971514e-07,
"logits/chosen": -1.3542811870574951,
"logits/rejected": -1.3621467351913452,
"logps/chosen": -7.741019248962402,
"logps/rejected": -8.169224739074707,
"loss": 8.9028,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -7.741019248962402,
"rewards/margins": 0.42820531129837036,
"rewards/rejected": -8.169224739074707,
"step": 215
},
{
"epoch": 0.46061240512954726,
"grad_norm": 13.792748846310712,
"learning_rate": 3.918689445378477e-07,
"logits/chosen": -1.3647044897079468,
"logits/rejected": -1.3888493776321411,
"logps/chosen": -7.679605960845947,
"logps/rejected": -7.820864677429199,
"loss": 9.0059,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -7.679605960845947,
"rewards/margins": 0.14125962555408478,
"rewards/rejected": -7.820864677429199,
"step": 220
},
{
"epoch": 0.4710808688824915,
"grad_norm": 10.698537268464346,
"learning_rate": 3.813528405709251e-07,
"logits/chosen": -1.3668994903564453,
"logits/rejected": -1.370476484298706,
"logps/chosen": -7.723212242126465,
"logps/rejected": -7.974145412445068,
"loss": 8.9131,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -7.723212242126465,
"rewards/margins": 0.25093379616737366,
"rewards/rejected": -7.974145412445068,
"step": 225
},
{
"epoch": 0.48154933263543576,
"grad_norm": 12.476277662413903,
"learning_rate": 3.707276806528282e-07,
"logits/chosen": -1.37067449092865,
"logits/rejected": -1.3700437545776367,
"logps/chosen": -8.093690872192383,
"logps/rejected": -8.251599311828613,
"loss": 9.068,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -8.093690872192383,
"rewards/margins": 0.157908633351326,
"rewards/rejected": -8.251599311828613,
"step": 230
},
{
"epoch": 0.49201779638838,
"grad_norm": 12.703214615987921,
"learning_rate": 3.6000770813281334e-07,
"logits/chosen": -1.3918092250823975,
"logits/rejected": -1.3941457271575928,
"logps/chosen": -7.891854286193848,
"logps/rejected": -8.121790885925293,
"loss": 8.9911,
"rewards/accuracies": 0.53125,
"rewards/chosen": -7.891854286193848,
"rewards/margins": 0.22993668913841248,
"rewards/rejected": -8.121790885925293,
"step": 235
},
{
"epoch": 0.5024862601413242,
"grad_norm": 18.912862114031174,
"learning_rate": 3.4920729345930654e-07,
"logits/chosen": -1.3598334789276123,
"logits/rejected": -1.3656227588653564,
"logps/chosen": -7.972811698913574,
"logps/rejected": -8.120051383972168,
"loss": 9.0708,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -7.972811698913574,
"rewards/margins": 0.14723989367485046,
"rewards/rejected": -8.120051383972168,
"step": 240
},
{
"epoch": 0.5129547238942685,
"grad_norm": 15.322600609417346,
"learning_rate": 3.383409149158814e-07,
"logits/chosen": -1.3441493511199951,
"logits/rejected": -1.3492319583892822,
"logps/chosen": -8.092975616455078,
"logps/rejected": -8.160036087036133,
"loss": 8.9194,
"rewards/accuracies": 0.53125,
"rewards/chosen": -8.092975616455078,
"rewards/margins": 0.06706006824970245,
"rewards/rejected": -8.160036087036133,
"step": 245
},
{
"epoch": 0.5234231876472127,
"grad_norm": 16.724538535729355,
"learning_rate": 3.2742313921268035e-07,
"logits/chosen": -1.3152296543121338,
"logits/rejected": -1.3239524364471436,
"logps/chosen": -7.889418601989746,
"logps/rejected": -8.20849323272705,
"loss": 8.8184,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -7.889418601989746,
"rewards/margins": 0.31907448172569275,
"rewards/rejected": -8.20849323272705,
"step": 250
},
{
"epoch": 0.533891651400157,
"grad_norm": 12.327867536896116,
"learning_rate": 3.1646860195929825e-07,
"logits/chosen": -1.3065917491912842,
"logits/rejected": -1.3107439279556274,
"logps/chosen": -8.116486549377441,
"logps/rejected": -8.308655738830566,
"loss": 8.9949,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -8.116486549377441,
"rewards/margins": 0.19216908514499664,
"rewards/rejected": -8.308655738830566,
"step": 255
},
{
"epoch": 0.5443601151531012,
"grad_norm": 14.17754725379555,
"learning_rate": 3.054919880453032e-07,
"logits/chosen": -1.246124029159546,
"logits/rejected": -1.2508999109268188,
"logps/chosen": -7.7648186683654785,
"logps/rejected": -8.22431755065918,
"loss": 8.941,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -7.7648186683654785,
"rewards/margins": 0.4594977796077728,
"rewards/rejected": -8.22431755065918,
"step": 260
},
{
"epoch": 0.5548285789060455,
"grad_norm": 11.969966746660198,
"learning_rate": 2.9450801195469686e-07,
"logits/chosen": -1.3018732070922852,
"logits/rejected": -1.3149497509002686,
"logps/chosen": -7.904818058013916,
"logps/rejected": -8.152360916137695,
"loss": 8.9657,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -7.904818058013916,
"rewards/margins": 0.24754443764686584,
"rewards/rejected": -8.152360916137695,
"step": 265
},
{
"epoch": 0.5652970426589898,
"grad_norm": 14.47186665684816,
"learning_rate": 2.835313980407017e-07,
"logits/chosen": -1.3108150959014893,
"logits/rejected": -1.288703441619873,
"logps/chosen": -8.249927520751953,
"logps/rejected": -8.318041801452637,
"loss": 9.0073,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -8.249927520751953,
"rewards/margins": 0.06811434030532837,
"rewards/rejected": -8.318041801452637,
"step": 270
},
{
"epoch": 0.575765506411934,
"grad_norm": 26.602745593974163,
"learning_rate": 2.7257686078731973e-07,
"logits/chosen": -1.337909460067749,
"logits/rejected": -1.348547339439392,
"logps/chosen": -7.881032466888428,
"logps/rejected": -8.068848609924316,
"loss": 8.8981,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -7.881032466888428,
"rewards/margins": 0.18781575560569763,
"rewards/rejected": -8.068848609924316,
"step": 275
},
{
"epoch": 0.5862339701648783,
"grad_norm": 14.906273538361356,
"learning_rate": 2.6165908508411857e-07,
"logits/chosen": -1.3503994941711426,
"logits/rejected": -1.3676143884658813,
"logps/chosen": -7.861943244934082,
"logps/rejected": -8.101309776306152,
"loss": 8.9213,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -7.861943244934082,
"rewards/margins": 0.23936741054058075,
"rewards/rejected": -8.101309776306152,
"step": 280
},
{
"epoch": 0.5967024339178225,
"grad_norm": 14.643252229490672,
"learning_rate": 2.5079270654069354e-07,
"logits/chosen": -1.3024542331695557,
"logits/rejected": -1.3081843852996826,
"logps/chosen": -7.836719512939453,
"logps/rejected": -8.08849048614502,
"loss": 8.8721,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -7.836719512939453,
"rewards/margins": 0.251770943403244,
"rewards/rejected": -8.08849048614502,
"step": 285
},
{
"epoch": 0.6071708976707668,
"grad_norm": 12.350106404715637,
"learning_rate": 2.399922918671867e-07,
"logits/chosen": -1.337571620941162,
"logits/rejected": -1.3552089929580688,
"logps/chosen": -7.821458339691162,
"logps/rejected": -8.146204948425293,
"loss": 8.9032,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -7.821458339691162,
"rewards/margins": 0.3247470557689667,
"rewards/rejected": -8.146204948425293,
"step": 290
},
{
"epoch": 0.6176393614237111,
"grad_norm": 13.544262102627407,
"learning_rate": 2.2927231934717176e-07,
"logits/chosen": -1.331067442893982,
"logits/rejected": -1.3430246114730835,
"logps/chosen": -7.9300737380981445,
"logps/rejected": -8.060845375061035,
"loss": 8.9735,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -7.9300737380981445,
"rewards/margins": 0.1307719349861145,
"rewards/rejected": -8.060845375061035,
"step": 295
},
{
"epoch": 0.6281078251766553,
"grad_norm": 12.721635836612304,
"learning_rate": 2.1864715942907487e-07,
"logits/chosen": -1.299328088760376,
"logits/rejected": -1.3065472841262817,
"logps/chosen": -7.961094856262207,
"logps/rejected": -8.206907272338867,
"loss": 8.9027,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -7.961094856262207,
"rewards/margins": 0.24581179022789001,
"rewards/rejected": -8.206907272338867,
"step": 300
},
{
"epoch": 0.6385762889295996,
"grad_norm": 12.545308362098348,
"learning_rate": 2.081310554621522e-07,
"logits/chosen": -1.3111393451690674,
"logits/rejected": -1.335069179534912,
"logps/chosen": -8.182366371154785,
"logps/rejected": -8.390935897827148,
"loss": 9.0314,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -8.182366371154785,
"rewards/margins": 0.20856896042823792,
"rewards/rejected": -8.390935897827148,
"step": 305
},
{
"epoch": 0.6490447526825438,
"grad_norm": 15.575558913925711,
"learning_rate": 1.9773810460284862e-07,
"logits/chosen": -1.3477294445037842,
"logits/rejected": -1.3550546169281006,
"logps/chosen": -8.07054328918457,
"logps/rejected": -8.061942100524902,
"loss": 9.0612,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -8.07054328918457,
"rewards/margins": -0.008599767461419106,
"rewards/rejected": -8.061942100524902,
"step": 310
},
{
"epoch": 0.6595132164354881,
"grad_norm": 20.765211421302535,
"learning_rate": 1.874822389171314e-07,
"logits/chosen": -1.3256926536560059,
"logits/rejected": -1.3409112691879272,
"logps/chosen": -7.885645389556885,
"logps/rejected": -8.124526977539062,
"loss": 8.8864,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -7.885645389556885,
"rewards/margins": 0.2388812005519867,
"rewards/rejected": -8.124526977539062,
"step": 315
},
{
"epoch": 0.6699816801884323,
"grad_norm": 14.208218028523063,
"learning_rate": 1.7737720670413356e-07,
"logits/chosen": -1.344118595123291,
"logits/rejected": -1.336096167564392,
"logps/chosen": -8.159255981445312,
"logps/rejected": -8.0567045211792,
"loss": 8.9837,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -8.159255981445312,
"rewards/margins": -0.10255154222249985,
"rewards/rejected": -8.0567045211792,
"step": 320
},
{
"epoch": 0.6804501439413766,
"grad_norm": 14.61227257116642,
"learning_rate": 1.6743655406614095e-07,
"logits/chosen": -1.340541958808899,
"logits/rejected": -1.3474371433258057,
"logps/chosen": -8.056330680847168,
"logps/rejected": -8.348928451538086,
"loss": 8.9222,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -8.056330680847168,
"rewards/margins": 0.29259705543518066,
"rewards/rejected": -8.348928451538086,
"step": 325
},
{
"epoch": 0.6909186076943209,
"grad_norm": 13.778075151913542,
"learning_rate": 1.5767360674963198e-07,
"logits/chosen": -1.3218133449554443,
"logits/rejected": -1.3337442874908447,
"logps/chosen": -7.961134910583496,
"logps/rejected": -7.996614933013916,
"loss": 9.0247,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -7.961134910583496,
"rewards/margins": 0.035479746758937836,
"rewards/rejected": -7.996614933013916,
"step": 330
},
{
"epoch": 0.7013870714472651,
"grad_norm": 13.684086792814428,
"learning_rate": 1.4810145228170922e-07,
"logits/chosen": -1.3398381471633911,
"logits/rejected": -1.3437585830688477,
"logps/chosen": -7.856637001037598,
"logps/rejected": -8.111886978149414,
"loss": 8.8913,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -7.856637001037598,
"rewards/margins": 0.25525030493736267,
"rewards/rejected": -8.111886978149414,
"step": 335
},
{
"epoch": 0.7118555352002094,
"grad_norm": 15.39649445200101,
"learning_rate": 1.3873292242587306e-07,
"logits/chosen": -1.3376450538635254,
"logits/rejected": -1.3476964235305786,
"logps/chosen": -8.228338241577148,
"logps/rejected": -8.340727806091309,
"loss": 9.0269,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -8.228338241577148,
"rewards/margins": 0.11239071190357208,
"rewards/rejected": -8.340727806091309,
"step": 340
},
{
"epoch": 0.7223239989531536,
"grad_norm": 15.302013253785537,
"learning_rate": 1.295805759806533e-07,
"logits/chosen": -1.3724461793899536,
"logits/rejected": -1.3841075897216797,
"logps/chosen": -8.054750442504883,
"logps/rejected": -8.403682708740234,
"loss": 9.0089,
"rewards/accuracies": 0.5,
"rewards/chosen": -8.054750442504883,
"rewards/margins": 0.3489326238632202,
"rewards/rejected": -8.403682708740234,
"step": 345
},
{
"epoch": 0.7327924627060979,
"grad_norm": 18.608453972243662,
"learning_rate": 1.2065668194415777e-07,
"logits/chosen": -1.3417284488677979,
"logits/rejected": -1.3348530530929565,
"logps/chosen": -7.915482997894287,
"logps/rejected": -8.044729232788086,
"loss": 8.9016,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -7.915482997894287,
"rewards/margins": 0.12924641370773315,
"rewards/rejected": -8.044729232788086,
"step": 350
},
{
"epoch": 0.7432609264590422,
"grad_norm": 14.900748845819772,
"learning_rate": 1.1197320306710923e-07,
"logits/chosen": -1.3621351718902588,
"logits/rejected": -1.3541442155838013,
"logps/chosen": -8.007196426391602,
"logps/rejected": -7.965734004974365,
"loss": 8.9062,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -8.007196426391602,
"rewards/margins": -0.04146287590265274,
"rewards/rejected": -7.965734004974365,
"step": 355
},
{
"epoch": 0.7537293902119864,
"grad_norm": 11.569520650790327,
"learning_rate": 1.035417798164145e-07,
"logits/chosen": -1.3260619640350342,
"logits/rejected": -1.3356263637542725,
"logps/chosen": -7.753990173339844,
"logps/rejected": -8.039525985717773,
"loss": 8.8536,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -7.753990173339844,
"rewards/margins": 0.2855362296104431,
"rewards/rejected": -8.039525985717773,
"step": 360
},
{
"epoch": 0.7641978539649307,
"grad_norm": 13.480030507608214,
"learning_rate": 9.537371477076535e-08,
"logits/chosen": -1.2944828271865845,
"logits/rejected": -1.2956254482269287,
"logps/chosen": -7.926826477050781,
"logps/rejected": -7.9895477294921875,
"loss": 8.9487,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -7.926826477050781,
"rewards/margins": 0.06272158026695251,
"rewards/rejected": -7.9895477294921875,
"step": 365
},
{
"epoch": 0.7746663177178749,
"grad_norm": 14.634365970472302,
"learning_rate": 8.747995746918898e-08,
"logits/chosen": -1.3467233180999756,
"logits/rejected": -1.3351846933364868,
"logps/chosen": -8.043527603149414,
"logps/rejected": -8.186015129089355,
"loss": 8.9627,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -8.043527603149414,
"rewards/margins": 0.14248715341091156,
"rewards/rejected": -8.186015129089355,
"step": 370
},
{
"epoch": 0.7851347814708192,
"grad_norm": 28.38170473677795,
"learning_rate": 7.987108973285888e-08,
"logits/chosen": -1.3258306980133057,
"logits/rejected": -1.3155487775802612,
"logps/chosen": -8.005027770996094,
"logps/rejected": -8.246636390686035,
"loss": 8.9413,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -8.005027770996094,
"rewards/margins": 0.24160809814929962,
"rewards/rejected": -8.246636390686035,
"step": 375
},
{
"epoch": 0.7956032452237635,
"grad_norm": 13.711915418794124,
"learning_rate": 7.255731147984174e-08,
"logits/chosen": -1.3438084125518799,
"logits/rejected": -1.297163963317871,
"logps/chosen": -8.208559036254883,
"logps/rejected": -8.363499641418457,
"loss": 8.942,
"rewards/accuracies": 0.5625,
"rewards/chosen": -8.208559036254883,
"rewards/margins": 0.15493938326835632,
"rewards/rejected": -8.363499641418457,
"step": 380
},
{
"epoch": 0.8060717089767077,
"grad_norm": 13.642711731891415,
"learning_rate": 6.554842705179898e-08,
"logits/chosen": -1.3352845907211304,
"logits/rejected": -1.3314430713653564,
"logps/chosen": -8.112469673156738,
"logps/rejected": -8.209820747375488,
"loss": 8.9588,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -8.112469673156738,
"rewards/margins": 0.09735036641359329,
"rewards/rejected": -8.209820747375488,
"step": 385
},
{
"epoch": 0.816540172729652,
"grad_norm": 14.269345053816819,
"learning_rate": 5.885383207096832e-08,
"logits/chosen": -1.3467012643814087,
"logits/rejected": -1.3490493297576904,
"logps/chosen": -7.833376884460449,
"logps/rejected": -8.030352592468262,
"loss": 8.8689,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -7.833376884460449,
"rewards/margins": 0.1969761848449707,
"rewards/rejected": -8.030352592468262,
"step": 390
},
{
"epoch": 0.8270086364825961,
"grad_norm": 16.14618551872646,
"learning_rate": 5.2482500845047165e-08,
"logits/chosen": -1.3177175521850586,
"logits/rejected": -1.3296372890472412,
"logps/chosen": -7.635066032409668,
"logps/rejected": -7.791895866394043,
"loss": 8.9076,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -7.635066032409668,
"rewards/margins": 0.15682990849018097,
"rewards/rejected": -7.791895866394043,
"step": 395
},
{
"epoch": 0.8374771002355405,
"grad_norm": 12.278193076130206,
"learning_rate": 4.644297433686162e-08,
"logits/chosen": -1.3246910572052002,
"logits/rejected": -1.315019965171814,
"logps/chosen": -7.837827205657959,
"logps/rejected": -7.908313751220703,
"loss": 8.951,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -7.837827205657959,
"rewards/margins": 0.0704866498708725,
"rewards/rejected": -7.908313751220703,
"step": 400
},
{
"epoch": 0.8479455639884846,
"grad_norm": 12.786235556241849,
"learning_rate": 4.074334871494558e-08,
"logits/chosen": -1.3545995950698853,
"logits/rejected": -1.3624496459960938,
"logps/chosen": -8.024687767028809,
"logps/rejected": -8.172109603881836,
"loss": 8.9198,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -8.024687767028809,
"rewards/margins": 0.1474229097366333,
"rewards/rejected": -8.172109603881836,
"step": 405
},
{
"epoch": 0.8584140277414289,
"grad_norm": 30.758577870183032,
"learning_rate": 3.5391264500382e-08,
"logits/chosen": -1.360478401184082,
"logits/rejected": -1.3552910089492798,
"logps/chosen": -7.844922065734863,
"logps/rejected": -7.852625846862793,
"loss": 8.8997,
"rewards/accuracies": 0.53125,
"rewards/chosen": -7.844922065734863,
"rewards/margins": 0.0077047706581652164,
"rewards/rejected": -7.852625846862793,
"step": 410
},
{
"epoch": 0.8688824914943732,
"grad_norm": 14.260374307768236,
"learning_rate": 3.0393896324452226e-08,
"logits/chosen": -1.372036337852478,
"logits/rejected": -1.3762390613555908,
"logps/chosen": -7.982748508453369,
"logps/rejected": -8.225188255310059,
"loss": 8.9748,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -7.982748508453369,
"rewards/margins": 0.24244041740894318,
"rewards/rejected": -8.225188255310059,
"step": 415
},
{
"epoch": 0.8793509552473174,
"grad_norm": 14.09879602927017,
"learning_rate": 2.5757943310825026e-08,
"logits/chosen": -1.3225996494293213,
"logits/rejected": -1.3161330223083496,
"logps/chosen": -7.865872859954834,
"logps/rejected": -7.931491851806641,
"loss": 8.9372,
"rewards/accuracies": 0.5,
"rewards/chosen": -7.865872859954834,
"rewards/margins": 0.06561894714832306,
"rewards/rejected": -7.931491851806641,
"step": 420
},
{
"epoch": 0.8898194190002617,
"grad_norm": 13.351401390808332,
"learning_rate": 2.148962009517823e-08,
"logits/chosen": -1.342071771621704,
"logits/rejected": -1.337024450302124,
"logps/chosen": -8.03447151184082,
"logps/rejected": -8.085325241088867,
"loss": 8.9767,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -8.03447151184082,
"rewards/margins": 0.050852321088314056,
"rewards/rejected": -8.085325241088867,
"step": 425
},
{
"epoch": 0.9002878827532059,
"grad_norm": 16.02345328859732,
"learning_rate": 1.759464849429082e-08,
"logits/chosen": -1.3405394554138184,
"logits/rejected": -1.3419816493988037,
"logps/chosen": -7.878898620605469,
"logps/rejected": -8.002215385437012,
"loss": 8.9292,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -7.878898620605469,
"rewards/margins": 0.123316690325737,
"rewards/rejected": -8.002215385437012,
"step": 430
},
{
"epoch": 0.9107563465061502,
"grad_norm": 13.971661978504134,
"learning_rate": 1.4078249835774169e-08,
"logits/chosen": -1.3646373748779297,
"logits/rejected": -1.3699538707733154,
"logps/chosen": -7.937603950500488,
"logps/rejected": -8.069661140441895,
"loss": 8.8372,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -7.937603950500488,
"rewards/margins": 0.13205692172050476,
"rewards/rejected": -8.069661140441895,
"step": 435
},
{
"epoch": 0.9212248102590945,
"grad_norm": 13.636923891581842,
"learning_rate": 1.0945137958723705e-08,
"logits/chosen": -1.3303980827331543,
"logits/rejected": -1.3274564743041992,
"logps/chosen": -8.00455379486084,
"logps/rejected": -8.096671104431152,
"loss": 8.9997,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -8.00455379486084,
"rewards/margins": 0.09211695194244385,
"rewards/rejected": -8.096671104431152,
"step": 440
},
{
"epoch": 0.9316932740120387,
"grad_norm": 14.66331138432002,
"learning_rate": 8.19951289467482e-09,
"logits/chosen": -1.3527616262435913,
"logits/rejected": -1.352975606918335,
"logps/chosen": -7.898123741149902,
"logps/rejected": -8.020647048950195,
"loss": 8.9114,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -7.898123741149902,
"rewards/margins": 0.1225227564573288,
"rewards/rejected": -8.020647048950195,
"step": 445
},
{
"epoch": 0.942161737764983,
"grad_norm": 34.64920022108061,
"learning_rate": 5.84505523733293e-09,
"logits/chosen": -1.3027703762054443,
"logits/rejected": -1.2922091484069824,
"logps/chosen": -8.017878532409668,
"logps/rejected": -8.019991874694824,
"loss": 9.0038,
"rewards/accuracies": 0.46875,
"rewards/chosen": -8.017878532409668,
"rewards/margins": 0.0021121830213814974,
"rewards/rejected": -8.019991874694824,
"step": 450
},
{
"epoch": 0.9526302015179272,
"grad_norm": 14.096689301269398,
"learning_rate": 3.8849212086261466e-09,
"logits/chosen": -1.3568954467773438,
"logits/rejected": -1.345536231994629,
"logps/chosen": -7.817251682281494,
"logps/rejected": -8.18480110168457,
"loss": 8.9022,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -7.817251682281494,
"rewards/margins": 0.3675496578216553,
"rewards/rejected": -8.18480110168457,
"step": 455
},
{
"epoch": 0.9630986652708715,
"grad_norm": 21.948748802651522,
"learning_rate": 2.3217384276938756e-09,
"logits/chosen": -1.3387937545776367,
"logits/rejected": -1.349258542060852,
"logps/chosen": -7.9868292808532715,
"logps/rejected": -8.197335243225098,
"loss": 8.8854,
"rewards/accuracies": 0.53125,
"rewards/chosen": -7.9868292808532715,
"rewards/margins": 0.21050508320331573,
"rewards/rejected": -8.197335243225098,
"step": 460
},
{
"epoch": 0.9735671290238157,
"grad_norm": 12.715751305789052,
"learning_rate": 1.1576023884836472e-09,
"logits/chosen": -1.3674533367156982,
"logits/rejected": -1.3665874004364014,
"logps/chosen": -8.10934066772461,
"logps/rejected": -8.27099323272705,
"loss": 8.9853,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -8.10934066772461,
"rewards/margins": 0.16165266931056976,
"rewards/rejected": -8.27099323272705,
"step": 465
},
{
"epoch": 0.98403559277676,
"grad_norm": 13.20358280327505,
"learning_rate": 3.940736506780395e-10,
"logits/chosen": -1.348550796508789,
"logits/rejected": -1.3657060861587524,
"logps/chosen": -7.707891941070557,
"logps/rejected": -7.990015983581543,
"loss": 8.9804,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -7.707891941070557,
"rewards/margins": 0.2821243703365326,
"rewards/rejected": -7.990015983581543,
"step": 470
},
{
"epoch": 0.9945040565297043,
"grad_norm": 17.668181816444864,
"learning_rate": 3.2175747716822744e-11,
"logits/chosen": -1.3433798551559448,
"logits/rejected": -1.3304665088653564,
"logps/chosen": -8.101046562194824,
"logps/rejected": -8.15410041809082,
"loss": 8.9813,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -8.101046562194824,
"rewards/margins": 0.05305204540491104,
"rewards/rejected": -8.15410041809082,
"step": 475
},
{
"epoch": 0.998691442030882,
"step": 477,
"total_flos": 0.0,
"train_loss": 8.967987340451286,
"train_runtime": 8184.2286,
"train_samples_per_second": 7.47,
"train_steps_per_second": 0.058
}
],
"logging_steps": 5,
"max_steps": 477,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
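
The snippet below is a minimal sketch, not part of the original repository, showing one way to load and summarize the log above. It assumes the JSON has been saved locally as "trainer_state.json" (the path is an assumption); it only reads keys that appear in the file (log_history, step, loss, rewards/margins, rewards/accuracies, learning_rate, train_loss).

import json

# Load the trainer state dumped by the Hugging Face Trainer.
with open("trainer_state.json") as f:
    state = json.load(f)

# Each entry in log_history is one logging step (logging_steps = 5 above);
# the final entry carries the run-level summary (train_loss, train_runtime, ...).
for entry in state["log_history"]:
    if "loss" in entry:  # per-step training log
        print(
            f"step {entry['step']:>3} | loss {entry['loss']:.4f} | "
            f"margin {entry['rewards/margins']:+.4f} | "
            f"acc {entry['rewards/accuracies']:.3f} | "
            f"lr {entry['learning_rate']:.2e}"
        )
    else:  # summary entry written at the end of training
        print(f"train_loss {entry['train_loss']:.4f} over {entry['step']} steps")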